# HG changeset patch # User Timo Lindfors # Date 1203112216 18000 # Node ID a8e0f7184a594defad43201fedfc4311b298cf8c # Parent 196a759fc7e8a784f71b10c3497d5b358b4287f3 handle ties in kruskal_wallis_test diff -r 196a759fc7e8 -r a8e0f7184a59 doc/interpreter/contributors.in --- a/doc/interpreter/contributors.in Fri Feb 15 16:37:32 2008 -0500 +++ b/doc/interpreter/contributors.in Fri Feb 15 16:50:16 2008 -0500 @@ -104,6 +104,7 @@ Dirk Laurie Maurice LeBrun Friedrich Leisch +Timo Lindfors Benjamin Lindner Ross Lippert David Livings diff -r 196a759fc7e8 -r a8e0f7184a59 scripts/ChangeLog --- a/scripts/ChangeLog Fri Feb 15 16:37:32 2008 -0500 +++ b/scripts/ChangeLog Fri Feb 15 16:50:16 2008 -0500 @@ -1,3 +1,8 @@ +2008-02-15 Timo Lindfors + + * statistics/tests/kruskal_wallis_test.m: Handle ties. + * general/runlength.m: New function from Paul Kienzle. + 2008-02-15 Rolf Fabian * linear-algebra/cond.m: New optional second argument to diff -r 196a759fc7e8 -r a8e0f7184a59 scripts/general/runlength.m --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/general/runlength.m Fri Feb 15 16:50:16 2008 -0500 @@ -0,0 +1,36 @@ +## Copyright (C) 2005, 2008 Paul Kienzle +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## <http://www.gnu.org/licenses/>. + +## -*- texinfo -*- +## @deftypefn {Function File} {} runlength (@var{x}) +## Find the lengths of all sequences of common values. 
Return the +## vector of lengths and the values that were repeated. +## +## @example +## runlength ([2, 2, 0, 4, 4, 4, 0, 1, 1, 1, 1]) +## @result{} [2, 1, 3, 1, 4] +## @end example +## @end deftypefn + +function [count, value] = runlength (x) + idx = [find(x(1:end-1) != x(2:end)), length(x)]; + value = x(idx); + count = diff ([0 idx]); +endfunction + +%!assert (runlength([2 2 0 4 4 4 0 1 1 1 1]), [2 1 3 1 4]); diff -r 196a759fc7e8 -r a8e0f7184a59 scripts/statistics/tests/kruskal_wallis_test.m --- a/scripts/statistics/tests/kruskal_wallis_test.m Fri Feb 15 16:37:32 2008 -0500 +++ b/scripts/statistics/tests/kruskal_wallis_test.m Fri Feb 15 16:50:16 2008 -0500 @@ -29,6 +29,18 @@ ## approximately chi-square with @var{df} = @var{k} - 1 degrees of ## freedom. ## +## If the data contains ties (some value appears more than once) +## @var{k} is divided by +## +## 1 - @var{sumTies} / ( @var{n}^3 - @var{n} ) +## +## where @var{sumTies} is the sum of @var{t}^3 - @var{t} over each group +## of ties where @var{t} is the number of ties in the group and @var{n} +## is the total number of values in the input data. For more info on +## this adjustment see "Use of Ranks in One-Criterion Variance Analysis" +## in Journal of the American Statistical Association, Vol. 47, +## No. 260 (Dec 1952) by William H. Kruskal and W. Allen Wallis. +## ## The p-value (1 minus the CDF of this distribution at @var{k}) is ## returned in @var{pval}. ## @@ -67,9 +79,14 @@ j = j + n(i); endfor - n = length (p); - k = 12 * k / (n * (n + 1)) - 3 * (n + 1); - df = m - 1; + n = length (p); + k = 12 * k / (n * (n + 1)) - 3 * (n + 1); + + ## Adjust the result to take ties into account. + sum_ties = sum (polyval ([1, 0, -1, 0], runlength (sort (p)))); + k = k / (1 - sum_ties / (n^3 - n)); + + df = m - 1; pval = 1 - chisquare_cdf (k, df); if (nargout == 0) @@ -78,4 +95,5 @@ endfunction - +## Test with ties +%!assert (abs(kruskal_wallis_test([86 86], [74]) - 0.157299207050285) < 0.0000000000001)