Mercurial > forge
changeset 10568:d95379792766 octave-forge
kmeans: bugfix to deal with clusters that have only one element (patch by Christoph Kling <ckling@uni-koblenz.de>)
author | carandraug |
---|---|
date | Thu, 19 Jul 2012 03:22:00 +0000 |
parents | 717877001d22 |
children | bb392deb832e |
files | main/statistics/NEWS main/statistics/inst/kmeans.m |
diffstat | 2 files changed, 25 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/main/statistics/NEWS Wed Jul 18 16:16:30 2012 +0000 +++ b/main/statistics/NEWS Thu Jul 19 03:22:00 2012 +0000 @@ -9,6 +9,9 @@ mvnrnd + ** `kmeans' has been fixed to deal with clusters that contain only + one element. + Summary of important user-visible changes for statistics 1.1.3: -------------------------------------------------------------------
--- a/main/statistics/inst/kmeans.m Wed Jul 18 16:16:30 2012 +0000 +++ b/main/statistics/inst/kmeans.m Thu Jul 19 03:22:00 2012 +0000 @@ -33,20 +33,20 @@ ## used to hold the distances from each sample to each class D = zeros (nRows, k); - + #used for convergence of the centroids err = 1; - + #initial sum of distances sumd = Inf; - + ## Input checking, validate the matrix and k if (!isnumeric (data) || !ismatrix (data) || !isreal (data)) error ("kmeans: first input argument must be a DxN real data matrix"); elseif (!isscalar (k)) error ("kmeans: second input argument must be a scalar"); endif - + if (length (varargin) > 0) ## check for the 'emptyaction' property found = find (strcmpi (prop, "emptyaction") == 1); @@ -57,7 +57,7 @@ error ("kmeans: unsupported empty cluster action parameter"); endswitch endif - + ## check for the 'start' property switch (lower (start)) case "sample" @@ -66,27 +66,30 @@ otherwise error ("kmeans: unsupported initial clustering parameter"); endswitch - + ## Run the algorithm while err > .001 ## Compute distances for i = 1:k D (:, i) = sumsq (data - repmat (centers(i, :), nRows, 1), 2); endfor - + ## Classify [tmp, classes] = min (D, [], 2); - - ## Calcualte new centroids + + ## Calculate new centroids for i = 1:k + ## Get binary vector indicating membership in cluster i + membership = (classes == i); ## Check for empty clusters - if (sum (classes == i) ==0 || length (mean (data(classes == i, :))) == 0) - + if (sum (membership) == 0) switch emptyaction ## if 'singleton', then find the point that is the ## farthest and add it to the empty cluster case 'singleton' - classes(maxCostSampleIndex (data, centers(i,:))) = i; + idx=maxCostSampleIndex (data, centers(i,:)); + classes(idx) = i; + membership(idx)=1; ## if 'error' then throw the error otherwise error ("kmeans: empty cluster created"); @@ -94,10 +97,11 @@ endif ## end check for empty clusters ## update the centroids - centers(i, :) = mean (data(classes == i, :)); + members = data(membership, :); + centers(i, :) = sum(members,1)/size(members,1); endfor - ## calculate the differnece in the sum of distances + ## calculate the difference in the sum of distances err = sumd - objCost (data, classes, centers); ## update the current sum of distances sumd = objCost (data, classes, centers); @@ -112,11 +116,11 @@ endfor endfunction -function index = maxCostSampleIndex (data, centers) +function idx = maxCostSampleIndex (data, centers) cost = 0; - for index = 1:rows (data) - if cost < sumsq (data(index,:) - centers) - cost = sumsq (data(index,:) - centers); + for idx = 1:rows (data) + if cost < sumsq (data(idx,:) - centers) + cost = sumsq (data(idx,:) - centers); endif endfor endfunction