Mercurial > forge
changeset 11576:19a8d9d20324 octave-forge
Faster implementation of princomp.
author | asnelt |
---|---|
date | Thu, 28 Mar 2013 13:32:36 +0000 |
parents | 957eec76b69b |
children | 6f40373812df |
files | main/statistics/NEWS main/statistics/inst/princomp.m |
diffstat | 2 files changed, 135 insertions(+), 35 deletions(-) [+] |
line wrap: on
line diff
--- a/main/statistics/NEWS Wed Mar 27 17:28:48 2013 +0000 +++ b/main/statistics/NEWS Thu Mar 28 13:32:36 2013 +0000 @@ -8,6 +8,8 @@ ** dendogram now returns the leaf node numbers and order that the nodes were displayed in. + ** New faster implementation of princomp. + Summary of important user-visible changes for statistics 1.2.0: -------------------------------------------------------------------
--- a/main/statistics/inst/princomp.m Wed Mar 27 17:28:48 2013 +0000 +++ b/main/statistics/inst/princomp.m Thu Mar 28 13:32:36 2013 +0000 @@ -1,51 +1,149 @@ -## Author: Paul Kienzle <pkienzle@users.sf.net> -## This program is granted to the public domain. +## Copyright (C) 2013 Fernando Damian Nieuwveldt <fdnieuwveldt@gmail.com> +## +## This program is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License +## as published by the Free Software Foundation; either version 3 +## of the License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see <http://www.gnu.org/licenses/>. ## -*- texinfo -*- -## @deftypefn {Function File} {[@var{pc}, @var{z}, @var{w}, @var{Tsq}] =} princomp (@var{X}) -## -## Compute principal components of @var{X}. +## @deftypefn {Function File} {[@var{COEFF}]} = princomp(@var{X}) +## @end deftypefn +## @deftypefn {Function File} {[@var{COEFF},@var{SCORE}]} = princomp(@var{X}) +## @end deftypefn +## @deftypefn {Function File} {[@var{COEFF},@var{SCORE},@var{latent}]} = princomp(@var{X}) +## @end deftypefn +## @deftypefn {Function File} {[@var{COEFF},@var{SCORE},@var{latent},@var{tsquare}]} = princomp(@var{X}) +## @end deftypefn +## @deftypefn {Function File} {[...]} = princomp(@var{X},'econ') +## @end deftypefn +## @itemize @bullet +## @item +## princomp performs principal component analysis on a NxP data matrix X +## @item +## @var{COEFF} : returns the principal component coefficients +## @item +## @var{SCORE} : returns the principal component scores, the representation of X +## in the principal component space +## @item +## @var{LATENT} : returns the principal component variances, i.e., the +## eigenvalues of the covariance matrix X. +## @item +## @var{TSQUARE} : returns Hotelling's T-squared Statistic for each observation in X +## @item +## [...] = princomp(X,'econ') returns only the elements of latent that are not +## necessarily zero, and the corresponding columns of COEFF and SCORE, that is, +## when n <= p, only the first n-1. This can be significantly faster when p is +## much larger than n. In this case the svd will be applied on the transpose of +## the data matrix X ## -## The first output argument @var{pc} is the principal components of @var{X}. -## The second @var{z} is the transformed data, and @var{w} is the eigenvalues of -## the covariance matrix of @var{X}. @var{Tsq} is the Hotelling's @math{T^2} -## statistic for the transformed data. -## @end deftypefn +## @end itemize +## +## @subheading References +## +## @enumerate +## @item +## Jolliffe, I. T., Principal Component Analysis, 2nd Edition, Springer, 2002 +## +## @end enumerate + +function [COEFF,SCORE,latent,tsquare] = princomp(X,varargin) + + if (nargin < 1 || nargin > 2) + print_usage (); + endif + + if (nargin == 2 && ! strcmpi (varargin{:}, "econ")) + error ("princomp: if a second input argument is present, it must be the string 'econ'"); + endif -function [pc,z,w,Tsq] = princomp(X) - C = cov(X); - [U,D,pc] = svd(C,1); - if nargout>1, z = center(X)*pc; end - if nargout>2, w = diag(D); end - if nargout>3, Tsq = sumsq(zscore(z),2); - warning('XXX FIXME XXX Tsq return from princomp fails some tests'); - end + [nobs nvars] = size(X); + + # Center the columns to mean zero + Xcentered = bsxfun(@minus,X,mean(X)); + + # Check if there are more variables then observations + if nvars <= nobs + + [U,S,COEFF] = svd(Xcentered); + + else + + # Calculate the svd on the transpose matrix, much faster + if (nargin == 2 && strcmpi ( varargin{:} , "econ")) + [COEFF,S,V] = svd(Xcentered' , 'econ'); + else + [COEFF,S,V] = svd(Xcentered'); + endif + + endif + + if nargout > 1 + + # Get the Scores + SCORE = Xcentered*COEFF; + + # Get the rank of the SCORE matrix + r = rank(SCORE); + + # Only use the first r columns, pad rest with zeros if economy != 'econ' + SCORE = SCORE(:,1:r) ; + + if !(nargin == 2 && strcmpi ( varargin{:} , "econ")) + SCORE = [SCORE, zeros(nobs , nvars-r)]; + else + COEFF = COEFF(: , 1:r); + endif + + endif + + # This is the same as the eigenvalues of the covariance matrix of X + latent = (diag(S'*S)/(size(Xcentered,1)-1))(1:r); + + if nargout > 2 + if !(nargin == 2 && strcmpi ( varargin{:} , "econ")) + latent= [latent;zeros(nvars-r,1)]; + endif + endif + + if nargout > 3 + # Calculate the Hotelling T-Square statistic for the observations + tsquare = sumsq(zscore(SCORE(:,1:r)),2); + endif + endfunction -%!shared pc,z,w,Tsq,m,x + +%!shared COEFF,SCORE,latent,tsquare,m,x %!test %! x=[1,2,3;2,1,3]'; -%! [pc,z,w,Tsq]=princomp(x); +%! [COEFF,SCORE,latent,tsquare] = princomp(x); %! m=[sqrt(2),sqrt(2);sqrt(2),-sqrt(2);-2*sqrt(2),0]/2; -%! m(:,1) = m(:,1)*sign(pc(1,1)); -%! m(:,2) = m(:,2)*sign(pc(1,2)); +%! m(:,1) = m(:,1)*sign(COEFF(1,1)); +%! m(:,2) = m(:,2)*sign(COEFF(1,2)); -%!assert(pc,m(1:2,:),10*eps); -%!assert(z,-m,10*eps); -%!assert(w,[1.5;.5],10*eps); -%!assert(Tsq,[4;4;4]/3,10*eps); +%!assert(COEFF,m(1:2,:),10*eps); +%!assert(SCORE,-m,10*eps); +%!assert(latent,[1.5;.5],10*eps); +%!assert(tsquare,[4;4;4]/3,10*eps); %!test %! x=x'; -%! [pc,z,w,Tsq]=princomp(x); +%! [COEFF,SCORE,latent,tsquare] = princomp(x); %! m=[sqrt(2),sqrt(2),0;-sqrt(2),sqrt(2),0;0,0,2]/2; -%! m(:,1) = m(:,1)*sign(pc(1,1)); -%! m(:,2) = m(:,2)*sign(pc(1,2)); -%! m(:,3) = m(:,3)*sign(pc(3,3)); +%! m(:,1) = m(:,1)*sign(COEFF(1,1)); +%! m(:,2) = m(:,2)*sign(COEFF(1,2)); +%! m(:,3) = m(:,3)*sign(COEFF(3,3)); -%!assert(pc,m,10*eps); -%!assert(z(:,1),-m(1:2,1),10*eps); -%!assert(z(:,2:3),zeros(2),10*eps); -%!assert(w,[1;0;0],10*eps); +%!assert(COEFF,m,10*eps); +%!assert(SCORE(:,1),-m(1:2,1),10*eps); +%!assert(SCORE(:,2:3),zeros(2),10*eps); +%!assert(latent,[1;0;0],10*eps); %!xtest -%! assert(Tsq,1,10*eps); +%! assert(tsquare,[0.5;0.5],10*eps)