changeset 11576:19a8d9d20324 octave-forge

Faster implementation of princomp.
author asnelt
date Thu, 28 Mar 2013 13:32:36 +0000
parents 957eec76b69b
children 6f40373812df
files main/statistics/NEWS main/statistics/inst/princomp.m
diffstat 2 files changed, 135 insertions(+), 35 deletions(-) [+]
line wrap: on
line diff
--- a/main/statistics/NEWS	Wed Mar 27 17:28:48 2013 +0000
+++ b/main/statistics/NEWS	Thu Mar 28 13:32:36 2013 +0000
@@ -8,6 +8,8 @@
  ** dendogram now returns the leaf node numbers and order that the nodes were
     displayed in.
 
+ ** New faster implementation of princomp.
+
 Summary of important user-visible changes for statistics 1.2.0:
 -------------------------------------------------------------------
 
--- a/main/statistics/inst/princomp.m	Wed Mar 27 17:28:48 2013 +0000
+++ b/main/statistics/inst/princomp.m	Thu Mar 28 13:32:36 2013 +0000
@@ -1,51 +1,149 @@
-## Author: Paul Kienzle <pkienzle@users.sf.net>
-## This program is granted to the public domain.
+## Copyright (C) 2013 Fernando Damian Nieuwveldt <fdnieuwveldt@gmail.com>
+##
+## This program is free software; you can redistribute it and/or
+## modify it under the terms of the GNU General Public License
+## as published by the Free Software Foundation; either version 3
+## of the License, or (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
 
 ## -*- texinfo -*-
-## @deftypefn {Function File} {[@var{pc}, @var{z}, @var{w}, @var{Tsq}] =} princomp (@var{X})
-##
-## Compute principal components of @var{X}.
+## @deftypefn {Function File} {[@var{COEFF}]} = princomp(@var{X})
+## @end deftypefn
+## @deftypefn {Function File} {[@var{COEFF},@var{SCORE}]} = princomp(@var{X})
+## @end deftypefn
+## @deftypefn {Function File} {[@var{COEFF},@var{SCORE},@var{latent}]} = princomp(@var{X})
+## @end deftypefn
+## @deftypefn {Function File} {[@var{COEFF},@var{SCORE},@var{latent},@var{tsquare}]} = princomp(@var{X})
+## @end deftypefn
+## @deftypefn {Function File} {[...]} = princomp(@var{X},'econ')
+## @end deftypefn
+## @itemize @bullet
+## @item
+## princomp performs principal component analysis on a NxP data matrix X
+## @item
+## @var{COEFF} : returns the principal component coefficients
+## @item
+## @var{SCORE} : returns the principal component scores, the representation of X 
+## in the principal component space
+## @item
+## @var{LATENT} : returns the principal component variances, i.e., the 
+## eigenvalues of the covariance matrix X.
+## @item
+## @var{TSQUARE} : returns Hotelling's T-squared Statistic for each observation in X 
+## @item
+## [...] = princomp(X,'econ') returns only the elements of latent that are not 
+## necessarily zero, and the corresponding columns of COEFF and SCORE, that is, 
+## when n <= p, only the first n-1. This can be significantly faster when p is 
+## much larger than n. In this case the svd will be applied on the transpose of 
+## the data matrix X
 ##
-## The first output argument @var{pc} is the principal components of @var{X}.
-## The second @var{z} is the transformed data, and @var{w} is the eigenvalues of
-## the covariance matrix of @var{X}. @var{Tsq} is the Hotelling's @math{T^2}
-## statistic for the transformed data.
-## @end deftypefn
+## @end itemize
+##
+## @subheading References
+##
+## @enumerate
+## @item
+## Jolliffe, I. T., Principal Component Analysis, 2nd Edition, Springer, 2002
+## 
+## @end enumerate
+
+function [COEFF,SCORE,latent,tsquare] = princomp(X,varargin)
+
+   if (nargin < 1 || nargin > 2)
+      print_usage ();
+   endif
+	
+   if (nargin == 2 && ! strcmpi (varargin{:}, "econ"))
+      error ("princomp: if a second input argument is present, it must be the string  'econ'");
+   endif
 
-function [pc,z,w,Tsq] = princomp(X)
-  C = cov(X);
-  [U,D,pc] = svd(C,1);
-  if nargout>1, z = center(X)*pc; end
-  if nargout>2, w = diag(D); end
-  if nargout>3, Tsq = sumsq(zscore(z),2); 
-    warning('XXX FIXME XXX Tsq return from princomp fails some tests'); 
-  end
+   [nobs nvars] = size(X);
+	 
+   # Center the columns to mean zero
+   Xcentered = bsxfun(@minus,X,mean(X));
+
+   # Check if there are more variables then observations
+   if nvars <= nobs
+		
+      [U,S,COEFF] = svd(Xcentered);
+
+   else
+
+      # Calculate the svd on the transpose matrix, much faster
+      if (nargin == 2 && strcmpi ( varargin{:} , "econ"))
+	     [COEFF,S,V] = svd(Xcentered' , 'econ');
+      else
+	     [COEFF,S,V] = svd(Xcentered');
+      endif	
+
+   endif
+
+   if nargout > 1
+
+      # Get the Scores
+      SCORE = Xcentered*COEFF;
+	
+      # Get the rank of the SCORE matrix	
+      r = rank(SCORE); 
+
+      # Only use the first r columns, pad rest with zeros if economy != 'econ'
+      SCORE = SCORE(:,1:r) ; 
+	 
+      if !(nargin == 2 && strcmpi ( varargin{:} , "econ"))
+	    SCORE = [SCORE, zeros(nobs , nvars-r)];
+      else
+	    COEFF   = COEFF(: , 1:r);   
+      endif
+
+    endif
+
+    # This is the same as the eigenvalues of the covariance matrix of X
+    latent  = (diag(S'*S)/(size(Xcentered,1)-1))(1:r);
+
+    if nargout > 2
+      if !(nargin == 2 && strcmpi ( varargin{:} , "econ"))
+	  latent= [latent;zeros(nvars-r,1)];
+      endif
+    endif
+
+    if nargout > 3
+ 	# Calculate the Hotelling T-Square statistic for the observations
+	tsquare = sumsq(zscore(SCORE(:,1:r)),2);
+    endif
+
 endfunction
-%!shared pc,z,w,Tsq,m,x
+
+%!shared COEFF,SCORE,latent,tsquare,m,x
 
 %!test
 %! x=[1,2,3;2,1,3]';
-%! [pc,z,w,Tsq]=princomp(x);
+%! [COEFF,SCORE,latent,tsquare] = princomp(x);
 %! m=[sqrt(2),sqrt(2);sqrt(2),-sqrt(2);-2*sqrt(2),0]/2;
-%! m(:,1) = m(:,1)*sign(pc(1,1));
-%! m(:,2) = m(:,2)*sign(pc(1,2));
+%! m(:,1) = m(:,1)*sign(COEFF(1,1));
+%! m(:,2) = m(:,2)*sign(COEFF(1,2));
 
-%!assert(pc,m(1:2,:),10*eps);
-%!assert(z,-m,10*eps);
-%!assert(w,[1.5;.5],10*eps);
-%!assert(Tsq,[4;4;4]/3,10*eps);
+%!assert(COEFF,m(1:2,:),10*eps);
+%!assert(SCORE,-m,10*eps);
+%!assert(latent,[1.5;.5],10*eps);
+%!assert(tsquare,[4;4;4]/3,10*eps);
 
 %!test
 %! x=x';
-%! [pc,z,w,Tsq]=princomp(x);
+%! [COEFF,SCORE,latent,tsquare] = princomp(x);
 %! m=[sqrt(2),sqrt(2),0;-sqrt(2),sqrt(2),0;0,0,2]/2;
-%! m(:,1) = m(:,1)*sign(pc(1,1));
-%! m(:,2) = m(:,2)*sign(pc(1,2));
-%! m(:,3) = m(:,3)*sign(pc(3,3));
+%! m(:,1) = m(:,1)*sign(COEFF(1,1));
+%! m(:,2) = m(:,2)*sign(COEFF(1,2));
+%! m(:,3) = m(:,3)*sign(COEFF(3,3));
 
-%!assert(pc,m,10*eps);
-%!assert(z(:,1),-m(1:2,1),10*eps);
-%!assert(z(:,2:3),zeros(2),10*eps);
-%!assert(w,[1;0;0],10*eps);
+%!assert(COEFF,m,10*eps);
+%!assert(SCORE(:,1),-m(1:2,1),10*eps);
+%!assert(SCORE(:,2:3),zeros(2),10*eps);
+%!assert(latent,[1;0;0],10*eps);
 %!xtest
-%! assert(Tsq,1,10*eps);
+%! assert(tsquare,[0.5;0.5],10*eps)