changeset 988:adda1f4aa84a octave-forge

checking correlation of NaN's: works now with SPEARMAN and RANK, calculation corrected; did several checks with Matlab and Octave4Win 2.1.42
author schloegl
date Mon, 30 Jun 2003 14:58:58 +0000
parents aa42e47dbf06
children 965d5b601b6f
files extra/NaN/corrcoef.m
diffstat 1 files changed, 51 insertions(+), 55 deletions(-) [+]
line wrap: on
line diff
--- a/extra/NaN/corrcoef.m	Sun Jun 29 21:44:29 2003 +0000
+++ b/extra/NaN/corrcoef.m	Mon Jun 30 14:58:58 2003 +0000
@@ -1,16 +1,12 @@
-function [R,sig,ci1,ci2] = corrcoef(X,Y,Mode);
+function [R,sig,ci1,ci2,nan_sig] = corrcoef(X,Y,Mode);
 % CORRCOEF calculates the correlation coefficient.
 % X and Y can contain missing values encoded with NaN.
 % NaN's are skipped, NaN do not result in a NaN output. 
 % A significance test to check the independence of NaN's
 % is included. 
-% The result is only valid if the occurence of NaN's is uncorrelated, 
-%
-% The output gives NaN in case of insufficient input data.
-%
-% CORRCOEF(X [,Mode]);
+%   CORRCOEF(X [,Mode]);
 %      calculates the (auto-)correlation matrix of X
-% CORRCOEF(X,Y [,Mode]);
+%   CORRCOEF(X,Y [,Mode]);
 %      calculates the crosscorrelation between X and Y
 %
 % Mode='Pearson' or 'parametric' [default]
@@ -21,7 +17,13 @@
 % Mode='Rank' 		gives a nonparametric Rank Correlation Coefficient
 %	This replaces RANKCORR.M
 %
-% [R,p,ci1,ci2] = CORRCOEF(...);
+% The result is only valid if the occurence of NaN's is uncorrelated. 
+%   This can be checked with 
+%       [nan_R,nan_sig]=corrcoef(X,isnan(X))
+%   or  [nan_R,nan_sig]=corrcoef([X,Y],isnan([X,Y]))
+%   or  [R,p,ci1,ci2,nan_sig] = CORRCOEF(...);
+%
+% [R,p,ci1,ci2,nansig] = CORRCOEF(...);
 % 	R is the correlation matrix
 %	R(i,j) is the correlation coefficient r between X(:,i) and Y(:,j)
 %  p    gives the significance of R
@@ -32,7 +34,8 @@
 %  p < alpha: The alternative hypothesis "R2 is larger than zero" is true with probability (1-alpha).
 %  ci1	lower 0.95 confidence interval 
 %  ci2	upper 0.95 confidence interval 
-%
+%  nan_sig 	p-value whether H0: "NaN's are not correlated" must be rejected
+% 
 % Further recommandation related to the correlation coefficient 
 % + LOOK AT THE SCATTERPLOTS!
 % + Correlation is not causation. The observed correlation between two variables 
@@ -101,6 +104,9 @@
         end;
 end;        
 Mode=[Mode,'        '];
+
+FLAG_WARNING = warning;		% save warning status
+warning('off');
 
 [r1,c1]=size(X);
 if ~isempty(Y)
@@ -227,6 +233,12 @@
 end;
 
 
+if nargout<2, 
+	warning(FLAG_WARNING); 	% restore warning status
+        return;
+end;
+
+
 % CONFIDENCE INTERVAL
 if exist('flag_implicit_significance')==2,
         alpha = flag_implicit_significance;
@@ -235,70 +247,30 @@
 end;
 % fprintf(1,'CORRCOEF: confidence interval is based on alpha=%f\n',alpha);
 
-
-%%%%% ----- check independence of NaNs (missing values) -----
-if YESNAN,
-	tmp = isnan(X);
-	X(tmp) = 0;
-	% [nan_R,nan_sig,nan_ci1,nan_ci2] = corrcoef(X,tmp);
-
-	[nan_R, nan_sig] = corrcoef(X,tmp);
-
-	% remove diagonal elements, because these have not any meaning %
-	tmp = logical(eye(size(nan_sig)));
-	% nan_R(tmp)   = nan;
-	nan_sig(tmp)   = nan;
-	% nan_ci1(tmp) = nan;
-	% nan_ci2(tmp) = nan;
-	
-	if any(nan_sig(:) < alpha),
-		fprintf(1,'Warning CORRCOFF: Missing Values (i.e. NaNs) are not independent of data (p-value=%f)\n', min(nan_sig(:)));
-		fprintf(1,'Its recommended to remove all samples with any missing value (NaN).\n');
-		fprintf(1,'The following combinations show a significant correlation of NaNs\n');
-		[ix,iy] = find(nan_sig < alpha);
-		disp([ix,iy])
-	end;
-end;	
-%%%%% ----- end of independence check ------
-
-
-if nargout<2, 
-        return;
-end;
-
-
+
 % SIGNIFICANCE TEST
-
-% warning off; 	% prevent division-by-zero warnings in Matlab.
-
 tmp = 1 - R.*R;
-ix1 = (tmp==0);
-tmp(ix1) = nan;		% avoid division-by-zero warning
+tmp(tmp<0) = 0;		% prevent tmp<0 i.e. imag(t)~=0 
 t   = R.*sqrt(max(NN-2,0)./tmp);
 
-ix2 = (t<0)|(t>1);	% mark abs(r)==1
-if 0,any(ix2(:));
-        warning('CORRCOEF: t-value out of range - probably due to some rounding error') 
-        t(ix2)=nan;
-end;
-
 if exist('t_cdf')>1;
         sig = t_cdf(t,NN-2);
 elseif exist('tcdf')>1;
         sig = tcdf(t,NN-2);
 else
-        fprintf('Warning CORRCOEF: significance test not completed because of missing TCDF-function\n')
+        fprintf('CORRCOEF: significance test not completed because of missing TCDF-function\n')
         sig = repmat(nan,size(R));
 end;
-sig  = 2 * min(sig,1 - sig);
+sig  = 2 * min(sig,1 - sig);
 
 if nargout<3, 
+	warning(FLAG_WARNING); 	% restore warning status
         return;
 end;
 
 
 tmp = R;
-tmp(ix1 | ix2) = nan;		% avoid division-by-zero warning
+%tmp(ix1 | ix2) = nan;		% avoid division-by-zero warning
 z   = log((1+tmp)./(1-tmp))/2; 	% Fisher's z-transform; 
 %sz = 1./sqrt(NN-3);		% standard error of z
 sz  = sqrt(2)*erfinv(1-2*alpha)./sqrt(NN-3);		% confidence interval for alpha of z
@@ -309,4 +281,28 @@
 %ci1(isnan(ci1))=R(isnan(ci1));	% in case of isnan(ci), the interval limits are exactly the R value 
 %ci2(isnan(ci2))=R(isnan(ci2));
 
+if nargout<5, 
+	warning(FLAG_WARNING); 	% restore warning status
+        return;
+end;
+
+
+%%%%% ----- check independence of NaNs (missing values) -----
+%[nan_R,nan_sig,nan_ci1,nan_ci2] = corrcoef(X,isnan(X))
+[nan_R, nan_sig] = corrcoef(X,isnan(X));
+
+% remove diagonal elements, because these have not any meaning %
+nan_sig(isnan(nan_R)) = nan;
+
+if any(nan_sig(:) < alpha),
+        fprintf(1,'CORRCOFF Warning: Missing Values (i.e. NaNs) are not independent of data (p-value=%f)\n', min(nan_sig(:)));
+        fprintf(1,'   Its recommended to remove all samples with any missing value (NaN).\n');
+        fprintf(1,'   In the following combinations the null-hypotheses (NaNs are uncorrelated) must be rejected.\n');
+        [ix,iy] = find(nan_sig < alpha);
+        disp([ix,iy])
+end;
+
+%%%%% ----- end of independence check ------
+
+warning(FLAG_WARNING); 	% restore warning status
 return;