changeset 6100:329b9cde3604 octave-forge

any combination of naive and Kahan summation with double and extended can be controlled through flag_accuracy_level
author schloegl
date Wed, 30 Sep 2009 22:18:27 +0000
parents 7e8f7a4e9b45
children 34f64c54a7d0
files extra/NaN/inst/flag_accuracy_level.m extra/NaN/src/covm_mex.cpp extra/NaN/src/sumskipnan_mex.cpp
diffstat 3 files changed, 745 insertions(+), 81 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extra/NaN/inst/flag_accuracy_level.m	Wed Sep 30 22:18:27 2009 +0000
@@ -0,0 +1,67 @@
+function FLAG = flag_accuracy_level(i)
+% FLAG_ACCURACY_LEVEL sets and gets accuracy level 
+%   used in SUMSKIPNAN_MEX and COVM_MEX
+%   The error margin of the naive summation is N*eps (N is the number of samples),
+%   the error margin is only 2*eps if Kahan's summation is used [1].    
+%
+%	0: maximum speed and minimum accuracy (error = N*2^-52) 
+%	   of double (64bit float) without Kahan summation  
+%	1: {default] accuracy of extend double (80bit float) 
+%	   without Kahan summation (error = N*2^-64) 
+%	2: minimum speed and minimum accuracy (error = 2^-64) 
+%	   of double with Kahan summation  
+%	3: accuracy (error = 2^-52) 
+%	   of double (64bit float) with Kahan summation  
+%
+%   First tests suggest that 1 is a good solution
+% 
+% FLAG = flag_accuracy_level()
+% 	gets current level
+%
+% flag_accuracy_level(FLAG)
% 	sets mode 
+% 
+% This function is experimental and might disappear without further notice, 
+% so donot rely on it.
+%
+% Reference:
+% [1] David Goldberg, 
+%       What Every Computer Scientist Should Know About Floating-Point Arithmetic
+%       ACM Computing Surveys, Vol 23, No 1, March 1991. 
+
+
+%    This program is free software; you can redistribute it and/or modify
+%    it under the terms of the GNU General Public License as published by
+%    the Free Software Foundation; either version 3 of the License, or
+%    (at your option) any later version.
+%
+%    This program is distributed in the hope that it will be useful,
+%    but WITHOUT ANY WARRANTY; without even the implied warranty of
+%    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+%    GNU General Public License for more details.
+%
+%    You should have received a copy of the GNU General Public License
+%    along with this program; if not, write to the Free Software
+%    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+%	$Id$
+% 	Copyright (C) 2009 by Alois Schloegl <a.schloegl@ieee.org>
+%       This function is part of the NaN-toolbox
+%       http://hci.tu-graz.ac.at/~schloegl/matlab/NaN/
+
+
+persistent FLAG_ACCURACY_LEVEL;
+
+%% if strcmp(version,'3.6'), FLAG_ACCURACY_LEVEL=1; end;	%% hack for the use with Freemat3.6
+
+%%% set DEFAULT value of FLAG
+if isempty(FLAG_ACCURACY_LEVEL),
+	FLAG_ACCURACY_LEVEL = 1; 
+end;
+
+if nargin>0,
+	if (i>3) i=3; end;
+	if (i<0) i=0; end;
+	FLAG_ACCURACY_LEVEL = double(i); 
+end;
+FLAG = FLAG_ACCURACY_LEVEL;
+
--- a/extra/NaN/src/covm_mex.cpp	Wed Sep 30 19:09:46 2009 +0000
+++ b/extra/NaN/src/covm_mex.cpp	Wed Sep 30 22:18:27 2009 +0000
@@ -109,18 +109,18 @@
 			W  = mxGetPr(PInputs[3]);
 		else 	
 			mexErrMsgTxt("number of elements in W must match numbers of rows in X");
-
 	}
         
-       	if ((PInputCount > 4) && mxGetChars(PInputs[4])) {
-                switch (*mxGetChars(PInputs[4])) { 
-                case 'S': 
-                case 's': 
-                        flag_speed = 1; 
-                        break; 
-                }        
-       	}
-        //mexPrintf("Flag Speed=%i\n",flag_speed);
+	int ACC_LEVEL  = 1;
+	{
+		mxArray *LEVEL = NULL;
+		int s = mexCallMATLAB(1, &LEVEL, 0, NULL, "flag_accuracy_level");
+		if (!s) {
+			ACC_LEVEL = (int) mxGetScalar(LEVEL);
+		}	
+		mxDestroyArray(LEVEL);
+	}
+	mexPrintf("Accuracy Level=%i\n",ACC_LEVEL);
 
 	if (Y0==NULL) {
 		Y0 = X0; 
@@ -202,7 +202,112 @@
 	}
 	
 #else
-   if (flag_speed) {
+   if (ACC_LEVEL == 0) {
+	/*------ version 2 --------------------- 
+		 this version seems to be faster than the one above. 
+		 it is also faster but less accurate than the version below
+	*/
+	if ( (X0 != Y0) || (cX != cY) )
+		/******** X!=Y, output is not symetric *******/	
+	    if (W) /* weighted version */
+	    for (i=0; i<cX; i++)
+	    for (j=0; j<cY; j++) {
+		X = X0+i*rX;
+		Y = Y0+j*rY;
+		double cc=0.0;
+		double nn=0.0;
+		for (k=0; k<rX; k++) {
+			double z = X[k]*Y[k];
+			if (isnan(z)) {
+#ifndef NO_FLAG
+				flag_isNaN = 1;
+#endif 
+				continue;
+			}
+			cc += z*W[k];
+			nn += W[k];
+		}	
+		CC[i+j*cX] = cc; 
+		if (NN != NULL) 
+			NN[i+j*cX] = nn; 
+	    }
+	    else /* no weights, all weights are 1 */
+  	    for (i=0; i<cX; i++)
+	    for (j=0; j<cY; j++) {
+		X = X0+i*rX;
+		Y = Y0+j*rY;
+		double cc=0.0;
+		size_t nn=0;
+		for (k=0; k<rX; k++) {
+			double z = X[k]*Y[k];
+			if (isnan(z)) {
+#ifndef NO_FLAG
+				flag_isNaN = 1;
+#endif 
+				continue;
+			}
+			cc += z;
+			nn++;
+		}	
+		CC[i+j*cX] = cc; 
+		if (NN != NULL) 
+			NN[i+j*cX] = (double)nn; 
+	    }
+	else // if (X0==Y0) && (cX==cY)
+		/******** X==Y, output is symetric *******/	
+	    if (W) /* weighted version */
+	    for (i=0; i<cX; i++)
+	    for (j=i; j<cY; j++) {
+		X = X0+i*rX;
+		Y = Y0+j*rY;
+		double cc=0.0;
+		double nn=0.0;
+		for (k=0; k<rX; k++) {
+			double z = X[k]*Y[k];
+			if (isnan(z)) {
+#ifndef NO_FLAG
+				flag_isNaN = 1;
+#endif 
+				continue;
+			}
+			cc += z*W[k];
+			nn += W[k];
+		}	
+		CC[i+j*cX] = cc; 
+		CC[j+i*cX] = cc; 
+		if (NN != NULL) {
+			NN[i+j*cX] = nn; 
+			NN[j+i*cX] = nn; 
+		}	
+	    }
+	    else /* no weights, all weights are 1 */
+	    for (i=0; i<cX; i++)
+	    for (j=i; j<cY; j++) {
+		X = X0+i*rX;
+		Y = Y0+j*rY;
+		double cc=0.0;
+		size_t nn=0;
+		for (k=0; k<rX; k++) {
+			double z = X[k]*Y[k];
+			if (isnan(z)) {
+#ifndef NO_FLAG
+				flag_isNaN = 1;
+#endif 
+				continue;
+			}
+			cc += z;
+			nn++;
+		}	
+		CC[i+j*cX] = cc; 
+		CC[j+i*cX] = cc; 
+		if (NN != NULL) {
+			NN[i+j*cX] = (double)nn; 
+			NN[j+i*cX] = (double)nn; 
+		}	
+	    }
+
+    }
+    else if (ACC_LEVEL == 1) {
 	/*------ version 2 --------------------- 
 		 this version seems to be faster than the one above. 
 		 it is also faster but less accurate than the version below
@@ -307,7 +412,7 @@
 	    }
 
     }
-    else {
+    else if (ACC_LEVEL == 2) {
 	/*------ version 3 --------------------- 
 	        using Kahan's summation formula [1] 
 	        this gives more accurate results while the computational effort within the loop is about 4x as high  
@@ -454,6 +559,154 @@
 		}	
 	    }
     }
+    else if (ACC_LEVEL == 3) {
+	/*------ version 3 --------------------- 
+	        using Kahan's summation formula [1] 
+	        this gives more accurate results while the computational effort within the loop is about 4x as high  
+	        However, first test show an increase in computational time of only about 25 %.   
+
+                [1] David Goldberg, 
+                What Every Computer Scientist Should Know About Floating-Point Arithmetic
+                ACM Computing Surveys, Vol 23, No 1, March 1991
+	*/
+	if ( (X0 != Y0) || (cX != cY) )
+		/******** X!=Y, output is not symetric *******/	
+	    if (W) /* weighted version */
+	    for (i=0; i<cX; i++)
+	    for (j=0; j<cY; j++) {
+		X = X0+i*rX;
+		Y = Y0+j*rY;
+		double cc=0.0;
+		double nn=0.0;
+		double rc=0.0;
+		double rn=0.0;
+		for (k=0; k<rX; k++) {
+		        double t,y; 
+			double z = X[k]*Y[k];
+			if (isnan(z)) {
+#ifndef NO_FLAG
+				flag_isNaN = 1;
+#endif 
+				continue;
+			}
+			// cc += z*W[k]; [1]
+			y = z*W[k]-rc;
+			t = cc+y;
+			rc= (t-cc)-y;
+			cc= t; 
+
+			// nn += W[k]; [1]
+			y = z*W[k]-rn;
+			t = nn+y;
+			rn= (t-nn)-y;
+			nn= t; 
+		}	
+		CC[i+j*cX] = cc; 
+		if (NN != NULL) 
+			NN[i+j*cX] = nn; 
+	    }
+	    else /* no weights, all weights are 1 */
+  	    for (i=0; i<cX; i++)
+	    for (j=0; j<cY; j++) {
+		X = X0+i*rX;
+		Y = Y0+j*rY;
+		double cc=0.0;
+		double rc=0.0;
+		size_t nn=0;
+		for (k=0; k<rX; k++) {
+		        double t,y; 
+			double z = X[k]*Y[k];
+			if (isnan(z)) {
+#ifndef NO_FLAG
+				flag_isNaN = 1;
+#endif 
+				continue;
+			}
+			// cc += z;  [1]
+			y = z-rc;
+			t = cc+y;
+			rc= (t-cc)-y;
+			cc= t; 
+
+			nn++;
+		}	
+		CC[i+j*cX] = cc; 
+		if (NN != NULL) 
+			NN[i+j*cX] = (double)nn; 
+	    }
+	else // if (X0==Y0) && (cX==cY)
+		/******** X==Y, output is symetric *******/	
+	    if (W) /* weighted version */
+	    for (i=0; i<cX; i++)
+	    for (j=i; j<cY; j++) {
+		X = X0+i*rX;
+		Y = Y0+j*rY;
+		double cc=0.0;
+		double nn=0.0;
+		double rc=0.0;
+		double rn=0.0;
+		for (k=0; k<rX; k++) {
+		        double t,y; 
+			double z = X[k]*Y[k];
+			if (isnan(z)) {
+#ifndef NO_FLAG
+				flag_isNaN = 1;
+#endif 
+				continue;
+			}
+			// cc += z*W[k];  [1]
+			y = z*W[k]-rc;
+			t = cc+y;
+			rc= (t-cc)-y;
+			cc= t; 
+
+			// nn += W[k];  [1]
+			y = z*W[k]-rn;
+			t = nn+y;
+			rn= (t-nn)-y;
+			nn= t; 
+		}	
+		CC[i+j*cX] = cc; 
+		CC[j+i*cX] = cc; 
+		if (NN != NULL) {
+			NN[i+j*cX] = nn; 
+			NN[j+i*cX] = nn; 
+		}	
+	    }
+	    else /* no weights, all weights are 1 */
+	    for (i=0; i<cX; i++)
+	    for (j=i; j<cY; j++) {
+		X = X0+i*rX;
+		Y = Y0+j*rY;
+		double cc=0.0;
+		double rc=0.0;
+		size_t nn=0;
+		for (k=0; k<rX; k++) {
+		        double t,y; 
+			double z = X[k]*Y[k];
+			if (isnan(z)) {
+#ifndef NO_FLAG
+				flag_isNaN = 1;
+#endif 
+				continue;
+			}
+			// cc += z; [1]
+			y = z-rc;
+			t = cc+y;
+			rc= (t-cc)-y;
+			cc= t; 
+
+			nn++;
+		}	
+		CC[i+j*cX] = cc; 
+		CC[j+i*cX] = cc; 
+		if (NN != NULL) {
+			NN[i+j*cX] = (double)nn; 
+			NN[j+i*cX] = (double)nn; 
+		}	
+	    }
+    }
+
 
 #ifndef NO_FLAG
 	//mexPrintf("Third argument must be not empty - otherwise status whether a NaN occured or not cannot be returned.");
--- a/extra/NaN/src/sumskipnan_mex.cpp	Wed Sep 30 19:09:46 2009 +0000
+++ b/extra/NaN/src/sumskipnan_mex.cpp	Wed Sep 30 22:18:27 2009 +0000
@@ -53,10 +53,14 @@
 #include <math.h>
 #include "mex.h"
 
+inline int __sumskipnan2wr__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W);
+inline int __sumskipnan3wr__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W);
 inline int __sumskipnan2w__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W);
 inline int __sumskipnan3w__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W);
 inline int __sumskipnan2we__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W);
 inline int __sumskipnan3we__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W);
+inline int __sumskipnan2wer__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W);
+inline int __sumskipnan3wer__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W);
 
 //#define NO_FLAG
 
@@ -149,7 +153,20 @@
 			W = mxGetPr(PInputs[3]);
 		else
 			mexErrMsgTxt("Error SUMSKIPNAN.MEX: length of weight vector does not match size of dimension");
-	}	
+	}
+
+	int ACC_LEVEL  = 1;
+	{
+		mxArray *LEVEL = NULL;
+		int s = mexCallMATLAB(1, &LEVEL, 0, NULL, "flag_accuracy_level");
+		if (!s) {
+			ACC_LEVEL = (int) mxGetScalar(LEVEL);
+			if ((D1>1) && (ACC_LEVEL>2))
+				mexWarnMsgTxt("Warning: Kahan summation not supported with stride > 1 !");
+		}	
+		mxDestroyArray(LEVEL);
+	}
+	mexPrintf("Accuracy Level=%i\n",ACC_LEVEL);
 
 	    // create outputs
 	#define TYP mxDOUBLE_CLASS
@@ -172,23 +189,101 @@
 
 	if (D1*D2*D3<1) // zero size array
 		; 	// do nothing 
-	else if ((POutputCount <= 1) && !mxIsComplex(PInputs[0])) {
+	else if ((D1==1) && (ACC_LEVEL<1)) {
+		// extended accuray, naive summation, error = N*2^-64 
+		switch (POutputCount) {
+		case 1: 
+			for (l = 0; l<D3; l++) {
+				double count;
+				__sumskipnan2wr__(LInput+l*D2, D2, LOutputSum+l, &count, &flag_isNaN, W);
+			} 
+			break;
+		case 2: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan2wr__(LInput+l*D2, D2, LOutputSum+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		case 3: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan3wr__(LInput+l*D2, D2, LOutputSum+l, LOutputSum2+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		}
+	}
+	else if ((D1==1) && (ACC_LEVEL==1)) {
+		// extended accuray, naive summation, error = N*2^-64 
+		switch (POutputCount) {
+		case 1: 
+			for (l = 0; l<D3; l++) {
+				double count;
+				__sumskipnan2w__(LInput+l*D2, D2, LOutputSum+l, &count, &flag_isNaN, W);
+			} 
+			break;
+		case 2: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan2w__(LInput+l*D2, D2, LOutputSum+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		case 3: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan3w__(LInput+l*D2, D2, LOutputSum+l, LOutputSum2+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		}
+	}
+	else if ((D1==1) && (ACC_LEVEL==2)) {
+		// ACC_LEVEL==2: extended accuracy and Kahan Summation, error = 2^-64
+		switch (POutputCount) {
+		case 1: 
+			for (l = 0; l<D3; l++) {
+				double count;
+				__sumskipnan2we__(LInput+l*D2, D2, LOutputSum+l, &count, &flag_isNaN, W);
+			} 
+			break;
+		case 2: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan2we__(LInput+l*D2, D2, LOutputSum+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		case 3: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan3we__(LInput+l*D2, D2, LOutputSum+l, LOutputSum2+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		}
+	}
+	else if ((D1==1) && (ACC_LEVEL==3)) {
+		// ACC_LEVEL==2: extended accuracy and Kahan Summation, error = 2^-64
+		switch (POutputCount) {
+		case 1: 
+			for (l = 0; l<D3; l++) {
+				double count;
+				__sumskipnan2wer__(LInput+l*D2, D2, LOutputSum+l, &count, &flag_isNaN, W);
+			} 
+			break;
+		case 2: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan2wer__(LInput+l*D2, D2, LOutputSum+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		case 3: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan3wer__(LInput+l*D2, D2, LOutputSum+l, LOutputSum2+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		}
+	}
+	else if (POutputCount <= 1) {
 		// OUTER LOOP: along dimensions > DIM
 		for (l = 0; l<D3; l++) {
 			ix0 = l*D1; 	// index for output
 			ix1 = ix0*D2;	// index for input 
-			if (D1==1)  	
-			{
-				double count;
-				__sumskipnan2we__(LInput+ix1, D2, LOutputSum+ix0, &count, &flag_isNaN, W);
- 	       		}
-			else {
-			   for (j=0; j<D2; j++) {
+			for (j=0; j<D2; j++) {
 				// minimize cache misses 
 				ix2 =   ix0;	// index for output 
 				// Inner LOOP: along dimensions < DIM
 				if (W) do {
-					register long double x = *LInput;
+					long double x = *LInput;
         				if (!isnan(x)) {
 						LongOutputSum[ix2]   += W[j]*x; 
 					}
@@ -200,7 +295,7 @@
 					ix2++;
 				} while (ix2 != (l+1)*D1);
 				else do {
-					register long double x = *LInput;
+					long double x = *LInput;
         				if (!isnan(x)) {
 						LongOutputSum[ix2]   += x; 
 					}
@@ -211,32 +306,26 @@
 					LInput++;
 					ix2++;
 				} while (ix2 != (l+1)*D1);
-			    }	//	end for (j=	
+			}	//	end for (j=	
 
-                                /* copy to output */
-	               		for (j=0; j<D1; j++) {
-					LOutputSum[ix0+j] = LongOutputSum[ix0+j]; 
-				}
-			}	// 	end else 
+                        /* copy to output */
+               		for (j=0; j<D1; j++) {
+				LOutputSum[ix0+j] = LongOutputSum[ix0+j]; 
+			}
                	}		
 	}
 
-	else if ((POutputCount == 2) && !mxIsComplex(PInputs[0])) {
+	else if (POutputCount == 2) {
 		// OUTER LOOP: along dimensions > DIM
 		for (l = 0; l<D3; l++) {
 			ix0 = l*D1; 
 			ix1 = ix0*D2;	// index for input 
-			if (D1==1)  	
-			{
-				__sumskipnan2we__(LInput+ix1, D2, LOutputSum+ix0, LOutputCount+ix0, &flag_isNaN, W);
- 	       		}
-			else {
-			    for (j=0; j<D2; j++) {
+			for (j=0; j<D2; j++) {
 				// minimize cache misses 
 				ix2 =   ix0;	// index for output 
 				// Inner LOOP: along dimensions < DIM
 				if (W) do {
-					register long double x = *LInput;
+					long double x = *LInput;
         				if (!isnan(x)) {
 						LongOutputCount[ix2] += W[j]; 
 						LongOutputSum[ix2]   += W[j]*x; 
@@ -249,7 +338,7 @@
 					ix2++;
 				} while (ix2 != (l+1)*D1);
 				else do {
-					register long double x = *LInput;
+					long double x = *LInput;
         				if (!isnan(x)) {
 						LongOutputCount[ix2] += 1.0; 
 						LongOutputSum[ix2]   += x; 
@@ -261,33 +350,27 @@
 					LInput++;
 					ix2++;
 				} while (ix2 != (l+1)*D1);
-			    }	//	end for (j=	
+			}	//	end for (j=	
 
                                 /* copy to output */
-	               		for (j=0; j<D1; j++) {
-					LOutputSum[ix0+j]   = LongOutputSum[ix0+j]; 
-					LOutputCount[ix0+j] = LongOutputCount[ix0+j]; 
-				}
-               		}	// 	end else 
+	               	for (j=0; j<D1; j++) {
+				LOutputSum[ix0+j]   = LongOutputSum[ix0+j]; 
+				LOutputCount[ix0+j] = LongOutputCount[ix0+j]; 
+	       		}	// 	end else 
                	}		
 	}
 
-	else if ((POutputCount == 3) && !mxIsComplex(PInputs[0])) {
+	else if (POutputCount == 3) {
 		// OUTER LOOP: along dimensions > DIM
 		for (l = 0; l<D3; l++) {
 			ix0 = l*D1; 
 			ix1 = ix0*D2;	// index for input 
-			if (D1==1)  	
-			{
-				__sumskipnan3we__(LInput+ix1, D2, LOutputSum+ix0, LOutputSum2+ix0, LOutputCount+ix0, &flag_isNaN, W);
- 	       		}
-			else {
-			    for (j=0; j<D2; j++) {
+			for (j=0; j<D2; j++) {
 				// minimize cache misses 
 				ix2 =   ix0;	// index for output 
 				// Inner LOOP: along dimensions < DIM
 				if (W) do {
-					register long double x = *LInput;
+					long double x = *LInput;
         				if (!isnan(x)) {
 						LongOutputCount[ix2] += W[j]; 
 						double t = W[j]*x;
@@ -302,7 +385,7 @@
 					ix2++;	
 				} while (ix2 != (l+1)*D1);
 				else do {
-					register long double x = *LInput;
+					long double x = *LInput;
         				if (!isnan(x)) {
 						LongOutputCount[ix2] += 1.0; 
 						LongOutputSum[ix2]   += x; 
@@ -315,15 +398,14 @@
 					LInput++;
 					ix2++;	
 				} while (ix2 != (l+1)*D1);
-			    }	//	end for (j=	
+			}	//	end for (j=	
 
-                                /* copy to output */
-	               		for (j=0; j<D1; j++) {
-					LOutputSum[ix0+j]   = LongOutputSum[ix0+j]; 
-					LOutputCount[ix0+j] = LongOutputCount[ix0+j]; 
-					LOutputSum2[ix0+j]  = LongOutputSum2[ix0+j]; 
-				}
-               		}	// 	end else 
+                       /* copy to output */
+	        	for (j=0; j<D1; j++) {
+				LOutputSum[ix0+j]   = LongOutputSum[ix0+j]; 
+				LOutputCount[ix0+j] = LongOutputCount[ix0+j]; 
+				LOutputSum2[ix0+j]  = LongOutputSum2[ix0+j]; 
+			}
                	}		
 	}
 	if (LongOutputSum) mxFree(LongOutputSum);
@@ -385,7 +467,117 @@
 		// with weight vector 
 		long double count = 0.0;
 		do {
-			register double x = *data;
+			long double x = *data;
+        		if (!isnan(x))
+			{
+				count += *W; 
+				sum   += *W*x;
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif	
+
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {
+		// w/o weight vector 
+		size_t countI = 0;
+		do {
+			long double x = *data;
+        		if (!isnan(x))
+			{
+				countI++; 
+				sum += x; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}	
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+
+}
+
+
+inline int __sumskipnan3w__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W)
+{
+	long double sum=0; 
+	long double msq=0; 
+	char   flag=0;
+	// LOOP  along dimension DIM
+	
+	void *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		long double count = 0.0;
+		do {
+			long double x = *data;
+        		if (!isnan(x)) {
+				count += *W;
+				long double t = *W*x; 
+				sum += t; 
+				msq += x*t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {	
+		// w/o weight vector 
+		size_t countI = 0;
+		do {
+			long double x = *data;
+        		if (!isnan(x)) {
+				countI++; 
+				sum += x; 
+				msq += x*x; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+	*s2 = msq; 
+}
+
+inline int __sumskipnan2wr__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W)
+{
+	double sum=0; 
+	char   flag=0; 
+	// LOOP  along dimension DIM
+	
+	void *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		double count = 0.0;
+		do {
+			double x = *data;
         		if (!isnan(x))
 			{
 				count += *W; 
@@ -429,17 +621,17 @@
 }
 
 
-inline int __sumskipnan3w__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W)
+inline int __sumskipnan3wr__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W)
 {
-	long double sum=0; 
-	long double msq=0; 
+	double sum=0; 
+	double msq=0; 
 	char   flag=0;
 	// LOOP  along dimension DIM
 	
 	void *end = data + stride*Ni; 
 	if (W) {
 		// with weight vector 
-		long double count = 0.0;
+		double count = 0.0;
 		do {
 			double x = *data;
         		if (!isnan(x)) {
@@ -486,6 +678,15 @@
 
 
 
+/***************************************
+        using Kahan's summation formula [1] 
+        this gives more accurate results while the computational effort within the loop is about 4x as high  
+        First tests show a penalty of about 40% in terms of computational time.   
+
+        [1] David Goldberg, 
+        What Every Computer Scientist Should Know About Floating-Point Arithmetic
+        ACM Computing Surveys, Vol 23, No 1, March 1991. 
+ ****************************************/
 
 inline int __sumskipnan2we__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W)
 {
@@ -499,7 +700,7 @@
 		long double count = 0.0;
 		long double rc=0.0, rn=0.0;
 		do {
-			register double x = *data;
+			long double x = *data;
 		        long double t,y; 
         		if (!isnan(x))
 			{
@@ -530,7 +731,7 @@
 		size_t countI = 0;
 		long double rc=0.0;
 		do {
-			double x = *data;
+			long double x = *data;
 		        long double t,y; 
         		if (!isnan(x))
 			{
@@ -559,17 +760,6 @@
 }
 
 
-
-/***************************************
-        using Kahan's summation formula [1] 
-        this gives more accurate results while the computational effort within the loop is about 4x as high  
-        First tests show a penalty of about 40% in terms of computational time.   
-
-        [1] David Goldberg, 
-        What Every Computer Scientist Should Know About Floating-Point Arithmetic
-        ACM Computing Surveys, Vol 23, No 1, March 1991. 
- ****************************************/
-
 inline int __sumskipnan3we__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W)
 {
 	long double sum=0; 
@@ -583,7 +773,7 @@
 		long double count = 0.0;
         	long double rc=0.0, rn=0.0, rq=0.0;
 		do {
-			double x = *data;
+			long double x = *data;
 		        long double t,y; 
         		if (!isnan(x)) {
 				//count += *W; [1]
@@ -619,7 +809,7 @@
 		size_t countI = 0;
         	long double rc=0.0, rq=0.0;
 		do {
-			double x = *data;
+			long double x = *data;
 		        long double t,y; 
         		if (!isnan(x)) {
 				countI++; 
@@ -652,3 +842,157 @@
 	*s2 = msq; 
 }
 
+inline int __sumskipnan2wer__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W)
+{
+	double sum=0; 
+	char   flag=0; 
+	// LOOP  along dimension DIM
+	
+	void *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		double count = 0.0;
+		double rc=0.0, rn=0.0;
+		do {
+			double x = *data;
+		        double t,y; 
+        		if (!isnan(x))
+			{
+				//count += *W; [1]
+        			y = *W-rn;
+        			t = count+y;
+               			rn= (t-count)-y;
+		        	count= t; 
+
+				//sum   += *W*x; [1]
+        			y = *W*x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif	
+
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {
+		// w/o weight vector 
+		size_t countI = 0;
+		double rc=0.0;
+		do {
+			double x = *data;
+		        double t,y; 
+        		if (!isnan(x))
+			{
+				countI++; 
+				// sum += x; [1]  
+        			y = x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}	
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+
+}
+
+
+inline int __sumskipnan3wer__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W)
+{
+	double sum=0; 
+	double msq=0; 
+	char   flag=0;
+	// LOOP  along dimension DIM
+	
+	void *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		double count = 0.0;
+        	double rc=0.0, rn=0.0, rq=0.0;
+		do {
+			double x = *data;
+		        double t,y; 
+        		if (!isnan(x)) {
+				//count += *W; [1]
+        			y = *W-rn;
+        			t = count+y;
+               			rn= (t-count)-y;
+		        	count= t; 
+
+				double w = *W*x; 
+				//sum   += *W*x; [1]
+        			y = *W*x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+
+				// msq += x*w; 
+        			y = w*x-rq;
+        			t = msq+y;
+               			rq= (t-msq)-y;
+		        	msq= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {	
+		// w/o weight vector 
+		size_t countI = 0;
+        	double rc=0.0, rq=0.0;
+		do {
+			double x = *data;
+		        double t,y; 
+        		if (!isnan(x)) {
+				countI++; 
+				//sum   += x; [1]
+        			y = x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+
+				// msq += x*x; 
+        			y = x*x-rq;
+        			t = msq+y;
+               			rq= (t-msq)-y;
+		        	msq= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+	*s2 = msq; 
+}
+