diff extra/NaN/src/sumskipnan_mex.cpp @ 6549:41e9854fe26d octave-forge

use *.cpp instead of *.c
author schloegl
date Sun, 10 Jan 2010 22:05:59 +0000
parents
children 485d1594d155
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extra/NaN/src/sumskipnan_mex.cpp	Sun Jan 10 22:05:59 2010 +0000
@@ -0,0 +1,998 @@
+//-------------------------------------------------------------------
+#pragma hdrstop
+//-------------------------------------------------------------------
+//   C-MEX implementation of SUMSKIPNAN - this function is part of the NaN-toolbox. 
+//
+//
+//   This program is free software; you can redistribute it and/or modify
+//   it under the terms of the GNU General Public License as published by
+//   the Free Software Foundation; either version 3 of the License, or
+//   (at your option) any later version.
+//
+//   This program is distributed in the hope that it will be useful,
+//   but WITHOUT ANY WARRANTY; without even the implied warranty of
+//   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//   GNU General Public License for more details.
+//
+//   You should have received a copy of the GNU General Public License
+//   along with this program; if not, see <http://www.gnu.org/licenses/>.
+//
+//
+// sumskipnan: sums all non-NaN values
+// usage:
+//	[o,count,SSQ] = sumskipnan_mex(x,DIM,flag,W);
+//
+// SUMSKIPNAN uses two techniques to reduce errors: 
+// 1) long double (80bit) instead of 64-bit double is used internally
+// 2) The Kahan Summation formula is used to reduce the error margin from N*eps to 2*eps 
+//        The latter is only implemented in case of stride=1 (column vectors only, summation along 1st dimension). 
+//
+// Input:
+// - x data array
+// - DIM (optional) dimension to sum
+// - flag (optional) is actually an output argument telling whether some NaN was observed
+// - W (optional) weight vector to compute weighted sum (default 1)
+//
+// Output:
+// - o (weighted) sum along dimension DIM
+// - count of valid elements
+// - sums of squares
+//
+//
+//    $Id$
+//    Copyright (C) 2009 Alois Schloegl <a.schloegl@ieee.org>
+//    This function is part of the NaN-toolbox
+//    http://hci.tugraz.at/~schloegl/matlab/NaN/
+//
+//-------------------------------------------------------------------
+
+
+
+
+#include <inttypes.h>
+#include <math.h>
+#include "mex.h"
+
+inline int __sumskipnan2w__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W);
+inline int __sumskipnan3w__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W);
+inline int __sumskipnan2wr__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W);
+inline int __sumskipnan3wr__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W);
+inline int __sumskipnan2we__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W);
+inline int __sumskipnan3we__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W);
+inline int __sumskipnan2wer__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W);
+inline int __sumskipnan3wer__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W);
+
+//#define NO_FLAG
+
+#ifdef tmwtypes_h
+  #if (MX_API_VER<=0x07020000)
+    typedef int mwSize;
+  #endif 
+#endif 
+
+
+void mexFunction(int POutputCount,  mxArray* POutput[], int PInputCount, const mxArray *PInputs[]) 
+{
+    	const mwSize	*SZ;	    
+    	double* 	LInput;
+    	double* 	LOutputSum;
+    	double* 	LOutputCount;
+    	double* 	LOutputSum2;
+    	long double* 	LongOutputSum = NULL;
+    	long double* 	LongOutputCount = NULL;
+    	long double* 	LongOutputSum2 = NULL;
+    	double  	x;
+    	double*		W = NULL;		// weight vector 
+
+    	mwSize		DIM = 0; 
+    	mwSize		D1, D2, D3; 	// NN; 	//  	
+    	mwSize    	ND, ND2;	// number of dimensions: input, output
+    	mwSize		ix0, ix1, ix2;	// index to input and output
+    	mwSize    	j, l;		// running indices 
+    	mwSize 		*SZ2;		// size of output 	    
+	char	 	flag_isNaN = 0;
+
+	// check for proper number of input and output arguments
+	if ((PInputCount <= 0) || (PInputCount > 4))
+	        mexErrMsgTxt("SUMSKIPNAN.MEX requires between 1 and 4 arguments.");
+	if (POutputCount > 4)
+	        mexErrMsgTxt("SUMSKIPNAN.MEX has 1 to 3 output arguments.");
+
+	// get 1st argument
+	if(mxIsDouble(PInputs[0]) && !mxIsComplex(PInputs[0]))
+		LInput  = mxGetPr(PInputs[0]);
+	else 	
+		mexErrMsgTxt("First argument must be REAL/DOUBLE.");
+
+    	// get 2nd argument
+    	if  (PInputCount > 1) {
+ 	       	switch (mxGetNumberOfElements(PInputs[1])) {
+		case 0: x = 0.0; 		// accept empty element
+			break;
+		case 1: x = (mxIsNumeric(PInputs[1]) ? mxGetScalar(PInputs[1]) : -1.0); 
+			break;
+		default:x = -1.0;		// invalid 
+		}
+		if ((x < 0) || (x > 65535) || (x != floor(x))) 
+			mexErrMsgTxt("Error SUMSKIPNAN.MEX: DIM-argument must be a positive integer scalar");
+
+		DIM = (unsigned)floor(x);	
+	}
+
+	// get size 
+    	ND = mxGetNumberOfDimensions(PInputs[0]);	
+    	// NN = mxGetNumberOfElements(PInputs[0]);
+    	SZ = mxGetDimensions(PInputs[0]);		
+
+	// if DIM==0 (undefined), look for first dimension with more than 1 element. 
+	for (j = 0; (DIM < 1) && (j < ND); j++) 
+		if (SZ[j]>1) DIM = j+1;
+	
+	if (DIM < 1) DIM=1;		// in case DIM is still undefined 
+
+	ND2 = (ND>DIM ? ND : DIM);	// number of dimensions of output 
+
+	SZ2 = (mwSize*)mxCalloc(ND2, sizeof(mwSize)); // allocate memory for output size
+
+	for (j=0; j<ND; j++)		// copy size of input;  
+		SZ2[j] = SZ[j]; 	
+	for (j=ND; j<ND2; j++)		// in case DIM > ND, add extra elements 1 
+		SZ2[j] = 1; 	
+
+    	for (j=0, D1=1; j<DIM-1; D1=D1*SZ2[j++]); 	// D1 is the number of elements between two elements along dimension  DIM  
+	D2 = SZ2[DIM-1];		// D2 contains the size along dimension DIM 	
+    	for (j=DIM, D3=1;  j<ND; D3=D3*SZ2[j++]); 	// D3 is the number of blocks containing D1*D2 elements 
+
+	SZ2[DIM-1] = 1;		// size of output is same as size of input but SZ(DIM)=1;
+
+    	// get weight vector for weighted sumskipnan 
+       	if  (PInputCount > 3)	{
+		if (!mxGetNumberOfElements(PInputs[3])) 
+			; // empty weight vector - no weighting 
+		else if (mxGetNumberOfElements(PInputs[3])==D2)
+			W = mxGetPr(PInputs[3]);
+		else
+			mexErrMsgTxt("Error SUMSKIPNAN.MEX: length of weight vector does not match size of dimension");
+	}
+
+	int ACC_LEVEL = 0;
+	{
+		mxArray *LEVEL = NULL;
+		int s = mexCallMATLAB(1, &LEVEL, 0, NULL, "flag_accuracy_level");
+		if (!s) {
+			ACC_LEVEL = (int) mxGetScalar(LEVEL);
+			if ((D1>1) && (ACC_LEVEL>2))
+				mexWarnMsgTxt("Warning: Kahan summation not supported with stride > 1 !");
+		}	
+		mxDestroyArray(LEVEL);
+	}
+	// mexPrintf("Accuracy Level=%i\n",ACC_LEVEL);
+
+	    // create outputs
+	#define TYP mxDOUBLE_CLASS
+
+	POutput[0] = mxCreateNumericArray(ND2, SZ2, TYP, mxREAL);
+	LOutputSum = mxGetPr(POutput[0]);
+	if (D1!=1 && D2>0) LongOutputSum = (long double*) mxCalloc(D1*D3,sizeof(long double));
+
+    	if (POutputCount >= 2) {
+		POutput[1] = mxCreateNumericArray(ND2, SZ2, TYP, mxREAL);
+		LOutputCount = mxGetPr(POutput[1]);
+		if (D1!=1 && D2>0) LongOutputCount = (long double*) mxCalloc(D1*D3,sizeof(long double));
+    	}
+    	if (POutputCount >= 3) {
+		POutput[2] = mxCreateNumericArray(ND2, SZ2, TYP, mxREAL);
+        	LOutputSum2  = mxGetPr(POutput[2]);
+		if (D1!=1 && D2>0) LongOutputSum2 = (long double*) mxCalloc(D1*D3,sizeof(long double));
+    	}
+	mxFree(SZ2);
+
+	if (D1*D2*D3<1) // zero size array
+		; 	// do nothing 
+	else if ((D1==1) && (ACC_LEVEL<1)) {
+		// double accuray, naive summation, error = N*2^-52 
+		switch (POutputCount) {
+		case 1: 
+			for (l = 0; l<D3; l++) {
+				double count;
+				__sumskipnan2wr__(LInput+l*D2, D2, LOutputSum+l, &count, &flag_isNaN, W);
+			} 
+			break;
+		case 2: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan2wr__(LInput+l*D2, D2, LOutputSum+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		case 3: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan3wr__(LInput+l*D2, D2, LOutputSum+l, LOutputSum2+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		}
+	}
+	else if ((D1==1) && (ACC_LEVEL==1)) {
+		// extended accuray, naive summation, error = N*2^-64 
+		switch (POutputCount) {
+		case 1: 
+			for (l = 0; l<D3; l++) {
+				double count;
+				__sumskipnan2w__(LInput+l*D2, D2, LOutputSum+l, &count, &flag_isNaN, W);
+			} 
+			break;
+		case 2: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan2w__(LInput+l*D2, D2, LOutputSum+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		case 3: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan3w__(LInput+l*D2, D2, LOutputSum+l, LOutputSum2+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		}
+	}
+	else if ((D1==1) && (ACC_LEVEL==3)) {
+		// ACC_LEVEL==3: extended accuracy and Kahan Summation, error = 2^-64
+		switch (POutputCount) {
+		case 1: 
+			for (l = 0; l<D3; l++) {
+				double count;
+				__sumskipnan2we__(LInput+l*D2, D2, LOutputSum+l, &count, &flag_isNaN, W);
+			} 
+			break;
+		case 2: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan2we__(LInput+l*D2, D2, LOutputSum+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		case 3: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan3we__(LInput+l*D2, D2, LOutputSum+l, LOutputSum2+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		}
+	}
+	else if ((D1==1) && (ACC_LEVEL==2)) {
+		// ACC_LEVEL==2: double accuracy and Kahan Summation, error = 2^-52
+		switch (POutputCount) {
+		case 1: 
+			for (l = 0; l<D3; l++) {
+				double count;
+				__sumskipnan2wer__(LInput+l*D2, D2, LOutputSum+l, &count, &flag_isNaN, W);
+			} 
+			break;
+		case 2: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan2wer__(LInput+l*D2, D2, LOutputSum+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		case 3: 
+			for (l = 0; l<D3; l++) {
+				__sumskipnan3wer__(LInput+l*D2, D2, LOutputSum+l, LOutputSum2+l, LOutputCount+l, &flag_isNaN, W);
+			} 
+			break;
+		}
+	}
+	else if (POutputCount <= 1) {
+		// OUTER LOOP: along dimensions > DIM
+		for (l = 0; l<D3; l++) {
+			ix0 = l*D1; 	// index for output
+			ix1 = ix0*D2;	// index for input 
+			for (j=0; j<D2; j++) {
+				// minimize cache misses 
+				ix2 =   ix0;	// index for output 
+				// Inner LOOP: along dimensions < DIM
+				if (W) do {
+					long double x = *LInput;
+        				if (!isnan(x)) {
+						LongOutputSum[ix2]   += W[j]*x; 
+					}
+#ifndef NO_FLAG
+					else 
+						flag_isNaN = 1; 
+#endif 
+					LInput++;
+					ix2++;
+				} while (ix2 != (l+1)*D1);
+				else do {
+					long double x = *LInput;
+        				if (!isnan(x)) {
+						LongOutputSum[ix2]   += x; 
+					}
+#ifndef NO_FLAG
+					else 
+						flag_isNaN = 1; 
+#endif 
+					LInput++;
+					ix2++;
+				} while (ix2 != (l+1)*D1);
+			}	//	end for (j=	
+
+                        /* copy to output */
+               		for (j=0; j<D1; j++) {
+				LOutputSum[ix0+j] = LongOutputSum[ix0+j]; 
+			}
+               	}		
+	}
+
+	else if (POutputCount == 2) {
+		// OUTER LOOP: along dimensions > DIM
+		for (l = 0; l<D3; l++) {
+			ix0 = l*D1; 
+			ix1 = ix0*D2;	// index for input 
+			for (j=0; j<D2; j++) {
+				// minimize cache misses 
+				ix2 =   ix0;	// index for output 
+				// Inner LOOP: along dimensions < DIM
+				if (W) do {
+					long double x = *LInput;
+        				if (!isnan(x)) {
+						LongOutputCount[ix2] += W[j]; 
+						LongOutputSum[ix2]   += W[j]*x; 
+					}
+#ifndef NO_FLAG
+					else 
+						flag_isNaN = 1; 
+#endif
+					LInput++;
+					ix2++;
+				} while (ix2 != (l+1)*D1);
+				else do {
+					long double x = *LInput;
+        				if (!isnan(x)) {
+						LongOutputCount[ix2] += 1.0; 
+						LongOutputSum[ix2]   += x; 
+					}
+#ifndef NO_FLAG
+					else 
+						flag_isNaN = 1; 
+#endif
+					LInput++;
+					ix2++;
+				} while (ix2 != (l+1)*D1);
+			}	//	end for (j=	
+
+                                /* copy to output */
+	               	for (j=0; j<D1; j++) {
+				LOutputSum[ix0+j]   = LongOutputSum[ix0+j]; 
+				LOutputCount[ix0+j] = LongOutputCount[ix0+j]; 
+	       		}	// 	end else 
+               	}		
+	}
+
+	else if (POutputCount == 3) {
+		// OUTER LOOP: along dimensions > DIM
+		for (l = 0; l<D3; l++) {
+			ix0 = l*D1; 
+			ix1 = ix0*D2;	// index for input 
+			for (j=0; j<D2; j++) {
+				// minimize cache misses 
+				ix2 =   ix0;	// index for output 
+				// Inner LOOP: along dimensions < DIM
+				if (W) do {
+					long double x = *LInput;
+        				if (!isnan(x)) {
+						LongOutputCount[ix2] += W[j]; 
+						double t = W[j]*x;
+						LongOutputSum[ix2]   += t; 
+						LongOutputSum2[ix2]  += x*t; 
+					}
+#ifndef NO_FLAG
+					else 
+						flag_isNaN = 1; 
+#endif
+					LInput++;
+					ix2++;	
+				} while (ix2 != (l+1)*D1);
+				else do {
+					long double x = *LInput;
+        				if (!isnan(x)) {
+						LongOutputCount[ix2] += 1.0; 
+						LongOutputSum[ix2]   += x; 
+						LongOutputSum2[ix2]  += x*x; 
+					}
+#ifndef NO_FLAG
+					else 
+						flag_isNaN = 1; 
+#endif
+					LInput++;
+					ix2++;	
+				} while (ix2 != (l+1)*D1);
+			}	//	end for (j=	
+
+                       /* copy to output */
+	        	for (j=0; j<D1; j++) {
+				LOutputSum[ix0+j]   = LongOutputSum[ix0+j]; 
+				LOutputCount[ix0+j] = LongOutputCount[ix0+j]; 
+				LOutputSum2[ix0+j]  = LongOutputSum2[ix0+j]; 
+			}
+               	}		
+	}
+	if (LongOutputSum) mxFree(LongOutputSum);
+	if (LongOutputCount) mxFree(LongOutputCount);
+	if (LongOutputSum2) mxFree(LongOutputSum2);
+
+#ifndef NO_FLAG
+	//mexPrintf("Third argument must be not empty - otherwise status whether a NaN occured or not cannot be returned.");
+	/* this is a hack, the third input argument is used to return whether a NaN occured or not. 
+		this requires that the input argument is a non-empty variable
+	*/	
+	if  (flag_isNaN && (PInputCount > 2) && mxGetNumberOfElements(PInputs[2])) {
+    		// set FLAG_NANS_OCCURED 
+    		switch (mxGetClassID(PInputs[2])) {
+    		case mxLOGICAL_CLASS:
+    		case mxCHAR_CLASS:
+    		case mxINT8_CLASS:
+    		case mxUINT8_CLASS:
+    			*(uint8_t*)mxGetData(PInputs[2]) = 1;
+    			break; 
+    		case mxDOUBLE_CLASS:
+    			*(double*)mxGetData(PInputs[2]) = 1.0;
+    			break; 
+    		case mxSINGLE_CLASS:
+    			*(float*)mxGetData(PInputs[2]) = 1.0;
+    			break; 
+    		case mxINT16_CLASS:
+    		case mxUINT16_CLASS:
+    			*(uint16_t*)mxGetData(PInputs[2]) = 1;
+    			break; 
+    		case mxINT32_CLASS:
+    		case mxUINT32_CLASS:
+    			*(uint32_t*)mxGetData(PInputs[2])= 1;
+    			break; 
+    		case mxINT64_CLASS:
+    		case mxUINT64_CLASS:
+    			*(uint64_t*)mxGetData(PInputs[2]) = 1;
+    			break; 
+    		case mxFUNCTION_CLASS:
+    		case mxUNKNOWN_CLASS:
+    		case mxCELL_CLASS:
+    		case mxSTRUCT_CLASS:
+    			mexPrintf("Type of 3rd input argument not supported.");
+		}
+	}
+#endif
+}
+
+
+#define stride 1 
+inline int __sumskipnan2w__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W)
+{
+	long double sum=0; 
+	char   flag=0; 
+	// LOOP  along dimension DIM
+	
+	double *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		long double count = 0.0;
+		do {
+			long double x = *data;
+        		if (!isnan(x))
+			{
+				count += *W; 
+				sum   += *W*x;
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif	
+
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {
+		// w/o weight vector 
+		size_t countI = 0;
+		do {
+			long double x = *data;
+        		if (!isnan(x))
+			{
+				countI++; 
+				sum += x; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}	
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+
+}
+
+
+inline int __sumskipnan3w__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W)
+{
+	long double sum=0; 
+	long double msq=0; 
+	char   flag=0;
+	// LOOP  along dimension DIM
+	
+	double *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		long double count = 0.0;
+		do {
+			long double x = *data;
+        		if (!isnan(x)) {
+				count += *W;
+				long double t = *W*x; 
+				sum += t; 
+				msq += x*t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {	
+		// w/o weight vector 
+		size_t countI = 0;
+		do {
+			long double x = *data;
+        		if (!isnan(x)) {
+				countI++; 
+				sum += x; 
+				msq += x*x; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+	*s2 = msq; 
+}
+
+inline int __sumskipnan2wr__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W)
+{
+	double sum=0; 
+	char   flag=0; 
+	// LOOP  along dimension DIM
+	
+	double *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		double count = 0.0;
+		do {
+			double x = *data;
+        		if (!isnan(x))
+			{
+				count += *W; 
+				sum   += *W*x;
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif	
+
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {
+		// w/o weight vector 
+		size_t countI = 0;
+		do {
+			double x = *data;
+        		if (!isnan(x))
+			{
+				countI++; 
+				sum += x; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}	
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+
+}
+
+
+inline int __sumskipnan3wr__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W)
+{
+	double sum=0; 
+	double msq=0; 
+	char   flag=0;
+	// LOOP  along dimension DIM
+	
+	double *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		double count = 0.0;
+		do {
+			double x = *data;
+        		if (!isnan(x)) {
+				count += *W;
+				double t = *W*x; 
+				sum += t; 
+				msq += x*t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {	
+		// w/o weight vector 
+		size_t countI = 0;
+		do {
+			double x = *data;
+        		if (!isnan(x)) {
+				countI++; 
+				sum += x; 
+				msq += x*x; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+	*s2 = msq; 
+}
+
+
+
+/***************************************
+        using Kahan's summation formula [1] 
+        this gives more accurate results while the computational effort within the loop is about 4x as high  
+        First tests show a penalty of about 40% in terms of computational time.   
+
+        [1] David Goldberg, 
+        What Every Computer Scientist Should Know About Floating-Point Arithmetic
+        ACM Computing Surveys, Vol 23, No 1, March 1991. 
+ ****************************************/
+
+inline int __sumskipnan2we__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W)
+{
+	long double sum=0; 
+	char   flag=0; 
+	// LOOP  along dimension DIM
+	
+	double *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		long double count = 0.0;
+		long double rc=0.0, rn=0.0;
+		do {
+			long double x = *data;
+		        long double t,y; 
+        		if (!isnan(x))
+			{
+				//count += *W; [1]
+        			y = *W-rn;
+        			t = count+y;
+               			rn= (t-count)-y;
+		        	count= t; 
+
+				//sum   += *W*x; [1]
+        			y = *W*x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif	
+
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {
+		// w/o weight vector 
+		size_t countI = 0;
+		long double rc=0.0;
+		do {
+			long double x = *data;
+		        long double t,y; 
+        		if (!isnan(x))
+			{
+				countI++; 
+				// sum += x; [1]  
+        			y = x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}	
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+
+}
+
+
+inline int __sumskipnan3we__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W)
+{
+	long double sum=0; 
+	long double msq=0; 
+	char   flag=0;
+	// LOOP  along dimension DIM
+	
+	double *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		long double count = 0.0;
+        	long double rc=0.0, rn=0.0, rq=0.0;
+		do {
+			long double x = *data;
+		        long double t,y; 
+        		if (!isnan(x)) {
+				//count += *W; [1]
+        			y = *W-rn;
+        			t = count+y;
+               			rn= (t-count)-y;
+		        	count= t; 
+
+				long double w = *W*x; 
+				//sum   += *W*x; [1]
+        			y = *W*x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+
+				// msq += x*w; 
+        			y = w*x-rq;
+        			t = msq+y;
+               			rq= (t-msq)-y;
+		        	msq= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {	
+		// w/o weight vector 
+		size_t countI = 0;
+        	long double rc=0.0, rq=0.0;
+		do {
+			long double x = *data;
+		        long double t,y; 
+        		if (!isnan(x)) {
+				countI++; 
+				//sum   += x; [1]
+        			y = x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+
+				// msq += x*x; 
+        			y = x*x-rq;
+        			t = msq+y;
+               			rq= (t-msq)-y;
+		        	msq= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+	*s2 = msq; 
+}
+
+inline int __sumskipnan2wer__(double *data, size_t Ni, double *s, double *No, char *flag_anyISNAN, double *W)
+{
+	double sum=0; 
+	char   flag=0; 
+	// LOOP  along dimension DIM
+	
+	double *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		double count = 0.0;
+		double rc=0.0, rn=0.0;
+		do {
+			double x = *data;
+		        double t,y; 
+        		if (!isnan(x))
+			{
+				//count += *W; [1]
+        			y = *W-rn;
+        			t = count+y;
+               			rn= (t-count)-y;
+		        	count= t; 
+
+				//sum   += *W*x; [1]
+        			y = *W*x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif	
+
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {
+		// w/o weight vector 
+		size_t countI = 0;
+		double rc=0.0;
+		do {
+			double x = *data;
+		        double t,y; 
+        		if (!isnan(x))
+			{
+				countI++; 
+				// sum += x; [1]  
+        			y = x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}	
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+
+}
+
+
+inline int __sumskipnan3wer__(double *data, size_t Ni, double *s, double *s2, double *No, char *flag_anyISNAN, double *W)
+{
+	double sum=0; 
+	double msq=0; 
+	char   flag=0;
+	// LOOP  along dimension DIM
+	
+	double *end = data + stride*Ni; 
+	if (W) {
+		// with weight vector 
+		double count = 0.0;
+        	double rc=0.0, rn=0.0, rq=0.0;
+		do {
+			double x = *data;
+		        double t,y; 
+        		if (!isnan(x)) {
+				//count += *W; [1]
+        			y = *W-rn;
+        			t = count+y;
+               			rn= (t-count)-y;
+		        	count= t; 
+
+				double w = *W*x; 
+				//sum   += *W*x; [1]
+        			y = *W*x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+
+				// msq += x*w; 
+        			y = w*x-rq;
+        			t = msq+y;
+               			rq= (t-msq)-y;
+		        	msq= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+			W++;
+		}
+		while (data < end);
+	        *No = count;
+	} else {	
+		// w/o weight vector 
+		size_t countI = 0;
+        	double rc=0.0, rq=0.0;
+		do {
+			double x = *data;
+		        double t,y; 
+        		if (!isnan(x)) {
+				countI++; 
+				//sum   += x; [1]
+        			y = x-rc;
+        			t = sum+y;
+               			rc= (t-sum)-y;
+		        	sum= t; 
+
+				// msq += x*x; 
+        			y = x*x-rq;
+        			t = msq+y;
+               			rq= (t-msq)-y;
+		        	msq= t; 
+			}
+#ifndef NO_FLAG
+			else 
+				flag = 1; 
+#endif
+			data++;	// stride=1
+		}
+		while (data < end);
+	        *No = (double)countI;
+	}
+	
+#ifndef NO_FLAG
+	if (flag && (flag_anyISNAN != NULL)) *flag_anyISNAN = 1; 
+#endif
+	*s  = sum;
+	*s2 = msq; 
+}
+