Mercurial > forge
annotate extra/NaN/src/covm_mex.cpp @ 12685:f26b1170ea90 octave-forge
resulting values should be really converted to output data type
author | schloegl |
---|---|
date | Sat, 12 Sep 2015 07:15:01 +0000 |
parents | de98e4cb9248 |
children | 6d6285a2a633 |
rev | line source |
---|---|
6549 | 1 /* |
2 //------------------------------------------------------------------- | |
3 // C-MEX implementation of COVM - this function is part of the NaN-toolbox. | |
4 // | |
5 // | |
6 // This program is free software; you can redistribute it and/or modify | |
7 // it under the terms of the GNU General Public License as published by | |
8 // the Free Software Foundation; either version 3 of the License, or | |
9 // (at your option) any later version. | |
10 // | |
11 // This program is distributed in the hope that it will be useful, | |
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 // GNU General Public License for more details. | |
15 // | |
16 // You should have received a copy of the GNU General Public License | |
17 // along with this program; if not, see <http://www.gnu.org/licenses/>. | |
18 // | |
19 // | |
20 // covm: in-product of matrices, NaN are skipped. | |
21 // usage: | |
22 // [cc,nn] = covm_mex(X,Y,flag,W); | |
23 // | |
24 // Input: | |
25 // - X: | |
26 // - Y: [optional], if empty, Y=X; | |
27 // - flag: if not empty, it is set to 1 if some NaN was observed | |
28 // - W: weight vector to compute weighted correlation | |
29 // | |
30 // Output: | |
31 // - CC = X' * sparse(diag(W)) * Y while NaN's are skipped | |
32 // - NN = real(~isnan(X)')*sparse(diag(W))*real(~isnan(Y)) count of valid (non-NaN) elements | |
33 // computed more efficiently | |
34 // | |
35 // $Id$ | |
8037 | 36 // Copyright (C) 2009,2010,2011 Alois Schloegl <alois.schloegl@gmail.com> |
6549 | 37 // This function is part of the NaN-toolbox |
7889 | 38 // http://pub.ist.ac.at/~schloegl/matlab/NaN/ |
6549 | 39 // |
40 //------------------------------------------------------------------- | |
41 */ | |
42 | |
6585 | 43 #ifdef __GNUC__ |
7888 | 44 #include <stdint.h> |
6585 | 45 #endif |
6549 | 46 #include <math.h> |
47 #include "mex.h" | |
48 | |
49 /*#define NO_FLAG*/ | |
50 | |
51 | |
52 void mexFunction(int POutputCount, mxArray* POutput[], int PInputCount, const mxArray *PInputs[]) | |
53 { | |
7992 | 54 double *X0=NULL, *Y0=NULL, *W=NULL; |
55 double *CC; | |
56 double *NN = NULL; | |
6549 | 57 |
7992 | 58 size_t rX,cX,rY,cY; |
59 size_t i; | |
60 char flag_isNaN = 0; | |
6585 | 61 int ACC_LEVEL; |
6549 | 62 |
63 /*********** check input arguments *****************/ | |
64 | |
65 // check for proper number of input and output arguments | |
66 if ((PInputCount <= 0) || (PInputCount > 5)) { | |
67 mexPrintf("usage: [CC,NN] = covm_mex(X [,Y [,flag [,W [,'E']]]])\n\n"); | |
68 mexPrintf("Do not use COVM_MEX directly, use COVM instead. \n"); | |
69 /* | |
70 mexPrintf("\nCOVM_MEX computes the covariance matrix of real matrices and skips NaN's\n"); | |
71 mexPrintf("\t[CC,NN] = covm_mex(...)\n\t\t computes CC=X'*Y, NN contains the number of not-NaN elements\n"); | |
72 mexPrintf("\t\t CC./NN is the unbiased covariance matrix\n"); | |
73 mexPrintf("\t... = covm_mex(X,Y,...)\n\t\t computes CC=X'*sparse(diag(W))*Y, number of rows of X and Y must match\n"); | |
74 mexPrintf("\t... = covm_mex(X,[], ...)\n\t\t computes CC=X'*sparse(diag(W))*X\n"); | |
75 mexPrintf("\t... = covm_mex(...,flag,...)\n\t\t if flag is not empty, it is set to 1 if some NaN occured in X or Y\n"); | |
76 mexPrintf("\t... = covm_mex(...,W)\n\t\t W to compute weighted covariance, number of elements must match the number of rows of X\n"); | |
77 mexPrintf("\t\t if isempty(W), all weights are 1\n"); | |
78 mexPrintf("\t[CC,NN]=covm_mex(X,Y,flag,W)\n"); | |
79 */ return; | |
80 } | |
81 if (POutputCount > 2) | |
82 mexErrMsgTxt("covm.MEX has 1 to 2 output arguments."); | |
83 | |
84 | |
85 // get 1st argument | |
12640
de98e4cb9248
check for sparse matrices and and convert to full if needed
schloegl
parents:
8037
diff
changeset
|
86 if(mxIsDouble(PInputs[0]) && !mxIsComplex(PInputs[0]) && !mxIsSparse(PInputs[0]) ) |
6549 | 87 X0 = mxGetPr(PInputs[0]); |
88 else | |
12640
de98e4cb9248
check for sparse matrices and and convert to full if needed
schloegl
parents:
8037
diff
changeset
|
89 mexErrMsgTxt("First argument must be non-sparse REAL/DOUBLE."); |
6549 | 90 rX = mxGetM(PInputs[0]); |
91 cX = mxGetN(PInputs[0]); | |
92 | |
93 // get 2nd argument | |
94 if (PInputCount > 1) { | |
95 if (!mxGetNumberOfElements(PInputs[1])) | |
96 ; // Y0 = NULL; | |
97 | |
98 else if (mxIsDouble(PInputs[1]) && !mxIsComplex(PInputs[1])) | |
99 Y0 = mxGetPr(PInputs[1]); | |
100 | |
101 else | |
102 mexErrMsgTxt("Second argument must be REAL/DOUBLE."); | |
103 } | |
104 | |
105 | |
106 // get weight vector for weighted sumskipnan | |
107 if (PInputCount > 3) { | |
108 // get 4th argument | |
7992 | 109 size_t nW = mxGetNumberOfElements(PInputs[3]); |
6549 | 110 if (!nW) |
111 ; | |
112 else if (nW == rX) | |
113 W = mxGetPr(PInputs[3]); | |
114 else | |
115 mexErrMsgTxt("number of elements in W must match numbers of rows in X"); | |
116 } | |
117 | |
6585 | 118 #ifdef __GNUC__ |
119 ACC_LEVEL = 0; | |
6549 | 120 { |
121 mxArray *LEVEL = NULL; | |
122 int s = mexCallMATLAB(1, &LEVEL, 0, NULL, "flag_accuracy_level"); | |
123 if (!s) { | |
124 ACC_LEVEL = (int) mxGetScalar(LEVEL); | |
125 } | |
126 mxDestroyArray(LEVEL); | |
127 } | |
128 // mexPrintf("Accuracy Level=%i\n",ACC_LEVEL); | |
6585 | 129 #endif |
6549 | 130 if (Y0==NULL) { |
131 Y0 = X0; | |
132 rY = rX; | |
133 cY = cX; | |
134 } | |
135 else { | |
136 rY = mxGetM(PInputs[1]); | |
137 cY = mxGetN(PInputs[1]); | |
138 } | |
139 if (rX != rY) | |
140 mexErrMsgTxt("number of rows in X and Y do not match"); | |
141 | |
142 /*********** create output arguments *****************/ | |
143 | |
144 POutput[0] = mxCreateDoubleMatrix(cX, cY, mxREAL); | |
145 CC = mxGetPr(POutput[0]); | |
146 | |
147 if (POutputCount > 1) { | |
148 POutput[1] = mxCreateDoubleMatrix(cX, cY, mxREAL); | |
149 NN = mxGetPr(POutput[1]); | |
150 } | |
151 | |
152 | |
153 /*********** compute covariance *****************/ | |
154 | |
155 #if 0 | |
156 /*------ version 1 --------------------- | |
157 this solution is slower than the alternative solution below | |
158 for transposed matrices, this might be faster. | |
159 */ | |
160 for (k=0; k<rX; k++) { | |
161 double w; | |
162 if (W) { | |
163 w = W[k]; | |
164 for (i=0; i<cX; i++) { | |
165 double x = X0[k+i*rX]; | |
166 if (isnan(x)) { | |
167 #ifndef NO_FLAG | |
168 flag_isNaN = 1; | |
169 #endif | |
170 continue; | |
171 } | |
172 for (j=0; j<cY; j++) { | |
173 double y = Y0[k+j*rY]; | |
174 if (isnan(y)) { | |
175 #ifndef NO_FLAG | |
176 flag_isNaN = 1; | |
177 #endif | |
178 continue; | |
179 } | |
180 CC[i+j*cX] += x*y*w; | |
181 if (NN != NULL) | |
182 NN[i+j*cX] += w; | |
183 } | |
184 } | |
185 } | |
186 else for (i=0; i<cX; i++) { | |
187 double x = X0[k+i*rX]; | |
188 if (isnan(x)) { | |
189 #ifndef NO_FLAG | |
190 flag_isNaN = 1; | |
191 #endif | |
192 continue; | |
193 } | |
194 for (j=0; j<cY; j++) { | |
195 double y = Y0[k+j*rY]; | |
196 if (isnan(y)) { | |
197 #ifndef NO_FLAG | |
198 flag_isNaN = 1; | |
199 #endif | |
200 continue; | |
201 } | |
202 CC[i+j*cX] += x*y; | |
203 if (NN != NULL) | |
204 NN[i+j*cX] += 1.0; | |
205 } | |
206 } | |
207 } | |
208 | |
209 #else | |
7992 | 210 |
211 #pragma omp parallel | |
212 { | |
6585 | 213 #ifdef __GNUC__ |
214 if (ACC_LEVEL == 0) | |
215 #endif | |
216 { | |
6549 | 217 /*------ version 2 --------------------- |
218 using naive summation with double accuracy [1] | |
219 */ | |
220 if ( (X0 != Y0) || (cX != cY) ) | |
221 /******** X!=Y, output is not symetric *******/ | |
222 if (W) /* weighted version */ | |
7992 | 223 #pragma omp for schedule(dynamic) nowait |
224 for (i = 0; i < cX * cY; i++) | |
225 { | |
226 double *X = X0 + (i%cX) * rX; | |
227 double *Y = Y0 + (i/cX) * rY; | |
228 double cc = 0.0; | |
229 double nw = 0.0; | |
230 size_t k; | |
6549 | 231 for (k=0; k<rX; k++) { |
232 double z = X[k]*Y[k]; | |
233 if (isnan(z)) { | |
234 #ifndef NO_FLAG | |
235 flag_isNaN = 1; | |
236 #endif | |
237 continue; | |
238 } | |
239 cc += z*W[k]; | |
6585 | 240 nw += W[k]; |
6549 | 241 } |
7992 | 242 CC[i] = cc; |
6549 | 243 if (NN != NULL) |
7992 | 244 NN[i] = nw; |
6549 | 245 } |
246 else /* no weights, all weights are 1 */ | |
7992 | 247 #pragma omp for schedule(dynamic) nowait |
248 for (i = 0; i < cX * cY; i++) | |
249 { | |
250 double *X = X0 + (i%cX) * rX; | |
251 double *Y = Y0 + (i/cX) * rY; | |
252 double cc = 0.0; | |
253 size_t nn = 0; | |
254 size_t k; | |
6549 | 255 for (k=0; k<rX; k++) { |
256 double z = X[k]*Y[k]; | |
257 if (isnan(z)) { | |
258 #ifndef NO_FLAG | |
259 flag_isNaN = 1; | |
260 #endif | |
261 continue; | |
262 } | |
263 cc += z; | |
264 nn++; | |
265 } | |
7992 | 266 CC[i] = cc; |
6549 | 267 if (NN != NULL) |
7992 | 268 NN[i] = (double)nn; |
6549 | 269 } |
270 else // if (X0==Y0) && (cX==cY) | |
271 /******** X==Y, output is symetric *******/ | |
272 if (W) /* weighted version */ | |
7992 | 273 #pragma omp for schedule(dynamic) nowait |
274 for (i = 0; i < cX * cY; i++) | |
275 { | |
276 size_t ii = i%cX; | |
277 size_t jj = i/cX; | |
278 if (ii < jj) continue; | |
279 double *X = X0 + ii * rX; | |
280 double *Y = Y0 + jj * rY; | |
281 double cc = 0.0; | |
282 double nw = 0.0; | |
283 size_t k; | |
6549 | 284 for (k=0; k<rX; k++) { |
285 double z = X[k]*Y[k]; | |
286 if (isnan(z)) { | |
287 #ifndef NO_FLAG | |
288 flag_isNaN = 1; | |
289 #endif | |
290 continue; | |
291 } | |
292 cc += z*W[k]; | |
6585 | 293 nw += W[k]; |
7992 | 294 } |
295 size_t j = jj + ii*cX; | |
296 CC[i] = cc; | |
297 CC[j] = cc; | |
6549 | 298 if (NN != NULL) { |
7992 | 299 NN[i] = nw; |
300 NN[j] = nw; | |
301 } | |
6549 | 302 } |
303 else /* no weights, all weights are 1 */ | |
7992 | 304 #pragma omp for schedule(dynamic) nowait |
305 for (i = 0; i < cX * cY; i++) | |
306 { | |
307 size_t ii = i%cX; | |
308 size_t jj = i/cX; | |
309 if (ii < jj) continue; | |
310 double *X = X0 + ii * rX; | |
311 double *Y = Y0 + jj * rY; | |
312 double cc = 0.0; | |
313 size_t nn = 0; | |
314 size_t k; | |
6549 | 315 for (k=0; k<rX; k++) { |
316 double z = X[k]*Y[k]; | |
317 if (isnan(z)) { | |
318 #ifndef NO_FLAG | |
319 flag_isNaN = 1; | |
320 #endif | |
321 continue; | |
322 } | |
323 cc += z; | |
324 nn++; | |
325 } | |
7992 | 326 size_t j = jj + ii*cX; |
327 CC[i] = cc; | |
328 CC[j] = cc; | |
6549 | 329 if (NN != NULL) { |
7992 | 330 NN[i] = (double)nn; |
331 NN[j] = (double)nn; | |
6549 | 332 } |
333 } | |
334 | |
335 } | |
6585 | 336 |
337 #ifdef __GNUC__ | |
338 | |
6549 | 339 else if (ACC_LEVEL == 1) { |
340 /*------ version 2 --------------------- | |
341 using naive summation with extended accuracy [1] | |
342 */ | |
343 if ( (X0 != Y0) || (cX != cY) ) | |
344 /******** X!=Y, output is not symetric *******/ | |
345 if (W) /* weighted version */ | |
7992 | 346 #pragma omp for schedule(dynamic) nowait |
347 for (i = 0; i < cX * cY; i++) | |
348 { | |
349 double *X = X0 + (i%cX) * rX; | |
350 double *Y = Y0 + (i/cX) * rY; | |
6549 | 351 long double cc=0.0; |
352 long double nn=0.0; | |
7992 | 353 size_t k; |
6549 | 354 for (k=0; k<rX; k++) { |
355 long double z = ((long double)X[k])*Y[k]; | |
356 if (isnan(z)) { | |
357 #ifndef NO_FLAG | |
358 flag_isNaN = 1; | |
359 #endif | |
360 continue; | |
361 } | |
362 cc += z*W[k]; | |
363 nn += W[k]; | |
364 } | |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
365 CC[i] = (typeof(*CC))cc; |
6549 | 366 if (NN != NULL) |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
367 NN[i] = (typeof(*NN))nn; |
6549 | 368 } |
369 else /* no weights, all weights are 1 */ | |
7992 | 370 #pragma omp for schedule(dynamic) nowait |
371 for (i = 0; i < cX * cY; i++) | |
372 { | |
373 double *X = X0 + (i%cX) * rX; | |
374 double *Y = Y0 + (i/cX) * rY; | |
6549 | 375 long double cc=0.0; |
376 size_t nn=0; | |
7992 | 377 size_t k; |
6549 | 378 for (k=0; k<rX; k++) { |
379 long double z = ((long double)X[k])*Y[k]; | |
380 if (isnan(z)) { | |
381 #ifndef NO_FLAG | |
382 flag_isNaN = 1; | |
383 #endif | |
384 continue; | |
385 } | |
386 cc += z; | |
387 nn++; | |
388 } | |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
389 CC[i] = (typeof(*CC))cc; |
6549 | 390 if (NN != NULL) |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
391 NN[i] = (typeof(*NN))nn; |
6549 | 392 } |
393 else // if (X0==Y0) && (cX==cY) | |
394 /******** X==Y, output is symetric *******/ | |
395 if (W) /* weighted version */ | |
7992 | 396 #pragma omp for schedule(dynamic) nowait |
397 for (i = 0; i < cX * cY; i++) | |
398 { | |
399 size_t ii = i%cX; | |
400 size_t jj = i/cX; | |
401 if (ii < jj) continue; | |
402 double *X = X0 + ii * rX; | |
403 double *Y = Y0 + jj * rY; | |
6549 | 404 long double cc=0.0; |
405 long double nn=0.0; | |
7992 | 406 size_t k; |
6549 | 407 for (k=0; k<rX; k++) { |
408 long double z = ((long double)X[k])*Y[k]; | |
409 if (isnan(z)) { | |
410 #ifndef NO_FLAG | |
411 flag_isNaN = 1; | |
412 #endif | |
413 continue; | |
414 } | |
415 cc += z*W[k]; | |
416 nn += W[k]; | |
417 } | |
7992 | 418 size_t j = jj + ii*cX; |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
419 CC[i] = (typeof(*CC))cc; |
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
420 CC[j] = (typeof(*CC))cc; |
6549 | 421 if (NN != NULL) { |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
422 NN[i] = (typeof(*NN))nn; |
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
423 NN[j] = (typeof(*NN))nn; |
6549 | 424 } |
425 } | |
426 else /* no weights, all weights are 1 */ | |
7992 | 427 #pragma omp for schedule(dynamic) nowait |
428 for (i = 0; i < cX * cY; i++) | |
429 { | |
430 size_t ii = i%cX; | |
431 size_t jj = i/cX; | |
432 if (ii < jj) continue; | |
433 double *X = X0 + ii * rX; | |
434 double *Y = Y0 + jj * rY; | |
6549 | 435 long double cc=0.0; |
436 size_t nn=0; | |
7992 | 437 size_t k; |
6549 | 438 for (k=0; k<rX; k++) { |
439 long double z = ((long double)X[k])*Y[k]; | |
440 if (isnan(z)) { | |
441 #ifndef NO_FLAG | |
442 flag_isNaN = 1; | |
443 #endif | |
444 continue; | |
445 } | |
446 cc += z; | |
447 nn++; | |
448 } | |
7992 | 449 size_t j = jj + ii*cX; |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
450 CC[i] = (typeof(*CC))cc; |
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
451 CC[j] = (typeof(*CC))cc; |
6549 | 452 if (NN != NULL) { |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
453 NN[i] = (typeof(*NN))nn; |
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
454 NN[j] = (typeof(*NN))nn; |
6549 | 455 } |
456 } | |
457 | |
458 } | |
459 else if (ACC_LEVEL == 3) { | |
460 /*------ version 3 --------------------- | |
461 using Kahan's summation with extended (long double) accuracy [1] | |
462 this gives more accurate results while the computational effort within the loop is about 4x as high | |
463 However, first test show an increase in computational time of only about 25 %. | |
464 | |
465 [1] David Goldberg, | |
466 What Every Computer Scientist Should Know About Floating-Point Arithmetic | |
467 ACM Computing Surveys, Vol 23, No 1, March 1991 | |
468 */ | |
469 if ( (X0 != Y0) || (cX != cY) ) | |
470 /******** X!=Y, output is not symetric *******/ | |
471 if (W) /* weighted version */ | |
7992 | 472 #pragma omp for schedule(dynamic) nowait |
473 for (i = 0; i < cX * cY; i++) | |
474 { | |
475 double *X = X0 + (i%cX) * rX; | |
476 double *Y = Y0 + (i/cX) * rY; | |
6549 | 477 long double cc=0.0; |
478 long double nn=0.0; | |
479 long double rc=0.0; | |
480 long double rn=0.0; | |
7992 | 481 size_t k; |
6549 | 482 for (k=0; k<rX; k++) { |
483 long double t,y; | |
484 long double z = ((long double)X[k])*Y[k]; | |
485 if (isnan(z)) { | |
486 #ifndef NO_FLAG | |
487 flag_isNaN = 1; | |
488 #endif | |
489 continue; | |
490 } | |
491 // cc += z*W[k]; [1] | |
492 y = z*W[k]-rc; | |
493 t = cc+y; | |
494 rc= (t-cc)-y; | |
495 cc= t; | |
496 | |
497 // nn += W[k]; [1] | |
498 y = z*W[k]-rn; | |
499 t = nn+y; | |
500 rn= (t-nn)-y; | |
501 nn= t; | |
502 } | |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
503 CC[i] = (typeof(*CC))cc; |
6549 | 504 if (NN != NULL) |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
505 NN[i] = (typeof(*NN))nn; |
6549 | 506 } |
507 else /* no weights, all weights are 1 */ | |
7992 | 508 #pragma omp for schedule(dynamic) nowait |
509 for (i = 0; i < cX * cY; i++) | |
510 { | |
511 double *X = X0 + (i%cX) * rX; | |
512 double *Y = Y0 + (i/cX) * rY; | |
6549 | 513 long double cc=0.0; |
514 long double rc=0.0; | |
515 size_t nn=0; | |
7992 | 516 size_t k; |
6549 | 517 for (k=0; k<rX; k++) { |
518 long double t,y; | |
519 long double z = ((long double)X[k])*Y[k]; | |
520 if (isnan(z)) { | |
521 #ifndef NO_FLAG | |
522 flag_isNaN = 1; | |
523 #endif | |
524 continue; | |
525 } | |
526 // cc += z; [1] | |
527 y = z-rc; | |
528 t = cc+y; | |
529 rc= (t-cc)-y; | |
530 cc= t; | |
531 | |
532 nn++; | |
533 } | |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
534 CC[i] = (typeof(*CC))cc; |
6549 | 535 if (NN != NULL) |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
536 NN[i] = (typeof(*NN))nn; |
6549 | 537 } |
538 else // if (X0==Y0) && (cX==cY) | |
539 /******** X==Y, output is symetric *******/ | |
540 if (W) /* weighted version */ | |
7992 | 541 #pragma omp for schedule(dynamic) nowait |
542 for (i = 0; i < cX * cY; i++) | |
543 { | |
544 size_t ii = i%cX; | |
545 size_t jj = i/cX; | |
546 if (ii < jj) continue; | |
547 double *X = X0 + ii * rX; | |
548 double *Y = Y0 + jj * rY; | |
6549 | 549 long double cc=0.0; |
550 long double nn=0.0; | |
551 long double rc=0.0; | |
552 long double rn=0.0; | |
7992 | 553 size_t k; |
6549 | 554 for (k=0; k<rX; k++) { |
555 long double t,y; | |
556 long double z = ((long double)X[k])*Y[k]; | |
557 if (isnan(z)) { | |
558 #ifndef NO_FLAG | |
559 flag_isNaN = 1; | |
560 #endif | |
561 continue; | |
562 } | |
563 // cc += z*W[k]; [1] | |
564 y = z*W[k]-rc; | |
565 t = cc+y; | |
566 rc= (t-cc)-y; | |
567 cc= t; | |
568 | |
569 // nn += W[k]; [1] | |
570 y = z*W[k]-rn; | |
571 t = nn+y; | |
572 rn= (t-nn)-y; | |
573 nn= t; | |
574 } | |
7992 | 575 size_t j = jj + ii*cX; |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
576 CC[i] = (typeof(*CC))cc; |
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
577 CC[j] = (typeof(*CC))cc; |
6549 | 578 if (NN != NULL) { |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
579 NN[i] = (typeof(*NN))nn; |
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
580 NN[j] = (typeof(*NN))nn; |
6549 | 581 } |
582 } | |
583 else /* no weights, all weights are 1 */ | |
7992 | 584 #pragma omp for schedule(dynamic) nowait |
585 for (i = 0; i < cX * cY; i++) | |
586 { | |
587 size_t ii = i%cX; | |
588 size_t jj = i/cX; | |
589 if (ii < jj) continue; | |
590 double *X = X0 + ii * rX; | |
591 double *Y = Y0 + jj * rY; | |
6549 | 592 long double cc=0.0; |
593 long double rc=0.0; | |
594 size_t nn=0; | |
7992 | 595 size_t k; |
6549 | 596 for (k=0; k<rX; k++) { |
597 long double t,y; | |
598 long double z = ((long double)X[k])*Y[k]; | |
599 if (isnan(z)) { | |
600 #ifndef NO_FLAG | |
601 flag_isNaN = 1; | |
602 #endif | |
603 continue; | |
604 } | |
605 // cc += z; [1] | |
606 y = z-rc; | |
607 t = cc+y; | |
608 rc= (t-cc)-y; | |
609 cc= t; | |
610 | |
611 nn++; | |
612 } | |
7992 | 613 size_t j = jj + ii*cX; |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
614 CC[i] = (typeof(*CC))cc; |
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
615 CC[j] = (typeof(*CC))cc; |
6549 | 616 if (NN != NULL) { |
12685
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
617 NN[i] = (typeof(*NN))nn; |
f26b1170ea90
resulting values should be really converted to output data type
schloegl
parents:
12640
diff
changeset
|
618 NN[j] = (typeof(*NN))nn; |
6549 | 619 } |
620 } | |
621 } | |
622 else if (ACC_LEVEL == 2) { | |
623 /*------ version 3 --------------------- | |
624 using Kahan's summation with double accuracy [1] | |
625 this gives more accurate results while the computational effort within the loop is about 4x as high | |
626 However, first test show an increase in computational time of only about 25 %. | |
627 | |
628 [1] David Goldberg, | |
629 What Every Computer Scientist Should Know About Floating-Point Arithmetic | |
630 ACM Computing Surveys, Vol 23, No 1, March 1991 | |
631 */ | |
632 if ( (X0 != Y0) || (cX != cY) ) | |
633 /******** X!=Y, output is not symetric *******/ | |
634 if (W) /* weighted version */ | |
7992 | 635 #pragma omp for schedule(dynamic) nowait |
636 for (i = 0; i < cX * cY; i++) | |
637 { | |
638 double *X = X0 + (i%cX) * rX; | |
639 double *Y = Y0 + (i/cX) * rY; | |
6549 | 640 double cc=0.0; |
641 double nn=0.0; | |
642 double rc=0.0; | |
643 double rn=0.0; | |
7992 | 644 size_t k; |
6549 | 645 for (k=0; k<rX; k++) { |
646 double t,y; | |
647 double z = X[k]*Y[k]; | |
648 if (isnan(z)) { | |
649 #ifndef NO_FLAG | |
650 flag_isNaN = 1; | |
651 #endif | |
652 continue; | |
653 } | |
654 // cc += z*W[k]; [1] | |
655 y = z*W[k]-rc; | |
656 t = cc+y; | |
657 rc= (t-cc)-y; | |
658 cc= t; | |
659 | |
660 // nn += W[k]; [1] | |
661 y = z*W[k]-rn; | |
662 t = nn+y; | |
663 rn= (t-nn)-y; | |
664 nn= t; | |
665 } | |
7992 | 666 CC[i] = cc; |
6549 | 667 if (NN != NULL) |
7992 | 668 NN[i] = nn; |
6549 | 669 } |
670 else /* no weights, all weights are 1 */ | |
7992 | 671 #pragma omp for schedule(dynamic) nowait |
672 for (i = 0; i < cX * cY; i++) | |
673 { | |
674 double *X = X0 + (i%cX) * rX; | |
675 double *Y = Y0 + (i/cX) * rY; | |
6549 | 676 double cc=0.0; |
677 double rc=0.0; | |
678 size_t nn=0; | |
7992 | 679 size_t k; |
6549 | 680 for (k=0; k<rX; k++) { |
681 double t,y; | |
682 double z = X[k]*Y[k]; | |
683 if (isnan(z)) { | |
684 #ifndef NO_FLAG | |
685 flag_isNaN = 1; | |
686 #endif | |
687 continue; | |
688 } | |
689 // cc += z; [1] | |
690 y = z-rc; | |
691 t = cc+y; | |
692 rc= (t-cc)-y; | |
693 cc= t; | |
694 | |
695 nn++; | |
696 } | |
7992 | 697 CC[i] = cc; |
6549 | 698 if (NN != NULL) |
7992 | 699 NN[i] = (double)nn; |
6549 | 700 } |
701 else // if (X0==Y0) && (cX==cY) | |
702 /******** X==Y, output is symetric *******/ | |
703 if (W) /* weighted version */ | |
7992 | 704 #pragma omp for schedule(dynamic) nowait |
705 for (i = 0; i < cX * cY; i++) | |
706 { | |
707 size_t ii = i%cX; | |
708 size_t jj = i/cX; | |
709 if (ii < jj) continue; | |
710 double *X = X0 + ii * rX; | |
711 double *Y = Y0 + jj * rY; | |
6549 | 712 double cc=0.0; |
713 double nn=0.0; | |
714 double rc=0.0; | |
715 double rn=0.0; | |
7992 | 716 size_t k; |
6549 | 717 for (k=0; k<rX; k++) { |
718 double t,y; | |
719 double z = X[k]*Y[k]; | |
720 if (isnan(z)) { | |
721 #ifndef NO_FLAG | |
722 flag_isNaN = 1; | |
723 #endif | |
724 continue; | |
725 } | |
726 // cc += z*W[k]; [1] | |
727 y = z*W[k]-rc; | |
728 t = cc+y; | |
729 rc= (t-cc)-y; | |
730 cc= t; | |
731 | |
732 // nn += W[k]; [1] | |
733 y = z*W[k]-rn; | |
734 t = nn+y; | |
735 rn= (t-nn)-y; | |
736 nn= t; | |
737 } | |
7992 | 738 size_t j = jj + ii*cX; |
739 CC[i] = cc; | |
740 CC[j] = cc; | |
6549 | 741 if (NN != NULL) { |
7992 | 742 NN[i] = nn; |
743 NN[j] = nn; | |
6549 | 744 } |
745 } | |
746 else /* no weights, all weights are 1 */ | |
7992 | 747 #pragma omp for schedule(dynamic) nowait |
748 for (i = 0; i < cX * cY; i++) | |
749 { | |
750 size_t ii = i%cX; | |
751 size_t jj = i/cX; | |
752 if (ii < jj) continue; | |
753 double *X = X0 + ii * rX; | |
754 double *Y = Y0 + jj * rY; | |
6549 | 755 double cc=0.0; |
756 double rc=0.0; | |
757 size_t nn=0; | |
7992 | 758 size_t k; |
6549 | 759 for (k=0; k<rX; k++) { |
760 double t,y; | |
761 double z = X[k]*Y[k]; | |
762 if (isnan(z)) { | |
763 #ifndef NO_FLAG | |
764 flag_isNaN = 1; | |
765 #endif | |
766 continue; | |
767 } | |
768 // cc += z; [1] | |
769 y = z-rc; | |
770 t = cc+y; | |
771 rc= (t-cc)-y; | |
772 cc= t; | |
773 | |
774 nn++; | |
775 } | |
7992 | 776 size_t j = jj + ii*cX; |
777 CC[i] = cc; | |
778 CC[j] = cc; | |
6549 | 779 if (NN != NULL) { |
7992 | 780 NN[i] = (double)nn; |
781 NN[j] = (double)nn; | |
6549 | 782 } |
783 } | |
784 } | |
6585 | 785 #endif |
7992 | 786 } // end pragma omg parallel |
787 | |
788 | |
6549 | 789 #ifndef NO_FLAG |
790 //mexPrintf("Third argument must be not empty - otherwise status whether a NaN occured or not cannot be returned."); | |
791 /* this is a hack, the third input argument is used to return whether a NaN occured or not. | |
792 this requires that the input argument is a non-empty variable | |
793 */ | |
794 if (flag_isNaN && (PInputCount > 2) && mxGetNumberOfElements(PInputs[2])) { | |
795 // set FLAG_NANS_OCCURED | |
796 switch (mxGetClassID(PInputs[2])) { | |
797 case mxDOUBLE_CLASS: | |
798 *(double*)mxGetData(PInputs[2]) = 1.0; | |
799 break; | |
800 case mxSINGLE_CLASS: | |
801 *(float*)mxGetData(PInputs[2]) = 1.0; | |
802 break; | |
6585 | 803 case mxLOGICAL_CLASS: |
804 case mxCHAR_CLASS: | |
805 case mxINT8_CLASS: | |
806 case mxUINT8_CLASS: | |
807 *(char*)mxGetData(PInputs[2]) = 1; | |
808 break; | |
809 #ifdef __GNUC__ | |
6549 | 810 case mxINT16_CLASS: |
811 case mxUINT16_CLASS: | |
812 *(uint16_t*)mxGetData(PInputs[2]) = 1; | |
813 break; | |
814 case mxINT32_CLASS: | |
815 case mxUINT32_CLASS: | |
816 *(uint32_t*)mxGetData(PInputs[2])= 1; | |
817 break; | |
818 case mxINT64_CLASS: | |
819 case mxUINT64_CLASS: | |
820 *(uint64_t*)mxGetData(PInputs[2]) = 1; | |
821 break; | |
822 case mxFUNCTION_CLASS: | |
823 case mxUNKNOWN_CLASS: | |
824 case mxCELL_CLASS: | |
825 case mxSTRUCT_CLASS: | |
6585 | 826 #endif |
827 default: | |
6549 | 828 mexPrintf("Type of 3rd input argument cannot be used to return status of NaN occurence."); |
829 } | |
830 } | |
831 #endif | |
832 #endif | |
833 } | |
834 |