Mercurial > forge
comparison main/sparse/SuperLU/SRC/zpanel_bmod.c @ 0:6b33357c7561 octave-forge
Initial revision
author | pkienzle |
---|---|
date | Wed, 10 Oct 2001 19:54:49 +0000 |
parents | |
children | 7dad48fc229c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6b33357c7561 |
---|---|
1 | |
2 | |
3 /* | |
4 * -- SuperLU routine (version 2.0) -- | |
5 * Univ. of California Berkeley, Xerox Palo Alto Research Center, | |
6 * and Lawrence Berkeley National Lab. | |
7 * November 15, 1997 | |
8 * | |
9 */ | |
10 /* | |
11 Copyright (c) 1994 by Xerox Corporation. All rights reserved. | |
12 | |
13 THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY | |
14 EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. | |
15 | |
16 Permission is hereby granted to use or copy this program for any | |
17 purpose, provided the above notices are retained on all copies. | |
18 Permission to modify the code and to distribute modified code is | |
19 granted, provided the above notices are retained, and a notice that | |
20 the code was modified is included with the above copyright notice. | |
21 */ | |
22 | |
23 #include <stdio.h> | |
24 #include <stdlib.h> | |
25 #include "zsp_defs.h" | |
26 #include "util.h" | |
27 | |
28 /* | |
29 * Function prototypes | |
30 */ | |
31 void zlsolve(int, int, doublecomplex *, doublecomplex *); | |
32 void zmatvec(int, int, int, doublecomplex *, doublecomplex *, doublecomplex *); | |
33 extern void zcheck_tempv(); | |
34 | |
35 void | |
36 zpanel_bmod ( | |
37 const int m, /* in - number of rows in the matrix */ | |
38 const int w, /* in */ | |
39 const int jcol, /* in */ | |
40 const int nseg, /* in */ | |
41 doublecomplex *dense, /* out, of size n by w */ | |
42 doublecomplex *tempv, /* working array */ | |
43 int *segrep, /* in */ | |
44 int *repfnz, /* in, of size n by w */ | |
45 GlobalLU_t *Glu /* modified */ | |
46 ) | |
47 { | |
48 /* | |
49 * Purpose | |
50 * ======= | |
51 * | |
52 * Performs numeric block updates (sup-panel) in topological order. | |
53 * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. | |
54 * Special processing on the supernodal portion of L\U[*,j] | |
55 * | |
56 * Before entering this routine, the original nonzeros in the panel | |
57 * were already copied into the spa[m,w]. | |
58 * | |
59 * Updated/Output parameters- | |
60 * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned | |
61 * collectively in the m-by-w vector dense[*]. | |
62 * | |
63 */ | |
64 | |
65 #ifdef USE_VENDOR_BLAS | |
66 #ifdef _CRAY | |
67 _fcd ftcs1 = _cptofcd("L", strlen("L")), | |
68 ftcs2 = _cptofcd("N", strlen("N")), | |
69 ftcs3 = _cptofcd("U", strlen("U")); | |
70 #endif | |
71 int incx = 1, incy = 1; | |
72 doublecomplex alpha, beta; | |
73 #endif | |
74 | |
75 register int k, ksub; | |
76 int fsupc, nsupc, nsupr, nrow; | |
77 int krep, krep_ind; | |
78 doublecomplex ukj, ukj1, ukj2; | |
79 int luptr, luptr1, luptr2; | |
80 int segsze; | |
81 int block_nrow; /* no of rows in a block row */ | |
82 register int lptr; /* Points to the row subscripts of a supernode */ | |
83 int kfnz, irow, no_zeros; | |
84 register int isub, isub1, i; | |
85 register int jj; /* Index through each column in the panel */ | |
86 int *xsup, *supno; | |
87 int *lsub, *xlsub; | |
88 doublecomplex *lusup; | |
89 int *xlusup; | |
90 int *repfnz_col; /* repfnz[] for a column in the panel */ | |
91 doublecomplex *dense_col; /* dense[] for a column in the panel */ | |
92 doublecomplex *tempv1; /* Used in 1-D update */ | |
93 doublecomplex *TriTmp, *MatvecTmp; /* used in 2-D update */ | |
94 doublecomplex zero = {0.0, 0.0}; | |
95 doublecomplex one = {1.0, 0.0}; | |
96 doublecomplex comp_temp, comp_temp1; | |
97 register int ldaTmp; | |
98 register int r_ind, r_hi; | |
99 static int first = 1, maxsuper, rowblk, colblk; | |
100 extern SuperLUStat_t SuperLUStat; | |
101 flops_t *ops = SuperLUStat.ops; | |
102 | |
103 xsup = Glu->xsup; | |
104 supno = Glu->supno; | |
105 lsub = Glu->lsub; | |
106 xlsub = Glu->xlsub; | |
107 lusup = Glu->lusup; | |
108 xlusup = Glu->xlusup; | |
109 | |
110 if ( first ) { | |
111 maxsuper = sp_ienv(3); | |
112 rowblk = sp_ienv(4); | |
113 colblk = sp_ienv(5); | |
114 first = 0; | |
115 } | |
116 ldaTmp = maxsuper + rowblk; | |
117 | |
118 /* | |
119 * For each nonz supernode segment of U[*,j] in topological order | |
120 */ | |
121 k = nseg - 1; | |
122 for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ | |
123 | |
124 /* krep = representative of current k-th supernode | |
125 * fsupc = first supernodal column | |
126 * nsupc = no of columns in a supernode | |
127 * nsupr = no of rows in a supernode | |
128 */ | |
129 krep = segrep[k--]; | |
130 fsupc = xsup[supno[krep]]; | |
131 nsupc = krep - fsupc + 1; | |
132 nsupr = xlsub[fsupc+1] - xlsub[fsupc]; | |
133 nrow = nsupr - nsupc; | |
134 lptr = xlsub[fsupc]; | |
135 krep_ind = lptr + nsupc - 1; | |
136 | |
137 repfnz_col = repfnz; | |
138 dense_col = dense; | |
139 | |
140 if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ | |
141 | |
142 TriTmp = tempv; | |
143 | |
144 /* Sequence through each column in panel -- triangular solves */ | |
145 for (jj = jcol; jj < jcol + w; jj++, | |
146 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { | |
147 | |
148 kfnz = repfnz_col[krep]; | |
149 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ | |
150 | |
151 segsze = krep - kfnz + 1; | |
152 luptr = xlusup[fsupc]; | |
153 | |
154 ops[TRSV] += 4 * segsze * (segsze - 1); | |
155 ops[GEMV] += 8 * nrow * segsze; | |
156 | |
157 /* Case 1: Update U-segment of size 1 -- col-col update */ | |
158 if ( segsze == 1 ) { | |
159 ukj = dense_col[lsub[krep_ind]]; | |
160 luptr += nsupr*(nsupc-1) + nsupc; | |
161 | |
162 for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { | |
163 irow = lsub[i]; | |
164 zz_mult(&comp_temp, &ukj, &lusup[luptr]); | |
165 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); | |
166 ++luptr; | |
167 } | |
168 | |
169 } else if ( segsze <= 3 ) { | |
170 ukj = dense_col[lsub[krep_ind]]; | |
171 ukj1 = dense_col[lsub[krep_ind - 1]]; | |
172 luptr += nsupr*(nsupc-1) + nsupc-1; | |
173 luptr1 = luptr - nsupr; | |
174 | |
175 if ( segsze == 2 ) { | |
176 zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); | |
177 z_sub(&ukj, &ukj, &comp_temp); | |
178 dense_col[lsub[krep_ind]] = ukj; | |
179 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { | |
180 irow = lsub[i]; | |
181 luptr++; luptr1++; | |
182 zz_mult(&comp_temp, &ukj, &lusup[luptr]); | |
183 zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); | |
184 z_add(&comp_temp, &comp_temp, &comp_temp1); | |
185 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); | |
186 } | |
187 } else { | |
188 ukj2 = dense_col[lsub[krep_ind - 2]]; | |
189 luptr2 = luptr1 - nsupr; | |
190 zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); | |
191 z_sub(&ukj1, &ukj1, &comp_temp); | |
192 | |
193 zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); | |
194 zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); | |
195 z_add(&comp_temp, &comp_temp, &comp_temp1); | |
196 z_sub(&ukj, &ukj, &comp_temp); | |
197 dense_col[lsub[krep_ind]] = ukj; | |
198 dense_col[lsub[krep_ind-1]] = ukj1; | |
199 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { | |
200 irow = lsub[i]; | |
201 luptr++; luptr1++; luptr2++; | |
202 zz_mult(&comp_temp, &ukj, &lusup[luptr]); | |
203 zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); | |
204 z_add(&comp_temp, &comp_temp, &comp_temp1); | |
205 zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); | |
206 z_add(&comp_temp, &comp_temp, &comp_temp1); | |
207 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); | |
208 } | |
209 } | |
210 | |
211 } else { /* segsze >= 4 */ | |
212 | |
213 /* Copy U[*,j] segment from dense[*] to TriTmp[*], which | |
214 holds the result of triangular solves. */ | |
215 no_zeros = kfnz - fsupc; | |
216 isub = lptr + no_zeros; | |
217 for (i = 0; i < segsze; ++i) { | |
218 irow = lsub[isub]; | |
219 TriTmp[i] = dense_col[irow]; /* Gather */ | |
220 ++isub; | |
221 } | |
222 | |
223 /* start effective triangle */ | |
224 luptr += nsupr * no_zeros + no_zeros; | |
225 | |
226 #ifdef USE_VENDOR_BLAS | |
227 #ifdef _CRAY | |
228 CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], | |
229 &nsupr, TriTmp, &incx ); | |
230 #else | |
231 ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], | |
232 &nsupr, TriTmp, &incx ); | |
233 #endif | |
234 #else | |
235 zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); | |
236 #endif | |
237 | |
238 | |
239 } /* else ... */ | |
240 | |
241 } /* for jj ... end tri-solves */ | |
242 | |
243 /* Block row updates; push all the way into dense[*] block */ | |
244 for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { | |
245 | |
246 r_hi = MIN(nrow, r_ind + rowblk); | |
247 block_nrow = MIN(rowblk, r_hi - r_ind); | |
248 luptr = xlusup[fsupc] + nsupc + r_ind; | |
249 isub1 = lptr + nsupc + r_ind; | |
250 | |
251 repfnz_col = repfnz; | |
252 TriTmp = tempv; | |
253 dense_col = dense; | |
254 | |
255 /* Sequence through each column in panel -- matrix-vector */ | |
256 for (jj = jcol; jj < jcol + w; jj++, | |
257 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { | |
258 | |
259 kfnz = repfnz_col[krep]; | |
260 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ | |
261 | |
262 segsze = krep - kfnz + 1; | |
263 if ( segsze <= 3 ) continue; /* skip unrolled cases */ | |
264 | |
265 /* Perform a block update, and scatter the result of | |
266 matrix-vector to dense[]. */ | |
267 no_zeros = kfnz - fsupc; | |
268 luptr1 = luptr + nsupr * no_zeros; | |
269 MatvecTmp = &TriTmp[maxsuper]; | |
270 | |
271 #ifdef USE_VENDOR_BLAS | |
272 alpha = one; | |
273 beta = zero; | |
274 #ifdef _CRAY | |
275 CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], | |
276 &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); | |
277 #else | |
278 zgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], | |
279 &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); | |
280 #endif | |
281 #else | |
282 zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], | |
283 TriTmp, MatvecTmp); | |
284 #endif | |
285 | |
286 /* Scatter MatvecTmp[*] into SPA dense[*] temporarily | |
287 * such that MatvecTmp[*] can be re-used for the | |
288 * the next blok row update. dense[] will be copied into | |
289 * global store after the whole panel has been finished. | |
290 */ | |
291 isub = isub1; | |
292 for (i = 0; i < block_nrow; i++) { | |
293 irow = lsub[isub]; | |
294 z_sub(&dense_col[irow], &dense_col[irow], | |
295 &MatvecTmp[i]); | |
296 MatvecTmp[i] = zero; | |
297 ++isub; | |
298 } | |
299 | |
300 } /* for jj ... */ | |
301 | |
302 } /* for each block row ... */ | |
303 | |
304 /* Scatter the triangular solves into SPA dense[*] */ | |
305 repfnz_col = repfnz; | |
306 TriTmp = tempv; | |
307 dense_col = dense; | |
308 | |
309 for (jj = jcol; jj < jcol + w; jj++, | |
310 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { | |
311 kfnz = repfnz_col[krep]; | |
312 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ | |
313 | |
314 segsze = krep - kfnz + 1; | |
315 if ( segsze <= 3 ) continue; /* skip unrolled cases */ | |
316 | |
317 no_zeros = kfnz - fsupc; | |
318 isub = lptr + no_zeros; | |
319 for (i = 0; i < segsze; i++) { | |
320 irow = lsub[isub]; | |
321 dense_col[irow] = TriTmp[i]; | |
322 TriTmp[i] = zero; | |
323 ++isub; | |
324 } | |
325 | |
326 } /* for jj ... */ | |
327 | |
328 } else { /* 1-D block modification */ | |
329 | |
330 | |
331 /* Sequence through each column in the panel */ | |
332 for (jj = jcol; jj < jcol + w; jj++, | |
333 repfnz_col += m, dense_col += m) { | |
334 | |
335 kfnz = repfnz_col[krep]; | |
336 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ | |
337 | |
338 segsze = krep - kfnz + 1; | |
339 luptr = xlusup[fsupc]; | |
340 | |
341 ops[TRSV] += 4 * segsze * (segsze - 1); | |
342 ops[GEMV] += 8 * nrow * segsze; | |
343 | |
344 /* Case 1: Update U-segment of size 1 -- col-col update */ | |
345 if ( segsze == 1 ) { | |
346 ukj = dense_col[lsub[krep_ind]]; | |
347 luptr += nsupr*(nsupc-1) + nsupc; | |
348 | |
349 for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { | |
350 irow = lsub[i]; | |
351 zz_mult(&comp_temp, &ukj, &lusup[luptr]); | |
352 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); | |
353 ++luptr; | |
354 } | |
355 | |
356 } else if ( segsze <= 3 ) { | |
357 ukj = dense_col[lsub[krep_ind]]; | |
358 luptr += nsupr*(nsupc-1) + nsupc-1; | |
359 ukj1 = dense_col[lsub[krep_ind - 1]]; | |
360 luptr1 = luptr - nsupr; | |
361 | |
362 if ( segsze == 2 ) { | |
363 zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); | |
364 z_sub(&ukj, &ukj, &comp_temp); | |
365 dense_col[lsub[krep_ind]] = ukj; | |
366 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { | |
367 irow = lsub[i]; | |
368 ++luptr; ++luptr1; | |
369 zz_mult(&comp_temp, &ukj, &lusup[luptr]); | |
370 zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); | |
371 z_add(&comp_temp, &comp_temp, &comp_temp1); | |
372 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); | |
373 } | |
374 } else { | |
375 ukj2 = dense_col[lsub[krep_ind - 2]]; | |
376 luptr2 = luptr1 - nsupr; | |
377 zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); | |
378 z_sub(&ukj1, &ukj1, &comp_temp); | |
379 | |
380 zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); | |
381 zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); | |
382 z_add(&comp_temp, &comp_temp, &comp_temp1); | |
383 z_sub(&ukj, &ukj, &comp_temp); | |
384 dense_col[lsub[krep_ind]] = ukj; | |
385 dense_col[lsub[krep_ind-1]] = ukj1; | |
386 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { | |
387 irow = lsub[i]; | |
388 ++luptr; ++luptr1; ++luptr2; | |
389 zz_mult(&comp_temp, &ukj, &lusup[luptr]); | |
390 zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); | |
391 z_add(&comp_temp, &comp_temp, &comp_temp1); | |
392 zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); | |
393 z_add(&comp_temp, &comp_temp, &comp_temp1); | |
394 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); | |
395 } | |
396 } | |
397 | |
398 } else { /* segsze >= 4 */ | |
399 /* | |
400 * Perform a triangular solve and block update, | |
401 * then scatter the result of sup-col update to dense[]. | |
402 */ | |
403 no_zeros = kfnz - fsupc; | |
404 | |
405 /* Copy U[*,j] segment from dense[*] to tempv[*]: | |
406 * The result of triangular solve is in tempv[*]; | |
407 * The result of matrix vector update is in dense_col[*] | |
408 */ | |
409 isub = lptr + no_zeros; | |
410 for (i = 0; i < segsze; ++i) { | |
411 irow = lsub[isub]; | |
412 tempv[i] = dense_col[irow]; /* Gather */ | |
413 ++isub; | |
414 } | |
415 | |
416 /* start effective triangle */ | |
417 luptr += nsupr * no_zeros + no_zeros; | |
418 | |
419 #ifdef USE_VENDOR_BLAS | |
420 #ifdef _CRAY | |
421 CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], | |
422 &nsupr, tempv, &incx ); | |
423 #else | |
424 ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], | |
425 &nsupr, tempv, &incx ); | |
426 #endif | |
427 | |
428 luptr += segsze; /* Dense matrix-vector */ | |
429 tempv1 = &tempv[segsze]; | |
430 alpha = one; | |
431 beta = zero; | |
432 #ifdef _CRAY | |
433 CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], | |
434 &nsupr, tempv, &incx, &beta, tempv1, &incy ); | |
435 #else | |
436 zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], | |
437 &nsupr, tempv, &incx, &beta, tempv1, &incy ); | |
438 #endif | |
439 #else | |
440 zlsolve ( nsupr, segsze, &lusup[luptr], tempv ); | |
441 | |
442 luptr += segsze; /* Dense matrix-vector */ | |
443 tempv1 = &tempv[segsze]; | |
444 zmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); | |
445 #endif | |
446 | |
447 /* Scatter tempv[*] into SPA dense[*] temporarily, such | |
448 * that tempv[*] can be used for the triangular solve of | |
449 * the next column of the panel. They will be copied into | |
450 * ucol[*] after the whole panel has been finished. | |
451 */ | |
452 isub = lptr + no_zeros; | |
453 for (i = 0; i < segsze; i++) { | |
454 irow = lsub[isub]; | |
455 dense_col[irow] = tempv[i]; | |
456 tempv[i] = zero; | |
457 isub++; | |
458 } | |
459 | |
460 /* Scatter the update from tempv1[*] into SPA dense[*] */ | |
461 /* Start dense rectangular L */ | |
462 for (i = 0; i < nrow; i++) { | |
463 irow = lsub[isub]; | |
464 z_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]); | |
465 tempv1[i] = zero; | |
466 ++isub; | |
467 } | |
468 | |
469 } /* else segsze>=4 ... */ | |
470 | |
471 } /* for each column in the panel... */ | |
472 | |
473 } /* else 1-D update ... */ | |
474 | |
475 } /* for each updating supernode ... */ | |
476 | |
477 } | |
478 | |
479 | |
480 |