comparison main/sparse/SuperLU/SRC/zpanel_bmod.c @ 0:6b33357c7561 octave-forge

Initial revision
author pkienzle
date Wed, 10 Oct 2001 19:54:49 +0000
parents
children 7dad48fc229c
comparison
equal deleted inserted replaced
-1:000000000000 0:6b33357c7561
1
2
3 /*
4 * -- SuperLU routine (version 2.0) --
5 * Univ. of California Berkeley, Xerox Palo Alto Research Center,
6 * and Lawrence Berkeley National Lab.
7 * November 15, 1997
8 *
9 */
10 /*
11 Copyright (c) 1994 by Xerox Corporation. All rights reserved.
12
13 THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
14 EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
15
16 Permission is hereby granted to use or copy this program for any
17 purpose, provided the above notices are retained on all copies.
18 Permission to modify the code and to distribute modified code is
19 granted, provided the above notices are retained, and a notice that
20 the code was modified is included with the above copyright notice.
21 */
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include "zsp_defs.h"
26 #include "util.h"
27
28 /*
29 * Function prototypes
30 */
31 void zlsolve(int, int, doublecomplex *, doublecomplex *);
32 void zmatvec(int, int, int, doublecomplex *, doublecomplex *, doublecomplex *);
33 extern void zcheck_tempv();
34
35 void
36 zpanel_bmod (
37 const int m, /* in - number of rows in the matrix */
38 const int w, /* in */
39 const int jcol, /* in */
40 const int nseg, /* in */
41 doublecomplex *dense, /* out, of size n by w */
42 doublecomplex *tempv, /* working array */
43 int *segrep, /* in */
44 int *repfnz, /* in, of size n by w */
45 GlobalLU_t *Glu /* modified */
46 )
47 {
48 /*
49 * Purpose
50 * =======
51 *
52 * Performs numeric block updates (sup-panel) in topological order.
53 * It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
54 * Special processing on the supernodal portion of L\U[*,j]
55 *
56 * Before entering this routine, the original nonzeros in the panel
57 * were already copied into the spa[m,w].
58 *
59 * Updated/Output parameters-
60 * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned
61 * collectively in the m-by-w vector dense[*].
62 *
63 */
64
65 #ifdef USE_VENDOR_BLAS
66 #ifdef _CRAY
67 _fcd ftcs1 = _cptofcd("L", strlen("L")),
68 ftcs2 = _cptofcd("N", strlen("N")),
69 ftcs3 = _cptofcd("U", strlen("U"));
70 #endif
71 int incx = 1, incy = 1;
72 doublecomplex alpha, beta;
73 #endif
74
75 register int k, ksub;
76 int fsupc, nsupc, nsupr, nrow;
77 int krep, krep_ind;
78 doublecomplex ukj, ukj1, ukj2;
79 int luptr, luptr1, luptr2;
80 int segsze;
81 int block_nrow; /* no of rows in a block row */
82 register int lptr; /* Points to the row subscripts of a supernode */
83 int kfnz, irow, no_zeros;
84 register int isub, isub1, i;
85 register int jj; /* Index through each column in the panel */
86 int *xsup, *supno;
87 int *lsub, *xlsub;
88 doublecomplex *lusup;
89 int *xlusup;
90 int *repfnz_col; /* repfnz[] for a column in the panel */
91 doublecomplex *dense_col; /* dense[] for a column in the panel */
92 doublecomplex *tempv1; /* Used in 1-D update */
93 doublecomplex *TriTmp, *MatvecTmp; /* used in 2-D update */
94 doublecomplex zero = {0.0, 0.0};
95 doublecomplex one = {1.0, 0.0};
96 doublecomplex comp_temp, comp_temp1;
97 register int ldaTmp;
98 register int r_ind, r_hi;
99 static int first = 1, maxsuper, rowblk, colblk;
100 extern SuperLUStat_t SuperLUStat;
101 flops_t *ops = SuperLUStat.ops;
102
103 xsup = Glu->xsup;
104 supno = Glu->supno;
105 lsub = Glu->lsub;
106 xlsub = Glu->xlsub;
107 lusup = Glu->lusup;
108 xlusup = Glu->xlusup;
109
110 if ( first ) {
111 maxsuper = sp_ienv(3);
112 rowblk = sp_ienv(4);
113 colblk = sp_ienv(5);
114 first = 0;
115 }
116 ldaTmp = maxsuper + rowblk;
117
118 /*
119 * For each nonz supernode segment of U[*,j] in topological order
120 */
121 k = nseg - 1;
122 for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
123
124 /* krep = representative of current k-th supernode
125 * fsupc = first supernodal column
126 * nsupc = no of columns in a supernode
127 * nsupr = no of rows in a supernode
128 */
129 krep = segrep[k--];
130 fsupc = xsup[supno[krep]];
131 nsupc = krep - fsupc + 1;
132 nsupr = xlsub[fsupc+1] - xlsub[fsupc];
133 nrow = nsupr - nsupc;
134 lptr = xlsub[fsupc];
135 krep_ind = lptr + nsupc - 1;
136
137 repfnz_col = repfnz;
138 dense_col = dense;
139
140 if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
141
142 TriTmp = tempv;
143
144 /* Sequence through each column in panel -- triangular solves */
145 for (jj = jcol; jj < jcol + w; jj++,
146 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
147
148 kfnz = repfnz_col[krep];
149 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
150
151 segsze = krep - kfnz + 1;
152 luptr = xlusup[fsupc];
153
154 ops[TRSV] += 4 * segsze * (segsze - 1);
155 ops[GEMV] += 8 * nrow * segsze;
156
157 /* Case 1: Update U-segment of size 1 -- col-col update */
158 if ( segsze == 1 ) {
159 ukj = dense_col[lsub[krep_ind]];
160 luptr += nsupr*(nsupc-1) + nsupc;
161
162 for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
163 irow = lsub[i];
164 zz_mult(&comp_temp, &ukj, &lusup[luptr]);
165 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
166 ++luptr;
167 }
168
169 } else if ( segsze <= 3 ) {
170 ukj = dense_col[lsub[krep_ind]];
171 ukj1 = dense_col[lsub[krep_ind - 1]];
172 luptr += nsupr*(nsupc-1) + nsupc-1;
173 luptr1 = luptr - nsupr;
174
175 if ( segsze == 2 ) {
176 zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
177 z_sub(&ukj, &ukj, &comp_temp);
178 dense_col[lsub[krep_ind]] = ukj;
179 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
180 irow = lsub[i];
181 luptr++; luptr1++;
182 zz_mult(&comp_temp, &ukj, &lusup[luptr]);
183 zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
184 z_add(&comp_temp, &comp_temp, &comp_temp1);
185 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
186 }
187 } else {
188 ukj2 = dense_col[lsub[krep_ind - 2]];
189 luptr2 = luptr1 - nsupr;
190 zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
191 z_sub(&ukj1, &ukj1, &comp_temp);
192
193 zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
194 zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
195 z_add(&comp_temp, &comp_temp, &comp_temp1);
196 z_sub(&ukj, &ukj, &comp_temp);
197 dense_col[lsub[krep_ind]] = ukj;
198 dense_col[lsub[krep_ind-1]] = ukj1;
199 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
200 irow = lsub[i];
201 luptr++; luptr1++; luptr2++;
202 zz_mult(&comp_temp, &ukj, &lusup[luptr]);
203 zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
204 z_add(&comp_temp, &comp_temp, &comp_temp1);
205 zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
206 z_add(&comp_temp, &comp_temp, &comp_temp1);
207 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
208 }
209 }
210
211 } else { /* segsze >= 4 */
212
213 /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
214 holds the result of triangular solves. */
215 no_zeros = kfnz - fsupc;
216 isub = lptr + no_zeros;
217 for (i = 0; i < segsze; ++i) {
218 irow = lsub[isub];
219 TriTmp[i] = dense_col[irow]; /* Gather */
220 ++isub;
221 }
222
223 /* start effective triangle */
224 luptr += nsupr * no_zeros + no_zeros;
225
226 #ifdef USE_VENDOR_BLAS
227 #ifdef _CRAY
228 CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
229 &nsupr, TriTmp, &incx );
230 #else
231 ztrsv_( "L", "N", "U", &segsze, &lusup[luptr],
232 &nsupr, TriTmp, &incx );
233 #endif
234 #else
235 zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
236 #endif
237
238
239 } /* else ... */
240
241 } /* for jj ... end tri-solves */
242
243 /* Block row updates; push all the way into dense[*] block */
244 for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
245
246 r_hi = MIN(nrow, r_ind + rowblk);
247 block_nrow = MIN(rowblk, r_hi - r_ind);
248 luptr = xlusup[fsupc] + nsupc + r_ind;
249 isub1 = lptr + nsupc + r_ind;
250
251 repfnz_col = repfnz;
252 TriTmp = tempv;
253 dense_col = dense;
254
255 /* Sequence through each column in panel -- matrix-vector */
256 for (jj = jcol; jj < jcol + w; jj++,
257 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
258
259 kfnz = repfnz_col[krep];
260 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
261
262 segsze = krep - kfnz + 1;
263 if ( segsze <= 3 ) continue; /* skip unrolled cases */
264
265 /* Perform a block update, and scatter the result of
266 matrix-vector to dense[]. */
267 no_zeros = kfnz - fsupc;
268 luptr1 = luptr + nsupr * no_zeros;
269 MatvecTmp = &TriTmp[maxsuper];
270
271 #ifdef USE_VENDOR_BLAS
272 alpha = one;
273 beta = zero;
274 #ifdef _CRAY
275 CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
276 &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
277 #else
278 zgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
279 &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
280 #endif
281 #else
282 zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
283 TriTmp, MatvecTmp);
284 #endif
285
286 /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
287 * such that MatvecTmp[*] can be re-used for the
288 * the next blok row update. dense[] will be copied into
289 * global store after the whole panel has been finished.
290 */
291 isub = isub1;
292 for (i = 0; i < block_nrow; i++) {
293 irow = lsub[isub];
294 z_sub(&dense_col[irow], &dense_col[irow],
295 &MatvecTmp[i]);
296 MatvecTmp[i] = zero;
297 ++isub;
298 }
299
300 } /* for jj ... */
301
302 } /* for each block row ... */
303
304 /* Scatter the triangular solves into SPA dense[*] */
305 repfnz_col = repfnz;
306 TriTmp = tempv;
307 dense_col = dense;
308
309 for (jj = jcol; jj < jcol + w; jj++,
310 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
311 kfnz = repfnz_col[krep];
312 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
313
314 segsze = krep - kfnz + 1;
315 if ( segsze <= 3 ) continue; /* skip unrolled cases */
316
317 no_zeros = kfnz - fsupc;
318 isub = lptr + no_zeros;
319 for (i = 0; i < segsze; i++) {
320 irow = lsub[isub];
321 dense_col[irow] = TriTmp[i];
322 TriTmp[i] = zero;
323 ++isub;
324 }
325
326 } /* for jj ... */
327
328 } else { /* 1-D block modification */
329
330
331 /* Sequence through each column in the panel */
332 for (jj = jcol; jj < jcol + w; jj++,
333 repfnz_col += m, dense_col += m) {
334
335 kfnz = repfnz_col[krep];
336 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
337
338 segsze = krep - kfnz + 1;
339 luptr = xlusup[fsupc];
340
341 ops[TRSV] += 4 * segsze * (segsze - 1);
342 ops[GEMV] += 8 * nrow * segsze;
343
344 /* Case 1: Update U-segment of size 1 -- col-col update */
345 if ( segsze == 1 ) {
346 ukj = dense_col[lsub[krep_ind]];
347 luptr += nsupr*(nsupc-1) + nsupc;
348
349 for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
350 irow = lsub[i];
351 zz_mult(&comp_temp, &ukj, &lusup[luptr]);
352 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
353 ++luptr;
354 }
355
356 } else if ( segsze <= 3 ) {
357 ukj = dense_col[lsub[krep_ind]];
358 luptr += nsupr*(nsupc-1) + nsupc-1;
359 ukj1 = dense_col[lsub[krep_ind - 1]];
360 luptr1 = luptr - nsupr;
361
362 if ( segsze == 2 ) {
363 zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
364 z_sub(&ukj, &ukj, &comp_temp);
365 dense_col[lsub[krep_ind]] = ukj;
366 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
367 irow = lsub[i];
368 ++luptr; ++luptr1;
369 zz_mult(&comp_temp, &ukj, &lusup[luptr]);
370 zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
371 z_add(&comp_temp, &comp_temp, &comp_temp1);
372 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
373 }
374 } else {
375 ukj2 = dense_col[lsub[krep_ind - 2]];
376 luptr2 = luptr1 - nsupr;
377 zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
378 z_sub(&ukj1, &ukj1, &comp_temp);
379
380 zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
381 zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
382 z_add(&comp_temp, &comp_temp, &comp_temp1);
383 z_sub(&ukj, &ukj, &comp_temp);
384 dense_col[lsub[krep_ind]] = ukj;
385 dense_col[lsub[krep_ind-1]] = ukj1;
386 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
387 irow = lsub[i];
388 ++luptr; ++luptr1; ++luptr2;
389 zz_mult(&comp_temp, &ukj, &lusup[luptr]);
390 zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
391 z_add(&comp_temp, &comp_temp, &comp_temp1);
392 zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
393 z_add(&comp_temp, &comp_temp, &comp_temp1);
394 z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
395 }
396 }
397
398 } else { /* segsze >= 4 */
399 /*
400 * Perform a triangular solve and block update,
401 * then scatter the result of sup-col update to dense[].
402 */
403 no_zeros = kfnz - fsupc;
404
405 /* Copy U[*,j] segment from dense[*] to tempv[*]:
406 * The result of triangular solve is in tempv[*];
407 * The result of matrix vector update is in dense_col[*]
408 */
409 isub = lptr + no_zeros;
410 for (i = 0; i < segsze; ++i) {
411 irow = lsub[isub];
412 tempv[i] = dense_col[irow]; /* Gather */
413 ++isub;
414 }
415
416 /* start effective triangle */
417 luptr += nsupr * no_zeros + no_zeros;
418
419 #ifdef USE_VENDOR_BLAS
420 #ifdef _CRAY
421 CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
422 &nsupr, tempv, &incx );
423 #else
424 ztrsv_( "L", "N", "U", &segsze, &lusup[luptr],
425 &nsupr, tempv, &incx );
426 #endif
427
428 luptr += segsze; /* Dense matrix-vector */
429 tempv1 = &tempv[segsze];
430 alpha = one;
431 beta = zero;
432 #ifdef _CRAY
433 CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
434 &nsupr, tempv, &incx, &beta, tempv1, &incy );
435 #else
436 zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
437 &nsupr, tempv, &incx, &beta, tempv1, &incy );
438 #endif
439 #else
440 zlsolve ( nsupr, segsze, &lusup[luptr], tempv );
441
442 luptr += segsze; /* Dense matrix-vector */
443 tempv1 = &tempv[segsze];
444 zmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
445 #endif
446
447 /* Scatter tempv[*] into SPA dense[*] temporarily, such
448 * that tempv[*] can be used for the triangular solve of
449 * the next column of the panel. They will be copied into
450 * ucol[*] after the whole panel has been finished.
451 */
452 isub = lptr + no_zeros;
453 for (i = 0; i < segsze; i++) {
454 irow = lsub[isub];
455 dense_col[irow] = tempv[i];
456 tempv[i] = zero;
457 isub++;
458 }
459
460 /* Scatter the update from tempv1[*] into SPA dense[*] */
461 /* Start dense rectangular L */
462 for (i = 0; i < nrow; i++) {
463 irow = lsub[isub];
464 z_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]);
465 tempv1[i] = zero;
466 ++isub;
467 }
468
469 } /* else segsze>=4 ... */
470
471 } /* for each column in the panel... */
472
473 } /* else 1-D update ... */
474
475 } /* for each updating supernode ... */
476
477 }
478
479
480