Actual source code: inode.c
1: /*
2: This file provides high performance routines for the Inode format (compressed sparse row)
3: by taking advantage of rows with identical nonzero structure (I-nodes).
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #if defined(PETSC_HAVE_XMMINTRIN_H)
7: #include <xmmintrin.h>
8: #endif
10: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13: PetscInt i, count, m, n, min_mn, *ns_row, *ns_col;
15: PetscFunctionBegin;
16: n = A->cmap->n;
17: m = A->rmap->n;
18: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19: ns_row = a->inode.size_csr;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24: for (; count + 1 < n; count++, i++);
25: if (count < n) i++;
26: *size = i;
27: PetscFunctionReturn(PETSC_SUCCESS);
28: }
29: PetscCall(PetscMalloc1(n + 1, &ns_col));
30: ns_col[0] = 0;
32: /* Use the same row structure wherever feasible. */
33: for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
35: /* if m < n; pad up the remainder with inode_limit */
36: for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37: /* The last node is the odd ball. pad it up with the remaining rows; */
38: if (count < n) {
39: ns_col[i + 1] = ns_col[i] + (n - count);
40: i++;
41: } else if (count > n) {
42: /* Adjust for the over estimation */
43: ns_col[i] += n - count;
44: }
45: *size = i;
46: *ns = ns_col;
47: PetscFunctionReturn(PETSC_SUCCESS);
48: }
50: /*
51: This builds symmetric version of nonzero structure,
52: */
53: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54: {
55: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
56: PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57: PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
58: const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
60: PetscFunctionBegin;
61: nslim_row = a->inode.node_count;
62: m = A->rmap->n;
63: n = A->cmap->n;
64: PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
67: /* Use the row_inode as column_inode */
68: nslim_col = nslim_row;
69: ns_col = ns_row;
71: /* allocate space for reformatted inode structure */
72: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
75: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76: nsz = ns_col[i1 + 1] - ns_col[i1];
77: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
78: }
79: /* allocate space for row pointers */
80: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
81: *iia = ia;
82: PetscCall(PetscMalloc1(nslim_row + 1, &work));
84: /* determine the number of columns in each row */
85: ia[0] = oshift;
86: for (i1 = 0; i1 < nslim_row; i1++) {
87: row = ns_row[i1];
88: j = aj + ai[row] + ishift;
89: jmax = aj + ai[row + 1] + ishift;
90: if (j == jmax) continue; /* empty row */
91: col = *j++ + ishift;
92: i2 = tvc[col];
93: while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94: ia[i1 + 1]++;
95: ia[i2 + 1]++;
96: i2++; /* Start col of next node */
97: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98: i2 = tvc[col];
99: }
100: if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101: }
103: /* shift ia[i] to point to next row */
104: for (i1 = 1; i1 < nslim_row + 1; i1++) {
105: row = ia[i1 - 1];
106: ia[i1] += row;
107: work[i1 - 1] = row - oshift;
108: }
110: /* allocate space for column pointers */
111: nz = ia[nslim_row] + (!ishift);
112: PetscCall(PetscMalloc1(nz, &ja));
113: *jja = ja;
115: /* loop over lower triangular part putting into ja */
116: for (i1 = 0; i1 < nslim_row; i1++) {
117: row = ns_row[i1];
118: j = aj + ai[row] + ishift;
119: jmax = aj + ai[row + 1] + ishift;
120: if (j == jmax) continue; /* empty row */
121: col = *j++ + ishift;
122: i2 = tvc[col];
123: while (i2 < i1 && j < jmax) {
124: ja[work[i2]++] = i1 + oshift;
125: ja[work[i1]++] = i2 + oshift;
126: ++i2;
127: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128: i2 = tvc[col];
129: }
130: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131: }
132: PetscCall(PetscFree(work));
133: PetscCall(PetscFree2(tns, tvc));
134: PetscFunctionReturn(PETSC_SUCCESS);
135: }
137: /*
138: This builds nonsymmetric version of nonzero structure,
139: */
140: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141: {
142: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
143: PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144: PetscInt *tns, *tvc, nsz, i1, i2;
145: const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
147: PetscFunctionBegin;
148: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149: nslim_row = a->inode.node_count;
150: n = A->cmap->n;
152: /* Create The column_inode for this matrix */
153: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
155: /* allocate space for reformatted column_inode structure */
156: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
159: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160: nsz = ns_col[i1 + 1] - ns_col[i1];
161: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162: }
163: /* allocate space for row pointers */
164: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165: *iia = ia;
166: PetscCall(PetscMalloc1(nslim_row + 1, &work));
168: /* determine the number of columns in each row */
169: ia[0] = oshift;
170: for (i1 = 0; i1 < nslim_row; i1++) {
171: row = ns_row[i1];
172: j = aj + ai[row] + ishift;
173: nz = ai[row + 1] - ai[row];
174: if (!nz) continue; /* empty row */
175: col = *j++ + ishift;
176: i2 = tvc[col];
177: while (nz-- > 0) { /* off-diagonal elements */
178: ia[i1 + 1]++;
179: i2++; /* Start col of next node */
180: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181: if (nz > 0) i2 = tvc[col];
182: }
183: }
185: /* shift ia[i] to point to next row */
186: for (i1 = 1; i1 < nslim_row + 1; i1++) {
187: row = ia[i1 - 1];
188: ia[i1] += row;
189: work[i1 - 1] = row - oshift;
190: }
192: /* allocate space for column pointers */
193: nz = ia[nslim_row] + (!ishift);
194: PetscCall(PetscMalloc1(nz, &ja));
195: *jja = ja;
197: /* loop over matrix putting into ja */
198: for (i1 = 0; i1 < nslim_row; i1++) {
199: row = ns_row[i1];
200: j = aj + ai[row] + ishift;
201: nz = ai[row + 1] - ai[row];
202: if (!nz) continue; /* empty row */
203: col = *j++ + ishift;
204: i2 = tvc[col];
205: while (nz-- > 0) {
206: ja[work[i1]++] = i2 + oshift;
207: ++i2;
208: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209: if (nz > 0) i2 = tvc[col];
210: }
211: }
212: PetscCall(PetscFree(ns_col));
213: PetscCall(PetscFree(work));
214: PetscCall(PetscFree2(tns, tvc));
215: PetscFunctionReturn(PETSC_SUCCESS);
216: }
218: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219: {
220: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
222: PetscFunctionBegin;
223: if (n) *n = a->inode.node_count;
224: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225: if (!blockcompressed) {
226: PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227: } else if (symmetric) {
228: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229: } else {
230: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231: }
232: PetscFunctionReturn(PETSC_SUCCESS);
233: }
235: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236: {
237: PetscFunctionBegin;
238: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
240: if (!blockcompressed) {
241: PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242: } else {
243: PetscCall(PetscFree(*ia));
244: PetscCall(PetscFree(*ja));
245: }
246: PetscFunctionReturn(PETSC_SUCCESS);
247: }
249: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250: {
251: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252: PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253: PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
255: PetscFunctionBegin;
256: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257: nslim_row = a->inode.node_count;
258: n = A->cmap->n;
260: /* Create The column_inode for this matrix */
261: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
263: /* allocate space for reformatted column_inode structure */
264: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
267: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268: nsz = ns_col[i1 + 1] - ns_col[i1];
269: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270: }
271: /* allocate space for column pointers */
272: PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273: *iia = ia;
274: PetscCall(PetscMalloc1(nslim_col + 1, &work));
276: /* determine the number of columns in each row */
277: ia[0] = oshift;
278: for (i1 = 0; i1 < nslim_row; i1++) {
279: row = ns_row[i1];
280: j = aj + ai[row] + ishift;
281: col = *j++ + ishift;
282: i2 = tvc[col];
283: nz = ai[row + 1] - ai[row];
284: while (nz-- > 0) { /* off-diagonal elements */
285: /* ia[i1+1]++; */
286: ia[i2 + 1]++;
287: i2++;
288: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289: if (nz > 0) i2 = tvc[col];
290: }
291: }
293: /* shift ia[i] to point to next col */
294: for (i1 = 1; i1 < nslim_col + 1; i1++) {
295: col = ia[i1 - 1];
296: ia[i1] += col;
297: work[i1 - 1] = col - oshift;
298: }
300: /* allocate space for column pointers */
301: nz = ia[nslim_col] + (!ishift);
302: PetscCall(PetscMalloc1(nz, &ja));
303: *jja = ja;
305: /* loop over matrix putting into ja */
306: for (i1 = 0; i1 < nslim_row; i1++) {
307: row = ns_row[i1];
308: j = aj + ai[row] + ishift;
309: col = *j++ + ishift;
310: i2 = tvc[col];
311: nz = ai[row + 1] - ai[row];
312: while (nz-- > 0) {
313: /* ja[work[i1]++] = i2 + oshift; */
314: ja[work[i2]++] = i1 + oshift;
315: i2++;
316: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317: if (nz > 0) i2 = tvc[col];
318: }
319: }
320: PetscCall(PetscFree(ns_col));
321: PetscCall(PetscFree(work));
322: PetscCall(PetscFree2(tns, tvc));
323: PetscFunctionReturn(PETSC_SUCCESS);
324: }
326: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327: {
328: PetscFunctionBegin;
329: PetscCall(MatCreateColInode_Private(A, n, NULL));
330: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
332: if (!blockcompressed) {
333: PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334: } else if (symmetric) {
335: /* Since the indices are symmetric it doesn't matter */
336: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337: } else {
338: PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339: }
340: PetscFunctionReturn(PETSC_SUCCESS);
341: }
343: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344: {
345: PetscFunctionBegin;
346: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347: if (!blockcompressed) {
348: PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349: } else {
350: PetscCall(PetscFree(*ia));
351: PetscCall(PetscFree(*ja));
352: }
353: PetscFunctionReturn(PETSC_SUCCESS);
354: }
356: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357: {
358: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
359: PetscScalar *y;
360: const PetscScalar *x;
361: PetscInt row, node_max, nonzerorow = 0;
362: PetscInt *ns;
364: PetscFunctionBegin;
365: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
366: node_max = a->inode.node_count;
367: ns = a->inode.size_csr; /* Node Size array */
368: PetscCall(VecGetArrayRead(xx, &x));
369: PetscCall(VecGetArray(yy, &y));
371: PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372: for (PetscInt i = 0; i < node_max; ++i) {
373: PetscInt i1, i2, nsz, n, sz;
374: const MatScalar *v1, *v2, *v3, *v4, *v5;
375: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376: const PetscInt *idx;
378: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379: #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380: #endif
381: row = ns[i];
382: nsz = ns[i + 1] - ns[i];
383: n = a->i[row + 1] - a->i[row];
384: nonzerorow += (n > 0) * nsz;
386: idx = &a->j[a->i[row]];
387: v1 = &a->a[a->i[row]];
388: PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
389: PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
390: sz = n; /* No of non zeros in this row */
391: /* Switch on the size of Node */
392: switch (nsz) { /* Each loop in 'case' is unrolled */
393: case 1:
394: sum1 = 0.;
396: for (n = 0; n < sz - 1; n += 2) {
397: i1 = idx[0]; /* The instructions are ordered to */
398: i2 = idx[1]; /* make the compiler's job easy */
399: idx += 2;
400: tmp0 = x[i1];
401: tmp1 = x[i2];
402: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
403: v1 += 2;
404: }
406: if (n == sz - 1) { /* Take care of the last nonzero */
407: tmp0 = x[*idx++];
408: sum1 += *v1++ * tmp0;
409: }
410: y[row++] = sum1;
411: break;
412: case 2:
413: sum1 = 0.;
414: sum2 = 0.;
415: v2 = v1 + n;
417: for (n = 0; n < sz - 1; n += 2) {
418: i1 = idx[0];
419: i2 = idx[1];
420: idx += 2;
421: tmp0 = x[i1];
422: tmp1 = x[i2];
423: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
424: v1 += 2;
425: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
426: v2 += 2;
427: }
428: if (n == sz - 1) {
429: tmp0 = x[*idx++];
430: sum1 += *v1++ * tmp0;
431: sum2 += *v2++ * tmp0;
432: }
433: y[row++] = sum1;
434: y[row++] = sum2;
435: v1 = v2; /* Since the next block to be processed starts there*/
436: idx += sz;
437: break;
438: case 3:
439: sum1 = 0.;
440: sum2 = 0.;
441: sum3 = 0.;
442: v2 = v1 + n;
443: v3 = v2 + n;
445: for (n = 0; n < sz - 1; n += 2) {
446: i1 = idx[0];
447: i2 = idx[1];
448: idx += 2;
449: tmp0 = x[i1];
450: tmp1 = x[i2];
451: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
452: v1 += 2;
453: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
454: v2 += 2;
455: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
456: v3 += 2;
457: }
458: if (n == sz - 1) {
459: tmp0 = x[*idx++];
460: sum1 += *v1++ * tmp0;
461: sum2 += *v2++ * tmp0;
462: sum3 += *v3++ * tmp0;
463: }
464: y[row++] = sum1;
465: y[row++] = sum2;
466: y[row++] = sum3;
467: v1 = v3; /* Since the next block to be processed starts there*/
468: idx += 2 * sz;
469: break;
470: case 4:
471: sum1 = 0.;
472: sum2 = 0.;
473: sum3 = 0.;
474: sum4 = 0.;
475: v2 = v1 + n;
476: v3 = v2 + n;
477: v4 = v3 + n;
479: for (n = 0; n < sz - 1; n += 2) {
480: i1 = idx[0];
481: i2 = idx[1];
482: idx += 2;
483: tmp0 = x[i1];
484: tmp1 = x[i2];
485: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
486: v1 += 2;
487: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
488: v2 += 2;
489: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
490: v3 += 2;
491: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
492: v4 += 2;
493: }
494: if (n == sz - 1) {
495: tmp0 = x[*idx++];
496: sum1 += *v1++ * tmp0;
497: sum2 += *v2++ * tmp0;
498: sum3 += *v3++ * tmp0;
499: sum4 += *v4++ * tmp0;
500: }
501: y[row++] = sum1;
502: y[row++] = sum2;
503: y[row++] = sum3;
504: y[row++] = sum4;
505: v1 = v4; /* Since the next block to be processed starts there*/
506: idx += 3 * sz;
507: break;
508: case 5:
509: sum1 = 0.;
510: sum2 = 0.;
511: sum3 = 0.;
512: sum4 = 0.;
513: sum5 = 0.;
514: v2 = v1 + n;
515: v3 = v2 + n;
516: v4 = v3 + n;
517: v5 = v4 + n;
519: for (n = 0; n < sz - 1; n += 2) {
520: i1 = idx[0];
521: i2 = idx[1];
522: idx += 2;
523: tmp0 = x[i1];
524: tmp1 = x[i2];
525: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
526: v1 += 2;
527: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
528: v2 += 2;
529: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
530: v3 += 2;
531: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
532: v4 += 2;
533: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
534: v5 += 2;
535: }
536: if (n == sz - 1) {
537: tmp0 = x[*idx++];
538: sum1 += *v1++ * tmp0;
539: sum2 += *v2++ * tmp0;
540: sum3 += *v3++ * tmp0;
541: sum4 += *v4++ * tmp0;
542: sum5 += *v5++ * tmp0;
543: }
544: y[row++] = sum1;
545: y[row++] = sum2;
546: y[row++] = sum3;
547: y[row++] = sum4;
548: y[row++] = sum5;
549: v1 = v5; /* Since the next block to be processed starts there */
550: idx += 4 * sz;
551: break;
552: default:
553: SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
554: }
555: }
556: PetscCall(VecRestoreArrayRead(xx, &x));
557: PetscCall(VecRestoreArray(yy, &y));
558: PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
559: PetscFunctionReturn(PETSC_SUCCESS);
560: }
562: /* Almost same code as the MatMult_SeqAIJ_Inode() */
563: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564: {
565: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
566: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
567: const MatScalar *v1, *v2, *v3, *v4, *v5;
568: const PetscScalar *x;
569: PetscScalar *y, *z, *zt;
570: PetscInt i1, i2, n, i, row, node_max, nsz, sz;
571: const PetscInt *idx, *ns, *ii;
573: PetscFunctionBegin;
574: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
575: node_max = a->inode.node_count;
576: ns = a->inode.size_csr; /* Node Size array */
578: PetscCall(VecGetArrayRead(xx, &x));
579: PetscCall(VecGetArrayPair(zz, yy, &z, &y));
580: zt = z;
582: idx = a->j;
583: v1 = a->a;
584: ii = a->i;
586: for (i = 0; i < node_max; ++i) {
587: row = ns[i];
588: nsz = ns[i + 1] - ns[i];
589: n = ii[1] - ii[0];
590: ii += nsz;
591: sz = n; /* No of non zeros in this row */
592: /* Switch on the size of Node */
593: switch (nsz) { /* Each loop in 'case' is unrolled */
594: case 1:
595: sum1 = *zt++;
597: for (n = 0; n < sz - 1; n += 2) {
598: i1 = idx[0]; /* The instructions are ordered to */
599: i2 = idx[1]; /* make the compiler's job easy */
600: idx += 2;
601: tmp0 = x[i1];
602: tmp1 = x[i2];
603: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
604: v1 += 2;
605: }
607: if (n == sz - 1) { /* Take care of the last nonzero */
608: tmp0 = x[*idx++];
609: sum1 += *v1++ * tmp0;
610: }
611: y[row++] = sum1;
612: break;
613: case 2:
614: sum1 = *zt++;
615: sum2 = *zt++;
616: v2 = v1 + n;
618: for (n = 0; n < sz - 1; n += 2) {
619: i1 = idx[0];
620: i2 = idx[1];
621: idx += 2;
622: tmp0 = x[i1];
623: tmp1 = x[i2];
624: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
625: v1 += 2;
626: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
627: v2 += 2;
628: }
629: if (n == sz - 1) {
630: tmp0 = x[*idx++];
631: sum1 += *v1++ * tmp0;
632: sum2 += *v2++ * tmp0;
633: }
634: y[row++] = sum1;
635: y[row++] = sum2;
636: v1 = v2; /* Since the next block to be processed starts there*/
637: idx += sz;
638: break;
639: case 3:
640: sum1 = *zt++;
641: sum2 = *zt++;
642: sum3 = *zt++;
643: v2 = v1 + n;
644: v3 = v2 + n;
646: for (n = 0; n < sz - 1; n += 2) {
647: i1 = idx[0];
648: i2 = idx[1];
649: idx += 2;
650: tmp0 = x[i1];
651: tmp1 = x[i2];
652: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
653: v1 += 2;
654: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
655: v2 += 2;
656: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
657: v3 += 2;
658: }
659: if (n == sz - 1) {
660: tmp0 = x[*idx++];
661: sum1 += *v1++ * tmp0;
662: sum2 += *v2++ * tmp0;
663: sum3 += *v3++ * tmp0;
664: }
665: y[row++] = sum1;
666: y[row++] = sum2;
667: y[row++] = sum3;
668: v1 = v3; /* Since the next block to be processed starts there*/
669: idx += 2 * sz;
670: break;
671: case 4:
672: sum1 = *zt++;
673: sum2 = *zt++;
674: sum3 = *zt++;
675: sum4 = *zt++;
676: v2 = v1 + n;
677: v3 = v2 + n;
678: v4 = v3 + n;
680: for (n = 0; n < sz - 1; n += 2) {
681: i1 = idx[0];
682: i2 = idx[1];
683: idx += 2;
684: tmp0 = x[i1];
685: tmp1 = x[i2];
686: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
687: v1 += 2;
688: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
689: v2 += 2;
690: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
691: v3 += 2;
692: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
693: v4 += 2;
694: }
695: if (n == sz - 1) {
696: tmp0 = x[*idx++];
697: sum1 += *v1++ * tmp0;
698: sum2 += *v2++ * tmp0;
699: sum3 += *v3++ * tmp0;
700: sum4 += *v4++ * tmp0;
701: }
702: y[row++] = sum1;
703: y[row++] = sum2;
704: y[row++] = sum3;
705: y[row++] = sum4;
706: v1 = v4; /* Since the next block to be processed starts there*/
707: idx += 3 * sz;
708: break;
709: case 5:
710: sum1 = *zt++;
711: sum2 = *zt++;
712: sum3 = *zt++;
713: sum4 = *zt++;
714: sum5 = *zt++;
715: v2 = v1 + n;
716: v3 = v2 + n;
717: v4 = v3 + n;
718: v5 = v4 + n;
720: for (n = 0; n < sz - 1; n += 2) {
721: i1 = idx[0];
722: i2 = idx[1];
723: idx += 2;
724: tmp0 = x[i1];
725: tmp1 = x[i2];
726: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
727: v1 += 2;
728: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
729: v2 += 2;
730: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
731: v3 += 2;
732: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
733: v4 += 2;
734: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
735: v5 += 2;
736: }
737: if (n == sz - 1) {
738: tmp0 = x[*idx++];
739: sum1 += *v1++ * tmp0;
740: sum2 += *v2++ * tmp0;
741: sum3 += *v3++ * tmp0;
742: sum4 += *v4++ * tmp0;
743: sum5 += *v5++ * tmp0;
744: }
745: y[row++] = sum1;
746: y[row++] = sum2;
747: y[row++] = sum3;
748: y[row++] = sum4;
749: y[row++] = sum5;
750: v1 = v5; /* Since the next block to be processed starts there */
751: idx += 4 * sz;
752: break;
753: default:
754: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
755: }
756: }
757: PetscCall(VecRestoreArrayRead(xx, &x));
758: PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
759: PetscCall(PetscLogFlops(2.0 * a->nz));
760: PetscFunctionReturn(PETSC_SUCCESS);
761: }
763: static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764: {
765: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
766: IS iscol = a->col, isrow = a->row;
767: const PetscInt *r, *c, *rout, *cout;
768: PetscInt i, j, n = A->rmap->n, nz;
769: PetscInt node_max, *ns, row, nsz, aii, i0, i1;
770: const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
772: PetscScalar sum1, sum2, sum3, sum4, sum5;
773: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774: const PetscScalar *b;
776: PetscFunctionBegin;
777: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
778: node_max = a->inode.node_count;
779: ns = a->inode.size_csr; /* Node Size array */
781: PetscCall(VecGetArrayRead(bb, &b));
782: PetscCall(VecGetArrayWrite(xx, &x));
783: tmp = a->solve_work;
785: PetscCall(ISGetIndices(isrow, &rout));
786: r = rout;
787: PetscCall(ISGetIndices(iscol, &cout));
788: c = cout + (n - 1);
790: /* forward solve the lower triangular */
791: tmps = tmp;
792: aa = a_a;
793: aj = a_j;
794: ad = a->diag;
796: for (i = 0, row = 0; i < node_max; ++i) {
797: row = ns[i];
798: nsz = ns[i + 1] - ns[i];
799: aii = ai[row];
800: v1 = aa + aii;
801: vi = aj + aii;
802: nz = ad[row] - aii;
803: if (i < node_max - 1) {
804: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
805: * but our indexing to determine its size could. */
806: PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
807: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
808: PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
809: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
810: }
812: switch (nsz) { /* Each loop in 'case' is unrolled */
813: case 1:
814: sum1 = b[*r++];
815: for (j = 0; j < nz - 1; j += 2) {
816: i0 = vi[0];
817: i1 = vi[1];
818: vi += 2;
819: tmp0 = tmps[i0];
820: tmp1 = tmps[i1];
821: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822: v1 += 2;
823: }
824: if (j == nz - 1) {
825: tmp0 = tmps[*vi++];
826: sum1 -= *v1++ * tmp0;
827: }
828: tmp[row++] = sum1;
829: break;
830: case 2:
831: sum1 = b[*r++];
832: sum2 = b[*r++];
833: v2 = aa + ai[row + 1];
835: for (j = 0; j < nz - 1; j += 2) {
836: i0 = vi[0];
837: i1 = vi[1];
838: vi += 2;
839: tmp0 = tmps[i0];
840: tmp1 = tmps[i1];
841: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
842: v1 += 2;
843: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
844: v2 += 2;
845: }
846: if (j == nz - 1) {
847: tmp0 = tmps[*vi++];
848: sum1 -= *v1++ * tmp0;
849: sum2 -= *v2++ * tmp0;
850: }
851: sum2 -= *v2++ * sum1;
852: tmp[row++] = sum1;
853: tmp[row++] = sum2;
854: break;
855: case 3:
856: sum1 = b[*r++];
857: sum2 = b[*r++];
858: sum3 = b[*r++];
859: v2 = aa + ai[row + 1];
860: v3 = aa + ai[row + 2];
862: for (j = 0; j < nz - 1; j += 2) {
863: i0 = vi[0];
864: i1 = vi[1];
865: vi += 2;
866: tmp0 = tmps[i0];
867: tmp1 = tmps[i1];
868: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
869: v1 += 2;
870: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
871: v2 += 2;
872: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
873: v3 += 2;
874: }
875: if (j == nz - 1) {
876: tmp0 = tmps[*vi++];
877: sum1 -= *v1++ * tmp0;
878: sum2 -= *v2++ * tmp0;
879: sum3 -= *v3++ * tmp0;
880: }
881: sum2 -= *v2++ * sum1;
882: sum3 -= *v3++ * sum1;
883: sum3 -= *v3++ * sum2;
885: tmp[row++] = sum1;
886: tmp[row++] = sum2;
887: tmp[row++] = sum3;
888: break;
890: case 4:
891: sum1 = b[*r++];
892: sum2 = b[*r++];
893: sum3 = b[*r++];
894: sum4 = b[*r++];
895: v2 = aa + ai[row + 1];
896: v3 = aa + ai[row + 2];
897: v4 = aa + ai[row + 3];
899: for (j = 0; j < nz - 1; j += 2) {
900: i0 = vi[0];
901: i1 = vi[1];
902: vi += 2;
903: tmp0 = tmps[i0];
904: tmp1 = tmps[i1];
905: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
906: v1 += 2;
907: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
908: v2 += 2;
909: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
910: v3 += 2;
911: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
912: v4 += 2;
913: }
914: if (j == nz - 1) {
915: tmp0 = tmps[*vi++];
916: sum1 -= *v1++ * tmp0;
917: sum2 -= *v2++ * tmp0;
918: sum3 -= *v3++ * tmp0;
919: sum4 -= *v4++ * tmp0;
920: }
921: sum2 -= *v2++ * sum1;
922: sum3 -= *v3++ * sum1;
923: sum4 -= *v4++ * sum1;
924: sum3 -= *v3++ * sum2;
925: sum4 -= *v4++ * sum2;
926: sum4 -= *v4++ * sum3;
928: tmp[row++] = sum1;
929: tmp[row++] = sum2;
930: tmp[row++] = sum3;
931: tmp[row++] = sum4;
932: break;
933: case 5:
934: sum1 = b[*r++];
935: sum2 = b[*r++];
936: sum3 = b[*r++];
937: sum4 = b[*r++];
938: sum5 = b[*r++];
939: v2 = aa + ai[row + 1];
940: v3 = aa + ai[row + 2];
941: v4 = aa + ai[row + 3];
942: v5 = aa + ai[row + 4];
944: for (j = 0; j < nz - 1; j += 2) {
945: i0 = vi[0];
946: i1 = vi[1];
947: vi += 2;
948: tmp0 = tmps[i0];
949: tmp1 = tmps[i1];
950: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
951: v1 += 2;
952: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
953: v2 += 2;
954: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
955: v3 += 2;
956: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
957: v4 += 2;
958: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
959: v5 += 2;
960: }
961: if (j == nz - 1) {
962: tmp0 = tmps[*vi++];
963: sum1 -= *v1++ * tmp0;
964: sum2 -= *v2++ * tmp0;
965: sum3 -= *v3++ * tmp0;
966: sum4 -= *v4++ * tmp0;
967: sum5 -= *v5++ * tmp0;
968: }
970: sum2 -= *v2++ * sum1;
971: sum3 -= *v3++ * sum1;
972: sum4 -= *v4++ * sum1;
973: sum5 -= *v5++ * sum1;
974: sum3 -= *v3++ * sum2;
975: sum4 -= *v4++ * sum2;
976: sum5 -= *v5++ * sum2;
977: sum4 -= *v4++ * sum3;
978: sum5 -= *v5++ * sum3;
979: sum5 -= *v5++ * sum4;
981: tmp[row++] = sum1;
982: tmp[row++] = sum2;
983: tmp[row++] = sum3;
984: tmp[row++] = sum4;
985: tmp[row++] = sum5;
986: break;
987: default:
988: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
989: }
990: }
991: /* backward solve the upper triangular */
992: for (i = node_max - 1; i >= 0; i--) {
993: row = ns[i + 1];
994: nsz = ns[i + 1] - ns[i];
995: aii = ai[row + 1] - 1;
996: v1 = aa + aii;
997: vi = aj + aii;
998: nz = aii - ad[row];
999: switch (nsz) { /* Each loop in 'case' is unrolled */
1000: case 1:
1001: sum1 = tmp[row];
1003: for (j = nz; j > 1; j -= 2) {
1004: vi -= 2;
1005: i0 = vi[2];
1006: i1 = vi[1];
1007: tmp0 = tmps[i0];
1008: tmp1 = tmps[i1];
1009: v1 -= 2;
1010: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1011: }
1012: if (j == 1) {
1013: tmp0 = tmps[*vi--];
1014: sum1 -= *v1-- * tmp0;
1015: }
1016: x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1017: row--;
1018: break;
1019: case 2:
1020: sum1 = tmp[row];
1021: sum2 = tmp[row - 1];
1022: v2 = aa + ai[row] - 1;
1023: for (j = nz; j > 1; j -= 2) {
1024: vi -= 2;
1025: i0 = vi[2];
1026: i1 = vi[1];
1027: tmp0 = tmps[i0];
1028: tmp1 = tmps[i1];
1029: v1 -= 2;
1030: v2 -= 2;
1031: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1032: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1033: }
1034: if (j == 1) {
1035: tmp0 = tmps[*vi--];
1036: sum1 -= *v1-- * tmp0;
1037: sum2 -= *v2-- * tmp0;
1038: }
1040: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1041: row--;
1042: sum2 -= *v2-- * tmp0;
1043: x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1044: row--;
1045: break;
1046: case 3:
1047: sum1 = tmp[row];
1048: sum2 = tmp[row - 1];
1049: sum3 = tmp[row - 2];
1050: v2 = aa + ai[row] - 1;
1051: v3 = aa + ai[row - 1] - 1;
1052: for (j = nz; j > 1; j -= 2) {
1053: vi -= 2;
1054: i0 = vi[2];
1055: i1 = vi[1];
1056: tmp0 = tmps[i0];
1057: tmp1 = tmps[i1];
1058: v1 -= 2;
1059: v2 -= 2;
1060: v3 -= 2;
1061: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064: }
1065: if (j == 1) {
1066: tmp0 = tmps[*vi--];
1067: sum1 -= *v1-- * tmp0;
1068: sum2 -= *v2-- * tmp0;
1069: sum3 -= *v3-- * tmp0;
1070: }
1071: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1072: row--;
1073: sum2 -= *v2-- * tmp0;
1074: sum3 -= *v3-- * tmp0;
1075: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1076: row--;
1077: sum3 -= *v3-- * tmp0;
1078: x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1079: row--;
1081: break;
1082: case 4:
1083: sum1 = tmp[row];
1084: sum2 = tmp[row - 1];
1085: sum3 = tmp[row - 2];
1086: sum4 = tmp[row - 3];
1087: v2 = aa + ai[row] - 1;
1088: v3 = aa + ai[row - 1] - 1;
1089: v4 = aa + ai[row - 2] - 1;
1091: for (j = nz; j > 1; j -= 2) {
1092: vi -= 2;
1093: i0 = vi[2];
1094: i1 = vi[1];
1095: tmp0 = tmps[i0];
1096: tmp1 = tmps[i1];
1097: v1 -= 2;
1098: v2 -= 2;
1099: v3 -= 2;
1100: v4 -= 2;
1101: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1102: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1103: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1104: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1105: }
1106: if (j == 1) {
1107: tmp0 = tmps[*vi--];
1108: sum1 -= *v1-- * tmp0;
1109: sum2 -= *v2-- * tmp0;
1110: sum3 -= *v3-- * tmp0;
1111: sum4 -= *v4-- * tmp0;
1112: }
1114: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1115: row--;
1116: sum2 -= *v2-- * tmp0;
1117: sum3 -= *v3-- * tmp0;
1118: sum4 -= *v4-- * tmp0;
1119: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1120: row--;
1121: sum3 -= *v3-- * tmp0;
1122: sum4 -= *v4-- * tmp0;
1123: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1124: row--;
1125: sum4 -= *v4-- * tmp0;
1126: x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1127: row--;
1128: break;
1129: case 5:
1130: sum1 = tmp[row];
1131: sum2 = tmp[row - 1];
1132: sum3 = tmp[row - 2];
1133: sum4 = tmp[row - 3];
1134: sum5 = tmp[row - 4];
1135: v2 = aa + ai[row] - 1;
1136: v3 = aa + ai[row - 1] - 1;
1137: v4 = aa + ai[row - 2] - 1;
1138: v5 = aa + ai[row - 3] - 1;
1139: for (j = nz; j > 1; j -= 2) {
1140: vi -= 2;
1141: i0 = vi[2];
1142: i1 = vi[1];
1143: tmp0 = tmps[i0];
1144: tmp1 = tmps[i1];
1145: v1 -= 2;
1146: v2 -= 2;
1147: v3 -= 2;
1148: v4 -= 2;
1149: v5 -= 2;
1150: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1151: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1152: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1153: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1154: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1155: }
1156: if (j == 1) {
1157: tmp0 = tmps[*vi--];
1158: sum1 -= *v1-- * tmp0;
1159: sum2 -= *v2-- * tmp0;
1160: sum3 -= *v3-- * tmp0;
1161: sum4 -= *v4-- * tmp0;
1162: sum5 -= *v5-- * tmp0;
1163: }
1165: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1166: row--;
1167: sum2 -= *v2-- * tmp0;
1168: sum3 -= *v3-- * tmp0;
1169: sum4 -= *v4-- * tmp0;
1170: sum5 -= *v5-- * tmp0;
1171: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1172: row--;
1173: sum3 -= *v3-- * tmp0;
1174: sum4 -= *v4-- * tmp0;
1175: sum5 -= *v5-- * tmp0;
1176: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1177: row--;
1178: sum4 -= *v4-- * tmp0;
1179: sum5 -= *v5-- * tmp0;
1180: tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1181: row--;
1182: sum5 -= *v5-- * tmp0;
1183: x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1184: row--;
1185: break;
1186: default:
1187: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1188: }
1189: }
1190: PetscCall(ISRestoreIndices(isrow, &rout));
1191: PetscCall(ISRestoreIndices(iscol, &cout));
1192: PetscCall(VecRestoreArrayRead(bb, &b));
1193: PetscCall(VecRestoreArrayWrite(xx, &x));
1194: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1195: PetscFunctionReturn(PETSC_SUCCESS);
1196: }
1198: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199: {
1200: Mat C = B;
1201: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1202: IS isrow = b->row, isicol = b->icol;
1203: const PetscInt *r, *ic, *ics;
1204: const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1205: PetscInt i, j, k, nz, nzL, row, *pj;
1206: const PetscInt *ajtmp, *bjtmp;
1207: MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1208: const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1209: FactorShiftCtx sctx;
1210: const PetscInt *ddiag;
1211: PetscReal rs;
1212: MatScalar d;
1213: PetscInt inod, nodesz, node_max, col;
1214: const PetscInt *ns;
1215: PetscInt *tmp_vec1, *tmp_vec2, *nsmap;
1217: PetscFunctionBegin;
1218: /* MatPivotSetUp(): initialize shift context sctx */
1219: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1221: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1222: ddiag = a->diag;
1223: sctx.shift_top = info->zeropivot;
1224: for (i = 0; i < n; i++) {
1225: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1226: d = (aa)[ddiag[i]];
1227: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1228: v = aa + ai[i];
1229: nz = ai[i + 1] - ai[i];
1230: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1231: if (rs > sctx.shift_top) sctx.shift_top = rs;
1232: }
1233: sctx.shift_top *= 1.1;
1234: sctx.nshift_max = 5;
1235: sctx.shift_lo = 0.;
1236: sctx.shift_hi = 1.;
1237: }
1239: PetscCall(ISGetIndices(isrow, &r));
1240: PetscCall(ISGetIndices(isicol, &ic));
1242: PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1243: ics = ic;
1245: node_max = a->inode.node_count;
1246: ns = a->inode.size_csr;
1247: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1249: /* If max inode size > 4, split it into two inodes.*/
1250: /* also map the inode sizes according to the ordering */
1251: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1252: for (i = 0, j = 0; i < node_max; ++i, ++j) {
1253: nodesz = ns[i + 1] - ns[i];
1254: if (nodesz > 4) {
1255: tmp_vec1[j] = 4;
1256: ++j;
1257: tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1258: } else {
1259: tmp_vec1[j] = nodesz;
1260: }
1261: }
1262: /* Use the correct node_max */
1263: node_max = j;
1265: /* Now reorder the inode info based on mat re-ordering info */
1266: /* First create a row -> inode_size_array_index map */
1267: PetscCall(PetscMalloc1(n + 1, &nsmap));
1268: PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1269: tmp_vec2[0] = 0;
1270: for (i = 0, row = 0; i < node_max; i++) {
1271: nodesz = tmp_vec1[i];
1272: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1273: }
1274: /* Using nsmap, create a reordered ns structure */
1275: for (i = 0, j = 0; i < node_max; i++) {
1276: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1277: tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1278: j += nodesz;
1279: }
1280: PetscCall(PetscFree(nsmap));
1281: PetscCall(PetscFree(tmp_vec1));
1283: /* Now use the correct ns */
1284: ns = tmp_vec2;
1286: do {
1287: sctx.newshift = PETSC_FALSE;
1288: /* Now loop over each block-row, and do the factorization */
1289: for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1290: nodesz = ns[inod + 1] - ns[inod];
1292: switch (nodesz) {
1293: case 1:
1294: /* zero rtmp1 */
1295: /* L part */
1296: nz = bi[i + 1] - bi[i];
1297: bjtmp = bj + bi[i];
1298: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1300: /* U part */
1301: nz = bdiag[i] - bdiag[i + 1];
1302: bjtmp = bj + bdiag[i + 1] + 1;
1303: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1305: /* load in initial (unfactored row) */
1306: nz = ai[r[i] + 1] - ai[r[i]];
1307: ajtmp = aj + ai[r[i]];
1308: v = aa + ai[r[i]];
1309: for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1311: /* ZeropivotApply() */
1312: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1314: /* elimination */
1315: bjtmp = bj + bi[i];
1316: row = *bjtmp++;
1317: nzL = bi[i + 1] - bi[i];
1318: for (k = 0; k < nzL; k++) {
1319: pc = rtmp1 + row;
1320: if (*pc != 0.0) {
1321: pv = b->a + bdiag[row];
1322: mul1 = *pc * (*pv);
1323: *pc = mul1;
1324: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1325: pv = b->a + bdiag[row + 1] + 1;
1326: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327: for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1328: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1329: }
1330: row = *bjtmp++;
1331: }
1333: /* finished row so stick it into b->a */
1334: rs = 0.0;
1335: /* L part */
1336: pv = b->a + bi[i];
1337: pj = b->j + bi[i];
1338: nz = bi[i + 1] - bi[i];
1339: for (j = 0; j < nz; j++) {
1340: pv[j] = rtmp1[pj[j]];
1341: rs += PetscAbsScalar(pv[j]);
1342: }
1344: /* U part */
1345: pv = b->a + bdiag[i + 1] + 1;
1346: pj = b->j + bdiag[i + 1] + 1;
1347: nz = bdiag[i] - bdiag[i + 1] - 1;
1348: for (j = 0; j < nz; j++) {
1349: pv[j] = rtmp1[pj[j]];
1350: rs += PetscAbsScalar(pv[j]);
1351: }
1353: /* Check zero pivot */
1354: sctx.rs = rs;
1355: sctx.pv = rtmp1[i];
1356: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1357: if (sctx.newshift) break;
1359: /* Mark diagonal and invert diagonal for simpler triangular solves */
1360: pv = b->a + bdiag[i];
1361: *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1362: break;
1364: case 2:
1365: /* zero rtmp1 and rtmp2 */
1366: /* L part */
1367: nz = bi[i + 1] - bi[i];
1368: bjtmp = bj + bi[i];
1369: for (j = 0; j < nz; j++) {
1370: col = bjtmp[j];
1371: rtmp1[col] = 0.0;
1372: rtmp2[col] = 0.0;
1373: }
1375: /* U part */
1376: nz = bdiag[i] - bdiag[i + 1];
1377: bjtmp = bj + bdiag[i + 1] + 1;
1378: for (j = 0; j < nz; j++) {
1379: col = bjtmp[j];
1380: rtmp1[col] = 0.0;
1381: rtmp2[col] = 0.0;
1382: }
1384: /* load in initial (unfactored row) */
1385: nz = ai[r[i] + 1] - ai[r[i]];
1386: ajtmp = aj + ai[r[i]];
1387: v1 = aa + ai[r[i]];
1388: v2 = aa + ai[r[i] + 1];
1389: for (j = 0; j < nz; j++) {
1390: col = ics[ajtmp[j]];
1391: rtmp1[col] = v1[j];
1392: rtmp2[col] = v2[j];
1393: }
1394: /* ZeropivotApply(): shift the diagonal of the matrix */
1395: rtmp1[i] += sctx.shift_amount;
1396: rtmp2[i + 1] += sctx.shift_amount;
1398: /* elimination */
1399: bjtmp = bj + bi[i];
1400: row = *bjtmp++; /* pivot row */
1401: nzL = bi[i + 1] - bi[i];
1402: for (k = 0; k < nzL; k++) {
1403: pc1 = rtmp1 + row;
1404: pc2 = rtmp2 + row;
1405: if (*pc1 != 0.0 || *pc2 != 0.0) {
1406: pv = b->a + bdiag[row];
1407: mul1 = *pc1 * (*pv);
1408: mul2 = *pc2 * (*pv);
1409: *pc1 = mul1;
1410: *pc2 = mul2;
1412: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1413: pv = b->a + bdiag[row + 1] + 1;
1414: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1415: for (j = 0; j < nz; j++) {
1416: col = pj[j];
1417: rtmp1[col] -= mul1 * pv[j];
1418: rtmp2[col] -= mul2 * pv[j];
1419: }
1420: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1421: }
1422: row = *bjtmp++;
1423: }
1425: /* finished row i; check zero pivot, then stick row i into b->a */
1426: rs = 0.0;
1427: /* L part */
1428: pc1 = b->a + bi[i];
1429: pj = b->j + bi[i];
1430: nz = bi[i + 1] - bi[i];
1431: for (j = 0; j < nz; j++) {
1432: col = pj[j];
1433: pc1[j] = rtmp1[col];
1434: rs += PetscAbsScalar(pc1[j]);
1435: }
1436: /* U part */
1437: pc1 = b->a + bdiag[i + 1] + 1;
1438: pj = b->j + bdiag[i + 1] + 1;
1439: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1440: for (j = 0; j < nz; j++) {
1441: col = pj[j];
1442: pc1[j] = rtmp1[col];
1443: rs += PetscAbsScalar(pc1[j]);
1444: }
1446: sctx.rs = rs;
1447: sctx.pv = rtmp1[i];
1448: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1449: if (sctx.newshift) break;
1450: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1451: *pc1 = 1.0 / sctx.pv;
1453: /* Now take care of diagonal 2x2 block. */
1454: pc2 = rtmp2 + i;
1455: if (*pc2 != 0.0) {
1456: mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */
1457: *pc2 = mul1; /* insert L entry */
1458: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1459: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460: for (j = 0; j < nz; j++) {
1461: col = pj[j];
1462: rtmp2[col] -= mul1 * rtmp1[col];
1463: }
1464: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465: }
1467: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468: rs = 0.0;
1469: /* L part */
1470: pc2 = b->a + bi[i + 1];
1471: pj = b->j + bi[i + 1];
1472: nz = bi[i + 2] - bi[i + 1];
1473: for (j = 0; j < nz; j++) {
1474: col = pj[j];
1475: pc2[j] = rtmp2[col];
1476: rs += PetscAbsScalar(pc2[j]);
1477: }
1478: /* U part */
1479: pc2 = b->a + bdiag[i + 2] + 1;
1480: pj = b->j + bdiag[i + 2] + 1;
1481: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482: for (j = 0; j < nz; j++) {
1483: col = pj[j];
1484: pc2[j] = rtmp2[col];
1485: rs += PetscAbsScalar(pc2[j]);
1486: }
1488: sctx.rs = rs;
1489: sctx.pv = rtmp2[i + 1];
1490: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1491: if (sctx.newshift) break;
1492: pc2 = b->a + bdiag[i + 1];
1493: *pc2 = 1.0 / sctx.pv;
1494: break;
1496: case 3:
1497: /* zero rtmp */
1498: /* L part */
1499: nz = bi[i + 1] - bi[i];
1500: bjtmp = bj + bi[i];
1501: for (j = 0; j < nz; j++) {
1502: col = bjtmp[j];
1503: rtmp1[col] = 0.0;
1504: rtmp2[col] = 0.0;
1505: rtmp3[col] = 0.0;
1506: }
1508: /* U part */
1509: nz = bdiag[i] - bdiag[i + 1];
1510: bjtmp = bj + bdiag[i + 1] + 1;
1511: for (j = 0; j < nz; j++) {
1512: col = bjtmp[j];
1513: rtmp1[col] = 0.0;
1514: rtmp2[col] = 0.0;
1515: rtmp3[col] = 0.0;
1516: }
1518: /* load in initial (unfactored row) */
1519: nz = ai[r[i] + 1] - ai[r[i]];
1520: ajtmp = aj + ai[r[i]];
1521: v1 = aa + ai[r[i]];
1522: v2 = aa + ai[r[i] + 1];
1523: v3 = aa + ai[r[i] + 2];
1524: for (j = 0; j < nz; j++) {
1525: col = ics[ajtmp[j]];
1526: rtmp1[col] = v1[j];
1527: rtmp2[col] = v2[j];
1528: rtmp3[col] = v3[j];
1529: }
1530: /* ZeropivotApply(): shift the diagonal of the matrix */
1531: rtmp1[i] += sctx.shift_amount;
1532: rtmp2[i + 1] += sctx.shift_amount;
1533: rtmp3[i + 2] += sctx.shift_amount;
1535: /* elimination */
1536: bjtmp = bj + bi[i];
1537: row = *bjtmp++; /* pivot row */
1538: nzL = bi[i + 1] - bi[i];
1539: for (k = 0; k < nzL; k++) {
1540: pc1 = rtmp1 + row;
1541: pc2 = rtmp2 + row;
1542: pc3 = rtmp3 + row;
1543: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1544: pv = b->a + bdiag[row];
1545: mul1 = *pc1 * (*pv);
1546: mul2 = *pc2 * (*pv);
1547: mul3 = *pc3 * (*pv);
1548: *pc1 = mul1;
1549: *pc2 = mul2;
1550: *pc3 = mul3;
1552: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1553: pv = b->a + bdiag[row + 1] + 1;
1554: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1555: for (j = 0; j < nz; j++) {
1556: col = pj[j];
1557: rtmp1[col] -= mul1 * pv[j];
1558: rtmp2[col] -= mul2 * pv[j];
1559: rtmp3[col] -= mul3 * pv[j];
1560: }
1561: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1562: }
1563: row = *bjtmp++;
1564: }
1566: /* finished row i; check zero pivot, then stick row i into b->a */
1567: rs = 0.0;
1568: /* L part */
1569: pc1 = b->a + bi[i];
1570: pj = b->j + bi[i];
1571: nz = bi[i + 1] - bi[i];
1572: for (j = 0; j < nz; j++) {
1573: col = pj[j];
1574: pc1[j] = rtmp1[col];
1575: rs += PetscAbsScalar(pc1[j]);
1576: }
1577: /* U part */
1578: pc1 = b->a + bdiag[i + 1] + 1;
1579: pj = b->j + bdiag[i + 1] + 1;
1580: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581: for (j = 0; j < nz; j++) {
1582: col = pj[j];
1583: pc1[j] = rtmp1[col];
1584: rs += PetscAbsScalar(pc1[j]);
1585: }
1587: sctx.rs = rs;
1588: sctx.pv = rtmp1[i];
1589: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1590: if (sctx.newshift) break;
1591: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1592: *pc1 = 1.0 / sctx.pv;
1594: /* Now take care of 1st column of diagonal 3x3 block. */
1595: pc2 = rtmp2 + i;
1596: pc3 = rtmp3 + i;
1597: if (*pc2 != 0.0 || *pc3 != 0.0) {
1598: mul2 = (*pc2) * (*pc1);
1599: *pc2 = mul2;
1600: mul3 = (*pc3) * (*pc1);
1601: *pc3 = mul3;
1602: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1603: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1604: for (j = 0; j < nz; j++) {
1605: col = pj[j];
1606: rtmp2[col] -= mul2 * rtmp1[col];
1607: rtmp3[col] -= mul3 * rtmp1[col];
1608: }
1609: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1610: }
1612: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613: rs = 0.0;
1614: /* L part */
1615: pc2 = b->a + bi[i + 1];
1616: pj = b->j + bi[i + 1];
1617: nz = bi[i + 2] - bi[i + 1];
1618: for (j = 0; j < nz; j++) {
1619: col = pj[j];
1620: pc2[j] = rtmp2[col];
1621: rs += PetscAbsScalar(pc2[j]);
1622: }
1623: /* U part */
1624: pc2 = b->a + bdiag[i + 2] + 1;
1625: pj = b->j + bdiag[i + 2] + 1;
1626: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627: for (j = 0; j < nz; j++) {
1628: col = pj[j];
1629: pc2[j] = rtmp2[col];
1630: rs += PetscAbsScalar(pc2[j]);
1631: }
1633: sctx.rs = rs;
1634: sctx.pv = rtmp2[i + 1];
1635: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1636: if (sctx.newshift) break;
1637: pc2 = b->a + bdiag[i + 1];
1638: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1640: /* Now take care of 2nd column of diagonal 3x3 block. */
1641: pc3 = rtmp3 + i + 1;
1642: if (*pc3 != 0.0) {
1643: mul3 = (*pc3) * (*pc2);
1644: *pc3 = mul3;
1645: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1646: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1647: for (j = 0; j < nz; j++) {
1648: col = pj[j];
1649: rtmp3[col] -= mul3 * rtmp2[col];
1650: }
1651: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1652: }
1654: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1655: rs = 0.0;
1656: /* L part */
1657: pc3 = b->a + bi[i + 2];
1658: pj = b->j + bi[i + 2];
1659: nz = bi[i + 3] - bi[i + 2];
1660: for (j = 0; j < nz; j++) {
1661: col = pj[j];
1662: pc3[j] = rtmp3[col];
1663: rs += PetscAbsScalar(pc3[j]);
1664: }
1665: /* U part */
1666: pc3 = b->a + bdiag[i + 3] + 1;
1667: pj = b->j + bdiag[i + 3] + 1;
1668: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1669: for (j = 0; j < nz; j++) {
1670: col = pj[j];
1671: pc3[j] = rtmp3[col];
1672: rs += PetscAbsScalar(pc3[j]);
1673: }
1675: sctx.rs = rs;
1676: sctx.pv = rtmp3[i + 2];
1677: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1678: if (sctx.newshift) break;
1679: pc3 = b->a + bdiag[i + 2];
1680: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1681: break;
1682: case 4:
1683: /* zero rtmp */
1684: /* L part */
1685: nz = bi[i + 1] - bi[i];
1686: bjtmp = bj + bi[i];
1687: for (j = 0; j < nz; j++) {
1688: col = bjtmp[j];
1689: rtmp1[col] = 0.0;
1690: rtmp2[col] = 0.0;
1691: rtmp3[col] = 0.0;
1692: rtmp4[col] = 0.0;
1693: }
1695: /* U part */
1696: nz = bdiag[i] - bdiag[i + 1];
1697: bjtmp = bj + bdiag[i + 1] + 1;
1698: for (j = 0; j < nz; j++) {
1699: col = bjtmp[j];
1700: rtmp1[col] = 0.0;
1701: rtmp2[col] = 0.0;
1702: rtmp3[col] = 0.0;
1703: rtmp4[col] = 0.0;
1704: }
1706: /* load in initial (unfactored row) */
1707: nz = ai[r[i] + 1] - ai[r[i]];
1708: ajtmp = aj + ai[r[i]];
1709: v1 = aa + ai[r[i]];
1710: v2 = aa + ai[r[i] + 1];
1711: v3 = aa + ai[r[i] + 2];
1712: v4 = aa + ai[r[i] + 3];
1713: for (j = 0; j < nz; j++) {
1714: col = ics[ajtmp[j]];
1715: rtmp1[col] = v1[j];
1716: rtmp2[col] = v2[j];
1717: rtmp3[col] = v3[j];
1718: rtmp4[col] = v4[j];
1719: }
1720: /* ZeropivotApply(): shift the diagonal of the matrix */
1721: rtmp1[i] += sctx.shift_amount;
1722: rtmp2[i + 1] += sctx.shift_amount;
1723: rtmp3[i + 2] += sctx.shift_amount;
1724: rtmp4[i + 3] += sctx.shift_amount;
1726: /* elimination */
1727: bjtmp = bj + bi[i];
1728: row = *bjtmp++; /* pivot row */
1729: nzL = bi[i + 1] - bi[i];
1730: for (k = 0; k < nzL; k++) {
1731: pc1 = rtmp1 + row;
1732: pc2 = rtmp2 + row;
1733: pc3 = rtmp3 + row;
1734: pc4 = rtmp4 + row;
1735: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1736: pv = b->a + bdiag[row];
1737: mul1 = *pc1 * (*pv);
1738: mul2 = *pc2 * (*pv);
1739: mul3 = *pc3 * (*pv);
1740: mul4 = *pc4 * (*pv);
1741: *pc1 = mul1;
1742: *pc2 = mul2;
1743: *pc3 = mul3;
1744: *pc4 = mul4;
1746: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1747: pv = b->a + bdiag[row + 1] + 1;
1748: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1749: for (j = 0; j < nz; j++) {
1750: col = pj[j];
1751: rtmp1[col] -= mul1 * pv[j];
1752: rtmp2[col] -= mul2 * pv[j];
1753: rtmp3[col] -= mul3 * pv[j];
1754: rtmp4[col] -= mul4 * pv[j];
1755: }
1756: PetscCall(PetscLogFlops(4 + 8.0 * nz));
1757: }
1758: row = *bjtmp++;
1759: }
1761: /* finished row i; check zero pivot, then stick row i into b->a */
1762: rs = 0.0;
1763: /* L part */
1764: pc1 = b->a + bi[i];
1765: pj = b->j + bi[i];
1766: nz = bi[i + 1] - bi[i];
1767: for (j = 0; j < nz; j++) {
1768: col = pj[j];
1769: pc1[j] = rtmp1[col];
1770: rs += PetscAbsScalar(pc1[j]);
1771: }
1772: /* U part */
1773: pc1 = b->a + bdiag[i + 1] + 1;
1774: pj = b->j + bdiag[i + 1] + 1;
1775: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1776: for (j = 0; j < nz; j++) {
1777: col = pj[j];
1778: pc1[j] = rtmp1[col];
1779: rs += PetscAbsScalar(pc1[j]);
1780: }
1782: sctx.rs = rs;
1783: sctx.pv = rtmp1[i];
1784: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1785: if (sctx.newshift) break;
1786: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1787: *pc1 = 1.0 / sctx.pv;
1789: /* Now take care of 1st column of diagonal 4x4 block. */
1790: pc2 = rtmp2 + i;
1791: pc3 = rtmp3 + i;
1792: pc4 = rtmp4 + i;
1793: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1794: mul2 = (*pc2) * (*pc1);
1795: *pc2 = mul2;
1796: mul3 = (*pc3) * (*pc1);
1797: *pc3 = mul3;
1798: mul4 = (*pc4) * (*pc1);
1799: *pc4 = mul4;
1800: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1801: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1802: for (j = 0; j < nz; j++) {
1803: col = pj[j];
1804: rtmp2[col] -= mul2 * rtmp1[col];
1805: rtmp3[col] -= mul3 * rtmp1[col];
1806: rtmp4[col] -= mul4 * rtmp1[col];
1807: }
1808: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1809: }
1811: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1812: rs = 0.0;
1813: /* L part */
1814: pc2 = b->a + bi[i + 1];
1815: pj = b->j + bi[i + 1];
1816: nz = bi[i + 2] - bi[i + 1];
1817: for (j = 0; j < nz; j++) {
1818: col = pj[j];
1819: pc2[j] = rtmp2[col];
1820: rs += PetscAbsScalar(pc2[j]);
1821: }
1822: /* U part */
1823: pc2 = b->a + bdiag[i + 2] + 1;
1824: pj = b->j + bdiag[i + 2] + 1;
1825: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1826: for (j = 0; j < nz; j++) {
1827: col = pj[j];
1828: pc2[j] = rtmp2[col];
1829: rs += PetscAbsScalar(pc2[j]);
1830: }
1832: sctx.rs = rs;
1833: sctx.pv = rtmp2[i + 1];
1834: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1835: if (sctx.newshift) break;
1836: pc2 = b->a + bdiag[i + 1];
1837: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1839: /* Now take care of 2nd column of diagonal 4x4 block. */
1840: pc3 = rtmp3 + i + 1;
1841: pc4 = rtmp4 + i + 1;
1842: if (*pc3 != 0.0 || *pc4 != 0.0) {
1843: mul3 = (*pc3) * (*pc2);
1844: *pc3 = mul3;
1845: mul4 = (*pc4) * (*pc2);
1846: *pc4 = mul4;
1847: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1848: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1849: for (j = 0; j < nz; j++) {
1850: col = pj[j];
1851: rtmp3[col] -= mul3 * rtmp2[col];
1852: rtmp4[col] -= mul4 * rtmp2[col];
1853: }
1854: PetscCall(PetscLogFlops(4.0 * nz));
1855: }
1857: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1858: rs = 0.0;
1859: /* L part */
1860: pc3 = b->a + bi[i + 2];
1861: pj = b->j + bi[i + 2];
1862: nz = bi[i + 3] - bi[i + 2];
1863: for (j = 0; j < nz; j++) {
1864: col = pj[j];
1865: pc3[j] = rtmp3[col];
1866: rs += PetscAbsScalar(pc3[j]);
1867: }
1868: /* U part */
1869: pc3 = b->a + bdiag[i + 3] + 1;
1870: pj = b->j + bdiag[i + 3] + 1;
1871: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1872: for (j = 0; j < nz; j++) {
1873: col = pj[j];
1874: pc3[j] = rtmp3[col];
1875: rs += PetscAbsScalar(pc3[j]);
1876: }
1878: sctx.rs = rs;
1879: sctx.pv = rtmp3[i + 2];
1880: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1881: if (sctx.newshift) break;
1882: pc3 = b->a + bdiag[i + 2];
1883: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1885: /* Now take care of 3rd column of diagonal 4x4 block. */
1886: pc4 = rtmp4 + i + 2;
1887: if (*pc4 != 0.0) {
1888: mul4 = (*pc4) * (*pc3);
1889: *pc4 = mul4;
1890: pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */
1891: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1892: for (j = 0; j < nz; j++) {
1893: col = pj[j];
1894: rtmp4[col] -= mul4 * rtmp3[col];
1895: }
1896: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1897: }
1899: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1900: rs = 0.0;
1901: /* L part */
1902: pc4 = b->a + bi[i + 3];
1903: pj = b->j + bi[i + 3];
1904: nz = bi[i + 4] - bi[i + 3];
1905: for (j = 0; j < nz; j++) {
1906: col = pj[j];
1907: pc4[j] = rtmp4[col];
1908: rs += PetscAbsScalar(pc4[j]);
1909: }
1910: /* U part */
1911: pc4 = b->a + bdiag[i + 4] + 1;
1912: pj = b->j + bdiag[i + 4] + 1;
1913: nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1914: for (j = 0; j < nz; j++) {
1915: col = pj[j];
1916: pc4[j] = rtmp4[col];
1917: rs += PetscAbsScalar(pc4[j]);
1918: }
1920: sctx.rs = rs;
1921: sctx.pv = rtmp4[i + 3];
1922: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1923: if (sctx.newshift) break;
1924: pc4 = b->a + bdiag[i + 3];
1925: *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1926: break;
1928: default:
1929: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1930: }
1931: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1932: i += nodesz; /* Update the row */
1933: }
1935: /* MatPivotRefine() */
1936: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1937: /*
1938: * if no shift in this attempt & shifting & started shifting & can refine,
1939: * then try lower shift
1940: */
1941: sctx.shift_hi = sctx.shift_fraction;
1942: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1943: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1944: sctx.newshift = PETSC_TRUE;
1945: sctx.nshift++;
1946: }
1947: } while (sctx.newshift);
1949: PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1950: PetscCall(PetscFree(tmp_vec2));
1951: PetscCall(ISRestoreIndices(isicol, &ic));
1952: PetscCall(ISRestoreIndices(isrow, &r));
1954: if (b->inode.size_csr) {
1955: C->ops->solve = MatSolve_SeqAIJ_Inode;
1956: } else {
1957: C->ops->solve = MatSolve_SeqAIJ;
1958: }
1959: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1960: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1961: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1962: C->ops->matsolve = MatMatSolve_SeqAIJ;
1963: C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1964: C->assembled = PETSC_TRUE;
1965: C->preallocated = PETSC_TRUE;
1967: PetscCall(PetscLogFlops(C->cmap->n));
1969: /* MatShiftView(A,info,&sctx) */
1970: if (sctx.nshift) {
1971: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1972: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1974: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1976: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1977: }
1978: }
1979: PetscFunctionReturn(PETSC_SUCCESS);
1980: }
1982: #if 0
1983: // unused
1984: static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1985: {
1986: Mat C = B;
1987: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1988: IS iscol = b->col, isrow = b->row, isicol = b->icol;
1989: const PetscInt *r, *ic, *c, *ics;
1990: PetscInt n = A->rmap->n, *bi = b->i;
1991: PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1992: PetscInt i, j, idx, *bd = b->diag, node_max, nodesz;
1993: PetscInt *ai = a->i, *aj = a->j;
1994: PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1995: PetscScalar mul1, mul2, mul3, tmp;
1996: MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1997: const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1;
1998: PetscReal rs = 0.0;
1999: FactorShiftCtx sctx;
2001: PetscFunctionBegin;
2002: sctx.shift_top = 0;
2003: sctx.nshift_max = 0;
2004: sctx.shift_lo = 0;
2005: sctx.shift_hi = 0;
2006: sctx.shift_fraction = 0;
2008: /* if both shift schemes are chosen by user, only use info->shiftpd */
2009: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2010: sctx.shift_top = 0;
2011: for (i = 0; i < n; i++) {
2012: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2013: rs = 0.0;
2014: ajtmp = aj + ai[i];
2015: rtmp1 = aa + ai[i];
2016: nz = ai[i + 1] - ai[i];
2017: for (j = 0; j < nz; j++) {
2018: if (*ajtmp != i) {
2019: rs += PetscAbsScalar(*rtmp1++);
2020: } else {
2021: rs -= PetscRealPart(*rtmp1++);
2022: }
2023: ajtmp++;
2024: }
2025: if (rs > sctx.shift_top) sctx.shift_top = rs;
2026: }
2027: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2028: sctx.shift_top *= 1.1;
2029: sctx.nshift_max = 5;
2030: sctx.shift_lo = 0.;
2031: sctx.shift_hi = 1.;
2032: }
2033: sctx.shift_amount = 0;
2034: sctx.nshift = 0;
2036: PetscCall(ISGetIndices(isrow, &r));
2037: PetscCall(ISGetIndices(iscol, &c));
2038: PetscCall(ISGetIndices(isicol, &ic));
2039: PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2040: ics = ic;
2042: node_max = a->inode.node_count;
2043: ns = a->inode.size;
2044: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2046: /* If max inode size > 3, split it into two inodes.*/
2047: /* also map the inode sizes according to the ordering */
2048: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2049: for (i = 0, j = 0; i < node_max; ++i, ++j) {
2050: if (ns[i] > 3) {
2051: tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */
2052: ++j;
2053: tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2054: } else {
2055: tmp_vec1[j] = ns[i];
2056: }
2057: }
2058: /* Use the correct node_max */
2059: node_max = j;
2061: /* Now reorder the inode info based on mat re-ordering info */
2062: /* First create a row -> inode_size_array_index map */
2063: PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2064: for (i = 0, row = 0; i < node_max; i++) {
2065: nodesz = tmp_vec1[i];
2066: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2067: }
2068: /* Using nsmap, create a reordered ns structure */
2069: for (i = 0, j = 0; i < node_max; i++) {
2070: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2071: tmp_vec2[i] = nodesz;
2072: j += nodesz;
2073: }
2074: PetscCall(PetscFree2(nsmap, tmp_vec1));
2075: /* Now use the correct ns */
2076: ns = tmp_vec2;
2078: do {
2079: sctx.newshift = PETSC_FALSE;
2080: /* Now loop over each block-row, and do the factorization */
2081: for (i = 0, row = 0; i < node_max; i++) {
2082: nodesz = ns[i];
2083: nz = bi[row + 1] - bi[row];
2084: bjtmp = bj + bi[row];
2086: switch (nodesz) {
2087: case 1:
2088: for (j = 0; j < nz; j++) {
2089: idx = bjtmp[j];
2090: rtmp11[idx] = 0.0;
2091: }
2093: /* load in initial (unfactored row) */
2094: idx = r[row];
2095: nz_tmp = ai[idx + 1] - ai[idx];
2096: ajtmp = aj + ai[idx];
2097: v1 = aa + ai[idx];
2099: for (j = 0; j < nz_tmp; j++) {
2100: idx = ics[ajtmp[j]];
2101: rtmp11[idx] = v1[j];
2102: }
2103: rtmp11[ics[r[row]]] += sctx.shift_amount;
2105: prow = *bjtmp++;
2106: while (prow < row) {
2107: pc1 = rtmp11 + prow;
2108: if (*pc1 != 0.0) {
2109: pv = ba + bd[prow];
2110: pj = nbj + bd[prow];
2111: mul1 = *pc1 * *pv++;
2112: *pc1 = mul1;
2113: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2114: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2115: for (j = 0; j < nz_tmp; j++) {
2116: tmp = pv[j];
2117: idx = pj[j];
2118: rtmp11[idx] -= mul1 * tmp;
2119: }
2120: }
2121: prow = *bjtmp++;
2122: }
2123: pj = bj + bi[row];
2124: pc1 = ba + bi[row];
2126: sctx.pv = rtmp11[row];
2127: rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2128: rs = 0.0;
2129: for (j = 0; j < nz; j++) {
2130: idx = pj[j];
2131: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2132: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2133: }
2134: sctx.rs = rs;
2135: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2136: if (sctx.newshift) goto endofwhile;
2137: break;
2139: case 2:
2140: for (j = 0; j < nz; j++) {
2141: idx = bjtmp[j];
2142: rtmp11[idx] = 0.0;
2143: rtmp22[idx] = 0.0;
2144: }
2146: /* load in initial (unfactored row) */
2147: idx = r[row];
2148: nz_tmp = ai[idx + 1] - ai[idx];
2149: ajtmp = aj + ai[idx];
2150: v1 = aa + ai[idx];
2151: v2 = aa + ai[idx + 1];
2152: for (j = 0; j < nz_tmp; j++) {
2153: idx = ics[ajtmp[j]];
2154: rtmp11[idx] = v1[j];
2155: rtmp22[idx] = v2[j];
2156: }
2157: rtmp11[ics[r[row]]] += sctx.shift_amount;
2158: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2160: prow = *bjtmp++;
2161: while (prow < row) {
2162: pc1 = rtmp11 + prow;
2163: pc2 = rtmp22 + prow;
2164: if (*pc1 != 0.0 || *pc2 != 0.0) {
2165: pv = ba + bd[prow];
2166: pj = nbj + bd[prow];
2167: mul1 = *pc1 * *pv;
2168: mul2 = *pc2 * *pv;
2169: ++pv;
2170: *pc1 = mul1;
2171: *pc2 = mul2;
2173: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2174: for (j = 0; j < nz_tmp; j++) {
2175: tmp = pv[j];
2176: idx = pj[j];
2177: rtmp11[idx] -= mul1 * tmp;
2178: rtmp22[idx] -= mul2 * tmp;
2179: }
2180: PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2181: }
2182: prow = *bjtmp++;
2183: }
2185: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2186: pc1 = rtmp11 + prow;
2187: pc2 = rtmp22 + prow;
2189: sctx.pv = *pc1;
2190: pj = bj + bi[prow];
2191: rs = 0.0;
2192: for (j = 0; j < nz; j++) {
2193: idx = pj[j];
2194: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2195: }
2196: sctx.rs = rs;
2197: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2198: if (sctx.newshift) goto endofwhile;
2200: if (*pc2 != 0.0) {
2201: pj = nbj + bd[prow];
2202: mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2203: *pc2 = mul2;
2204: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2205: for (j = 0; j < nz_tmp; j++) {
2206: idx = pj[j];
2207: tmp = rtmp11[idx];
2208: rtmp22[idx] -= mul2 * tmp;
2209: }
2210: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2211: }
2213: pj = bj + bi[row];
2214: pc1 = ba + bi[row];
2215: pc2 = ba + bi[row + 1];
2217: sctx.pv = rtmp22[row + 1];
2218: rs = 0.0;
2219: rtmp11[row] = 1.0 / rtmp11[row];
2220: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2221: /* copy row entries from dense representation to sparse */
2222: for (j = 0; j < nz; j++) {
2223: idx = pj[j];
2224: pc1[j] = rtmp11[idx];
2225: pc2[j] = rtmp22[idx];
2226: if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2227: }
2228: sctx.rs = rs;
2229: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2230: if (sctx.newshift) goto endofwhile;
2231: break;
2233: case 3:
2234: for (j = 0; j < nz; j++) {
2235: idx = bjtmp[j];
2236: rtmp11[idx] = 0.0;
2237: rtmp22[idx] = 0.0;
2238: rtmp33[idx] = 0.0;
2239: }
2240: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2241: idx = r[row];
2242: nz_tmp = ai[idx + 1] - ai[idx];
2243: ajtmp = aj + ai[idx];
2244: v1 = aa + ai[idx];
2245: v2 = aa + ai[idx + 1];
2246: v3 = aa + ai[idx + 2];
2247: for (j = 0; j < nz_tmp; j++) {
2248: idx = ics[ajtmp[j]];
2249: rtmp11[idx] = v1[j];
2250: rtmp22[idx] = v2[j];
2251: rtmp33[idx] = v3[j];
2252: }
2253: rtmp11[ics[r[row]]] += sctx.shift_amount;
2254: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2255: rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2257: /* loop over all pivot row blocks above this row block */
2258: prow = *bjtmp++;
2259: while (prow < row) {
2260: pc1 = rtmp11 + prow;
2261: pc2 = rtmp22 + prow;
2262: pc3 = rtmp33 + prow;
2263: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2264: pv = ba + bd[prow];
2265: pj = nbj + bd[prow];
2266: mul1 = *pc1 * *pv;
2267: mul2 = *pc2 * *pv;
2268: mul3 = *pc3 * *pv;
2269: ++pv;
2270: *pc1 = mul1;
2271: *pc2 = mul2;
2272: *pc3 = mul3;
2274: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2275: /* update this row based on pivot row */
2276: for (j = 0; j < nz_tmp; j++) {
2277: tmp = pv[j];
2278: idx = pj[j];
2279: rtmp11[idx] -= mul1 * tmp;
2280: rtmp22[idx] -= mul2 * tmp;
2281: rtmp33[idx] -= mul3 * tmp;
2282: }
2283: PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2284: }
2285: prow = *bjtmp++;
2286: }
2288: /* Now take care of diagonal 3x3 block in this set of rows */
2289: /* note: prow = row here */
2290: pc1 = rtmp11 + prow;
2291: pc2 = rtmp22 + prow;
2292: pc3 = rtmp33 + prow;
2294: sctx.pv = *pc1;
2295: pj = bj + bi[prow];
2296: rs = 0.0;
2297: for (j = 0; j < nz; j++) {
2298: idx = pj[j];
2299: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2300: }
2301: sctx.rs = rs;
2302: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2303: if (sctx.newshift) goto endofwhile;
2305: if (*pc2 != 0.0 || *pc3 != 0.0) {
2306: mul2 = (*pc2) / (*pc1);
2307: mul3 = (*pc3) / (*pc1);
2308: *pc2 = mul2;
2309: *pc3 = mul3;
2310: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2311: pj = nbj + bd[prow];
2312: for (j = 0; j < nz_tmp; j++) {
2313: idx = pj[j];
2314: tmp = rtmp11[idx];
2315: rtmp22[idx] -= mul2 * tmp;
2316: rtmp33[idx] -= mul3 * tmp;
2317: }
2318: PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2319: }
2320: ++prow;
2322: pc2 = rtmp22 + prow;
2323: pc3 = rtmp33 + prow;
2324: sctx.pv = *pc2;
2325: pj = bj + bi[prow];
2326: rs = 0.0;
2327: for (j = 0; j < nz; j++) {
2328: idx = pj[j];
2329: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2330: }
2331: sctx.rs = rs;
2332: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2333: if (sctx.newshift) goto endofwhile;
2335: if (*pc3 != 0.0) {
2336: mul3 = (*pc3) / (*pc2);
2337: *pc3 = mul3;
2338: pj = nbj + bd[prow];
2339: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2340: for (j = 0; j < nz_tmp; j++) {
2341: idx = pj[j];
2342: tmp = rtmp22[idx];
2343: rtmp33[idx] -= mul3 * tmp;
2344: }
2345: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2346: }
2348: pj = bj + bi[row];
2349: pc1 = ba + bi[row];
2350: pc2 = ba + bi[row + 1];
2351: pc3 = ba + bi[row + 2];
2353: sctx.pv = rtmp33[row + 2];
2354: rs = 0.0;
2355: rtmp11[row] = 1.0 / rtmp11[row];
2356: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2357: rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2358: /* copy row entries from dense representation to sparse */
2359: for (j = 0; j < nz; j++) {
2360: idx = pj[j];
2361: pc1[j] = rtmp11[idx];
2362: pc2[j] = rtmp22[idx];
2363: pc3[j] = rtmp33[idx];
2364: if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2365: }
2367: sctx.rs = rs;
2368: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2369: if (sctx.newshift) goto endofwhile;
2370: break;
2372: default:
2373: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2374: }
2375: row += nodesz; /* Update the row */
2376: }
2377: endofwhile:;
2378: } while (sctx.newshift);
2379: PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2380: PetscCall(PetscFree(tmp_vec2));
2381: PetscCall(ISRestoreIndices(isicol, &ic));
2382: PetscCall(ISRestoreIndices(isrow, &r));
2383: PetscCall(ISRestoreIndices(iscol, &c));
2385: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2386: /* do not set solve add, since MatSolve_Inode + Add is faster */
2387: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2388: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2389: C->assembled = PETSC_TRUE;
2390: C->preallocated = PETSC_TRUE;
2391: if (sctx.nshift) {
2392: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2393: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2394: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2395: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2396: }
2397: }
2398: PetscCall(PetscLogFlops(C->cmap->n));
2399: PetscCall(MatSeqAIJCheckInode(C));
2400: PetscFunctionReturn(PETSC_SUCCESS);
2401: }
2402: #endif
2404: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2405: {
2406: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2407: IS iscol = a->col, isrow = a->row;
2408: const PetscInt *r, *c, *rout, *cout;
2409: PetscInt i, j;
2410: PetscInt node_max, row, nsz, aii, i0, i1, nz;
2411: const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2412: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
2413: PetscScalar sum1, sum2, sum3, sum4, sum5;
2414: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2415: const PetscScalar *b;
2417: PetscFunctionBegin;
2418: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2419: node_max = a->inode.node_count;
2420: ns = a->inode.size_csr; /* Node Size array */
2422: PetscCall(VecGetArrayRead(bb, &b));
2423: PetscCall(VecGetArrayWrite(xx, &x));
2424: tmp = a->solve_work;
2426: PetscCall(ISGetIndices(isrow, &rout));
2427: r = rout;
2428: PetscCall(ISGetIndices(iscol, &cout));
2429: c = cout;
2431: /* forward solve the lower triangular */
2432: tmps = tmp;
2433: aa = a_a;
2434: aj = a_j;
2435: ad = a->diag;
2437: for (i = 0; i < node_max; ++i) {
2438: row = ns[i];
2439: nsz = ns[i + 1] - ns[i];
2440: aii = ai[row];
2441: v1 = aa + aii;
2442: vi = aj + aii;
2443: nz = ai[row + 1] - ai[row];
2445: if (i < node_max - 1) {
2446: /* Prefetch the indices for the next block */
2447: PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2448: /* Prefetch the data for the next block */
2449: PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2450: }
2452: switch (nsz) { /* Each loop in 'case' is unrolled */
2453: case 1:
2454: sum1 = b[r[row]];
2455: for (j = 0; j < nz - 1; j += 2) {
2456: i0 = vi[j];
2457: i1 = vi[j + 1];
2458: tmp0 = tmps[i0];
2459: tmp1 = tmps[i1];
2460: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2461: }
2462: if (j == nz - 1) {
2463: tmp0 = tmps[vi[j]];
2464: sum1 -= v1[j] * tmp0;
2465: }
2466: tmp[row++] = sum1;
2467: break;
2468: case 2:
2469: sum1 = b[r[row]];
2470: sum2 = b[r[row + 1]];
2471: v2 = aa + ai[row + 1];
2473: for (j = 0; j < nz - 1; j += 2) {
2474: i0 = vi[j];
2475: i1 = vi[j + 1];
2476: tmp0 = tmps[i0];
2477: tmp1 = tmps[i1];
2478: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2479: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2480: }
2481: if (j == nz - 1) {
2482: tmp0 = tmps[vi[j]];
2483: sum1 -= v1[j] * tmp0;
2484: sum2 -= v2[j] * tmp0;
2485: }
2486: sum2 -= v2[nz] * sum1;
2487: tmp[row++] = sum1;
2488: tmp[row++] = sum2;
2489: break;
2490: case 3:
2491: sum1 = b[r[row]];
2492: sum2 = b[r[row + 1]];
2493: sum3 = b[r[row + 2]];
2494: v2 = aa + ai[row + 1];
2495: v3 = aa + ai[row + 2];
2497: for (j = 0; j < nz - 1; j += 2) {
2498: i0 = vi[j];
2499: i1 = vi[j + 1];
2500: tmp0 = tmps[i0];
2501: tmp1 = tmps[i1];
2502: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2503: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2504: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2505: }
2506: if (j == nz - 1) {
2507: tmp0 = tmps[vi[j]];
2508: sum1 -= v1[j] * tmp0;
2509: sum2 -= v2[j] * tmp0;
2510: sum3 -= v3[j] * tmp0;
2511: }
2512: sum2 -= v2[nz] * sum1;
2513: sum3 -= v3[nz] * sum1;
2514: sum3 -= v3[nz + 1] * sum2;
2515: tmp[row++] = sum1;
2516: tmp[row++] = sum2;
2517: tmp[row++] = sum3;
2518: break;
2520: case 4:
2521: sum1 = b[r[row]];
2522: sum2 = b[r[row + 1]];
2523: sum3 = b[r[row + 2]];
2524: sum4 = b[r[row + 3]];
2525: v2 = aa + ai[row + 1];
2526: v3 = aa + ai[row + 2];
2527: v4 = aa + ai[row + 3];
2529: for (j = 0; j < nz - 1; j += 2) {
2530: i0 = vi[j];
2531: i1 = vi[j + 1];
2532: tmp0 = tmps[i0];
2533: tmp1 = tmps[i1];
2534: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2535: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2536: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2537: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2538: }
2539: if (j == nz - 1) {
2540: tmp0 = tmps[vi[j]];
2541: sum1 -= v1[j] * tmp0;
2542: sum2 -= v2[j] * tmp0;
2543: sum3 -= v3[j] * tmp0;
2544: sum4 -= v4[j] * tmp0;
2545: }
2546: sum2 -= v2[nz] * sum1;
2547: sum3 -= v3[nz] * sum1;
2548: sum4 -= v4[nz] * sum1;
2549: sum3 -= v3[nz + 1] * sum2;
2550: sum4 -= v4[nz + 1] * sum2;
2551: sum4 -= v4[nz + 2] * sum3;
2553: tmp[row++] = sum1;
2554: tmp[row++] = sum2;
2555: tmp[row++] = sum3;
2556: tmp[row++] = sum4;
2557: break;
2558: case 5:
2559: sum1 = b[r[row]];
2560: sum2 = b[r[row + 1]];
2561: sum3 = b[r[row + 2]];
2562: sum4 = b[r[row + 3]];
2563: sum5 = b[r[row + 4]];
2564: v2 = aa + ai[row + 1];
2565: v3 = aa + ai[row + 2];
2566: v4 = aa + ai[row + 3];
2567: v5 = aa + ai[row + 4];
2569: for (j = 0; j < nz - 1; j += 2) {
2570: i0 = vi[j];
2571: i1 = vi[j + 1];
2572: tmp0 = tmps[i0];
2573: tmp1 = tmps[i1];
2574: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2575: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2576: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2577: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2578: sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2579: }
2580: if (j == nz - 1) {
2581: tmp0 = tmps[vi[j]];
2582: sum1 -= v1[j] * tmp0;
2583: sum2 -= v2[j] * tmp0;
2584: sum3 -= v3[j] * tmp0;
2585: sum4 -= v4[j] * tmp0;
2586: sum5 -= v5[j] * tmp0;
2587: }
2589: sum2 -= v2[nz] * sum1;
2590: sum3 -= v3[nz] * sum1;
2591: sum4 -= v4[nz] * sum1;
2592: sum5 -= v5[nz] * sum1;
2593: sum3 -= v3[nz + 1] * sum2;
2594: sum4 -= v4[nz + 1] * sum2;
2595: sum5 -= v5[nz + 1] * sum2;
2596: sum4 -= v4[nz + 2] * sum3;
2597: sum5 -= v5[nz + 2] * sum3;
2598: sum5 -= v5[nz + 3] * sum4;
2600: tmp[row++] = sum1;
2601: tmp[row++] = sum2;
2602: tmp[row++] = sum3;
2603: tmp[row++] = sum4;
2604: tmp[row++] = sum5;
2605: break;
2606: default:
2607: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2608: }
2609: }
2610: /* backward solve the upper triangular */
2611: for (i = node_max - 1; i >= 0; i--) {
2612: row = ns[i + 1] - 1;
2613: nsz = ns[i + 1] - ns[i];
2614: aii = ad[row + 1] + 1;
2615: v1 = aa + aii;
2616: vi = aj + aii;
2617: nz = ad[row] - ad[row + 1] - 1;
2619: if (i > 0) {
2620: /* Prefetch the indices for the next block */
2621: PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2622: /* Prefetch the data for the next block */
2623: PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2624: }
2626: switch (nsz) { /* Each loop in 'case' is unrolled */
2627: case 1:
2628: sum1 = tmp[row];
2630: for (j = 0; j < nz - 1; j += 2) {
2631: i0 = vi[j];
2632: i1 = vi[j + 1];
2633: tmp0 = tmps[i0];
2634: tmp1 = tmps[i1];
2635: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2636: }
2637: if (j == nz - 1) {
2638: tmp0 = tmps[vi[j]];
2639: sum1 -= v1[j] * tmp0;
2640: }
2641: x[c[row]] = tmp[row] = sum1 * v1[nz];
2642: row--;
2643: break;
2644: case 2:
2645: sum1 = tmp[row];
2646: sum2 = tmp[row - 1];
2647: v2 = aa + ad[row] + 1;
2648: for (j = 0; j < nz - 1; j += 2) {
2649: i0 = vi[j];
2650: i1 = vi[j + 1];
2651: tmp0 = tmps[i0];
2652: tmp1 = tmps[i1];
2653: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2654: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2655: }
2656: if (j == nz - 1) {
2657: tmp0 = tmps[vi[j]];
2658: sum1 -= v1[j] * tmp0;
2659: sum2 -= v2[j + 1] * tmp0;
2660: }
2662: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2663: row--;
2664: sum2 -= v2[0] * tmp0;
2665: x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2666: row--;
2667: break;
2668: case 3:
2669: sum1 = tmp[row];
2670: sum2 = tmp[row - 1];
2671: sum3 = tmp[row - 2];
2672: v2 = aa + ad[row] + 1;
2673: v3 = aa + ad[row - 1] + 1;
2674: for (j = 0; j < nz - 1; j += 2) {
2675: i0 = vi[j];
2676: i1 = vi[j + 1];
2677: tmp0 = tmps[i0];
2678: tmp1 = tmps[i1];
2679: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2680: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2681: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2682: }
2683: if (j == nz - 1) {
2684: tmp0 = tmps[vi[j]];
2685: sum1 -= v1[j] * tmp0;
2686: sum2 -= v2[j + 1] * tmp0;
2687: sum3 -= v3[j + 2] * tmp0;
2688: }
2689: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2690: row--;
2691: sum2 -= v2[0] * tmp0;
2692: sum3 -= v3[1] * tmp0;
2693: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2694: row--;
2695: sum3 -= v3[0] * tmp0;
2696: x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2697: row--;
2699: break;
2700: case 4:
2701: sum1 = tmp[row];
2702: sum2 = tmp[row - 1];
2703: sum3 = tmp[row - 2];
2704: sum4 = tmp[row - 3];
2705: v2 = aa + ad[row] + 1;
2706: v3 = aa + ad[row - 1] + 1;
2707: v4 = aa + ad[row - 2] + 1;
2709: for (j = 0; j < nz - 1; j += 2) {
2710: i0 = vi[j];
2711: i1 = vi[j + 1];
2712: tmp0 = tmps[i0];
2713: tmp1 = tmps[i1];
2714: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2715: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2716: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2717: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2718: }
2719: if (j == nz - 1) {
2720: tmp0 = tmps[vi[j]];
2721: sum1 -= v1[j] * tmp0;
2722: sum2 -= v2[j + 1] * tmp0;
2723: sum3 -= v3[j + 2] * tmp0;
2724: sum4 -= v4[j + 3] * tmp0;
2725: }
2727: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2728: row--;
2729: sum2 -= v2[0] * tmp0;
2730: sum3 -= v3[1] * tmp0;
2731: sum4 -= v4[2] * tmp0;
2732: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2733: row--;
2734: sum3 -= v3[0] * tmp0;
2735: sum4 -= v4[1] * tmp0;
2736: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2737: row--;
2738: sum4 -= v4[0] * tmp0;
2739: x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2740: row--;
2741: break;
2742: case 5:
2743: sum1 = tmp[row];
2744: sum2 = tmp[row - 1];
2745: sum3 = tmp[row - 2];
2746: sum4 = tmp[row - 3];
2747: sum5 = tmp[row - 4];
2748: v2 = aa + ad[row] + 1;
2749: v3 = aa + ad[row - 1] + 1;
2750: v4 = aa + ad[row - 2] + 1;
2751: v5 = aa + ad[row - 3] + 1;
2752: for (j = 0; j < nz - 1; j += 2) {
2753: i0 = vi[j];
2754: i1 = vi[j + 1];
2755: tmp0 = tmps[i0];
2756: tmp1 = tmps[i1];
2757: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2758: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2759: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2760: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2761: sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2762: }
2763: if (j == nz - 1) {
2764: tmp0 = tmps[vi[j]];
2765: sum1 -= v1[j] * tmp0;
2766: sum2 -= v2[j + 1] * tmp0;
2767: sum3 -= v3[j + 2] * tmp0;
2768: sum4 -= v4[j + 3] * tmp0;
2769: sum5 -= v5[j + 4] * tmp0;
2770: }
2772: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2773: row--;
2774: sum2 -= v2[0] * tmp0;
2775: sum3 -= v3[1] * tmp0;
2776: sum4 -= v4[2] * tmp0;
2777: sum5 -= v5[3] * tmp0;
2778: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2779: row--;
2780: sum3 -= v3[0] * tmp0;
2781: sum4 -= v4[1] * tmp0;
2782: sum5 -= v5[2] * tmp0;
2783: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2784: row--;
2785: sum4 -= v4[0] * tmp0;
2786: sum5 -= v5[1] * tmp0;
2787: tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2788: row--;
2789: sum5 -= v5[0] * tmp0;
2790: x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2791: row--;
2792: break;
2793: default:
2794: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2795: }
2796: }
2797: PetscCall(ISRestoreIndices(isrow, &rout));
2798: PetscCall(ISRestoreIndices(iscol, &cout));
2799: PetscCall(VecRestoreArrayRead(bb, &b));
2800: PetscCall(VecRestoreArrayWrite(xx, &x));
2801: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2802: PetscFunctionReturn(PETSC_SUCCESS);
2803: }
2805: /*
2806: Makes a longer coloring[] array and calls the usual code with that
2807: */
2808: static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2809: {
2810: Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data;
2811: PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2812: PetscInt *colorused, i;
2813: ISColoringValue *newcolor;
2815: PetscFunctionBegin;
2816: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2817: PetscCall(PetscMalloc1(n + 1, &newcolor));
2818: /* loop over inodes, marking a color for each column*/
2819: row = 0;
2820: for (i = 0; i < m; i++) {
2821: for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2822: }
2824: /* eliminate unneeded colors */
2825: PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2826: for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2828: for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2829: ncolors = colorused[5 * ncolors - 1];
2830: for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2831: PetscCall(PetscFree(colorused));
2832: PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2833: PetscCall(PetscFree(coloring));
2834: PetscFunctionReturn(PETSC_SUCCESS);
2835: }
2837: #include <petsc/private/kernels/blockinvert.h>
2839: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2840: {
2841: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2842: PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2843: MatScalar *ibdiag, *bdiag, work[25], *t;
2844: PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2845: const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2846: const PetscScalar *xb, *b;
2847: PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2848: PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz;
2849: PetscInt sz, k, ipvt[5];
2850: PetscBool allowzeropivot, zeropivotdetected;
2851: const PetscInt *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i;
2853: PetscFunctionBegin;
2854: allowzeropivot = PetscNot(A->erroriffailure);
2855: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2856: PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2857: PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2859: if (!a->inode.ibdiagvalid) {
2860: if (!a->inode.ibdiag) {
2861: /* calculate space needed for diagonal blocks */
2862: for (i = 0; i < m; i++) {
2863: nodesz = sizes[i + 1] - sizes[i];
2864: cnt += nodesz * nodesz;
2865: }
2866: a->inode.bdiagsize = cnt;
2868: PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2869: }
2871: /* copy over the diagonal blocks and invert them */
2872: ibdiag = a->inode.ibdiag;
2873: bdiag = a->inode.bdiag;
2874: cnt = 0;
2875: for (i = 0, row = 0; i < m; i++) {
2876: nodesz = sizes[i + 1] - sizes[i];
2877: for (j = 0; j < nodesz; j++) {
2878: for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2879: }
2880: PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
2882: switch (nodesz) {
2883: case 1:
2884: /* Create matrix data structure */
2885: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2886: if (allowzeropivot) {
2887: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2888: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2889: A->factorerror_zeropivot_row = row;
2890: PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2891: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2892: }
2893: ibdiag[cnt] = 1.0 / ibdiag[cnt];
2894: break;
2895: case 2:
2896: PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2897: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2898: break;
2899: case 3:
2900: PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2901: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2902: break;
2903: case 4:
2904: PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2905: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2906: break;
2907: case 5:
2908: PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2909: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2910: break;
2911: default:
2912: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2913: }
2914: cnt += nodesz * nodesz;
2915: row += nodesz;
2916: }
2917: a->inode.ibdiagvalid = PETSC_TRUE;
2918: }
2919: ibdiag = a->inode.ibdiag;
2920: bdiag = a->inode.bdiag;
2921: t = a->inode.ssor_work;
2923: PetscCall(VecGetArray(xx, &x));
2924: PetscCall(VecGetArrayRead(bb, &b));
2925: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2926: if (flag & SOR_ZERO_INITIAL_GUESS) {
2927: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2928: for (i = 0, row = 0; i < m; i++) {
2929: sz = diag[row] - ii[row];
2930: v1 = a->a + ii[row];
2931: idx = a->j + ii[row];
2933: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2934: nodesz = sizes[i + 1] - sizes[i];
2935: switch (nodesz) {
2936: case 1:
2938: sum1 = b[row];
2939: for (n = 0; n < sz - 1; n += 2) {
2940: i1 = idx[0];
2941: i2 = idx[1];
2942: idx += 2;
2943: tmp0 = x[i1];
2944: tmp1 = x[i2];
2945: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2946: v1 += 2;
2947: }
2949: if (n == sz - 1) {
2950: tmp0 = x[*idx];
2951: sum1 -= *v1 * tmp0;
2952: }
2953: t[row] = sum1;
2954: x[row++] = sum1 * (*ibdiag++);
2955: break;
2956: case 2:
2957: v2 = a->a + ii[row + 1];
2958: sum1 = b[row];
2959: sum2 = b[row + 1];
2960: for (n = 0; n < sz - 1; n += 2) {
2961: i1 = idx[0];
2962: i2 = idx[1];
2963: idx += 2;
2964: tmp0 = x[i1];
2965: tmp1 = x[i2];
2966: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2967: v1 += 2;
2968: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2969: v2 += 2;
2970: }
2972: if (n == sz - 1) {
2973: tmp0 = x[*idx];
2974: sum1 -= v1[0] * tmp0;
2975: sum2 -= v2[0] * tmp0;
2976: }
2977: t[row] = sum1;
2978: t[row + 1] = sum2;
2979: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2980: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2981: ibdiag += 4;
2982: break;
2983: case 3:
2984: v2 = a->a + ii[row + 1];
2985: v3 = a->a + ii[row + 2];
2986: sum1 = b[row];
2987: sum2 = b[row + 1];
2988: sum3 = b[row + 2];
2989: for (n = 0; n < sz - 1; n += 2) {
2990: i1 = idx[0];
2991: i2 = idx[1];
2992: idx += 2;
2993: tmp0 = x[i1];
2994: tmp1 = x[i2];
2995: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2996: v1 += 2;
2997: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2998: v2 += 2;
2999: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3000: v3 += 2;
3001: }
3003: if (n == sz - 1) {
3004: tmp0 = x[*idx];
3005: sum1 -= v1[0] * tmp0;
3006: sum2 -= v2[0] * tmp0;
3007: sum3 -= v3[0] * tmp0;
3008: }
3009: t[row] = sum1;
3010: t[row + 1] = sum2;
3011: t[row + 2] = sum3;
3012: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3013: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3014: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3015: ibdiag += 9;
3016: break;
3017: case 4:
3018: v2 = a->a + ii[row + 1];
3019: v3 = a->a + ii[row + 2];
3020: v4 = a->a + ii[row + 3];
3021: sum1 = b[row];
3022: sum2 = b[row + 1];
3023: sum3 = b[row + 2];
3024: sum4 = b[row + 3];
3025: for (n = 0; n < sz - 1; n += 2) {
3026: i1 = idx[0];
3027: i2 = idx[1];
3028: idx += 2;
3029: tmp0 = x[i1];
3030: tmp1 = x[i2];
3031: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3032: v1 += 2;
3033: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3034: v2 += 2;
3035: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3036: v3 += 2;
3037: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3038: v4 += 2;
3039: }
3041: if (n == sz - 1) {
3042: tmp0 = x[*idx];
3043: sum1 -= v1[0] * tmp0;
3044: sum2 -= v2[0] * tmp0;
3045: sum3 -= v3[0] * tmp0;
3046: sum4 -= v4[0] * tmp0;
3047: }
3048: t[row] = sum1;
3049: t[row + 1] = sum2;
3050: t[row + 2] = sum3;
3051: t[row + 3] = sum4;
3052: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3053: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3054: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3055: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3056: ibdiag += 16;
3057: break;
3058: case 5:
3059: v2 = a->a + ii[row + 1];
3060: v3 = a->a + ii[row + 2];
3061: v4 = a->a + ii[row + 3];
3062: v5 = a->a + ii[row + 4];
3063: sum1 = b[row];
3064: sum2 = b[row + 1];
3065: sum3 = b[row + 2];
3066: sum4 = b[row + 3];
3067: sum5 = b[row + 4];
3068: for (n = 0; n < sz - 1; n += 2) {
3069: i1 = idx[0];
3070: i2 = idx[1];
3071: idx += 2;
3072: tmp0 = x[i1];
3073: tmp1 = x[i2];
3074: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3075: v1 += 2;
3076: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3077: v2 += 2;
3078: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3079: v3 += 2;
3080: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3081: v4 += 2;
3082: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3083: v5 += 2;
3084: }
3086: if (n == sz - 1) {
3087: tmp0 = x[*idx];
3088: sum1 -= v1[0] * tmp0;
3089: sum2 -= v2[0] * tmp0;
3090: sum3 -= v3[0] * tmp0;
3091: sum4 -= v4[0] * tmp0;
3092: sum5 -= v5[0] * tmp0;
3093: }
3094: t[row] = sum1;
3095: t[row + 1] = sum2;
3096: t[row + 2] = sum3;
3097: t[row + 3] = sum4;
3098: t[row + 4] = sum5;
3099: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3100: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3101: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3102: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3103: x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3104: ibdiag += 25;
3105: break;
3106: default:
3107: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3108: }
3109: }
3111: xb = t;
3112: PetscCall(PetscLogFlops(a->nz));
3113: } else xb = b;
3114: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3115: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3116: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3117: nodesz = sizes[i + 1] - sizes[i];
3118: ibdiag -= nodesz * nodesz;
3119: sz = ii[row + 1] - diag[row] - 1;
3120: v1 = a->a + diag[row] + 1;
3121: idx = a->j + diag[row] + 1;
3123: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3124: switch (nodesz) {
3125: case 1:
3127: sum1 = xb[row];
3128: for (n = 0; n < sz - 1; n += 2) {
3129: i1 = idx[0];
3130: i2 = idx[1];
3131: idx += 2;
3132: tmp0 = x[i1];
3133: tmp1 = x[i2];
3134: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3135: v1 += 2;
3136: }
3138: if (n == sz - 1) {
3139: tmp0 = x[*idx];
3140: sum1 -= *v1 * tmp0;
3141: }
3142: x[row--] = sum1 * (*ibdiag);
3143: break;
3145: case 2:
3147: sum1 = xb[row];
3148: sum2 = xb[row - 1];
3149: /* note that sum1 is associated with the second of the two rows */
3150: v2 = a->a + diag[row - 1] + 2;
3151: for (n = 0; n < sz - 1; n += 2) {
3152: i1 = idx[0];
3153: i2 = idx[1];
3154: idx += 2;
3155: tmp0 = x[i1];
3156: tmp1 = x[i2];
3157: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3158: v1 += 2;
3159: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3160: v2 += 2;
3161: }
3163: if (n == sz - 1) {
3164: tmp0 = x[*idx];
3165: sum1 -= *v1 * tmp0;
3166: sum2 -= *v2 * tmp0;
3167: }
3168: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3169: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3170: break;
3171: case 3:
3173: sum1 = xb[row];
3174: sum2 = xb[row - 1];
3175: sum3 = xb[row - 2];
3176: v2 = a->a + diag[row - 1] + 2;
3177: v3 = a->a + diag[row - 2] + 3;
3178: for (n = 0; n < sz - 1; n += 2) {
3179: i1 = idx[0];
3180: i2 = idx[1];
3181: idx += 2;
3182: tmp0 = x[i1];
3183: tmp1 = x[i2];
3184: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3185: v1 += 2;
3186: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3187: v2 += 2;
3188: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3189: v3 += 2;
3190: }
3192: if (n == sz - 1) {
3193: tmp0 = x[*idx];
3194: sum1 -= *v1 * tmp0;
3195: sum2 -= *v2 * tmp0;
3196: sum3 -= *v3 * tmp0;
3197: }
3198: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3199: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3200: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3201: break;
3202: case 4:
3204: sum1 = xb[row];
3205: sum2 = xb[row - 1];
3206: sum3 = xb[row - 2];
3207: sum4 = xb[row - 3];
3208: v2 = a->a + diag[row - 1] + 2;
3209: v3 = a->a + diag[row - 2] + 3;
3210: v4 = a->a + diag[row - 3] + 4;
3211: for (n = 0; n < sz - 1; n += 2) {
3212: i1 = idx[0];
3213: i2 = idx[1];
3214: idx += 2;
3215: tmp0 = x[i1];
3216: tmp1 = x[i2];
3217: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3218: v1 += 2;
3219: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3220: v2 += 2;
3221: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3222: v3 += 2;
3223: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3224: v4 += 2;
3225: }
3227: if (n == sz - 1) {
3228: tmp0 = x[*idx];
3229: sum1 -= *v1 * tmp0;
3230: sum2 -= *v2 * tmp0;
3231: sum3 -= *v3 * tmp0;
3232: sum4 -= *v4 * tmp0;
3233: }
3234: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3235: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3236: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3237: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3238: break;
3239: case 5:
3241: sum1 = xb[row];
3242: sum2 = xb[row - 1];
3243: sum3 = xb[row - 2];
3244: sum4 = xb[row - 3];
3245: sum5 = xb[row - 4];
3246: v2 = a->a + diag[row - 1] + 2;
3247: v3 = a->a + diag[row - 2] + 3;
3248: v4 = a->a + diag[row - 3] + 4;
3249: v5 = a->a + diag[row - 4] + 5;
3250: for (n = 0; n < sz - 1; n += 2) {
3251: i1 = idx[0];
3252: i2 = idx[1];
3253: idx += 2;
3254: tmp0 = x[i1];
3255: tmp1 = x[i2];
3256: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3257: v1 += 2;
3258: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3259: v2 += 2;
3260: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3261: v3 += 2;
3262: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3263: v4 += 2;
3264: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3265: v5 += 2;
3266: }
3268: if (n == sz - 1) {
3269: tmp0 = x[*idx];
3270: sum1 -= *v1 * tmp0;
3271: sum2 -= *v2 * tmp0;
3272: sum3 -= *v3 * tmp0;
3273: sum4 -= *v4 * tmp0;
3274: sum5 -= *v5 * tmp0;
3275: }
3276: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3277: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3278: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3279: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3280: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3281: break;
3282: default:
3283: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3284: }
3285: }
3287: PetscCall(PetscLogFlops(a->nz));
3288: }
3289: its--;
3290: }
3291: while (its--) {
3292: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3293: for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
3294: nodesz = sizes[i + 1] - sizes[i];
3295: sz = diag[row] - ii[row];
3296: v1 = a->a + ii[row];
3297: idx = a->j + ii[row];
3298: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3299: switch (nodesz) {
3300: case 1:
3301: sum1 = b[row];
3302: for (n = 0; n < sz - 1; n += 2) {
3303: i1 = idx[0];
3304: i2 = idx[1];
3305: idx += 2;
3306: tmp0 = x[i1];
3307: tmp1 = x[i2];
3308: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3309: v1 += 2;
3310: }
3311: if (n == sz - 1) {
3312: tmp0 = x[*idx++];
3313: sum1 -= *v1 * tmp0;
3314: v1++;
3315: }
3316: t[row] = sum1;
3317: sz = ii[row + 1] - diag[row] - 1;
3318: idx = a->j + diag[row] + 1;
3319: v1 += 1;
3320: for (n = 0; n < sz - 1; n += 2) {
3321: i1 = idx[0];
3322: i2 = idx[1];
3323: idx += 2;
3324: tmp0 = x[i1];
3325: tmp1 = x[i2];
3326: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3327: v1 += 2;
3328: }
3329: if (n == sz - 1) {
3330: tmp0 = x[*idx++];
3331: sum1 -= *v1 * tmp0;
3332: }
3333: /* in MatSOR_SeqAIJ this line would be
3334: *
3335: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3336: *
3337: * but omega == 1, so this becomes
3338: *
3339: * x[row] = sum1*(*ibdiag++);
3340: *
3341: */
3342: x[row] = sum1 * (*ibdiag);
3343: break;
3344: case 2:
3345: v2 = a->a + ii[row + 1];
3346: sum1 = b[row];
3347: sum2 = b[row + 1];
3348: for (n = 0; n < sz - 1; n += 2) {
3349: i1 = idx[0];
3350: i2 = idx[1];
3351: idx += 2;
3352: tmp0 = x[i1];
3353: tmp1 = x[i2];
3354: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3355: v1 += 2;
3356: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3357: v2 += 2;
3358: }
3359: if (n == sz - 1) {
3360: tmp0 = x[*idx++];
3361: sum1 -= v1[0] * tmp0;
3362: sum2 -= v2[0] * tmp0;
3363: v1++;
3364: v2++;
3365: }
3366: t[row] = sum1;
3367: t[row + 1] = sum2;
3368: sz = ii[row + 1] - diag[row] - 2;
3369: idx = a->j + diag[row] + 2;
3370: v1 += 2;
3371: v2 += 2;
3372: for (n = 0; n < sz - 1; n += 2) {
3373: i1 = idx[0];
3374: i2 = idx[1];
3375: idx += 2;
3376: tmp0 = x[i1];
3377: tmp1 = x[i2];
3378: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3379: v1 += 2;
3380: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3381: v2 += 2;
3382: }
3383: if (n == sz - 1) {
3384: tmp0 = x[*idx];
3385: sum1 -= v1[0] * tmp0;
3386: sum2 -= v2[0] * tmp0;
3387: }
3388: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3389: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3390: break;
3391: case 3:
3392: v2 = a->a + ii[row + 1];
3393: v3 = a->a + ii[row + 2];
3394: sum1 = b[row];
3395: sum2 = b[row + 1];
3396: sum3 = b[row + 2];
3397: for (n = 0; n < sz - 1; n += 2) {
3398: i1 = idx[0];
3399: i2 = idx[1];
3400: idx += 2;
3401: tmp0 = x[i1];
3402: tmp1 = x[i2];
3403: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3404: v1 += 2;
3405: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3406: v2 += 2;
3407: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3408: v3 += 2;
3409: }
3410: if (n == sz - 1) {
3411: tmp0 = x[*idx++];
3412: sum1 -= v1[0] * tmp0;
3413: sum2 -= v2[0] * tmp0;
3414: sum3 -= v3[0] * tmp0;
3415: v1++;
3416: v2++;
3417: v3++;
3418: }
3419: t[row] = sum1;
3420: t[row + 1] = sum2;
3421: t[row + 2] = sum3;
3422: sz = ii[row + 1] - diag[row] - 3;
3423: idx = a->j + diag[row] + 3;
3424: v1 += 3;
3425: v2 += 3;
3426: v3 += 3;
3427: for (n = 0; n < sz - 1; n += 2) {
3428: i1 = idx[0];
3429: i2 = idx[1];
3430: idx += 2;
3431: tmp0 = x[i1];
3432: tmp1 = x[i2];
3433: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3434: v1 += 2;
3435: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3436: v2 += 2;
3437: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3438: v3 += 2;
3439: }
3440: if (n == sz - 1) {
3441: tmp0 = x[*idx];
3442: sum1 -= v1[0] * tmp0;
3443: sum2 -= v2[0] * tmp0;
3444: sum3 -= v3[0] * tmp0;
3445: }
3446: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3447: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3448: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3449: break;
3450: case 4:
3451: v2 = a->a + ii[row + 1];
3452: v3 = a->a + ii[row + 2];
3453: v4 = a->a + ii[row + 3];
3454: sum1 = b[row];
3455: sum2 = b[row + 1];
3456: sum3 = b[row + 2];
3457: sum4 = b[row + 3];
3458: for (n = 0; n < sz - 1; n += 2) {
3459: i1 = idx[0];
3460: i2 = idx[1];
3461: idx += 2;
3462: tmp0 = x[i1];
3463: tmp1 = x[i2];
3464: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3465: v1 += 2;
3466: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3467: v2 += 2;
3468: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3469: v3 += 2;
3470: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3471: v4 += 2;
3472: }
3473: if (n == sz - 1) {
3474: tmp0 = x[*idx++];
3475: sum1 -= v1[0] * tmp0;
3476: sum2 -= v2[0] * tmp0;
3477: sum3 -= v3[0] * tmp0;
3478: sum4 -= v4[0] * tmp0;
3479: v1++;
3480: v2++;
3481: v3++;
3482: v4++;
3483: }
3484: t[row] = sum1;
3485: t[row + 1] = sum2;
3486: t[row + 2] = sum3;
3487: t[row + 3] = sum4;
3488: sz = ii[row + 1] - diag[row] - 4;
3489: idx = a->j + diag[row] + 4;
3490: v1 += 4;
3491: v2 += 4;
3492: v3 += 4;
3493: v4 += 4;
3494: for (n = 0; n < sz - 1; n += 2) {
3495: i1 = idx[0];
3496: i2 = idx[1];
3497: idx += 2;
3498: tmp0 = x[i1];
3499: tmp1 = x[i2];
3500: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3501: v1 += 2;
3502: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3503: v2 += 2;
3504: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3505: v3 += 2;
3506: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3507: v4 += 2;
3508: }
3509: if (n == sz - 1) {
3510: tmp0 = x[*idx];
3511: sum1 -= v1[0] * tmp0;
3512: sum2 -= v2[0] * tmp0;
3513: sum3 -= v3[0] * tmp0;
3514: sum4 -= v4[0] * tmp0;
3515: }
3516: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3517: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3518: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3519: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3520: break;
3521: case 5:
3522: v2 = a->a + ii[row + 1];
3523: v3 = a->a + ii[row + 2];
3524: v4 = a->a + ii[row + 3];
3525: v5 = a->a + ii[row + 4];
3526: sum1 = b[row];
3527: sum2 = b[row + 1];
3528: sum3 = b[row + 2];
3529: sum4 = b[row + 3];
3530: sum5 = b[row + 4];
3531: for (n = 0; n < sz - 1; n += 2) {
3532: i1 = idx[0];
3533: i2 = idx[1];
3534: idx += 2;
3535: tmp0 = x[i1];
3536: tmp1 = x[i2];
3537: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3538: v1 += 2;
3539: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3540: v2 += 2;
3541: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3542: v3 += 2;
3543: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3544: v4 += 2;
3545: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3546: v5 += 2;
3547: }
3548: if (n == sz - 1) {
3549: tmp0 = x[*idx++];
3550: sum1 -= v1[0] * tmp0;
3551: sum2 -= v2[0] * tmp0;
3552: sum3 -= v3[0] * tmp0;
3553: sum4 -= v4[0] * tmp0;
3554: sum5 -= v5[0] * tmp0;
3555: v1++;
3556: v2++;
3557: v3++;
3558: v4++;
3559: v5++;
3560: }
3561: t[row] = sum1;
3562: t[row + 1] = sum2;
3563: t[row + 2] = sum3;
3564: t[row + 3] = sum4;
3565: t[row + 4] = sum5;
3566: sz = ii[row + 1] - diag[row] - 5;
3567: idx = a->j + diag[row] + 5;
3568: v1 += 5;
3569: v2 += 5;
3570: v3 += 5;
3571: v4 += 5;
3572: v5 += 5;
3573: for (n = 0; n < sz - 1; n += 2) {
3574: i1 = idx[0];
3575: i2 = idx[1];
3576: idx += 2;
3577: tmp0 = x[i1];
3578: tmp1 = x[i2];
3579: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3580: v1 += 2;
3581: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3582: v2 += 2;
3583: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3584: v3 += 2;
3585: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3586: v4 += 2;
3587: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3588: v5 += 2;
3589: }
3590: if (n == sz - 1) {
3591: tmp0 = x[*idx];
3592: sum1 -= v1[0] * tmp0;
3593: sum2 -= v2[0] * tmp0;
3594: sum3 -= v3[0] * tmp0;
3595: sum4 -= v4[0] * tmp0;
3596: sum5 -= v5[0] * tmp0;
3597: }
3598: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3599: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3600: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3601: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3602: x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3603: break;
3604: default:
3605: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3606: }
3607: }
3608: xb = t;
3609: PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3610: } else xb = b;
3612: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3613: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3614: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3615: nodesz = sizes[i + 1] - sizes[i];
3616: ibdiag -= nodesz * nodesz;
3618: /* set RHS */
3619: if (xb == b) {
3620: /* whole (old way) */
3621: sz = ii[row + 1] - ii[row];
3622: idx = a->j + ii[row];
3623: switch (nodesz) {
3624: case 5:
3625: v5 = a->a + ii[row - 4]; /* fall through */
3626: case 4:
3627: v4 = a->a + ii[row - 3]; /* fall through */
3628: case 3:
3629: v3 = a->a + ii[row - 2]; /* fall through */
3630: case 2:
3631: v2 = a->a + ii[row - 1]; /* fall through */
3632: case 1:
3633: v1 = a->a + ii[row];
3634: break;
3635: default:
3636: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3637: }
3638: } else {
3639: /* upper, no diag */
3640: sz = ii[row + 1] - diag[row] - 1;
3641: idx = a->j + diag[row] + 1;
3642: switch (nodesz) {
3643: case 5:
3644: v5 = a->a + diag[row - 4] + 5; /* fall through */
3645: case 4:
3646: v4 = a->a + diag[row - 3] + 4; /* fall through */
3647: case 3:
3648: v3 = a->a + diag[row - 2] + 3; /* fall through */
3649: case 2:
3650: v2 = a->a + diag[row - 1] + 2; /* fall through */
3651: case 1:
3652: v1 = a->a + diag[row] + 1;
3653: }
3654: }
3655: /* set sum */
3656: switch (nodesz) {
3657: case 5:
3658: sum5 = xb[row - 4]; /* fall through */
3659: case 4:
3660: sum4 = xb[row - 3]; /* fall through */
3661: case 3:
3662: sum3 = xb[row - 2]; /* fall through */
3663: case 2:
3664: sum2 = xb[row - 1]; /* fall through */
3665: case 1:
3666: /* note that sum1 is associated with the last row */
3667: sum1 = xb[row];
3668: }
3669: /* do sums */
3670: for (n = 0; n < sz - 1; n += 2) {
3671: i1 = idx[0];
3672: i2 = idx[1];
3673: idx += 2;
3674: tmp0 = x[i1];
3675: tmp1 = x[i2];
3676: switch (nodesz) {
3677: case 5:
3678: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3679: v5 += 2; /* fall through */
3680: case 4:
3681: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3682: v4 += 2; /* fall through */
3683: case 3:
3684: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3685: v3 += 2; /* fall through */
3686: case 2:
3687: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3688: v2 += 2; /* fall through */
3689: case 1:
3690: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3691: v1 += 2;
3692: }
3693: }
3694: /* ragged edge */
3695: if (n == sz - 1) {
3696: tmp0 = x[*idx];
3697: switch (nodesz) {
3698: case 5:
3699: sum5 -= *v5 * tmp0; /* fall through */
3700: case 4:
3701: sum4 -= *v4 * tmp0; /* fall through */
3702: case 3:
3703: sum3 -= *v3 * tmp0; /* fall through */
3704: case 2:
3705: sum2 -= *v2 * tmp0; /* fall through */
3706: case 1:
3707: sum1 -= *v1 * tmp0;
3708: }
3709: }
3710: /* update */
3711: if (xb == b) {
3712: /* whole (old way) w/ diag */
3713: switch (nodesz) {
3714: case 5:
3715: x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3716: x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3717: x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3718: x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3719: x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3720: break;
3721: case 4:
3722: x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3723: x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3724: x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3725: x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3726: break;
3727: case 3:
3728: x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3729: x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3730: x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3731: break;
3732: case 2:
3733: x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3734: x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3735: break;
3736: case 1:
3737: x[row--] += sum1 * (*ibdiag);
3738: break;
3739: }
3740: } else {
3741: /* no diag so set = */
3742: switch (nodesz) {
3743: case 5:
3744: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3745: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3746: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3747: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3748: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3749: break;
3750: case 4:
3751: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3752: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3753: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3754: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3755: break;
3756: case 3:
3757: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3758: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3759: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3760: break;
3761: case 2:
3762: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3763: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3764: break;
3765: case 1:
3766: x[row--] = sum1 * (*ibdiag);
3767: break;
3768: }
3769: }
3770: }
3771: if (xb == b) {
3772: PetscCall(PetscLogFlops(2.0 * a->nz));
3773: } else {
3774: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3775: }
3776: }
3777: }
3778: if (flag & SOR_EISENSTAT) {
3779: /*
3780: Apply (U + D)^-1 where D is now the block diagonal
3781: */
3782: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3783: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3784: nodesz = sizes[i + 1] - sizes[i];
3785: ibdiag -= nodesz * nodesz;
3786: sz = ii[row + 1] - diag[row] - 1;
3787: v1 = a->a + diag[row] + 1;
3788: idx = a->j + diag[row] + 1;
3789: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3790: switch (nodesz) {
3791: case 1:
3793: sum1 = b[row];
3794: for (n = 0; n < sz - 1; n += 2) {
3795: i1 = idx[0];
3796: i2 = idx[1];
3797: idx += 2;
3798: tmp0 = x[i1];
3799: tmp1 = x[i2];
3800: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3801: v1 += 2;
3802: }
3804: if (n == sz - 1) {
3805: tmp0 = x[*idx];
3806: sum1 -= *v1 * tmp0;
3807: }
3808: x[row] = sum1 * (*ibdiag);
3809: row--;
3810: break;
3812: case 2:
3814: sum1 = b[row];
3815: sum2 = b[row - 1];
3816: /* note that sum1 is associated with the second of the two rows */
3817: v2 = a->a + diag[row - 1] + 2;
3818: for (n = 0; n < sz - 1; n += 2) {
3819: i1 = idx[0];
3820: i2 = idx[1];
3821: idx += 2;
3822: tmp0 = x[i1];
3823: tmp1 = x[i2];
3824: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3825: v1 += 2;
3826: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3827: v2 += 2;
3828: }
3830: if (n == sz - 1) {
3831: tmp0 = x[*idx];
3832: sum1 -= *v1 * tmp0;
3833: sum2 -= *v2 * tmp0;
3834: }
3835: x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3836: x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3837: row -= 2;
3838: break;
3839: case 3:
3841: sum1 = b[row];
3842: sum2 = b[row - 1];
3843: sum3 = b[row - 2];
3844: v2 = a->a + diag[row - 1] + 2;
3845: v3 = a->a + diag[row - 2] + 3;
3846: for (n = 0; n < sz - 1; n += 2) {
3847: i1 = idx[0];
3848: i2 = idx[1];
3849: idx += 2;
3850: tmp0 = x[i1];
3851: tmp1 = x[i2];
3852: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3853: v1 += 2;
3854: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3855: v2 += 2;
3856: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3857: v3 += 2;
3858: }
3860: if (n == sz - 1) {
3861: tmp0 = x[*idx];
3862: sum1 -= *v1 * tmp0;
3863: sum2 -= *v2 * tmp0;
3864: sum3 -= *v3 * tmp0;
3865: }
3866: x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3867: x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3868: x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3869: row -= 3;
3870: break;
3871: case 4:
3873: sum1 = b[row];
3874: sum2 = b[row - 1];
3875: sum3 = b[row - 2];
3876: sum4 = b[row - 3];
3877: v2 = a->a + diag[row - 1] + 2;
3878: v3 = a->a + diag[row - 2] + 3;
3879: v4 = a->a + diag[row - 3] + 4;
3880: for (n = 0; n < sz - 1; n += 2) {
3881: i1 = idx[0];
3882: i2 = idx[1];
3883: idx += 2;
3884: tmp0 = x[i1];
3885: tmp1 = x[i2];
3886: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3887: v1 += 2;
3888: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3889: v2 += 2;
3890: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3891: v3 += 2;
3892: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3893: v4 += 2;
3894: }
3896: if (n == sz - 1) {
3897: tmp0 = x[*idx];
3898: sum1 -= *v1 * tmp0;
3899: sum2 -= *v2 * tmp0;
3900: sum3 -= *v3 * tmp0;
3901: sum4 -= *v4 * tmp0;
3902: }
3903: x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3904: x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3905: x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3906: x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3907: row -= 4;
3908: break;
3909: case 5:
3911: sum1 = b[row];
3912: sum2 = b[row - 1];
3913: sum3 = b[row - 2];
3914: sum4 = b[row - 3];
3915: sum5 = b[row - 4];
3916: v2 = a->a + diag[row - 1] + 2;
3917: v3 = a->a + diag[row - 2] + 3;
3918: v4 = a->a + diag[row - 3] + 4;
3919: v5 = a->a + diag[row - 4] + 5;
3920: for (n = 0; n < sz - 1; n += 2) {
3921: i1 = idx[0];
3922: i2 = idx[1];
3923: idx += 2;
3924: tmp0 = x[i1];
3925: tmp1 = x[i2];
3926: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3927: v1 += 2;
3928: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3929: v2 += 2;
3930: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3931: v3 += 2;
3932: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3933: v4 += 2;
3934: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3935: v5 += 2;
3936: }
3938: if (n == sz - 1) {
3939: tmp0 = x[*idx];
3940: sum1 -= *v1 * tmp0;
3941: sum2 -= *v2 * tmp0;
3942: sum3 -= *v3 * tmp0;
3943: sum4 -= *v4 * tmp0;
3944: sum5 -= *v5 * tmp0;
3945: }
3946: x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3947: x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3948: x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3949: x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3950: x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3951: row -= 5;
3952: break;
3953: default:
3954: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3955: }
3956: }
3957: PetscCall(PetscLogFlops(a->nz));
3959: /*
3960: t = b - D x where D is the block diagonal
3961: */
3962: cnt = 0;
3963: for (i = 0, row = 0; i < m; i++) {
3964: nodesz = sizes[i + 1] - sizes[i];
3965: switch (nodesz) {
3966: case 1:
3967: t[row] = b[row] - bdiag[cnt++] * x[row];
3968: row++;
3969: break;
3970: case 2:
3971: x1 = x[row];
3972: x2 = x[row + 1];
3973: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3974: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3975: t[row] = b[row] - tmp1;
3976: t[row + 1] = b[row + 1] - tmp2;
3977: row += 2;
3978: cnt += 4;
3979: break;
3980: case 3:
3981: x1 = x[row];
3982: x2 = x[row + 1];
3983: x3 = x[row + 2];
3984: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3985: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3986: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3987: t[row] = b[row] - tmp1;
3988: t[row + 1] = b[row + 1] - tmp2;
3989: t[row + 2] = b[row + 2] - tmp3;
3990: row += 3;
3991: cnt += 9;
3992: break;
3993: case 4:
3994: x1 = x[row];
3995: x2 = x[row + 1];
3996: x3 = x[row + 2];
3997: x4 = x[row + 3];
3998: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3999: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4000: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4001: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4002: t[row] = b[row] - tmp1;
4003: t[row + 1] = b[row + 1] - tmp2;
4004: t[row + 2] = b[row + 2] - tmp3;
4005: t[row + 3] = b[row + 3] - tmp4;
4006: row += 4;
4007: cnt += 16;
4008: break;
4009: case 5:
4010: x1 = x[row];
4011: x2 = x[row + 1];
4012: x3 = x[row + 2];
4013: x4 = x[row + 3];
4014: x5 = x[row + 4];
4015: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4016: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4017: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4018: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4019: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4020: t[row] = b[row] - tmp1;
4021: t[row + 1] = b[row + 1] - tmp2;
4022: t[row + 2] = b[row + 2] - tmp3;
4023: t[row + 3] = b[row + 3] - tmp4;
4024: t[row + 4] = b[row + 4] - tmp5;
4025: row += 5;
4026: cnt += 25;
4027: break;
4028: default:
4029: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4030: }
4031: }
4032: PetscCall(PetscLogFlops(m));
4034: /*
4035: Apply (L + D)^-1 where D is the block diagonal
4036: */
4037: for (i = 0, row = 0; i < m; i++) {
4038: nodesz = sizes[i + 1] - sizes[i];
4039: sz = diag[row] - ii[row];
4040: v1 = a->a + ii[row];
4041: idx = a->j + ii[row];
4042: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4043: switch (nodesz) {
4044: case 1:
4046: sum1 = t[row];
4047: for (n = 0; n < sz - 1; n += 2) {
4048: i1 = idx[0];
4049: i2 = idx[1];
4050: idx += 2;
4051: tmp0 = t[i1];
4052: tmp1 = t[i2];
4053: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4054: v1 += 2;
4055: }
4057: if (n == sz - 1) {
4058: tmp0 = t[*idx];
4059: sum1 -= *v1 * tmp0;
4060: }
4061: x[row] += t[row] = sum1 * (*ibdiag++);
4062: row++;
4063: break;
4064: case 2:
4065: v2 = a->a + ii[row + 1];
4066: sum1 = t[row];
4067: sum2 = t[row + 1];
4068: for (n = 0; n < sz - 1; n += 2) {
4069: i1 = idx[0];
4070: i2 = idx[1];
4071: idx += 2;
4072: tmp0 = t[i1];
4073: tmp1 = t[i2];
4074: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4075: v1 += 2;
4076: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4077: v2 += 2;
4078: }
4080: if (n == sz - 1) {
4081: tmp0 = t[*idx];
4082: sum1 -= v1[0] * tmp0;
4083: sum2 -= v2[0] * tmp0;
4084: }
4085: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4086: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4087: ibdiag += 4;
4088: row += 2;
4089: break;
4090: case 3:
4091: v2 = a->a + ii[row + 1];
4092: v3 = a->a + ii[row + 2];
4093: sum1 = t[row];
4094: sum2 = t[row + 1];
4095: sum3 = t[row + 2];
4096: for (n = 0; n < sz - 1; n += 2) {
4097: i1 = idx[0];
4098: i2 = idx[1];
4099: idx += 2;
4100: tmp0 = t[i1];
4101: tmp1 = t[i2];
4102: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4103: v1 += 2;
4104: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4105: v2 += 2;
4106: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4107: v3 += 2;
4108: }
4110: if (n == sz - 1) {
4111: tmp0 = t[*idx];
4112: sum1 -= v1[0] * tmp0;
4113: sum2 -= v2[0] * tmp0;
4114: sum3 -= v3[0] * tmp0;
4115: }
4116: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4117: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4118: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4119: ibdiag += 9;
4120: row += 3;
4121: break;
4122: case 4:
4123: v2 = a->a + ii[row + 1];
4124: v3 = a->a + ii[row + 2];
4125: v4 = a->a + ii[row + 3];
4126: sum1 = t[row];
4127: sum2 = t[row + 1];
4128: sum3 = t[row + 2];
4129: sum4 = t[row + 3];
4130: for (n = 0; n < sz - 1; n += 2) {
4131: i1 = idx[0];
4132: i2 = idx[1];
4133: idx += 2;
4134: tmp0 = t[i1];
4135: tmp1 = t[i2];
4136: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4137: v1 += 2;
4138: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4139: v2 += 2;
4140: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4141: v3 += 2;
4142: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4143: v4 += 2;
4144: }
4146: if (n == sz - 1) {
4147: tmp0 = t[*idx];
4148: sum1 -= v1[0] * tmp0;
4149: sum2 -= v2[0] * tmp0;
4150: sum3 -= v3[0] * tmp0;
4151: sum4 -= v4[0] * tmp0;
4152: }
4153: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4154: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4155: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4156: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4157: ibdiag += 16;
4158: row += 4;
4159: break;
4160: case 5:
4161: v2 = a->a + ii[row + 1];
4162: v3 = a->a + ii[row + 2];
4163: v4 = a->a + ii[row + 3];
4164: v5 = a->a + ii[row + 4];
4165: sum1 = t[row];
4166: sum2 = t[row + 1];
4167: sum3 = t[row + 2];
4168: sum4 = t[row + 3];
4169: sum5 = t[row + 4];
4170: for (n = 0; n < sz - 1; n += 2) {
4171: i1 = idx[0];
4172: i2 = idx[1];
4173: idx += 2;
4174: tmp0 = t[i1];
4175: tmp1 = t[i2];
4176: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4177: v1 += 2;
4178: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4179: v2 += 2;
4180: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4181: v3 += 2;
4182: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4183: v4 += 2;
4184: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4185: v5 += 2;
4186: }
4188: if (n == sz - 1) {
4189: tmp0 = t[*idx];
4190: sum1 -= v1[0] * tmp0;
4191: sum2 -= v2[0] * tmp0;
4192: sum3 -= v3[0] * tmp0;
4193: sum4 -= v4[0] * tmp0;
4194: sum5 -= v5[0] * tmp0;
4195: }
4196: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4197: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4198: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4199: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4200: x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4201: ibdiag += 25;
4202: row += 5;
4203: break;
4204: default:
4205: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4206: }
4207: }
4208: PetscCall(PetscLogFlops(a->nz));
4209: }
4210: PetscCall(VecRestoreArray(xx, &x));
4211: PetscCall(VecRestoreArrayRead(bb, &b));
4212: PetscFunctionReturn(PETSC_SUCCESS);
4213: }
4215: static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4216: {
4217: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4218: PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4219: const MatScalar *bdiag = a->inode.bdiag;
4220: const PetscScalar *b;
4221: PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz;
4222: const PetscInt *sizes = a->inode.size_csr;
4224: PetscFunctionBegin;
4225: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4226: PetscCall(VecGetArray(xx, &x));
4227: PetscCall(VecGetArrayRead(bb, &b));
4228: cnt = 0;
4229: for (i = 0, row = 0; i < m; i++) {
4230: nodesz = sizes[i + 1] - sizes[i];
4231: switch (nodesz) {
4232: case 1:
4233: x[row] = b[row] * bdiag[cnt++];
4234: row++;
4235: break;
4236: case 2:
4237: x1 = b[row];
4238: x2 = b[row + 1];
4239: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4240: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4241: x[row++] = tmp1;
4242: x[row++] = tmp2;
4243: cnt += 4;
4244: break;
4245: case 3:
4246: x1 = b[row];
4247: x2 = b[row + 1];
4248: x3 = b[row + 2];
4249: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4250: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4251: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4252: x[row++] = tmp1;
4253: x[row++] = tmp2;
4254: x[row++] = tmp3;
4255: cnt += 9;
4256: break;
4257: case 4:
4258: x1 = b[row];
4259: x2 = b[row + 1];
4260: x3 = b[row + 2];
4261: x4 = b[row + 3];
4262: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4263: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4264: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4265: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4266: x[row++] = tmp1;
4267: x[row++] = tmp2;
4268: x[row++] = tmp3;
4269: x[row++] = tmp4;
4270: cnt += 16;
4271: break;
4272: case 5:
4273: x1 = b[row];
4274: x2 = b[row + 1];
4275: x3 = b[row + 2];
4276: x4 = b[row + 3];
4277: x5 = b[row + 4];
4278: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4279: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4280: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4281: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4282: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4283: x[row++] = tmp1;
4284: x[row++] = tmp2;
4285: x[row++] = tmp3;
4286: x[row++] = tmp4;
4287: x[row++] = tmp5;
4288: cnt += 25;
4289: break;
4290: default:
4291: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4292: }
4293: }
4294: PetscCall(PetscLogFlops(2.0 * cnt));
4295: PetscCall(VecRestoreArray(xx, &x));
4296: PetscCall(VecRestoreArrayRead(bb, &b));
4297: PetscFunctionReturn(PETSC_SUCCESS);
4298: }
4300: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4301: {
4302: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4304: PetscFunctionBegin;
4305: a->inode.node_count = 0;
4306: a->inode.use = PETSC_FALSE;
4307: a->inode.checked = PETSC_FALSE;
4308: a->inode.mat_nonzerostate = -1;
4309: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4310: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4311: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4312: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4313: A->ops->coloringpatch = NULL;
4314: A->ops->multdiagonalblock = NULL;
4315: if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4316: PetscFunctionReturn(PETSC_SUCCESS);
4317: }
4319: /*
4320: samestructure indicates that the matrix has not changed its nonzero structure so we
4321: do not need to recompute the inodes
4322: */
4323: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4324: {
4325: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4326: PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size;
4327: PetscBool flag;
4328: const PetscInt *idx, *idy, *ii;
4330: PetscFunctionBegin;
4331: if (!a->inode.use) {
4332: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4333: PetscCall(PetscFree(a->inode.size_csr));
4334: PetscFunctionReturn(PETSC_SUCCESS);
4335: }
4336: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4338: m = A->rmap->n;
4339: if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
4340: ns = a->inode.size_csr;
4341: ns[0] = 0;
4343: i = 0;
4344: node_count = 0;
4345: idx = a->j;
4346: ii = a->i;
4347: if (idx) {
4348: while (i < m) { /* For each row */
4349: nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4350: /* Limits the number of elements in a node to 'a->inode.limit' */
4351: for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4352: nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4353: if (nzy != nzx) break;
4354: idy += nzx; /* Same nonzero pattern */
4355: PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4356: if (!flag) break;
4357: }
4358: ns[node_count + 1] = ns[node_count] + blk_size;
4359: node_count++;
4360: idx += blk_size * nzx;
4361: i = j;
4362: }
4363: }
4364: /* If not enough inodes found,, do not use inode version of the routines */
4365: if (!m || !idx || node_count > .8 * m) {
4366: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4367: PetscCall(PetscFree(a->inode.size_csr));
4368: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4369: } else {
4370: if (!A->factortype) {
4371: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4372: if (A->rmap->n == A->cmap->n) {
4373: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4374: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4375: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4376: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4377: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4378: }
4379: } else {
4380: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4381: }
4382: a->inode.node_count = node_count;
4383: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4384: }
4385: a->inode.checked = PETSC_TRUE;
4386: a->inode.mat_nonzerostate = A->nonzerostate;
4387: PetscFunctionReturn(PETSC_SUCCESS);
4388: }
4390: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4391: {
4392: Mat B = *C;
4393: Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4394: PetscInt m = A->rmap->n;
4396: PetscFunctionBegin;
4397: c->inode.use = a->inode.use;
4398: c->inode.limit = a->inode.limit;
4399: c->inode.max_limit = a->inode.max_limit;
4400: c->inode.checked = PETSC_FALSE;
4401: c->inode.size_csr = NULL;
4402: c->inode.node_count = 0;
4403: c->inode.ibdiagvalid = PETSC_FALSE;
4404: c->inode.ibdiag = NULL;
4405: c->inode.bdiag = NULL;
4406: c->inode.mat_nonzerostate = -1;
4407: if (a->inode.use) {
4408: if (a->inode.checked && a->inode.size_csr) {
4409: PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4410: PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4412: c->inode.checked = PETSC_TRUE;
4413: c->inode.node_count = a->inode.node_count;
4414: c->inode.mat_nonzerostate = (*C)->nonzerostate;
4415: }
4416: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4417: if (!B->factortype) {
4418: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4419: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4420: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4421: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4422: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4423: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4424: } else {
4425: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4426: }
4427: }
4428: PetscFunctionReturn(PETSC_SUCCESS);
4429: }
4431: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4432: {
4433: PetscInt k;
4434: const PetscInt *vi;
4436: PetscFunctionBegin;
4437: vi = aj + ai[row];
4438: for (k = 0; k < nzl; k++) cols[k] = vi[k];
4439: vi = aj + adiag[row];
4440: cols[nzl] = vi[0];
4441: vi = aj + adiag[row + 1] + 1;
4442: for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4443: PetscFunctionReturn(PETSC_SUCCESS);
4444: }
4445: /*
4446: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4447: Modified from MatSeqAIJCheckInode().
4449: Input Parameters:
4450: . Mat A - ILU or LU matrix factor
4452: */
4453: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4454: {
4455: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4456: PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4457: PetscInt *cols1, *cols2, *ns;
4458: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4459: PetscBool flag;
4461: PetscFunctionBegin;
4462: if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4463: if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4465: m = A->rmap->n;
4466: if (a->inode.size_csr) ns = a->inode.size_csr;
4467: else PetscCall(PetscMalloc1(m + 1, &ns));
4468: ns[0] = 0;
4470: i = 0;
4471: node_count = 0;
4472: PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4473: while (i < m) { /* For each row */
4474: nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */
4475: nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4476: nzx = nzl1 + nzu1 + 1;
4477: PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4479: /* Limits the number of elements in a node to 'a->inode.limit' */
4480: for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4481: nzl2 = ai[j + 1] - ai[j];
4482: nzu2 = adiag[j] - adiag[j + 1] - 1;
4483: nzy = nzl2 + nzu2 + 1;
4484: if (nzy != nzx) break;
4485: PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4486: PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4487: if (!flag) break;
4488: }
4489: ns[node_count + 1] = ns[node_count] + blk_size;
4490: node_count++;
4491: i = j;
4492: }
4493: PetscCall(PetscFree2(cols1, cols2));
4494: /* If not enough inodes found,, do not use inode version of the routines */
4495: if (!m || node_count > .8 * m) {
4496: PetscCall(PetscFree(ns));
4498: a->inode.node_count = 0;
4499: a->inode.size_csr = NULL;
4500: a->inode.use = PETSC_FALSE;
4502: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4503: } else {
4504: A->ops->mult = NULL;
4505: A->ops->sor = NULL;
4506: A->ops->multadd = NULL;
4507: A->ops->getrowij = NULL;
4508: A->ops->restorerowij = NULL;
4509: A->ops->getcolumnij = NULL;
4510: A->ops->restorecolumnij = NULL;
4511: A->ops->coloringpatch = NULL;
4512: A->ops->multdiagonalblock = NULL;
4513: a->inode.node_count = node_count;
4514: a->inode.size_csr = ns;
4515: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4516: }
4517: a->inode.checked = PETSC_TRUE;
4518: PetscFunctionReturn(PETSC_SUCCESS);
4519: }
4521: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4522: {
4523: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4525: PetscFunctionBegin;
4526: a->inode.ibdiagvalid = PETSC_FALSE;
4527: PetscFunctionReturn(PETSC_SUCCESS);
4528: }
4530: /*
4531: This is really ugly. if inodes are used this replaces the
4532: permutations with ones that correspond to rows/cols of the matrix
4533: rather than inode blocks
4534: */
4535: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4536: {
4537: PetscFunctionBegin;
4538: PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4539: PetscFunctionReturn(PETSC_SUCCESS);
4540: }
4542: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4543: {
4544: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4545: PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4546: const PetscInt *ridx, *cidx;
4547: PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4548: PetscInt nslim_col, *ns_col;
4549: IS ris = *rperm, cis = *cperm;
4551: PetscFunctionBegin;
4552: if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */
4553: if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4555: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4556: PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4557: PetscCall(PetscMalloc2(m, &permr, n, &permc));
4559: PetscCall(ISGetIndices(ris, &ridx));
4560: PetscCall(ISGetIndices(cis, &cidx));
4562: /* Form the inode structure for the rows of permuted matrix using inv perm*/
4563: for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
4565: /* Construct the permutations for rows*/
4566: for (i = 0, row = 0; i < nslim_row; ++i) {
4567: indx = ridx[i];
4568: start_val = tns[indx];
4569: end_val = tns[indx + 1];
4570: for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4571: }
4573: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4574: for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
4576: /* Construct permutations for columns */
4577: for (i = 0, col = 0; i < nslim_col; ++i) {
4578: indx = cidx[i];
4579: start_val = tns[indx];
4580: end_val = tns[indx + 1];
4581: for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4582: }
4584: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4585: PetscCall(ISSetPermutation(*rperm));
4586: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4587: PetscCall(ISSetPermutation(*cperm));
4589: PetscCall(ISRestoreIndices(ris, &ridx));
4590: PetscCall(ISRestoreIndices(cis, &cidx));
4592: PetscCall(PetscFree(ns_col));
4593: PetscCall(PetscFree2(permr, permc));
4594: PetscCall(ISDestroy(&cis));
4595: PetscCall(ISDestroy(&ris));
4596: PetscCall(PetscFree(tns));
4597: PetscFunctionReturn(PETSC_SUCCESS);
4598: }
4600: /*@C
4601: MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4603: Not Collective
4605: Input Parameter:
4606: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4608: Output Parameters:
4609: + node_count - no of inodes present in the matrix.
4610: . sizes - an array of size `node_count`, with the sizes of each inode.
4611: - limit - the max size used to generate the inodes.
4613: Level: advanced
4615: Note:
4616: It should be called after the matrix is assembled.
4617: The contents of the sizes[] array should not be changed.
4618: `NULL` may be passed for information not needed
4620: .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4621: @*/
4622: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4623: {
4624: PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4626: PetscFunctionBegin;
4627: PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4628: PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4629: if (f) PetscCall((*f)(A, node_count, sizes, limit));
4630: PetscFunctionReturn(PETSC_SUCCESS);
4631: }
4633: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4634: {
4635: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4637: PetscFunctionBegin;
4638: if (node_count) *node_count = a->inode.node_count;
4639: if (sizes) *sizes = a->inode.size_csr;
4640: if (limit) *limit = a->inode.limit;
4641: PetscFunctionReturn(PETSC_SUCCESS);
4642: }