Actual source code: inode.c
  1: /*
  2:   This file provides high performance routines for the Inode format (compressed sparse row)
  3:   by taking advantage of rows with identical nonzero structure (I-nodes).
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #if defined(PETSC_HAVE_XMMINTRIN_H)
  7:   #include <xmmintrin.h>
  8: #endif

 10: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
 11: {
 12:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
 13:   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;

 15:   PetscFunctionBegin;
 16:   n = A->cmap->n;
 17:   m = A->rmap->n;
 18:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
 19:   ns_row = a->inode.size_csr;

 21:   min_mn = (m < n) ? m : n;
 22:   if (!ns) {
 23:     for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
 24:     for (; count + 1 < n; count++, i++);
 25:     if (count < n) i++;
 26:     *size = i;
 27:     PetscFunctionReturn(PETSC_SUCCESS);
 28:   }
 29:   PetscCall(PetscMalloc1(n + 1, &ns_col));
 30:   ns_col[0] = 0;

 32:   /* Use the same row structure wherever feasible. */
 33:   for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];

 35:   /* if m < n; pad up the remainder with inode_limit */
 36:   for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
 37:   /* The last node is the odd ball. pad it up with the remaining rows; */
 38:   if (count < n) {
 39:     ns_col[i + 1] = ns_col[i] + (n - count);
 40:     i++;
 41:   } else if (count > n) {
 42:     /* Adjust for the over estimation */
 43:     ns_col[i] += n - count;
 44:   }
 45:   *size = i;
 46:   *ns   = ns_col;
 47:   PetscFunctionReturn(PETSC_SUCCESS);
 48: }

 50: /*
 51:       This builds symmetric version of nonzero structure,
 52: */
 53: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
 54: {
 55:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
 56:   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
 57:   PetscInt       *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
 58:   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;

 60:   PetscFunctionBegin;
 61:   nslim_row = a->inode.node_count;
 62:   m         = A->rmap->n;
 63:   n         = A->cmap->n;
 64:   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
 65:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");

 67:   /* Use the row_inode as column_inode */
 68:   nslim_col = nslim_row;
 69:   ns_col    = ns_row;

 71:   /* allocate space for reformatted inode structure */
 72:   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
 73:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);

 75:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
 76:     nsz = ns_col[i1 + 1] - ns_col[i1];
 77:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
 78:   }
 79:   /* allocate space for row pointers */
 80:   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
 81:   *iia = ia;
 82:   PetscCall(PetscMalloc1(nslim_row + 1, &work));

 84:   /* determine the number of columns in each row */
 85:   ia[0] = oshift;
 86:   for (i1 = 0; i1 < nslim_row; i1++) {
 87:     row  = ns_row[i1];
 88:     j    = aj + ai[row] + ishift;
 89:     jmax = aj + ai[row + 1] + ishift;
 90:     if (j == jmax) continue; /* empty row */
 91:     col = *j++ + ishift;
 92:     i2  = tvc[col];
 93:     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
 94:       ia[i1 + 1]++;
 95:       ia[i2 + 1]++;
 96:       i2++; /* Start col of next node */
 97:       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
 98:       i2 = tvc[col];
 99:     }
100:     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101:   }

103:   /* shift ia[i] to point to next row */
104:   for (i1 = 1; i1 < nslim_row + 1; i1++) {
105:     row = ia[i1 - 1];
106:     ia[i1] += row;
107:     work[i1 - 1] = row - oshift;
108:   }

110:   /* allocate space for column pointers */
111:   nz = ia[nslim_row] + (!ishift);
112:   PetscCall(PetscMalloc1(nz, &ja));
113:   *jja = ja;

115:   /* loop over lower triangular part putting into ja */
116:   for (i1 = 0; i1 < nslim_row; i1++) {
117:     row  = ns_row[i1];
118:     j    = aj + ai[row] + ishift;
119:     jmax = aj + ai[row + 1] + ishift;
120:     if (j == jmax) continue; /* empty row */
121:     col = *j++ + ishift;
122:     i2  = tvc[col];
123:     while (i2 < i1 && j < jmax) {
124:       ja[work[i2]++] = i1 + oshift;
125:       ja[work[i1]++] = i2 + oshift;
126:       ++i2;
127:       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128:       i2 = tvc[col];
129:     }
130:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131:   }
132:   PetscCall(PetscFree(work));
133:   PetscCall(PetscFree2(tns, tvc));
134:   PetscFunctionReturn(PETSC_SUCCESS);
135: }

137: /*
138:       This builds nonsymmetric version of nonzero structure,
139: */
140: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141: {
142:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
143:   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144:   PetscInt       *tns, *tvc, nsz, i1, i2;
145:   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;

147:   PetscFunctionBegin;
148:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149:   nslim_row = a->inode.node_count;
150:   n         = A->cmap->n;

152:   /* Create The column_inode for this matrix */
153:   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));

155:   /* allocate space for reformatted column_inode structure */
156:   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);

159:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160:     nsz = ns_col[i1 + 1] - ns_col[i1];
161:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162:   }
163:   /* allocate space for row pointers */
164:   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165:   *iia = ia;
166:   PetscCall(PetscMalloc1(nslim_row + 1, &work));

168:   /* determine the number of columns in each row */
169:   ia[0] = oshift;
170:   for (i1 = 0; i1 < nslim_row; i1++) {
171:     row = ns_row[i1];
172:     j   = aj + ai[row] + ishift;
173:     nz  = ai[row + 1] - ai[row];
174:     if (!nz) continue; /* empty row */
175:     col = *j++ + ishift;
176:     i2  = tvc[col];
177:     while (nz-- > 0) { /* off-diagonal elements */
178:       ia[i1 + 1]++;
179:       i2++; /* Start col of next node */
180:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181:       if (nz > 0) i2 = tvc[col];
182:     }
183:   }

185:   /* shift ia[i] to point to next row */
186:   for (i1 = 1; i1 < nslim_row + 1; i1++) {
187:     row = ia[i1 - 1];
188:     ia[i1] += row;
189:     work[i1 - 1] = row - oshift;
190:   }

192:   /* allocate space for column pointers */
193:   nz = ia[nslim_row] + (!ishift);
194:   PetscCall(PetscMalloc1(nz, &ja));
195:   *jja = ja;

197:   /* loop over matrix putting into ja */
198:   for (i1 = 0; i1 < nslim_row; i1++) {
199:     row = ns_row[i1];
200:     j   = aj + ai[row] + ishift;
201:     nz  = ai[row + 1] - ai[row];
202:     if (!nz) continue; /* empty row */
203:     col = *j++ + ishift;
204:     i2  = tvc[col];
205:     while (nz-- > 0) {
206:       ja[work[i1]++] = i2 + oshift;
207:       ++i2;
208:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209:       if (nz > 0) i2 = tvc[col];
210:     }
211:   }
212:   PetscCall(PetscFree(ns_col));
213:   PetscCall(PetscFree(work));
214:   PetscCall(PetscFree2(tns, tvc));
215:   PetscFunctionReturn(PETSC_SUCCESS);
216: }

218: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219: {
220:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

222:   PetscFunctionBegin;
223:   if (n) *n = a->inode.node_count;
224:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225:   if (!blockcompressed) {
226:     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227:   } else if (symmetric) {
228:     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229:   } else {
230:     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231:   }
232:   PetscFunctionReturn(PETSC_SUCCESS);
233: }

235: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236: {
237:   PetscFunctionBegin;
238:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);

240:   if (!blockcompressed) {
241:     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242:   } else {
243:     PetscCall(PetscFree(*ia));
244:     PetscCall(PetscFree(*ja));
245:   }
246:   PetscFunctionReturn(PETSC_SUCCESS);
247: }

249: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250: {
251:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252:   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253:   PetscInt   *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;

255:   PetscFunctionBegin;
256:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257:   nslim_row = a->inode.node_count;
258:   n         = A->cmap->n;

260:   /* Create The column_inode for this matrix */
261:   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));

263:   /* allocate space for reformatted column_inode structure */
264:   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);

267:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268:     nsz = ns_col[i1 + 1] - ns_col[i1];
269:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270:   }
271:   /* allocate space for column pointers */
272:   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273:   *iia = ia;
274:   PetscCall(PetscMalloc1(nslim_col + 1, &work));

276:   /* determine the number of columns in each row */
277:   ia[0] = oshift;
278:   for (i1 = 0; i1 < nslim_row; i1++) {
279:     row = ns_row[i1];
280:     j   = aj + ai[row] + ishift;
281:     col = *j++ + ishift;
282:     i2  = tvc[col];
283:     nz  = ai[row + 1] - ai[row];
284:     while (nz-- > 0) { /* off-diagonal elements */
285:       /* ia[i1+1]++; */
286:       ia[i2 + 1]++;
287:       i2++;
288:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289:       if (nz > 0) i2 = tvc[col];
290:     }
291:   }

293:   /* shift ia[i] to point to next col */
294:   for (i1 = 1; i1 < nslim_col + 1; i1++) {
295:     col = ia[i1 - 1];
296:     ia[i1] += col;
297:     work[i1 - 1] = col - oshift;
298:   }

300:   /* allocate space for column pointers */
301:   nz = ia[nslim_col] + (!ishift);
302:   PetscCall(PetscMalloc1(nz, &ja));
303:   *jja = ja;

305:   /* loop over matrix putting into ja */
306:   for (i1 = 0; i1 < nslim_row; i1++) {
307:     row = ns_row[i1];
308:     j   = aj + ai[row] + ishift;
309:     col = *j++ + ishift;
310:     i2  = tvc[col];
311:     nz  = ai[row + 1] - ai[row];
312:     while (nz-- > 0) {
313:       /* ja[work[i1]++] = i2 + oshift; */
314:       ja[work[i2]++] = i1 + oshift;
315:       i2++;
316:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317:       if (nz > 0) i2 = tvc[col];
318:     }
319:   }
320:   PetscCall(PetscFree(ns_col));
321:   PetscCall(PetscFree(work));
322:   PetscCall(PetscFree2(tns, tvc));
323:   PetscFunctionReturn(PETSC_SUCCESS);
324: }

326: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327: {
328:   PetscFunctionBegin;
329:   PetscCall(MatCreateColInode_Private(A, n, NULL));
330:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);

332:   if (!blockcompressed) {
333:     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334:   } else if (symmetric) {
335:     /* Since the indices are symmetric it doesn't matter */
336:     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337:   } else {
338:     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339:   }
340:   PetscFunctionReturn(PETSC_SUCCESS);
341: }

343: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344: {
345:   PetscFunctionBegin;
346:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347:   if (!blockcompressed) {
348:     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349:   } else {
350:     PetscCall(PetscFree(*ia));
351:     PetscCall(PetscFree(*ja));
352:   }
353:   PetscFunctionReturn(PETSC_SUCCESS);
354: }

356: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357: {
358:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
359:   PetscScalar       *y;
360:   const PetscScalar *x;
361:   PetscInt           row, node_max, nonzerorow = 0;
362:   PetscInt          *ns;

364:   PetscFunctionBegin;
365:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
366:   node_max = a->inode.node_count;
367:   ns       = a->inode.size_csr; /* Node Size array */
368:   PetscCall(VecGetArrayRead(xx, &x));
369:   PetscCall(VecGetArray(yy, &y));

371:   PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372:   for (PetscInt i = 0; i < node_max; ++i) {
373:     PetscInt         i1, i2, nsz, n, sz;
374:     const MatScalar *v1, *v2, *v3, *v4, *v5;
375:     PetscScalar      sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376:     const PetscInt  *idx;

378: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379:   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380: #endif
381:     row = ns[i];
382:     nsz = ns[i + 1] - ns[i];
383:     n   = a->i[row + 1] - a->i[row];
384:     nonzerorow += (n > 0) * nsz;

386:     idx = &a->j[a->i[row]];
387:     v1  = &a->a[a->i[row]];
388:     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
389:     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
390:     sz = n;                                                                /* No of non zeros in this row */
391:                                                                            /* Switch on the size of Node */
392:     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
393:     case 1:
394:       sum1 = 0.;

396:       for (n = 0; n < sz - 1; n += 2) {
397:         i1 = idx[0]; /* The instructions are ordered to */
398:         i2 = idx[1]; /* make the compiler's job easy */
399:         idx += 2;
400:         tmp0 = x[i1];
401:         tmp1 = x[i2];
402:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
403:         v1 += 2;
404:       }

406:       if (n == sz - 1) { /* Take care of the last nonzero  */
407:         tmp0 = x[*idx++];
408:         sum1 += *v1++ * tmp0;
409:       }
410:       y[row++] = sum1;
411:       break;
412:     case 2:
413:       sum1 = 0.;
414:       sum2 = 0.;
415:       v2   = v1 + n;

417:       for (n = 0; n < sz - 1; n += 2) {
418:         i1 = idx[0];
419:         i2 = idx[1];
420:         idx += 2;
421:         tmp0 = x[i1];
422:         tmp1 = x[i2];
423:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
424:         v1 += 2;
425:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
426:         v2 += 2;
427:       }
428:       if (n == sz - 1) {
429:         tmp0 = x[*idx++];
430:         sum1 += *v1++ * tmp0;
431:         sum2 += *v2++ * tmp0;
432:       }
433:       y[row++] = sum1;
434:       y[row++] = sum2;
435:       v1       = v2; /* Since the next block to be processed starts there*/
436:       idx += sz;
437:       break;
438:     case 3:
439:       sum1 = 0.;
440:       sum2 = 0.;
441:       sum3 = 0.;
442:       v2   = v1 + n;
443:       v3   = v2 + n;

445:       for (n = 0; n < sz - 1; n += 2) {
446:         i1 = idx[0];
447:         i2 = idx[1];
448:         idx += 2;
449:         tmp0 = x[i1];
450:         tmp1 = x[i2];
451:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
452:         v1 += 2;
453:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
454:         v2 += 2;
455:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
456:         v3 += 2;
457:       }
458:       if (n == sz - 1) {
459:         tmp0 = x[*idx++];
460:         sum1 += *v1++ * tmp0;
461:         sum2 += *v2++ * tmp0;
462:         sum3 += *v3++ * tmp0;
463:       }
464:       y[row++] = sum1;
465:       y[row++] = sum2;
466:       y[row++] = sum3;
467:       v1       = v3; /* Since the next block to be processed starts there*/
468:       idx += 2 * sz;
469:       break;
470:     case 4:
471:       sum1 = 0.;
472:       sum2 = 0.;
473:       sum3 = 0.;
474:       sum4 = 0.;
475:       v2   = v1 + n;
476:       v3   = v2 + n;
477:       v4   = v3 + n;

479:       for (n = 0; n < sz - 1; n += 2) {
480:         i1 = idx[0];
481:         i2 = idx[1];
482:         idx += 2;
483:         tmp0 = x[i1];
484:         tmp1 = x[i2];
485:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
486:         v1 += 2;
487:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
488:         v2 += 2;
489:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
490:         v3 += 2;
491:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
492:         v4 += 2;
493:       }
494:       if (n == sz - 1) {
495:         tmp0 = x[*idx++];
496:         sum1 += *v1++ * tmp0;
497:         sum2 += *v2++ * tmp0;
498:         sum3 += *v3++ * tmp0;
499:         sum4 += *v4++ * tmp0;
500:       }
501:       y[row++] = sum1;
502:       y[row++] = sum2;
503:       y[row++] = sum3;
504:       y[row++] = sum4;
505:       v1       = v4; /* Since the next block to be processed starts there*/
506:       idx += 3 * sz;
507:       break;
508:     case 5:
509:       sum1 = 0.;
510:       sum2 = 0.;
511:       sum3 = 0.;
512:       sum4 = 0.;
513:       sum5 = 0.;
514:       v2   = v1 + n;
515:       v3   = v2 + n;
516:       v4   = v3 + n;
517:       v5   = v4 + n;

519:       for (n = 0; n < sz - 1; n += 2) {
520:         i1 = idx[0];
521:         i2 = idx[1];
522:         idx += 2;
523:         tmp0 = x[i1];
524:         tmp1 = x[i2];
525:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
526:         v1 += 2;
527:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
528:         v2 += 2;
529:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
530:         v3 += 2;
531:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
532:         v4 += 2;
533:         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
534:         v5 += 2;
535:       }
536:       if (n == sz - 1) {
537:         tmp0 = x[*idx++];
538:         sum1 += *v1++ * tmp0;
539:         sum2 += *v2++ * tmp0;
540:         sum3 += *v3++ * tmp0;
541:         sum4 += *v4++ * tmp0;
542:         sum5 += *v5++ * tmp0;
543:       }
544:       y[row++] = sum1;
545:       y[row++] = sum2;
546:       y[row++] = sum3;
547:       y[row++] = sum4;
548:       y[row++] = sum5;
549:       v1       = v5; /* Since the next block to be processed starts there */
550:       idx += 4 * sz;
551:       break;
552:     default:
553:       SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
554:     }
555:   }
556:   PetscCall(VecRestoreArrayRead(xx, &x));
557:   PetscCall(VecRestoreArray(yy, &y));
558:   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
559:   PetscFunctionReturn(PETSC_SUCCESS);
560: }

562: /* Almost same code as the MatMult_SeqAIJ_Inode() */
563: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564: {
565:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
566:   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
567:   const MatScalar   *v1, *v2, *v3, *v4, *v5;
568:   const PetscScalar *x;
569:   PetscScalar       *y, *z, *zt;
570:   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
571:   const PetscInt    *idx, *ns, *ii;

573:   PetscFunctionBegin;
574:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
575:   node_max = a->inode.node_count;
576:   ns       = a->inode.size_csr; /* Node Size array */

578:   PetscCall(VecGetArrayRead(xx, &x));
579:   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
580:   zt = z;

582:   idx = a->j;
583:   v1  = a->a;
584:   ii  = a->i;

586:   for (i = 0; i < node_max; ++i) {
587:     row = ns[i];
588:     nsz = ns[i + 1] - ns[i];
589:     n   = ii[1] - ii[0];
590:     ii += nsz;
591:     sz = n;        /* No of non zeros in this row */
592:                    /* Switch on the size of Node */
593:     switch (nsz) { /* Each loop in 'case' is unrolled */
594:     case 1:
595:       sum1 = *zt++;

597:       for (n = 0; n < sz - 1; n += 2) {
598:         i1 = idx[0]; /* The instructions are ordered to */
599:         i2 = idx[1]; /* make the compiler's job easy */
600:         idx += 2;
601:         tmp0 = x[i1];
602:         tmp1 = x[i2];
603:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
604:         v1 += 2;
605:       }

607:       if (n == sz - 1) { /* Take care of the last nonzero  */
608:         tmp0 = x[*idx++];
609:         sum1 += *v1++ * tmp0;
610:       }
611:       y[row++] = sum1;
612:       break;
613:     case 2:
614:       sum1 = *zt++;
615:       sum2 = *zt++;
616:       v2   = v1 + n;

618:       for (n = 0; n < sz - 1; n += 2) {
619:         i1 = idx[0];
620:         i2 = idx[1];
621:         idx += 2;
622:         tmp0 = x[i1];
623:         tmp1 = x[i2];
624:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
625:         v1 += 2;
626:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
627:         v2 += 2;
628:       }
629:       if (n == sz - 1) {
630:         tmp0 = x[*idx++];
631:         sum1 += *v1++ * tmp0;
632:         sum2 += *v2++ * tmp0;
633:       }
634:       y[row++] = sum1;
635:       y[row++] = sum2;
636:       v1       = v2; /* Since the next block to be processed starts there*/
637:       idx += sz;
638:       break;
639:     case 3:
640:       sum1 = *zt++;
641:       sum2 = *zt++;
642:       sum3 = *zt++;
643:       v2   = v1 + n;
644:       v3   = v2 + n;

646:       for (n = 0; n < sz - 1; n += 2) {
647:         i1 = idx[0];
648:         i2 = idx[1];
649:         idx += 2;
650:         tmp0 = x[i1];
651:         tmp1 = x[i2];
652:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
653:         v1 += 2;
654:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
655:         v2 += 2;
656:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
657:         v3 += 2;
658:       }
659:       if (n == sz - 1) {
660:         tmp0 = x[*idx++];
661:         sum1 += *v1++ * tmp0;
662:         sum2 += *v2++ * tmp0;
663:         sum3 += *v3++ * tmp0;
664:       }
665:       y[row++] = sum1;
666:       y[row++] = sum2;
667:       y[row++] = sum3;
668:       v1       = v3; /* Since the next block to be processed starts there*/
669:       idx += 2 * sz;
670:       break;
671:     case 4:
672:       sum1 = *zt++;
673:       sum2 = *zt++;
674:       sum3 = *zt++;
675:       sum4 = *zt++;
676:       v2   = v1 + n;
677:       v3   = v2 + n;
678:       v4   = v3 + n;

680:       for (n = 0; n < sz - 1; n += 2) {
681:         i1 = idx[0];
682:         i2 = idx[1];
683:         idx += 2;
684:         tmp0 = x[i1];
685:         tmp1 = x[i2];
686:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
687:         v1 += 2;
688:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
689:         v2 += 2;
690:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
691:         v3 += 2;
692:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
693:         v4 += 2;
694:       }
695:       if (n == sz - 1) {
696:         tmp0 = x[*idx++];
697:         sum1 += *v1++ * tmp0;
698:         sum2 += *v2++ * tmp0;
699:         sum3 += *v3++ * tmp0;
700:         sum4 += *v4++ * tmp0;
701:       }
702:       y[row++] = sum1;
703:       y[row++] = sum2;
704:       y[row++] = sum3;
705:       y[row++] = sum4;
706:       v1       = v4; /* Since the next block to be processed starts there*/
707:       idx += 3 * sz;
708:       break;
709:     case 5:
710:       sum1 = *zt++;
711:       sum2 = *zt++;
712:       sum3 = *zt++;
713:       sum4 = *zt++;
714:       sum5 = *zt++;
715:       v2   = v1 + n;
716:       v3   = v2 + n;
717:       v4   = v3 + n;
718:       v5   = v4 + n;

720:       for (n = 0; n < sz - 1; n += 2) {
721:         i1 = idx[0];
722:         i2 = idx[1];
723:         idx += 2;
724:         tmp0 = x[i1];
725:         tmp1 = x[i2];
726:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
727:         v1 += 2;
728:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
729:         v2 += 2;
730:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
731:         v3 += 2;
732:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
733:         v4 += 2;
734:         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
735:         v5 += 2;
736:       }
737:       if (n == sz - 1) {
738:         tmp0 = x[*idx++];
739:         sum1 += *v1++ * tmp0;
740:         sum2 += *v2++ * tmp0;
741:         sum3 += *v3++ * tmp0;
742:         sum4 += *v4++ * tmp0;
743:         sum5 += *v5++ * tmp0;
744:       }
745:       y[row++] = sum1;
746:       y[row++] = sum2;
747:       y[row++] = sum3;
748:       y[row++] = sum4;
749:       y[row++] = sum5;
750:       v1       = v5; /* Since the next block to be processed starts there */
751:       idx += 4 * sz;
752:       break;
753:     default:
754:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
755:     }
756:   }
757:   PetscCall(VecRestoreArrayRead(xx, &x));
758:   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
759:   PetscCall(PetscLogFlops(2.0 * a->nz));
760:   PetscFunctionReturn(PETSC_SUCCESS);
761: }

763: static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764: {
765:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
766:   IS                 iscol = a->col, isrow = a->row;
767:   const PetscInt    *r, *c, *rout, *cout;
768:   PetscInt           i, j, n = A->rmap->n, nz;
769:   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
770:   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771:   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
772:   PetscScalar        sum1, sum2, sum3, sum4, sum5;
773:   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774:   const PetscScalar *b;

776:   PetscFunctionBegin;
777:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
778:   node_max = a->inode.node_count;
779:   ns       = a->inode.size_csr; /* Node Size array */

781:   PetscCall(VecGetArrayRead(bb, &b));
782:   PetscCall(VecGetArrayWrite(xx, &x));
783:   tmp = a->solve_work;

785:   PetscCall(ISGetIndices(isrow, &rout));
786:   r = rout;
787:   PetscCall(ISGetIndices(iscol, &cout));
788:   c = cout + (n - 1);

790:   /* forward solve the lower triangular */
791:   tmps = tmp;
792:   aa   = a_a;
793:   aj   = a_j;
794:   ad   = a->diag;

796:   for (i = 0, row = 0; i < node_max; ++i) {
797:     row = ns[i];
798:     nsz = ns[i + 1] - ns[i];
799:     aii = ai[row];
800:     v1  = aa + aii;
801:     vi  = aj + aii;
802:     nz  = ad[row] - aii;
803:     if (i < node_max - 1) {
804:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
805:       * but our indexing to determine its size could. */
806:       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
807:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
808:       PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
809:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
810:     }

812:     switch (nsz) { /* Each loop in 'case' is unrolled */
813:     case 1:
814:       sum1 = b[*r++];
815:       for (j = 0; j < nz - 1; j += 2) {
816:         i0 = vi[0];
817:         i1 = vi[1];
818:         vi += 2;
819:         tmp0 = tmps[i0];
820:         tmp1 = tmps[i1];
821:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822:         v1 += 2;
823:       }
824:       if (j == nz - 1) {
825:         tmp0 = tmps[*vi++];
826:         sum1 -= *v1++ * tmp0;
827:       }
828:       tmp[row++] = sum1;
829:       break;
830:     case 2:
831:       sum1 = b[*r++];
832:       sum2 = b[*r++];
833:       v2   = aa + ai[row + 1];

835:       for (j = 0; j < nz - 1; j += 2) {
836:         i0 = vi[0];
837:         i1 = vi[1];
838:         vi += 2;
839:         tmp0 = tmps[i0];
840:         tmp1 = tmps[i1];
841:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
842:         v1 += 2;
843:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
844:         v2 += 2;
845:       }
846:       if (j == nz - 1) {
847:         tmp0 = tmps[*vi++];
848:         sum1 -= *v1++ * tmp0;
849:         sum2 -= *v2++ * tmp0;
850:       }
851:       sum2 -= *v2++ * sum1;
852:       tmp[row++] = sum1;
853:       tmp[row++] = sum2;
854:       break;
855:     case 3:
856:       sum1 = b[*r++];
857:       sum2 = b[*r++];
858:       sum3 = b[*r++];
859:       v2   = aa + ai[row + 1];
860:       v3   = aa + ai[row + 2];

862:       for (j = 0; j < nz - 1; j += 2) {
863:         i0 = vi[0];
864:         i1 = vi[1];
865:         vi += 2;
866:         tmp0 = tmps[i0];
867:         tmp1 = tmps[i1];
868:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
869:         v1 += 2;
870:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
871:         v2 += 2;
872:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
873:         v3 += 2;
874:       }
875:       if (j == nz - 1) {
876:         tmp0 = tmps[*vi++];
877:         sum1 -= *v1++ * tmp0;
878:         sum2 -= *v2++ * tmp0;
879:         sum3 -= *v3++ * tmp0;
880:       }
881:       sum2 -= *v2++ * sum1;
882:       sum3 -= *v3++ * sum1;
883:       sum3 -= *v3++ * sum2;

885:       tmp[row++] = sum1;
886:       tmp[row++] = sum2;
887:       tmp[row++] = sum3;
888:       break;

890:     case 4:
891:       sum1 = b[*r++];
892:       sum2 = b[*r++];
893:       sum3 = b[*r++];
894:       sum4 = b[*r++];
895:       v2   = aa + ai[row + 1];
896:       v3   = aa + ai[row + 2];
897:       v4   = aa + ai[row + 3];

899:       for (j = 0; j < nz - 1; j += 2) {
900:         i0 = vi[0];
901:         i1 = vi[1];
902:         vi += 2;
903:         tmp0 = tmps[i0];
904:         tmp1 = tmps[i1];
905:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
906:         v1 += 2;
907:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
908:         v2 += 2;
909:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
910:         v3 += 2;
911:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
912:         v4 += 2;
913:       }
914:       if (j == nz - 1) {
915:         tmp0 = tmps[*vi++];
916:         sum1 -= *v1++ * tmp0;
917:         sum2 -= *v2++ * tmp0;
918:         sum3 -= *v3++ * tmp0;
919:         sum4 -= *v4++ * tmp0;
920:       }
921:       sum2 -= *v2++ * sum1;
922:       sum3 -= *v3++ * sum1;
923:       sum4 -= *v4++ * sum1;
924:       sum3 -= *v3++ * sum2;
925:       sum4 -= *v4++ * sum2;
926:       sum4 -= *v4++ * sum3;

928:       tmp[row++] = sum1;
929:       tmp[row++] = sum2;
930:       tmp[row++] = sum3;
931:       tmp[row++] = sum4;
932:       break;
933:     case 5:
934:       sum1 = b[*r++];
935:       sum2 = b[*r++];
936:       sum3 = b[*r++];
937:       sum4 = b[*r++];
938:       sum5 = b[*r++];
939:       v2   = aa + ai[row + 1];
940:       v3   = aa + ai[row + 2];
941:       v4   = aa + ai[row + 3];
942:       v5   = aa + ai[row + 4];

944:       for (j = 0; j < nz - 1; j += 2) {
945:         i0 = vi[0];
946:         i1 = vi[1];
947:         vi += 2;
948:         tmp0 = tmps[i0];
949:         tmp1 = tmps[i1];
950:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
951:         v1 += 2;
952:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
953:         v2 += 2;
954:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
955:         v3 += 2;
956:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
957:         v4 += 2;
958:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
959:         v5 += 2;
960:       }
961:       if (j == nz - 1) {
962:         tmp0 = tmps[*vi++];
963:         sum1 -= *v1++ * tmp0;
964:         sum2 -= *v2++ * tmp0;
965:         sum3 -= *v3++ * tmp0;
966:         sum4 -= *v4++ * tmp0;
967:         sum5 -= *v5++ * tmp0;
968:       }

970:       sum2 -= *v2++ * sum1;
971:       sum3 -= *v3++ * sum1;
972:       sum4 -= *v4++ * sum1;
973:       sum5 -= *v5++ * sum1;
974:       sum3 -= *v3++ * sum2;
975:       sum4 -= *v4++ * sum2;
976:       sum5 -= *v5++ * sum2;
977:       sum4 -= *v4++ * sum3;
978:       sum5 -= *v5++ * sum3;
979:       sum5 -= *v5++ * sum4;

981:       tmp[row++] = sum1;
982:       tmp[row++] = sum2;
983:       tmp[row++] = sum3;
984:       tmp[row++] = sum4;
985:       tmp[row++] = sum5;
986:       break;
987:     default:
988:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
989:     }
990:   }
991:   /* backward solve the upper triangular */
992:   for (i = node_max - 1; i >= 0; i--) {
993:     row = ns[i + 1];
994:     nsz = ns[i + 1] - ns[i];
995:     aii = ai[row + 1] - 1;
996:     v1  = aa + aii;
997:     vi  = aj + aii;
998:     nz  = aii - ad[row];
999:     switch (nsz) { /* Each loop in 'case' is unrolled */
1000:     case 1:
1001:       sum1 = tmp[row];

1003:       for (j = nz; j > 1; j -= 2) {
1004:         vi -= 2;
1005:         i0   = vi[2];
1006:         i1   = vi[1];
1007:         tmp0 = tmps[i0];
1008:         tmp1 = tmps[i1];
1009:         v1 -= 2;
1010:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1011:       }
1012:       if (j == 1) {
1013:         tmp0 = tmps[*vi--];
1014:         sum1 -= *v1-- * tmp0;
1015:       }
1016:       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1017:       row--;
1018:       break;
1019:     case 2:
1020:       sum1 = tmp[row];
1021:       sum2 = tmp[row - 1];
1022:       v2   = aa + ai[row] - 1;
1023:       for (j = nz; j > 1; j -= 2) {
1024:         vi -= 2;
1025:         i0   = vi[2];
1026:         i1   = vi[1];
1027:         tmp0 = tmps[i0];
1028:         tmp1 = tmps[i1];
1029:         v1 -= 2;
1030:         v2 -= 2;
1031:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1032:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1033:       }
1034:       if (j == 1) {
1035:         tmp0 = tmps[*vi--];
1036:         sum1 -= *v1-- * tmp0;
1037:         sum2 -= *v2-- * tmp0;
1038:       }

1040:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1041:       row--;
1042:       sum2 -= *v2-- * tmp0;
1043:       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1044:       row--;
1045:       break;
1046:     case 3:
1047:       sum1 = tmp[row];
1048:       sum2 = tmp[row - 1];
1049:       sum3 = tmp[row - 2];
1050:       v2   = aa + ai[row] - 1;
1051:       v3   = aa + ai[row - 1] - 1;
1052:       for (j = nz; j > 1; j -= 2) {
1053:         vi -= 2;
1054:         i0   = vi[2];
1055:         i1   = vi[1];
1056:         tmp0 = tmps[i0];
1057:         tmp1 = tmps[i1];
1058:         v1 -= 2;
1059:         v2 -= 2;
1060:         v3 -= 2;
1061:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064:       }
1065:       if (j == 1) {
1066:         tmp0 = tmps[*vi--];
1067:         sum1 -= *v1-- * tmp0;
1068:         sum2 -= *v2-- * tmp0;
1069:         sum3 -= *v3-- * tmp0;
1070:       }
1071:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1072:       row--;
1073:       sum2 -= *v2-- * tmp0;
1074:       sum3 -= *v3-- * tmp0;
1075:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1076:       row--;
1077:       sum3 -= *v3-- * tmp0;
1078:       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1079:       row--;

1081:       break;
1082:     case 4:
1083:       sum1 = tmp[row];
1084:       sum2 = tmp[row - 1];
1085:       sum3 = tmp[row - 2];
1086:       sum4 = tmp[row - 3];
1087:       v2   = aa + ai[row] - 1;
1088:       v3   = aa + ai[row - 1] - 1;
1089:       v4   = aa + ai[row - 2] - 1;

1091:       for (j = nz; j > 1; j -= 2) {
1092:         vi -= 2;
1093:         i0   = vi[2];
1094:         i1   = vi[1];
1095:         tmp0 = tmps[i0];
1096:         tmp1 = tmps[i1];
1097:         v1 -= 2;
1098:         v2 -= 2;
1099:         v3 -= 2;
1100:         v4 -= 2;
1101:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1102:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1103:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1104:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1105:       }
1106:       if (j == 1) {
1107:         tmp0 = tmps[*vi--];
1108:         sum1 -= *v1-- * tmp0;
1109:         sum2 -= *v2-- * tmp0;
1110:         sum3 -= *v3-- * tmp0;
1111:         sum4 -= *v4-- * tmp0;
1112:       }

1114:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1115:       row--;
1116:       sum2 -= *v2-- * tmp0;
1117:       sum3 -= *v3-- * tmp0;
1118:       sum4 -= *v4-- * tmp0;
1119:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1120:       row--;
1121:       sum3 -= *v3-- * tmp0;
1122:       sum4 -= *v4-- * tmp0;
1123:       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1124:       row--;
1125:       sum4 -= *v4-- * tmp0;
1126:       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1127:       row--;
1128:       break;
1129:     case 5:
1130:       sum1 = tmp[row];
1131:       sum2 = tmp[row - 1];
1132:       sum3 = tmp[row - 2];
1133:       sum4 = tmp[row - 3];
1134:       sum5 = tmp[row - 4];
1135:       v2   = aa + ai[row] - 1;
1136:       v3   = aa + ai[row - 1] - 1;
1137:       v4   = aa + ai[row - 2] - 1;
1138:       v5   = aa + ai[row - 3] - 1;
1139:       for (j = nz; j > 1; j -= 2) {
1140:         vi -= 2;
1141:         i0   = vi[2];
1142:         i1   = vi[1];
1143:         tmp0 = tmps[i0];
1144:         tmp1 = tmps[i1];
1145:         v1 -= 2;
1146:         v2 -= 2;
1147:         v3 -= 2;
1148:         v4 -= 2;
1149:         v5 -= 2;
1150:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1151:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1152:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1153:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1154:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1155:       }
1156:       if (j == 1) {
1157:         tmp0 = tmps[*vi--];
1158:         sum1 -= *v1-- * tmp0;
1159:         sum2 -= *v2-- * tmp0;
1160:         sum3 -= *v3-- * tmp0;
1161:         sum4 -= *v4-- * tmp0;
1162:         sum5 -= *v5-- * tmp0;
1163:       }

1165:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1166:       row--;
1167:       sum2 -= *v2-- * tmp0;
1168:       sum3 -= *v3-- * tmp0;
1169:       sum4 -= *v4-- * tmp0;
1170:       sum5 -= *v5-- * tmp0;
1171:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1172:       row--;
1173:       sum3 -= *v3-- * tmp0;
1174:       sum4 -= *v4-- * tmp0;
1175:       sum5 -= *v5-- * tmp0;
1176:       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1177:       row--;
1178:       sum4 -= *v4-- * tmp0;
1179:       sum5 -= *v5-- * tmp0;
1180:       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1181:       row--;
1182:       sum5 -= *v5-- * tmp0;
1183:       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1184:       row--;
1185:       break;
1186:     default:
1187:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1188:     }
1189:   }
1190:   PetscCall(ISRestoreIndices(isrow, &rout));
1191:   PetscCall(ISRestoreIndices(iscol, &cout));
1192:   PetscCall(VecRestoreArrayRead(bb, &b));
1193:   PetscCall(VecRestoreArrayWrite(xx, &x));
1194:   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1195:   PetscFunctionReturn(PETSC_SUCCESS);
1196: }

1198: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199: {
1200:   Mat              C = B;
1201:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1202:   IS               isrow = b->row, isicol = b->icol;
1203:   const PetscInt  *r, *ic, *ics;
1204:   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1205:   PetscInt         i, j, k, nz, nzL, row, *pj;
1206:   const PetscInt  *ajtmp, *bjtmp;
1207:   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1208:   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1209:   FactorShiftCtx   sctx;
1210:   const PetscInt  *ddiag;
1211:   PetscReal        rs;
1212:   MatScalar        d;
1213:   PetscInt         inod, nodesz, node_max, col;
1214:   const PetscInt  *ns;
1215:   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;

1217:   PetscFunctionBegin;
1218:   /* MatPivotSetUp(): initialize shift context sctx */
1219:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

1221:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1222:     ddiag          = a->diag;
1223:     sctx.shift_top = info->zeropivot;
1224:     for (i = 0; i < n; i++) {
1225:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1226:       d  = (aa)[ddiag[i]];
1227:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1228:       v  = aa + ai[i];
1229:       nz = ai[i + 1] - ai[i];
1230:       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1231:       if (rs > sctx.shift_top) sctx.shift_top = rs;
1232:     }
1233:     sctx.shift_top *= 1.1;
1234:     sctx.nshift_max = 5;
1235:     sctx.shift_lo   = 0.;
1236:     sctx.shift_hi   = 1.;
1237:   }

1239:   PetscCall(ISGetIndices(isrow, &r));
1240:   PetscCall(ISGetIndices(isicol, &ic));

1242:   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1243:   ics = ic;

1245:   node_max = a->inode.node_count;
1246:   ns       = a->inode.size_csr;
1247:   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");

1249:   /* If max inode size > 4, split it into two inodes.*/
1250:   /* also map the inode sizes according to the ordering */
1251:   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1252:   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1253:     nodesz = ns[i + 1] - ns[i];
1254:     if (nodesz > 4) {
1255:       tmp_vec1[j] = 4;
1256:       ++j;
1257:       tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1258:     } else {
1259:       tmp_vec1[j] = nodesz;
1260:     }
1261:   }
1262:   /* Use the correct node_max */
1263:   node_max = j;

1265:   /* Now reorder the inode info based on mat re-ordering info */
1266:   /* First create a row -> inode_size_array_index map */
1267:   PetscCall(PetscMalloc1(n + 1, &nsmap));
1268:   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1269:   tmp_vec2[0] = 0;
1270:   for (i = 0, row = 0; i < node_max; i++) {
1271:     nodesz = tmp_vec1[i];
1272:     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1273:   }
1274:   /* Using nsmap, create a reordered ns structure */
1275:   for (i = 0, j = 0; i < node_max; i++) {
1276:     nodesz          = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1277:     tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1278:     j += nodesz;
1279:   }
1280:   PetscCall(PetscFree(nsmap));
1281:   PetscCall(PetscFree(tmp_vec1));

1283:   /* Now use the correct ns */
1284:   ns = tmp_vec2;

1286:   do {
1287:     sctx.newshift = PETSC_FALSE;
1288:     /* Now loop over each block-row, and do the factorization */
1289:     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1290:       nodesz = ns[inod + 1] - ns[inod];

1292:       switch (nodesz) {
1293:       case 1:
1294:         /* zero rtmp1 */
1295:         /* L part */
1296:         nz    = bi[i + 1] - bi[i];
1297:         bjtmp = bj + bi[i];
1298:         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;

1300:         /* U part */
1301:         nz    = bdiag[i] - bdiag[i + 1];
1302:         bjtmp = bj + bdiag[i + 1] + 1;
1303:         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;

1305:         /* load in initial (unfactored row) */
1306:         nz    = ai[r[i] + 1] - ai[r[i]];
1307:         ajtmp = aj + ai[r[i]];
1308:         v     = aa + ai[r[i]];
1309:         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1311:         /* ZeropivotApply() */
1312:         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */

1314:         /* elimination */
1315:         bjtmp = bj + bi[i];
1316:         row   = *bjtmp++;
1317:         nzL   = bi[i + 1] - bi[i];
1318:         for (k = 0; k < nzL; k++) {
1319:           pc = rtmp1 + row;
1320:           if (*pc != 0.0) {
1321:             pv   = b->a + bdiag[row];
1322:             mul1 = *pc * (*pv);
1323:             *pc  = mul1;
1324:             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1325:             pv   = b->a + bdiag[row + 1] + 1;
1326:             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327:             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1328:             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1329:           }
1330:           row = *bjtmp++;
1331:         }

1333:         /* finished row so stick it into b->a */
1334:         rs = 0.0;
1335:         /* L part */
1336:         pv = b->a + bi[i];
1337:         pj = b->j + bi[i];
1338:         nz = bi[i + 1] - bi[i];
1339:         for (j = 0; j < nz; j++) {
1340:           pv[j] = rtmp1[pj[j]];
1341:           rs += PetscAbsScalar(pv[j]);
1342:         }

1344:         /* U part */
1345:         pv = b->a + bdiag[i + 1] + 1;
1346:         pj = b->j + bdiag[i + 1] + 1;
1347:         nz = bdiag[i] - bdiag[i + 1] - 1;
1348:         for (j = 0; j < nz; j++) {
1349:           pv[j] = rtmp1[pj[j]];
1350:           rs += PetscAbsScalar(pv[j]);
1351:         }

1353:         /* Check zero pivot */
1354:         sctx.rs = rs;
1355:         sctx.pv = rtmp1[i];
1356:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1357:         if (sctx.newshift) break;

1359:         /* Mark diagonal and invert diagonal for simpler triangular solves */
1360:         pv  = b->a + bdiag[i];
1361:         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1362:         break;

1364:       case 2:
1365:         /* zero rtmp1 and rtmp2 */
1366:         /* L part */
1367:         nz    = bi[i + 1] - bi[i];
1368:         bjtmp = bj + bi[i];
1369:         for (j = 0; j < nz; j++) {
1370:           col        = bjtmp[j];
1371:           rtmp1[col] = 0.0;
1372:           rtmp2[col] = 0.0;
1373:         }

1375:         /* U part */
1376:         nz    = bdiag[i] - bdiag[i + 1];
1377:         bjtmp = bj + bdiag[i + 1] + 1;
1378:         for (j = 0; j < nz; j++) {
1379:           col        = bjtmp[j];
1380:           rtmp1[col] = 0.0;
1381:           rtmp2[col] = 0.0;
1382:         }

1384:         /* load in initial (unfactored row) */
1385:         nz    = ai[r[i] + 1] - ai[r[i]];
1386:         ajtmp = aj + ai[r[i]];
1387:         v1    = aa + ai[r[i]];
1388:         v2    = aa + ai[r[i] + 1];
1389:         for (j = 0; j < nz; j++) {
1390:           col        = ics[ajtmp[j]];
1391:           rtmp1[col] = v1[j];
1392:           rtmp2[col] = v2[j];
1393:         }
1394:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1395:         rtmp1[i] += sctx.shift_amount;
1396:         rtmp2[i + 1] += sctx.shift_amount;

1398:         /* elimination */
1399:         bjtmp = bj + bi[i];
1400:         row   = *bjtmp++; /* pivot row */
1401:         nzL   = bi[i + 1] - bi[i];
1402:         for (k = 0; k < nzL; k++) {
1403:           pc1 = rtmp1 + row;
1404:           pc2 = rtmp2 + row;
1405:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1406:             pv   = b->a + bdiag[row];
1407:             mul1 = *pc1 * (*pv);
1408:             mul2 = *pc2 * (*pv);
1409:             *pc1 = mul1;
1410:             *pc2 = mul2;

1412:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1413:             pv = b->a + bdiag[row + 1] + 1;
1414:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1415:             for (j = 0; j < nz; j++) {
1416:               col = pj[j];
1417:               rtmp1[col] -= mul1 * pv[j];
1418:               rtmp2[col] -= mul2 * pv[j];
1419:             }
1420:             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1421:           }
1422:           row = *bjtmp++;
1423:         }

1425:         /* finished row i; check zero pivot, then stick row i into b->a */
1426:         rs = 0.0;
1427:         /* L part */
1428:         pc1 = b->a + bi[i];
1429:         pj  = b->j + bi[i];
1430:         nz  = bi[i + 1] - bi[i];
1431:         for (j = 0; j < nz; j++) {
1432:           col    = pj[j];
1433:           pc1[j] = rtmp1[col];
1434:           rs += PetscAbsScalar(pc1[j]);
1435:         }
1436:         /* U part */
1437:         pc1 = b->a + bdiag[i + 1] + 1;
1438:         pj  = b->j + bdiag[i + 1] + 1;
1439:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1440:         for (j = 0; j < nz; j++) {
1441:           col    = pj[j];
1442:           pc1[j] = rtmp1[col];
1443:           rs += PetscAbsScalar(pc1[j]);
1444:         }

1446:         sctx.rs = rs;
1447:         sctx.pv = rtmp1[i];
1448:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1449:         if (sctx.newshift) break;
1450:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1451:         *pc1 = 1.0 / sctx.pv;

1453:         /* Now take care of diagonal 2x2 block. */
1454:         pc2 = rtmp2 + i;
1455:         if (*pc2 != 0.0) {
1456:           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1457:           *pc2 = mul1;                        /* insert L entry */
1458:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1459:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460:           for (j = 0; j < nz; j++) {
1461:             col = pj[j];
1462:             rtmp2[col] -= mul1 * rtmp1[col];
1463:           }
1464:           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465:         }

1467:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468:         rs = 0.0;
1469:         /* L part */
1470:         pc2 = b->a + bi[i + 1];
1471:         pj  = b->j + bi[i + 1];
1472:         nz  = bi[i + 2] - bi[i + 1];
1473:         for (j = 0; j < nz; j++) {
1474:           col    = pj[j];
1475:           pc2[j] = rtmp2[col];
1476:           rs += PetscAbsScalar(pc2[j]);
1477:         }
1478:         /* U part */
1479:         pc2 = b->a + bdiag[i + 2] + 1;
1480:         pj  = b->j + bdiag[i + 2] + 1;
1481:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482:         for (j = 0; j < nz; j++) {
1483:           col    = pj[j];
1484:           pc2[j] = rtmp2[col];
1485:           rs += PetscAbsScalar(pc2[j]);
1486:         }

1488:         sctx.rs = rs;
1489:         sctx.pv = rtmp2[i + 1];
1490:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1491:         if (sctx.newshift) break;
1492:         pc2  = b->a + bdiag[i + 1];
1493:         *pc2 = 1.0 / sctx.pv;
1494:         break;

1496:       case 3:
1497:         /* zero rtmp */
1498:         /* L part */
1499:         nz    = bi[i + 1] - bi[i];
1500:         bjtmp = bj + bi[i];
1501:         for (j = 0; j < nz; j++) {
1502:           col        = bjtmp[j];
1503:           rtmp1[col] = 0.0;
1504:           rtmp2[col] = 0.0;
1505:           rtmp3[col] = 0.0;
1506:         }

1508:         /* U part */
1509:         nz    = bdiag[i] - bdiag[i + 1];
1510:         bjtmp = bj + bdiag[i + 1] + 1;
1511:         for (j = 0; j < nz; j++) {
1512:           col        = bjtmp[j];
1513:           rtmp1[col] = 0.0;
1514:           rtmp2[col] = 0.0;
1515:           rtmp3[col] = 0.0;
1516:         }

1518:         /* load in initial (unfactored row) */
1519:         nz    = ai[r[i] + 1] - ai[r[i]];
1520:         ajtmp = aj + ai[r[i]];
1521:         v1    = aa + ai[r[i]];
1522:         v2    = aa + ai[r[i] + 1];
1523:         v3    = aa + ai[r[i] + 2];
1524:         for (j = 0; j < nz; j++) {
1525:           col        = ics[ajtmp[j]];
1526:           rtmp1[col] = v1[j];
1527:           rtmp2[col] = v2[j];
1528:           rtmp3[col] = v3[j];
1529:         }
1530:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1531:         rtmp1[i] += sctx.shift_amount;
1532:         rtmp2[i + 1] += sctx.shift_amount;
1533:         rtmp3[i + 2] += sctx.shift_amount;

1535:         /* elimination */
1536:         bjtmp = bj + bi[i];
1537:         row   = *bjtmp++; /* pivot row */
1538:         nzL   = bi[i + 1] - bi[i];
1539:         for (k = 0; k < nzL; k++) {
1540:           pc1 = rtmp1 + row;
1541:           pc2 = rtmp2 + row;
1542:           pc3 = rtmp3 + row;
1543:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1544:             pv   = b->a + bdiag[row];
1545:             mul1 = *pc1 * (*pv);
1546:             mul2 = *pc2 * (*pv);
1547:             mul3 = *pc3 * (*pv);
1548:             *pc1 = mul1;
1549:             *pc2 = mul2;
1550:             *pc3 = mul3;

1552:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1553:             pv = b->a + bdiag[row + 1] + 1;
1554:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1555:             for (j = 0; j < nz; j++) {
1556:               col = pj[j];
1557:               rtmp1[col] -= mul1 * pv[j];
1558:               rtmp2[col] -= mul2 * pv[j];
1559:               rtmp3[col] -= mul3 * pv[j];
1560:             }
1561:             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1562:           }
1563:           row = *bjtmp++;
1564:         }

1566:         /* finished row i; check zero pivot, then stick row i into b->a */
1567:         rs = 0.0;
1568:         /* L part */
1569:         pc1 = b->a + bi[i];
1570:         pj  = b->j + bi[i];
1571:         nz  = bi[i + 1] - bi[i];
1572:         for (j = 0; j < nz; j++) {
1573:           col    = pj[j];
1574:           pc1[j] = rtmp1[col];
1575:           rs += PetscAbsScalar(pc1[j]);
1576:         }
1577:         /* U part */
1578:         pc1 = b->a + bdiag[i + 1] + 1;
1579:         pj  = b->j + bdiag[i + 1] + 1;
1580:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581:         for (j = 0; j < nz; j++) {
1582:           col    = pj[j];
1583:           pc1[j] = rtmp1[col];
1584:           rs += PetscAbsScalar(pc1[j]);
1585:         }

1587:         sctx.rs = rs;
1588:         sctx.pv = rtmp1[i];
1589:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1590:         if (sctx.newshift) break;
1591:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1592:         *pc1 = 1.0 / sctx.pv;

1594:         /* Now take care of 1st column of diagonal 3x3 block. */
1595:         pc2 = rtmp2 + i;
1596:         pc3 = rtmp3 + i;
1597:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1598:           mul2 = (*pc2) * (*pc1);
1599:           *pc2 = mul2;
1600:           mul3 = (*pc3) * (*pc1);
1601:           *pc3 = mul3;
1602:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1603:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1604:           for (j = 0; j < nz; j++) {
1605:             col = pj[j];
1606:             rtmp2[col] -= mul2 * rtmp1[col];
1607:             rtmp3[col] -= mul3 * rtmp1[col];
1608:           }
1609:           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1610:         }

1612:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613:         rs = 0.0;
1614:         /* L part */
1615:         pc2 = b->a + bi[i + 1];
1616:         pj  = b->j + bi[i + 1];
1617:         nz  = bi[i + 2] - bi[i + 1];
1618:         for (j = 0; j < nz; j++) {
1619:           col    = pj[j];
1620:           pc2[j] = rtmp2[col];
1621:           rs += PetscAbsScalar(pc2[j]);
1622:         }
1623:         /* U part */
1624:         pc2 = b->a + bdiag[i + 2] + 1;
1625:         pj  = b->j + bdiag[i + 2] + 1;
1626:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627:         for (j = 0; j < nz; j++) {
1628:           col    = pj[j];
1629:           pc2[j] = rtmp2[col];
1630:           rs += PetscAbsScalar(pc2[j]);
1631:         }

1633:         sctx.rs = rs;
1634:         sctx.pv = rtmp2[i + 1];
1635:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1636:         if (sctx.newshift) break;
1637:         pc2  = b->a + bdiag[i + 1];
1638:         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */

1640:         /* Now take care of 2nd column of diagonal 3x3 block. */
1641:         pc3 = rtmp3 + i + 1;
1642:         if (*pc3 != 0.0) {
1643:           mul3 = (*pc3) * (*pc2);
1644:           *pc3 = mul3;
1645:           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1646:           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1647:           for (j = 0; j < nz; j++) {
1648:             col = pj[j];
1649:             rtmp3[col] -= mul3 * rtmp2[col];
1650:           }
1651:           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1652:         }

1654:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1655:         rs = 0.0;
1656:         /* L part */
1657:         pc3 = b->a + bi[i + 2];
1658:         pj  = b->j + bi[i + 2];
1659:         nz  = bi[i + 3] - bi[i + 2];
1660:         for (j = 0; j < nz; j++) {
1661:           col    = pj[j];
1662:           pc3[j] = rtmp3[col];
1663:           rs += PetscAbsScalar(pc3[j]);
1664:         }
1665:         /* U part */
1666:         pc3 = b->a + bdiag[i + 3] + 1;
1667:         pj  = b->j + bdiag[i + 3] + 1;
1668:         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1669:         for (j = 0; j < nz; j++) {
1670:           col    = pj[j];
1671:           pc3[j] = rtmp3[col];
1672:           rs += PetscAbsScalar(pc3[j]);
1673:         }

1675:         sctx.rs = rs;
1676:         sctx.pv = rtmp3[i + 2];
1677:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1678:         if (sctx.newshift) break;
1679:         pc3  = b->a + bdiag[i + 2];
1680:         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1681:         break;
1682:       case 4:
1683:         /* zero rtmp */
1684:         /* L part */
1685:         nz    = bi[i + 1] - bi[i];
1686:         bjtmp = bj + bi[i];
1687:         for (j = 0; j < nz; j++) {
1688:           col        = bjtmp[j];
1689:           rtmp1[col] = 0.0;
1690:           rtmp2[col] = 0.0;
1691:           rtmp3[col] = 0.0;
1692:           rtmp4[col] = 0.0;
1693:         }

1695:         /* U part */
1696:         nz    = bdiag[i] - bdiag[i + 1];
1697:         bjtmp = bj + bdiag[i + 1] + 1;
1698:         for (j = 0; j < nz; j++) {
1699:           col        = bjtmp[j];
1700:           rtmp1[col] = 0.0;
1701:           rtmp2[col] = 0.0;
1702:           rtmp3[col] = 0.0;
1703:           rtmp4[col] = 0.0;
1704:         }

1706:         /* load in initial (unfactored row) */
1707:         nz    = ai[r[i] + 1] - ai[r[i]];
1708:         ajtmp = aj + ai[r[i]];
1709:         v1    = aa + ai[r[i]];
1710:         v2    = aa + ai[r[i] + 1];
1711:         v3    = aa + ai[r[i] + 2];
1712:         v4    = aa + ai[r[i] + 3];
1713:         for (j = 0; j < nz; j++) {
1714:           col        = ics[ajtmp[j]];
1715:           rtmp1[col] = v1[j];
1716:           rtmp2[col] = v2[j];
1717:           rtmp3[col] = v3[j];
1718:           rtmp4[col] = v4[j];
1719:         }
1720:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1721:         rtmp1[i] += sctx.shift_amount;
1722:         rtmp2[i + 1] += sctx.shift_amount;
1723:         rtmp3[i + 2] += sctx.shift_amount;
1724:         rtmp4[i + 3] += sctx.shift_amount;

1726:         /* elimination */
1727:         bjtmp = bj + bi[i];
1728:         row   = *bjtmp++; /* pivot row */
1729:         nzL   = bi[i + 1] - bi[i];
1730:         for (k = 0; k < nzL; k++) {
1731:           pc1 = rtmp1 + row;
1732:           pc2 = rtmp2 + row;
1733:           pc3 = rtmp3 + row;
1734:           pc4 = rtmp4 + row;
1735:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1736:             pv   = b->a + bdiag[row];
1737:             mul1 = *pc1 * (*pv);
1738:             mul2 = *pc2 * (*pv);
1739:             mul3 = *pc3 * (*pv);
1740:             mul4 = *pc4 * (*pv);
1741:             *pc1 = mul1;
1742:             *pc2 = mul2;
1743:             *pc3 = mul3;
1744:             *pc4 = mul4;

1746:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1747:             pv = b->a + bdiag[row + 1] + 1;
1748:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1749:             for (j = 0; j < nz; j++) {
1750:               col = pj[j];
1751:               rtmp1[col] -= mul1 * pv[j];
1752:               rtmp2[col] -= mul2 * pv[j];
1753:               rtmp3[col] -= mul3 * pv[j];
1754:               rtmp4[col] -= mul4 * pv[j];
1755:             }
1756:             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1757:           }
1758:           row = *bjtmp++;
1759:         }

1761:         /* finished row i; check zero pivot, then stick row i into b->a */
1762:         rs = 0.0;
1763:         /* L part */
1764:         pc1 = b->a + bi[i];
1765:         pj  = b->j + bi[i];
1766:         nz  = bi[i + 1] - bi[i];
1767:         for (j = 0; j < nz; j++) {
1768:           col    = pj[j];
1769:           pc1[j] = rtmp1[col];
1770:           rs += PetscAbsScalar(pc1[j]);
1771:         }
1772:         /* U part */
1773:         pc1 = b->a + bdiag[i + 1] + 1;
1774:         pj  = b->j + bdiag[i + 1] + 1;
1775:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1776:         for (j = 0; j < nz; j++) {
1777:           col    = pj[j];
1778:           pc1[j] = rtmp1[col];
1779:           rs += PetscAbsScalar(pc1[j]);
1780:         }

1782:         sctx.rs = rs;
1783:         sctx.pv = rtmp1[i];
1784:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1785:         if (sctx.newshift) break;
1786:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1787:         *pc1 = 1.0 / sctx.pv;

1789:         /* Now take care of 1st column of diagonal 4x4 block. */
1790:         pc2 = rtmp2 + i;
1791:         pc3 = rtmp3 + i;
1792:         pc4 = rtmp4 + i;
1793:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1794:           mul2 = (*pc2) * (*pc1);
1795:           *pc2 = mul2;
1796:           mul3 = (*pc3) * (*pc1);
1797:           *pc3 = mul3;
1798:           mul4 = (*pc4) * (*pc1);
1799:           *pc4 = mul4;
1800:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1801:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1802:           for (j = 0; j < nz; j++) {
1803:             col = pj[j];
1804:             rtmp2[col] -= mul2 * rtmp1[col];
1805:             rtmp3[col] -= mul3 * rtmp1[col];
1806:             rtmp4[col] -= mul4 * rtmp1[col];
1807:           }
1808:           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1809:         }

1811:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1812:         rs = 0.0;
1813:         /* L part */
1814:         pc2 = b->a + bi[i + 1];
1815:         pj  = b->j + bi[i + 1];
1816:         nz  = bi[i + 2] - bi[i + 1];
1817:         for (j = 0; j < nz; j++) {
1818:           col    = pj[j];
1819:           pc2[j] = rtmp2[col];
1820:           rs += PetscAbsScalar(pc2[j]);
1821:         }
1822:         /* U part */
1823:         pc2 = b->a + bdiag[i + 2] + 1;
1824:         pj  = b->j + bdiag[i + 2] + 1;
1825:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1826:         for (j = 0; j < nz; j++) {
1827:           col    = pj[j];
1828:           pc2[j] = rtmp2[col];
1829:           rs += PetscAbsScalar(pc2[j]);
1830:         }

1832:         sctx.rs = rs;
1833:         sctx.pv = rtmp2[i + 1];
1834:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1835:         if (sctx.newshift) break;
1836:         pc2  = b->a + bdiag[i + 1];
1837:         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */

1839:         /* Now take care of 2nd column of diagonal 4x4 block. */
1840:         pc3 = rtmp3 + i + 1;
1841:         pc4 = rtmp4 + i + 1;
1842:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1843:           mul3 = (*pc3) * (*pc2);
1844:           *pc3 = mul3;
1845:           mul4 = (*pc4) * (*pc2);
1846:           *pc4 = mul4;
1847:           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1848:           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1849:           for (j = 0; j < nz; j++) {
1850:             col = pj[j];
1851:             rtmp3[col] -= mul3 * rtmp2[col];
1852:             rtmp4[col] -= mul4 * rtmp2[col];
1853:           }
1854:           PetscCall(PetscLogFlops(4.0 * nz));
1855:         }

1857:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1858:         rs = 0.0;
1859:         /* L part */
1860:         pc3 = b->a + bi[i + 2];
1861:         pj  = b->j + bi[i + 2];
1862:         nz  = bi[i + 3] - bi[i + 2];
1863:         for (j = 0; j < nz; j++) {
1864:           col    = pj[j];
1865:           pc3[j] = rtmp3[col];
1866:           rs += PetscAbsScalar(pc3[j]);
1867:         }
1868:         /* U part */
1869:         pc3 = b->a + bdiag[i + 3] + 1;
1870:         pj  = b->j + bdiag[i + 3] + 1;
1871:         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1872:         for (j = 0; j < nz; j++) {
1873:           col    = pj[j];
1874:           pc3[j] = rtmp3[col];
1875:           rs += PetscAbsScalar(pc3[j]);
1876:         }

1878:         sctx.rs = rs;
1879:         sctx.pv = rtmp3[i + 2];
1880:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1881:         if (sctx.newshift) break;
1882:         pc3  = b->a + bdiag[i + 2];
1883:         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */

1885:         /* Now take care of 3rd column of diagonal 4x4 block. */
1886:         pc4 = rtmp4 + i + 2;
1887:         if (*pc4 != 0.0) {
1888:           mul4 = (*pc4) * (*pc3);
1889:           *pc4 = mul4;
1890:           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1891:           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1892:           for (j = 0; j < nz; j++) {
1893:             col = pj[j];
1894:             rtmp4[col] -= mul4 * rtmp3[col];
1895:           }
1896:           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1897:         }

1899:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1900:         rs = 0.0;
1901:         /* L part */
1902:         pc4 = b->a + bi[i + 3];
1903:         pj  = b->j + bi[i + 3];
1904:         nz  = bi[i + 4] - bi[i + 3];
1905:         for (j = 0; j < nz; j++) {
1906:           col    = pj[j];
1907:           pc4[j] = rtmp4[col];
1908:           rs += PetscAbsScalar(pc4[j]);
1909:         }
1910:         /* U part */
1911:         pc4 = b->a + bdiag[i + 4] + 1;
1912:         pj  = b->j + bdiag[i + 4] + 1;
1913:         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1914:         for (j = 0; j < nz; j++) {
1915:           col    = pj[j];
1916:           pc4[j] = rtmp4[col];
1917:           rs += PetscAbsScalar(pc4[j]);
1918:         }

1920:         sctx.rs = rs;
1921:         sctx.pv = rtmp4[i + 3];
1922:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1923:         if (sctx.newshift) break;
1924:         pc4  = b->a + bdiag[i + 3];
1925:         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1926:         break;

1928:       default:
1929:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1930:       }
1931:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1932:       i += nodesz;              /* Update the row */
1933:     }

1935:     /* MatPivotRefine() */
1936:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1937:       /*
1938:        * if no shift in this attempt & shifting & started shifting & can refine,
1939:        * then try lower shift
1940:        */
1941:       sctx.shift_hi       = sctx.shift_fraction;
1942:       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1943:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1944:       sctx.newshift       = PETSC_TRUE;
1945:       sctx.nshift++;
1946:     }
1947:   } while (sctx.newshift);

1949:   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1950:   PetscCall(PetscFree(tmp_vec2));
1951:   PetscCall(ISRestoreIndices(isicol, &ic));
1952:   PetscCall(ISRestoreIndices(isrow, &r));

1954:   if (b->inode.size_csr) {
1955:     C->ops->solve = MatSolve_SeqAIJ_Inode;
1956:   } else {
1957:     C->ops->solve = MatSolve_SeqAIJ;
1958:   }
1959:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1960:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1961:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1962:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1963:   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1964:   C->assembled              = PETSC_TRUE;
1965:   C->preallocated           = PETSC_TRUE;

1967:   PetscCall(PetscLogFlops(C->cmap->n));

1969:   /* MatShiftView(A,info,&sctx) */
1970:   if (sctx.nshift) {
1971:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1972:       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1974:       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1976:       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1977:     }
1978:   }
1979:   PetscFunctionReturn(PETSC_SUCCESS);
1980: }

1982: #if 0
1983: // unused
1984: static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1985: {
1986:   Mat              C = B;
1987:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1988:   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1989:   const PetscInt  *r, *ic, *c, *ics;
1990:   PetscInt         n = A->rmap->n, *bi = b->i;
1991:   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1992:   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1993:   PetscInt        *ai = a->i, *aj = a->j;
1994:   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1995:   PetscScalar      mul1, mul2, mul3, tmp;
1996:   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1997:   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1998:   PetscReal        rs = 0.0;
1999:   FactorShiftCtx   sctx;

2001:   PetscFunctionBegin;
2002:   sctx.shift_top      = 0;
2003:   sctx.nshift_max     = 0;
2004:   sctx.shift_lo       = 0;
2005:   sctx.shift_hi       = 0;
2006:   sctx.shift_fraction = 0;

2008:   /* if both shift schemes are chosen by user, only use info->shiftpd */
2009:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2010:     sctx.shift_top = 0;
2011:     for (i = 0; i < n; i++) {
2012:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2013:       rs    = 0.0;
2014:       ajtmp = aj + ai[i];
2015:       rtmp1 = aa + ai[i];
2016:       nz    = ai[i + 1] - ai[i];
2017:       for (j = 0; j < nz; j++) {
2018:         if (*ajtmp != i) {
2019:           rs += PetscAbsScalar(*rtmp1++);
2020:         } else {
2021:           rs -= PetscRealPart(*rtmp1++);
2022:         }
2023:         ajtmp++;
2024:       }
2025:       if (rs > sctx.shift_top) sctx.shift_top = rs;
2026:     }
2027:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2028:     sctx.shift_top *= 1.1;
2029:     sctx.nshift_max = 5;
2030:     sctx.shift_lo   = 0.;
2031:     sctx.shift_hi   = 1.;
2032:   }
2033:   sctx.shift_amount = 0;
2034:   sctx.nshift       = 0;

2036:   PetscCall(ISGetIndices(isrow, &r));
2037:   PetscCall(ISGetIndices(iscol, &c));
2038:   PetscCall(ISGetIndices(isicol, &ic));
2039:   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2040:   ics = ic;

2042:   node_max = a->inode.node_count;
2043:   ns       = a->inode.size;
2044:   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");

2046:   /* If max inode size > 3, split it into two inodes.*/
2047:   /* also map the inode sizes according to the ordering */
2048:   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2049:   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2050:     if (ns[i] > 3) {
2051:       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2052:       ++j;
2053:       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2054:     } else {
2055:       tmp_vec1[j] = ns[i];
2056:     }
2057:   }
2058:   /* Use the correct node_max */
2059:   node_max = j;

2061:   /* Now reorder the inode info based on mat re-ordering info */
2062:   /* First create a row -> inode_size_array_index map */
2063:   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2064:   for (i = 0, row = 0; i < node_max; i++) {
2065:     nodesz = tmp_vec1[i];
2066:     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2067:   }
2068:   /* Using nsmap, create a reordered ns structure */
2069:   for (i = 0, j = 0; i < node_max; i++) {
2070:     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2071:     tmp_vec2[i] = nodesz;
2072:     j += nodesz;
2073:   }
2074:   PetscCall(PetscFree2(nsmap, tmp_vec1));
2075:   /* Now use the correct ns */
2076:   ns = tmp_vec2;

2078:   do {
2079:     sctx.newshift = PETSC_FALSE;
2080:     /* Now loop over each block-row, and do the factorization */
2081:     for (i = 0, row = 0; i < node_max; i++) {
2082:       nodesz = ns[i];
2083:       nz     = bi[row + 1] - bi[row];
2084:       bjtmp  = bj + bi[row];

2086:       switch (nodesz) {
2087:       case 1:
2088:         for (j = 0; j < nz; j++) {
2089:           idx         = bjtmp[j];
2090:           rtmp11[idx] = 0.0;
2091:         }

2093:         /* load in initial (unfactored row) */
2094:         idx    = r[row];
2095:         nz_tmp = ai[idx + 1] - ai[idx];
2096:         ajtmp  = aj + ai[idx];
2097:         v1     = aa + ai[idx];

2099:         for (j = 0; j < nz_tmp; j++) {
2100:           idx         = ics[ajtmp[j]];
2101:           rtmp11[idx] = v1[j];
2102:         }
2103:         rtmp11[ics[r[row]]] += sctx.shift_amount;

2105:         prow = *bjtmp++;
2106:         while (prow < row) {
2107:           pc1 = rtmp11 + prow;
2108:           if (*pc1 != 0.0) {
2109:             pv     = ba + bd[prow];
2110:             pj     = nbj + bd[prow];
2111:             mul1   = *pc1 * *pv++;
2112:             *pc1   = mul1;
2113:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2114:             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2115:             for (j = 0; j < nz_tmp; j++) {
2116:               tmp = pv[j];
2117:               idx = pj[j];
2118:               rtmp11[idx] -= mul1 * tmp;
2119:             }
2120:           }
2121:           prow = *bjtmp++;
2122:         }
2123:         pj  = bj + bi[row];
2124:         pc1 = ba + bi[row];

2126:         sctx.pv     = rtmp11[row];
2127:         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2128:         rs          = 0.0;
2129:         for (j = 0; j < nz; j++) {
2130:           idx    = pj[j];
2131:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2132:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2133:         }
2134:         sctx.rs = rs;
2135:         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2136:         if (sctx.newshift) goto endofwhile;
2137:         break;

2139:       case 2:
2140:         for (j = 0; j < nz; j++) {
2141:           idx         = bjtmp[j];
2142:           rtmp11[idx] = 0.0;
2143:           rtmp22[idx] = 0.0;
2144:         }

2146:         /* load in initial (unfactored row) */
2147:         idx    = r[row];
2148:         nz_tmp = ai[idx + 1] - ai[idx];
2149:         ajtmp  = aj + ai[idx];
2150:         v1     = aa + ai[idx];
2151:         v2     = aa + ai[idx + 1];
2152:         for (j = 0; j < nz_tmp; j++) {
2153:           idx         = ics[ajtmp[j]];
2154:           rtmp11[idx] = v1[j];
2155:           rtmp22[idx] = v2[j];
2156:         }
2157:         rtmp11[ics[r[row]]] += sctx.shift_amount;
2158:         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;

2160:         prow = *bjtmp++;
2161:         while (prow < row) {
2162:           pc1 = rtmp11 + prow;
2163:           pc2 = rtmp22 + prow;
2164:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2165:             pv   = ba + bd[prow];
2166:             pj   = nbj + bd[prow];
2167:             mul1 = *pc1 * *pv;
2168:             mul2 = *pc2 * *pv;
2169:             ++pv;
2170:             *pc1 = mul1;
2171:             *pc2 = mul2;

2173:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2174:             for (j = 0; j < nz_tmp; j++) {
2175:               tmp = pv[j];
2176:               idx = pj[j];
2177:               rtmp11[idx] -= mul1 * tmp;
2178:               rtmp22[idx] -= mul2 * tmp;
2179:             }
2180:             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2181:           }
2182:           prow = *bjtmp++;
2183:         }

2185:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2186:         pc1 = rtmp11 + prow;
2187:         pc2 = rtmp22 + prow;

2189:         sctx.pv = *pc1;
2190:         pj      = bj + bi[prow];
2191:         rs      = 0.0;
2192:         for (j = 0; j < nz; j++) {
2193:           idx = pj[j];
2194:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2195:         }
2196:         sctx.rs = rs;
2197:         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2198:         if (sctx.newshift) goto endofwhile;

2200:         if (*pc2 != 0.0) {
2201:           pj     = nbj + bd[prow];
2202:           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2203:           *pc2   = mul2;
2204:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2205:           for (j = 0; j < nz_tmp; j++) {
2206:             idx = pj[j];
2207:             tmp = rtmp11[idx];
2208:             rtmp22[idx] -= mul2 * tmp;
2209:           }
2210:           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2211:         }

2213:         pj  = bj + bi[row];
2214:         pc1 = ba + bi[row];
2215:         pc2 = ba + bi[row + 1];

2217:         sctx.pv         = rtmp22[row + 1];
2218:         rs              = 0.0;
2219:         rtmp11[row]     = 1.0 / rtmp11[row];
2220:         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2221:         /* copy row entries from dense representation to sparse */
2222:         for (j = 0; j < nz; j++) {
2223:           idx    = pj[j];
2224:           pc1[j] = rtmp11[idx];
2225:           pc2[j] = rtmp22[idx];
2226:           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2227:         }
2228:         sctx.rs = rs;
2229:         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2230:         if (sctx.newshift) goto endofwhile;
2231:         break;

2233:       case 3:
2234:         for (j = 0; j < nz; j++) {
2235:           idx         = bjtmp[j];
2236:           rtmp11[idx] = 0.0;
2237:           rtmp22[idx] = 0.0;
2238:           rtmp33[idx] = 0.0;
2239:         }
2240:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2241:         idx    = r[row];
2242:         nz_tmp = ai[idx + 1] - ai[idx];
2243:         ajtmp  = aj + ai[idx];
2244:         v1     = aa + ai[idx];
2245:         v2     = aa + ai[idx + 1];
2246:         v3     = aa + ai[idx + 2];
2247:         for (j = 0; j < nz_tmp; j++) {
2248:           idx         = ics[ajtmp[j]];
2249:           rtmp11[idx] = v1[j];
2250:           rtmp22[idx] = v2[j];
2251:           rtmp33[idx] = v3[j];
2252:         }
2253:         rtmp11[ics[r[row]]] += sctx.shift_amount;
2254:         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2255:         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;

2257:         /* loop over all pivot row blocks above this row block */
2258:         prow = *bjtmp++;
2259:         while (prow < row) {
2260:           pc1 = rtmp11 + prow;
2261:           pc2 = rtmp22 + prow;
2262:           pc3 = rtmp33 + prow;
2263:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2264:             pv   = ba + bd[prow];
2265:             pj   = nbj + bd[prow];
2266:             mul1 = *pc1 * *pv;
2267:             mul2 = *pc2 * *pv;
2268:             mul3 = *pc3 * *pv;
2269:             ++pv;
2270:             *pc1 = mul1;
2271:             *pc2 = mul2;
2272:             *pc3 = mul3;

2274:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2275:             /* update this row based on pivot row */
2276:             for (j = 0; j < nz_tmp; j++) {
2277:               tmp = pv[j];
2278:               idx = pj[j];
2279:               rtmp11[idx] -= mul1 * tmp;
2280:               rtmp22[idx] -= mul2 * tmp;
2281:               rtmp33[idx] -= mul3 * tmp;
2282:             }
2283:             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2284:           }
2285:           prow = *bjtmp++;
2286:         }

2288:         /* Now take care of diagonal 3x3 block in this set of rows */
2289:         /* note: prow = row here */
2290:         pc1 = rtmp11 + prow;
2291:         pc2 = rtmp22 + prow;
2292:         pc3 = rtmp33 + prow;

2294:         sctx.pv = *pc1;
2295:         pj      = bj + bi[prow];
2296:         rs      = 0.0;
2297:         for (j = 0; j < nz; j++) {
2298:           idx = pj[j];
2299:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2300:         }
2301:         sctx.rs = rs;
2302:         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2303:         if (sctx.newshift) goto endofwhile;

2305:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2306:           mul2   = (*pc2) / (*pc1);
2307:           mul3   = (*pc3) / (*pc1);
2308:           *pc2   = mul2;
2309:           *pc3   = mul3;
2310:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2311:           pj     = nbj + bd[prow];
2312:           for (j = 0; j < nz_tmp; j++) {
2313:             idx = pj[j];
2314:             tmp = rtmp11[idx];
2315:             rtmp22[idx] -= mul2 * tmp;
2316:             rtmp33[idx] -= mul3 * tmp;
2317:           }
2318:           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2319:         }
2320:         ++prow;

2322:         pc2     = rtmp22 + prow;
2323:         pc3     = rtmp33 + prow;
2324:         sctx.pv = *pc2;
2325:         pj      = bj + bi[prow];
2326:         rs      = 0.0;
2327:         for (j = 0; j < nz; j++) {
2328:           idx = pj[j];
2329:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2330:         }
2331:         sctx.rs = rs;
2332:         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2333:         if (sctx.newshift) goto endofwhile;

2335:         if (*pc3 != 0.0) {
2336:           mul3   = (*pc3) / (*pc2);
2337:           *pc3   = mul3;
2338:           pj     = nbj + bd[prow];
2339:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2340:           for (j = 0; j < nz_tmp; j++) {
2341:             idx = pj[j];
2342:             tmp = rtmp22[idx];
2343:             rtmp33[idx] -= mul3 * tmp;
2344:           }
2345:           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2346:         }

2348:         pj  = bj + bi[row];
2349:         pc1 = ba + bi[row];
2350:         pc2 = ba + bi[row + 1];
2351:         pc3 = ba + bi[row + 2];

2353:         sctx.pv         = rtmp33[row + 2];
2354:         rs              = 0.0;
2355:         rtmp11[row]     = 1.0 / rtmp11[row];
2356:         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2357:         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2358:         /* copy row entries from dense representation to sparse */
2359:         for (j = 0; j < nz; j++) {
2360:           idx    = pj[j];
2361:           pc1[j] = rtmp11[idx];
2362:           pc2[j] = rtmp22[idx];
2363:           pc3[j] = rtmp33[idx];
2364:           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2365:         }

2367:         sctx.rs = rs;
2368:         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2369:         if (sctx.newshift) goto endofwhile;
2370:         break;

2372:       default:
2373:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2374:       }
2375:       row += nodesz; /* Update the row */
2376:     }
2377:   endofwhile:;
2378:   } while (sctx.newshift);
2379:   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2380:   PetscCall(PetscFree(tmp_vec2));
2381:   PetscCall(ISRestoreIndices(isicol, &ic));
2382:   PetscCall(ISRestoreIndices(isrow, &r));
2383:   PetscCall(ISRestoreIndices(iscol, &c));

2385:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2386:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2387:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2388:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2389:   C->assembled              = PETSC_TRUE;
2390:   C->preallocated           = PETSC_TRUE;
2391:   if (sctx.nshift) {
2392:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2393:       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2394:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2395:       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2396:     }
2397:   }
2398:   PetscCall(PetscLogFlops(C->cmap->n));
2399:   PetscCall(MatSeqAIJCheckInode(C));
2400:   PetscFunctionReturn(PETSC_SUCCESS);
2401: }
2402: #endif

2404: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2405: {
2406:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2407:   IS                 iscol = a->col, isrow = a->row;
2408:   const PetscInt    *r, *c, *rout, *cout;
2409:   PetscInt           i, j;
2410:   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2411:   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2412:   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2413:   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2414:   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2415:   const PetscScalar *b;

2417:   PetscFunctionBegin;
2418:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2419:   node_max = a->inode.node_count;
2420:   ns       = a->inode.size_csr; /* Node Size array */

2422:   PetscCall(VecGetArrayRead(bb, &b));
2423:   PetscCall(VecGetArrayWrite(xx, &x));
2424:   tmp = a->solve_work;

2426:   PetscCall(ISGetIndices(isrow, &rout));
2427:   r = rout;
2428:   PetscCall(ISGetIndices(iscol, &cout));
2429:   c = cout;

2431:   /* forward solve the lower triangular */
2432:   tmps = tmp;
2433:   aa   = a_a;
2434:   aj   = a_j;
2435:   ad   = a->diag;

2437:   for (i = 0; i < node_max; ++i) {
2438:     row = ns[i];
2439:     nsz = ns[i + 1] - ns[i];
2440:     aii = ai[row];
2441:     v1  = aa + aii;
2442:     vi  = aj + aii;
2443:     nz  = ai[row + 1] - ai[row];

2445:     if (i < node_max - 1) {
2446:       /* Prefetch the indices for the next block */
2447:       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2448:       /* Prefetch the data for the next block */
2449:       PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2450:     }

2452:     switch (nsz) { /* Each loop in 'case' is unrolled */
2453:     case 1:
2454:       sum1 = b[r[row]];
2455:       for (j = 0; j < nz - 1; j += 2) {
2456:         i0   = vi[j];
2457:         i1   = vi[j + 1];
2458:         tmp0 = tmps[i0];
2459:         tmp1 = tmps[i1];
2460:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2461:       }
2462:       if (j == nz - 1) {
2463:         tmp0 = tmps[vi[j]];
2464:         sum1 -= v1[j] * tmp0;
2465:       }
2466:       tmp[row++] = sum1;
2467:       break;
2468:     case 2:
2469:       sum1 = b[r[row]];
2470:       sum2 = b[r[row + 1]];
2471:       v2   = aa + ai[row + 1];

2473:       for (j = 0; j < nz - 1; j += 2) {
2474:         i0   = vi[j];
2475:         i1   = vi[j + 1];
2476:         tmp0 = tmps[i0];
2477:         tmp1 = tmps[i1];
2478:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2479:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2480:       }
2481:       if (j == nz - 1) {
2482:         tmp0 = tmps[vi[j]];
2483:         sum1 -= v1[j] * tmp0;
2484:         sum2 -= v2[j] * tmp0;
2485:       }
2486:       sum2 -= v2[nz] * sum1;
2487:       tmp[row++] = sum1;
2488:       tmp[row++] = sum2;
2489:       break;
2490:     case 3:
2491:       sum1 = b[r[row]];
2492:       sum2 = b[r[row + 1]];
2493:       sum3 = b[r[row + 2]];
2494:       v2   = aa + ai[row + 1];
2495:       v3   = aa + ai[row + 2];

2497:       for (j = 0; j < nz - 1; j += 2) {
2498:         i0   = vi[j];
2499:         i1   = vi[j + 1];
2500:         tmp0 = tmps[i0];
2501:         tmp1 = tmps[i1];
2502:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2503:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2504:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2505:       }
2506:       if (j == nz - 1) {
2507:         tmp0 = tmps[vi[j]];
2508:         sum1 -= v1[j] * tmp0;
2509:         sum2 -= v2[j] * tmp0;
2510:         sum3 -= v3[j] * tmp0;
2511:       }
2512:       sum2 -= v2[nz] * sum1;
2513:       sum3 -= v3[nz] * sum1;
2514:       sum3 -= v3[nz + 1] * sum2;
2515:       tmp[row++] = sum1;
2516:       tmp[row++] = sum2;
2517:       tmp[row++] = sum3;
2518:       break;

2520:     case 4:
2521:       sum1 = b[r[row]];
2522:       sum2 = b[r[row + 1]];
2523:       sum3 = b[r[row + 2]];
2524:       sum4 = b[r[row + 3]];
2525:       v2   = aa + ai[row + 1];
2526:       v3   = aa + ai[row + 2];
2527:       v4   = aa + ai[row + 3];

2529:       for (j = 0; j < nz - 1; j += 2) {
2530:         i0   = vi[j];
2531:         i1   = vi[j + 1];
2532:         tmp0 = tmps[i0];
2533:         tmp1 = tmps[i1];
2534:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2535:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2536:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2537:         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2538:       }
2539:       if (j == nz - 1) {
2540:         tmp0 = tmps[vi[j]];
2541:         sum1 -= v1[j] * tmp0;
2542:         sum2 -= v2[j] * tmp0;
2543:         sum3 -= v3[j] * tmp0;
2544:         sum4 -= v4[j] * tmp0;
2545:       }
2546:       sum2 -= v2[nz] * sum1;
2547:       sum3 -= v3[nz] * sum1;
2548:       sum4 -= v4[nz] * sum1;
2549:       sum3 -= v3[nz + 1] * sum2;
2550:       sum4 -= v4[nz + 1] * sum2;
2551:       sum4 -= v4[nz + 2] * sum3;

2553:       tmp[row++] = sum1;
2554:       tmp[row++] = sum2;
2555:       tmp[row++] = sum3;
2556:       tmp[row++] = sum4;
2557:       break;
2558:     case 5:
2559:       sum1 = b[r[row]];
2560:       sum2 = b[r[row + 1]];
2561:       sum3 = b[r[row + 2]];
2562:       sum4 = b[r[row + 3]];
2563:       sum5 = b[r[row + 4]];
2564:       v2   = aa + ai[row + 1];
2565:       v3   = aa + ai[row + 2];
2566:       v4   = aa + ai[row + 3];
2567:       v5   = aa + ai[row + 4];

2569:       for (j = 0; j < nz - 1; j += 2) {
2570:         i0   = vi[j];
2571:         i1   = vi[j + 1];
2572:         tmp0 = tmps[i0];
2573:         tmp1 = tmps[i1];
2574:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2575:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2576:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2577:         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2578:         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2579:       }
2580:       if (j == nz - 1) {
2581:         tmp0 = tmps[vi[j]];
2582:         sum1 -= v1[j] * tmp0;
2583:         sum2 -= v2[j] * tmp0;
2584:         sum3 -= v3[j] * tmp0;
2585:         sum4 -= v4[j] * tmp0;
2586:         sum5 -= v5[j] * tmp0;
2587:       }

2589:       sum2 -= v2[nz] * sum1;
2590:       sum3 -= v3[nz] * sum1;
2591:       sum4 -= v4[nz] * sum1;
2592:       sum5 -= v5[nz] * sum1;
2593:       sum3 -= v3[nz + 1] * sum2;
2594:       sum4 -= v4[nz + 1] * sum2;
2595:       sum5 -= v5[nz + 1] * sum2;
2596:       sum4 -= v4[nz + 2] * sum3;
2597:       sum5 -= v5[nz + 2] * sum3;
2598:       sum5 -= v5[nz + 3] * sum4;

2600:       tmp[row++] = sum1;
2601:       tmp[row++] = sum2;
2602:       tmp[row++] = sum3;
2603:       tmp[row++] = sum4;
2604:       tmp[row++] = sum5;
2605:       break;
2606:     default:
2607:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2608:     }
2609:   }
2610:   /* backward solve the upper triangular */
2611:   for (i = node_max - 1; i >= 0; i--) {
2612:     row = ns[i + 1] - 1;
2613:     nsz = ns[i + 1] - ns[i];
2614:     aii = ad[row + 1] + 1;
2615:     v1  = aa + aii;
2616:     vi  = aj + aii;
2617:     nz  = ad[row] - ad[row + 1] - 1;

2619:     if (i > 0) {
2620:       /* Prefetch the indices for the next block */
2621:       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2622:       /* Prefetch the data for the next block */
2623:       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2624:     }

2626:     switch (nsz) { /* Each loop in 'case' is unrolled */
2627:     case 1:
2628:       sum1 = tmp[row];

2630:       for (j = 0; j < nz - 1; j += 2) {
2631:         i0   = vi[j];
2632:         i1   = vi[j + 1];
2633:         tmp0 = tmps[i0];
2634:         tmp1 = tmps[i1];
2635:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2636:       }
2637:       if (j == nz - 1) {
2638:         tmp0 = tmps[vi[j]];
2639:         sum1 -= v1[j] * tmp0;
2640:       }
2641:       x[c[row]] = tmp[row] = sum1 * v1[nz];
2642:       row--;
2643:       break;
2644:     case 2:
2645:       sum1 = tmp[row];
2646:       sum2 = tmp[row - 1];
2647:       v2   = aa + ad[row] + 1;
2648:       for (j = 0; j < nz - 1; j += 2) {
2649:         i0   = vi[j];
2650:         i1   = vi[j + 1];
2651:         tmp0 = tmps[i0];
2652:         tmp1 = tmps[i1];
2653:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2654:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2655:       }
2656:       if (j == nz - 1) {
2657:         tmp0 = tmps[vi[j]];
2658:         sum1 -= v1[j] * tmp0;
2659:         sum2 -= v2[j + 1] * tmp0;
2660:       }

2662:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2663:       row--;
2664:       sum2 -= v2[0] * tmp0;
2665:       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2666:       row--;
2667:       break;
2668:     case 3:
2669:       sum1 = tmp[row];
2670:       sum2 = tmp[row - 1];
2671:       sum3 = tmp[row - 2];
2672:       v2   = aa + ad[row] + 1;
2673:       v3   = aa + ad[row - 1] + 1;
2674:       for (j = 0; j < nz - 1; j += 2) {
2675:         i0   = vi[j];
2676:         i1   = vi[j + 1];
2677:         tmp0 = tmps[i0];
2678:         tmp1 = tmps[i1];
2679:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2680:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2681:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2682:       }
2683:       if (j == nz - 1) {
2684:         tmp0 = tmps[vi[j]];
2685:         sum1 -= v1[j] * tmp0;
2686:         sum2 -= v2[j + 1] * tmp0;
2687:         sum3 -= v3[j + 2] * tmp0;
2688:       }
2689:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2690:       row--;
2691:       sum2 -= v2[0] * tmp0;
2692:       sum3 -= v3[1] * tmp0;
2693:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2694:       row--;
2695:       sum3 -= v3[0] * tmp0;
2696:       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2697:       row--;

2699:       break;
2700:     case 4:
2701:       sum1 = tmp[row];
2702:       sum2 = tmp[row - 1];
2703:       sum3 = tmp[row - 2];
2704:       sum4 = tmp[row - 3];
2705:       v2   = aa + ad[row] + 1;
2706:       v3   = aa + ad[row - 1] + 1;
2707:       v4   = aa + ad[row - 2] + 1;

2709:       for (j = 0; j < nz - 1; j += 2) {
2710:         i0   = vi[j];
2711:         i1   = vi[j + 1];
2712:         tmp0 = tmps[i0];
2713:         tmp1 = tmps[i1];
2714:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2715:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2716:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2717:         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2718:       }
2719:       if (j == nz - 1) {
2720:         tmp0 = tmps[vi[j]];
2721:         sum1 -= v1[j] * tmp0;
2722:         sum2 -= v2[j + 1] * tmp0;
2723:         sum3 -= v3[j + 2] * tmp0;
2724:         sum4 -= v4[j + 3] * tmp0;
2725:       }

2727:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2728:       row--;
2729:       sum2 -= v2[0] * tmp0;
2730:       sum3 -= v3[1] * tmp0;
2731:       sum4 -= v4[2] * tmp0;
2732:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2733:       row--;
2734:       sum3 -= v3[0] * tmp0;
2735:       sum4 -= v4[1] * tmp0;
2736:       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2737:       row--;
2738:       sum4 -= v4[0] * tmp0;
2739:       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2740:       row--;
2741:       break;
2742:     case 5:
2743:       sum1 = tmp[row];
2744:       sum2 = tmp[row - 1];
2745:       sum3 = tmp[row - 2];
2746:       sum4 = tmp[row - 3];
2747:       sum5 = tmp[row - 4];
2748:       v2   = aa + ad[row] + 1;
2749:       v3   = aa + ad[row - 1] + 1;
2750:       v4   = aa + ad[row - 2] + 1;
2751:       v5   = aa + ad[row - 3] + 1;
2752:       for (j = 0; j < nz - 1; j += 2) {
2753:         i0   = vi[j];
2754:         i1   = vi[j + 1];
2755:         tmp0 = tmps[i0];
2756:         tmp1 = tmps[i1];
2757:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2758:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2759:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2760:         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2761:         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2762:       }
2763:       if (j == nz - 1) {
2764:         tmp0 = tmps[vi[j]];
2765:         sum1 -= v1[j] * tmp0;
2766:         sum2 -= v2[j + 1] * tmp0;
2767:         sum3 -= v3[j + 2] * tmp0;
2768:         sum4 -= v4[j + 3] * tmp0;
2769:         sum5 -= v5[j + 4] * tmp0;
2770:       }

2772:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2773:       row--;
2774:       sum2 -= v2[0] * tmp0;
2775:       sum3 -= v3[1] * tmp0;
2776:       sum4 -= v4[2] * tmp0;
2777:       sum5 -= v5[3] * tmp0;
2778:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2779:       row--;
2780:       sum3 -= v3[0] * tmp0;
2781:       sum4 -= v4[1] * tmp0;
2782:       sum5 -= v5[2] * tmp0;
2783:       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2784:       row--;
2785:       sum4 -= v4[0] * tmp0;
2786:       sum5 -= v5[1] * tmp0;
2787:       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2788:       row--;
2789:       sum5 -= v5[0] * tmp0;
2790:       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2791:       row--;
2792:       break;
2793:     default:
2794:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2795:     }
2796:   }
2797:   PetscCall(ISRestoreIndices(isrow, &rout));
2798:   PetscCall(ISRestoreIndices(iscol, &cout));
2799:   PetscCall(VecRestoreArrayRead(bb, &b));
2800:   PetscCall(VecRestoreArrayWrite(xx, &x));
2801:   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2802:   PetscFunctionReturn(PETSC_SUCCESS);
2803: }

2805: /*
2806:      Makes a longer coloring[] array and calls the usual code with that
2807: */
2808: static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2809: {
2810:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2811:   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2812:   PetscInt        *colorused, i;
2813:   ISColoringValue *newcolor;

2815:   PetscFunctionBegin;
2816:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2817:   PetscCall(PetscMalloc1(n + 1, &newcolor));
2818:   /* loop over inodes, marking a color for each column*/
2819:   row = 0;
2820:   for (i = 0; i < m; i++) {
2821:     for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2822:   }

2824:   /* eliminate unneeded colors */
2825:   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2826:   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;

2828:   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2829:   ncolors = colorused[5 * ncolors - 1];
2830:   for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2831:   PetscCall(PetscFree(colorused));
2832:   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2833:   PetscCall(PetscFree(coloring));
2834:   PetscFunctionReturn(PETSC_SUCCESS);
2835: }

2837: #include <petsc/private/kernels/blockinvert.h>

2839: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2840: {
2841:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2842:   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2843:   MatScalar         *ibdiag, *bdiag, work[25], *t;
2844:   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2845:   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2846:   const PetscScalar *xb, *b;
2847:   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2848:   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz;
2849:   PetscInt           sz, k, ipvt[5];
2850:   PetscBool          allowzeropivot, zeropivotdetected;
2851:   const PetscInt    *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i;

2853:   PetscFunctionBegin;
2854:   allowzeropivot = PetscNot(A->erroriffailure);
2855:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2856:   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2857:   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");

2859:   if (!a->inode.ibdiagvalid) {
2860:     if (!a->inode.ibdiag) {
2861:       /* calculate space needed for diagonal blocks */
2862:       for (i = 0; i < m; i++) {
2863:         nodesz = sizes[i + 1] - sizes[i];
2864:         cnt += nodesz * nodesz;
2865:       }
2866:       a->inode.bdiagsize = cnt;

2868:       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2869:     }

2871:     /* copy over the diagonal blocks and invert them */
2872:     ibdiag = a->inode.ibdiag;
2873:     bdiag  = a->inode.bdiag;
2874:     cnt    = 0;
2875:     for (i = 0, row = 0; i < m; i++) {
2876:       nodesz = sizes[i + 1] - sizes[i];
2877:       for (j = 0; j < nodesz; j++) {
2878:         for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2879:       }
2880:       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));

2882:       switch (nodesz) {
2883:       case 1:
2884:         /* Create matrix data structure */
2885:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2886:           if (allowzeropivot) {
2887:             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2888:             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2889:             A->factorerror_zeropivot_row   = row;
2890:             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2891:           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2892:         }
2893:         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2894:         break;
2895:       case 2:
2896:         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2897:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2898:         break;
2899:       case 3:
2900:         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2901:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2902:         break;
2903:       case 4:
2904:         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2905:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2906:         break;
2907:       case 5:
2908:         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2909:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2910:         break;
2911:       default:
2912:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2913:       }
2914:       cnt += nodesz * nodesz;
2915:       row += nodesz;
2916:     }
2917:     a->inode.ibdiagvalid = PETSC_TRUE;
2918:   }
2919:   ibdiag = a->inode.ibdiag;
2920:   bdiag  = a->inode.bdiag;
2921:   t      = a->inode.ssor_work;

2923:   PetscCall(VecGetArray(xx, &x));
2924:   PetscCall(VecGetArrayRead(bb, &b));
2925:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2926:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2927:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2928:       for (i = 0, row = 0; i < m; i++) {
2929:         sz  = diag[row] - ii[row];
2930:         v1  = a->a + ii[row];
2931:         idx = a->j + ii[row];

2933:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2934:         nodesz = sizes[i + 1] - sizes[i];
2935:         switch (nodesz) {
2936:         case 1:

2938:           sum1 = b[row];
2939:           for (n = 0; n < sz - 1; n += 2) {
2940:             i1 = idx[0];
2941:             i2 = idx[1];
2942:             idx += 2;
2943:             tmp0 = x[i1];
2944:             tmp1 = x[i2];
2945:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2946:             v1 += 2;
2947:           }

2949:           if (n == sz - 1) {
2950:             tmp0 = x[*idx];
2951:             sum1 -= *v1 * tmp0;
2952:           }
2953:           t[row]   = sum1;
2954:           x[row++] = sum1 * (*ibdiag++);
2955:           break;
2956:         case 2:
2957:           v2   = a->a + ii[row + 1];
2958:           sum1 = b[row];
2959:           sum2 = b[row + 1];
2960:           for (n = 0; n < sz - 1; n += 2) {
2961:             i1 = idx[0];
2962:             i2 = idx[1];
2963:             idx += 2;
2964:             tmp0 = x[i1];
2965:             tmp1 = x[i2];
2966:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2967:             v1 += 2;
2968:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2969:             v2 += 2;
2970:           }

2972:           if (n == sz - 1) {
2973:             tmp0 = x[*idx];
2974:             sum1 -= v1[0] * tmp0;
2975:             sum2 -= v2[0] * tmp0;
2976:           }
2977:           t[row]     = sum1;
2978:           t[row + 1] = sum2;
2979:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2980:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2981:           ibdiag += 4;
2982:           break;
2983:         case 3:
2984:           v2   = a->a + ii[row + 1];
2985:           v3   = a->a + ii[row + 2];
2986:           sum1 = b[row];
2987:           sum2 = b[row + 1];
2988:           sum3 = b[row + 2];
2989:           for (n = 0; n < sz - 1; n += 2) {
2990:             i1 = idx[0];
2991:             i2 = idx[1];
2992:             idx += 2;
2993:             tmp0 = x[i1];
2994:             tmp1 = x[i2];
2995:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2996:             v1 += 2;
2997:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2998:             v2 += 2;
2999:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3000:             v3 += 2;
3001:           }

3003:           if (n == sz - 1) {
3004:             tmp0 = x[*idx];
3005:             sum1 -= v1[0] * tmp0;
3006:             sum2 -= v2[0] * tmp0;
3007:             sum3 -= v3[0] * tmp0;
3008:           }
3009:           t[row]     = sum1;
3010:           t[row + 1] = sum2;
3011:           t[row + 2] = sum3;
3012:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3013:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3014:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3015:           ibdiag += 9;
3016:           break;
3017:         case 4:
3018:           v2   = a->a + ii[row + 1];
3019:           v3   = a->a + ii[row + 2];
3020:           v4   = a->a + ii[row + 3];
3021:           sum1 = b[row];
3022:           sum2 = b[row + 1];
3023:           sum3 = b[row + 2];
3024:           sum4 = b[row + 3];
3025:           for (n = 0; n < sz - 1; n += 2) {
3026:             i1 = idx[0];
3027:             i2 = idx[1];
3028:             idx += 2;
3029:             tmp0 = x[i1];
3030:             tmp1 = x[i2];
3031:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3032:             v1 += 2;
3033:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3034:             v2 += 2;
3035:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3036:             v3 += 2;
3037:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3038:             v4 += 2;
3039:           }

3041:           if (n == sz - 1) {
3042:             tmp0 = x[*idx];
3043:             sum1 -= v1[0] * tmp0;
3044:             sum2 -= v2[0] * tmp0;
3045:             sum3 -= v3[0] * tmp0;
3046:             sum4 -= v4[0] * tmp0;
3047:           }
3048:           t[row]     = sum1;
3049:           t[row + 1] = sum2;
3050:           t[row + 2] = sum3;
3051:           t[row + 3] = sum4;
3052:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3053:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3054:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3055:           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3056:           ibdiag += 16;
3057:           break;
3058:         case 5:
3059:           v2   = a->a + ii[row + 1];
3060:           v3   = a->a + ii[row + 2];
3061:           v4   = a->a + ii[row + 3];
3062:           v5   = a->a + ii[row + 4];
3063:           sum1 = b[row];
3064:           sum2 = b[row + 1];
3065:           sum3 = b[row + 2];
3066:           sum4 = b[row + 3];
3067:           sum5 = b[row + 4];
3068:           for (n = 0; n < sz - 1; n += 2) {
3069:             i1 = idx[0];
3070:             i2 = idx[1];
3071:             idx += 2;
3072:             tmp0 = x[i1];
3073:             tmp1 = x[i2];
3074:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3075:             v1 += 2;
3076:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3077:             v2 += 2;
3078:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3079:             v3 += 2;
3080:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3081:             v4 += 2;
3082:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3083:             v5 += 2;
3084:           }

3086:           if (n == sz - 1) {
3087:             tmp0 = x[*idx];
3088:             sum1 -= v1[0] * tmp0;
3089:             sum2 -= v2[0] * tmp0;
3090:             sum3 -= v3[0] * tmp0;
3091:             sum4 -= v4[0] * tmp0;
3092:             sum5 -= v5[0] * tmp0;
3093:           }
3094:           t[row]     = sum1;
3095:           t[row + 1] = sum2;
3096:           t[row + 2] = sum3;
3097:           t[row + 3] = sum4;
3098:           t[row + 4] = sum5;
3099:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3100:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3101:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3102:           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3103:           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3104:           ibdiag += 25;
3105:           break;
3106:         default:
3107:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3108:         }
3109:       }

3111:       xb = t;
3112:       PetscCall(PetscLogFlops(a->nz));
3113:     } else xb = b;
3114:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3115:       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3116:       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3117:         nodesz = sizes[i + 1] - sizes[i];
3118:         ibdiag -= nodesz * nodesz;
3119:         sz  = ii[row + 1] - diag[row] - 1;
3120:         v1  = a->a + diag[row] + 1;
3121:         idx = a->j + diag[row] + 1;

3123:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3124:         switch (nodesz) {
3125:         case 1:

3127:           sum1 = xb[row];
3128:           for (n = 0; n < sz - 1; n += 2) {
3129:             i1 = idx[0];
3130:             i2 = idx[1];
3131:             idx += 2;
3132:             tmp0 = x[i1];
3133:             tmp1 = x[i2];
3134:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3135:             v1 += 2;
3136:           }

3138:           if (n == sz - 1) {
3139:             tmp0 = x[*idx];
3140:             sum1 -= *v1 * tmp0;
3141:           }
3142:           x[row--] = sum1 * (*ibdiag);
3143:           break;

3145:         case 2:

3147:           sum1 = xb[row];
3148:           sum2 = xb[row - 1];
3149:           /* note that sum1 is associated with the second of the two rows */
3150:           v2 = a->a + diag[row - 1] + 2;
3151:           for (n = 0; n < sz - 1; n += 2) {
3152:             i1 = idx[0];
3153:             i2 = idx[1];
3154:             idx += 2;
3155:             tmp0 = x[i1];
3156:             tmp1 = x[i2];
3157:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3158:             v1 += 2;
3159:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3160:             v2 += 2;
3161:           }

3163:           if (n == sz - 1) {
3164:             tmp0 = x[*idx];
3165:             sum1 -= *v1 * tmp0;
3166:             sum2 -= *v2 * tmp0;
3167:           }
3168:           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3169:           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3170:           break;
3171:         case 3:

3173:           sum1 = xb[row];
3174:           sum2 = xb[row - 1];
3175:           sum3 = xb[row - 2];
3176:           v2   = a->a + diag[row - 1] + 2;
3177:           v3   = a->a + diag[row - 2] + 3;
3178:           for (n = 0; n < sz - 1; n += 2) {
3179:             i1 = idx[0];
3180:             i2 = idx[1];
3181:             idx += 2;
3182:             tmp0 = x[i1];
3183:             tmp1 = x[i2];
3184:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3185:             v1 += 2;
3186:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3187:             v2 += 2;
3188:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3189:             v3 += 2;
3190:           }

3192:           if (n == sz - 1) {
3193:             tmp0 = x[*idx];
3194:             sum1 -= *v1 * tmp0;
3195:             sum2 -= *v2 * tmp0;
3196:             sum3 -= *v3 * tmp0;
3197:           }
3198:           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3199:           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3200:           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3201:           break;
3202:         case 4:

3204:           sum1 = xb[row];
3205:           sum2 = xb[row - 1];
3206:           sum3 = xb[row - 2];
3207:           sum4 = xb[row - 3];
3208:           v2   = a->a + diag[row - 1] + 2;
3209:           v3   = a->a + diag[row - 2] + 3;
3210:           v4   = a->a + diag[row - 3] + 4;
3211:           for (n = 0; n < sz - 1; n += 2) {
3212:             i1 = idx[0];
3213:             i2 = idx[1];
3214:             idx += 2;
3215:             tmp0 = x[i1];
3216:             tmp1 = x[i2];
3217:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3218:             v1 += 2;
3219:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3220:             v2 += 2;
3221:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3222:             v3 += 2;
3223:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3224:             v4 += 2;
3225:           }

3227:           if (n == sz - 1) {
3228:             tmp0 = x[*idx];
3229:             sum1 -= *v1 * tmp0;
3230:             sum2 -= *v2 * tmp0;
3231:             sum3 -= *v3 * tmp0;
3232:             sum4 -= *v4 * tmp0;
3233:           }
3234:           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3235:           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3236:           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3237:           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3238:           break;
3239:         case 5:

3241:           sum1 = xb[row];
3242:           sum2 = xb[row - 1];
3243:           sum3 = xb[row - 2];
3244:           sum4 = xb[row - 3];
3245:           sum5 = xb[row - 4];
3246:           v2   = a->a + diag[row - 1] + 2;
3247:           v3   = a->a + diag[row - 2] + 3;
3248:           v4   = a->a + diag[row - 3] + 4;
3249:           v5   = a->a + diag[row - 4] + 5;
3250:           for (n = 0; n < sz - 1; n += 2) {
3251:             i1 = idx[0];
3252:             i2 = idx[1];
3253:             idx += 2;
3254:             tmp0 = x[i1];
3255:             tmp1 = x[i2];
3256:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3257:             v1 += 2;
3258:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3259:             v2 += 2;
3260:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3261:             v3 += 2;
3262:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3263:             v4 += 2;
3264:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3265:             v5 += 2;
3266:           }

3268:           if (n == sz - 1) {
3269:             tmp0 = x[*idx];
3270:             sum1 -= *v1 * tmp0;
3271:             sum2 -= *v2 * tmp0;
3272:             sum3 -= *v3 * tmp0;
3273:             sum4 -= *v4 * tmp0;
3274:             sum5 -= *v5 * tmp0;
3275:           }
3276:           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3277:           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3278:           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3279:           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3280:           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3281:           break;
3282:         default:
3283:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3284:         }
3285:       }

3287:       PetscCall(PetscLogFlops(a->nz));
3288:     }
3289:     its--;
3290:   }
3291:   while (its--) {
3292:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3293:       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
3294:         nodesz = sizes[i + 1] - sizes[i];
3295:         sz     = diag[row] - ii[row];
3296:         v1     = a->a + ii[row];
3297:         idx    = a->j + ii[row];
3298:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3299:         switch (nodesz) {
3300:         case 1:
3301:           sum1 = b[row];
3302:           for (n = 0; n < sz - 1; n += 2) {
3303:             i1 = idx[0];
3304:             i2 = idx[1];
3305:             idx += 2;
3306:             tmp0 = x[i1];
3307:             tmp1 = x[i2];
3308:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3309:             v1 += 2;
3310:           }
3311:           if (n == sz - 1) {
3312:             tmp0 = x[*idx++];
3313:             sum1 -= *v1 * tmp0;
3314:             v1++;
3315:           }
3316:           t[row] = sum1;
3317:           sz     = ii[row + 1] - diag[row] - 1;
3318:           idx    = a->j + diag[row] + 1;
3319:           v1 += 1;
3320:           for (n = 0; n < sz - 1; n += 2) {
3321:             i1 = idx[0];
3322:             i2 = idx[1];
3323:             idx += 2;
3324:             tmp0 = x[i1];
3325:             tmp1 = x[i2];
3326:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3327:             v1 += 2;
3328:           }
3329:           if (n == sz - 1) {
3330:             tmp0 = x[*idx++];
3331:             sum1 -= *v1 * tmp0;
3332:           }
3333:           /* in MatSOR_SeqAIJ this line would be
3334:            *
3335:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3336:            *
3337:            * but omega == 1, so this becomes
3338:            *
3339:            * x[row] = sum1*(*ibdiag++);
3340:            *
3341:            */
3342:           x[row] = sum1 * (*ibdiag);
3343:           break;
3344:         case 2:
3345:           v2   = a->a + ii[row + 1];
3346:           sum1 = b[row];
3347:           sum2 = b[row + 1];
3348:           for (n = 0; n < sz - 1; n += 2) {
3349:             i1 = idx[0];
3350:             i2 = idx[1];
3351:             idx += 2;
3352:             tmp0 = x[i1];
3353:             tmp1 = x[i2];
3354:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3355:             v1 += 2;
3356:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3357:             v2 += 2;
3358:           }
3359:           if (n == sz - 1) {
3360:             tmp0 = x[*idx++];
3361:             sum1 -= v1[0] * tmp0;
3362:             sum2 -= v2[0] * tmp0;
3363:             v1++;
3364:             v2++;
3365:           }
3366:           t[row]     = sum1;
3367:           t[row + 1] = sum2;
3368:           sz         = ii[row + 1] - diag[row] - 2;
3369:           idx        = a->j + diag[row] + 2;
3370:           v1 += 2;
3371:           v2 += 2;
3372:           for (n = 0; n < sz - 1; n += 2) {
3373:             i1 = idx[0];
3374:             i2 = idx[1];
3375:             idx += 2;
3376:             tmp0 = x[i1];
3377:             tmp1 = x[i2];
3378:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3379:             v1 += 2;
3380:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3381:             v2 += 2;
3382:           }
3383:           if (n == sz - 1) {
3384:             tmp0 = x[*idx];
3385:             sum1 -= v1[0] * tmp0;
3386:             sum2 -= v2[0] * tmp0;
3387:           }
3388:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3389:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3390:           break;
3391:         case 3:
3392:           v2   = a->a + ii[row + 1];
3393:           v3   = a->a + ii[row + 2];
3394:           sum1 = b[row];
3395:           sum2 = b[row + 1];
3396:           sum3 = b[row + 2];
3397:           for (n = 0; n < sz - 1; n += 2) {
3398:             i1 = idx[0];
3399:             i2 = idx[1];
3400:             idx += 2;
3401:             tmp0 = x[i1];
3402:             tmp1 = x[i2];
3403:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3404:             v1 += 2;
3405:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3406:             v2 += 2;
3407:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3408:             v3 += 2;
3409:           }
3410:           if (n == sz - 1) {
3411:             tmp0 = x[*idx++];
3412:             sum1 -= v1[0] * tmp0;
3413:             sum2 -= v2[0] * tmp0;
3414:             sum3 -= v3[0] * tmp0;
3415:             v1++;
3416:             v2++;
3417:             v3++;
3418:           }
3419:           t[row]     = sum1;
3420:           t[row + 1] = sum2;
3421:           t[row + 2] = sum3;
3422:           sz         = ii[row + 1] - diag[row] - 3;
3423:           idx        = a->j + diag[row] + 3;
3424:           v1 += 3;
3425:           v2 += 3;
3426:           v3 += 3;
3427:           for (n = 0; n < sz - 1; n += 2) {
3428:             i1 = idx[0];
3429:             i2 = idx[1];
3430:             idx += 2;
3431:             tmp0 = x[i1];
3432:             tmp1 = x[i2];
3433:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3434:             v1 += 2;
3435:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3436:             v2 += 2;
3437:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3438:             v3 += 2;
3439:           }
3440:           if (n == sz - 1) {
3441:             tmp0 = x[*idx];
3442:             sum1 -= v1[0] * tmp0;
3443:             sum2 -= v2[0] * tmp0;
3444:             sum3 -= v3[0] * tmp0;
3445:           }
3446:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3447:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3448:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3449:           break;
3450:         case 4:
3451:           v2   = a->a + ii[row + 1];
3452:           v3   = a->a + ii[row + 2];
3453:           v4   = a->a + ii[row + 3];
3454:           sum1 = b[row];
3455:           sum2 = b[row + 1];
3456:           sum3 = b[row + 2];
3457:           sum4 = b[row + 3];
3458:           for (n = 0; n < sz - 1; n += 2) {
3459:             i1 = idx[0];
3460:             i2 = idx[1];
3461:             idx += 2;
3462:             tmp0 = x[i1];
3463:             tmp1 = x[i2];
3464:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3465:             v1 += 2;
3466:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3467:             v2 += 2;
3468:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3469:             v3 += 2;
3470:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3471:             v4 += 2;
3472:           }
3473:           if (n == sz - 1) {
3474:             tmp0 = x[*idx++];
3475:             sum1 -= v1[0] * tmp0;
3476:             sum2 -= v2[0] * tmp0;
3477:             sum3 -= v3[0] * tmp0;
3478:             sum4 -= v4[0] * tmp0;
3479:             v1++;
3480:             v2++;
3481:             v3++;
3482:             v4++;
3483:           }
3484:           t[row]     = sum1;
3485:           t[row + 1] = sum2;
3486:           t[row + 2] = sum3;
3487:           t[row + 3] = sum4;
3488:           sz         = ii[row + 1] - diag[row] - 4;
3489:           idx        = a->j + diag[row] + 4;
3490:           v1 += 4;
3491:           v2 += 4;
3492:           v3 += 4;
3493:           v4 += 4;
3494:           for (n = 0; n < sz - 1; n += 2) {
3495:             i1 = idx[0];
3496:             i2 = idx[1];
3497:             idx += 2;
3498:             tmp0 = x[i1];
3499:             tmp1 = x[i2];
3500:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3501:             v1 += 2;
3502:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3503:             v2 += 2;
3504:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3505:             v3 += 2;
3506:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3507:             v4 += 2;
3508:           }
3509:           if (n == sz - 1) {
3510:             tmp0 = x[*idx];
3511:             sum1 -= v1[0] * tmp0;
3512:             sum2 -= v2[0] * tmp0;
3513:             sum3 -= v3[0] * tmp0;
3514:             sum4 -= v4[0] * tmp0;
3515:           }
3516:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3517:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3518:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3519:           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3520:           break;
3521:         case 5:
3522:           v2   = a->a + ii[row + 1];
3523:           v3   = a->a + ii[row + 2];
3524:           v4   = a->a + ii[row + 3];
3525:           v5   = a->a + ii[row + 4];
3526:           sum1 = b[row];
3527:           sum2 = b[row + 1];
3528:           sum3 = b[row + 2];
3529:           sum4 = b[row + 3];
3530:           sum5 = b[row + 4];
3531:           for (n = 0; n < sz - 1; n += 2) {
3532:             i1 = idx[0];
3533:             i2 = idx[1];
3534:             idx += 2;
3535:             tmp0 = x[i1];
3536:             tmp1 = x[i2];
3537:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3538:             v1 += 2;
3539:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3540:             v2 += 2;
3541:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3542:             v3 += 2;
3543:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3544:             v4 += 2;
3545:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3546:             v5 += 2;
3547:           }
3548:           if (n == sz - 1) {
3549:             tmp0 = x[*idx++];
3550:             sum1 -= v1[0] * tmp0;
3551:             sum2 -= v2[0] * tmp0;
3552:             sum3 -= v3[0] * tmp0;
3553:             sum4 -= v4[0] * tmp0;
3554:             sum5 -= v5[0] * tmp0;
3555:             v1++;
3556:             v2++;
3557:             v3++;
3558:             v4++;
3559:             v5++;
3560:           }
3561:           t[row]     = sum1;
3562:           t[row + 1] = sum2;
3563:           t[row + 2] = sum3;
3564:           t[row + 3] = sum4;
3565:           t[row + 4] = sum5;
3566:           sz         = ii[row + 1] - diag[row] - 5;
3567:           idx        = a->j + diag[row] + 5;
3568:           v1 += 5;
3569:           v2 += 5;
3570:           v3 += 5;
3571:           v4 += 5;
3572:           v5 += 5;
3573:           for (n = 0; n < sz - 1; n += 2) {
3574:             i1 = idx[0];
3575:             i2 = idx[1];
3576:             idx += 2;
3577:             tmp0 = x[i1];
3578:             tmp1 = x[i2];
3579:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3580:             v1 += 2;
3581:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3582:             v2 += 2;
3583:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3584:             v3 += 2;
3585:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3586:             v4 += 2;
3587:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3588:             v5 += 2;
3589:           }
3590:           if (n == sz - 1) {
3591:             tmp0 = x[*idx];
3592:             sum1 -= v1[0] * tmp0;
3593:             sum2 -= v2[0] * tmp0;
3594:             sum3 -= v3[0] * tmp0;
3595:             sum4 -= v4[0] * tmp0;
3596:             sum5 -= v5[0] * tmp0;
3597:           }
3598:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3599:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3600:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3601:           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3602:           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3603:           break;
3604:         default:
3605:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3606:         }
3607:       }
3608:       xb = t;
3609:       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3610:     } else xb = b;

3612:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3613:       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3614:       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3615:         nodesz = sizes[i + 1] - sizes[i];
3616:         ibdiag -= nodesz * nodesz;

3618:         /* set RHS */
3619:         if (xb == b) {
3620:           /* whole (old way) */
3621:           sz  = ii[row + 1] - ii[row];
3622:           idx = a->j + ii[row];
3623:           switch (nodesz) {
3624:           case 5:
3625:             v5 = a->a + ii[row - 4]; /* fall through */
3626:           case 4:
3627:             v4 = a->a + ii[row - 3]; /* fall through */
3628:           case 3:
3629:             v3 = a->a + ii[row - 2]; /* fall through */
3630:           case 2:
3631:             v2 = a->a + ii[row - 1]; /* fall through */
3632:           case 1:
3633:             v1 = a->a + ii[row];
3634:             break;
3635:           default:
3636:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3637:           }
3638:         } else {
3639:           /* upper, no diag */
3640:           sz  = ii[row + 1] - diag[row] - 1;
3641:           idx = a->j + diag[row] + 1;
3642:           switch (nodesz) {
3643:           case 5:
3644:             v5 = a->a + diag[row - 4] + 5; /* fall through */
3645:           case 4:
3646:             v4 = a->a + diag[row - 3] + 4; /* fall through */
3647:           case 3:
3648:             v3 = a->a + diag[row - 2] + 3; /* fall through */
3649:           case 2:
3650:             v2 = a->a + diag[row - 1] + 2; /* fall through */
3651:           case 1:
3652:             v1 = a->a + diag[row] + 1;
3653:           }
3654:         }
3655:         /* set sum */
3656:         switch (nodesz) {
3657:         case 5:
3658:           sum5 = xb[row - 4]; /* fall through */
3659:         case 4:
3660:           sum4 = xb[row - 3]; /* fall through */
3661:         case 3:
3662:           sum3 = xb[row - 2]; /* fall through */
3663:         case 2:
3664:           sum2 = xb[row - 1]; /* fall through */
3665:         case 1:
3666:           /* note that sum1 is associated with the last row */
3667:           sum1 = xb[row];
3668:         }
3669:         /* do sums */
3670:         for (n = 0; n < sz - 1; n += 2) {
3671:           i1 = idx[0];
3672:           i2 = idx[1];
3673:           idx += 2;
3674:           tmp0 = x[i1];
3675:           tmp1 = x[i2];
3676:           switch (nodesz) {
3677:           case 5:
3678:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3679:             v5 += 2; /* fall through */
3680:           case 4:
3681:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3682:             v4 += 2; /* fall through */
3683:           case 3:
3684:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3685:             v3 += 2; /* fall through */
3686:           case 2:
3687:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3688:             v2 += 2; /* fall through */
3689:           case 1:
3690:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3691:             v1 += 2;
3692:           }
3693:         }
3694:         /* ragged edge */
3695:         if (n == sz - 1) {
3696:           tmp0 = x[*idx];
3697:           switch (nodesz) {
3698:           case 5:
3699:             sum5 -= *v5 * tmp0; /* fall through */
3700:           case 4:
3701:             sum4 -= *v4 * tmp0; /* fall through */
3702:           case 3:
3703:             sum3 -= *v3 * tmp0; /* fall through */
3704:           case 2:
3705:             sum2 -= *v2 * tmp0; /* fall through */
3706:           case 1:
3707:             sum1 -= *v1 * tmp0;
3708:           }
3709:         }
3710:         /* update */
3711:         if (xb == b) {
3712:           /* whole (old way) w/ diag */
3713:           switch (nodesz) {
3714:           case 5:
3715:             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3716:             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3717:             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3718:             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3719:             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3720:             break;
3721:           case 4:
3722:             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3723:             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3724:             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3725:             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3726:             break;
3727:           case 3:
3728:             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3729:             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3730:             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3731:             break;
3732:           case 2:
3733:             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3734:             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3735:             break;
3736:           case 1:
3737:             x[row--] += sum1 * (*ibdiag);
3738:             break;
3739:           }
3740:         } else {
3741:           /* no diag so set =  */
3742:           switch (nodesz) {
3743:           case 5:
3744:             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3745:             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3746:             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3747:             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3748:             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3749:             break;
3750:           case 4:
3751:             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3752:             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3753:             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3754:             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3755:             break;
3756:           case 3:
3757:             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3758:             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3759:             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3760:             break;
3761:           case 2:
3762:             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3763:             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3764:             break;
3765:           case 1:
3766:             x[row--] = sum1 * (*ibdiag);
3767:             break;
3768:           }
3769:         }
3770:       }
3771:       if (xb == b) {
3772:         PetscCall(PetscLogFlops(2.0 * a->nz));
3773:       } else {
3774:         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3775:       }
3776:     }
3777:   }
3778:   if (flag & SOR_EISENSTAT) {
3779:     /*
3780:           Apply  (U + D)^-1  where D is now the block diagonal
3781:     */
3782:     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3783:     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3784:       nodesz = sizes[i + 1] - sizes[i];
3785:       ibdiag -= nodesz * nodesz;
3786:       sz  = ii[row + 1] - diag[row] - 1;
3787:       v1  = a->a + diag[row] + 1;
3788:       idx = a->j + diag[row] + 1;
3789:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3790:       switch (nodesz) {
3791:       case 1:

3793:         sum1 = b[row];
3794:         for (n = 0; n < sz - 1; n += 2) {
3795:           i1 = idx[0];
3796:           i2 = idx[1];
3797:           idx += 2;
3798:           tmp0 = x[i1];
3799:           tmp1 = x[i2];
3800:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3801:           v1 += 2;
3802:         }

3804:         if (n == sz - 1) {
3805:           tmp0 = x[*idx];
3806:           sum1 -= *v1 * tmp0;
3807:         }
3808:         x[row] = sum1 * (*ibdiag);
3809:         row--;
3810:         break;

3812:       case 2:

3814:         sum1 = b[row];
3815:         sum2 = b[row - 1];
3816:         /* note that sum1 is associated with the second of the two rows */
3817:         v2 = a->a + diag[row - 1] + 2;
3818:         for (n = 0; n < sz - 1; n += 2) {
3819:           i1 = idx[0];
3820:           i2 = idx[1];
3821:           idx += 2;
3822:           tmp0 = x[i1];
3823:           tmp1 = x[i2];
3824:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3825:           v1 += 2;
3826:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3827:           v2 += 2;
3828:         }

3830:         if (n == sz - 1) {
3831:           tmp0 = x[*idx];
3832:           sum1 -= *v1 * tmp0;
3833:           sum2 -= *v2 * tmp0;
3834:         }
3835:         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3836:         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3837:         row -= 2;
3838:         break;
3839:       case 3:

3841:         sum1 = b[row];
3842:         sum2 = b[row - 1];
3843:         sum3 = b[row - 2];
3844:         v2   = a->a + diag[row - 1] + 2;
3845:         v3   = a->a + diag[row - 2] + 3;
3846:         for (n = 0; n < sz - 1; n += 2) {
3847:           i1 = idx[0];
3848:           i2 = idx[1];
3849:           idx += 2;
3850:           tmp0 = x[i1];
3851:           tmp1 = x[i2];
3852:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3853:           v1 += 2;
3854:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3855:           v2 += 2;
3856:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3857:           v3 += 2;
3858:         }

3860:         if (n == sz - 1) {
3861:           tmp0 = x[*idx];
3862:           sum1 -= *v1 * tmp0;
3863:           sum2 -= *v2 * tmp0;
3864:           sum3 -= *v3 * tmp0;
3865:         }
3866:         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3867:         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3868:         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3869:         row -= 3;
3870:         break;
3871:       case 4:

3873:         sum1 = b[row];
3874:         sum2 = b[row - 1];
3875:         sum3 = b[row - 2];
3876:         sum4 = b[row - 3];
3877:         v2   = a->a + diag[row - 1] + 2;
3878:         v3   = a->a + diag[row - 2] + 3;
3879:         v4   = a->a + diag[row - 3] + 4;
3880:         for (n = 0; n < sz - 1; n += 2) {
3881:           i1 = idx[0];
3882:           i2 = idx[1];
3883:           idx += 2;
3884:           tmp0 = x[i1];
3885:           tmp1 = x[i2];
3886:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3887:           v1 += 2;
3888:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3889:           v2 += 2;
3890:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3891:           v3 += 2;
3892:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3893:           v4 += 2;
3894:         }

3896:         if (n == sz - 1) {
3897:           tmp0 = x[*idx];
3898:           sum1 -= *v1 * tmp0;
3899:           sum2 -= *v2 * tmp0;
3900:           sum3 -= *v3 * tmp0;
3901:           sum4 -= *v4 * tmp0;
3902:         }
3903:         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3904:         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3905:         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3906:         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3907:         row -= 4;
3908:         break;
3909:       case 5:

3911:         sum1 = b[row];
3912:         sum2 = b[row - 1];
3913:         sum3 = b[row - 2];
3914:         sum4 = b[row - 3];
3915:         sum5 = b[row - 4];
3916:         v2   = a->a + diag[row - 1] + 2;
3917:         v3   = a->a + diag[row - 2] + 3;
3918:         v4   = a->a + diag[row - 3] + 4;
3919:         v5   = a->a + diag[row - 4] + 5;
3920:         for (n = 0; n < sz - 1; n += 2) {
3921:           i1 = idx[0];
3922:           i2 = idx[1];
3923:           idx += 2;
3924:           tmp0 = x[i1];
3925:           tmp1 = x[i2];
3926:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3927:           v1 += 2;
3928:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3929:           v2 += 2;
3930:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3931:           v3 += 2;
3932:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3933:           v4 += 2;
3934:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3935:           v5 += 2;
3936:         }

3938:         if (n == sz - 1) {
3939:           tmp0 = x[*idx];
3940:           sum1 -= *v1 * tmp0;
3941:           sum2 -= *v2 * tmp0;
3942:           sum3 -= *v3 * tmp0;
3943:           sum4 -= *v4 * tmp0;
3944:           sum5 -= *v5 * tmp0;
3945:         }
3946:         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3947:         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3948:         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3949:         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3950:         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3951:         row -= 5;
3952:         break;
3953:       default:
3954:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3955:       }
3956:     }
3957:     PetscCall(PetscLogFlops(a->nz));

3959:     /*
3960:            t = b - D x    where D is the block diagonal
3961:     */
3962:     cnt = 0;
3963:     for (i = 0, row = 0; i < m; i++) {
3964:       nodesz = sizes[i + 1] - sizes[i];
3965:       switch (nodesz) {
3966:       case 1:
3967:         t[row] = b[row] - bdiag[cnt++] * x[row];
3968:         row++;
3969:         break;
3970:       case 2:
3971:         x1         = x[row];
3972:         x2         = x[row + 1];
3973:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3974:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3975:         t[row]     = b[row] - tmp1;
3976:         t[row + 1] = b[row + 1] - tmp2;
3977:         row += 2;
3978:         cnt += 4;
3979:         break;
3980:       case 3:
3981:         x1         = x[row];
3982:         x2         = x[row + 1];
3983:         x3         = x[row + 2];
3984:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3985:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3986:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3987:         t[row]     = b[row] - tmp1;
3988:         t[row + 1] = b[row + 1] - tmp2;
3989:         t[row + 2] = b[row + 2] - tmp3;
3990:         row += 3;
3991:         cnt += 9;
3992:         break;
3993:       case 4:
3994:         x1         = x[row];
3995:         x2         = x[row + 1];
3996:         x3         = x[row + 2];
3997:         x4         = x[row + 3];
3998:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3999:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4000:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4001:         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4002:         t[row]     = b[row] - tmp1;
4003:         t[row + 1] = b[row + 1] - tmp2;
4004:         t[row + 2] = b[row + 2] - tmp3;
4005:         t[row + 3] = b[row + 3] - tmp4;
4006:         row += 4;
4007:         cnt += 16;
4008:         break;
4009:       case 5:
4010:         x1         = x[row];
4011:         x2         = x[row + 1];
4012:         x3         = x[row + 2];
4013:         x4         = x[row + 3];
4014:         x5         = x[row + 4];
4015:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4016:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4017:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4018:         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4019:         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4020:         t[row]     = b[row] - tmp1;
4021:         t[row + 1] = b[row + 1] - tmp2;
4022:         t[row + 2] = b[row + 2] - tmp3;
4023:         t[row + 3] = b[row + 3] - tmp4;
4024:         t[row + 4] = b[row + 4] - tmp5;
4025:         row += 5;
4026:         cnt += 25;
4027:         break;
4028:       default:
4029:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4030:       }
4031:     }
4032:     PetscCall(PetscLogFlops(m));

4034:     /*
4035:           Apply (L + D)^-1 where D is the block diagonal
4036:     */
4037:     for (i = 0, row = 0; i < m; i++) {
4038:       nodesz = sizes[i + 1] - sizes[i];
4039:       sz     = diag[row] - ii[row];
4040:       v1     = a->a + ii[row];
4041:       idx    = a->j + ii[row];
4042:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4043:       switch (nodesz) {
4044:       case 1:

4046:         sum1 = t[row];
4047:         for (n = 0; n < sz - 1; n += 2) {
4048:           i1 = idx[0];
4049:           i2 = idx[1];
4050:           idx += 2;
4051:           tmp0 = t[i1];
4052:           tmp1 = t[i2];
4053:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4054:           v1 += 2;
4055:         }

4057:         if (n == sz - 1) {
4058:           tmp0 = t[*idx];
4059:           sum1 -= *v1 * tmp0;
4060:         }
4061:         x[row] += t[row] = sum1 * (*ibdiag++);
4062:         row++;
4063:         break;
4064:       case 2:
4065:         v2   = a->a + ii[row + 1];
4066:         sum1 = t[row];
4067:         sum2 = t[row + 1];
4068:         for (n = 0; n < sz - 1; n += 2) {
4069:           i1 = idx[0];
4070:           i2 = idx[1];
4071:           idx += 2;
4072:           tmp0 = t[i1];
4073:           tmp1 = t[i2];
4074:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4075:           v1 += 2;
4076:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4077:           v2 += 2;
4078:         }

4080:         if (n == sz - 1) {
4081:           tmp0 = t[*idx];
4082:           sum1 -= v1[0] * tmp0;
4083:           sum2 -= v2[0] * tmp0;
4084:         }
4085:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4086:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4087:         ibdiag += 4;
4088:         row += 2;
4089:         break;
4090:       case 3:
4091:         v2   = a->a + ii[row + 1];
4092:         v3   = a->a + ii[row + 2];
4093:         sum1 = t[row];
4094:         sum2 = t[row + 1];
4095:         sum3 = t[row + 2];
4096:         for (n = 0; n < sz - 1; n += 2) {
4097:           i1 = idx[0];
4098:           i2 = idx[1];
4099:           idx += 2;
4100:           tmp0 = t[i1];
4101:           tmp1 = t[i2];
4102:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4103:           v1 += 2;
4104:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4105:           v2 += 2;
4106:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4107:           v3 += 2;
4108:         }

4110:         if (n == sz - 1) {
4111:           tmp0 = t[*idx];
4112:           sum1 -= v1[0] * tmp0;
4113:           sum2 -= v2[0] * tmp0;
4114:           sum3 -= v3[0] * tmp0;
4115:         }
4116:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4117:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4118:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4119:         ibdiag += 9;
4120:         row += 3;
4121:         break;
4122:       case 4:
4123:         v2   = a->a + ii[row + 1];
4124:         v3   = a->a + ii[row + 2];
4125:         v4   = a->a + ii[row + 3];
4126:         sum1 = t[row];
4127:         sum2 = t[row + 1];
4128:         sum3 = t[row + 2];
4129:         sum4 = t[row + 3];
4130:         for (n = 0; n < sz - 1; n += 2) {
4131:           i1 = idx[0];
4132:           i2 = idx[1];
4133:           idx += 2;
4134:           tmp0 = t[i1];
4135:           tmp1 = t[i2];
4136:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4137:           v1 += 2;
4138:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4139:           v2 += 2;
4140:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4141:           v3 += 2;
4142:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4143:           v4 += 2;
4144:         }

4146:         if (n == sz - 1) {
4147:           tmp0 = t[*idx];
4148:           sum1 -= v1[0] * tmp0;
4149:           sum2 -= v2[0] * tmp0;
4150:           sum3 -= v3[0] * tmp0;
4151:           sum4 -= v4[0] * tmp0;
4152:         }
4153:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4154:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4155:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4156:         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4157:         ibdiag += 16;
4158:         row += 4;
4159:         break;
4160:       case 5:
4161:         v2   = a->a + ii[row + 1];
4162:         v3   = a->a + ii[row + 2];
4163:         v4   = a->a + ii[row + 3];
4164:         v5   = a->a + ii[row + 4];
4165:         sum1 = t[row];
4166:         sum2 = t[row + 1];
4167:         sum3 = t[row + 2];
4168:         sum4 = t[row + 3];
4169:         sum5 = t[row + 4];
4170:         for (n = 0; n < sz - 1; n += 2) {
4171:           i1 = idx[0];
4172:           i2 = idx[1];
4173:           idx += 2;
4174:           tmp0 = t[i1];
4175:           tmp1 = t[i2];
4176:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4177:           v1 += 2;
4178:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4179:           v2 += 2;
4180:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4181:           v3 += 2;
4182:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4183:           v4 += 2;
4184:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4185:           v5 += 2;
4186:         }

4188:         if (n == sz - 1) {
4189:           tmp0 = t[*idx];
4190:           sum1 -= v1[0] * tmp0;
4191:           sum2 -= v2[0] * tmp0;
4192:           sum3 -= v3[0] * tmp0;
4193:           sum4 -= v4[0] * tmp0;
4194:           sum5 -= v5[0] * tmp0;
4195:         }
4196:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4197:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4198:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4199:         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4200:         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4201:         ibdiag += 25;
4202:         row += 5;
4203:         break;
4204:       default:
4205:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4206:       }
4207:     }
4208:     PetscCall(PetscLogFlops(a->nz));
4209:   }
4210:   PetscCall(VecRestoreArray(xx, &x));
4211:   PetscCall(VecRestoreArrayRead(bb, &b));
4212:   PetscFunctionReturn(PETSC_SUCCESS);
4213: }

4215: static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4216: {
4217:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4218:   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4219:   const MatScalar   *bdiag = a->inode.bdiag;
4220:   const PetscScalar *b;
4221:   PetscInt           m = a->inode.node_count, cnt = 0, i, row, nodesz;
4222:   const PetscInt    *sizes = a->inode.size_csr;

4224:   PetscFunctionBegin;
4225:   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4226:   PetscCall(VecGetArray(xx, &x));
4227:   PetscCall(VecGetArrayRead(bb, &b));
4228:   cnt = 0;
4229:   for (i = 0, row = 0; i < m; i++) {
4230:     nodesz = sizes[i + 1] - sizes[i];
4231:     switch (nodesz) {
4232:     case 1:
4233:       x[row] = b[row] * bdiag[cnt++];
4234:       row++;
4235:       break;
4236:     case 2:
4237:       x1       = b[row];
4238:       x2       = b[row + 1];
4239:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4240:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4241:       x[row++] = tmp1;
4242:       x[row++] = tmp2;
4243:       cnt += 4;
4244:       break;
4245:     case 3:
4246:       x1       = b[row];
4247:       x2       = b[row + 1];
4248:       x3       = b[row + 2];
4249:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4250:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4251:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4252:       x[row++] = tmp1;
4253:       x[row++] = tmp2;
4254:       x[row++] = tmp3;
4255:       cnt += 9;
4256:       break;
4257:     case 4:
4258:       x1       = b[row];
4259:       x2       = b[row + 1];
4260:       x3       = b[row + 2];
4261:       x4       = b[row + 3];
4262:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4263:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4264:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4265:       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4266:       x[row++] = tmp1;
4267:       x[row++] = tmp2;
4268:       x[row++] = tmp3;
4269:       x[row++] = tmp4;
4270:       cnt += 16;
4271:       break;
4272:     case 5:
4273:       x1       = b[row];
4274:       x2       = b[row + 1];
4275:       x3       = b[row + 2];
4276:       x4       = b[row + 3];
4277:       x5       = b[row + 4];
4278:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4279:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4280:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4281:       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4282:       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4283:       x[row++] = tmp1;
4284:       x[row++] = tmp2;
4285:       x[row++] = tmp3;
4286:       x[row++] = tmp4;
4287:       x[row++] = tmp5;
4288:       cnt += 25;
4289:       break;
4290:     default:
4291:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4292:     }
4293:   }
4294:   PetscCall(PetscLogFlops(2.0 * cnt));
4295:   PetscCall(VecRestoreArray(xx, &x));
4296:   PetscCall(VecRestoreArrayRead(bb, &b));
4297:   PetscFunctionReturn(PETSC_SUCCESS);
4298: }

4300: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4301: {
4302:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4304:   PetscFunctionBegin;
4305:   a->inode.node_count       = 0;
4306:   a->inode.use              = PETSC_FALSE;
4307:   a->inode.checked          = PETSC_FALSE;
4308:   a->inode.mat_nonzerostate = -1;
4309:   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4310:   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4311:   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4312:   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4313:   A->ops->coloringpatch     = NULL;
4314:   A->ops->multdiagonalblock = NULL;
4315:   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4316:   PetscFunctionReturn(PETSC_SUCCESS);
4317: }

4319: /*
4320:     samestructure indicates that the matrix has not changed its nonzero structure so we
4321:     do not need to recompute the inodes
4322: */
4323: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4324: {
4325:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4326:   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4327:   PetscBool       flag;
4328:   const PetscInt *idx, *idy, *ii;

4330:   PetscFunctionBegin;
4331:   if (!a->inode.use) {
4332:     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4333:     PetscCall(PetscFree(a->inode.size_csr));
4334:     PetscFunctionReturn(PETSC_SUCCESS);
4335:   }
4336:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);

4338:   m = A->rmap->n;
4339:   if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
4340:   ns    = a->inode.size_csr;
4341:   ns[0] = 0;

4343:   i          = 0;
4344:   node_count = 0;
4345:   idx        = a->j;
4346:   ii         = a->i;
4347:   if (idx) {
4348:     while (i < m) {            /* For each row */
4349:       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4350:       /* Limits the number of elements in a node to 'a->inode.limit' */
4351:       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4352:         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4353:         if (nzy != nzx) break;
4354:         idy += nzx; /* Same nonzero pattern */
4355:         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4356:         if (!flag) break;
4357:       }
4358:       ns[node_count + 1] = ns[node_count] + blk_size;
4359:       node_count++;
4360:       idx += blk_size * nzx;
4361:       i = j;
4362:     }
4363:   }
4364:   /* If not enough inodes found,, do not use inode version of the routines */
4365:   if (!m || !idx || node_count > .8 * m) {
4366:     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4367:     PetscCall(PetscFree(a->inode.size_csr));
4368:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4369:   } else {
4370:     if (!A->factortype) {
4371:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4372:       if (A->rmap->n == A->cmap->n) {
4373:         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4374:         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4375:         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4376:         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4377:         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4378:       }
4379:     } else {
4380:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4381:     }
4382:     a->inode.node_count = node_count;
4383:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4384:   }
4385:   a->inode.checked          = PETSC_TRUE;
4386:   a->inode.mat_nonzerostate = A->nonzerostate;
4387:   PetscFunctionReturn(PETSC_SUCCESS);
4388: }

4390: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4391: {
4392:   Mat         B = *C;
4393:   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4394:   PetscInt    m = A->rmap->n;

4396:   PetscFunctionBegin;
4397:   c->inode.use              = a->inode.use;
4398:   c->inode.limit            = a->inode.limit;
4399:   c->inode.max_limit        = a->inode.max_limit;
4400:   c->inode.checked          = PETSC_FALSE;
4401:   c->inode.size_csr         = NULL;
4402:   c->inode.node_count       = 0;
4403:   c->inode.ibdiagvalid      = PETSC_FALSE;
4404:   c->inode.ibdiag           = NULL;
4405:   c->inode.bdiag            = NULL;
4406:   c->inode.mat_nonzerostate = -1;
4407:   if (a->inode.use) {
4408:     if (a->inode.checked && a->inode.size_csr) {
4409:       PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4410:       PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));

4412:       c->inode.checked          = PETSC_TRUE;
4413:       c->inode.node_count       = a->inode.node_count;
4414:       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4415:     }
4416:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4417:     if (!B->factortype) {
4418:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4419:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4420:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4421:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4422:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4423:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4424:     } else {
4425:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4426:     }
4427:   }
4428:   PetscFunctionReturn(PETSC_SUCCESS);
4429: }

4431: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4432: {
4433:   PetscInt        k;
4434:   const PetscInt *vi;

4436:   PetscFunctionBegin;
4437:   vi = aj + ai[row];
4438:   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4439:   vi        = aj + adiag[row];
4440:   cols[nzl] = vi[0];
4441:   vi        = aj + adiag[row + 1] + 1;
4442:   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4443:   PetscFunctionReturn(PETSC_SUCCESS);
4444: }
4445: /*
4446:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4447:    Modified from MatSeqAIJCheckInode().

4449:    Input Parameters:
4450: .  Mat A - ILU or LU matrix factor

4452: */
4453: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4454: {
4455:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4456:   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4457:   PetscInt       *cols1, *cols2, *ns;
4458:   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4459:   PetscBool       flag;

4461:   PetscFunctionBegin;
4462:   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4463:   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);

4465:   m = A->rmap->n;
4466:   if (a->inode.size_csr) ns = a->inode.size_csr;
4467:   else PetscCall(PetscMalloc1(m + 1, &ns));
4468:   ns[0] = 0;

4470:   i          = 0;
4471:   node_count = 0;
4472:   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4473:   while (i < m) {                       /* For each row */
4474:     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4475:     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4476:     nzx  = nzl1 + nzu1 + 1;
4477:     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));

4479:     /* Limits the number of elements in a node to 'a->inode.limit' */
4480:     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4481:       nzl2 = ai[j + 1] - ai[j];
4482:       nzu2 = adiag[j] - adiag[j + 1] - 1;
4483:       nzy  = nzl2 + nzu2 + 1;
4484:       if (nzy != nzx) break;
4485:       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4486:       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4487:       if (!flag) break;
4488:     }
4489:     ns[node_count + 1] = ns[node_count] + blk_size;
4490:     node_count++;
4491:     i = j;
4492:   }
4493:   PetscCall(PetscFree2(cols1, cols2));
4494:   /* If not enough inodes found,, do not use inode version of the routines */
4495:   if (!m || node_count > .8 * m) {
4496:     PetscCall(PetscFree(ns));

4498:     a->inode.node_count = 0;
4499:     a->inode.size_csr   = NULL;
4500:     a->inode.use        = PETSC_FALSE;

4502:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4503:   } else {
4504:     A->ops->mult              = NULL;
4505:     A->ops->sor               = NULL;
4506:     A->ops->multadd           = NULL;
4507:     A->ops->getrowij          = NULL;
4508:     A->ops->restorerowij      = NULL;
4509:     A->ops->getcolumnij       = NULL;
4510:     A->ops->restorecolumnij   = NULL;
4511:     A->ops->coloringpatch     = NULL;
4512:     A->ops->multdiagonalblock = NULL;
4513:     a->inode.node_count       = node_count;
4514:     a->inode.size_csr         = ns;
4515:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4516:   }
4517:   a->inode.checked = PETSC_TRUE;
4518:   PetscFunctionReturn(PETSC_SUCCESS);
4519: }

4521: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4522: {
4523:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4525:   PetscFunctionBegin;
4526:   a->inode.ibdiagvalid = PETSC_FALSE;
4527:   PetscFunctionReturn(PETSC_SUCCESS);
4528: }

4530: /*
4531:      This is really ugly. if inodes are used this replaces the
4532:   permutations with ones that correspond to rows/cols of the matrix
4533:   rather than inode blocks
4534: */
4535: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4536: {
4537:   PetscFunctionBegin;
4538:   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4539:   PetscFunctionReturn(PETSC_SUCCESS);
4540: }

4542: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4543: {
4544:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4545:   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4546:   const PetscInt *ridx, *cidx;
4547:   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4548:   PetscInt        nslim_col, *ns_col;
4549:   IS              ris = *rperm, cis = *cperm;

4551:   PetscFunctionBegin;
4552:   if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS);       /* no inodes so return */
4553:   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */

4555:   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4556:   PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4557:   PetscCall(PetscMalloc2(m, &permr, n, &permc));

4559:   PetscCall(ISGetIndices(ris, &ridx));
4560:   PetscCall(ISGetIndices(cis, &cidx));

4562:   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4563:   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);

4565:   /* Construct the permutations for rows*/
4566:   for (i = 0, row = 0; i < nslim_row; ++i) {
4567:     indx      = ridx[i];
4568:     start_val = tns[indx];
4569:     end_val   = tns[indx + 1];
4570:     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4571:   }

4573:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4574:   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);

4576:   /* Construct permutations for columns */
4577:   for (i = 0, col = 0; i < nslim_col; ++i) {
4578:     indx      = cidx[i];
4579:     start_val = tns[indx];
4580:     end_val   = tns[indx + 1];
4581:     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4582:   }

4584:   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4585:   PetscCall(ISSetPermutation(*rperm));
4586:   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4587:   PetscCall(ISSetPermutation(*cperm));

4589:   PetscCall(ISRestoreIndices(ris, &ridx));
4590:   PetscCall(ISRestoreIndices(cis, &cidx));

4592:   PetscCall(PetscFree(ns_col));
4593:   PetscCall(PetscFree2(permr, permc));
4594:   PetscCall(ISDestroy(&cis));
4595:   PetscCall(ISDestroy(&ris));
4596:   PetscCall(PetscFree(tns));
4597:   PetscFunctionReturn(PETSC_SUCCESS);
4598: }

4600: /*@C
4601:   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes

4603:   Not Collective

4605:   Input Parameter:
4606: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`

4608:   Output Parameters:
4609: + node_count - no of inodes present in the matrix.
4610: . sizes      - an array of size `node_count`, with the sizes of each inode.
4611: - limit      - the max size used to generate the inodes.

4613:   Level: advanced

4615:   Note:
4616:   It should be called after the matrix is assembled.
4617:   The contents of the sizes[] array should not be changed.
4618:   `NULL` may be passed for information not needed

4620: .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4621: @*/
4622: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4623: {
4624:   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);

4626:   PetscFunctionBegin;
4627:   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4628:   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4629:   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4630:   PetscFunctionReturn(PETSC_SUCCESS);
4631: }

4633: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4634: {
4635:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4637:   PetscFunctionBegin;
4638:   if (node_count) *node_count = a->inode.node_count;
4639:   if (sizes) *sizes = a->inode.size_csr;
4640:   if (limit) *limit = a->inode.limit;
4641:   PetscFunctionReturn(PETSC_SUCCESS);
4642: }