Actual source code: sbaijfact2.c
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
7: #include <../src/mat/impls/baij/seq/baij.h>
8: #include <petsc/private/kernels/blockinvert.h>
10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
11: {
12: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
13: IS isrow=a->row;
14: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
15: const PetscInt *r;
16: PetscInt nz,*vj,k,idx,k1;
17: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
18: const MatScalar *aa=a->a,*v,*diag;
19: PetscScalar *x,*xk,*xj,*xk_tmp,*t;
20: const PetscScalar *b;
22: VecGetArrayRead(bb,&b);
23: VecGetArray(xx,&x);
24: t = a->solve_work;
25: ISGetIndices(isrow,&r);
26: PetscMalloc1(bs,&xk_tmp);
28: /* solve U^T * D * y = b by forward substitution */
29: xk = t;
30: for (k=0; k<mbs; k++) { /* t <- perm(b) */
31: idx = bs*r[k];
32: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
33: }
34: for (k=0; k<mbs; k++) {
35: v = aa + bs2*ai[k];
36: xk = t + k*bs; /* Dk*xk = k-th block of x */
37: PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
38: nz = ai[k+1] - ai[k];
39: vj = aj + ai[k];
40: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
41: while (nz--) {
42: /* x(:) += U(k,:)^T*(Dk*xk) */
43: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
44: vj++; xj = t + (*vj)*bs;
45: v += bs2;
46: }
47: /* xk = inv(Dk)*(Dk*xk) */
48: diag = aa+k*bs2; /* ptr to inv(Dk) */
49: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
50: }
52: /* solve U*x = y by back substitution */
53: for (k=mbs-1; k>=0; k--) {
54: v = aa + bs2*ai[k];
55: xk = t + k*bs; /* xk */
56: nz = ai[k+1] - ai[k];
57: vj = aj + ai[k];
58: xj = t + (*vj)*bs;
59: while (nz--) {
60: /* xk += U(k,:)*x(:) */
61: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
62: vj++;
63: v += bs2; xj = t + (*vj)*bs;
64: }
65: idx = bs*r[k];
66: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
67: }
69: PetscFree(xk_tmp);
70: ISRestoreIndices(isrow,&r);
71: VecRestoreArrayRead(bb,&b);
72: VecRestoreArray(xx,&x);
73: PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
74: return 0;
75: }
77: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
78: {
79: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
80: }
82: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
83: {
84: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
85: }
87: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
88: {
89: PetscInt nz,k;
90: const PetscInt *vj,bs2 = bs*bs;
91: const MatScalar *v,*diag;
92: PetscScalar *xk,*xj,*xk_tmp;
94: PetscMalloc1(bs,&xk_tmp);
95: for (k=0; k<mbs; k++) {
96: v = aa + bs2*ai[k];
97: xk = x + k*bs; /* Dk*xk = k-th block of x */
98: PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
99: nz = ai[k+1] - ai[k];
100: vj = aj + ai[k];
101: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
102: while (nz--) {
103: /* x(:) += U(k,:)^T*(Dk*xk) */
104: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
105: vj++; xj = x + (*vj)*bs;
106: v += bs2;
107: }
108: /* xk = inv(Dk)*(Dk*xk) */
109: diag = aa+k*bs2; /* ptr to inv(Dk) */
110: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
111: }
112: PetscFree(xk_tmp);
113: return 0;
114: }
116: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
117: {
118: PetscInt nz,k;
119: const PetscInt *vj,bs2 = bs*bs;
120: const MatScalar *v;
121: PetscScalar *xk,*xj;
123: for (k=mbs-1; k>=0; k--) {
124: v = aa + bs2*ai[k];
125: xk = x + k*bs; /* xk */
126: nz = ai[k+1] - ai[k];
127: vj = aj + ai[k];
128: xj = x + (*vj)*bs;
129: while (nz--) {
130: /* xk += U(k,:)*x(:) */
131: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
132: vj++;
133: v += bs2; xj = x + (*vj)*bs;
134: }
135: }
136: return 0;
137: }
139: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
140: {
141: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
142: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
143: PetscInt bs =A->rmap->bs;
144: const MatScalar *aa=a->a;
145: PetscScalar *x;
146: const PetscScalar *b;
148: VecGetArrayRead(bb,&b);
149: VecGetArray(xx,&x);
151: /* solve U^T * D * y = b by forward substitution */
152: PetscArraycpy(x,b,bs*mbs); /* x <- b */
153: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
155: /* solve U*x = y by back substitution */
156: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
158: VecRestoreArrayRead(bb,&b);
159: VecRestoreArray(xx,&x);
160: PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
161: return 0;
162: }
164: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
165: {
166: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
167: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
168: PetscInt bs =A->rmap->bs;
169: const MatScalar *aa=a->a;
170: const PetscScalar *b;
171: PetscScalar *x;
173: VecGetArrayRead(bb,&b);
174: VecGetArray(xx,&x);
175: PetscArraycpy(x,b,bs*mbs); /* x <- b */
176: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
177: VecRestoreArrayRead(bb,&b);
178: VecRestoreArray(xx,&x);
179: PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
180: return 0;
181: }
183: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
184: {
185: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
186: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
187: PetscInt bs =A->rmap->bs;
188: const MatScalar *aa=a->a;
189: const PetscScalar *b;
190: PetscScalar *x;
192: VecGetArrayRead(bb,&b);
193: VecGetArray(xx,&x);
194: PetscArraycpy(x,b,bs*mbs);
195: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
196: VecRestoreArrayRead(bb,&b);
197: VecRestoreArray(xx,&x);
198: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
199: return 0;
200: }
202: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
203: {
204: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
205: IS isrow=a->row;
206: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
207: PetscInt nz,k,idx;
208: const MatScalar *aa=a->a,*v,*d;
209: PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
210: const PetscScalar *b;
212: VecGetArrayRead(bb,&b);
213: VecGetArray(xx,&x);
214: t = a->solve_work;
215: ISGetIndices(isrow,&r);
217: /* solve U^T * D * y = b by forward substitution */
218: tp = t;
219: for (k=0; k<mbs; k++) { /* t <- perm(b) */
220: idx = 7*r[k];
221: tp[0] = b[idx];
222: tp[1] = b[idx+1];
223: tp[2] = b[idx+2];
224: tp[3] = b[idx+3];
225: tp[4] = b[idx+4];
226: tp[5] = b[idx+5];
227: tp[6] = b[idx+6];
228: tp += 7;
229: }
231: for (k=0; k<mbs; k++) {
232: v = aa + 49*ai[k];
233: vj = aj + ai[k];
234: tp = t + k*7;
235: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
236: nz = ai[k+1] - ai[k];
237: tp = t + (*vj)*7;
238: while (nz--) {
239: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
240: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
241: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
242: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
243: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
244: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
245: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
246: vj++;
247: tp = t + (*vj)*7;
248: v += 49;
249: }
251: /* xk = inv(Dk)*(Dk*xk) */
252: d = aa+k*49; /* ptr to inv(Dk) */
253: tp = t + k*7;
254: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
255: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
256: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
257: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
258: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
259: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
260: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
261: }
263: /* solve U*x = y by back substitution */
264: for (k=mbs-1; k>=0; k--) {
265: v = aa + 49*ai[k];
266: vj = aj + ai[k];
267: tp = t + k*7;
268: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
269: nz = ai[k+1] - ai[k];
271: tp = t + (*vj)*7;
272: while (nz--) {
273: /* xk += U(k,:)*x(:) */
274: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
275: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
276: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
277: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
278: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
279: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
280: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
281: vj++;
282: tp = t + (*vj)*7;
283: v += 49;
284: }
285: tp = t + k*7;
286: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
287: idx = 7*r[k];
288: x[idx] = x0;
289: x[idx+1] = x1;
290: x[idx+2] = x2;
291: x[idx+3] = x3;
292: x[idx+4] = x4;
293: x[idx+5] = x5;
294: x[idx+6] = x6;
295: }
297: ISRestoreIndices(isrow,&r);
298: VecRestoreArrayRead(bb,&b);
299: VecRestoreArray(xx,&x);
300: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
301: return 0;
302: }
304: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
305: {
306: const MatScalar *v,*d;
307: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
308: PetscInt nz,k;
309: const PetscInt *vj;
311: for (k=0; k<mbs; k++) {
312: v = aa + 49*ai[k];
313: xp = x + k*7;
314: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
315: nz = ai[k+1] - ai[k];
316: vj = aj + ai[k];
317: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
318: PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
319: while (nz--) {
320: xp = x + (*vj)*7;
321: /* x(:) += U(k,:)^T*(Dk*xk) */
322: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
323: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
324: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
325: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
326: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
327: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
328: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
329: vj++;
330: v += 49;
331: }
332: /* xk = inv(Dk)*(Dk*xk) */
333: d = aa+k*49; /* ptr to inv(Dk) */
334: xp = x + k*7;
335: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
336: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
337: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
338: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
339: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
340: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
341: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
342: }
343: return 0;
344: }
346: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
347: {
348: const MatScalar *v;
349: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
350: PetscInt nz,k;
351: const PetscInt *vj;
353: for (k=mbs-1; k>=0; k--) {
354: v = aa + 49*ai[k];
355: xp = x + k*7;
356: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
357: nz = ai[k+1] - ai[k];
358: vj = aj + ai[k];
359: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
360: PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
361: while (nz--) {
362: xp = x + (*vj)*7;
363: /* xk += U(k,:)*x(:) */
364: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
365: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
366: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
367: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
368: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
369: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
370: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
371: vj++;
372: v += 49;
373: }
374: xp = x + k*7;
375: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
376: }
377: return 0;
378: }
380: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
381: {
382: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
383: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
384: const MatScalar *aa=a->a;
385: PetscScalar *x;
386: const PetscScalar *b;
388: VecGetArrayRead(bb,&b);
389: VecGetArray(xx,&x);
391: /* solve U^T * D * y = b by forward substitution */
392: PetscArraycpy(x,b,7*mbs); /* x <- b */
393: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
395: /* solve U*x = y by back substitution */
396: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
398: VecRestoreArrayRead(bb,&b);
399: VecRestoreArray(xx,&x);
400: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
401: return 0;
402: }
404: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
405: {
406: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
407: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
408: const MatScalar *aa=a->a;
409: PetscScalar *x;
410: const PetscScalar *b;
412: VecGetArrayRead(bb,&b);
413: VecGetArray(xx,&x);
414: PetscArraycpy(x,b,7*mbs);
415: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
416: VecRestoreArrayRead(bb,&b);
417: VecRestoreArray(xx,&x);
418: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
419: return 0;
420: }
422: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
423: {
424: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
425: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
426: const MatScalar *aa=a->a;
427: PetscScalar *x;
428: const PetscScalar *b;
430: VecGetArrayRead(bb,&b);
431: VecGetArray(xx,&x);
432: PetscArraycpy(x,b,7*mbs);
433: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
434: VecRestoreArrayRead(bb,&b);
435: VecRestoreArray(xx,&x);
436: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
437: return 0;
438: }
440: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
441: {
442: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
443: IS isrow=a->row;
444: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
445: PetscInt nz,k,idx;
446: const MatScalar *aa=a->a,*v,*d;
447: PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp;
448: const PetscScalar *b;
450: VecGetArrayRead(bb,&b);
451: VecGetArray(xx,&x);
452: t = a->solve_work;
453: ISGetIndices(isrow,&r);
455: /* solve U^T * D * y = b by forward substitution */
456: tp = t;
457: for (k=0; k<mbs; k++) { /* t <- perm(b) */
458: idx = 6*r[k];
459: tp[0] = b[idx];
460: tp[1] = b[idx+1];
461: tp[2] = b[idx+2];
462: tp[3] = b[idx+3];
463: tp[4] = b[idx+4];
464: tp[5] = b[idx+5];
465: tp += 6;
466: }
468: for (k=0; k<mbs; k++) {
469: v = aa + 36*ai[k];
470: vj = aj + ai[k];
471: tp = t + k*6;
472: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
473: nz = ai[k+1] - ai[k];
474: tp = t + (*vj)*6;
475: while (nz--) {
476: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
477: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
478: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
479: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
480: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
481: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
482: vj++;
483: tp = t + (*vj)*6;
484: v += 36;
485: }
487: /* xk = inv(Dk)*(Dk*xk) */
488: d = aa+k*36; /* ptr to inv(Dk) */
489: tp = t + k*6;
490: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
491: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
492: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
493: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
494: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
495: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
496: }
498: /* solve U*x = y by back substitution */
499: for (k=mbs-1; k>=0; k--) {
500: v = aa + 36*ai[k];
501: vj = aj + ai[k];
502: tp = t + k*6;
503: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
504: nz = ai[k+1] - ai[k];
506: tp = t + (*vj)*6;
507: while (nz--) {
508: /* xk += U(k,:)*x(:) */
509: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
510: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
511: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
512: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
513: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
514: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
515: vj++;
516: tp = t + (*vj)*6;
517: v += 36;
518: }
519: tp = t + k*6;
520: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
521: idx = 6*r[k];
522: x[idx] = x0;
523: x[idx+1] = x1;
524: x[idx+2] = x2;
525: x[idx+3] = x3;
526: x[idx+4] = x4;
527: x[idx+5] = x5;
528: }
530: ISRestoreIndices(isrow,&r);
531: VecRestoreArrayRead(bb,&b);
532: VecRestoreArray(xx,&x);
533: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
534: return 0;
535: }
537: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
538: {
539: const MatScalar *v,*d;
540: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
541: PetscInt nz,k;
542: const PetscInt *vj;
544: for (k=0; k<mbs; k++) {
545: v = aa + 36*ai[k];
546: xp = x + k*6;
547: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
548: nz = ai[k+1] - ai[k];
549: vj = aj + ai[k];
550: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
551: PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
552: while (nz--) {
553: xp = x + (*vj)*6;
554: /* x(:) += U(k,:)^T*(Dk*xk) */
555: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
556: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
557: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
558: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
559: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
560: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
561: vj++;
562: v += 36;
563: }
564: /* xk = inv(Dk)*(Dk*xk) */
565: d = aa+k*36; /* ptr to inv(Dk) */
566: xp = x + k*6;
567: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
568: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
569: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
570: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
571: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
572: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
573: }
574: return 0;
575: }
576: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
577: {
578: const MatScalar *v;
579: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
580: PetscInt nz,k;
581: const PetscInt *vj;
583: for (k=mbs-1; k>=0; k--) {
584: v = aa + 36*ai[k];
585: xp = x + k*6;
586: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
587: nz = ai[k+1] - ai[k];
588: vj = aj + ai[k];
589: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
590: PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
591: while (nz--) {
592: xp = x + (*vj)*6;
593: /* xk += U(k,:)*x(:) */
594: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
595: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
596: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
597: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
598: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
599: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
600: vj++;
601: v += 36;
602: }
603: xp = x + k*6;
604: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
605: }
606: return 0;
607: }
609: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
610: {
611: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
612: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
613: const MatScalar *aa=a->a;
614: PetscScalar *x;
615: const PetscScalar *b;
617: VecGetArrayRead(bb,&b);
618: VecGetArray(xx,&x);
620: /* solve U^T * D * y = b by forward substitution */
621: PetscArraycpy(x,b,6*mbs); /* x <- b */
622: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
624: /* solve U*x = y by back substitution */
625: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
627: VecRestoreArrayRead(bb,&b);
628: VecRestoreArray(xx,&x);
629: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
630: return 0;
631: }
633: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
634: {
635: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
636: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
637: const MatScalar *aa=a->a;
638: PetscScalar *x;
639: const PetscScalar *b;
641: VecGetArrayRead(bb,&b);
642: VecGetArray(xx,&x);
643: PetscArraycpy(x,b,6*mbs); /* x <- b */
644: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
645: VecRestoreArrayRead(bb,&b);
646: VecRestoreArray(xx,&x);
647: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
648: return 0;
649: }
651: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
652: {
653: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
654: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
655: const MatScalar *aa=a->a;
656: PetscScalar *x;
657: const PetscScalar *b;
659: VecGetArrayRead(bb,&b);
660: VecGetArray(xx,&x);
661: PetscArraycpy(x,b,6*mbs); /* x <- b */
662: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
663: VecRestoreArrayRead(bb,&b);
664: VecRestoreArray(xx,&x);
665: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
666: return 0;
667: }
669: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
670: {
671: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
672: IS isrow=a->row;
673: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
674: const PetscInt *r,*vj;
675: PetscInt nz,k,idx;
676: const MatScalar *aa=a->a,*v,*diag;
677: PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp;
678: const PetscScalar *b;
680: VecGetArrayRead(bb,&b);
681: VecGetArray(xx,&x);
682: t = a->solve_work;
683: ISGetIndices(isrow,&r);
685: /* solve U^T * D * y = b by forward substitution */
686: tp = t;
687: for (k=0; k<mbs; k++) { /* t <- perm(b) */
688: idx = 5*r[k];
689: tp[0] = b[idx];
690: tp[1] = b[idx+1];
691: tp[2] = b[idx+2];
692: tp[3] = b[idx+3];
693: tp[4] = b[idx+4];
694: tp += 5;
695: }
697: for (k=0; k<mbs; k++) {
698: v = aa + 25*ai[k];
699: vj = aj + ai[k];
700: tp = t + k*5;
701: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
702: nz = ai[k+1] - ai[k];
704: tp = t + (*vj)*5;
705: while (nz--) {
706: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
707: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
708: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
709: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
710: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
711: vj++;
712: tp = t + (*vj)*5;
713: v += 25;
714: }
716: /* xk = inv(Dk)*(Dk*xk) */
717: diag = aa+k*25; /* ptr to inv(Dk) */
718: tp = t + k*5;
719: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
720: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
721: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
722: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
723: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
724: }
726: /* solve U*x = y by back substitution */
727: for (k=mbs-1; k>=0; k--) {
728: v = aa + 25*ai[k];
729: vj = aj + ai[k];
730: tp = t + k*5;
731: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
732: nz = ai[k+1] - ai[k];
734: tp = t + (*vj)*5;
735: while (nz--) {
736: /* xk += U(k,:)*x(:) */
737: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
738: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
739: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
740: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
741: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
742: vj++;
743: tp = t + (*vj)*5;
744: v += 25;
745: }
746: tp = t + k*5;
747: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
748: idx = 5*r[k];
749: x[idx] = x0;
750: x[idx+1] = x1;
751: x[idx+2] = x2;
752: x[idx+3] = x3;
753: x[idx+4] = x4;
754: }
756: ISRestoreIndices(isrow,&r);
757: VecRestoreArrayRead(bb,&b);
758: VecRestoreArray(xx,&x);
759: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
760: return 0;
761: }
763: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
764: {
765: const MatScalar *v,*diag;
766: PetscScalar *xp,x0,x1,x2,x3,x4;
767: PetscInt nz,k;
768: const PetscInt *vj;
770: for (k=0; k<mbs; k++) {
771: v = aa + 25*ai[k];
772: xp = x + k*5;
773: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
774: nz = ai[k+1] - ai[k];
775: vj = aj + ai[k];
776: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
777: PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
778: while (nz--) {
779: xp = x + (*vj)*5;
780: /* x(:) += U(k,:)^T*(Dk*xk) */
781: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
782: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
783: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
784: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
785: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
786: vj++;
787: v += 25;
788: }
789: /* xk = inv(Dk)*(Dk*xk) */
790: diag = aa+k*25; /* ptr to inv(Dk) */
791: xp = x + k*5;
792: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
793: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
794: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
795: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
796: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
797: }
798: return 0;
799: }
801: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
802: {
803: const MatScalar *v;
804: PetscScalar *xp,x0,x1,x2,x3,x4;
805: PetscInt nz,k;
806: const PetscInt *vj;
808: for (k=mbs-1; k>=0; k--) {
809: v = aa + 25*ai[k];
810: xp = x + k*5;
811: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
812: nz = ai[k+1] - ai[k];
813: vj = aj + ai[k];
814: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
815: PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
816: while (nz--) {
817: xp = x + (*vj)*5;
818: /* xk += U(k,:)*x(:) */
819: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
820: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
821: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
822: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
823: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
824: vj++;
825: v += 25;
826: }
827: xp = x + k*5;
828: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
829: }
830: return 0;
831: }
833: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
834: {
835: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
836: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
837: const MatScalar *aa=a->a;
838: PetscScalar *x;
839: const PetscScalar *b;
841: VecGetArrayRead(bb,&b);
842: VecGetArray(xx,&x);
844: /* solve U^T * D * y = b by forward substitution */
845: PetscArraycpy(x,b,5*mbs); /* x <- b */
846: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
848: /* solve U*x = y by back substitution */
849: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
851: VecRestoreArrayRead(bb,&b);
852: VecRestoreArray(xx,&x);
853: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
854: return 0;
855: }
857: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
858: {
859: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
860: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
861: const MatScalar *aa=a->a;
862: PetscScalar *x;
863: const PetscScalar *b;
865: VecGetArrayRead(bb,&b);
866: VecGetArray(xx,&x);
867: PetscArraycpy(x,b,5*mbs); /* x <- b */
868: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
869: VecRestoreArrayRead(bb,&b);
870: VecRestoreArray(xx,&x);
871: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
872: return 0;
873: }
875: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
876: {
877: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
878: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
879: const MatScalar *aa=a->a;
880: PetscScalar *x;
881: const PetscScalar *b;
883: VecGetArrayRead(bb,&b);
884: VecGetArray(xx,&x);
885: PetscArraycpy(x,b,5*mbs);
886: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
887: VecRestoreArrayRead(bb,&b);
888: VecRestoreArray(xx,&x);
889: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
890: return 0;
891: }
893: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
894: {
895: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
896: IS isrow=a->row;
897: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
898: const PetscInt *r,*vj;
899: PetscInt nz,k,idx;
900: const MatScalar *aa=a->a,*v,*diag;
901: PetscScalar *x,x0,x1,x2,x3,*t,*tp;
902: const PetscScalar *b;
904: VecGetArrayRead(bb,&b);
905: VecGetArray(xx,&x);
906: t = a->solve_work;
907: ISGetIndices(isrow,&r);
909: /* solve U^T * D * y = b by forward substitution */
910: tp = t;
911: for (k=0; k<mbs; k++) { /* t <- perm(b) */
912: idx = 4*r[k];
913: tp[0] = b[idx];
914: tp[1] = b[idx+1];
915: tp[2] = b[idx+2];
916: tp[3] = b[idx+3];
917: tp += 4;
918: }
920: for (k=0; k<mbs; k++) {
921: v = aa + 16*ai[k];
922: vj = aj + ai[k];
923: tp = t + k*4;
924: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
925: nz = ai[k+1] - ai[k];
927: tp = t + (*vj)*4;
928: while (nz--) {
929: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
930: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
931: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
932: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
933: vj++;
934: tp = t + (*vj)*4;
935: v += 16;
936: }
938: /* xk = inv(Dk)*(Dk*xk) */
939: diag = aa+k*16; /* ptr to inv(Dk) */
940: tp = t + k*4;
941: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
942: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
943: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
944: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
945: }
947: /* solve U*x = y by back substitution */
948: for (k=mbs-1; k>=0; k--) {
949: v = aa + 16*ai[k];
950: vj = aj + ai[k];
951: tp = t + k*4;
952: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
953: nz = ai[k+1] - ai[k];
955: tp = t + (*vj)*4;
956: while (nz--) {
957: /* xk += U(k,:)*x(:) */
958: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
959: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
960: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
961: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
962: vj++;
963: tp = t + (*vj)*4;
964: v += 16;
965: }
966: tp = t + k*4;
967: tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
968: idx = 4*r[k];
969: x[idx] = x0;
970: x[idx+1] = x1;
971: x[idx+2] = x2;
972: x[idx+3] = x3;
973: }
975: ISRestoreIndices(isrow,&r);
976: VecRestoreArrayRead(bb,&b);
977: VecRestoreArray(xx,&x);
978: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
979: return 0;
980: }
982: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
983: {
984: const MatScalar *v,*diag;
985: PetscScalar *xp,x0,x1,x2,x3;
986: PetscInt nz,k;
987: const PetscInt *vj;
989: for (k=0; k<mbs; k++) {
990: v = aa + 16*ai[k];
991: xp = x + k*4;
992: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
993: nz = ai[k+1] - ai[k];
994: vj = aj + ai[k];
995: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
996: PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
997: while (nz--) {
998: xp = x + (*vj)*4;
999: /* x(:) += U(k,:)^T*(Dk*xk) */
1000: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1001: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1002: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1003: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1004: vj++;
1005: v += 16;
1006: }
1007: /* xk = inv(Dk)*(Dk*xk) */
1008: diag = aa+k*16; /* ptr to inv(Dk) */
1009: xp = x + k*4;
1010: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1011: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1012: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1013: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1014: }
1015: return 0;
1016: }
1018: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1019: {
1020: const MatScalar *v;
1021: PetscScalar *xp,x0,x1,x2,x3;
1022: PetscInt nz,k;
1023: const PetscInt *vj;
1025: for (k=mbs-1; k>=0; k--) {
1026: v = aa + 16*ai[k];
1027: xp = x + k*4;
1028: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1029: nz = ai[k+1] - ai[k];
1030: vj = aj + ai[k];
1031: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1032: PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1033: while (nz--) {
1034: xp = x + (*vj)*4;
1035: /* xk += U(k,:)*x(:) */
1036: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1037: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1038: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1039: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1040: vj++;
1041: v += 16;
1042: }
1043: xp = x + k*4;
1044: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1045: }
1046: return 0;
1047: }
1049: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1050: {
1051: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1052: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1053: const MatScalar *aa=a->a;
1054: PetscScalar *x;
1055: const PetscScalar *b;
1057: VecGetArrayRead(bb,&b);
1058: VecGetArray(xx,&x);
1060: /* solve U^T * D * y = b by forward substitution */
1061: PetscArraycpy(x,b,4*mbs); /* x <- b */
1062: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1064: /* solve U*x = y by back substitution */
1065: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1066: VecRestoreArrayRead(bb,&b);
1067: VecRestoreArray(xx,&x);
1068: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1069: return 0;
1070: }
1072: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1073: {
1074: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1075: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1076: const MatScalar *aa=a->a;
1077: PetscScalar *x;
1078: const PetscScalar *b;
1080: VecGetArrayRead(bb,&b);
1081: VecGetArray(xx,&x);
1082: PetscArraycpy(x,b,4*mbs); /* x <- b */
1083: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1084: VecRestoreArrayRead(bb,&b);
1085: VecRestoreArray(xx,&x);
1086: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1087: return 0;
1088: }
1090: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1091: {
1092: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1093: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1094: const MatScalar *aa=a->a;
1095: PetscScalar *x;
1096: const PetscScalar *b;
1098: VecGetArrayRead(bb,&b);
1099: VecGetArray(xx,&x);
1100: PetscArraycpy(x,b,4*mbs);
1101: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1102: VecRestoreArrayRead(bb,&b);
1103: VecRestoreArray(xx,&x);
1104: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1105: return 0;
1106: }
1108: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1109: {
1110: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1111: IS isrow=a->row;
1112: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1113: const PetscInt *r;
1114: PetscInt nz,k,idx;
1115: const PetscInt *vj;
1116: const MatScalar *aa=a->a,*v,*diag;
1117: PetscScalar *x,x0,x1,x2,*t,*tp;
1118: const PetscScalar *b;
1120: VecGetArrayRead(bb,&b);
1121: VecGetArray(xx,&x);
1122: t = a->solve_work;
1123: ISGetIndices(isrow,&r);
1125: /* solve U^T * D * y = b by forward substitution */
1126: tp = t;
1127: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1128: idx = 3*r[k];
1129: tp[0] = b[idx];
1130: tp[1] = b[idx+1];
1131: tp[2] = b[idx+2];
1132: tp += 3;
1133: }
1135: for (k=0; k<mbs; k++) {
1136: v = aa + 9*ai[k];
1137: vj = aj + ai[k];
1138: tp = t + k*3;
1139: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1140: nz = ai[k+1] - ai[k];
1142: tp = t + (*vj)*3;
1143: while (nz--) {
1144: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1145: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1146: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1147: vj++;
1148: tp = t + (*vj)*3;
1149: v += 9;
1150: }
1152: /* xk = inv(Dk)*(Dk*xk) */
1153: diag = aa+k*9; /* ptr to inv(Dk) */
1154: tp = t + k*3;
1155: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1156: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1157: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1158: }
1160: /* solve U*x = y by back substitution */
1161: for (k=mbs-1; k>=0; k--) {
1162: v = aa + 9*ai[k];
1163: vj = aj + ai[k];
1164: tp = t + k*3;
1165: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
1166: nz = ai[k+1] - ai[k];
1168: tp = t + (*vj)*3;
1169: while (nz--) {
1170: /* xk += U(k,:)*x(:) */
1171: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1172: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1173: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1174: vj++;
1175: tp = t + (*vj)*3;
1176: v += 9;
1177: }
1178: tp = t + k*3;
1179: tp[0] = x0; tp[1] = x1; tp[2] = x2;
1180: idx = 3*r[k];
1181: x[idx] = x0;
1182: x[idx+1] = x1;
1183: x[idx+2] = x2;
1184: }
1186: ISRestoreIndices(isrow,&r);
1187: VecRestoreArrayRead(bb,&b);
1188: VecRestoreArray(xx,&x);
1189: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1190: return 0;
1191: }
1193: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1194: {
1195: const MatScalar *v,*diag;
1196: PetscScalar *xp,x0,x1,x2;
1197: PetscInt nz,k;
1198: const PetscInt *vj;
1200: for (k=0; k<mbs; k++) {
1201: v = aa + 9*ai[k];
1202: xp = x + k*3;
1203: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1204: nz = ai[k+1] - ai[k];
1205: vj = aj + ai[k];
1206: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1207: PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1208: while (nz--) {
1209: xp = x + (*vj)*3;
1210: /* x(:) += U(k,:)^T*(Dk*xk) */
1211: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1212: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1213: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1214: vj++;
1215: v += 9;
1216: }
1217: /* xk = inv(Dk)*(Dk*xk) */
1218: diag = aa+k*9; /* ptr to inv(Dk) */
1219: xp = x + k*3;
1220: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1221: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1222: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1223: }
1224: return 0;
1225: }
1227: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1228: {
1229: const MatScalar *v;
1230: PetscScalar *xp,x0,x1,x2;
1231: PetscInt nz,k;
1232: const PetscInt *vj;
1234: for (k=mbs-1; k>=0; k--) {
1235: v = aa + 9*ai[k];
1236: xp = x + k*3;
1237: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1238: nz = ai[k+1] - ai[k];
1239: vj = aj + ai[k];
1240: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1241: PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1242: while (nz--) {
1243: xp = x + (*vj)*3;
1244: /* xk += U(k,:)*x(:) */
1245: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1246: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1247: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1248: vj++;
1249: v += 9;
1250: }
1251: xp = x + k*3;
1252: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1253: }
1254: return 0;
1255: }
1257: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1258: {
1259: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1260: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1261: const MatScalar *aa=a->a;
1262: PetscScalar *x;
1263: const PetscScalar *b;
1265: VecGetArrayRead(bb,&b);
1266: VecGetArray(xx,&x);
1268: /* solve U^T * D * y = b by forward substitution */
1269: PetscArraycpy(x,b,3*mbs);
1270: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1272: /* solve U*x = y by back substitution */
1273: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1275: VecRestoreArrayRead(bb,&b);
1276: VecRestoreArray(xx,&x);
1277: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1278: return 0;
1279: }
1281: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1282: {
1283: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1284: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1285: const MatScalar *aa=a->a;
1286: PetscScalar *x;
1287: const PetscScalar *b;
1289: VecGetArrayRead(bb,&b);
1290: VecGetArray(xx,&x);
1291: PetscArraycpy(x,b,3*mbs);
1292: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1293: VecRestoreArrayRead(bb,&b);
1294: VecRestoreArray(xx,&x);
1295: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1296: return 0;
1297: }
1299: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1300: {
1301: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1302: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1303: const MatScalar *aa=a->a;
1304: PetscScalar *x;
1305: const PetscScalar *b;
1307: VecGetArrayRead(bb,&b);
1308: VecGetArray(xx,&x);
1309: PetscArraycpy(x,b,3*mbs);
1310: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1311: VecRestoreArrayRead(bb,&b);
1312: VecRestoreArray(xx,&x);
1313: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1314: return 0;
1315: }
1317: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1318: {
1319: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1320: IS isrow=a->row;
1321: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1322: const PetscInt *r,*vj;
1323: PetscInt nz,k,k2,idx;
1324: const MatScalar *aa=a->a,*v,*diag;
1325: PetscScalar *x,x0,x1,*t;
1326: const PetscScalar *b;
1328: VecGetArrayRead(bb,&b);
1329: VecGetArray(xx,&x);
1330: t = a->solve_work;
1331: ISGetIndices(isrow,&r);
1333: /* solve U^T * D * y = perm(b) by forward substitution */
1334: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1335: idx = 2*r[k];
1336: t[k*2] = b[idx];
1337: t[k*2+1] = b[idx+1];
1338: }
1339: for (k=0; k<mbs; k++) {
1340: v = aa + 4*ai[k];
1341: vj = aj + ai[k];
1342: k2 = k*2;
1343: x0 = t[k2]; x1 = t[k2+1];
1344: nz = ai[k+1] - ai[k];
1345: while (nz--) {
1346: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1347: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1348: vj++; v += 4;
1349: }
1350: diag = aa+k*4; /* ptr to inv(Dk) */
1351: t[k2] = diag[0]*x0 + diag[2]*x1;
1352: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1353: }
1355: /* solve U*x = y by back substitution */
1356: for (k=mbs-1; k>=0; k--) {
1357: v = aa + 4*ai[k];
1358: vj = aj + ai[k];
1359: k2 = k*2;
1360: x0 = t[k2]; x1 = t[k2+1];
1361: nz = ai[k+1] - ai[k];
1362: while (nz--) {
1363: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1364: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1365: vj++;
1366: v += 4;
1367: }
1368: t[k2] = x0;
1369: t[k2+1] = x1;
1370: idx = 2*r[k];
1371: x[idx] = x0;
1372: x[idx+1] = x1;
1373: }
1375: ISRestoreIndices(isrow,&r);
1376: VecRestoreArrayRead(bb,&b);
1377: VecRestoreArray(xx,&x);
1378: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1379: return 0;
1380: }
1382: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1383: {
1384: const MatScalar *v,*diag;
1385: PetscScalar x0,x1;
1386: PetscInt nz,k,k2;
1387: const PetscInt *vj;
1389: for (k=0; k<mbs; k++) {
1390: v = aa + 4*ai[k];
1391: vj = aj + ai[k];
1392: k2 = k*2;
1393: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1394: nz = ai[k+1] - ai[k];
1395: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1396: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1397: while (nz--) {
1398: /* x(:) += U(k,:)^T*(Dk*xk) */
1399: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1400: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1401: vj++; v += 4;
1402: }
1403: /* xk = inv(Dk)*(Dk*xk) */
1404: diag = aa+k*4; /* ptr to inv(Dk) */
1405: x[k2] = diag[0]*x0 + diag[2]*x1;
1406: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1407: }
1408: return 0;
1409: }
1411: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1412: {
1413: const MatScalar *v;
1414: PetscScalar x0,x1;
1415: PetscInt nz,k,k2;
1416: const PetscInt *vj;
1418: for (k=mbs-1; k>=0; k--) {
1419: v = aa + 4*ai[k];
1420: vj = aj + ai[k];
1421: k2 = k*2;
1422: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1423: nz = ai[k+1] - ai[k];
1424: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1425: PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1426: while (nz--) {
1427: /* xk += U(k,:)*x(:) */
1428: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1429: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1430: vj++;
1431: v += 4;
1432: }
1433: x[k2] = x0;
1434: x[k2+1] = x1;
1435: }
1436: return 0;
1437: }
1439: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1440: {
1441: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1442: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1443: const MatScalar *aa=a->a;
1444: PetscScalar *x;
1445: const PetscScalar *b;
1447: VecGetArrayRead(bb,&b);
1448: VecGetArray(xx,&x);
1450: /* solve U^T * D * y = b by forward substitution */
1451: PetscArraycpy(x,b,2*mbs);
1452: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1454: /* solve U*x = y by back substitution */
1455: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1457: VecRestoreArrayRead(bb,&b);
1458: VecRestoreArray(xx,&x);
1459: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1460: return 0;
1461: }
1463: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1464: {
1465: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1466: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1467: const MatScalar *aa=a->a;
1468: PetscScalar *x;
1469: const PetscScalar *b;
1471: VecGetArrayRead(bb,&b);
1472: VecGetArray(xx,&x);
1473: PetscArraycpy(x,b,2*mbs);
1474: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1475: VecRestoreArrayRead(bb,&b);
1476: VecRestoreArray(xx,&x);
1477: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1478: return 0;
1479: }
1481: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1482: {
1483: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1484: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1485: const MatScalar *aa=a->a;
1486: PetscScalar *x;
1487: const PetscScalar *b;
1489: VecGetArrayRead(bb,&b);
1490: VecGetArray(xx,&x);
1491: PetscArraycpy(x,b,2*mbs);
1492: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1493: VecRestoreArrayRead(bb,&b);
1494: VecRestoreArray(xx,&x);
1495: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1496: return 0;
1497: }
1499: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1500: {
1501: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1502: IS isrow=a->row;
1503: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1504: const MatScalar *aa=a->a,*v;
1505: const PetscScalar *b;
1506: PetscScalar *x,xk,*t;
1507: PetscInt nz,k,j;
1509: VecGetArrayRead(bb,&b);
1510: VecGetArray(xx,&x);
1511: t = a->solve_work;
1512: ISGetIndices(isrow,&rp);
1514: /* solve U^T*D*y = perm(b) by forward substitution */
1515: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1516: for (k=0; k<mbs; k++) {
1517: v = aa + ai[k];
1518: vj = aj + ai[k];
1519: xk = t[k];
1520: nz = ai[k+1] - ai[k] - 1;
1521: for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1522: t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */
1523: }
1525: /* solve U*perm(x) = y by back substitution */
1526: for (k=mbs-1; k>=0; k--) {
1527: v = aa + adiag[k] - 1;
1528: vj = aj + adiag[k] - 1;
1529: nz = ai[k+1] - ai[k] - 1;
1530: for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1531: x[rp[k]] = t[k];
1532: }
1534: ISRestoreIndices(isrow,&rp);
1535: VecRestoreArrayRead(bb,&b);
1536: VecRestoreArray(xx,&x);
1537: PetscLogFlops(4.0*a->nz - 3.0*mbs);
1538: return 0;
1539: }
1541: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1542: {
1543: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1544: IS isrow=a->row;
1545: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1546: const MatScalar *aa=a->a,*v;
1547: PetscScalar *x,xk,*t;
1548: const PetscScalar *b;
1549: PetscInt nz,k;
1551: VecGetArrayRead(bb,&b);
1552: VecGetArray(xx,&x);
1553: t = a->solve_work;
1554: ISGetIndices(isrow,&rp);
1556: /* solve U^T*D*y = perm(b) by forward substitution */
1557: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1558: for (k=0; k<mbs; k++) {
1559: v = aa + ai[k] + 1;
1560: vj = aj + ai[k] + 1;
1561: xk = t[k];
1562: nz = ai[k+1] - ai[k] - 1;
1563: while (nz--) t[*vj++] += (*v++) * xk;
1564: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1565: }
1567: /* solve U*perm(x) = y by back substitution */
1568: for (k=mbs-1; k>=0; k--) {
1569: v = aa + ai[k] + 1;
1570: vj = aj + ai[k] + 1;
1571: nz = ai[k+1] - ai[k] - 1;
1572: while (nz--) t[k] += (*v++) * t[*vj++];
1573: x[rp[k]] = t[k];
1574: }
1576: ISRestoreIndices(isrow,&rp);
1577: VecRestoreArrayRead(bb,&b);
1578: VecRestoreArray(xx,&x);
1579: PetscLogFlops(4.0*a->nz - 3*mbs);
1580: return 0;
1581: }
1583: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1584: {
1585: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1586: IS isrow=a->row;
1587: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1588: const MatScalar *aa=a->a,*v;
1589: PetscReal diagk;
1590: PetscScalar *x,xk;
1591: const PetscScalar *b;
1592: PetscInt nz,k;
1594: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1595: VecGetArrayRead(bb,&b);
1596: VecGetArray(xx,&x);
1597: ISGetIndices(isrow,&rp);
1599: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1600: for (k=0; k<mbs; k++) {
1601: v = aa + ai[k];
1602: vj = aj + ai[k];
1603: xk = x[k];
1604: nz = ai[k+1] - ai[k] - 1;
1605: while (nz--) x[*vj++] += (*v++) * xk;
1607: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1609: x[k] = xk*PetscSqrtReal(diagk);
1610: }
1611: ISRestoreIndices(isrow,&rp);
1612: VecRestoreArrayRead(bb,&b);
1613: VecRestoreArray(xx,&x);
1614: PetscLogFlops(2.0*a->nz - mbs);
1615: return 0;
1616: }
1618: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1619: {
1620: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1621: IS isrow=a->row;
1622: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1623: const MatScalar *aa=a->a,*v;
1624: PetscReal diagk;
1625: PetscScalar *x,xk;
1626: const PetscScalar *b;
1627: PetscInt nz,k;
1629: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1630: VecGetArrayRead(bb,&b);
1631: VecGetArray(xx,&x);
1632: ISGetIndices(isrow,&rp);
1634: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1635: for (k=0; k<mbs; k++) {
1636: v = aa + ai[k] + 1;
1637: vj = aj + ai[k] + 1;
1638: xk = x[k];
1639: nz = ai[k+1] - ai[k] - 1;
1640: while (nz--) x[*vj++] += (*v++) * xk;
1642: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1644: x[k] = xk*PetscSqrtReal(diagk);
1645: }
1646: ISRestoreIndices(isrow,&rp);
1647: VecRestoreArrayRead(bb,&b);
1648: VecRestoreArray(xx,&x);
1649: PetscLogFlops(2.0*a->nz - mbs);
1650: return 0;
1651: }
1653: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1654: {
1655: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1656: IS isrow=a->row;
1657: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1658: const MatScalar *aa=a->a,*v;
1659: PetscReal diagk;
1660: PetscScalar *x,*t;
1661: const PetscScalar *b;
1662: PetscInt nz,k;
1664: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1665: VecGetArrayRead(bb,&b);
1666: VecGetArray(xx,&x);
1667: t = a->solve_work;
1668: ISGetIndices(isrow,&rp);
1670: for (k=mbs-1; k>=0; k--) {
1671: v = aa + ai[k];
1672: vj = aj + ai[k];
1673: diagk = PetscRealPart(aa[adiag[k]]);
1675: t[k] = b[k] * PetscSqrtReal(diagk);
1676: nz = ai[k+1] - ai[k] - 1;
1677: while (nz--) t[k] += (*v++) * t[*vj++];
1678: x[rp[k]] = t[k];
1679: }
1680: ISRestoreIndices(isrow,&rp);
1681: VecRestoreArrayRead(bb,&b);
1682: VecRestoreArray(xx,&x);
1683: PetscLogFlops(2.0*a->nz - mbs);
1684: return 0;
1685: }
1687: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1688: {
1689: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1690: IS isrow=a->row;
1691: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1692: const MatScalar *aa=a->a,*v;
1693: PetscReal diagk;
1694: PetscScalar *x,*t;
1695: const PetscScalar *b;
1696: PetscInt nz,k;
1698: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1699: VecGetArrayRead(bb,&b);
1700: VecGetArray(xx,&x);
1701: t = a->solve_work;
1702: ISGetIndices(isrow,&rp);
1704: for (k=mbs-1; k>=0; k--) {
1705: v = aa + ai[k] + 1;
1706: vj = aj + ai[k] + 1;
1707: diagk = PetscRealPart(aa[ai[k]]);
1709: t[k] = b[k] * PetscSqrtReal(diagk);
1710: nz = ai[k+1] - ai[k] - 1;
1711: while (nz--) t[k] += (*v++) * t[*vj++];
1712: x[rp[k]] = t[k];
1713: }
1714: ISRestoreIndices(isrow,&rp);
1715: VecRestoreArrayRead(bb,&b);
1716: VecRestoreArray(xx,&x);
1717: PetscLogFlops(2.0*a->nz - mbs);
1718: return 0;
1719: }
1721: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1722: {
1723: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1725: if (A->rmap->bs == 1) {
1726: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1727: } else {
1728: IS isrow=a->row;
1729: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1730: const MatScalar *aa=a->a,*v;
1731: PetscScalar *x,*t;
1732: const PetscScalar *b;
1733: PetscInt nz,k,n,i,j;
1735: if (bb->n > a->solves_work_n) {
1736: PetscFree(a->solves_work);
1737: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1738: a->solves_work_n = bb->n;
1739: }
1740: n = bb->n;
1741: VecGetArrayRead(bb->v,&b);
1742: VecGetArray(xx->v,&x);
1743: t = a->solves_work;
1745: ISGetIndices(isrow,&rp);
1747: /* solve U^T*D*y = perm(b) by forward substitution */
1748: for (k=0; k<mbs; k++) {
1749: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1750: }
1751: for (k=0; k<mbs; k++) {
1752: v = aa + ai[k];
1753: vj = aj + ai[k];
1754: nz = ai[k+1] - ai[k] - 1;
1755: for (j=0; j<nz; j++) {
1756: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1757: v++;vj++;
1758: }
1759: for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1760: }
1762: /* solve U*perm(x) = y by back substitution */
1763: for (k=mbs-1; k>=0; k--) {
1764: v = aa + ai[k] - 1;
1765: vj = aj + ai[k] - 1;
1766: nz = ai[k+1] - ai[k] - 1;
1767: for (j=0; j<nz; j++) {
1768: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1769: v++;vj++;
1770: }
1771: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1772: }
1774: ISRestoreIndices(isrow,&rp);
1775: VecRestoreArrayRead(bb->v,&b);
1776: VecRestoreArray(xx->v,&x);
1777: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1778: }
1779: return 0;
1780: }
1782: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1783: {
1784: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1786: if (A->rmap->bs == 1) {
1787: MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1788: } else {
1789: IS isrow=a->row;
1790: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1791: const MatScalar *aa=a->a,*v;
1792: PetscScalar *x,*t;
1793: const PetscScalar *b;
1794: PetscInt nz,k,n,i;
1796: if (bb->n > a->solves_work_n) {
1797: PetscFree(a->solves_work);
1798: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1799: a->solves_work_n = bb->n;
1800: }
1801: n = bb->n;
1802: VecGetArrayRead(bb->v,&b);
1803: VecGetArray(xx->v,&x);
1804: t = a->solves_work;
1806: ISGetIndices(isrow,&rp);
1808: /* solve U^T*D*y = perm(b) by forward substitution */
1809: for (k=0; k<mbs; k++) {
1810: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1811: }
1812: for (k=0; k<mbs; k++) {
1813: v = aa + ai[k];
1814: vj = aj + ai[k];
1815: nz = ai[k+1] - ai[k];
1816: while (nz--) {
1817: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1818: v++;vj++;
1819: }
1820: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1821: }
1823: /* solve U*perm(x) = y by back substitution */
1824: for (k=mbs-1; k>=0; k--) {
1825: v = aa + ai[k];
1826: vj = aj + ai[k];
1827: nz = ai[k+1] - ai[k];
1828: while (nz--) {
1829: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1830: v++;vj++;
1831: }
1832: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1833: }
1835: ISRestoreIndices(isrow,&rp);
1836: VecRestoreArrayRead(bb->v,&b);
1837: VecRestoreArray(xx->v,&x);
1838: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1839: }
1840: return 0;
1841: }
1843: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1844: {
1845: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1846: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1847: const MatScalar *aa=a->a,*v;
1848: const PetscScalar *b;
1849: PetscScalar *x,xi;
1850: PetscInt nz,i,j;
1852: VecGetArrayRead(bb,&b);
1853: VecGetArray(xx,&x);
1854: /* solve U^T*D*y = b by forward substitution */
1855: PetscArraycpy(x,b,mbs);
1856: for (i=0; i<mbs; i++) {
1857: v = aa + ai[i];
1858: vj = aj + ai[i];
1859: xi = x[i];
1860: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1861: for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1862: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
1863: }
1864: /* solve U*x = y by backward substitution */
1865: for (i=mbs-2; i>=0; i--) {
1866: xi = x[i];
1867: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
1868: vj = aj + adiag[i] - 1;
1869: nz = ai[i+1] - ai[i] - 1;
1870: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1871: x[i] = xi;
1872: }
1873: VecRestoreArrayRead(bb,&b);
1874: VecRestoreArray(xx,&x);
1875: PetscLogFlops(4.0*a->nz - 3*mbs);
1876: return 0;
1877: }
1879: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1880: {
1881: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1882: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1883: const MatScalar *aa=a->a,*v;
1884: const PetscScalar *b;
1885: PetscScalar *x,xi;
1886: PetscInt nz,i,j,neq,ldb,ldx;
1887: PetscBool isdense;
1889: if (!mbs) return 0;
1890: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);
1892: if (X != B) {
1893: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);
1895: }
1896: MatDenseGetArrayRead(B,&b);
1897: MatDenseGetLDA(B,&ldb);
1898: MatDenseGetArray(X,&x);
1899: MatDenseGetLDA(X,&ldx);
1900: for (neq=0; neq<B->cmap->n; neq++) {
1901: /* solve U^T*D*y = b by forward substitution */
1902: PetscArraycpy(x,b,mbs);
1903: for (i=0; i<mbs; i++) {
1904: v = aa + ai[i];
1905: vj = aj + ai[i];
1906: xi = x[i];
1907: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1908: for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1909: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
1910: }
1911: /* solve U*x = y by backward substitution */
1912: for (i=mbs-2; i>=0; i--) {
1913: xi = x[i];
1914: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
1915: vj = aj + adiag[i] - 1;
1916: nz = ai[i+1] - ai[i] - 1;
1917: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1918: x[i] = xi;
1919: }
1920: b += ldb;
1921: x += ldx;
1922: }
1923: MatDenseRestoreArrayRead(B,&b);
1924: MatDenseRestoreArray(X,&x);
1925: PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));
1926: return 0;
1927: }
1929: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1930: {
1931: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1932: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
1933: const MatScalar *aa=a->a,*v;
1934: PetscScalar *x,xk;
1935: const PetscScalar *b;
1936: PetscInt nz,k;
1938: VecGetArrayRead(bb,&b);
1939: VecGetArray(xx,&x);
1941: /* solve U^T*D*y = b by forward substitution */
1942: PetscArraycpy(x,b,mbs);
1943: for (k=0; k<mbs; k++) {
1944: v = aa + ai[k] + 1;
1945: vj = aj + ai[k] + 1;
1946: xk = x[k];
1947: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
1948: while (nz--) x[*vj++] += (*v++) * xk;
1949: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
1950: }
1952: /* solve U*x = y by back substitution */
1953: for (k=mbs-2; k>=0; k--) {
1954: v = aa + ai[k] + 1;
1955: vj = aj + ai[k] + 1;
1956: xk = x[k];
1957: nz = ai[k+1] - ai[k] - 1;
1958: while (nz--) {
1959: xk += (*v++) * x[*vj++];
1960: }
1961: x[k] = xk;
1962: }
1964: VecRestoreArrayRead(bb,&b);
1965: VecRestoreArray(xx,&x);
1966: PetscLogFlops(4.0*a->nz - 3*mbs);
1967: return 0;
1968: }
1970: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1971: {
1972: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1973: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
1974: const MatScalar *aa=a->a,*v;
1975: PetscReal diagk;
1976: PetscScalar *x;
1977: const PetscScalar *b;
1978: PetscInt nz,k;
1980: /* solve U^T*D^(1/2)*x = b by forward substitution */
1981: VecGetArrayRead(bb,&b);
1982: VecGetArray(xx,&x);
1983: PetscArraycpy(x,b,mbs);
1984: for (k=0; k<mbs; k++) {
1985: v = aa + ai[k];
1986: vj = aj + ai[k];
1987: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
1988: while (nz--) x[*vj++] += (*v++) * x[k];
1989: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
1991: x[k] *= PetscSqrtReal(diagk);
1992: }
1993: VecRestoreArrayRead(bb,&b);
1994: VecRestoreArray(xx,&x);
1995: PetscLogFlops(2.0*a->nz - mbs);
1996: return 0;
1997: }
1999: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2000: {
2001: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2002: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2003: const MatScalar *aa=a->a,*v;
2004: PetscReal diagk;
2005: PetscScalar *x;
2006: const PetscScalar *b;
2007: PetscInt nz,k;
2009: /* solve U^T*D^(1/2)*x = b by forward substitution */
2010: VecGetArrayRead(bb,&b);
2011: VecGetArray(xx,&x);
2012: PetscArraycpy(x,b,mbs);
2013: for (k=0; k<mbs; k++) {
2014: v = aa + ai[k] + 1;
2015: vj = aj + ai[k] + 1;
2016: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2017: while (nz--) x[*vj++] += (*v++) * x[k];
2018: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2020: x[k] *= PetscSqrtReal(diagk);
2021: }
2022: VecRestoreArrayRead(bb,&b);
2023: VecRestoreArray(xx,&x);
2024: PetscLogFlops(2.0*a->nz - mbs);
2025: return 0;
2026: }
2028: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2029: {
2030: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2031: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2032: const MatScalar *aa=a->a,*v;
2033: PetscReal diagk;
2034: PetscScalar *x;
2035: const PetscScalar *b;
2036: PetscInt nz,k;
2038: /* solve D^(1/2)*U*x = b by back substitution */
2039: VecGetArrayRead(bb,&b);
2040: VecGetArray(xx,&x);
2042: for (k=mbs-1; k>=0; k--) {
2043: v = aa + ai[k];
2044: vj = aj + ai[k];
2045: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2047: x[k] = PetscSqrtReal(diagk)*b[k];
2048: nz = ai[k+1] - ai[k] - 1;
2049: while (nz--) x[k] += (*v++) * x[*vj++];
2050: }
2051: VecRestoreArrayRead(bb,&b);
2052: VecRestoreArray(xx,&x);
2053: PetscLogFlops(2.0*a->nz - mbs);
2054: return 0;
2055: }
2057: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2058: {
2059: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2060: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2061: const MatScalar *aa=a->a,*v;
2062: PetscReal diagk;
2063: PetscScalar *x;
2064: const PetscScalar *b;
2065: PetscInt nz,k;
2067: /* solve D^(1/2)*U*x = b by back substitution */
2068: VecGetArrayRead(bb,&b);
2069: VecGetArray(xx,&x);
2071: for (k=mbs-1; k>=0; k--) {
2072: v = aa + ai[k] + 1;
2073: vj = aj + ai[k] + 1;
2074: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2076: x[k] = PetscSqrtReal(diagk)*b[k];
2077: nz = ai[k+1] - ai[k] - 1;
2078: while (nz--) x[k] += (*v++) * x[*vj++];
2079: }
2080: VecRestoreArrayRead(bb,&b);
2081: VecRestoreArray(xx,&x);
2082: PetscLogFlops(2.0*a->nz - mbs);
2083: return 0;
2084: }
2086: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2087: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2088: {
2089: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2090: const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2091: PetscInt *jutmp,bs = A->rmap->bs,i;
2092: PetscInt m,reallocs = 0,*levtmp;
2093: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2094: PetscInt incrlev,*lev,shift,prow,nz;
2095: PetscReal f = info->fill,levels = info->levels;
2096: PetscBool perm_identity;
2098: /* check whether perm is the identity mapping */
2099: ISIdentity(perm,&perm_identity);
2101: if (perm_identity) {
2102: a->permute = PETSC_FALSE;
2103: ai = a->i; aj = a->j;
2104: } else { /* non-trivial permutation */
2105: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2106: }
2108: /* initialization */
2109: ISGetIndices(perm,&rip);
2110: umax = (PetscInt)(f*ai[mbs] + 1);
2111: PetscMalloc1(umax,&lev);
2112: umax += mbs + 1;
2113: shift = mbs + 1;
2114: PetscMalloc1(mbs+1,&iu);
2115: PetscMalloc1(umax,&ju);
2116: iu[0] = mbs + 1;
2117: juidx = mbs + 1;
2118: /* prowl: linked list for pivot row */
2119: PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2120: /* q: linked list for col index */
2122: for (i=0; i<mbs; i++) {
2123: prowl[i] = mbs;
2124: q[i] = 0;
2125: }
2127: /* for each row k */
2128: for (k=0; k<mbs; k++) {
2129: nzk = 0;
2130: q[k] = mbs;
2131: /* copy current row into linked list */
2132: nz = ai[rip[k]+1] - ai[rip[k]];
2133: j = ai[rip[k]];
2134: while (nz--) {
2135: vj = rip[aj[j++]];
2136: if (vj > k) {
2137: qm = k;
2138: do {
2139: m = qm; qm = q[m];
2140: } while (qm < vj);
2142: nzk++;
2143: q[m] = vj;
2144: q[vj] = qm;
2145: levtmp[vj] = 0; /* initialize lev for nonzero element */
2146: }
2147: }
2149: /* modify nonzero structure of k-th row by computing fill-in
2150: for each row prow to be merged in */
2151: prow = k;
2152: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2154: while (prow < k) {
2155: /* merge row prow into k-th row */
2156: jmin = iu[prow] + 1;
2157: jmax = iu[prow+1];
2158: qm = k;
2159: for (j=jmin; j<jmax; j++) {
2160: incrlev = lev[j-shift] + 1;
2161: if (incrlev > levels) continue;
2162: vj = ju[j];
2163: do {
2164: m = qm; qm = q[m];
2165: } while (qm < vj);
2166: if (qm != vj) { /* a fill */
2167: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2168: levtmp[vj] = incrlev;
2169: } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2170: }
2171: prow = prowl[prow]; /* next pivot row */
2172: }
2174: /* add k to row list for first nonzero element in k-th row */
2175: if (nzk > 1) {
2176: i = q[k]; /* col value of first nonzero element in k_th row of U */
2177: prowl[k] = prowl[i]; prowl[i] = k;
2178: }
2179: iu[k+1] = iu[k] + nzk;
2181: /* allocate more space to ju and lev if needed */
2182: if (iu[k+1] > umax) {
2183: /* estimate how much additional space we will need */
2184: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2185: /* just double the memory each time */
2186: maxadd = umax;
2187: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2188: umax += maxadd;
2190: /* allocate a longer ju */
2191: PetscMalloc1(umax,&jutmp);
2192: PetscArraycpy(jutmp,ju,iu[k]);
2193: PetscFree(ju);
2194: ju = jutmp;
2196: PetscMalloc1(umax,&jutmp);
2197: PetscArraycpy(jutmp,lev,iu[k]-shift);
2198: PetscFree(lev);
2199: lev = jutmp;
2200: reallocs += 2; /* count how many times we realloc */
2201: }
2203: /* save nonzero structure of k-th row in ju */
2204: i=k;
2205: while (nzk--) {
2206: i = q[i];
2207: ju[juidx] = i;
2208: lev[juidx-shift] = levtmp[i];
2209: juidx++;
2210: }
2211: }
2213: #if defined(PETSC_USE_INFO)
2214: if (ai[mbs] != 0) {
2215: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2216: PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2217: PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2218: PetscInfo(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2219: PetscInfo(A,"for best performance.\n");
2220: } else {
2221: PetscInfo(A,"Empty matrix\n");
2222: }
2223: #endif
2225: ISRestoreIndices(perm,&rip);
2226: PetscFree3(prowl,q,levtmp);
2227: PetscFree(lev);
2229: /* put together the new matrix */
2230: MatSeqSBAIJSetPreallocation(B,bs,0,NULL);
2232: /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2233: b = (Mat_SeqSBAIJ*)(B)->data;
2234: PetscFree2(b->imax,b->ilen);
2236: b->singlemalloc = PETSC_FALSE;
2237: b->free_a = PETSC_TRUE;
2238: b->free_ij = PETSC_TRUE;
2239: /* the next line frees the default space generated by the Create() */
2240: PetscFree3(b->a,b->j,b->i);
2241: PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2242: b->j = ju;
2243: b->i = iu;
2244: b->diag = NULL;
2245: b->ilen = NULL;
2246: b->imax = NULL;
2248: ISDestroy(&b->row);
2249: ISDestroy(&b->icol);
2250: b->row = perm;
2251: b->icol = perm;
2252: PetscObjectReference((PetscObject)perm);
2253: PetscObjectReference((PetscObject)perm);
2254: PetscMalloc1(bs*mbs+bs,&b->solve_work);
2255: /* In b structure: Free imax, ilen, old a, old j.
2256: Allocate idnew, solve_work, new a, new j */
2257: PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2258: b->maxnz = b->nz = iu[mbs];
2260: (B)->info.factor_mallocs = reallocs;
2261: (B)->info.fill_ratio_given = f;
2262: if (ai[mbs] != 0) {
2263: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2264: } else {
2265: (B)->info.fill_ratio_needed = 0.0;
2266: }
2267: MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2268: return 0;
2269: }
2271: /*
2272: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2273: */
2274: #include <petscbt.h>
2275: #include <../src/mat/utils/freespace.h>
2276: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2277: {
2278: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2279: PetscBool perm_identity,free_ij = PETSC_TRUE,missing;
2280: PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2281: const PetscInt *rip;
2282: PetscInt reallocs=0,i,*ui,*udiag,*cols;
2283: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2284: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2285: PetscReal fill =info->fill,levels=info->levels;
2286: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2287: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2288: PetscBT lnkbt;
2291: MatMissingDiagonal(A,&missing,&d);
2293: if (bs > 1) {
2294: MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2295: return 0;
2296: }
2298: /* check whether perm is the identity mapping */
2299: ISIdentity(perm,&perm_identity);
2301: a->permute = PETSC_FALSE;
2303: PetscMalloc1(am+1,&ui);
2304: PetscMalloc1(am+1,&udiag);
2305: ui[0] = 0;
2307: /* ICC(0) without matrix ordering: simply rearrange column indices */
2308: if (!levels) {
2309: /* reuse the column pointers and row offsets for memory savings */
2310: for (i=0; i<am; i++) {
2311: ncols = ai[i+1] - ai[i];
2312: ui[i+1] = ui[i] + ncols;
2313: udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2314: }
2315: PetscMalloc1(ui[am]+1,&uj);
2316: cols = uj;
2317: for (i=0; i<am; i++) {
2318: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2319: ncols = ai[i+1] - ai[i] -1;
2320: for (j=0; j<ncols; j++) *cols++ = aj[j];
2321: *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2322: }
2323: } else { /* case: levels>0 */
2324: ISGetIndices(perm,&rip);
2326: /* initialization */
2327: /* jl: linked list for storing indices of the pivot rows
2328: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2329: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2330: for (i=0; i<am; i++) {
2331: jl[i] = am; il[i] = 0;
2332: }
2334: /* create and initialize a linked list for storing column indices of the active row k */
2335: nlnk = am + 1;
2336: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2338: /* initial FreeSpace size is fill*(ai[am]+1) */
2339: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2341: current_space = free_space;
2343: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2345: current_space_lvl = free_space_lvl;
2347: for (k=0; k<am; k++) { /* for each active row k */
2348: /* initialize lnk by the column indices of row k */
2349: nzk = 0;
2350: ncols = ai[k+1] - ai[k];
2352: cols = aj+ai[k];
2353: PetscIncompleteLLInit(ncols,cols,am,rip,&nlnk,lnk,lnk_lvl,lnkbt);
2354: nzk += nlnk;
2356: /* update lnk by computing fill-in for each pivot row to be merged in */
2357: prow = jl[k]; /* 1st pivot row */
2359: while (prow < k) {
2360: nextprow = jl[prow];
2362: /* merge prow into k-th row */
2363: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2364: jmax = ui[prow+1];
2365: ncols = jmax-jmin;
2366: i = jmin - ui[prow];
2368: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2369: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2370: j = *(uj - 1);
2371: PetscICCLLAddSorted(ncols,cols,levels,uj,am,&nlnk,lnk,lnk_lvl,lnkbt,j);
2372: nzk += nlnk;
2374: /* update il and jl for prow */
2375: if (jmin < jmax) {
2376: il[prow] = jmin;
2378: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2379: }
2380: prow = nextprow;
2381: }
2383: /* if free space is not available, make more free space */
2384: if (current_space->local_remaining<nzk) {
2385: i = am - k + 1; /* num of unfactored rows */
2386: i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2387: PetscFreeSpaceGet(i,¤t_space);
2388: PetscFreeSpaceGet(i,¤t_space_lvl);
2389: reallocs++;
2390: }
2392: /* copy data into free_space and free_space_lvl, then initialize lnk */
2394: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2396: /* add the k-th row into il and jl */
2397: if (nzk > 1) {
2398: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2399: jl[k] = jl[i]; jl[i] = k;
2400: il[k] = ui[k] + 1;
2401: }
2402: uj_ptr[k] = current_space->array;
2403: uj_lvl_ptr[k] = current_space_lvl->array;
2405: current_space->array += nzk;
2406: current_space->local_used += nzk;
2407: current_space->local_remaining -= nzk;
2408: current_space_lvl->array += nzk;
2409: current_space_lvl->local_used += nzk;
2410: current_space_lvl->local_remaining -= nzk;
2412: ui[k+1] = ui[k] + nzk;
2413: }
2415: ISRestoreIndices(perm,&rip);
2416: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2418: /* destroy list of free space and other temporary array(s) */
2419: PetscMalloc1(ui[am]+1,&uj);
2420: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor */
2421: PetscIncompleteLLDestroy(lnk,lnkbt);
2422: PetscFreeSpaceDestroy(free_space_lvl);
2424: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2426: /* put together the new matrix in MATSEQSBAIJ format */
2427: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2429: b = (Mat_SeqSBAIJ*)(fact)->data;
2430: PetscFree2(b->imax,b->ilen);
2432: b->singlemalloc = PETSC_FALSE;
2433: b->free_a = PETSC_TRUE;
2434: b->free_ij = free_ij;
2436: PetscMalloc1(ui[am]+1,&b->a);
2438: b->j = uj;
2439: b->i = ui;
2440: b->diag = udiag;
2441: b->free_diag = PETSC_TRUE;
2442: b->ilen = NULL;
2443: b->imax = NULL;
2444: b->row = perm;
2445: b->col = perm;
2447: PetscObjectReference((PetscObject)perm);
2448: PetscObjectReference((PetscObject)perm);
2450: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2452: PetscMalloc1(am+1,&b->solve_work);
2453: PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2455: b->maxnz = b->nz = ui[am];
2457: fact->info.factor_mallocs = reallocs;
2458: fact->info.fill_ratio_given = fill;
2459: if (ai[am] != 0) {
2460: fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2461: } else {
2462: fact->info.fill_ratio_needed = 0.0;
2463: }
2464: #if defined(PETSC_USE_INFO)
2465: if (ai[am] != 0) {
2466: PetscReal af = fact->info.fill_ratio_needed;
2467: PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2468: PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2469: PetscInfo(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2470: } else {
2471: PetscInfo(A,"Empty matrix\n");
2472: }
2473: #endif
2474: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2475: return 0;
2476: }
2478: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2479: {
2480: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2481: Mat_SeqSBAIJ *b;
2482: PetscBool perm_identity,free_ij = PETSC_TRUE;
2483: PetscInt bs=A->rmap->bs,am=a->mbs;
2484: const PetscInt *cols,*rip,*ai=a->i,*aj=a->j;
2485: PetscInt reallocs=0,i,*ui;
2486: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2487: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2488: PetscReal fill =info->fill,levels=info->levels,ratio_needed;
2489: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2490: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2491: PetscBT lnkbt;
2493: /*
2494: This code originally uses Modified Sparse Row (MSR) storage
2495: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2496: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2497: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2498: thus the original code in MSR format is still used for these cases.
2499: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2500: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2501: */
2502: if (bs > 1) {
2503: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2504: return 0;
2505: }
2507: /* check whether perm is the identity mapping */
2508: ISIdentity(perm,&perm_identity);
2510: a->permute = PETSC_FALSE;
2512: /* special case that simply copies fill pattern */
2513: if (!levels) {
2514: /* reuse the column pointers and row offsets for memory savings */
2515: ui = a->i;
2516: uj = a->j;
2517: free_ij = PETSC_FALSE;
2518: ratio_needed = 1.0;
2519: } else { /* case: levels>0 */
2520: ISGetIndices(perm,&rip);
2522: /* initialization */
2523: PetscMalloc1(am+1,&ui);
2524: ui[0] = 0;
2526: /* jl: linked list for storing indices of the pivot rows
2527: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2528: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2529: for (i=0; i<am; i++) {
2530: jl[i] = am; il[i] = 0;
2531: }
2533: /* create and initialize a linked list for storing column indices of the active row k */
2534: nlnk = am + 1;
2535: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2537: /* initial FreeSpace size is fill*(ai[am]+1) */
2538: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2540: current_space = free_space;
2542: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2544: current_space_lvl = free_space_lvl;
2546: for (k=0; k<am; k++) { /* for each active row k */
2547: /* initialize lnk by the column indices of row rip[k] */
2548: nzk = 0;
2549: ncols = ai[rip[k]+1] - ai[rip[k]];
2550: cols = aj+ai[rip[k]];
2551: PetscIncompleteLLInit(ncols,cols,am,rip,&nlnk,lnk,lnk_lvl,lnkbt);
2552: nzk += nlnk;
2554: /* update lnk by computing fill-in for each pivot row to be merged in */
2555: prow = jl[k]; /* 1st pivot row */
2557: while (prow < k) {
2558: nextprow = jl[prow];
2560: /* merge prow into k-th row */
2561: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2562: jmax = ui[prow+1];
2563: ncols = jmax-jmin;
2564: i = jmin - ui[prow];
2565: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2566: j = *(uj_lvl_ptr[prow] + i - 1);
2567: cols_lvl = uj_lvl_ptr[prow]+i;
2568: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,&nlnk,lnk,lnk_lvl,lnkbt,j);
2569: nzk += nlnk;
2571: /* update il and jl for prow */
2572: if (jmin < jmax) {
2573: il[prow] = jmin;
2574: j = *cols;
2575: jl[prow] = jl[j];
2576: jl[j] = prow;
2577: }
2578: prow = nextprow;
2579: }
2581: /* if free space is not available, make more free space */
2582: if (current_space->local_remaining<nzk) {
2583: i = am - k + 1; /* num of unfactored rows */
2584: i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2585: PetscFreeSpaceGet(i,¤t_space);
2586: PetscFreeSpaceGet(i,¤t_space_lvl);
2587: reallocs++;
2588: }
2590: /* copy data into free_space and free_space_lvl, then initialize lnk */
2591: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2593: /* add the k-th row into il and jl */
2594: if (nzk-1 > 0) {
2595: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2596: jl[k] = jl[i]; jl[i] = k;
2597: il[k] = ui[k] + 1;
2598: }
2599: uj_ptr[k] = current_space->array;
2600: uj_lvl_ptr[k] = current_space_lvl->array;
2602: current_space->array += nzk;
2603: current_space->local_used += nzk;
2604: current_space->local_remaining -= nzk;
2605: current_space_lvl->array += nzk;
2606: current_space_lvl->local_used += nzk;
2607: current_space_lvl->local_remaining -= nzk;
2609: ui[k+1] = ui[k] + nzk;
2610: }
2612: ISRestoreIndices(perm,&rip);
2613: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2615: /* destroy list of free space and other temporary array(s) */
2616: PetscMalloc1(ui[am]+1,&uj);
2617: PetscFreeSpaceContiguous(&free_space,uj);
2618: PetscIncompleteLLDestroy(lnk,lnkbt);
2619: PetscFreeSpaceDestroy(free_space_lvl);
2620: if (ai[am] != 0) {
2621: ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2622: } else {
2623: ratio_needed = 0.0;
2624: }
2625: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2627: /* put together the new matrix in MATSEQSBAIJ format */
2628: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2630: b = (Mat_SeqSBAIJ*)(fact)->data;
2632: PetscFree2(b->imax,b->ilen);
2634: b->singlemalloc = PETSC_FALSE;
2635: b->free_a = PETSC_TRUE;
2636: b->free_ij = free_ij;
2638: PetscMalloc1(ui[am]+1,&b->a);
2640: b->j = uj;
2641: b->i = ui;
2642: b->diag = NULL;
2643: b->ilen = NULL;
2644: b->imax = NULL;
2645: b->row = perm;
2646: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2648: PetscObjectReference((PetscObject)perm);
2649: b->icol = perm;
2650: PetscObjectReference((PetscObject)perm);
2651: PetscMalloc1(am+1,&b->solve_work);
2653: b->maxnz = b->nz = ui[am];
2655: fact->info.factor_mallocs = reallocs;
2656: fact->info.fill_ratio_given = fill;
2657: fact->info.fill_ratio_needed = ratio_needed;
2658: #if defined(PETSC_USE_INFO)
2659: if (ai[am] != 0) {
2660: PetscReal af = fact->info.fill_ratio_needed;
2661: PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2662: PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2663: PetscInfo(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2664: } else {
2665: PetscInfo(A,"Empty matrix\n");
2666: }
2667: #endif
2668: if (perm_identity) {
2669: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2670: } else {
2671: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2672: }
2673: return 0;
2674: }