Actual source code: pipecg2.c
1: #include <petsc/private/kspimpl.h>
3: /* The auxiliary functions below are merged vector operations that load vectors from memory once and use
4: the data multiple times by performing vector operations element-wise. These functions
5: only apply to sequential vectors.
6: */
7: /* VecMergedDot_Private function merges the dot products for gamma, delta and dp */
8: static PetscErrorCode VecMergedDot_Private(Vec U,Vec W,Vec R,PetscInt normtype,PetscScalar *ru, PetscScalar *wu, PetscScalar *uu)
9: {
10: const PetscScalar *PETSC_RESTRICT PU, *PETSC_RESTRICT PW, *PETSC_RESTRICT PR;
11: PetscScalar sumru = 0.0, sumwu = 0.0, sumuu = 0.0;
12: PetscInt j, n;
14: VecGetArrayRead(U,(const PetscScalar**)&PU);
15: VecGetArrayRead(W,(const PetscScalar**)&PW);
16: VecGetArrayRead(R,(const PetscScalar**)&PR);
17: VecGetLocalSize(U,&n);
19: if (normtype==KSP_NORM_PRECONDITIONED) {
20: PetscPragmaSIMD
21: for (j=0; j<n; j++) {
22: sumwu += PW[j] * PetscConj(PU[j]);
23: sumru += PR[j] * PetscConj(PU[j]);
24: sumuu += PU[j] * PetscConj(PU[j]);
25: }
26: } else if (normtype==KSP_NORM_UNPRECONDITIONED) {
27: PetscPragmaSIMD
28: for (j=0; j<n; j++) {
29: sumwu += PW[j] * PetscConj(PU[j]);
30: sumru += PR[j] * PetscConj(PU[j]);
31: sumuu += PR[j] * PetscConj(PR[j]);
32: }
33: } else if (normtype==KSP_NORM_NATURAL) {
34: PetscPragmaSIMD
35: for (j=0; j<n; j++) {
36: sumwu += PW[j] * PetscConj(PU[j]);
37: sumru += PR[j] * PetscConj(PU[j]);
38: }
39: sumuu = sumru;
40: }
42: *ru = sumru;
43: *wu = sumwu;
44: *uu = sumuu;
46: VecRestoreArrayRead(U,(const PetscScalar**)&PU);
47: VecRestoreArrayRead(W,(const PetscScalar**)&PW);
48: VecRestoreArrayRead(R,(const PetscScalar**)&PR);
49: return 0;
50: }
52: /* VecMergedDot2_Private function merges the dot products for lambda_1 and lambda_4 */
53: static PetscErrorCode VecMergedDot2_Private(Vec N,Vec M,Vec W,PetscScalar *wm,PetscScalar *nm)
54: {
55: const PetscScalar *PETSC_RESTRICT PN, *PETSC_RESTRICT PM, *PETSC_RESTRICT PW;
56: PetscScalar sumwm = 0.0, sumnm = 0.0;
57: PetscInt j, n;
59: VecGetArrayRead(W,(const PetscScalar**)&PW);
60: VecGetArrayRead(N,(const PetscScalar**)&PN);
61: VecGetArrayRead(M,(const PetscScalar**)&PM);
62: VecGetLocalSize(N,&n);
64: PetscPragmaSIMD
65: for (j=0; j<n; j++) {
66: sumwm += PW[j] * PetscConj(PM[j]);
67: sumnm += PN[j] * PetscConj(PM[j]);
68: }
70: *wm = sumwm;
71: *nm = sumnm;
73: VecRestoreArrayRead(W,(const PetscScalar**)&PW);
74: VecRestoreArrayRead(N,(const PetscScalar**)&PN);
75: VecRestoreArrayRead(M,(const PetscScalar**)&PM);
76: return 0;
77: }
79: /* VecMergedOpsShort_Private function merges the dot products, AXPY and SAXPY operations for all vectors for iteration 0 */
80: static PetscErrorCode VecMergedOpsShort_Private(Vec vx,Vec vr,Vec vz,Vec vw,Vec vp,Vec vq,Vec vc,Vec vd,Vec vg0,Vec vh0,Vec vg1,Vec vh1,Vec vs,Vec va1,Vec vb1,Vec ve,Vec vf,Vec vm,Vec vn,Vec vu,PetscInt normtype,PetscScalar beta0,PetscScalar alpha0,PetscScalar beta1,PetscScalar alpha1,PetscScalar *lambda)
81: {
82: PetscScalar *PETSC_RESTRICT px, *PETSC_RESTRICT pr, *PETSC_RESTRICT pz, *PETSC_RESTRICT pw;
83: PetscScalar *PETSC_RESTRICT pp, *PETSC_RESTRICT pq;
84: PetscScalar *PETSC_RESTRICT pc, *PETSC_RESTRICT pd, *PETSC_RESTRICT pg0, *PETSC_RESTRICT ph0, *PETSC_RESTRICT pg1,*PETSC_RESTRICT ph1,*PETSC_RESTRICT ps,*PETSC_RESTRICT pa1,*PETSC_RESTRICT pb1, *PETSC_RESTRICT pe,*PETSC_RESTRICT pf,*PETSC_RESTRICT pm,*PETSC_RESTRICT pn, *PETSC_RESTRICT pu;
85: PetscInt j, n;
87: VecGetArray(vx,(PetscScalar**)&px);
88: VecGetArray(vr,(PetscScalar**)&pr);
89: VecGetArray(vz,(PetscScalar**)&pz);
90: VecGetArray(vw,(PetscScalar**)&pw);
91: VecGetArray(vp,(PetscScalar**)&pp);
92: VecGetArray(vq,(PetscScalar**)&pq);
93: VecGetArray(vc,(PetscScalar**)&pc);
94: VecGetArray(vd,(PetscScalar**)&pd);
95: VecGetArray(vg0,(PetscScalar**)&pg0);
96: VecGetArray(vh0,(PetscScalar**)&ph0);
97: VecGetArray(vg1,(PetscScalar**)&pg1);
98: VecGetArray(vh1,(PetscScalar**)&ph1);
99: VecGetArray(vs,(PetscScalar**)&ps);
100: VecGetArray(va1,(PetscScalar**)&pa1);
101: VecGetArray(vb1,(PetscScalar**)&pb1);
102: VecGetArray(ve,(PetscScalar**)&pe);
103: VecGetArray(vf,(PetscScalar**)&pf);
104: VecGetArray(vm,(PetscScalar**)&pm);
105: VecGetArray(vn,(PetscScalar**)&pn);
106: VecGetArray(vu,(PetscScalar**)&pu);
108: VecGetLocalSize(vx,&n);
109: for (j=0; j<15; j++) lambda[j] = 0.0;
111: if (normtype==KSP_NORM_PRECONDITIONED) {
112: PetscPragmaSIMD
113: for (j=0; j<n; j++) {
114: pz[j] = pn[j];
115: pq[j] = pm[j];
116: ps[j] = pw[j];
117: pp[j] = pu[j];
118: pc[j] = pg0[j];
119: pd[j] = ph0[j];
120: pa1[j] = pe[j];
121: pb1[j] = pf[j];
123: px[j] = px[j] + alpha0 * pp[j];
124: pr[j] = pr[j] - alpha0 * ps[j];
125: pu[j] = pu[j] - alpha0 * pq[j];
126: pw[j] = pw[j] - alpha0 * pz[j];
127: pm[j] = pm[j] - alpha0 * pc[j];
128: pn[j] = pn[j] - alpha0 * pd[j];
129: pg0[j] = pg0[j] - alpha0 * pa1[j];
130: ph0[j] = ph0[j] - alpha0 * pb1[j];
132: pg1[j] = pg0[j];
133: ph1[j] = ph0[j];
135: pz[j] = pn[j] + beta1 * pz[j];
136: pq[j] = pm[j] + beta1 * pq[j];
137: ps[j] = pw[j] + beta1 * ps[j];
138: pp[j] = pu[j] + beta1 * pp[j];
139: pc[j] = pg0[j] + beta1 * pc[j];
140: pd[j] = ph0[j] + beta1 * pd[j];
142: px[j] = px[j] + alpha1 * pp[j];
143: pr[j] = pr[j] - alpha1 * ps[j];
144: pu[j] = pu[j] - alpha1 * pq[j];
145: pw[j] = pw[j] - alpha1 * pz[j];
146: pm[j] = pm[j] - alpha1 * pc[j];
147: pn[j] = pn[j] - alpha1 * pd[j];
149: lambda[0] += ps[j] * PetscConj(pu[j]); lambda[1] += pw[j] * PetscConj(pm[j]);
150: lambda[2] += pw[j] * PetscConj(pq[j]); lambda[4] += ps[j] * PetscConj(pq[j]);
151: lambda[6] += pn[j] * PetscConj(pm[j]); lambda[7] += pn[j] * PetscConj(pq[j]);
152: lambda[9] += pz[j] * PetscConj(pq[j]); lambda[10] += pr[j] * PetscConj(pu[j]);
153: lambda[11] += pw[j] * PetscConj(pu[j]); lambda[12] += pu[j] * PetscConj(pu[j]);
154: }
155: lambda[3] = PetscConj(lambda[2]);
156: lambda[5] = PetscConj(lambda[1]);
157: lambda[8] = PetscConj(lambda[7]);
158: lambda[13] = PetscConj(lambda[11]);
159: lambda[14] = PetscConj(lambda[0]);
161: } else if (normtype==KSP_NORM_UNPRECONDITIONED) {
162: PetscPragmaSIMD
163: for (j=0; j<n; j++) {
164: pz[j] = pn[j];
165: pq[j] = pm[j];
166: ps[j] = pw[j];
167: pp[j] = pu[j];
168: pc[j] = pg0[j];
169: pd[j] = ph0[j];
170: pa1[j] = pe[j];
171: pb1[j] = pf[j];
173: px[j] = px[j] + alpha0 * pp[j];
174: pr[j] = pr[j] - alpha0 * ps[j];
175: pu[j] = pu[j] - alpha0 * pq[j];
176: pw[j] = pw[j] - alpha0 * pz[j];
177: pm[j] = pm[j] - alpha0 * pc[j];
178: pn[j] = pn[j] - alpha0 * pd[j];
179: pg0[j] = pg0[j] - alpha0 * pa1[j];
180: ph0[j] = ph0[j] - alpha0 * pb1[j];
182: pg1[j] = pg0[j];
183: ph1[j] = ph0[j];
185: pz[j] = pn[j] + beta1 * pz[j];
186: pq[j] = pm[j] + beta1 * pq[j];
187: ps[j] = pw[j] + beta1 * ps[j];
188: pp[j] = pu[j] + beta1 * pp[j];
189: pc[j] = pg0[j] + beta1 * pc[j];
190: pd[j] = ph0[j] + beta1 * pd[j];
192: px[j] = px[j] + alpha1 * pp[j];
193: pr[j] = pr[j] - alpha1 * ps[j];
194: pu[j] = pu[j] - alpha1 * pq[j];
195: pw[j] = pw[j] - alpha1 * pz[j];
196: pm[j] = pm[j] - alpha1 * pc[j];
197: pn[j] = pn[j] - alpha1 * pd[j];
199: lambda[0] += ps[j] * PetscConj(pu[j]); lambda[1] += pw[j] * PetscConj(pm[j]);
200: lambda[2] += pw[j] * PetscConj(pq[j]); lambda[4] += ps[j] * PetscConj(pq[j]);
201: lambda[6] += pn[j] * PetscConj(pm[j]); lambda[7] += pn[j] * PetscConj(pq[j]);
202: lambda[9] += pz[j] * PetscConj(pq[j]); lambda[10] += pr[j] * PetscConj(pu[j]);
203: lambda[11] += pw[j] * PetscConj(pu[j]); lambda[12] += pr[j] * PetscConj(pr[j]);
204: }
205: lambda[3] = PetscConj(lambda[2]);
206: lambda[5] = PetscConj(lambda[1]);
207: lambda[8] = PetscConj(lambda[7]);
208: lambda[13] = PetscConj(lambda[11]);
209: lambda[14] = PetscConj(lambda[0]);
211: } else if (normtype==KSP_NORM_NATURAL) {
212: PetscPragmaSIMD
213: for (j=0; j<n; j++) {
214: pz[j] = pn[j];
215: pq[j] = pm[j];
216: ps[j] = pw[j];
217: pp[j] = pu[j];
218: pc[j] = pg0[j];
219: pd[j] = ph0[j];
220: pa1[j] = pe[j];
221: pb1[j] = pf[j];
223: px[j] = px[j] + alpha0 * pp[j];
224: pr[j] = pr[j] - alpha0 * ps[j];
225: pu[j] = pu[j] - alpha0 * pq[j];
226: pw[j] = pw[j] - alpha0 * pz[j];
227: pm[j] = pm[j] - alpha0 * pc[j];
228: pn[j] = pn[j] - alpha0 * pd[j];
229: pg0[j] = pg0[j] - alpha0 * pa1[j];
230: ph0[j] = ph0[j] - alpha0 * pb1[j];
232: pg1[j] = pg0[j];
233: ph1[j] = ph0[j];
235: pz[j] = pn[j] + beta1 * pz[j];
236: pq[j] = pm[j] + beta1 * pq[j];
237: ps[j] = pw[j] + beta1 * ps[j];
238: pp[j] = pu[j] + beta1 * pp[j];
239: pc[j] = pg0[j] + beta1 * pc[j];
240: pd[j] = ph0[j] + beta1 * pd[j];
242: px[j] = px[j] + alpha1 * pp[j];
243: pr[j] = pr[j] - alpha1 * ps[j];
244: pu[j] = pu[j] - alpha1 * pq[j];
245: pw[j] = pw[j] - alpha1 * pz[j];
246: pm[j] = pm[j] - alpha1 * pc[j];
247: pn[j] = pn[j] - alpha1 * pd[j];
249: lambda[0] += ps[j] * PetscConj(pu[j]); lambda[1] += pw[j] * PetscConj(pm[j]);
250: lambda[2] += pw[j] * PetscConj(pq[j]); lambda[4] += ps[j] * PetscConj(pq[j]);
251: lambda[6] += pn[j] * PetscConj(pm[j]); lambda[7] += pn[j] * PetscConj(pq[j]);
252: lambda[9] += pz[j] * PetscConj(pq[j]); lambda[10] += pr[j] * PetscConj(pu[j]);
253: lambda[11] += pw[j] * PetscConj(pu[j]);
254: }
255: lambda[3] = PetscConj(lambda[2]);
256: lambda[5] = PetscConj(lambda[1]);
257: lambda[8] = PetscConj(lambda[7]);
258: lambda[13] = PetscConj(lambda[11]);
259: lambda[14] = PetscConj(lambda[0]);
260: lambda[12] = lambda[10];
262: }
264: VecRestoreArray(vx,(PetscScalar**)&px);
265: VecRestoreArray(vr,(PetscScalar**)&pr);
266: VecRestoreArray(vz,(PetscScalar**)&pz);
267: VecRestoreArray(vw,(PetscScalar**)&pw);
268: VecRestoreArray(vp,(PetscScalar**)&pp);
269: VecRestoreArray(vq,(PetscScalar**)&pq);
270: VecRestoreArray(vc,(PetscScalar**)&pc);
271: VecRestoreArray(vd,(PetscScalar**)&pd);
272: VecRestoreArray(vg0,(PetscScalar**)&pg0);
273: VecRestoreArray(vh0,(PetscScalar**)&ph0);
274: VecRestoreArray(vg1,(PetscScalar**)&pg1);
275: VecRestoreArray(vh1,(PetscScalar**)&ph1);
276: VecRestoreArray(vs,(PetscScalar**)&ps);
277: VecRestoreArray(va1,(PetscScalar**)&pa1);
278: VecRestoreArray(vb1,(PetscScalar**)&pb1);
279: VecRestoreArray(ve,(PetscScalar**)&pe);
280: VecRestoreArray(vf,(PetscScalar**)&pf);
281: VecRestoreArray(vm,(PetscScalar**)&pm);
282: VecRestoreArray(vn,(PetscScalar**)&pn);
283: VecRestoreArray(vu,(PetscScalar**)&pu);
284: return 0;
285: }
287: /* VecMergedOps_Private function merges the dot products, AXPY and SAXPY operations for all vectors for iteration > 0 */
288: static PetscErrorCode VecMergedOps_Private(Vec vx,Vec vr,Vec vz,Vec vw,Vec vp,Vec vq,Vec vc,Vec vd,Vec vg0,Vec vh0,Vec vg1,Vec vh1,Vec vs,Vec va1,Vec vb1,Vec ve,Vec vf,Vec vm,Vec vn,Vec vu,PetscInt normtype,PetscScalar beta0,PetscScalar alpha0,PetscScalar beta1,PetscScalar alpha1,PetscScalar *lambda, PetscScalar alphaold)
289: {
290: PetscScalar *PETSC_RESTRICT px, *PETSC_RESTRICT pr, *PETSC_RESTRICT pz, *PETSC_RESTRICT pw;
291: PetscScalar *PETSC_RESTRICT pp, *PETSC_RESTRICT pq;
292: PetscScalar *PETSC_RESTRICT pc, *PETSC_RESTRICT pd, *PETSC_RESTRICT pg0, *PETSC_RESTRICT ph0, *PETSC_RESTRICT pg1, *PETSC_RESTRICT ph1,*PETSC_RESTRICT ps, *PETSC_RESTRICT pa1,*PETSC_RESTRICT pb1,*PETSC_RESTRICT pe,*PETSC_RESTRICT pf, *PETSC_RESTRICT pm,*PETSC_RESTRICT pn, *PETSC_RESTRICT pu;
293: PetscInt j, n;
295: VecGetArray(vx,(PetscScalar**)&px);
296: VecGetArray(vr,(PetscScalar**)&pr);
297: VecGetArray(vz,(PetscScalar**)&pz);
298: VecGetArray(vw,(PetscScalar**)&pw);
299: VecGetArray(vp,(PetscScalar**)&pp);
300: VecGetArray(vq,(PetscScalar**)&pq);
301: VecGetArray(vc,(PetscScalar**)&pc);
302: VecGetArray(vd,(PetscScalar**)&pd);
303: VecGetArray(vg0,(PetscScalar**)&pg0);
304: VecGetArray(vh0,(PetscScalar**)&ph0);
305: VecGetArray(vg1,(PetscScalar**)&pg1);
306: VecGetArray(vh1,(PetscScalar**)&ph1);
307: VecGetArray(vs,(PetscScalar**)&ps);
308: VecGetArray(va1,(PetscScalar**)&pa1);
309: VecGetArray(vb1,(PetscScalar**)&pb1);
310: VecGetArray(ve,(PetscScalar**)&pe);
311: VecGetArray(vf,(PetscScalar**)&pf);
312: VecGetArray(vm,(PetscScalar**)&pm);
313: VecGetArray(vn,(PetscScalar**)&pn);
314: VecGetArray(vu,(PetscScalar**)&pu);
316: VecGetLocalSize(vx,&n);
317: for (j=0; j<15; j++) lambda[j] = 0.0;
319: if (normtype==KSP_NORM_PRECONDITIONED) {
320: PetscPragmaSIMD
321: for (j=0; j<n; j++) {
322: pa1[j] = (pg1[j] - pg0[j])/alphaold;
323: pb1[j] = (ph1[j] - ph0[j])/alphaold;
325: pz[j] = pn[j] + beta0 * pz[j];
326: pq[j] = pm[j] + beta0 * pq[j];
327: ps[j] = pw[j] + beta0 * ps[j];
328: pp[j] = pu[j] + beta0 * pp[j];
329: pc[j] = pg0[j] + beta0 * pc[j];
330: pd[j] = ph0[j] + beta0 * pd[j];
331: pa1[j] = pe[j] + beta0 * pa1[j];
332: pb1[j] = pf[j] + beta0 * pb1[j];
334: px[j] = px[j] + alpha0 * pp[j];
335: pr[j] = pr[j] - alpha0 * ps[j];
336: pu[j] = pu[j] - alpha0 * pq[j];
337: pw[j] = pw[j] - alpha0 * pz[j];
338: pm[j] = pm[j] - alpha0 * pc[j];
339: pn[j] = pn[j] - alpha0 * pd[j];
340: pg0[j] = pg0[j] - alpha0 * pa1[j];
341: ph0[j] = ph0[j] - alpha0 * pb1[j];
343: pg1[j] = pg0[j];
344: ph1[j] = ph0[j];
346: pz[j] = pn[j] + beta1 * pz[j];
347: pq[j] = pm[j] + beta1 * pq[j];
348: ps[j] = pw[j] + beta1 * ps[j];
349: pp[j] = pu[j] + beta1 * pp[j];
350: pc[j] = pg0[j] + beta1 * pc[j];
351: pd[j] = ph0[j] + beta1 * pd[j];
353: px[j] = px[j] + alpha1 * pp[j];
354: pr[j] = pr[j] - alpha1 * ps[j];
355: pu[j] = pu[j] - alpha1 * pq[j];
356: pw[j] = pw[j] - alpha1 * pz[j];
357: pm[j] = pm[j] - alpha1 * pc[j];
358: pn[j] = pn[j] - alpha1 * pd[j];
360: lambda[0] += ps[j] * PetscConj(pu[j]); lambda[1] += pw[j] * PetscConj(pm[j]);
361: lambda[2] += pw[j] * PetscConj(pq[j]); lambda[4] += ps[j] * PetscConj(pq[j]);
362: lambda[6] += pn[j] * PetscConj(pm[j]); lambda[7] += pn[j] * PetscConj(pq[j]);
363: lambda[9] += pz[j] * PetscConj(pq[j]); lambda[10] += pr[j] * PetscConj(pu[j]);
364: lambda[11] += pw[j] * PetscConj(pu[j]); lambda[12] += pu[j] * PetscConj(pu[j]);
365: }
366: lambda[3] = PetscConj(lambda[2]);
367: lambda[5] = PetscConj(lambda[1]);
368: lambda[8] = PetscConj(lambda[7]);
369: lambda[13] = PetscConj(lambda[11]);
370: lambda[14] = PetscConj(lambda[0]);
371: } else if (normtype==KSP_NORM_UNPRECONDITIONED) {
372: PetscPragmaSIMD
373: for (j=0; j<n; j++) {
374: pa1[j] = (pg1[j] - pg0[j])/alphaold;
375: pb1[j] = (ph1[j] - ph0[j])/alphaold;
377: pz[j] = pn[j] + beta0 * pz[j];
378: pq[j] = pm[j] + beta0 * pq[j];
379: ps[j] = pw[j] + beta0 * ps[j];
380: pp[j] = pu[j] + beta0 * pp[j];
381: pc[j] = pg0[j] + beta0 * pc[j];
382: pd[j] = ph0[j] + beta0 * pd[j];
383: pa1[j] = pe[j] + beta0 * pa1[j];
384: pb1[j] = pf[j] + beta0 * pb1[j];
386: px[j] = px[j] + alpha0 * pp[j];
387: pr[j] = pr[j] - alpha0 * ps[j];
388: pu[j] = pu[j] - alpha0 * pq[j];
389: pw[j] = pw[j] - alpha0 * pz[j];
390: pm[j] = pm[j] - alpha0 * pc[j];
391: pn[j] = pn[j] - alpha0 * pd[j];
392: pg0[j] = pg0[j] - alpha0 * pa1[j];
393: ph0[j] = ph0[j] - alpha0 * pb1[j];
395: pg1[j] = pg0[j];
396: ph1[j] = ph0[j];
398: pz[j] = pn[j] + beta1 * pz[j];
399: pq[j] = pm[j] + beta1 * pq[j];
400: ps[j] = pw[j] + beta1 * ps[j];
401: pp[j] = pu[j] + beta1 * pp[j];
402: pc[j] = pg0[j] + beta1 * pc[j];
403: pd[j] = ph0[j] + beta1 * pd[j];
405: px[j] = px[j] + alpha1 * pp[j];
406: pr[j] = pr[j] - alpha1 * ps[j];
407: pu[j] = pu[j] - alpha1 * pq[j];
408: pw[j] = pw[j] - alpha1 * pz[j];
409: pm[j] = pm[j] - alpha1 * pc[j];
410: pn[j] = pn[j] - alpha1 * pd[j];
412: lambda[0] += ps[j] * PetscConj(pu[j]); lambda[1] += pw[j] * PetscConj(pm[j]);
413: lambda[2] += pw[j] * PetscConj(pq[j]); lambda[4] += ps[j] * PetscConj(pq[j]);
414: lambda[6] += pn[j] * PetscConj(pm[j]); lambda[7] += pn[j] * PetscConj(pq[j]);
415: lambda[9] += pz[j] * PetscConj(pq[j]); lambda[10] += pr[j] * PetscConj(pu[j]);
416: lambda[11] += pw[j] * PetscConj(pu[j]); lambda[12] += pr[j] * PetscConj(pr[j]);
417: }
418: lambda[3] = PetscConj(lambda[2]);
419: lambda[5] = PetscConj(lambda[1]);
420: lambda[8] = PetscConj(lambda[7]);
421: lambda[13] = PetscConj(lambda[11]);
422: lambda[14] = PetscConj(lambda[0]);
423: } else if (normtype==KSP_NORM_NATURAL) {
424: PetscPragmaSIMD
425: for (j=0; j<n; j++) {
426: pa1[j] = (pg1[j] - pg0[j])/alphaold;
427: pb1[j] = (ph1[j] - ph0[j])/alphaold;
429: pz[j] = pn[j] + beta0 * pz[j];
430: pq[j] = pm[j] + beta0 * pq[j];
431: ps[j] = pw[j] + beta0 * ps[j];
432: pp[j] = pu[j] + beta0 * pp[j];
433: pc[j] = pg0[j] + beta0 * pc[j];
434: pd[j] = ph0[j] + beta0 * pd[j];
435: pa1[j] = pe[j] + beta0 * pa1[j];
436: pb1[j] = pf[j] + beta0 * pb1[j];
438: px[j] = px[j] + alpha0 * pp[j];
439: pr[j] = pr[j] - alpha0 * ps[j];
440: pu[j] = pu[j] - alpha0 * pq[j];
441: pw[j] = pw[j] - alpha0 * pz[j];
442: pm[j] = pm[j] - alpha0 * pc[j];
443: pn[j] = pn[j] - alpha0 * pd[j];
444: pg0[j] = pg0[j] - alpha0 * pa1[j];
445: ph0[j] = ph0[j] - alpha0 * pb1[j];
447: pg1[j] = pg0[j];
448: ph1[j] = ph0[j];
450: pz[j] = pn[j] + beta1 * pz[j];
451: pq[j] = pm[j] + beta1 * pq[j];
452: ps[j] = pw[j] + beta1 * ps[j];
453: pp[j] = pu[j] + beta1 * pp[j];
454: pc[j] = pg0[j] + beta1 * pc[j];
455: pd[j] = ph0[j] + beta1 * pd[j];
457: px[j] = px[j] + alpha1 * pp[j];
458: pr[j] = pr[j] - alpha1 * ps[j];
459: pu[j] = pu[j] - alpha1 * pq[j];
460: pw[j] = pw[j] - alpha1 * pz[j];
461: pm[j] = pm[j] - alpha1 * pc[j];
462: pn[j] = pn[j] - alpha1 * pd[j];
464: lambda[0] += ps[j] * PetscConj(pu[j]); lambda[1] += pw[j] * PetscConj(pm[j]);
465: lambda[2] += pw[j] * PetscConj(pq[j]); lambda[4] += ps[j] * PetscConj(pq[j]);
466: lambda[6] += pn[j] * PetscConj(pm[j]); lambda[7] += pn[j] * PetscConj(pq[j]);
467: lambda[9] += pz[j] * PetscConj(pq[j]); lambda[10] += pr[j] * PetscConj(pu[j]);
468: lambda[11] += pw[j] * PetscConj(pu[j]);
469: }
470: lambda[3] = PetscConj(lambda[2]);
471: lambda[5] = PetscConj(lambda[1]);
472: lambda[8] = PetscConj(lambda[7]);
473: lambda[13] = PetscConj(lambda[11]);
474: lambda[14] = PetscConj(lambda[0]);
475: lambda[12] = lambda[10];
476: }
478: VecRestoreArray(vx,(PetscScalar**)&px);
479: VecRestoreArray(vr,(PetscScalar**)&pr);
480: VecRestoreArray(vz,(PetscScalar**)&pz);
481: VecRestoreArray(vw,(PetscScalar**)&pw);
482: VecRestoreArray(vp,(PetscScalar**)&pp);
483: VecRestoreArray(vq,(PetscScalar**)&pq);
484: VecRestoreArray(vc,(PetscScalar**)&pc);
485: VecRestoreArray(vd,(PetscScalar**)&pd);
486: VecRestoreArray(vg0,(PetscScalar**)&pg0);
487: VecRestoreArray(vh0,(PetscScalar**)&ph0);
488: VecRestoreArray(vg1,(PetscScalar**)&pg1);
489: VecRestoreArray(vh1,(PetscScalar**)&ph1);
490: VecRestoreArray(vs,(PetscScalar**)&ps);
491: VecRestoreArray(va1,(PetscScalar**)&pa1);
492: VecRestoreArray(vb1,(PetscScalar**)&pb1);
493: VecRestoreArray(ve,(PetscScalar**)&pe);
494: VecRestoreArray(vf,(PetscScalar**)&pf);
495: VecRestoreArray(vm,(PetscScalar**)&pm);
496: VecRestoreArray(vn,(PetscScalar**)&pn);
497: VecRestoreArray(vu,(PetscScalar**)&pu);
498: return 0;
499: }
501: /*
502: KSPSetUp_PIPECG2 - Sets up the workspace needed by the PIPECG method.
504: This is called once, usually automatically by KSPSolve() or KSPSetUp()
505: but can be called directly by KSPSetUp()
506: */
507: static PetscErrorCode KSPSetUp_PIPECG2(KSP ksp)
508: {
509: /* get work vectors needed by PIPECG2 */
510: KSPSetWorkVecs(ksp,20);
511: return 0;
512: }
514: /*
515: KSPSolve_PIPECG2 - This routine actually applies the PIPECG2 method
516: */
517: static PetscErrorCode KSPSolve_PIPECG2(KSP ksp)
518: {
519: PetscInt i,n;
520: PetscScalar alpha[2],beta[2],gamma[2],delta[2],lambda[15];
521: PetscScalar dps = 0.0,alphaold=0.0;
522: PetscReal dp = 0.0;
523: Vec X,B,Z,P,W,Q,U,M,N,R,S,C,D,E,F,G[2],H[2],A1,B1;
524: Mat Amat,Pmat;
525: PetscBool diagonalscale;
526: MPI_Comm pcomm;
527: MPI_Request req;
528: MPI_Status stat;
530: pcomm = PetscObjectComm((PetscObject)ksp);
531: PCGetDiagonalScale(ksp->pc,&diagonalscale);
534: X = ksp->vec_sol;
535: B = ksp->vec_rhs;
536: M = ksp->work[0];
537: Z = ksp->work[1];
538: P = ksp->work[2];
539: N = ksp->work[3];
540: W = ksp->work[4];
541: Q = ksp->work[5];
542: U = ksp->work[6];
543: R = ksp->work[7];
544: S = ksp->work[8];
545: C = ksp->work[9];
546: D = ksp->work[10];
547: E = ksp->work[11];
548: F = ksp->work[12];
549: G[0] = ksp->work[13];
550: H[0] = ksp->work[14];
551: G[1] = ksp->work[15];
552: H[1] = ksp->work[16];
553: A1 = ksp->work[17];
554: B1 = ksp->work[18];
556: PetscMemzero(alpha,2*sizeof(PetscScalar));
557: PetscMemzero(beta,2*sizeof(PetscScalar));
558: PetscMemzero(gamma,2*sizeof(PetscScalar));
559: PetscMemzero(delta,2*sizeof(PetscScalar));
560: PetscMemzero(lambda,15*sizeof(PetscScalar));
562: VecGetLocalSize(B,&n);
563: PCGetOperators(ksp->pc,&Amat,&Pmat);
565: ksp->its = 0;
566: if (!ksp->guess_zero) {
567: KSP_MatMult(ksp,Amat,X,R); /* r <- b - Ax */
568: VecAYPX(R,-1.0,B);
569: } else {
570: VecCopy(B,R); /* r <- b (x is 0) */
571: }
573: KSP_PCApply(ksp,R,U); /* u <- Br */
574: KSP_MatMult(ksp,Amat,U,W); /* w <- Au */
576: VecMergedDot_Private(U,W,R,ksp->normtype,&gamma[0],&delta[0],&dps); /* gamma <- r'*u , delta <- w'*u , dp <- u'*u or r'*r or r'*u depending on ksp_norm_type */
577: lambda[10]= gamma[0];
578: lambda[11]= delta[0];
579: lambda[12] = dps;
581: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
582: MPI_Iallreduce(MPI_IN_PLACE,&lambda[10],3,MPIU_SCALAR,MPIU_SUM,pcomm,&req);
583: #else
584: MPIU_Allreduce(MPI_IN_PLACE,&lambda[10],3,MPIU_SCALAR,MPIU_SUM,pcomm);
585: req = MPI_REQUEST_NULL;
586: #endif
588: KSP_PCApply(ksp,W,M); /* m <- Bw */
589: KSP_MatMult(ksp,Amat,M,N); /* n <- Am */
591: KSP_PCApply(ksp,N,G[0]); /* g <- Bn */
592: KSP_MatMult(ksp,Amat,G[0],H[0]); /* h <- Ag */
594: KSP_PCApply(ksp,H[0],E); /* e <- Bh */
595: KSP_MatMult(ksp,Amat,E,F); /* f <- Ae */
597: MPI_Wait(&req,&stat);
599: gamma[0] = lambda[10];
600: delta[0] = lambda[11];
601: dp = PetscSqrtReal(PetscAbsScalar(lambda[12]));
603: VecMergedDot2_Private(N,M,W,&lambda[1],&lambda[6]); /* lambda_1 <- w'*m , lambda_4 <- n'*m */
604: MPIU_Allreduce(MPI_IN_PLACE,&lambda[1],1,MPIU_SCALAR,MPIU_SUM,pcomm);
605: MPIU_Allreduce(MPI_IN_PLACE,&lambda[6],1,MPIU_SCALAR,MPIU_SUM,pcomm);
607: lambda[5] = PetscConj(lambda[1]);
608: lambda[13] = PetscConj(lambda[11]);
610: KSPLogResidualHistory(ksp,dp);
611: KSPMonitor(ksp,0,dp);
612: ksp->rnorm = dp;
614: (*ksp->converged)(ksp,0,dp,&ksp->reason,ksp->cnvP); /* test for convergence */
615: if (ksp->reason) return 0;
617: for (i=2; i<ksp->max_it; i+=2) {
618: if (i == 2) {
619: beta[0] = 0;
620: alpha[0] = gamma[0]/delta[0];
622: gamma[1] = gamma[0] - alpha[0] * lambda[13] - alpha[0] * delta[0] + alpha[0] * alpha[0] * lambda[1];
623: delta[1] = delta[0] - alpha[0] * lambda[1] - alpha[0] * lambda[5] + alpha[0]*alpha[0]*lambda[6];
625: beta[1] = gamma[1] / gamma[0];
626: alpha[1] = gamma[1] / (delta[1] - beta[1] / alpha[0] * gamma[1]);
628: VecMergedOpsShort_Private(X,R,Z,W,P,Q,C,D,G[0],H[0],G[1],H[1],S,A1,B1,E,F,M,N,U,ksp->normtype,beta[0],alpha[0],beta[1],alpha[1],lambda);
629: } else {
630: beta[0] = gamma[1] / gamma[0];
631: alpha[0] = gamma[1] / (delta[1] - beta[0] / alpha[1] * gamma[1]);
633: gamma[0] = gamma[1];
634: delta[0] = delta[1];
636: gamma[1] = gamma[0] - alpha[0]*(lambda[13] + beta[0]*lambda[14]) - alpha[0] *(delta[0] + beta[0] * lambda[0]) + alpha[0] * alpha[0] * (lambda[1] + beta[0] * lambda[2] + beta[0] * lambda[3] + beta[0] * beta[0] * lambda[4]);
638: delta[1] = delta[0] - alpha[0] * (lambda[1] + beta[0]* lambda[2]) - alpha[0] *(lambda[5] + beta[0] * lambda[3]) + alpha[0]*alpha[0] * (lambda[6] + beta[0] * lambda[7] + beta[0] *lambda[8] + beta[0] * beta[0] * lambda[9]);
640: beta[1] = gamma[1] / gamma[0];
641: alpha[1] = gamma[1] / (delta[1] - beta[1] / alpha[0] * gamma[1]);
643: VecMergedOps_Private(X,R,Z,W,P,Q,C,D,G[0],H[0],G[1],H[1],S,A1,B1,E,F,M,N,U,ksp->normtype,beta[0],alpha[0],beta[1],alpha[1],lambda,alphaold);
644: }
646: gamma[0] = gamma[1];
647: delta[0] = delta[1];
649: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
650: MPI_Iallreduce(MPI_IN_PLACE,lambda,15,MPIU_SCALAR,MPIU_SUM,pcomm,&req);
651: #else
652: MPIU_Allreduce(MPI_IN_PLACE,lambda,15,MPIU_SCALAR,MPIU_SUM,pcomm);
653: req = MPI_REQUEST_NULL;
654: #endif
656: KSP_PCApply(ksp,N,G[0]); /* g <- Bn */
657: KSP_MatMult(ksp,Amat,G[0],H[0]); /* h <- Ag */
659: KSP_PCApply(ksp,H[0],E); /* e <- Bh */
660: KSP_MatMult(ksp,Amat,E,F); /* f <- Ae */
662: MPI_Wait(&req,&stat);
664: gamma[1] = lambda[10];
665: delta[1] = lambda[11];
666: dp = PetscSqrtReal(PetscAbsScalar(lambda[12]));
668: alphaold = alpha[1];
669: ksp->its = i;
671: if (i > 0) {
672: if (ksp->normtype == KSP_NORM_NONE) dp = 0.0;
673: ksp->rnorm = dp;
674: KSPLogResidualHistory(ksp,dp);
675: KSPMonitor(ksp,i,dp);
676: (*ksp->converged)(ksp,i,dp,&ksp->reason,ksp->cnvP);
677: if (ksp->reason) break;
678: }
679: }
681: if (i >= ksp->max_it) ksp->reason = KSP_DIVERGED_ITS;
682: return 0;
683: }
685: /*MC
686: KSPPIPECG2 - Pipelined conjugate gradient method with a single non-blocking allreduce per two iterations.
688: This method has only a single non-blocking reduction per two iterations, compared to 2 blocking for standard CG. The
689: non-blocking reduction is overlapped by two matrix-vector products and two preconditioner applications.
691: Level: intermediate
693: Notes:
694: MPI configuration may be necessary for reductions to make asynchronous progress, which is important for performance of pipelined methods.
695: See the FAQ on the PETSc website for details.
697: Contributed by:
698: Manasi Tiwari, Computational and Data Sciences, Indian Institute of Science, Bangalore
700: Reference:
701: Manasi Tiwari and Sathish Vadhiyar, "Pipelined Conjugate Gradient Methods for Distributed Memory Systems",
702: Submitted to International Conference on High Performance Computing, Data and Analytics 2020.
704: .seealso: KSPCreate(), KSPSetType(), KSPCG, KSPPIPECG
705: M*/
706: PETSC_EXTERN PetscErrorCode KSPCreate_PIPECG2(KSP ksp)
707: {
708: KSPSetSupportedNorm(ksp,KSP_NORM_UNPRECONDITIONED,PC_LEFT,2);
709: KSPSetSupportedNorm(ksp,KSP_NORM_PRECONDITIONED,PC_LEFT,2);
710: KSPSetSupportedNorm(ksp,KSP_NORM_NATURAL,PC_LEFT,2);
711: KSPSetSupportedNorm(ksp,KSP_NORM_NONE,PC_LEFT,1);
713: ksp->ops->setup = KSPSetUp_PIPECG2;
714: ksp->ops->solve = KSPSolve_PIPECG2;
715: ksp->ops->destroy = KSPDestroyDefault;
716: ksp->ops->view = NULL;
717: ksp->ops->setfromoptions = NULL;
718: ksp->ops->buildsolution = KSPBuildSolutionDefault;
719: ksp->ops->buildresidual = KSPBuildResidualDefault;
720: return 0;
721: }