Actual source code: landaucu.cu

  1: /*
  2:   Implements the Landau kernel
  3: */
  4: #include <petscconf.h>
  5: #include <petsc/private/dmpleximpl.h>
  6: #include <petsclandau.h>
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <petscmat.h>
 10: #include <petscdevice.h>

 12: #include "../land_tensors.h"
 13: #include <petscaijdevice.h>

 15: #define CHECK_LAUNCH_ERROR()                                                             \
 16: do {                                                                                     \
 17:   /* Check synchronous errors, i.e. pre-launch */                                        \
 18:   cudaError_t err = cudaGetLastError();                                                  \
 19:   if (cudaSuccess != err) {                                                              \
 20:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
 21:   }                                                                                      \
 22:   /* Check asynchronous errors, i.e. kernel failed (ULF) */                              \
 23:   err = cudaDeviceSynchronize();                                                         \
 24:   if (cudaSuccess != err) {                                                              \
 25:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
 26:   }                                                                                      \
 27:  } while (0)

 29: PETSC_EXTERN PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps maps[], pointInterpolationP4est (*pointMaps)[LANDAU_MAX_Q_FACE],
 30:                                                     PetscInt Nf[], PetscInt Nq, PetscInt grid)
 31: {
 32:   P4estVertexMaps h_maps;
 33:   h_maps.num_elements = maps[grid].num_elements;
 34:   h_maps.num_face = maps[grid].num_face;
 35:   h_maps.num_reduced = maps[grid].num_reduced;
 36:   h_maps.deviceType = maps[grid].deviceType;
 37:   h_maps.Nf = Nf[grid];
 38:   h_maps.numgrids = maps[grid].numgrids;
 39:   cudaMalloc((void **)&h_maps.c_maps,                    maps[grid].num_reduced  * sizeof *pointMaps);
 40:   cudaMemcpy(          h_maps.c_maps, maps[grid].c_maps, maps[grid].num_reduced  * sizeof *pointMaps, cudaMemcpyHostToDevice);
 41:   cudaMalloc((void **)&h_maps.gIdx,                 maps[grid].num_elements * sizeof *maps[grid].gIdx);
 42:   cudaMemcpy(          h_maps.gIdx, maps[grid].gIdx,maps[grid].num_elements * sizeof *maps[grid].gIdx, cudaMemcpyHostToDevice);
 43:   cudaMalloc((void **)&maps[grid].d_self,            sizeof(P4estVertexMaps));
 44:   cudaMemcpy(          maps[grid].d_self,   &h_maps, sizeof(P4estVertexMaps), cudaMemcpyHostToDevice);
 45:   return 0;
 46: }

 48: PETSC_EXTERN PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps maps[], PetscInt num_grids)
 49: {
 50:   for (PetscInt grid=0;grid<num_grids;grid++) {
 51:     P4estVertexMaps *d_maps = maps[grid].d_self, h_maps;
 52:     cudaMemcpy(&h_maps, d_maps, sizeof(P4estVertexMaps), cudaMemcpyDeviceToHost);
 53:     cudaFree(h_maps.c_maps);
 54:     cudaFree(h_maps.gIdx);
 55:     cudaFree(d_maps);
 56:   }
 57:   return 0;
 58: }

 60: PetscErrorCode LandauCUDAStaticDataSet(DM plex, const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids,
 61:                                        PetscInt a_numCells[], PetscInt a_species_offset[], PetscInt a_mat_offset[],
 62:                                        PetscReal nu_alpha[], PetscReal nu_beta[], PetscReal a_invMass[], PetscReal a_invJ[],
 63:                                        PetscReal a_x[], PetscReal a_y[], PetscReal a_z[], PetscReal a_w[], LandauStaticData *SData_d)
 64: {
 65:   PetscTabulation *Tf;
 66:   PetscReal       *BB,*DD;
 67:   PetscInt        dim,Nb=Nq,szf=sizeof(PetscReal),szs=sizeof(PetscScalar),szi=sizeof(PetscInt);
 68:   PetscInt        h_ip_offset[LANDAU_MAX_GRIDS+1],h_ipf_offset[LANDAU_MAX_GRIDS+1],h_elem_offset[LANDAU_MAX_GRIDS+1],nip,IPfdf_sz,Nf;
 69:   PetscDS         prob;

 71:   DMGetDimension(plex, &dim);
 72:   DMGetDS(plex, &prob);
 74:   PetscDSGetTabulation(prob, &Tf);
 75:   BB   = Tf[0]->T[0]; DD = Tf[0]->T[1];
 76:   Nf = h_ip_offset[0] = h_ipf_offset[0] = h_elem_offset[0] = 0;
 77:   nip = 0;
 78:   IPfdf_sz = 0;
 79:   for (PetscInt grid=0 ; grid<num_grids ; grid++) {
 80:     PetscInt nfloc = a_species_offset[grid+1] - a_species_offset[grid];
 81:     h_elem_offset[grid+1] = h_elem_offset[grid] + a_numCells[grid];
 82:     nip += a_numCells[grid]*Nq;
 83:     h_ip_offset[grid+1] = nip;
 84:     IPfdf_sz += Nq*nfloc*a_numCells[grid];
 85:     h_ipf_offset[grid+1] = IPfdf_sz;
 86:   }
 87:   Nf = a_species_offset[num_grids];
 88:   {
 89:     cudaMalloc((void **)&SData_d->B,      Nq*Nb*szf);     // kernel input
 90:     cudaMemcpy(          SData_d->B, BB,  Nq*Nb*szf,   cudaMemcpyHostToDevice);
 91:     cudaMalloc((void **)&SData_d->D,      Nq*Nb*dim*szf); // kernel input
 92:     cudaMemcpy(          SData_d->D, DD,  Nq*Nb*dim*szf,   cudaMemcpyHostToDevice);

 94:     cudaMalloc((void **)&SData_d->alpha, Nf*szf); // kernel input
 95:     cudaMalloc((void **)&SData_d->beta,  Nf*szf); // kernel input
 96:     cudaMalloc((void **)&SData_d->invMass,  Nf*szf); // kernel input

 98:     cudaMemcpy(SData_d->alpha,  nu_alpha, Nf*szf, cudaMemcpyHostToDevice);
 99:     cudaMemcpy(SData_d->beta,   nu_beta,  Nf*szf, cudaMemcpyHostToDevice);
100:     cudaMemcpy(SData_d->invMass,a_invMass,Nf*szf, cudaMemcpyHostToDevice);

102:     // collect geometry
103:     cudaMalloc((void **)&SData_d->invJ,   nip*dim*dim*szf); // kernel input
104:     cudaMemcpy(SData_d->invJ,   a_invJ,   nip*dim*dim*szf, cudaMemcpyHostToDevice);
105:     cudaMalloc((void **)&SData_d->x,      nip*szf);     // kernel input
106:     cudaMemcpy(          SData_d->x, a_x, nip*szf,   cudaMemcpyHostToDevice);
107:     cudaMalloc((void **)&SData_d->y,      nip*szf); // kernel input
108:     cudaMemcpy(          SData_d->y, a_y, nip*szf,   cudaMemcpyHostToDevice);
109: #if LANDAU_DIM==3
110:     cudaMalloc((void **)&SData_d->z,      nip*szf); // kernel input
111:     cudaMemcpy(          SData_d->z, a_z, nip*szf,   cudaMemcpyHostToDevice);
112: #endif
113:     cudaMalloc((void **)&SData_d->w,      nip*szf); // kernel input
114:     cudaMemcpy(          SData_d->w, a_w, nip*szf,   cudaMemcpyHostToDevice);

116:     cudaMalloc((void **)&SData_d->NCells,              num_grids*szi);
117:     cudaMemcpy(          SData_d->NCells, a_numCells,  num_grids*szi,   cudaMemcpyHostToDevice);
118:     cudaMalloc((void **)&SData_d->species_offset,                   (num_grids+1)*szi);
119:     cudaMemcpy(          SData_d->species_offset, a_species_offset, (num_grids+1)*szi,   cudaMemcpyHostToDevice);
120:     cudaMalloc((void **)&SData_d->mat_offset,                      (num_grids+1)*szi);
121:     cudaMemcpy(          SData_d->mat_offset, a_mat_offset,       (num_grids+1)*szi,   cudaMemcpyHostToDevice);
122:     cudaMalloc((void **)&SData_d->ip_offset,                       (num_grids+1)*szi);
123:     cudaMemcpy(          SData_d->ip_offset, h_ip_offset,          (num_grids+1)*szi,   cudaMemcpyHostToDevice);
124:     cudaMalloc((void **)&SData_d->ipf_offset,                     (num_grids+1)*szi);
125:     cudaMemcpy(          SData_d->ipf_offset, h_ipf_offset,     (num_grids+1)*szi,   cudaMemcpyHostToDevice);
126:     cudaMalloc((void **)&SData_d->elem_offset,                     (num_grids+1)*szi);
127:     cudaMemcpy(          SData_d->elem_offset, h_elem_offset,     (num_grids+1)*szi,   cudaMemcpyHostToDevice);
128:     cudaMalloc((void**)&SData_d->maps, num_grids*sizeof(P4estVertexMaps*));
129:     // allocate space for dynamic data once
130:     cudaMalloc((void **)&SData_d->Eq_m,                          Nf*szf); // this could be for each vertex (todo?)
131:     cudaMalloc((void **)&SData_d->f,      nip*Nf*szs*batch_sz); // for each vertex in batch
132:     cudaMalloc((void **)&SData_d->dfdx,   nip*Nf*szs*batch_sz);
133:     cudaMalloc((void **)&SData_d->dfdy,   nip*Nf*szs*batch_sz);
134: #if LANDAU_DIM==3
135:     cudaMalloc((void **)&SData_d->dfdz,   nip*Nf*szs*batch_sz);
136: #endif
137:   }
138:   return 0;
139: }

141: PetscErrorCode LandauCUDAStaticDataClear(LandauStaticData *SData_d)
142: {
143:   if (SData_d->alpha) {
144:     cudaFree(SData_d->alpha);
145:     SData_d->alpha = NULL;
146:     cudaFree(SData_d->beta);
147:     cudaFree(SData_d->invMass);
148:     cudaFree(SData_d->B);
149:     cudaFree(SData_d->D);
150:     cudaFree(SData_d->invJ);
151: #if LANDAU_DIM==3
152:     cudaFree(SData_d->z);
153: #endif
154:     cudaFree(SData_d->x);
155:     cudaFree(SData_d->y);
156:     cudaFree(SData_d->w);
157:     // dynamic data
158:     cudaFree(SData_d->Eq_m);
159:     cudaFree(SData_d->f);
160:     cudaFree(SData_d->dfdx);
161:     cudaFree(SData_d->dfdy);
162: #if LANDAU_DIM==3
163:     cudaFree(SData_d->dfdz);
164: #endif
165:     cudaFree(SData_d->NCells);
166:     cudaFree(SData_d->species_offset);
167:     cudaFree(SData_d->mat_offset);
168:     cudaFree(SData_d->ip_offset);
169:     cudaFree(SData_d->ipf_offset);
170:     cudaFree(SData_d->elem_offset);
171:     cudaFree(SData_d->maps);
172:   }
173:   return 0;
174: }
175: //
176: // The GPU Landau kernel
177: //
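// landau_form_fdf: evaluate the distribution f and its gradient at every integration point.
// Thread layout (from the indexing below): blockIdx.x is the batched vertex, blockIdx.y is the
// element index across all grids, threadIdx.y is the quadrature point and threadIdx.x strides
// over the local species. Input coefficients come either from the closure array d_vertex_f
// directly (no d_maps) or are gathered from the global vector through the assembly maps.
// Results are written to d_f, d_dfdx, d_dfdy (and d_dfdz in 3D).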
178: __global__
179: void landau_form_fdf(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_invJ[],
180:                      const PetscReal * const BB, const PetscReal * const DD, PetscScalar *d_vertex_f, P4estVertexMaps *d_maps[],
181:                      PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
182: #if LANDAU_DIM==3
183:                      PetscReal d_dfdz[],
184: #endif
185:                      const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[]) // output
186: {
187:   const PetscInt    Nq = blockDim.y, myQi = threadIdx.y;
188:   const PetscInt    b_elem_idx = blockIdx.y, b_id = blockIdx.x, IPf_sz_glb = d_ipf_offset[num_grids];
189:   const PetscReal   *Bq = &BB[myQi*Nb], *Dq = &DD[myQi*Nb*dim];
190:   PetscInt          grid = 0, f,d,b,e,q;
191:   while (b_elem_idx >= d_elem_offset[grid+1]) grid++;
192:   {
193:     const PetscInt     loc_nip = d_numCells[grid]*Nq, loc_Nf = d_species_offset[grid+1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
194:     const PetscInt     moffset = LAND_MOFFSET(b_id,grid,gridDim.x,num_grids,d_mat_offset);
195:     const PetscScalar  *coef;
196:     PetscReal          u_x[LANDAU_DIM];
197:     const PetscReal    *invJ = &d_invJ[(d_ip_offset[grid] + loc_elem*Nq + myQi)*dim*dim];
198:     PetscScalar        coef_buff[LANDAU_MAX_SPECIES*LANDAU_MAX_NQ];
199:     if (!d_maps) {
200:       coef = &d_vertex_f[b_id*IPf_sz_glb + d_ipf_offset[grid] + loc_elem*Nb*loc_Nf]; // closure and IP indexing are the same
201:     } else {
202:       coef = coef_buff;
203:       for (f = 0; f < loc_Nf ; ++f) {
204:         LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][f][0];
205:         for (b = 0; b < Nb; ++b) {
206:           PetscInt idx = Idxs[b];
207:           if (idx >= 0) {
208:             coef_buff[f*Nb+b] = d_vertex_f[idx+moffset];
209:           } else {
210:             idx = -idx - 1;
211:             coef_buff[f*Nb+b] = 0;
212:             for (q = 0; q < d_maps[grid]->num_face; q++) {
213:               PetscInt  id    = d_maps[grid]->c_maps[idx][q].gid;
214:               PetscReal scale = d_maps[grid]->c_maps[idx][q].scale;
215:               if (id >= 0) coef_buff[f*Nb+b] += scale*d_vertex_f[id+moffset];
216:             }
217:           }
218:         }
219:       }
220:     }

222:     /* get f and df */
223:     for (f = threadIdx.x; f < loc_Nf; f += blockDim.x) {
224:       PetscReal      refSpaceDer[LANDAU_DIM];
225:       const PetscInt idx = b_id*IPf_sz_glb + d_ipf_offset[grid] + f*loc_nip + loc_elem*Nq + myQi;
226:       d_f[idx] = 0.0;
227:       for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
228:       for (b = 0; b < Nb; ++b) {
229:         const PetscInt    cidx = b;
230:         d_f[idx] += Bq[cidx]*PetscRealPart(coef[f*Nb+cidx]);
231:         for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[cidx*dim+d]*PetscRealPart(coef[f*Nb+cidx]);
232:       }
233:       for (d = 0; d < dim; ++d) {
234:         for (e = 0, u_x[d] = 0.0; e < dim; ++e) {
235:           u_x[d] += invJ[e*dim+d]*refSpaceDer[e];
236:         }
237:         //printf("%d.%d.%d.%d %d) u_x[%d]=%g\n",b_id,grid,loc_elem,myQi,idx,d,u_x[d]);
238:       }
239:       d_dfdx[idx] = u_x[0];
240:       d_dfdy[idx] = u_x[1];
241: #if LANDAU_DIM==3
242:       d_dfdz[idx] = u_x[2];
243: #endif
244:     }
245:   }
246: }

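// landau_jac_kernel: accumulate the Landau operator for the quadrature point jpidx owned by this
// thread row (threadIdx.y) of one element. The ipidx loop below is the inner integral over all
// integration points of all grids; the partial sums gg2_temp/gg3_temp are reduced across the
// thread block with warp shuffles, scaled by nu_alpha, mapped to real space with invJj, and the
// element matrix is then either stored in elemMat (element-matrix path) or inserted directly into
// the device matrix d_mat with MatSetValuesDevice (GPU assembly path via d_maps).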
248: __device__ void
249: landau_jac_kernel(const PetscInt num_grids, const PetscInt jpidx, PetscInt nip_global, const PetscInt grid,
250:            const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], const PetscReal invJj[],
251:            const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
252:            const PetscReal * const BB, const PetscReal * const DD, PetscScalar *elemMat, P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, // output
253:            PetscScalar s_fieldMats[][LANDAU_MAX_NQ], // all these arrays are in shared memory
254:            PetscReal s_scale[][LANDAU_MAX_Q_FACE],
255:            PetscInt  s_idx[][LANDAU_MAX_Q_FACE],
256:            PetscReal s_g2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
257:            PetscReal s_g3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
258:            PetscReal s_gg2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
259:            PetscReal s_gg3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
260:            PetscReal s_nu_alpha[],
261:            PetscReal s_nu_beta[],
262:            PetscReal s_invMass[],
263:            PetscReal s_f[],
264:            PetscReal s_dfx[],
265:            PetscReal s_dfy[],
266:            PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[], // global memory
267: #if LANDAU_DIM==3
268:            const PetscReal zz[], PetscReal s_dfz[], PetscReal d_dfdz[],
269: #endif
270:            const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
271: {
272:   const PetscInt  Nq = blockDim.y, myQi = threadIdx.y;
273:   const PetscInt  b_elem_idx=blockIdx.y, b_id=blockIdx.x, IPf_sz_glb=d_ipf_offset[num_grids];
274:   const PetscInt  loc_Nf = d_species_offset[grid+1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
275:   const PetscInt  moffset = LAND_MOFFSET(b_id,grid,gridDim.x,num_grids,d_mat_offset);
276:   int             delta,d,f,g,d2,dp,d3,fieldA,ipidx_b;
277:   PetscReal       gg2_temp[LANDAU_DIM], gg3_temp[LANDAU_DIM][LANDAU_DIM];
278: #if LANDAU_DIM==2
279:   const PetscReal vj[3] = {xx[jpidx], yy[jpidx]};
280:   constexpr int dim = 2;
281: #else
282:   const PetscReal vj[3] = {xx[jpidx], yy[jpidx], zz[jpidx]};
283:   constexpr int dim = 3;
284: #endif
285:   const PetscInt  f_off = d_species_offset[grid], Nb=Nq;
286:   // create g2 & g3
287:   for (f=threadIdx.x; f<loc_Nf; f+=blockDim.x) {
288:     for (d=0;d<dim;d++) { // clear accumulation data D & K
289:       s_gg2[d][myQi][f] = 0;
290:       for (d2=0;d2<dim;d2++) s_gg3[d][d2][myQi][f] = 0;
291:     }
292:   }
293:   #pragma unroll
294:   for (d2 = 0; d2 < dim; d2++) {
295:     gg2_temp[d2] = 0;
296:     #pragma unroll
297:     for (d3 = 0; d3 < dim; d3++) {
298:       gg3_temp[d2][d3] = 0;
299:     }
300:   }
301:   if (threadIdx.y == 0) {
302:     // copy species into shared memory
303:     for (fieldA = threadIdx.x; fieldA < Nftot; fieldA += blockDim.x) {
304:       s_nu_alpha[fieldA] = nu_alpha[fieldA];
305:       s_nu_beta[fieldA] = nu_beta[fieldA];
306:       s_invMass[fieldA] = invMass[fieldA];
307:     }
308:   }
309:   __syncthreads();
310:   // inner integral, collect gg2/3
311:   for (ipidx_b = 0; ipidx_b < nip_global; ipidx_b += blockDim.x) {
312:     const PetscInt ipidx = ipidx_b + threadIdx.x;
313:     PetscInt f_off_r,grid_r,loc_Nf_r,nip_loc_r,ipidx_g,fieldB,IPf_idx_r;
314:     __syncthreads();
315:     if (ipidx < nip_global) {
316:       grid_r = 0;
317:       while (ipidx >= d_ip_offset[grid_r+1]) grid_r++;
318:       f_off_r = d_species_offset[grid_r];
319:       ipidx_g = ipidx - d_ip_offset[grid_r];
320:       nip_loc_r = d_numCells[grid_r]*Nq;
321:       loc_Nf_r = d_species_offset[grid_r+1] - d_species_offset[grid_r];
322:       IPf_idx_r = b_id*IPf_sz_glb + d_ipf_offset[grid_r] + ipidx_g;
323:       for (fieldB = threadIdx.y; fieldB < loc_Nf_r ; fieldB += blockDim.y) {
324:         const PetscInt idx = IPf_idx_r + fieldB*nip_loc_r;
325:         s_f  [fieldB*blockDim.x + threadIdx.x] =    d_f[idx]; // all vector threads get copy of data
326:         //printf("\t\td_f %d.%d.%d.%d.%d) %d) d_f[%d]=%e == s_f[%d]\n",b_id,grid,loc_elem,myQi,fieldB,ipidx,idx,d_f[idx],fieldB*blockDim.x + threadIdx.x);
327:         s_dfx[fieldB*blockDim.x + threadIdx.x] = d_dfdx[idx];
328:         s_dfy[fieldB*blockDim.x + threadIdx.x] = d_dfdy[idx];
329: #if LANDAU_DIM==3
330:         s_dfz[fieldB*blockDim.x + threadIdx.x] = d_dfdz[idx];
331: #endif
332:       }
333:     }
334:     __syncthreads();
335:     if (ipidx < nip_global) {
336:       const PetscReal wi = ww[ipidx], x = xx[ipidx], y = yy[ipidx];
337:       PetscReal       temp1[3] = {0, 0, 0}, temp2 = 0;
338: #if LANDAU_DIM==2
339:       PetscReal Ud[2][2], Uk[2][2], mask = (PetscAbs(vj[0]-x) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1]-y) < 100*PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
340:       LandauTensor2D(vj, x, y, Ud, Uk, mask);
341: #else
342:       PetscReal U[3][3], z = zz[ipidx], mask = (PetscAbs(vj[0]-x) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1]-y) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[2]-z) < 100*PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
343:       LandauTensor3D(vj, x, y, z, U, mask);
344: #endif
345:       for (int fieldB = 0; fieldB < loc_Nf_r; fieldB++) {
346:         temp1[0] += s_dfx[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
347:         temp1[1] += s_dfy[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
348: #if LANDAU_DIM==3
349:         temp1[2] += s_dfz[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
350: #endif
351:         temp2    += s_f  [fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r];
352:         //printf("\t\t\t temp2 %d.%d.%d * %d * %d: ipidx %d temp2=%e\n",b_id,grid,loc_elem,myQi,fieldB, ipidx, temp2);
353:       }
354:       temp1[0] *= wi;
355:       temp1[1] *= wi;
356: #if LANDAU_DIM==3
357:       temp1[2] *= wi;
358: #endif
359:       temp2    *= wi;
360: #if LANDAU_DIM==2
361:       #pragma unroll
362:       for (d2 = 0; d2 < 2; d2++) {
363:         #pragma unroll
364:         for (d3 = 0; d3 < 2; ++d3) {
365:           /* K = U * grad(f): g2=e: i,A */
366:           gg2_temp[d2] += Uk[d2][d3]*temp1[d3];
367:           /* D = -U * (I \kron (fx)): g3=f: i,j,A */
368:           gg3_temp[d2][d3] += Ud[d2][d3]*temp2;
369:         }
370:       }
371: #else
372:       #pragma unroll
373:       for (d2 = 0; d2 < 3; ++d2) {
374:         #pragma unroll
375:         for (d3 = 0; d3 < 3; ++d3) {
376:           /* K = U * grad(f): g2 = e: i,A */
377:           gg2_temp[d2] += U[d2][d3]*temp1[d3];
378:           /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
379:           gg3_temp[d2][d3] += U[d2][d3]*temp2;
380:         }
381:       }
382: #endif
383:     }
384:   } /* IPs */

386:   /* reduce gg temp sums across threads */
387:   for (delta = blockDim.x/2; delta > 0; delta /= 2) {
388:     #pragma unroll
389:     for (d2 = 0; d2 < dim; d2++) {
390:       gg2_temp[d2] += __shfl_xor_sync(0xffffffff, gg2_temp[d2], delta, blockDim.x);
391:       #pragma unroll
392:       for (d3 = 0; d3 < dim; d3++) {
393:         gg3_temp[d2][d3] += __shfl_xor_sync(0xffffffff, gg3_temp[d2][d3], delta, blockDim.x);
394:       }
395:     }
396:   }
397:   // add alpha and put in gg2/3
398:   for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) {
399:     #pragma unroll
400:     for (d2 = 0; d2 < dim; d2++) {
401:       s_gg2[d2][myQi][fieldA] += gg2_temp[d2]*s_nu_alpha[fieldA+f_off];
402:       #pragma unroll
403:       for (d3 = 0; d3 < dim; d3++) {
404:         s_gg3[d2][d3][myQi][fieldA] -= gg3_temp[d2][d3]*s_nu_alpha[fieldA+f_off]*s_invMass[fieldA+f_off];
405:       }
406:     }
407:   }
408:   __syncthreads();
409:   /* add electric field term once per IP */
410:   for (fieldA = threadIdx.x ; fieldA < loc_Nf ; fieldA += blockDim.x) {
411:     s_gg2[dim-1][myQi][fieldA] += Eq_m[fieldA+f_off];
412:   }
413:   __syncthreads();
414:   /* Jacobian transform - g2 */
415:   for (fieldA = threadIdx.x ; fieldA < loc_Nf ; fieldA += blockDim.x) {
416:     PetscReal wj = ww[jpidx];
417:     for (d = 0; d < dim; ++d) {
418:       s_g2[d][myQi][fieldA] = 0.0;
419:       for (d2 = 0; d2 < dim; ++d2) {
420:         s_g2[d][myQi][fieldA] += invJj[d*dim+d2]*s_gg2[d2][myQi][fieldA];
421:         s_g3[d][d2][myQi][fieldA] = 0.0;
422:         for (d3 = 0; d3 < dim; ++d3) {
423:           for (dp = 0; dp < dim; ++dp) {
424:             s_g3[d][d2][myQi][fieldA] += invJj[d*dim + d3]*s_gg3[d3][dp][myQi][fieldA]*invJj[d2*dim + dp];
425:           }
426:         }
427:         s_g3[d][d2][myQi][fieldA] *= wj;
428:       }
429:       s_g2[d][myQi][fieldA] *= wj;
430:     }
431:   }
432:   __syncthreads();  // Synchronize (ensure all the data is available) and sum IP matrices
433:   /* FE matrix construction */
434:   {
435:     int fieldA,d,qj,d2,q,idx,totDim=Nb*loc_Nf;
436:     /* assemble */
437:     for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
438:       for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
439:         for (g = threadIdx.x; g < Nb; g += blockDim.x) {
440:           PetscScalar t = 0;
441:           for (qj = 0 ; qj < Nq ; qj++) {
442:             const PetscReal *BJq = &BB[qj*Nb], *DIq = &DD[qj*Nb*dim];
443:             for (d = 0; d < dim; ++d) {
444:               t += DIq[f*dim+d]*s_g2[d][qj][fieldA]*BJq[g];
445:               for (d2 = 0; d2 < dim; ++d2) {
446:                 t += DIq[f*dim + d]*s_g3[d][d2][qj][fieldA]*DIq[g*dim + d2];
447:               }
448:             }
449:           }
450:           if (elemMat) {
451:             const PetscInt fOff = (fieldA*Nb + f)*totDim + fieldA*Nb + g;
452:             elemMat[fOff] += t; // ????
453:           } else s_fieldMats[f][g] = t;
454:         }
455:       }
456:       if (s_fieldMats) {
457:         PetscScalar vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE];
458:         PetscInt    nr,nc;
459:         const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
460:         __syncthreads();
461:         if (threadIdx.y == 0) {
462:           for (f = threadIdx.x; f < Nb ; f += blockDim.x) {
463:             idx = Idxs[f];
464:             if (idx >= 0) {
465:               s_idx[f][0] = idx + moffset;
466:               s_scale[f][0] = 1.;
467:             } else {
468:               idx = -idx - 1;
469:               for (q = 0; q < d_maps[grid]->num_face; q++) {
470:                 if (d_maps[grid]->c_maps[idx][q].gid >= 0) s_idx[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
471:                 else s_idx[f][q] = -1;
472:                 s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
473:               }
474:             }
475:           }
476:         }
477:         __syncthreads();
478:         for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
479:           idx = Idxs[f];
480:           if (idx >= 0) {
481:             nr = 1;
482:           } else {
483:             nr = d_maps[grid]->num_face;
484:           }
485:           for (g = threadIdx.x; g < Nb; g += blockDim.x) {
486:             idx = Idxs[g];
487:             if (idx >= 0) {
488:               nc = 1;
489:             } else {
490:               nc = d_maps[grid]->num_face;
491:             }
492:             for (q = 0; q < nr; q++) {
493:               for (d = 0; d < nc; d++) {
494:                 vals[q*nc + d] = s_scale[f][q]*s_scale[g][d]*s_fieldMats[f][g];
495:               }
496:             }
497:             MatSetValuesDevice(d_mat,nr,s_idx[f],nc,s_idx[g],vals,ADD_VALUES);
498:           }
499:         }
500:         __syncthreads();
501:       }
502:     }
503:   }
504: }

506: //
507: // The CUDA Landau kernel
508: //
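// landau_jacobian: per-element driver. The dynamic shared memory buffer smem is partitioned below
// with the running offset `size` (counted in PetscReal entries; the launch passes ii*szf bytes):
// the g2/g3/gg2/gg3 accumulators, the species constants, staging buffers for f and its
// derivatives, and, when d_maps is set, scratch space for the field matrix, scales and indices
// used by device-side assembly.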
509: __global__
510: void __launch_bounds__(256,2)
511:   landau_jacobian(const PetscInt nip_global, const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal invJj[],
512:                   const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
513:                   const PetscReal * const BB, const PetscReal * const DD, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[],
514:                   PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
515: #if LANDAU_DIM==3
516:                   const PetscReal zz[], PetscReal d_dfdz[],
517: #endif
518:                   const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
519: {
520:   extern __shared__ PetscReal smem[];
521:   int size = 0;
522:   PetscReal (*s_g2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]              =
523:     (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES])             &smem[size];
524:   size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
525:   PetscReal (*s_g3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]  =
526:     (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
527:   size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
528:   PetscReal (*s_gg2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]             =
529:     (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES])             &smem[size];
530:   size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
531:   PetscReal (*s_gg3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
532:     (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
533:   size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
534:   PetscReal *s_nu_alpha = &smem[size];
535:   size += LANDAU_MAX_SPECIES;
536:   PetscReal *s_nu_beta  = &smem[size];
537:   size += LANDAU_MAX_SPECIES;
538:   PetscReal *s_invMass  = &smem[size];
539:   size += LANDAU_MAX_SPECIES;
540:   PetscReal *s_f        = &smem[size];
541:   size += blockDim.x*LANDAU_MAX_SPECIES;
542:   PetscReal *s_dfx      = &smem[size];
543:   size += blockDim.x*LANDAU_MAX_SPECIES;
544:   PetscReal *s_dfy      = &smem[size];
545:   size += blockDim.x*LANDAU_MAX_SPECIES;
546: #if LANDAU_DIM==3
547:   PetscReal *s_dfz      = &smem[size];
548:   size += blockDim.x*LANDAU_MAX_SPECIES;
549: #endif
550:   PetscScalar (*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
551:   PetscReal   (*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
552:   PetscInt    (*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
553:   const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x;
554:   PetscInt       Nq = blockDim.y, grid = 0; // Nq == Nb
555:   PetscScalar    *elemMat = NULL; /* my output */

557:   while (b_elem_idx >= d_elem_offset[grid+1]) grid++;
558:   {
559:     const PetscInt   loc_Nf = d_species_offset[grid+1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
560:     const PetscInt   myQi = threadIdx.y;
561:     const PetscInt   jpidx = d_ip_offset[grid] + myQi + loc_elem * Nq;
562:     const PetscReal  *invJ = &invJj[jpidx*dim*dim];
563:     if (d_elem_mats) {
564:       PetscInt totDim = loc_Nf*Nb;
565:       elemMat = d_elem_mats; // start at the beginning and advance to this element's matrix
566:       for (PetscInt b_id2=0 ; b_id2<b_id ; b_id2++) {
567:         for (PetscInt grid2=0 ; grid2<num_grids ; grid2++) {
568:           PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
569:           elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
570:         }
571:       }
572:       for (PetscInt grid2=0 ; grid2<grid ; grid2++) {
573:         PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
574:         elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
575:       }
576:       elemMat += loc_elem*totDim*totDim; // index into local matrix & zero out
577:       for (int i = threadIdx.x + threadIdx.y*blockDim.x; i < totDim*totDim; i += blockDim.x*blockDim.y) elemMat[i] = 0;
578:       //if (threadIdx.x) printf("\t %d.%d.%d landau_jacobian elem mat idx = %d\n",b_id,grid,loc_elem,(int)(elemMat-d_elem_mats));
579:     }
580:     __syncthreads();
581:     if (d_maps) {
582:       // reuse the space for fieldMats
583:       s_fieldMats = (PetscScalar (*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) &smem[size];
584:       size += LANDAU_MAX_NQ*LANDAU_MAX_NQ;
585:       s_scale =  (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) &smem[size];
586:       size += LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE;
587:       s_idx = (PetscInt (*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) &smem[size];
588:       size += LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE; // this is too big, idx is an integer
589:     } else {
590:       s_fieldMats = NULL; s_scale = NULL; s_idx = NULL; // unused on this path; avoid passing uninitialized pointers below
591:     }
592:     __syncthreads();
593:     landau_jac_kernel(num_grids, jpidx, nip_global, grid, xx, yy, ww,
594:                invJ, Nftot, nu_alpha, nu_beta, invMass, Eq_m, BB, DD,
595:                elemMat, d_maps, d_mat,
596:                *s_fieldMats, *s_scale, *s_idx,
597:                *s_g2, *s_g3, *s_gg2, *s_gg3,
598:                s_nu_alpha, s_nu_beta, s_invMass,
599:                s_f, s_dfx, s_dfy, d_f, d_dfdx, d_dfdy,
600: #if LANDAU_DIM==3
601:                zz, s_dfz, d_dfdz,
602: #endif
603:                d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
604:   }
605: }

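// landau_mass: assemble the shift-scaled mass matrix from the basis functions BB and quadrature
// weights d_w; in 2D an extra factor of 2*pi accounts for the axisymmetric (cylindrical)
// formulation. The assembly logic (element matrices vs. direct MatSetValuesDevice through the
// maps) mirrors landau_jacobian above.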
607: __global__
608: void __launch_bounds__(256,4) landau_mass(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_w[], const PetscReal * const BB, const PetscReal * const DD,
609:                                           PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal shift,
610:                                           const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_elem_offset[])
611: {
612:   const PetscInt         Nq = blockDim.y, b_elem_idx = blockIdx.y, b_id = blockIdx.x;
613:   __shared__ PetscScalar s_fieldMats[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
614:   __shared__ PetscInt    s_idx[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
615:   __shared__ PetscReal   s_scale[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
616:   PetscScalar            *elemMat = NULL; /* my output */
617:   PetscInt               fieldA,d,qj,q,idx,f,g, grid = 0;

619:   while (b_elem_idx >= d_elem_offset[grid+1]) grid++;
620:   {
621:     const PetscInt loc_Nf = d_species_offset[grid+1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
622:     const PetscInt moffset = LAND_MOFFSET(b_id,grid,gridDim.x,num_grids,d_mat_offset), totDim = loc_Nf*Nq;
623:     if (d_elem_mats) {
624:       elemMat = d_elem_mats; // start at the beginning and advance to this element's matrix
625:       for (PetscInt b_id2=0 ; b_id2<b_id ; b_id2++) {
626:         for (PetscInt grid2=0 ; grid2<num_grids ; grid2++) {
627:           PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
628:           elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
629:         }
630:       }
631:       for (PetscInt grid2=0 ; grid2<grid ; grid2++) {
632:         PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
633:         elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
634:       }
635:       elemMat += loc_elem*totDim*totDim;
636:       for (int i = threadIdx.x + threadIdx.y*blockDim.x ; i < totDim*totDim; i += blockDim.x*blockDim.y) elemMat[i] = 0;
637:     }
638:     __syncthreads();
639:     /* FE mass matrix construction */
640:     for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
641:       PetscScalar            vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE];
642:       PetscInt               nr,nc;
643:       for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
644:         for (g = threadIdx.x; g < Nb; g += blockDim.x) {
645:           PetscScalar t = 0;
646:           for (qj = 0 ; qj < Nq ; qj++) {
647:             const PetscReal *BJq = &BB[qj*Nb];
648:             const PetscInt jpidx = d_ip_offset[grid] + qj + loc_elem * Nq;
649:             if (dim==2) {
650:               t += BJq[f] * d_w[jpidx]*shift * BJq[g] * 2. * PETSC_PI;
651:             } else {
652:               t += BJq[f] * d_w[jpidx]*shift * BJq[g];
653:             }
654:           }
655:           if (elemMat) {
656:             const PetscInt fOff = (fieldA*Nb + f)*totDim + fieldA*Nb + g;
657:             elemMat[fOff] += t; // ????
658:           } else s_fieldMats[f][g] = t;
659:         }
660:       }
661:       if (!elemMat) {
662:         const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
663:         __syncthreads();
664:         if (threadIdx.y == 0) {
665:           for (f = threadIdx.x; f < Nb ; f += blockDim.x) {
666:             idx = Idxs[f];
667:             if (idx >= 0) {
668:               s_idx[f][0] = idx + moffset;
669:               s_scale[f][0] = 1.;
670:             } else {
671:               idx = -idx - 1;
672:               for (q = 0; q < d_maps[grid]->num_face; q++) {
673:                 if (d_maps[grid]->c_maps[idx][q].gid >= 0) s_idx[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
674:                 else s_idx[f][q] = -1;
675:                 s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
676:               }
677:             }
678:           }
679:         }
680:         __syncthreads();
681:         for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
682:           idx = Idxs[f];
683:           if (idx >= 0) {
684:             nr = 1;
685:           } else {
686:             nr = d_maps[grid]->num_face;
687:           }
688:           for (g = threadIdx.x; g < Nb; g += blockDim.x) {
689:             idx = Idxs[g];
690:             if (idx >= 0) {
691:               nc = 1;
692:             } else {
693:               nc = d_maps[grid]->num_face;
694:             }
695:             for (q = 0; q < nr; q++) {
696:               for (d = 0; d < nc; d++) {
697:                 vals[q*nc + d] = s_scale[f][q]*s_scale[g][d]*s_fieldMats[f][g];
698:               }
699:             }
700:             MatSetValuesDevice(d_mat,nr,s_idx[f],nc,s_idx[g],vals,ADD_VALUES);
701:           }
702:         }
703:       }
704:       __syncthreads();
705:     }
706:   }
707: }

709: PetscErrorCode LandauCUDAJacobian(DM plex[], const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, const PetscInt a_numCells[], PetscReal a_Eq_m[], PetscScalar a_elem_closure[],
710:                                   const PetscScalar a_xarray[], const LandauStaticData *SData_d, const PetscReal shift,
711:                                   const PetscLogEvent events[], const PetscInt a_mat_offset[], const PetscInt a_species_offset[], Mat subJ[], Mat JacP)
712: {
713:   cudaError_t       cerr;
714:   PetscInt          Nb=Nq,dim,nip_global,num_cells_batch,elem_mat_size_tot;
715:   PetscInt          *d_numCells, *d_species_offset, *d_mat_offset, *d_ip_offset, *d_ipf_offset, *d_elem_offset;
716:   PetscInt          szf=sizeof(PetscReal),szs=sizeof(PetscScalar),Nftot=a_species_offset[num_grids];
717:   PetscReal         *d_BB=NULL,*d_DD=NULL,*d_invJj=NULL,*d_nu_alpha=NULL,*d_nu_beta=NULL,*d_invMass=NULL,*d_Eq_m=NULL,*d_x=NULL,*d_y=NULL,*d_w=NULL;
718:   PetscScalar       *d_elem_mats=NULL,*d_vertex_f=NULL;
719:   PetscReal         *d_f=NULL,*d_dfdx=NULL,*d_dfdy=NULL;
720: #if LANDAU_DIM==3
721:   PetscReal         *d_dfdz=NULL, *d_z = NULL;
722: #endif
723:   LandauCtx         *ctx;
724:   PetscSplitCSRDataStructure d_mat=NULL;
725:   P4estVertexMaps   **d_maps,*maps[LANDAU_MAX_GRIDS];
726:   int               nnn = 256/Nq; // machine dependent
727:   PetscContainer    container;

729:   PetscLogEventBegin(events[3],0,0,0,0);
730:   while (nnn & nnn - 1) nnn = nnn & nnn - 1; // round the vector size down to a power of two (the shuffle reduction assumes this)
731:   if (nnn>16) nnn = 16; // printf("DEBUG\n");
732:   DMGetApplicationContext(plex[0], &ctx);
734:   DMGetDimension(plex[0], &dim);
736:   if (ctx->gpu_assembly) {
737:     PetscObjectQuery((PetscObject) JacP, "assembly_maps", (PetscObject *) &container);
738:     if (container) { // not present on the first call
739:       static int init = 0; // hack. just do every time, or put in setup (but that is in base class code), or add init_maps flag
740:       if (!init++) {
741:         P4estVertexMaps   *h_maps=NULL;
742:         PetscContainerGetPointer(container, (void **) &h_maps);
743:         for (PetscInt grid=0 ; grid<num_grids ; grid++) {
744:           if (h_maps[grid].d_self) {
745:             maps[grid] = h_maps[grid].d_self;
746:           } else {
747:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
748:           }
749:         }
750:         cudaMemcpy(SData_d->maps, maps, num_grids*sizeof(P4estVertexMaps*), cudaMemcpyHostToDevice);
751:       }
752:       d_maps = (P4estVertexMaps**)SData_d->maps;
753:       // this does the setup the first time called
754:       MatCUSPARSEGetDeviceMatWrite(JacP,&d_mat);
755:     } else {
756:       d_maps = NULL;
757:     }
758:   } else {
759:     container = NULL;
760:     d_maps = NULL;
761:   }
762:   PetscLogEventEnd(events[3],0,0,0,0);
763:   {
764:     PetscInt elem_mat_size = 0;
765:     nip_global = num_cells_batch = 0;
766:     for (PetscInt grid=0 ; grid<num_grids ; grid++) {
767:       PetscInt Nfloc = a_species_offset[grid+1] - a_species_offset[grid], totDim = Nfloc*Nb;
768:       nip_global += a_numCells[grid]*Nq;
769:       num_cells_batch += a_numCells[grid]; // is in d_elem_offset, but not on host
770:       elem_mat_size += a_numCells[grid]*totDim*totDim; // could save in an offset here -- batch major ordering
771:     }
772:     elem_mat_size_tot = d_maps ? 0 : elem_mat_size;
773:   }
774:   dim3 dimGrid(batch_sz,num_cells_batch);
775:   if (elem_mat_size_tot) {
776:     cudaMalloc((void **)&d_elem_mats, batch_sz*elem_mat_size_tot*szs); // kernel output - first call is on CPU
777:   } else d_elem_mats = NULL;
778:   // create data
779:   d_BB = (PetscReal*)SData_d->B;
780:   d_DD = (PetscReal*)SData_d->D;
781:   if (a_elem_closure || a_xarray) {  // form f and df
782:     PetscLogEventBegin(events[1],0,0,0,0);
783:     cudaMemcpy(SData_d->Eq_m, a_Eq_m,   Nftot*szf, cudaMemcpyHostToDevice);
784:     d_invJj = (PetscReal*)SData_d->invJ;
785:     d_nu_alpha = (PetscReal*)SData_d->alpha;
786:     d_nu_beta = (PetscReal*)SData_d->beta;
787:     d_invMass = (PetscReal*)SData_d->invMass;
788:     d_x = (PetscReal*)SData_d->x;
789:     d_y = (PetscReal*)SData_d->y;
790:     d_w = (PetscReal*)SData_d->w;
791:     d_Eq_m = (PetscReal*)SData_d->Eq_m;
792:     d_dfdx = (PetscReal*)SData_d->dfdx;
793:     d_dfdy = (PetscReal*)SData_d->dfdy;
794: #if LANDAU_DIM==3
795:     d_dfdz = (PetscReal*)SData_d->dfdz;
796:     d_z = (PetscReal*)SData_d->z;
797: #endif
798:     d_f    = (PetscReal*)SData_d->f;
799:     // get a d_vertex_f
800:     if (a_elem_closure) {
801:       PetscInt closure_sz = 0; // argh, don't have this on the host!!!
802:       for (PetscInt grid=0 ; grid<num_grids ; grid++) {
803:         PetscInt nfloc = a_species_offset[grid+1] - a_species_offset[grid];
804:         closure_sz += Nq*nfloc*a_numCells[grid];
805:       }
806:       closure_sz *= batch_sz;
807:       cudaMalloc((void **)&d_vertex_f, closure_sz * sizeof(*a_elem_closure));
808:       cudaMemcpy(d_vertex_f, a_elem_closure, closure_sz*sizeof(*a_elem_closure), cudaMemcpyHostToDevice);
809:     } else {
810:       d_vertex_f = (PetscScalar*)a_xarray;
811:     }
812:     PetscLogEventEnd(events[1],0,0,0,0);
813:   } else {
814:     d_w = (PetscReal*)SData_d->w; // mass just needs the weights
815:   }
816:   //
817:   d_numCells = (PetscInt*)SData_d->NCells; // redundant -- remove
818:   d_species_offset = (PetscInt*)SData_d->species_offset;
819:   d_mat_offset = (PetscInt*)SData_d->mat_offset;
820:   d_ip_offset = (PetscInt*)SData_d->ip_offset;
821:   d_ipf_offset = (PetscInt*)SData_d->ipf_offset;
822:   d_elem_offset = (PetscInt*)SData_d->elem_offset;
823:   if (a_elem_closure || a_xarray) {  // form f and df
824:     dim3 dimBlockFDF(nnn>Nftot ? Nftot : nnn, Nq), dimBlock((nip_global>nnn) ? nnn : nip_global, Nq);
825:     PetscLogEventBegin(events[8],0,0,0,0);
826:     PetscLogGpuTimeBegin();
827:     PetscInfo(plex[0], "Form F and dF/dx vectors: nip_global=%D num_grids=%D\n",nip_global,num_grids);
828:     landau_form_fdf<<<dimGrid,dimBlockFDF>>>(dim, Nb, num_grids, d_invJj, d_BB, d_DD, d_vertex_f, d_maps, d_f, d_dfdx, d_dfdy,
829: #if LANDAU_DIM==3
830:                                              d_dfdz,
831: #endif
832:                                              d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
833:     CHECK_LAUNCH_ERROR();
834:     PetscLogGpuFlops(batch_sz*nip_global*(PetscLogDouble)(2*Nb*(1+dim)));
835:     if (a_elem_closure) {
836:       cudaFree(d_vertex_f);
837:       d_vertex_f = NULL;
838:     }
839:     PetscLogGpuTimeEnd();
840:     PetscLogEventEnd(events[8],0,0,0,0);
841:     // Jacobian
842:     PetscLogEventBegin(events[4],0,0,0,0);
843:     PetscLogGpuTimeBegin();
844:     PetscLogGpuFlops(batch_sz*nip_global*(PetscLogDouble)(a_elem_closure ? (nip_global*(11*Nftot+ 4*dim*dim) + 6*Nftot*dim*dim*dim + 10*Nftot*dim*dim + 4*Nftot*dim + Nb*Nftot*Nb*Nq*dim*dim*5) : Nb*Nftot*Nb*Nq*4));
845:     PetscInt ii = 2*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*(1+LANDAU_DIM) + 3*LANDAU_MAX_SPECIES + (1+LANDAU_DIM)*dimBlock.x*LANDAU_MAX_SPECIES + LANDAU_MAX_NQ*LANDAU_MAX_NQ + 2*LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE;
846:     if (ii*szf >= 49152) {
847:       cerr = cudaFuncSetAttribute(landau_jacobian,
848:                                   cudaFuncAttributeMaxDynamicSharedMemorySize,
849:                                   98304);CHKERRCUDA(cerr);
850:     }
851:     PetscInfo(plex[0], "Jacobian shared memory size: %D bytes, d_elem_mats=%p d_maps=%p\n",ii*szf,d_elem_mats,d_maps);
852:     landau_jacobian<<<dimGrid,dimBlock,ii*szf>>>(nip_global, dim, Nb, num_grids, d_invJj, Nftot, d_nu_alpha, d_nu_beta, d_invMass, d_Eq_m,
853:                                                  d_BB, d_DD, d_x, d_y, d_w,
854:                                                  d_elem_mats, d_maps, d_mat, d_f, d_dfdx, d_dfdy,
855: #if LANDAU_DIM==3
856:                                                  d_z, d_dfdz,
857: #endif
858:                                                  d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
859:     CHECK_LAUNCH_ERROR(); // has sync
860:     PetscLogGpuTimeEnd();
861:     PetscLogEventEnd(events[4],0,0,0,0);
862:   } else { // mass
863:     dim3 dimBlock(nnn,Nq);
864:     PetscInfo(plex[0], "Mass d_maps = %p. Nq=%D, vector size %d, num_cells_batch=%D\n",d_maps,Nq,nnn,num_cells_batch);
865:     PetscLogEventBegin(events[16],0,0,0,0);
866:     PetscLogGpuTimeBegin();
867:     landau_mass<<<dimGrid,dimBlock>>>(dim, Nb, num_grids, d_w, d_BB, d_DD, d_elem_mats,
868:                                       d_maps, d_mat, shift, d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_elem_offset);
869:     CHECK_LAUNCH_ERROR(); // has sync
870:     PetscLogGpuTimeEnd();
871:     PetscLogEventEnd(events[16],0,0,0,0);
872:   }
873:   // First time assembly with or without GPU assembly
874:   if (d_elem_mats) {
875:     PetscInt elem_mats_idx = 0;
876:     for (PetscInt b_id = 0 ; b_id < batch_sz ; b_id++) { // OpenMP (once)
877:       for (PetscInt grid=0 ; grid<num_grids ; grid++) {  // elem_mats_idx += totDim*totDim*a_numCells[grid];
878:         const PetscInt    Nfloc = a_species_offset[grid+1]-a_species_offset[grid], totDim = Nfloc*Nq;
879:         PetscScalar       *elemMats = NULL, *elMat;
880:         PetscSection      section, globalSection;
881:         PetscInt          cStart,cEnd,ej;
882:         PetscInt          moffset = LAND_MOFFSET(b_id,grid,batch_sz,num_grids,a_mat_offset), nloc, nzl, colbuf[1024], row;
883:         const PetscInt    *cols;
884:         const PetscScalar *vals;
885:         Mat               B = subJ[ LAND_PACK_IDX(b_id,grid) ];
886:         PetscLogEventBegin(events[5],0,0,0,0);
887:         DMPlexGetHeightStratum(plex[grid],0,&cStart,&cEnd);
888:         DMGetLocalSection(plex[grid], &section);
889:         DMGetGlobalSection(plex[grid], &globalSection);
890:         PetscMalloc1(totDim*totDim*a_numCells[grid],&elemMats);
891:         cudaMemcpy(elemMats, &d_elem_mats[elem_mats_idx], totDim*totDim*a_numCells[grid]*sizeof(*elemMats), cudaMemcpyDeviceToHost);
892:         PetscLogEventEnd(events[5],0,0,0,0);
893:         PetscLogEventBegin(events[6],0,0,0,0);
894:         for (ej = cStart, elMat = elemMats ; ej < cEnd; ++ej, elMat += totDim*totDim) {
895:           DMPlexMatSetClosure(plex[grid], section, globalSection, B, ej, elMat, ADD_VALUES);
896:           if (ej==-1) {
897:             int d,f;
898:             PetscPrintf(PETSC_COMM_SELF,"GPU Element matrix\n");
899:             for (d = 0; d < totDim; ++d) {
900:               for (f = 0; f < totDim; ++f) PetscPrintf(PETSC_COMM_SELF," %12.5e",  PetscRealPart(elMat[d*totDim + f]));
901:               PetscPrintf(PETSC_COMM_SELF,"\n");
902:             }
903:             //exit(14);
904:           }
905:         }
906:         PetscFree(elemMats);
907:         MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
908:         MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
909:         // move nest matrix to global JacP
910:         MatGetSize(B, &nloc, NULL);
911:         for (int i=0 ; i<nloc ; i++) {
912:           MatGetRow(B,i,&nzl,&cols,&vals);
913:           if (nzl > 1024) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row too large: %D > 1024", nzl); // guard the fixed-size colbuf
914:           for (int j=0; j<nzl; j++) colbuf[j] = cols[j] + moffset;
915:           row = i + moffset;
916:           MatSetValues(JacP,1,&row,nzl,colbuf,vals,ADD_VALUES);
917:           MatRestoreRow(B,i,&nzl,&cols,&vals);
918:         }
919:         MatDestroy(&B);
920:         PetscLogEventEnd(events[6],0,0,0,0);
921:         elem_mats_idx += totDim*totDim*a_numCells[grid]; // this can be a stored offset?
922:       } // grids
923:     }
925:     cudaFree(d_elem_mats);
926:   }

928:   return 0;
929: }