Actual source code: ex2.c
static char help[] = "Test SF CUDA stream synchronization in device to host communication\n\n";

/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, SF performs
  these operations in the default stream and does not synchronize them, since it assumes the routines
  that consume the destination data are also on the default stream. However, when the destination
  data is on the CPU, SF must guarantee the data is ready to use on the CPU after PetscSFXxxEnd().
*/
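
/*
  Conceptually, the device-to-host path has to end with a host-visible synchronization before the
  destination buffer is handed to CPU code. A minimal sketch of that pattern in the raw CUDA runtime
  API (illustrative only, not PETSc's actual implementation; hostbuf, devbuf, bytes and stream are
  hypothetical names):

    // the copy is merely enqueued on the stream; host execution races ahead of it
    cudaMemcpyAsync(hostbuf, devbuf, bytes, cudaMemcpyDeviceToHost, stream);
    // without this fence, reading hostbuf on the CPU may observe stale data
    cudaStreamSynchronize(stream);
*/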

#include <petscvec.h>

int main(int argc, char **argv)
{
  PetscInt           i, n = 100000; /* Big enough to make the asynchronous copy meaningful */
  PetscScalar       *val;
  const PetscScalar *yval;
  Vec                x, y;
  PetscMPIInt        size;
  IS                 ix, iy;
  VecScatter         vscat;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
  PetscCheck(size == 1, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "This is a uniprocessor test");

  /* Create two CUDA vectors x, y. Though we only care about y's memory on the host, we make y a
     CUDA vector too, since we want y's host memory to be pinned (i.e., non-pageable), to really
     trigger an asynchronous cudaMemcpyDeviceToHost.
  */
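
  /*
     Why pinning matters (general CUDA semantics, not PETSc-specific): an asynchronous
     device-to-host copy into pageable host memory is forced to complete before the copy call
     returns, which would mask a missing synchronization. Only with page-locked (pinned) host
     memory can the transfer be a true asynchronous DMA, which is what this test must provoke.
  */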

  PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &x));
  PetscCall(VecSetFromOptions(x));
  PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &y));
  PetscCall(VecSetFromOptions(y));

  /* Initialize x and y, and push them to the GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
  PetscCall(VecGetArray(x, &val));
  for (i = 0; i < n; i++) val[i] = i / 2.0;
  PetscCall(VecRestoreArray(x, &val));
  PetscCall(VecScale(x, 2.0));
  PetscCall(VecSet(y, 314));

  /* Pull y to the CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
  PetscCall(VecGetArray(y, &val));
  PetscCall(VecRestoreArray(y, &val));
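
  /*
     Note: on a device vector, VecGetArray() returns the host copy of the data (migrating it from
     the GPU if needed) and VecRestoreArray() marks that host copy as the valid one, so this pair
     of calls flips the offloadmask without changing any values.
  */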

  /* The vscat is simply a vector copy */
  PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix));
  PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy));
  PetscCall(VecScatterCreate(x, ix, y, iy, &vscat));

  /* Do a device-to-host vecscatter and then immediately use y on the host. VecScatter/SF may use
     asynchronous cudaMemcpy or kernels, but it must guarantee y is ready to use on the host.
     Otherwise, wrong data will be displayed.
  */
  PetscCall(VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecGetArrayRead(y, &yval));
  /* Display the first and the last entries of y to see if it is valid on the host */
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1])));
  PetscCall(VecRestoreArrayRead(y, &yval));
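
  /*
     Since x[i] = (i/2.0)*2.0 = i, a correctly synchronized scatter prints
       y[0]=0, y[99999] = 99999
     while a missing stream synchronization could leave y displaying its stale initial value 314.
  */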

  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&y));
  PetscCall(ISDestroy(&ix));
  PetscCall(ISDestroy(&iy));
  PetscCall(VecScatterDestroy(&vscat));
  PetscCall(PetscFinalize());
  return 0;
}

/*TEST

   test:
      requires: cuda
      diff_args: -j
      # make sure the host memory is pinned
      # -sf_backend cuda is not needed if compiling only with cuda
      args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

   test:
      suffix: hip
      requires: hip
      diff_args: -j
      output_file: output/ex2_1.out
      # make sure the host memory is pinned
      # -sf_backend hip is not needed if compiling only with hip
      args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/
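
/*
  To run this test by hand (executable name/path illustrative):
    ./ex2 -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0
  Setting -vec_pinned_memory_min 0 lowers to zero the size threshold above which PETSc uses pinned
  host memory for device vectors, so y's host buffer is page-locked and the device-to-host copy can
  be genuinely asynchronous.
*/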