Actual source code: matcuda.cu
slepc-3.20.1 2023-11-27
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: BV implemented with a dense Mat (CUDA version)
12: */
14: #include <slepc/private/bvimpl.h>
15: #include <slepccublas.h>
16: #include "../src/sys/classes/bv/impls/mat/bvmat.h"
18: PetscErrorCode BVMult_Mat_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
19: {
20: BV_MAT *y = (BV_MAT*)Y->data,*x = (BV_MAT*)X->data;
21: const PetscScalar *d_px,*d_A,*d_B,*d_q;
22: PetscScalar *d_py,*d_C;
23: PetscInt ldq;
25: PetscFunctionBegin;
26: if (!Y->n) PetscFunctionReturn(PETSC_SUCCESS);
27: PetscCall(MatDenseCUDAGetArrayRead(x->A,&d_px));
28: if (beta==(PetscScalar)0.0) PetscCall(MatDenseCUDAGetArrayWrite(y->A,&d_py));
29: else PetscCall(MatDenseCUDAGetArray(y->A,&d_py));
30: d_A = d_px+(X->nc+X->l)*X->ld;
31: d_C = d_py+(Y->nc+Y->l)*Y->ld;
32: if (Q) {
33: PetscCall(MatDenseGetLDA(Q,&ldq));
34: PetscCall(BV_MatDenseCUDAGetArrayRead(Y,Q,&d_q));
35: d_B = d_q+Y->l*ldq+X->l;
36: PetscCall(BVMult_BLAS_CUDA(Y,Y->n,Y->k-Y->l,X->k-X->l,alpha,d_A,X->ld,d_B,ldq,beta,d_C,Y->ld));
37: PetscCall(BV_MatDenseCUDARestoreArrayRead(Y,Q,&d_q));
38: } else PetscCall(BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,X->ld,beta,d_C,Y->ld));
39: PetscCall(MatDenseCUDARestoreArrayRead(x->A,&d_px));
40: if (beta==(PetscScalar)0.0) PetscCall(MatDenseCUDARestoreArrayWrite(y->A,&d_py));
41: else PetscCall(MatDenseCUDARestoreArray(y->A,&d_py));
42: PetscFunctionReturn(PETSC_SUCCESS);
43: }
45: PetscErrorCode BVMultVec_Mat_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
46: {
47: BV_MAT *x = (BV_MAT*)X->data;
48: PetscScalar *d_py,*d_q;
49: const PetscScalar *d_px;
51: PetscFunctionBegin;
52: PetscCall(MatDenseCUDAGetArrayRead(x->A,&d_px));
53: if (beta==(PetscScalar)0.0) PetscCall(VecCUDAGetArrayWrite(y,&d_py));
54: else PetscCall(VecCUDAGetArray(y,&d_py));
55: if (!q) PetscCall(VecCUDAGetArray(X->buffer,&d_q));
56: else {
57: PetscInt k=X->k-X->l;
58: PetscCallCUDA(cudaMalloc((void**)&d_q,k*sizeof(PetscScalar)));
59: PetscCallCUDA(cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice));
60: PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar)));
61: }
62: PetscCall(BVMultVec_BLAS_CUDA(X,X->n,X->k-X->l,alpha,d_px+(X->nc+X->l)*X->ld,X->ld,d_q,beta,d_py));
63: PetscCall(MatDenseCUDARestoreArrayRead(x->A,&d_px));
64: if (beta==(PetscScalar)0.0) PetscCall(VecCUDARestoreArrayWrite(y,&d_py));
65: else PetscCall(VecCUDARestoreArray(y,&d_py));
66: if (!q) PetscCall(VecCUDARestoreArray(X->buffer,&d_q));
67: else PetscCallCUDA(cudaFree(d_q));
68: PetscFunctionReturn(PETSC_SUCCESS);
69: }
71: PetscErrorCode BVMultInPlace_Mat_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
72: {
73: BV_MAT *ctx = (BV_MAT*)V->data;
74: PetscScalar *d_pv;
75: const PetscScalar *d_q;
76: PetscInt ldq;
78: PetscFunctionBegin;
79: if (s>=e || !V->n) PetscFunctionReturn(PETSC_SUCCESS);
80: PetscCall(MatDenseGetLDA(Q,&ldq));
81: PetscCall(MatDenseCUDAGetArray(ctx->A,&d_pv));
82: PetscCall(BV_MatDenseCUDAGetArrayRead(V,Q,&d_q));
83: PetscCall(BVMultInPlace_BLAS_CUDA(V,V->n,V->k-V->l,s-V->l,e-V->l,d_pv+(V->nc+V->l)*V->ld,V->ld,d_q+V->l*ldq+V->l,ldq,PETSC_FALSE));
84: PetscCall(BV_MatDenseCUDARestoreArrayRead(V,Q,&d_q));
85: PetscCall(MatDenseCUDARestoreArray(ctx->A,&d_pv));
86: PetscFunctionReturn(PETSC_SUCCESS);
87: }
89: PetscErrorCode BVMultInPlaceHermitianTranspose_Mat_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
90: {
91: BV_MAT *ctx = (BV_MAT*)V->data;
92: PetscScalar *d_pv;
93: const PetscScalar *d_q;
94: PetscInt ldq;
96: PetscFunctionBegin;
97: if (s>=e || !V->n) PetscFunctionReturn(PETSC_SUCCESS);
98: PetscCall(MatDenseGetLDA(Q,&ldq));
99: PetscCall(MatDenseCUDAGetArray(ctx->A,&d_pv));
100: PetscCall(BV_MatDenseCUDAGetArrayRead(V,Q,&d_q));
101: PetscCall(BVMultInPlace_BLAS_CUDA(V,V->n,V->k-V->l,s-V->l,e-V->l,d_pv+(V->nc+V->l)*V->ld,V->ld,d_q+V->l*ldq+V->l,ldq,PETSC_TRUE));
102: PetscCall(BV_MatDenseCUDARestoreArrayRead(V,Q,&d_q));
103: PetscCall(MatDenseCUDARestoreArray(ctx->A,&d_pv));
104: PetscFunctionReturn(PETSC_SUCCESS);
105: }
107: PetscErrorCode BVDot_Mat_CUDA(BV X,BV Y,Mat M)
108: {
109: BV_MAT *x = (BV_MAT*)X->data,*y = (BV_MAT*)Y->data;
110: const PetscScalar *d_px,*d_py;
111: PetscScalar *pm;
112: PetscInt ldm;
114: PetscFunctionBegin;
115: PetscCall(MatDenseGetLDA(M,&ldm));
116: PetscCall(MatDenseCUDAGetArrayRead(x->A,&d_px));
117: PetscCall(MatDenseCUDAGetArrayRead(y->A,&d_py));
118: PetscCall(MatDenseGetArrayWrite(M,&pm));
119: PetscCall(BVDot_BLAS_CUDA(X,Y->k-Y->l,X->k-X->l,X->n,d_py+(Y->nc+Y->l)*Y->ld,Y->ld,d_px+(X->nc+X->l)*X->ld,X->ld,pm+X->l*ldm+Y->l,ldm,x->mpi));
120: PetscCall(MatDenseRestoreArrayWrite(M,&pm));
121: PetscCall(MatDenseCUDARestoreArrayRead(x->A,&d_px));
122: PetscCall(MatDenseCUDARestoreArrayRead(y->A,&d_py));
123: PetscFunctionReturn(PETSC_SUCCESS);
124: }
126: PetscErrorCode BVDotVec_Mat_CUDA(BV X,Vec y,PetscScalar *q)
127: {
128: BV_MAT *x = (BV_MAT*)X->data;
129: const PetscScalar *d_px,*d_py;
130: Vec z = y;
132: PetscFunctionBegin;
133: if (PetscUnlikely(X->matrix)) {
134: PetscCall(BV_IPMatMult(X,y));
135: z = X->Bx;
136: }
137: PetscCall(MatDenseCUDAGetArrayRead(x->A,&d_px));
138: PetscCall(VecCUDAGetArrayRead(z,&d_py));
139: PetscCall(BVDotVec_BLAS_CUDA(X,X->n,X->k-X->l,d_px+(X->nc+X->l)*X->ld,X->ld,d_py,q,x->mpi));
140: PetscCall(VecCUDARestoreArrayRead(z,&d_py));
141: PetscCall(MatDenseCUDARestoreArrayRead(x->A,&d_px));
142: PetscFunctionReturn(PETSC_SUCCESS);
143: }
145: PetscErrorCode BVDotVec_Local_Mat_CUDA(BV X,Vec y,PetscScalar *m)
146: {
147: BV_MAT *x = (BV_MAT*)X->data;
148: const PetscScalar *d_px,*d_py;
149: Vec z = y;
151: PetscFunctionBegin;
152: if (PetscUnlikely(X->matrix)) {
153: PetscCall(BV_IPMatMult(X,y));
154: z = X->Bx;
155: }
156: PetscCall(MatDenseCUDAGetArrayRead(x->A,&d_px));
157: PetscCall(VecCUDAGetArrayRead(z,&d_py));
158: PetscCall(BVDotVec_BLAS_CUDA(X,X->n,X->k-X->l,d_px+(X->nc+X->l)*X->ld,X->ld,d_py,m,PETSC_FALSE));
159: PetscCall(VecCUDARestoreArrayRead(z,&d_py));
160: PetscCall(MatDenseCUDARestoreArrayRead(x->A,&d_px));
161: PetscFunctionReturn(PETSC_SUCCESS);
162: }
164: PetscErrorCode BVScale_Mat_CUDA(BV bv,PetscInt j,PetscScalar alpha)
165: {
166: BV_MAT *ctx = (BV_MAT*)bv->data;
167: PetscScalar *d_array,*d_A;
168: PetscInt n=0;
170: PetscFunctionBegin;
171: if (!bv->n) PetscFunctionReturn(PETSC_SUCCESS);
172: PetscCall(MatDenseCUDAGetArray(ctx->A,&d_array));
173: if (PetscUnlikely(j<0)) {
174: d_A = d_array+(bv->nc+bv->l)*bv->ld;
175: n = (bv->k-bv->l)*bv->ld;
176: } else {
177: d_A = d_array+(bv->nc+j)*bv->ld;
178: n = bv->n;
179: }
180: PetscCall(BVScale_BLAS_CUDA(bv,n,d_A,alpha));
181: PetscCall(MatDenseCUDARestoreArray(ctx->A,&d_array));
182: PetscFunctionReturn(PETSC_SUCCESS);
183: }
185: PetscErrorCode BVMatMult_Mat_CUDA(BV V,Mat A,BV W)
186: {
187: BV_MAT *v = (BV_MAT*)V->data,*w = (BV_MAT*)W->data;
188: Mat Vmat,Wmat;
189: const PetscScalar *d_pv;
190: PetscScalar *d_pw;
191: PetscInt j;
193: PetscFunctionBegin;
194: if (V->vmm) {
195: PetscCall(BVGetMat(V,&Vmat));
196: PetscCall(BVGetMat(W,&Wmat));
197: PetscCall(MatProductCreateWithMat(A,Vmat,NULL,Wmat));
198: PetscCall(MatProductSetType(Wmat,MATPRODUCT_AB));
199: PetscCall(MatProductSetFromOptions(Wmat));
200: PetscCall(MatProductSymbolic(Wmat));
201: PetscCall(MatProductNumeric(Wmat));
202: PetscCall(MatProductClear(Wmat));
203: PetscCall(BVRestoreMat(V,&Vmat));
204: PetscCall(BVRestoreMat(W,&Wmat));
205: } else {
206: PetscCall(MatDenseCUDAGetArrayRead(v->A,&d_pv));
207: PetscCall(MatDenseCUDAGetArrayWrite(w->A,&d_pw));
208: for (j=0;j<V->k-V->l;j++) {
209: PetscCall(VecCUDAPlaceArray(V->cv[1],(PetscScalar *)d_pv+(V->nc+V->l+j)*V->ld));
210: PetscCall(VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->ld));
211: PetscCall(MatMult(A,V->cv[1],W->cv[1]));
212: PetscCall(VecCUDAResetArray(V->cv[1]));
213: PetscCall(VecCUDAResetArray(W->cv[1]));
214: }
215: PetscCall(MatDenseCUDARestoreArrayRead(v->A,&d_pv));
216: PetscCall(MatDenseCUDARestoreArrayWrite(w->A,&d_pw));
217: }
218: PetscFunctionReturn(PETSC_SUCCESS);
219: }
221: PetscErrorCode BVCopy_Mat_CUDA(BV V,BV W)
222: {
223: BV_MAT *v = (BV_MAT*)V->data,*w = (BV_MAT*)W->data;
224: const PetscScalar *d_pv;
225: PetscScalar *d_pw;
227: PetscFunctionBegin;
228: PetscCall(MatDenseCUDAGetArrayRead(v->A,&d_pv));
229: PetscCall(MatDenseCUDAGetArray(w->A,&d_pw));
230: PetscCallCUDA(cudaMemcpy2D(d_pw+(W->nc+W->l)*W->ld,W->ld*sizeof(PetscScalar),d_pv+(V->nc+V->l)*V->ld,V->ld*sizeof(PetscScalar),V->n*sizeof(PetscScalar),V->k-V->l,cudaMemcpyDeviceToDevice));
231: PetscCall(MatDenseCUDARestoreArrayRead(v->A,&d_pv));
232: PetscCall(MatDenseCUDARestoreArray(w->A,&d_pw));
233: PetscFunctionReturn(PETSC_SUCCESS);
234: }
236: PetscErrorCode BVCopyColumn_Mat_CUDA(BV V,PetscInt j,PetscInt i)
237: {
238: BV_MAT *v = (BV_MAT*)V->data;
239: PetscScalar *d_pv;
241: PetscFunctionBegin;
242: PetscCall(MatDenseCUDAGetArray(v->A,&d_pv));
243: PetscCallCUDA(cudaMemcpy(d_pv+(V->nc+i)*V->ld,d_pv+(V->nc+j)*V->ld,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
244: PetscCall(MatDenseCUDARestoreArray(v->A,&d_pv));
245: PetscFunctionReturn(PETSC_SUCCESS);
246: }
248: PetscErrorCode BVGetColumn_Mat_CUDA(BV bv,PetscInt j,Vec*)
249: {
250: BV_MAT *ctx = (BV_MAT*)bv->data;
251: PetscScalar *d_pv;
252: PetscInt l;
254: PetscFunctionBegin;
255: l = BVAvailableVec;
256: PetscCall(MatDenseCUDAGetArray(ctx->A,&d_pv));
257: PetscCall(VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->ld));
258: PetscFunctionReturn(PETSC_SUCCESS);
259: }
261: PetscErrorCode BVRestoreColumn_Mat_CUDA(BV bv,PetscInt j,Vec*)
262: {
263: BV_MAT *ctx = (BV_MAT*)bv->data;
264: PetscInt l;
266: PetscFunctionBegin;
267: l = (j==bv->ci[0])? 0: 1;
268: PetscCall(VecCUDAResetArray(bv->cv[l]));
269: PetscCall(MatDenseCUDARestoreArray(ctx->A,NULL));
270: PetscFunctionReturn(PETSC_SUCCESS);
271: }
273: PetscErrorCode BVRestoreSplit_Mat_CUDA(BV bv,BV *L,BV *R)
274: {
275: Mat A;
276: const PetscScalar *d_pv;
277: PetscObjectState lstate,rstate;
278: PetscBool change=PETSC_FALSE;
280: PetscFunctionBegin;
281: /* force sync flag to PETSC_CUDA_BOTH */
282: if (L) {
283: PetscCall(PetscObjectStateGet((PetscObject)*L,&lstate));
284: if (lstate != bv->lstate) {
285: A = ((BV_MAT*)bv->L->data)->A;
286: PetscCall(MatDenseCUDAGetArrayRead(A,&d_pv));
287: PetscCall(MatDenseCUDARestoreArrayRead(A,&d_pv));
288: change = PETSC_TRUE;
289: }
290: }
291: if (R) {
292: PetscCall(PetscObjectStateGet((PetscObject)*R,&rstate));
293: if (rstate != bv->rstate) {
294: A = ((BV_MAT*)bv->R->data)->A;
295: PetscCall(MatDenseCUDAGetArrayRead(A,&d_pv));
296: PetscCall(MatDenseCUDARestoreArrayRead(A,&d_pv));
297: change = PETSC_TRUE;
298: }
299: }
300: if (change) {
301: A = ((BV_MAT*)bv->data)->A;
302: PetscCall(MatDenseCUDAGetArray(A,(PetscScalar **)&d_pv));
303: PetscCall(MatDenseCUDARestoreArray(A,(PetscScalar **)&d_pv));
304: }
305: PetscFunctionReturn(PETSC_SUCCESS);
306: }
308: PetscErrorCode BVGetMat_Mat_CUDA(BV bv,Mat *A)
309: {
310: BV_MAT *ctx = (BV_MAT*)bv->data;
311: PetscScalar *vv,*aa;
312: PetscBool create=PETSC_FALSE;
313: PetscInt m,cols;
314: VecType vtype;
316: PetscFunctionBegin;
317: m = bv->k-bv->l;
318: if (!bv->Aget) create=PETSC_TRUE;
319: else {
320: PetscCall(MatDenseCUDAGetArray(bv->Aget,&aa));
321: PetscCheck(!aa,PetscObjectComm((PetscObject)bv),PETSC_ERR_ARG_WRONGSTATE,"BVGetMat already called on this BV");
322: PetscCall(MatGetSize(bv->Aget,NULL,&cols));
323: if (cols!=m) {
324: PetscCall(MatDestroy(&bv->Aget));
325: create=PETSC_TRUE;
326: }
327: }
328: PetscCall(MatDenseCUDAGetArray(ctx->A,&vv));
329: if (create) {
330: PetscCall(VecGetType(bv->t,&vtype));
331: PetscCall(MatCreateDenseFromVecType(PetscObjectComm((PetscObject)bv),vtype,bv->n,PETSC_DECIDE,bv->N,m,bv->ld,vv,&bv->Aget)); /* pass a pointer to avoid allocation of storage */
332: PetscCall(MatDenseCUDAReplaceArray(bv->Aget,NULL)); /* replace with a null pointer, the value after BVRestoreMat */
333: }
334: PetscCall(MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->ld)); /* set the actual pointer */
335: *A = bv->Aget;
336: PetscFunctionReturn(PETSC_SUCCESS);
337: }
339: PetscErrorCode BVRestoreMat_Mat_CUDA(BV bv,Mat *A)
340: {
341: BV_MAT *ctx = (BV_MAT*)bv->data;
342: PetscScalar *vv,*aa;
344: PetscFunctionBegin;
345: PetscCall(MatDenseCUDAGetArray(bv->Aget,&aa));
346: vv = aa-(bv->nc+bv->l)*bv->ld;
347: PetscCall(MatDenseCUDAResetArray(bv->Aget));
348: PetscCall(MatDenseCUDARestoreArray(ctx->A,&vv));
349: *A = NULL;
350: PetscFunctionReturn(PETSC_SUCCESS);
351: }