Actual source code: veccusp.cu
petsc-3.4.2 2013-07-02
1: /*
2: Implements the sequential cusp vectors.
3: */
5: #include <petscconf.h>
6: PETSC_CUDA_EXTERN_C_BEGIN
7: #include <petsc-private/vecimpl.h> /*I "petscvec.h" I*/
8: #include <../src/vec/vec/impls/dvecimpl.h>
9: PETSC_CUDA_EXTERN_C_END
10: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>
12: #include <cuda_runtime.h>
16: /*
17: Allocates space for the vector array on the Host if it does not exist.
18: Does NOT change the PetscCUSPFlag for the vector
19: Does NOT zero the CUSP array
20: */
21: PetscErrorCode VecCUSPAllocateCheckHost(Vec v)
22: {
24: PetscScalar *array;
25: Vec_Seq *s;
26: PetscInt n = v->map->n;
29: s = (Vec_Seq*)v->data;
30: VecCUSPAllocateCheck(v);
31: if (s->array == 0) {
32: //#if defined(PETSC_HAVE_TXPETSCGPU)
33: //if (n>0)
34: // cudaMallocHost((void **) &array, n*sizeof(PetscScalar));CHKERRCUSP(ierr);
35: //#else
36: PetscMalloc(n*sizeof(PetscScalar),&array);
37: PetscLogObjectMemory(v,n*sizeof(PetscScalar));
38: s->array = array;
39: s->array_allocated = array;
40: }
41: return(0);
42: }
47: /*
48: Allocates space for the vector array on the GPU if it does not exist.
49: Does NOT change the PetscCUSPFlag for the vector
50: Does NOT zero the CUSP array
52: */
53: PetscErrorCode VecCUSPAllocateCheck(Vec v)
54: {
56: int rank;
59: MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
60: // First allocate memory on the GPU if needed
61: if (!v->spptr) {
62: try {
63: v->spptr = new Vec_CUSP;
64: ((Vec_CUSP*)v->spptr)->GPUarray = new CUSPARRAY;
65: ((Vec_CUSP*)v->spptr)->GPUarray->resize((PetscBLASInt)v->map->n);
67: #if defined(PETSC_HAVE_TXPETSCGPU)
69: ((Vec_CUSP*)v->spptr)->GPUvector = new GPU_Vector<PetscInt, PetscScalar>(((Vec_CUSP*)v->spptr)->GPUarray, rank);
70: ((Vec_CUSP*)v->spptr)->GPUvector->buildStreamsAndEvents();CHKERRCUSP(ierr);
72: Vec_Seq *s;
73: s = (Vec_Seq*)v->data;
74: if (v->map->n>0) {
75: if (s->array==0) {
76: // In this branch, GPUvector owns the ptr and manages the memory
77: ((Vec_CUSP*)v->spptr)->GPUvector->allocateHostMemory();CHKERRCUSP(ierr);
79: s->array = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();
80: s->array_allocated = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();
81: } else {
82:           // In this branch, Petsc owns the ptr to start; however, we want to use
83: // page locked host memory for faster data transfers. So, a new
84: // page-locked buffer is allocated. Then, the old Petsc memory
85:           // is copied into the new buffer. Then the old Petsc memory is freed.
86: // GPUvector owns the new ptr.
87: ((Vec_CUSP*)v->spptr)->GPUvector->allocateHostMemory();CHKERRCUSP(ierr);
88: PetscScalar * temp = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();
90: PetscMemcpy(temp,s->array,v->map->n*sizeof(PetscScalar));
91: PetscFree(s->array);
93: s->array = temp;
94: s->array_allocated = temp;
95: }
96: WaitForGPU();CHKERRCUSP(ierr);
97: }
98: v->ops->destroy = VecDestroy_SeqCUSP;
99: #endif
100: } catch(char *ex) {
101: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
102: }
103: }
104: return(0);
105: }
110: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
111: PetscErrorCode VecCUSPCopyToGPU(Vec v)
112: {
116: VecCUSPAllocateCheck(v);
117: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
118: PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
119: try {
120: #if defined(PETSC_HAVE_TXPETSCGPU)
121: ((Vec_CUSP*)v->spptr)->GPUvector->copyToGPUAll();CHKERRCUSP(ierr);
122: #else
123: CUSPARRAY *varray;
124: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
125: varray->assign(*(PetscScalar**)v->data,*(PetscScalar**)v->data + v->map->n);
126: WaitForGPU();CHKERRCUSP(ierr);
127: #endif
129: } catch(char *ex) {
130: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
131: }
132: PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
133: v->valid_GPU_array = PETSC_CUSP_BOTH;
134: }
135: return(0);
136: }
140: static PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
141: {
143: CUSPARRAY *varray;
146: VecCUSPAllocateCheck(v);
147: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
148: PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
149: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
150: #if defined(PETSC_HAVE_TXPETSCGPU)
151: ((Vec_CUSP*)v->spptr)->GPUvector->copyToGPUSome(varray, ci->recvIndices);CHKERRCUSP(ierr);
152: #else
153: Vec_Seq *s;
154: s = (Vec_Seq*)v->data;
156: CUSPINTARRAYCPU *indicesCPU=&ci->recvIndicesCPU;
157: CUSPINTARRAYGPU *indicesGPU=&ci->recvIndicesGPU;
159: thrust::copy(thrust::make_permutation_iterator(s->array,indicesCPU->begin()),
160: thrust::make_permutation_iterator(s->array,indicesCPU->end()),
161: thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()));
162: #endif
163: // Set the buffer states
164: v->valid_GPU_array = PETSC_CUSP_BOTH;
165: PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
166: }
167: return(0);
168: }
173: /*
174: VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
175: */
176: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
177: {
181: VecCUSPAllocateCheckHost(v);
182: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
183: PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
184: try {
185: #if defined(PETSC_HAVE_TXPETSCGPU)
186: ((Vec_CUSP*)v->spptr)->GPUvector->copyFromGPUAll();CHKERRCUSP(ierr);
187: #else
188: CUSPARRAY *varray;
189: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
190: thrust::copy(varray->begin(),varray->end(),*(PetscScalar**)v->data);
191: WaitForGPU();CHKERRCUSP(ierr);
192: #endif
193: } catch(char *ex) {
194: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
195: }
196: PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
197: v->valid_GPU_array = PETSC_CUSP_BOTH;
198: }
199: return(0);
200: }
204: /* Note that this function only copies *some* of the values up from the GPU to CPU,
205:     which means that we need to recombine the data at some point before using any of the standard functions.
206:     We could add another few flag types to keep track of this, or treat this like VecGetArray()/VecRestoreArray(),
207:     where you have to always call in pairs.
208: */
209: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
210: {
211: CUSPARRAY *varray;
215: VecCUSPAllocateCheck(v);
216: VecCUSPAllocateCheckHost(v);
217: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
218: PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);
219: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
220: #if defined(PETSC_HAVE_TXPETSCGPU)
221: ((Vec_CUSP*)v->spptr)->GPUvector->copyFromGPUSome(varray, ci->sendIndices);CHKERRCUSP(ierr);
222: #else
223: Vec_Seq *s;
224: s = (Vec_Seq*)v->data;
225: CUSPINTARRAYCPU *indicesCPU=&ci->sendIndicesCPU;
226: CUSPINTARRAYGPU *indicesGPU=&ci->sendIndicesGPU;
228: thrust::copy(thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()),
229: thrust::make_permutation_iterator(varray->begin(),indicesGPU->end()),
230: thrust::make_permutation_iterator(s->array,indicesCPU->begin()));
231: #endif
232: VecCUSPRestoreArrayRead(v,&varray);
233: PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
234: v->valid_GPU_array = PETSC_CUSP_BOTH;
235: }
236: return(0);
237: }
242: static PetscErrorCode VecCopy_SeqCUSP_Private(Vec xin,Vec yin)
243: {
244: PetscScalar *ya;
245: const PetscScalar *xa;
246: PetscErrorCode ierr;
249: if (xin != yin) {
250: VecGetArrayRead(xin,&xa);
251: VecGetArray(yin,&ya);
252: PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
253: VecRestoreArrayRead(xin,&xa);
254: VecRestoreArray(yin,&ya);
255: }
256: return(0);
257: }
261: static PetscErrorCode VecSetRandom_SeqCUSP_Private(Vec xin,PetscRandom r)
262: {
264: PetscInt n = xin->map->n,i;
265: PetscScalar *xx;
268: VecGetArray(xin,&xx);
269: for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
270: VecRestoreArray(xin,&xx);
271: return(0);
272: }
276: static PetscErrorCode VecDestroy_SeqCUSP_Private(Vec v)
277: {
278: Vec_Seq *vs = (Vec_Seq*)v->data;
282: PetscObjectAMSViewOff(v);
283: #if defined(PETSC_USE_LOG)
284: PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
285: #endif
286: if (vs->array_allocated) PetscFree(vs->array_allocated);
287: PetscFree(vs);
288: return(0);
289: }
293: static PetscErrorCode VecResetArray_SeqCUSP_Private(Vec vin)
294: {
295: Vec_Seq *v = (Vec_Seq*)vin->data;
298: v->array = v->unplacedarray;
299: v->unplacedarray = 0;
300: return(0);
301: }
303: /* the following three public versions are necessary because we use CUSP in the regular PETSc code and they need to be called from plain C code. */
306: PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
307: {
311: VecCUSPAllocateCheck(v);
312: return(0);
313: }
317: PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
318: {
322: VecCUSPCopyToGPU(v);
323: return(0);
324: }
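/*
   A minimal usage sketch: because the wrappers above are callable from plain C, ordinary PETSc
   code can force a host-to-device transfer without including any CUSP/Thrust headers. Assuming
   v is a VECSEQCUSP vector and ierr is a PetscErrorCode:

     ierr = VecCUSPAllocateCheck_Public(v);CHKERRQ(ierr);
     ierr = VecCUSPCopyToGPU_Public(v);CHKERRQ(ierr);

   afterwards v->valid_GPU_array is PETSC_CUSP_BOTH (see VecCUSPCopyToGPU() above).
*/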
328: /*
329: PetscCUSPIndicesCreate - creates the data structure needed by VecCUSPCopyToGPUSome_Public()
331: Input Parameters:
332:    + ns,nr - the numbers of send and receive indices
333:    - sendIndices,recvIndices - integer lists of the send and receive indices
335: Output Parameter:
336: . ci - the CUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()
338: .seealso: PetscCUSPIndicesDestroy(), VecCUSPCopyToGPUSome_Public()
339: */
340: PetscErrorCode PetscCUSPIndicesCreate(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
341: {
342: PetscCUSPIndices cci;
345: cci = new struct _p_PetscCUSPIndices;
346: #if defined(PETSC_HAVE_TXPETSCGPU)
347: cci->sendIndices = new GPU_Indices<PetscInt, PetscScalar>();
348: cci->sendIndices->buildIndices(sendIndices, ns);
349: cci->recvIndices = new GPU_Indices<PetscInt, PetscScalar>();
350: cci->recvIndices->buildIndices(recvIndices, nr);
351: #else
352: cci->sendIndicesCPU.assign(sendIndices,sendIndices+ns);
353: cci->sendIndicesGPU.assign(sendIndices,sendIndices+ns);
355: cci->recvIndicesCPU.assign(recvIndices,recvIndices+nr);
356: cci->recvIndicesGPU.assign(recvIndices,recvIndices+nr);
357: #endif
358: *ci = cci;
359: return(0);
360: }
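/*
   A minimal usage sketch, assuming ns/sendidx and nr/recvidx describe the exchange and v is a
   VECSEQCUSP vector: pair PetscCUSPIndicesCreate() with the partial copies defined below and a
   matching destroy:

     PetscCUSPIndices ci;
     ierr = PetscCUSPIndicesCreate(ns,sendidx,nr,recvidx,&ci);CHKERRQ(ierr);
     ierr = VecCUSPCopyToGPUSome_Public(v,ci);CHKERRQ(ierr);
     ierr = VecCUSPCopyFromGPUSome_Public(v,ci);CHKERRQ(ierr);
     ierr = PetscCUSPIndicesDestroy(&ci);CHKERRQ(ierr);
*/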
364: /*
365: PetscCUSPIndicesDestroy - destroys the data structure needed by VecCUSPCopyToGPUSome_Public()
367: Input Parameters:
368: . ci - the CUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()
370: .seealso: PetscCUSPIndicesCreate(), VecCUSPCopyToGPUSome_Public()
371: */
372: PetscErrorCode PetscCUSPIndicesDestroy(PetscCUSPIndices *ci)
373: {
375: if (!(*ci)) return(0);
376: try {
377: #if defined(PETSC_HAVE_TXPETSCGPU)
378: if ((*ci)->sendIndices) delete (*ci)->sendIndices;
379: if ((*ci)->recvIndices) delete (*ci)->recvIndices;
380: #endif
381: if (ci) delete *ci;
382: } catch(char *ex) {
383: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
384: }
385: *ci = 0;
386: return(0);
387: }
389: #if defined(PETSC_HAVE_TXPETSCGPU)
392: /*
393:     VecCUSPResetIndexBuffersFlagsGPU_Public - resets indexing flags; only called in VecScatterFinalizeForGPU()
394: */
395: PetscErrorCode VecCUSPResetIndexBuffersFlagsGPU_Public(PetscCUSPIndices ci)
396: {
398: if (ci->sendIndices) ci->sendIndices->resetStatusFlag();
399: if (ci->recvIndices) ci->recvIndices->resetStatusFlag();
400: return(0);
401: }
402: #endif
407: /*
408: VecCUSPCopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector
410: Input Parameters:
411: + v - the vector
412: -  ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
414: */
415: PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
416: {
420: VecCUSPCopyToGPUSome(v,ci);
421: return(0);
422: }
426: /*
427: VecCUSPCopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector
429: Input Parameters:
430: + v - the vector
431: -  ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
432: */
433: PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
434: {
438: VecCUSPCopyFromGPUSome(v,ci);
439: return(0);
440: }
442: #if defined(PETSC_HAVE_TXPETSCGPU)
445: /* Note that this function only moves *some* of the data from a GPU vector to a contiguous buffer on the GPU.
446:    Afterwards, this buffer can be transferred to the host efficiently with asynchronous memory copies,
447:    which means that we need to recombine the data at some point before using any of the standard functions.
448:    We could add another few flag types to keep track of this, or treat this like VecGetArray()/VecRestoreArray(),
449:    where you have to always call in pairs.
450: */
451: PetscErrorCode VecCUSPCopySomeToContiguousBufferGPU(Vec v, PetscCUSPIndices ci)
452: {
453: CUSPARRAY *varray;
457: VecCUSPAllocateCheck(v);
458: if (v->valid_GPU_array == PETSC_CUSP_GPU || v->valid_GPU_array == PETSC_CUSP_BOTH) {
459: VecCUSPGetArrayRead(v,&varray);
460: ((Vec_CUSP*)v->spptr)->GPUvector->copySomeToContiguousBuffer(varray, ci->sendIndices);CHKERRCUSP(ierr);
461: VecCUSPRestoreArrayRead(v,&varray);
462: }
463: return(0);
464: }
470: /*
471: VecCUSPCopySomeToContiguousBufferGPU_Public - Copies certain entries to a contiguous buffer on the GPU from the GPU of a vector
473: Input Parameters:
474: + v - the vector
475: -  ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
476: */
477: PetscErrorCode VecCUSPCopySomeToContiguousBufferGPU_Public(Vec v, PetscCUSPIndices ci)
478: {
482: VecCUSPCopySomeToContiguousBufferGPU(v,ci);
483: return(0);
484: }
486: /* Note that this function only moves *some* of the data from a contiguous buffer on the GPU to arbitrary locations
487:    in a GPU vector. This function will typically be called after an asynchronous memory transfer from the host to the device,
488:    which means that we need to recombine the data at some point before using any of the standard functions.
489:    We could add another few flag types to keep track of this, or treat this like VecGetArray()/VecRestoreArray(),
490:    where you have to always call in pairs.
491: */
492: PetscErrorCode VecCUSPCopySomeFromContiguousBufferGPU(Vec v, PetscCUSPIndices ci)
493: {
494: CUSPARRAY *varray;
498: VecCUSPAllocateCheck(v);
499: if (v->valid_GPU_array == PETSC_CUSP_CPU || v->valid_GPU_array == PETSC_CUSP_BOTH) {
500: VecCUSPGetArrayRead(v,&varray);
501: ((Vec_CUSP*)v->spptr)->GPUvector->copySomeFromContiguousBuffer(varray, ci->recvIndices);CHKERRCUSP(ierr);
502: VecCUSPRestoreArrayRead(v,&varray);
503: }
504: return(0);
505: }
509: /*
510:     VecCUSPCopySomeFromContiguousBufferGPU_Public - Copies certain entries from a contiguous buffer on the GPU into arbitrary locations of a GPU vector
512: Input Parameters:
513: + v - the vector
514: -  ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
515: */
516: PetscErrorCode VecCUSPCopySomeFromContiguousBufferGPU_Public(Vec v, PetscCUSPIndices ci)
517: {
521: VecCUSPCopySomeFromContiguousBufferGPU(v,ci);
522: return(0);
523: }
525: #endif
528: /*MC
529: VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP
531: Options Database Keys:
532: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()
534: Level: beginner
536: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
537: M*/
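/*
   A minimal usage sketch: a VECSEQCUSP vector can be selected either explicitly with VecSetType()
   or from the options database:

     ierr = VecCreate(PETSC_COMM_SELF,&x);CHKERRQ(ierr);
     ierr = VecSetSizes(x,n,n);CHKERRQ(ierr);
     ierr = VecSetType(x,VECSEQCUSP);CHKERRQ(ierr);

   or, equivalently, call VecSetFromOptions() and run with -vec_type seqcusp.
*/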
539: /* for VecAYPX_SeqCUSP*/
540: namespace cusp
541: {
542: namespace blas
543: {
544: namespace detail
545: {
546: template <typename T>
547: struct AYPX : public thrust::binary_function<T,T,T>
548: {
549: T alpha;
551: AYPX(T _alpha) : alpha(_alpha) {}
553: __host__ __device__
554: T operator()(T x, T y)
555: {
556: return alpha * y + x;
557: }
558: };
559: }
561: template <typename ForwardIterator1,
562: typename ForwardIterator2,
563: typename ScalarType>
564: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
565: {
566: thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
567: }
568: template <typename Array1, typename Array2, typename ScalarType>
569: void aypx(const Array1& x, Array2& y, ScalarType alpha)
570: {
571: detail::assert_same_dimensions(x,y);
572: aypx(x.begin(),x.end(),y.begin(),alpha);
573: }
574: }
575: }
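/*
   The namespace above extends cusp::blas with an AYPX operation, y <- alpha*y + x, by applying the
   binary functor through thrust::transform. A minimal sketch of how it is invoked (this is what
   VecAYPX_SeqCUSP below does with the vectors' CUSPARRAY handles):

     cusp::blas::aypx(*xarray,*yarray,alpha);

   which overwrites each yarray[i] with alpha*yarray[i] + xarray[i].
*/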
579: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
580: {
581: CUSPARRAY *xarray,*yarray;
585: if (alpha != 0.0) {
586: VecCUSPGetArrayRead(xin,&xarray);
587: VecCUSPGetArrayReadWrite(yin,&yarray);
588: try {
589: cusp::blas::aypx(*xarray,*yarray,alpha);
590: WaitForGPU();CHKERRCUSP(ierr);
591: } catch(char *ex) {
592: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
593: }
594: VecCUSPRestoreArrayRead(xin,&xarray);
595: VecCUSPRestoreArrayReadWrite(yin,&yarray);
596: PetscLogFlops(2.0*yin->map->n);
597: }
598: return(0);
599: }
604: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
605: {
606: CUSPARRAY *xarray,*yarray;
610: if (alpha != 0.0) {
611: VecCUSPGetArrayRead(xin,&xarray);
612: VecCUSPGetArrayReadWrite(yin,&yarray);
613: try {
614: cusp::blas::axpy(*xarray,*yarray,alpha);
615: WaitForGPU();CHKERRCUSP(ierr);
616: } catch(char *ex) {
617: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
618: }
619: VecCUSPRestoreArrayRead(xin,&xarray);
620: VecCUSPRestoreArrayReadWrite(yin,&yarray);
621: PetscLogFlops(2.0*yin->map->n);
622: }
623: return(0);
624: }
626: struct VecCUSPPointwiseDivide
627: {
628: template <typename Tuple>
629: __host__ __device__
630: void operator()(Tuple t)
631: {
632: thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
633: }
634: };
638: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
639: {
640: CUSPARRAY *warray=NULL,*xarray=NULL,*yarray=NULL;
644: VecCUSPGetArrayRead(xin,&xarray);
645: VecCUSPGetArrayRead(yin,&yarray);
646: VecCUSPGetArrayWrite(win,&warray);
647: try {
648: thrust::for_each(
649: thrust::make_zip_iterator(
650: thrust::make_tuple(
651: warray->begin(),
652: xarray->begin(),
653: yarray->begin())),
654: thrust::make_zip_iterator(
655: thrust::make_tuple(
656: warray->end(),
657: xarray->end(),
658: yarray->end())),
659: VecCUSPPointwiseDivide());
660: WaitForGPU();CHKERRCUSP(ierr);
661: } catch(char *ex) {
662: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
663: }
664: PetscLogFlops(win->map->n);
665: VecCUSPRestoreArrayRead(xin,&xarray);
666: VecCUSPRestoreArrayRead(yin,&yarray);
667: VecCUSPRestoreArrayWrite(win,&warray);
668: return(0);
669: }
672: struct VecCUSPWAXPY
673: {
674: template <typename Tuple>
675: __host__ __device__
676: void operator()(Tuple t)
677: {
678: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
679: }
680: };
682: struct VecCUSPSum
683: {
684: template <typename Tuple>
685: __host__ __device__
686: void operator()(Tuple t)
687: {
688: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
689: }
690: };
692: struct VecCUSPDiff
693: {
694: template <typename Tuple>
695: __host__ __device__
696: void operator()(Tuple t)
697: {
698: thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
699: }
700: };
704: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
705: {
706: CUSPARRAY *xarray=NULL,*yarray=NULL,*warray=NULL;
710: if (alpha == 0.0) {
711: VecCopy_SeqCUSP(yin,win);
712: } else {
713: VecCUSPGetArrayRead(xin,&xarray);
714: VecCUSPGetArrayRead(yin,&yarray);
715: VecCUSPGetArrayWrite(win,&warray);
716: if (alpha == 1.0) {
717: try {
718: thrust::for_each(
719: thrust::make_zip_iterator(
720: thrust::make_tuple(
721: warray->begin(),
722: yarray->begin(),
723: xarray->begin())),
724: thrust::make_zip_iterator(
725: thrust::make_tuple(
726: warray->end(),
727: yarray->end(),
728: xarray->end())),
729: VecCUSPSum());
730: } catch(char *ex) {
731: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
732: }
733: PetscLogFlops(win->map->n);
734: } else if (alpha == -1.0) {
735: try {
736: thrust::for_each(
737: thrust::make_zip_iterator(
738: thrust::make_tuple(
739: warray->begin(),
740: yarray->begin(),
741: xarray->begin())),
742: thrust::make_zip_iterator(
743: thrust::make_tuple(
744: warray->end(),
745: yarray->end(),
746: xarray->end())),
747: VecCUSPDiff());
748: } catch(char *ex) {
749: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
750: }
751: PetscLogFlops(win->map->n);
752: } else {
753: try {
754: thrust::for_each(
755: thrust::make_zip_iterator(
756: thrust::make_tuple(
757: warray->begin(),
758: yarray->begin(),
759: thrust::make_constant_iterator(alpha),
760: xarray->begin())),
761: thrust::make_zip_iterator(
762: thrust::make_tuple(
763: warray->end(),
764: yarray->end(),
765: thrust::make_constant_iterator(alpha),
766: xarray->end())),
767: VecCUSPWAXPY());
768: } catch(char *ex) {
769: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
770: }
771: PetscLogFlops(2*win->map->n);
772: }
773: WaitForGPU();CHKERRCUSP(ierr);
774: VecCUSPRestoreArrayRead(xin,&xarray);
775: VecCUSPRestoreArrayRead(yin,&yarray);
776: VecCUSPRestoreArrayWrite(win,&warray);
777: }
778: return(0);
779: }
781: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
782: struct VecCUSPMAXPY4
783: {
784: template <typename Tuple>
785: __host__ __device__
786: void operator()(Tuple t)
787: {
788:     /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
789: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
790: }
791: };
794: struct VecCUSPMAXPY3
795: {
796: template <typename Tuple>
797: __host__ __device__
798: void operator()(Tuple t)
799: {
800: /*y += a1*x1 +a2*x2 + a3*x3 */
801: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
802: }
803: };
805: struct VecCUSPMAXPY2
806: {
807: template <typename Tuple>
808: __host__ __device__
809: void operator()(Tuple t)
810: {
811: /*y += a1*x1 +a2*x2*/
812: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
813: }
814: };
817: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
818: {
820: CUSPARRAY *xarray,*yy0,*yy1,*yy2,*yy3;
821: PetscInt n = xin->map->n,j,j_rem;
822: PetscScalar alpha0,alpha1,alpha2,alpha3;
825: PetscLogFlops(nv*2.0*n);
826: VecCUSPGetArrayReadWrite(xin,&xarray);
827: switch (j_rem=nv&0x3) {
828: case 3:
829: alpha0 = alpha[0];
830: alpha1 = alpha[1];
831: alpha2 = alpha[2];
832: alpha += 3;
833: VecCUSPGetArrayRead(y[0],&yy0);
834: VecCUSPGetArrayRead(y[1],&yy1);
835: VecCUSPGetArrayRead(y[2],&yy2);
836: try {
837: thrust::for_each(
838: thrust::make_zip_iterator(
839: thrust::make_tuple(
840: xarray->begin(),
841: thrust::make_constant_iterator(alpha0),
842: yy0->begin(),
843: thrust::make_constant_iterator(alpha1),
844: yy1->begin(),
845: thrust::make_constant_iterator(alpha2),
846: yy2->begin())),
847: thrust::make_zip_iterator(
848: thrust::make_tuple(
849: xarray->end(),
850: thrust::make_constant_iterator(alpha0),
851: yy0->end(),
852: thrust::make_constant_iterator(alpha1),
853: yy1->end(),
854: thrust::make_constant_iterator(alpha2),
855: yy2->end())),
856: VecCUSPMAXPY3());
857: } catch(char *ex) {
858: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
859: }
860: VecCUSPRestoreArrayRead(y[0],&yy0);
861: VecCUSPRestoreArrayRead(y[1],&yy1);
862: VecCUSPRestoreArrayRead(y[2],&yy2);
863: y += 3;
864: break;
865: case 2:
866: alpha0 = alpha[0];
867: alpha1 = alpha[1];
868: alpha +=2;
869: VecCUSPGetArrayRead(y[0],&yy0);
870: VecCUSPGetArrayRead(y[1],&yy1);
871: try {
872: thrust::for_each(
873: thrust::make_zip_iterator(
874: thrust::make_tuple(
875: xarray->begin(),
876: thrust::make_constant_iterator(alpha0),
877: yy0->begin(),
878: thrust::make_constant_iterator(alpha1),
879: yy1->begin())),
880: thrust::make_zip_iterator(
881: thrust::make_tuple(
882: xarray->end(),
883: thrust::make_constant_iterator(alpha0),
884: yy0->end(),
885: thrust::make_constant_iterator(alpha1),
886: yy1->end())),
887: VecCUSPMAXPY2());
888: } catch(char *ex) {
889: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
890: }
891: y +=2;
892: break;
893: case 1:
894: alpha0 = *alpha++;
895: VecAXPY_SeqCUSP(xin,alpha0,y[0]);
896: y +=1;
897: break;
898: }
899: for (j=j_rem; j<nv; j+=4) {
900: alpha0 = alpha[0];
901: alpha1 = alpha[1];
902: alpha2 = alpha[2];
903: alpha3 = alpha[3];
904: alpha += 4;
905: VecCUSPGetArrayRead(y[0],&yy0);
906: VecCUSPGetArrayRead(y[1],&yy1);
907: VecCUSPGetArrayRead(y[2],&yy2);
908: VecCUSPGetArrayRead(y[3],&yy3);
909: try {
910: thrust::for_each(
911: thrust::make_zip_iterator(
912: thrust::make_tuple(
913: xarray->begin(),
914: thrust::make_constant_iterator(alpha0),
915: yy0->begin(),
916: thrust::make_constant_iterator(alpha1),
917: yy1->begin(),
918: thrust::make_constant_iterator(alpha2),
919: yy2->begin(),
920: thrust::make_constant_iterator(alpha3),
921: yy3->begin())),
922: thrust::make_zip_iterator(
923: thrust::make_tuple(
924: xarray->end(),
925: thrust::make_constant_iterator(alpha0),
926: yy0->end(),
927: thrust::make_constant_iterator(alpha1),
928: yy1->end(),
929: thrust::make_constant_iterator(alpha2),
930: yy2->end(),
931: thrust::make_constant_iterator(alpha3),
932: yy3->end())),
933: VecCUSPMAXPY4());
934: } catch(char *ex) {
935: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
936: }
937: VecCUSPRestoreArrayRead(y[0],&yy0);
938: VecCUSPRestoreArrayRead(y[1],&yy1);
939: VecCUSPRestoreArrayRead(y[2],&yy2);
940: VecCUSPRestoreArrayRead(y[3],&yy3);
941: y += 4;
942: }
943: VecCUSPRestoreArrayReadWrite(xin,&xarray);
944: WaitForGPU();CHKERRCUSP(ierr);
945: return(0);
946: }
951: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
952: {
953: CUSPARRAY *xarray,*yarray;
955: // PetscScalar *xptr,*yptr,*zgpu;
956: //PetscReal tmp;
959: //VecNorm_SeqCUSP(xin, NORM_2, &tmp);
960: //VecNorm_SeqCUSP(yin, NORM_2, &tmp);
961: VecCUSPGetArrayRead(xin,&xarray);
962: VecCUSPGetArrayRead(yin,&yarray);
963: try {
964: #if defined(PETSC_USE_COMPLEX)
965: *z = cusp::blas::dotc(*yarray,*xarray);
966: #else
967: *z = cusp::blas::dot(*yarray,*xarray);
968: #endif
969: } catch(char *ex) {
970: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
971: }
972: WaitForGPU();CHKERRCUSP(ierr);
973: if (xin->map->n >0) {
974: PetscLogFlops(2.0*xin->map->n-1);
975: }
976: VecCUSPRestoreArrayRead(xin,&xarray);
977: VecCUSPRestoreArrayRead(yin,&yarray);
978: //printf("VecDot_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*z),PetscImaginaryPart(*z));
979: return(0);
980: }
982: //
983: // CUDA kernels for MDot to follow
984: //
986: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
987: #define MDOT_WORKGROUP_SIZE 128
988: #define MDOT_WORKGROUP_NUM 128
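/*
   The kernels below implement a two-stage reduction: each of the MDOT_WORKGROUP_NUM thread blocks
   writes one partial sum per dot product into group_results[j*gridDim.x + blockIdx.x], and the host
   finishes dot product j by summing its MDOT_WORKGROUP_NUM partial results, as VecMDot_SeqCUSP()
   does further down:

     z[j] = 0;
     for (i = j*MDOT_WORKGROUP_NUM; i < (j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
*/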
990: // M = 2:
991: __global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
992: PetscInt size, PetscScalar *group_results)
993: {
994: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
995: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
996: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
997: PetscInt vec_start_index = blockIdx.x * entries_per_group;
998: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
1000: PetscScalar entry_x = 0;
1001: PetscScalar group_sum0 = 0;
1002: PetscScalar group_sum1 = 0;
1003: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1004: entry_x = x[i]; // load only once from global memory!
1005: group_sum0 += entry_x * y0[i];
1006: group_sum1 += entry_x * y1[i];
1007: }
1008: tmp_buffer[threadIdx.x] = group_sum0;
1009: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
1011: // parallel reduction
1012: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
1013: __syncthreads();
1014: if (threadIdx.x < stride) {
1015: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1016: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
1017: }
1018: }
1020: // write result of group to group_results
1021: if (threadIdx.x == 0) {
1022: group_results[blockIdx.x] = tmp_buffer[0];
1023: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
1024: }
1025: }
1027: // M = 3:
1028: __global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
1029: PetscInt size, PetscScalar *group_results)
1030: {
1031: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
1032: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
1033: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
1034: PetscInt vec_start_index = blockIdx.x * entries_per_group;
1035: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
1037: PetscScalar entry_x = 0;
1038: PetscScalar group_sum0 = 0;
1039: PetscScalar group_sum1 = 0;
1040: PetscScalar group_sum2 = 0;
1041: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1042: entry_x = x[i]; // load only once from global memory!
1043: group_sum0 += entry_x * y0[i];
1044: group_sum1 += entry_x * y1[i];
1045: group_sum2 += entry_x * y2[i];
1046: }
1047: tmp_buffer[threadIdx.x] = group_sum0;
1048: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
1049: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
1051: // parallel reduction
1052: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
1053: __syncthreads();
1054: if (threadIdx.x < stride) {
1055: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1056: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
1057: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
1058: }
1059: }
1061: // write result of group to group_results
1062: if (threadIdx.x == 0) {
1063: group_results[blockIdx.x ] = tmp_buffer[0];
1064: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
1065: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
1066: }
1067: }
1069: // M = 4:
1070: __global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
1071: PetscInt size, PetscScalar *group_results)
1072: {
1073: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
1074: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
1075: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
1076: PetscInt vec_start_index = blockIdx.x * entries_per_group;
1077: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
1079: PetscScalar entry_x = 0;
1080: PetscScalar group_sum0 = 0;
1081: PetscScalar group_sum1 = 0;
1082: PetscScalar group_sum2 = 0;
1083: PetscScalar group_sum3 = 0;
1084: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1085: entry_x = x[i]; // load only once from global memory!
1086: group_sum0 += entry_x * y0[i];
1087: group_sum1 += entry_x * y1[i];
1088: group_sum2 += entry_x * y2[i];
1089: group_sum3 += entry_x * y3[i];
1090: }
1091: tmp_buffer[threadIdx.x] = group_sum0;
1092: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
1093: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
1094: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
1096: // parallel reduction
1097: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
1098: __syncthreads();
1099: if (threadIdx.x < stride) {
1100: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1101: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
1102: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
1103: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
1104: }
1105: }
1107: // write result of group to group_results
1108: if (threadIdx.x == 0) {
1109: group_results[blockIdx.x ] = tmp_buffer[0];
1110: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
1111: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
1112: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
1113: }
1114: }
1116: // M = 8:
1117: __global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
1118: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
1119: PetscInt size, PetscScalar *group_results)
1120: {
1121: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
1122: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
1123: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
1124: PetscInt vec_start_index = blockIdx.x * entries_per_group;
1125: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
1127: PetscScalar entry_x = 0;
1128: PetscScalar group_sum0 = 0;
1129: PetscScalar group_sum1 = 0;
1130: PetscScalar group_sum2 = 0;
1131: PetscScalar group_sum3 = 0;
1132: PetscScalar group_sum4 = 0;
1133: PetscScalar group_sum5 = 0;
1134: PetscScalar group_sum6 = 0;
1135: PetscScalar group_sum7 = 0;
1136: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1137: entry_x = x[i]; // load only once from global memory!
1138: group_sum0 += entry_x * y0[i];
1139: group_sum1 += entry_x * y1[i];
1140: group_sum2 += entry_x * y2[i];
1141: group_sum3 += entry_x * y3[i];
1142: group_sum4 += entry_x * y4[i];
1143: group_sum5 += entry_x * y5[i];
1144: group_sum6 += entry_x * y6[i];
1145: group_sum7 += entry_x * y7[i];
1146: }
1147: tmp_buffer[threadIdx.x] = group_sum0;
1148: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
1149: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
1150: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
1151: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
1152: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
1153: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
1154: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
1156: // parallel reduction
1157: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
1158: __syncthreads();
1159: if (threadIdx.x < stride) {
1160: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1161: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
1162: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
1163: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
1164: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
1165: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
1166: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
1167: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
1168: }
1169: }
1171: // write result of group to group_results
1172: if (threadIdx.x == 0) {
1173: group_results[blockIdx.x ] = tmp_buffer[0];
1174: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
1175: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
1176: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
1177: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
1178: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
1179: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
1180: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
1181: }
1182: }
1187: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1188: {
1190: PetscInt i,j,n = xin->map->n,current_y_index = 0;
1191: CUSPARRAY *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
1192: PetscScalar *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
1193: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
1194: cudaError_t cuda_ierr;
1197: // allocate scratchpad memory for the results of individual work groups:
1198: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
1199:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
1200: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);
1202: VecCUSPGetArrayRead(xin,&xarray);
1203: xptr = thrust::raw_pointer_cast(xarray->data());
1205: while (current_y_index < nv)
1206: {
1207: switch (nv - current_y_index) {
1209: case 7:
1210: case 6:
1211: case 5:
1212: case 4:
1213: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1214: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1215: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1216: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
1218: #if defined(PETSC_USE_COMPLEX)
1219: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1220: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1221: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1222: z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
1223: #else
1224: // extract raw device pointers:
1225: y0ptr = thrust::raw_pointer_cast(y0array->data());
1226: y1ptr = thrust::raw_pointer_cast(y1array->data());
1227: y2ptr = thrust::raw_pointer_cast(y2array->data());
1228: y3ptr = thrust::raw_pointer_cast(y3array->data());
1230: // run kernel:
1231: VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
1233:       // copy results back to the CPU:
1234:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
1235: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1237: // sum group results into z:
1238: for (j=0; j<4; ++j) {
1239: z[current_y_index + j] = 0;
1240: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1241: }
1242: #endif
1243: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1244: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1245: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1246: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1247: current_y_index += 4;
1248: break;
1250: case 3:
1251: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1252: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1253: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1255: #if defined(PETSC_USE_COMPLEX)
1256: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1257: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1258: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1259: #else
1260: // extract raw device pointers:
1261: y0ptr = thrust::raw_pointer_cast(y0array->data());
1262: y1ptr = thrust::raw_pointer_cast(y1array->data());
1263: y2ptr = thrust::raw_pointer_cast(y2array->data());
1265: // run kernel:
1266: VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
1268:       // copy results back to the CPU:
1269:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
1270: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1272: // sum group results into z:
1273: for (j=0; j<3; ++j) {
1274: z[current_y_index + j] = 0;
1275: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1276: }
1277: #endif
1279: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1280: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1281: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1282: current_y_index += 3;
1283: break;
1285: case 2:
1286: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
1287: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1289: #if defined(PETSC_USE_COMPLEX)
1290: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1291: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1292: #else
1293: // extract raw device pointers:
1294: y0ptr = thrust::raw_pointer_cast(y0array->data());
1295: y1ptr = thrust::raw_pointer_cast(y1array->data());
1297: // run kernel:
1298: VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
1300:       // copy results back to the CPU:
1301:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
1302: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1304: // sum group results into z:
1305: for (j=0; j<2; ++j) {
1306: z[current_y_index + j] = 0;
1307: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1308: }
1309: #endif
1310: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
1311: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1312: current_y_index += 2;
1313: break;
1315: case 1:
1316: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
1317: #if defined(PETSC_USE_COMPLEX)
1318: z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
1319: #else
1320: z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
1321: #endif
1322: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
1323: current_y_index += 1;
1324: break;
1326: default: // 8 or more vectors left
1327: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1328: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1329: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1330: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
1331: VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);
1332: VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);
1333: VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);
1334: VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);
1336: #if defined(PETSC_USE_COMPLEX)
1337: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1338: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1339: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1340: z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
1341: z[current_y_index+4] = cusp::blas::dot(*y4array,*xarray);
1342: z[current_y_index+5] = cusp::blas::dot(*y5array,*xarray);
1343: z[current_y_index+6] = cusp::blas::dot(*y6array,*xarray);
1344: z[current_y_index+7] = cusp::blas::dot(*y7array,*xarray);
1345: #else
1346: // extract raw device pointers:
1347: y0ptr = thrust::raw_pointer_cast(y0array->data());
1348: y1ptr = thrust::raw_pointer_cast(y1array->data());
1349: y2ptr = thrust::raw_pointer_cast(y2array->data());
1350: y3ptr = thrust::raw_pointer_cast(y3array->data());
1351: y4ptr = thrust::raw_pointer_cast(y4array->data());
1352: y5ptr = thrust::raw_pointer_cast(y5array->data());
1353: y6ptr = thrust::raw_pointer_cast(y6array->data());
1354: y7ptr = thrust::raw_pointer_cast(y7array->data());
1356: // run kernel:
1357: VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
1359:       // copy results back to the CPU:
1360:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
1361: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1363: // sum group results into z:
1364: for (j=0; j<8; ++j) {
1365: z[current_y_index + j] = 0;
1366: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1367: }
1368: #endif
1369: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1370: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1371: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1372: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1373: VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);
1374: VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);
1375: VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);
1376: VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);
1377: current_y_index += 8;
1378: break;
1379: }
1380: }
1381: VecCUSPRestoreArrayRead(xin,&xarray);
1383:   cuda_ierr = cudaFree(group_results_gpu);
1384:   if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);
1385: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1386: return(0);
1387: }
1389: #undef MDOT_WORKGROUP_SIZE
1390: #undef MDOT_WORKGROUP_NUM
1396: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1397: {
1398: CUSPARRAY *xarray=NULL;
1402: /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
1403: VecCUSPGetArrayWrite(xin,&xarray);
1404: try {
1405: cusp::blas::fill(*xarray,alpha);
1406: } catch(char *ex) {
1407: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1408: }
1409: WaitForGPU();CHKERRCUSP(ierr);
1410: VecCUSPRestoreArrayWrite(xin,&xarray);
1411: return(0);
1412: }
1416: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1417: {
1418: CUSPARRAY *xarray;
1422: if (alpha == 0.0) {
1423: VecSet_SeqCUSP(xin,alpha);
1424: } else if (alpha != 1.0) {
1425: VecCUSPGetArrayReadWrite(xin,&xarray);
1426: try {
1427: cusp::blas::scal(*xarray,alpha);
1428: } catch(char *ex) {
1429: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1430: }
1431: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1432: }
1433: WaitForGPU();CHKERRCUSP(ierr);
1434: PetscLogFlops(xin->map->n);
1435: return(0);
1436: }
1441: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1442: {
1443: CUSPARRAY *xarray,*yarray;
1447: //#if defined(PETSC_USE_COMPLEX)
1448: /*Not working for complex*/
1449: //#else
1450: VecCUSPGetArrayRead(xin,&xarray);
1451: VecCUSPGetArrayRead(yin,&yarray);
1452: try {
1453: *z = cusp::blas::dot(*xarray,*yarray);
1454: } catch(char *ex) {
1455: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1456: }
1457: //#endif
1458: WaitForGPU();CHKERRCUSP(ierr);
1459: if (xin->map->n > 0) {
1460: PetscLogFlops(2.0*xin->map->n-1);
1461: }
1462: VecCUSPRestoreArrayRead(yin,&yarray);
1463: VecCUSPRestoreArrayRead(xin,&xarray);
1464: return(0);
1465: }
1468: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1469: {
1470: CUSPARRAY *xarray,*yarray;
1474: if (xin != yin) {
1475: if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1476: VecCUSPGetArrayRead(xin,&xarray);
1477: VecCUSPGetArrayWrite(yin,&yarray);
1478: try {
1479: cusp::blas::copy(*xarray,*yarray);
1480: } catch(char *ex) {
1481: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1482: }
1483: WaitForGPU();CHKERRCUSP(ierr);
1484: VecCUSPRestoreArrayRead(xin,&xarray);
1485: VecCUSPRestoreArrayWrite(yin,&yarray);
1487: } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1488: /* copy in CPU if we are on the CPU*/
1489: VecCopy_SeqCUSP_Private(xin,yin);
1490: } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1491: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
1492: if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1493: /* copy in CPU */
1494: VecCopy_SeqCUSP_Private(xin,yin);
1496: } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1497: /* copy in GPU */
1498: VecCUSPGetArrayRead(xin,&xarray);
1499: VecCUSPGetArrayWrite(yin,&yarray);
1500: try {
1501: cusp::blas::copy(*xarray,*yarray);
1502: WaitForGPU();CHKERRCUSP(ierr);
1503: } catch(char *ex) {
1504: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1505: }
1506: VecCUSPRestoreArrayRead(xin,&xarray);
1507: VecCUSPRestoreArrayWrite(yin,&yarray);
1508: } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1509:       /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
1510:          default to copy in GPU (this is an arbitrary choice) */
1511: VecCUSPGetArrayRead(xin,&xarray);
1512: VecCUSPGetArrayWrite(yin,&yarray);
1513: try {
1514: cusp::blas::copy(*xarray,*yarray);
1515: WaitForGPU();CHKERRCUSP(ierr);
1516: } catch(char *ex) {
1517: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1518: }
1519: VecCUSPRestoreArrayRead(xin,&xarray);
1520: VecCUSPRestoreArrayWrite(yin,&yarray);
1521: } else {
1522: VecCopy_SeqCUSP_Private(xin,yin);
1523: }
1524: }
1525: }
1526: return(0);
1527: }
1532: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1533: {
1535: PetscBLASInt one = 1,bn;
1536: CUSPARRAY *xarray,*yarray;
1539: PetscBLASIntCast(xin->map->n,&bn);
1540: if (xin != yin) {
1541: VecCUSPGetArrayReadWrite(xin,&xarray);
1542: VecCUSPGetArrayReadWrite(yin,&yarray);
1544: #if defined(PETSC_USE_COMPLEX)
1545: #if defined(PETSC_USE_REAL_SINGLE)
1546: cublasCswap(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);
1547: #else
1548: cublasZswap(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);
1549: #endif
1550: #else
1551: #if defined(PETSC_USE_REAL_SINGLE)
1552: cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1553: #else
1554: cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1555: #endif
1556: #endif
1557: cublasGetError();CHKERRCUSP(ierr);
1558: WaitForGPU();CHKERRCUSP(ierr);
1559: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1560: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1561: }
1562: return(0);
1563: }
1565: struct VecCUSPAX
1566: {
1567: template <typename Tuple>
1568: __host__ __device__
1569: void operator()(Tuple t)
1570: {
1571: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1572: }
1573: };
1576: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1577: {
1579: PetscScalar a = alpha,b = beta;
1580: CUSPARRAY *xarray,*yarray;
1583: if (a == 0.0) {
1584: VecScale_SeqCUSP(yin,beta);
1585: } else if (b == 1.0) {
1586: VecAXPY_SeqCUSP(yin,alpha,xin);
1587: } else if (a == 1.0) {
1588: VecAYPX_SeqCUSP(yin,beta,xin);
1589: } else if (b == 0.0) {
1590: VecCUSPGetArrayRead(xin,&xarray);
1591: VecCUSPGetArrayReadWrite(yin,&yarray);
1592: try {
1593: thrust::for_each(
1594: thrust::make_zip_iterator(
1595: thrust::make_tuple(
1596: yarray->begin(),
1597: thrust::make_constant_iterator(a),
1598: xarray->begin())),
1599: thrust::make_zip_iterator(
1600: thrust::make_tuple(
1601: yarray->end(),
1602: thrust::make_constant_iterator(a),
1603: xarray->end())),
1604: VecCUSPAX());
1605: } catch(char *ex) {
1606: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1607: }
1608: PetscLogFlops(xin->map->n);
1609: WaitForGPU();CHKERRCUSP(ierr);
1610: VecCUSPRestoreArrayRead(xin,&xarray);
1611: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1612: } else {
1613: VecCUSPGetArrayRead(xin,&xarray);
1614: VecCUSPGetArrayReadWrite(yin,&yarray);
1615: try {
1616: cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1617: } catch(char *ex) {
1618: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1619: }
1620: VecCUSPRestoreArrayRead(xin,&xarray);
1621: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1622: WaitForGPU();CHKERRCUSP(ierr);
1623: PetscLogFlops(3.0*xin->map->n);
1624: }
1625: return(0);
1626: }
1628: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1629: struct VecCUSPXPBYPCZ
1630: {
1631: /* z = x + b*y + c*z */
1632: template <typename Tuple>
1633: __host__ __device__
1634: void operator()(Tuple t)
1635: {
1636: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1637: }
1638: };
1639: struct VecCUSPAXPBYPZ
1640: {
1641: /* z = ax + b*y + z */
1642: template <typename Tuple>
1643: __host__ __device__
1644: void operator()(Tuple t)
1645: {
1646: thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1647: }
1648: };
1652: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1653: {
1655: PetscInt n = zin->map->n;
1656: CUSPARRAY *xarray,*yarray,*zarray;
1659: VecCUSPGetArrayRead(xin,&xarray);
1660: VecCUSPGetArrayRead(yin,&yarray);
1661: VecCUSPGetArrayReadWrite(zin,&zarray);
1662: if (alpha == 1.0) {
1663: try {
1664: thrust::for_each(
1665: thrust::make_zip_iterator(
1666: thrust::make_tuple(
1667: zarray->begin(),
1668: thrust::make_constant_iterator(gamma),
1669: xarray->begin(),
1670: yarray->begin(),
1671: thrust::make_constant_iterator(beta))),
1672: thrust::make_zip_iterator(
1673: thrust::make_tuple(
1674: zarray->end(),
1675: thrust::make_constant_iterator(gamma),
1676: xarray->end(),
1677: yarray->end(),
1678: thrust::make_constant_iterator(beta))),
1679: VecCUSPXPBYPCZ());
1680: } catch(char *ex) {
1681: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1682: }
1683: PetscLogFlops(4.0*n);
1684: } else if (gamma == 1.0) {
1685: try {
1686: thrust::for_each(
1687: thrust::make_zip_iterator(
1688: thrust::make_tuple(
1689: zarray->begin(),
1690: xarray->begin(),
1691: thrust::make_constant_iterator(alpha),
1692: yarray->begin(),
1693: thrust::make_constant_iterator(beta))),
1694: thrust::make_zip_iterator(
1695: thrust::make_tuple(
1696: zarray->end(),
1697: xarray->end(),
1698: thrust::make_constant_iterator(alpha),
1699: yarray->end(),
1700: thrust::make_constant_iterator(beta))),
1701: VecCUSPAXPBYPZ());
1702: } catch(char *ex) {
1703: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1704: }
1705: PetscLogFlops(4.0*n);
1706: } else {
1707: try {
1708: cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1709: } catch(char *ex) {
1710: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1711: }
1712: VecCUSPRestoreArrayReadWrite(zin,&zarray);
1713: VecCUSPRestoreArrayRead(xin,&xarray);
1714: VecCUSPRestoreArrayRead(yin,&yarray);
1715: PetscLogFlops(5.0*n);
1716: }
1717: WaitForGPU();CHKERRCUSP(ierr);
1718: return(0);
1719: }
1723: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1724: {
1726: PetscInt n = win->map->n;
1727: CUSPARRAY *xarray,*yarray,*warray;
1730: VecCUSPGetArrayRead(xin,&xarray);
1731: VecCUSPGetArrayRead(yin,&yarray);
1732: VecCUSPGetArrayReadWrite(win,&warray);
1733: try {
1734: cusp::blas::xmy(*xarray,*yarray,*warray);
1735: } catch(char *ex) {
1736: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1737: }
1738: VecCUSPRestoreArrayRead(xin,&xarray);
1739: VecCUSPRestoreArrayRead(yin,&yarray);
1740: VecCUSPRestoreArrayReadWrite(win,&warray);
1741: PetscLogFlops(n);
1742: WaitForGPU();CHKERRCUSP(ierr);
1743: return(0);
1744: }
1747: /* should do infinity norm in cusp */
1751: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
1752: {
1753: const PetscScalar *xx;
1754: PetscErrorCode ierr;
1755: PetscInt n = xin->map->n;
1756: PetscBLASInt one = 1, bn;
1757: CUSPARRAY *xarray;
1760: PetscBLASIntCast(n,&bn);
1761: if (type == NORM_2 || type == NORM_FROBENIUS) {
1762: VecCUSPGetArrayRead(xin,&xarray);
1763: try {
1764: *z = cusp::blas::nrm2(*xarray);
1765: } catch(char *ex) {
1766: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1767: }
1768: WaitForGPU();CHKERRCUSP(ierr);
1769: VecCUSPRestoreArrayRead(xin,&xarray);
1770: PetscLogFlops(PetscMax(2.0*n-1,0.0));
1771: } else if (type == NORM_INFINITY) {
1772: PetscInt i;
1773: PetscReal max = 0.0,tmp;
1775: VecGetArrayRead(xin,&xx);
1776: for (i=0; i<n; i++) {
1777: if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1778: /* check special case of tmp == NaN */
1779: if (tmp != tmp) {max = tmp; break;}
1780: xx++;
1781: }
1782: VecRestoreArrayRead(xin,&xx);
1783: *z = max;
1784: } else if (type == NORM_1) {
1785: VecCUSPGetArrayRead(xin,&xarray);
1786: #if defined(PETSC_USE_COMPLEX)
1787: #if defined(PETSC_USE_REAL_SINGLE)
1788: *z = cublasScasum(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one);
1789: #else
1790: *z = cublasDzasum(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one);
1791: #endif
1792: #else
1793: #if defined(PETSC_USE_REAL_SINGLE)
1794: *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1795: #else
1796: *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1797: #endif
1798: #endif
1799: cublasGetError();CHKERRCUSP(ierr);
1800: VecCUSPRestoreArrayRead(xin,&xarray);
1801: WaitForGPU();CHKERRCUSP(ierr);
1802: PetscLogFlops(PetscMax(n-1.0,0.0));
1803: } else if (type == NORM_1_AND_2) {
1804: VecNorm_SeqCUSP(xin,NORM_1,z);
1805: VecNorm_SeqCUSP(xin,NORM_2,z+1);
1806: }
1807: //printf("VecNorm_SeqCUSP=%1.5g\n",*z);
1808: return(0);
1809: }
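As the comment before VecNorm_SeqCUSP notes, the NORM_INFINITY branch still pulls the data back to the host. A minimal sketch of a GPU version is a single transform_reduce; this assumes real scalars, uses a hypothetical absval helper (not part of PETSc or CUSP), and does not reproduce the NaN special-casing of the host loop above.

// Sketch: infinity norm on the device via Thrust (real scalars only).
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <cmath>

struct absval {
  __host__ __device__ double operator()(double x) const { return fabs(x); }
};

static double norm_infinity(const thrust::device_vector<double> &x)
{
  /* max_i |x_i|; an empty vector reduces to the initial value 0.0 */
  return thrust::transform_reduce(x.begin(), x.end(), absval(), 0.0, thrust::maximum<double>());
}

int main()
{
  thrust::device_vector<double> x(3);
  x[0] = -7.0; x[1] = 2.0; x[2] = 5.0;
  return (norm_infinity(x) == 7.0) ? 0 : 1;
}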
1812: /* The following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory */
1816: PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
1817: {
1821: VecSetRandom_SeqCUSP_Private(xin,r);
1822: xin->valid_GPU_array = PETSC_CUSP_CPU;
1823: return(0);
1824: }
1828: PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
1829: {
1833: VecCUSPCopyFromGPU(vin);
1834: VecResetArray_SeqCUSP_Private(vin);
1835: vin->valid_GPU_array = PETSC_CUSP_CPU;
1836: return(0);
1837: }
1841: PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1842: {
1846: VecCUSPCopyFromGPU(vin);
1847: VecPlaceArray_Seq(vin,a);
1848: vin->valid_GPU_array = PETSC_CUSP_CPU;
1849: return(0);
1850: }
1855: PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1856: {
1860: VecCUSPCopyFromGPU(vin);
1861: VecReplaceArray_Seq(vin,a);
1862: vin->valid_GPU_array = PETSC_CUSP_CPU;
1863: return(0);
1864: }
1869: /*@
1870: VecCreateSeqCUSP - Creates a standard, sequential array-style vector.
1872: Collective on MPI_Comm
1874: Input Parameter:
1875: + comm - the communicator, should be PETSC_COMM_SELF
1876: - n - the vector length
1878: Output Parameter:
1879: . V - the vector
1881: Notes:
1882: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1883: same type as an existing vector.
1885: Level: intermediate
1887: Concepts: vectors^creating sequential
1889: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1890: @*/
1891: PetscErrorCode VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
1892: {
1896: VecCreate(comm,v);
1897: VecSetSizes(*v,n,n);
1898: VecSetType(*v,VECSEQCUSP);
1899: return(0);
1900: }
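A minimal usage sketch for VecCreateSeqCUSP, assuming PETSc 3.4 was configured with CUSP support; VecSet() and VecNorm() then dispatch to the CUSP implementations registered in VecCreate_SeqCUSP() below.

/* Sketch: create a sequential CUSP vector and take its 2-norm on the GPU. */
#include <petscvec.h>

int main(int argc,char **argv)
{
  Vec            v;
  PetscReal      nrm;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  ierr = VecCreateSeqCUSP(PETSC_COMM_SELF,100,&v);CHKERRQ(ierr);
  ierr = VecSet(v,1.0);CHKERRQ(ierr);           /* runs on the GPU via VecSet_SeqCUSP  */
  ierr = VecNorm(v,NORM_2,&nrm);CHKERRQ(ierr);  /* dispatches to VecNorm_SeqCUSP       */
  ierr = VecDestroy(&v);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}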
1902: /* The following template functions are for VecDotNorm2_SeqCUSP. Note that there is no complex support as currently written. */
1903: template <typename T>
1904: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1905: {
1906: __host__ __device__
1907: T operator()(T x)
1908: {
1909: #if defined(PETSC_USE_COMPLEX)
1910: //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1911: #else
1912: return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1913: #endif
1914: }
1915: };
1917: template <typename T>
1918: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1919: {
1920: __host__ __device__
1921: T operator()(T x,T y)
1922: {
1923: return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
1924: }
1925: };
1929: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1930: {
1931: PetscErrorCode ierr;
1932: PetscScalar zero = 0.0;
1933: PetscInt n=s->map->n;
1934: thrust::tuple<PetscScalar,PetscScalar> result;
1935: CUSPARRAY *sarray,*tarray;
1938: /*VecCUSPCopyToGPU(s);
1939: VecCUSPCopyToGPU(t);*/
1940: VecCUSPGetArrayRead(s,&sarray);
1941: VecCUSPGetArrayRead(t,&tarray);
1942: try {
1943: #if defined(PETSC_USE_COMPLEX)
1944: VecDot_SeqCUSP(s,t,dp);
1945: VecDot_SeqCUSP(t,t,nm);
1946: //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
1947: //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
1948: #else
1949: result = thrust::transform_reduce(
1950: thrust::make_zip_iterator(
1951: thrust::make_tuple(
1952: sarray->begin(),
1953: tarray->begin())),
1954: thrust::make_zip_iterator(
1955: thrust::make_tuple(
1956: sarray->end(),
1957: tarray->end())),
1958: cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1959: thrust::make_tuple(zero,zero), /*init */
1960: cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
1961: *dp = thrust::get<0>(result);
1962: *nm = thrust::get<1>(result);
1963: #endif
1964: } catch(char *ex) {
1965: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1966: }
1967: VecCUSPRestoreArrayRead(s,&sarray);
1968: VecCUSPRestoreArrayRead(t,&tarray);
1969: WaitForGPU();CHKERRCUSP(ierr);
1970: PetscLogFlops(4.0*n);
1971: return(0);
1972: }
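The point of the fused reduction above is that the dot product s·t and the squared norm t·t come out of a single pass over the data. The following standalone check, assuming the two functor templates above are copied alongside it, compares the fused result against two separate inner products.

// Standalone verification of the fused dot/norm2 reduction (real scalars).
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/zip_iterator.h>
#include <cassert>
#include <cmath>

int main()
{
  thrust::device_vector<double> s(3), t(3);
  s[0] = 1; s[1] = 2; s[2] = 3;
  t[0] = 4; t[1] = 5; t[2] = 6;

  thrust::tuple<double,double> r = thrust::transform_reduce(
      thrust::make_zip_iterator(thrust::make_tuple(s.begin(),t.begin())),
      thrust::make_zip_iterator(thrust::make_tuple(s.end(),t.end())),
      cuspdotnormcalculate<thrust::tuple<double,double> >(),
      thrust::make_tuple(0.0,0.0),
      cuspdotnormreduce<thrust::tuple<double,double> >());

  double dot = thrust::inner_product(s.begin(),s.end(),t.begin(),0.0); /* 32 */
  double nm2 = thrust::inner_product(t.begin(),t.end(),t.begin(),0.0); /* 77 */
  assert(fabs(thrust::get<0>(r)-dot) < 1e-12 && fabs(thrust::get<1>(r)-nm2) < 1e-12);
  return 0;
}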
1976: PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
1977: {
1981: VecCreateSeqCUSP(PetscObjectComm((PetscObject)win),win->map->n,V);
1982: PetscLayoutReference(win->map,&(*V)->map);
1983: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1984: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1985: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1986: return(0);
1987: }
1991: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1992: {
1996: try {
1997: if (v->spptr) {
1998: #if defined(PETSC_HAVE_TXPETSCGPU)
1999: if (((Vec_CUSP*)v->spptr)->GPUvector) delete ((Vec_CUSP*)v->spptr)->GPUvector;
2000: Vec_Seq *s;
2001: s = (Vec_Seq*)v->data;
2002: s->array = NULL;
2003: s->array_allocated = NULL;
2004: #endif
2005: delete ((Vec_CUSP*)v->spptr)->GPUarray;
2006: delete (Vec_CUSP*) v->spptr;
2007: }
2008: } catch(char *ex) {
2009: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
2010: }
2011: VecDestroy_SeqCUSP_Private(v);
2012: return(0);
2013: }
2016: #if defined(PETSC_USE_COMPLEX)
2017: struct conjugate
2018: {
2019: __host__ __device__
2020: PetscScalar operator()(PetscScalar x)
2021: {
2022: return cusp::conj(x);
2023: }
2024: };
2025: #endif
2030: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
2031: {
2033: CUSPARRAY *xarray;
2036: VecCUSPGetArrayReadWrite(xin,&xarray);
2037: #if defined(PETSC_USE_COMPLEX)
2038: thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
2039: #endif
2040: VecCUSPRestoreArrayReadWrite(xin,&xarray);
2041: return(0);
2042: }
2046: PETSC_EXTERN PetscErrorCode VecCreate_SeqCUSP(Vec V)
2047: {
2049: PetscMPIInt size;
2052: MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
2053: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
2054: VecCreate_Seq_Private(V,0);
2055: PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);
2057: V->ops->dot = VecDot_SeqCUSP;
2058: V->ops->norm = VecNorm_SeqCUSP;
2059: V->ops->tdot = VecTDot_SeqCUSP;
2060: V->ops->scale = VecScale_SeqCUSP;
2061: V->ops->copy = VecCopy_SeqCUSP;
2062: V->ops->set = VecSet_SeqCUSP;
2063: V->ops->swap = VecSwap_SeqCUSP;
2064: V->ops->axpy = VecAXPY_SeqCUSP;
2065: V->ops->axpby = VecAXPBY_SeqCUSP;
2066: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUSP;
2067: V->ops->pointwisemult = VecPointwiseMult_SeqCUSP;
2068: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
2069: V->ops->setrandom = VecSetRandom_SeqCUSP;
2070: V->ops->dot_local = VecDot_SeqCUSP;
2071: V->ops->tdot_local = VecTDot_SeqCUSP;
2072: V->ops->norm_local = VecNorm_SeqCUSP;
2073: V->ops->mdot_local = VecMDot_SeqCUSP;
2074: V->ops->maxpy = VecMAXPY_SeqCUSP;
2075: V->ops->mdot = VecMDot_SeqCUSP;
2076: V->ops->aypx = VecAYPX_SeqCUSP;
2077: V->ops->waxpy = VecWAXPY_SeqCUSP;
2078: V->ops->dotnorm2 = VecDotNorm2_SeqCUSP;
2079: V->ops->placearray = VecPlaceArray_SeqCUSP;
2080: V->ops->replacearray = VecReplaceArray_SeqCUSP;
2081: V->ops->resetarray = VecResetArray_SeqCUSP;
2082: V->ops->destroy = VecDestroy_SeqCUSP;
2083: V->ops->duplicate = VecDuplicate_SeqCUSP;
2084: V->ops->conjugate = VecConjugate_SeqCUSP;
2086: VecCUSPAllocateCheck(V);
2087: V->valid_GPU_array = PETSC_CUSP_GPU;
2088: VecSet(V,0.0);
2089: return(0);
2090: }
2094: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
2095: {
2099: *a = 0;
2100: VecCUSPCopyToGPU(v);
2101: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
2102: return(0);
2103: }
2107: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
2108: {
2112: v->valid_GPU_array = PETSC_CUSP_GPU;
2114: PetscObjectStateIncrease((PetscObject)v);
2115: return(0);
2116: }
2120: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
2121: {
2125: *a = 0;
2126: VecCUSPCopyToGPU(v);
2127: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
2128: return(0);
2129: }
2133: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
2134: {
2136: return(0);
2137: }
2141: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
2142: {
2146: *a = 0;
2147: VecCUSPAllocateCheck(v);
2148: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
2149: return(0);
2150: }
2154: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
2155: {
2159: v->valid_GPU_array = PETSC_CUSP_GPU;
2161: PetscObjectStateIncrease((PetscObject)v);
2162: return(0);
2163: }
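The four accessors above define the protocol for touching the GPU array directly: a Get call ensures the device copy is current (or merely allocated, for the Write variant), and the matching Restore call marks the device copy as the valid one and bumps the object state. A sketch of the intended pairing follows, using a hypothetical user routine MyGPUScale (not part of PETSc) and writing out the error-checking macros that this listing otherwise strips; it assumes the same headers as this file.

/* Sketch: scale a VECSEQCUSP vector in place on the GPU using the accessors above. */
PetscErrorCode MyGPUScale(Vec v,PetscScalar alpha)
{
  PetscErrorCode ierr;
  CUSPARRAY      *varray;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayReadWrite(v,&varray);CHKERRQ(ierr);     /* triggers VecCUSPCopyToGPU if the host copy is newer */
  try {
    cusp::blas::scal(*varray,alpha);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s",ex);
  }
  ierr = VecCUSPRestoreArrayReadWrite(v,&varray);CHKERRQ(ierr); /* flags the GPU copy as the valid one */
  PetscFunctionReturn(0);
}

For read-only access one would pair VecCUSPGetArrayRead() with VecCUSPRestoreArrayRead() instead, which leaves the valid-copy flag and the object state unchanged.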