Actual source code: mpiaijAssemble.cu
petsc-3.4.2 2013-07-02
1: #include "petscconf.h"
2: PETSC_CUDA_EXTERN_C_BEGIN
3: #include <../src/mat/impls/aij/seq/aij.h>
4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
5: #include <petscbt.h>
6: #include <../src/vec/vec/impls/dvecimpl.h>
7: #include "petsc-private/vecimpl.h"
8: PETSC_CUDA_EXTERN_C_END
9: #undef VecType
10: #include <../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h>
12: #include <thrust/reduce.h>
13: #include <thrust/inner_product.h>
15: #include <cusp/array1d.h>
16: #include <cusp/print.h>
17: #include <cusp/coo_matrix.h>
19: #include <cusp/io/matrix_market.h>
21: #include <thrust/iterator/counting_iterator.h>
22: #include <thrust/iterator/transform_iterator.h>
23: #include <thrust/iterator/permutation_iterator.h>
24: #include <thrust/functional.h>
25: #include <thrust/partition.h>
26: #include <thrust/remove.h>
28: // this example illustrates how to make repeated access to a range of values
29: // examples:
30: // repeated_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3]
31: // repeated_range([0, 1, 2, 3], 2) -> [0, 0, 1, 1, 2, 2, 3, 3]
32: // repeated_range([0, 1, 2, 3], 3) -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
33: // ...
35: template <typename Iterator>
36: class repeated_range
37: {
38: public:
40: typedef typename thrust::iterator_difference<Iterator>::type difference_type;
42: struct repeat_functor : public thrust::unary_function<difference_type,difference_type>
43: {
44: difference_type repeats;
46: repeat_functor(difference_type repeats) : repeats(repeats) {}
48: __host__ __device__
49: difference_type operator()(const difference_type &i) const
50: {
51: return i / repeats;
52: }
53: };
55: typedef typename thrust::counting_iterator<difference_type> CountingIterator;
56: typedef typename thrust::transform_iterator<repeat_functor, CountingIterator> TransformIterator;
57: typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
59: // type of the repeated_range iterator
60: typedef PermutationIterator iterator;
62: // construct repeated_range for the range [first,last)
63: repeated_range(Iterator first, Iterator last, difference_type repeats)
64: : first(first), last(last), repeats(repeats) {}
66: iterator begin(void) const
67: {
68: return PermutationIterator(first, TransformIterator(CountingIterator(0), repeat_functor(repeats)));
69: }
71: iterator end(void) const
72: {
73: return begin() + repeats * (last - first);
74: }
76: protected:
77: difference_type repeats;
78: Iterator first;
79: Iterator last;
81: };
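// Editor's note: a minimal usage sketch of repeated_range, not part of the original PETSc file;
// the vector contents and the helper name are illustrative assumptions only.
#if 0
static void repeated_range_usage_sketch(void)
{
  cusp::array1d<int, cusp::device_memory> v(4);
  v[0] = 10; v[1] = 20; v[2] = 30; v[3] = 40;
  // repeats = 2, so iterating yields 10 10 20 20 30 30 40 40
  repeated_range<cusp::array1d<int, cusp::device_memory>::iterator> r(v.begin(), v.end(), 2);
  cusp::array1d<int, cusp::device_memory> expanded(r.end() - r.begin());
  thrust::copy(r.begin(), r.end(), expanded.begin());
  cusp::print(expanded);
}
#endif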
83: // this example illustrates how to repeat blocks in a range multiple times
84: // examples:
85: // tiled_range([0, 1, 2, 3], 2) -> [0, 1, 2, 3, 0, 1, 2, 3]
86: // tiled_range([0, 1, 2, 3], 2, 4) -> [0, 1, 2, 3, 0, 1, 2, 3]
87: // tiled_range([0, 1, 2, 3], 2, 2) -> [0, 1, 0, 1, 2, 3, 2, 3]
88: // tiled_range([0, 1, 2, 3], 3, 2) -> [0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3]
89: // ...
91: template <typename Iterator>
92: class tiled_range
93: {
94: public:
96: typedef typename thrust::iterator_difference<Iterator>::type difference_type;
98: struct tile_functor : public thrust::unary_function<difference_type,difference_type>
99: {
100: difference_type repeats;
101: difference_type tile_size;
103: tile_functor(difference_type repeats, difference_type tile_size)
104: : tile_size(tile_size), repeats(repeats) {}
106: __host__ __device__
107: difference_type operator() (const difference_type &i) const
108: {
109: return tile_size * (i / (tile_size * repeats)) + i % tile_size;
110: }
111: };
113: typedef typename thrust::counting_iterator<difference_type> CountingIterator;
114: typedef typename thrust::transform_iterator<tile_functor, CountingIterator> TransformIterator;
115: typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
117: // type of the tiled_range iterator
118: typedef PermutationIterator iterator;
120: // construct tiled_range for the range [first,last)
121: tiled_range(Iterator first, Iterator last, difference_type repeats)
122: : first(first), last(last), repeats(repeats), tile_size(last - first) {}
124: tiled_range(Iterator first, Iterator last, difference_type repeats, difference_type tile_size)
125: : first(first), last(last), repeats(repeats), tile_size(tile_size)
126: {
127: // ASSERT((last - first) % tile_size == 0)
128: }
130: iterator begin(void) const
131: {
132: return PermutationIterator(first, TransformIterator(CountingIterator(0), tile_functor(repeats, tile_size)));
133: }
135: iterator end(void) const
136: {
137: return begin() + repeats * (last - first);
138: }
140: protected:
141: difference_type repeats;
142: difference_type tile_size;
143: Iterator first;
144: Iterator last;
145: };
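// Editor's note: a minimal usage sketch of tiled_range with an explicit tile size, not part of the
// original PETSc file; note that the constructor argument order is (repeats, tile_size).
#if 0
static void tiled_range_usage_sketch(void)
{
  cusp::array1d<int, cusp::device_memory> v(4);
  v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3;
  // repeats = 2, tile_size = 2: each tile of length 2 is repeated twice -> 0 1 0 1 2 3 2 3
  tiled_range<cusp::array1d<int, cusp::device_memory>::iterator> t(v.begin(), v.end(), 2, 2);
  cusp::array1d<int, cusp::device_memory> expanded(t.end() - t.begin());
  thrust::copy(t.begin(), t.end(), expanded.begin());
  cusp::print(expanded);
}
#endif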
147: typedef cusp::device_memory memSpace;
148: typedef int IndexType;
149: typedef PetscScalar ValueType;
150: typedef cusp::array1d<IndexType, memSpace> IndexArray;
151: typedef cusp::array1d<ValueType, memSpace> ValueArray;
152: typedef cusp::array1d<IndexType, cusp::host_memory> IndexHostArray;
153: typedef IndexArray::iterator IndexArrayIterator;
154: typedef ValueArray::iterator ValueArrayIterator;
156: struct is_diag
157: {
158: IndexType first, last;
160: is_diag(IndexType first, IndexType last) : first(first), last(last) {}
162: template <typename Tuple>
163: __host__ __device__
164: bool operator()(Tuple t)
165: {
166: // Keep entries whose row and column are both locally owned (diagonal block)
167: IndexType row = thrust::get<0>(t);
168: IndexType col = thrust::get<1>(t);
169: return (row >= first) && (row < last) && (col >= first) && (col < last);
170: }
171: };
173: struct is_nonlocal
174: {
175: IndexType first, last;
177: is_nonlocal(IndexType first, IndexType last) : first(first), last(last) {}
179: template <typename Tuple>
180: __host__ __device__
181: bool operator() (Tuple t)
182: {
183: // Keep entries whose row is owned by another process
184: IndexType row = thrust::get<0>(t);
185: return (row < first) || (row >= last);
186: }
187: };
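// Editor's note: an illustrative worked example, not part of the original source. Suppose this
// process owns rows [4, 8). Then for (row, column) pairs:
//   is_diag(4, 8)     on (5, 6) -> true   (row and column both local: diagonal block)
//   is_diag(4, 8)     on (5, 9) -> false  (local row, non-local column: off-diagonal block)
//   is_nonlocal(4, 8) on (2, 6) -> true   (row owned by another process)
// These predicates drive the thrust::partition_copy/stable_partition calls below that split the
// (row, column, value) triples into diagonal, off-diagonal, and nonlocal sets.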
189: /*@C
190: MatMPIAIJSetValuesBatch - Set multiple blocks of values into a matrix
192: Collective on Mat
194: Input Parameters:
195: + J - the assembled Mat object
196: . Ne - the number of blocks (elements)
197: . Nl - the block size (number of dof per element)
198: . elemRows - List of block row indices, in bunches of length Nl
199: - elemMats - List of block values, in bunches of Nl*Nl
201: Level: advanced
203: .seealso: MatSetValues()
204: @*/
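/* Editor's note: a caller-side sketch of the batch interface documented above, not part of this
   file; the element sizes, indices, values, and the assumption that MatSetValuesBatch() dispatches
   to this routine for MATMPIAIJCUSP matrices are illustrative only. */
#if 0
static PetscErrorCode AssembleTwoElementsSketch(Mat J)
{
  const PetscInt Ne = 2, Nl = 3;                      /* two elements, three dof each */
  PetscInt       elemRows[2*3] = {0, 1, 2,  2, 3, 4}; /* global row indices, Nl per element */
  PetscScalar    elemMats[2*3*3];                     /* one Nl x Nl dense block per element */
  PetscErrorCode ierr;

  for (PetscInt i = 0; i < Ne*Nl*Nl; ++i) elemMats[i] = 1.0; /* placeholder element matrices */
  ierr = MatSetValuesBatch(J, Ne, Nl, elemRows, elemMats);CHKERRQ(ierr);
  return 0;
}
#endif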
207: PetscErrorCode MatSetValuesBatch_MPIAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats)
208: {
209: // Assumptions:
210: // 1) Each elemMat is square, of size Nl x Nl
211: //
212: // This means that any nonlocal entry (i,j) where i is owned by another process is matched to
213: // a) an offdiagonal entry (j,i) if j is diagonal, or
214: // b) another nonlocal entry (j,i) if j is offdiagonal
215: //
216: // No - numSendEntries: The number of on-process diagonal+offdiagonal entries
217: // numRecvEntries: The number of off-process diagonal+offdiagonal entries
218: //
219: // Glossary:
220: // diagonal: (i,j) such that i,j in [firstRow, lastRow)
221: // offdiagonal: (i,j) such that i in [firstRow, lastRow), and j not in [firstRow, lastRow)
222: // nonlocal: (i,j) such that i not in [firstRow, lastRow)
223: // nondiagonal: (i,j) such that i not in [firstRow, lastRow), or j not in [firstRow, lastRow)
224: // on-process: entries provided by elemMats
225: // off-process: entries received from other processes
226: MPI_Comm comm;
227: Mat_MPIAIJ *j = (Mat_MPIAIJ*) J->data;
228: size_t N = Ne * Nl; // Length of elemRows (dimension of unassembled space)
229: size_t No = Ne * Nl*Nl; // Length of elemMats (total number of values)
230: PetscInt Nr; // Size of J (dimension of assembled space)
231: PetscInt firstRow, lastRow, firstCol;
232: const PetscInt *rowRanges;
233: PetscInt numNonlocalRows; // Number of rows in elemRows not owned by this process
234: PetscInt numSendEntries; // Number of (i,j,v) entries sent to other processes
235: PetscInt numRecvEntries; // Number of (i,j,v) entries received from other processes
236: PetscInt Nc;
237: PetscMPIInt numProcs, rank;
240: // copy elemRows and elemMat to device
241: IndexArray d_elemRows(elemRows, elemRows + N);
242: ValueArray d_elemMats(elemMats, elemMats + No);
245: PetscObjectGetComm((PetscObject)J,&comm);
246: MPI_Comm_size(comm, &numProcs);
247: MPI_Comm_rank(comm, &rank);
248: // get matrix information
249: MatGetLocalSize(J, &Nr, NULL);
250: MatGetOwnershipRange(J, &firstRow, &lastRow);
251: MatGetOwnershipRanges(J, &rowRanges);
252: MatGetOwnershipRangeColumn(J, &firstCol, NULL);
253: PetscInfo3(J, "Assembling matrix of size %d (rows %d -- %d)\n", Nr, firstRow, lastRow);
255: // repeat elemRows entries Nl times
256: PetscInfo(J, "Making row indices\n");
257: repeated_range<IndexArrayIterator> rowInd(d_elemRows.begin(), d_elemRows.end(), Nl);
259: // tile rows of elemRows Nl times
260: PetscInfo(J, "Making column indices\n");
261: tiled_range<IndexArrayIterator> colInd(d_elemRows.begin(), d_elemRows.end(), Nl, Nl);
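  // Editor's note: an illustrative worked example, not in the original source. With Nl = 2 and
  // d_elemRows = [a, b, c, d] (two elements with dofs {a, b} and {c, d}):
  //   rowInd = [a, a, b, b, c, c, d, d]   (each row index repeated Nl times)
  //   colInd = [a, b, a, b, c, d, c, d]   (each element's Nl indices tiled Nl times)
  // so (rowInd[k], colInd[k]) walks every Nl x Nl element matrix in row-major order, matching the
  // layout of d_elemMats.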
263: // Find number of nonlocal rows, convert nonlocal rows to procs, and send sizes of off-proc entries (could send diag and offdiag sizes)
264: // TODO: Ask Nathan how to do this on GPU
265: PetscLogEventBegin(MAT_SetValuesBatchI,0,0,0,0);
266: PetscMPIInt *procSendSizes, *procRecvSizes;
268: PetscMalloc2(numProcs, PetscMPIInt, &procSendSizes, numProcs, PetscMPIInt, &procRecvSizes);
269: PetscMemzero(procSendSizes, numProcs * sizeof(PetscMPIInt));
270: PetscMemzero(procRecvSizes, numProcs * sizeof(PetscMPIInt));
272: numNonlocalRows = 0;
273: for (size_t i = 0; i < N; ++i) {
274: const PetscInt row = elemRows[i];
276: if ((row < firstRow) || (row >= lastRow)) {
277: numNonlocalRows++;
278: for (IndexType p = 0; p < numProcs; ++p) {
279: if ((row >= rowRanges[p]) && (row < rowRanges[p+1])) {
280: procSendSizes[p] += Nl;
281: break;
282: }
283: }
284: }
285: }
286: numSendEntries = numNonlocalRows*Nl;
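  // Editor's note: an illustrative example, not in the original source. With rowRanges = [0, 4, 8]
  // on two processes, a nonlocal row index of 6 falls in [rowRanges[1], rowRanges[2]), so its Nl
  // prospective (i, j, v) entries are charged to procSendSizes[1].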
288: PetscInfo2(J, "Nonlocal rows %d total entries %d\n", numNonlocalRows, No);
289: MPI_Alltoall(procSendSizes, 1, MPIU_INT, procRecvSizes, 1, MPIU_INT, comm);
291: numRecvEntries = 0;
292: for (PetscInt p = 0; p < numProcs; ++p) numRecvEntries += procRecvSizes[p];
293: PetscInfo2(j->A, "Send entries %d Recv Entries %d\n", numSendEntries, numRecvEntries);
294: PetscLogEventEnd(MAT_SetValuesBatchI,0,0,0,0);
295: // Allocate storage for "fat" COO representation of matrix
296: PetscLogEventBegin(MAT_SetValuesBatchII,0,0,0,0);
297: PetscInfo2(j->A, "Making COO matrices, diag entries %d, nondiag entries %d\n", No-numSendEntries+numRecvEntries, numSendEntries*2);
298: cusp::coo_matrix<IndexType,ValueType, memSpace> diagCOO(Nr, Nr, No-numSendEntries+numRecvEntries); // ALLOC: This is oversized because I also count offdiagonal entries
299: IndexArray nondiagonalRows(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
300: IndexArray nondiagonalCols(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
301: ValueArray nondiagonalVals(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
302: IndexArray nonlocalRows(numSendEntries);
303: IndexArray nonlocalCols(numSendEntries);
304: ValueArray nonlocalVals(numSendEntries);
305: // partition on-process entries into diagonal and off-diagonal+nonlocal
306: PetscInfo(J, "Splitting on-process entries into diagonal and off-diagonal+nonlocal\n");
307: thrust::fill(diagCOO.row_indices.begin(), diagCOO.row_indices.end(), -1);
308: thrust::fill(nondiagonalRows.begin(), nondiagonalRows.end(), -1);
309: thrust::partition_copy(thrust::make_zip_iterator(thrust::make_tuple(rowInd.begin(), colInd.begin(), d_elemMats.begin())),
310: thrust::make_zip_iterator(thrust::make_tuple(rowInd.end(), colInd.end(), d_elemMats.end())),
311: thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin(), diagCOO.values.begin())),
312: thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin())),
313: is_diag(firstRow, lastRow));
314: // Current size without off-proc entries
315: PetscInt diagonalSize = (diagCOO.row_indices.end() - diagCOO.row_indices.begin()) - thrust::count(diagCOO.row_indices.begin(), diagCOO.row_indices.end(), -1);
316: PetscInt nondiagonalSize = No - diagonalSize;
317: PetscInfo2(j->A, "Diagonal size %d Nondiagonal size %d\n", diagonalSize, nondiagonalSize);
318: ///cusp::print(diagCOO);
319: ///cusp::print(nondiagonalRows);
320: // partition on-process entries again into off-diagonal and nonlocal
321: PetscInfo(J, "Splitting on-process entries into off-diagonal and nonlocal\n");
322: thrust::stable_partition(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin())),
323: thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(), nondiagonalCols.end(), nondiagonalVals.end())),
324: is_nonlocal(firstRow, lastRow));
325: PetscInt nonlocalSize = numSendEntries;
326: PetscInt offdiagonalSize = nondiagonalSize - nonlocalSize;
327: PetscInfo2(j->A, "Nonlocal size %d Offdiagonal size %d\n", nonlocalSize, offdiagonalSize);
328: PetscLogEventEnd(MAT_SetValuesBatchII,0,0,0,0);
329: ///cusp::print(nondiagonalRows);
330: // send off-proc entries (pack this up later)
331: PetscLogEventBegin(MAT_SetValuesBatchIII,0,0,0,0);
332: PetscMPIInt *procSendDispls, *procRecvDispls;
333: PetscInt *sendRows, *recvRows;
334: PetscInt *sendCols, *recvCols;
335: PetscScalar *sendVals, *recvVals;
337: PetscMalloc2(numProcs, PetscMPIInt, &procSendDispls, numProcs, PetscMPIInt, &procRecvDispls);
338: PetscMalloc3(numSendEntries, PetscInt, &sendRows, numSendEntries, PetscInt, &sendCols, numSendEntries, PetscScalar, &sendVals);
339: PetscMalloc3(numRecvEntries, PetscInt, &recvRows, numRecvEntries, PetscInt, &recvCols, numRecvEntries, PetscScalar, &recvVals);
341: procSendDispls[0] = procRecvDispls[0] = 0;
342: for (PetscInt p = 1; p < numProcs; ++p) {
343: procSendDispls[p] = procSendDispls[p-1] + procSendSizes[p-1];
344: procRecvDispls[p] = procRecvDispls[p-1] + procRecvSizes[p-1];
345: }
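  // Editor's note: an illustrative example, not in the original source. The displacements are an
  // exclusive prefix sum of the sizes: procSendSizes = [0, 4, 2] gives procSendDispls = [0, 0, 4],
  // i.e. rank 1's entries start at offset 0 and rank 2's at offset 4 in the packed send buffers.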
346: #if 0
347: thrust::copy(nondiagonalRows.begin(), nondiagonalRows.begin()+nonlocalSize, sendRows);
348: thrust::copy(nondiagonalCols.begin(), nondiagonalCols.begin()+nonlocalSize, sendCols);
349: thrust::copy(nondiagonalVals.begin(), nondiagonalVals.begin()+nonlocalSize, sendVals);
350: #else
351: thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin())),
352: thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin()))+nonlocalSize,
353: thrust::make_zip_iterator(thrust::make_tuple(sendRows, sendCols, sendVals)));
354: #endif
355: // could pack into a struct and unpack
356: MPI_Alltoallv(sendRows, procSendSizes, procSendDispls, MPIU_INT, recvRows, procRecvSizes, procRecvDispls, MPIU_INT, comm);
357: MPI_Alltoallv(sendCols, procSendSizes, procSendDispls, MPIU_INT, recvCols, procRecvSizes, procRecvDispls, MPIU_INT, comm);
358: MPI_Alltoallv(sendVals, procSendSizes, procSendDispls, MPIU_SCALAR, recvVals, procRecvSizes, procRecvDispls, MPIU_SCALAR, comm);
359: PetscFree2(procSendSizes, procRecvSizes);
360: PetscFree2(procSendDispls, procRecvDispls);
361: PetscFree3(sendRows, sendCols, sendVals);
362: PetscLogEventEnd(MAT_SetValuesBatchIII,0,0,0,0);
364: PetscLogEventBegin(MAT_SetValuesBatchIV,0,0,0,0);
365: // Create off-diagonal matrix
366: cusp::coo_matrix<IndexType,ValueType, memSpace> offdiagCOO(Nr, Nr, offdiagonalSize+numRecvEntries); // ALLOC: This is oversized because numRecvEntries also counts diagonal entries
367: // partition again into diagonal and off-diagonal
368: IndexArray d_recvRows(recvRows, recvRows+numRecvEntries);
369: IndexArray d_recvCols(recvCols, recvCols+numRecvEntries);
370: ValueArray d_recvVals(recvVals, recvVals+numRecvEntries);
371: #if 0
372: thrust::copy(nondiagonalRows.end()-offdiagonalSize, nondiagonalRows.end(), offdiagCOO.row_indices.begin());
373: thrust::copy(nondiagonalCols.end()-offdiagonalSize, nondiagonalCols.end(), offdiagCOO.column_indices.begin());
374: thrust::copy(nondiagonalVals.end()-offdiagonalSize, nondiagonalVals.end(), offdiagCOO.values.begin());
375: #else
376: thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(), nondiagonalCols.end(), nondiagonalVals.end()))-offdiagonalSize,
377: thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(), nondiagonalCols.end(), nondiagonalVals.end())),
378: thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin(), offdiagCOO.values.begin())));
379: #endif
380: thrust::fill(diagCOO.row_indices.begin()+diagonalSize, diagCOO.row_indices.end(), -1);
381: thrust::fill(offdiagCOO.row_indices.begin()+offdiagonalSize, offdiagCOO.row_indices.end(), -1);
382: thrust::partition_copy(thrust::make_zip_iterator(thrust::make_tuple(d_recvRows.begin(), d_recvCols.begin(), d_recvVals.begin())),
383: thrust::make_zip_iterator(thrust::make_tuple(d_recvRows.end(), d_recvCols.end(), d_recvVals.end())),
384: thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin()+diagonalSize, diagCOO.column_indices.begin()+diagonalSize, diagCOO.values.begin()+diagonalSize)),
385: thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin()+offdiagonalSize, offdiagCOO.column_indices.begin()+offdiagonalSize, offdiagCOO.values.begin()+offdiagonalSize)),
386: is_diag(firstRow, lastRow));
388: PetscFree3(recvRows, recvCols, recvVals);
390: diagonalSize = (diagCOO.row_indices.end() - diagCOO.row_indices.begin()) - thrust::count(diagCOO.row_indices.begin(), diagCOO.row_indices.end(), -1);
391: offdiagonalSize = (offdiagCOO.row_indices.end() - offdiagCOO.row_indices.begin()) - thrust::count(offdiagCOO.row_indices.begin(), offdiagCOO.row_indices.end(), -1);
393: // sort COO format by (i,j), this is the most costly step
394: PetscInfo(J, "Sorting rows and columns\n");
395: diagCOO.sort_by_row_and_column();
396: offdiagCOO.sort_by_row_and_column();
397: PetscInt diagonalOffset = (diagCOO.row_indices.end() - diagCOO.row_indices.begin()) - diagonalSize;
398: PetscInt offdiagonalOffset = (offdiagCOO.row_indices.end() - offdiagCOO.row_indices.begin()) - offdiagonalSize;
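  // Editor's note: clarifying comment, not in the original source. Both COO arrays were
  // over-allocated and padded with row index -1; after sorting, that padding collects at the
  // front, so diagonalOffset/offdiagonalOffset is just the amount of padding to skip before the
  // real (i, j, v) entries begin.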
400: // print the "fat" COO representation
401: if (PetscLogPrintInfo) {
402: #if !defined(PETSC_USE_COMPLEX)
403: cusp::print(diagCOO);
404: cusp::print(offdiagCOO);
405: #endif
406: }
408: // Figure out the number of unique off-diagonal columns
409: Nc = thrust::inner_product(offdiagCOO.column_indices.begin()+offdiagonalOffset,
410: offdiagCOO.column_indices.end() - 1,
411: offdiagCOO.column_indices.begin()+offdiagonalOffset + 1,
412: size_t(1), thrust::plus<size_t>(), thrust::not_equal_to<IndexType>());
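  // Editor's note: an illustrative example of the inner_product idiom used here, not in the
  // original source. On sorted keys [2, 2, 5, 7, 7], comparing each key with its successor using
  // not_equal_to gives [0, 1, 1, 0]; summing those flags starting from 1 yields 3, the number of
  // distinct keys.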
414: // compute number of unique (i,j) entries
415: // this counts the number of changes as we move along the (i,j) list
416: PetscInfo(J, "Computing number of unique entries\n");
417: size_t num_diag_entries = thrust::inner_product
418: (thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset,
419: thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.end(), diagCOO.column_indices.end())) - 1,
420: thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset + 1,
421: size_t(1),
422: thrust::plus<size_t>(),
423: thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());
424: size_t num_offdiag_entries = thrust::inner_product
425: (thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset,
426: thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.end(), offdiagCOO.column_indices.end())) - 1,
427: thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset + 1,
428: size_t(1),
429: thrust::plus<size_t>(),
430: thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());
432: // allocate COO storage for final matrix
433: PetscInfo(J, "Allocating compressed matrices\n");
434: cusp::coo_matrix<IndexType, ValueType, memSpace> A(Nr, Nr, num_diag_entries);
435: cusp::coo_matrix<IndexType, ValueType, memSpace> B(Nr, Nc, num_offdiag_entries);
437: // sum values with the same (i,j) index
438: // XXX thrust::reduce_by_key is unoptimized right now, so we provide a SpMV-based one in cusp::detail
439: // the Cusp one is 2x faster, but still not optimal
440: // This could possibly be done in-place
441: PetscInfo(J, "Compressing matrices\n");
442: thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset,
443: thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.end(), diagCOO.column_indices.end())),
444: diagCOO.values.begin() + diagonalOffset,
445: thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
446: A.values.begin(),
447: thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
448: thrust::plus<ValueType>());
450: thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset,
451: thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.end(), offdiagCOO.column_indices.end())),
452: offdiagCOO.values.begin() + offdiagonalOffset,
453: thrust::make_zip_iterator(thrust::make_tuple(B.row_indices.begin(), B.column_indices.begin())),
454: B.values.begin(),
455: thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
456: thrust::plus<ValueType>());
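  // Editor's note: an illustrative example, not in the original source. reduce_by_key sums runs of
  // equal (i, j) keys in the sorted "fat" COO, e.g. keys [(0,1), (0,1), (2,0)] with values
  // [2.0, 3.0, 1.0] become keys [(0,1), (2,0)] with values [5.0, 1.0], which is the duplicate-free
  // COO stored in A and B.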
458: // Convert row and column numbers
459: if (firstRow) {
460: thrust::transform(A.row_indices.begin(), A.row_indices.end(), thrust::make_constant_iterator(firstRow), A.row_indices.begin(), thrust::minus<IndexType>());
461: thrust::transform(B.row_indices.begin(), B.row_indices.end(), thrust::make_constant_iterator(firstRow), B.row_indices.begin(), thrust::minus<IndexType>());
462: }
463: if (firstCol) {
464: thrust::transform(A.column_indices.begin(), A.column_indices.end(), thrust::make_constant_iterator(firstCol), A.column_indices.begin(), thrust::minus<IndexType>());
465: }
466: #if 0 // This is done by MatSetUpMultiply_MPIAIJ()
467: // TODO: Get better code from Nathan
468: IndexArray d_colmap(Nc);
469: thrust::unique_copy(B.column_indices.begin(), B.column_indices.end(), d_colmap.begin());
470: IndexHostArray colmap(d_colmap.begin(), d_colmap.end());
471: IndexType newCol = 0;
472: for (IndexHostArray::iterator c_iter = colmap.begin(); c_iter != colmap.end(); ++c_iter, ++newCol) {
473: thrust::replace(B.column_indices.begin(), B.column_indices.end(), *c_iter, newCol);
474: }
475: #endif
477: // print the final matrix
478: if (PetscLogPrintInfo) {
479: #if !defined(PETSC_USE_COMPLEX)
480: cusp::print(A);
481: cusp::print(B);
482: #endif
483: }
485: PetscInfo(J, "Converting to PETSc matrix\n");
486: MatSetType(J, MATMPIAIJCUSP);
487: CUSPMATRIX *Agpu = new CUSPMATRIX;
488: CUSPMATRIX *Bgpu = new CUSPMATRIX;
489: cusp::convert(A, *Agpu);
490: cusp::convert(B, *Bgpu);
491: if (PetscLogPrintInfo) {
492: #if !defined(PETSC_USE_COMPLEX)
493: cusp::print(*Agpu);
494: cusp::print(*Bgpu);
495: #endif
496: }
497: {
498: PetscInfo(J, "Copying to CPU matrix");
499: MatCUSPCopyFromGPU(j->A, Agpu);
500: MatCUSPCopyFromGPU(j->B, Bgpu);
501: #if 0 // This is done by MatSetUpMultiply_MPIAIJ()
503: // Create the column map
504: PetscFree(j->garray);
505: PetscMalloc(Nc * sizeof(PetscInt), &j->garray);
506: PetscInt c = 0;
507: for (IndexHostArray::iterator c_iter = colmap.begin(); c_iter != colmap.end(); ++c_iter, ++c) {
508: j->garray[c] = *c_iter;
509: }
510: #endif
511: }
512: PetscLogEventEnd(MAT_SetValuesBatchIV,0,0,0,0);
513: return(0);
514: }