ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
spgemm.hpp
#ifndef VIENNACL_LINALG_CUDA_SPGEMM_HPP_
#define VIENNACL_LINALG_CUDA_SPGEMM_HPP_

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

#include <stdexcept>

#include <thrust/scan.h>
#include <thrust/device_ptr.h>

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"

#include "viennacl/tools/timer.hpp"

#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{

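// This file implements the sparse matrix-matrix product C = A * B for compressed_matrix (CSR)
// operands using a row-merging ("RMerge"-style) scheme. The computation runs in three kernel stages:
//   Stage 1 inspects A and B and selects, per thread block, the subwarp size and upper bounds on the
//           row lengths of A and B encountered in that block's row range.
//   Stage 2 computes the number of nonzeros of each row of C (the sparsity pattern).
//   Stage 3 computes the column indices and values of C.
// If some row of A is too long even for the two-level merge used in stages 2 and 3, A is first
// factored as A = A2 * G1 (see the decomposition kernels further down) and prod_impl() recurses.
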
template<typename NumericT>
static inline __device__ NumericT load_and_cache(const NumericT *address)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}

//
// Stage 1: Obtain upper bound for number of elements per row in C:
//
template<typename IndexT>
__device__ IndexT round_to_next_power_of_2(IndexT val)
{
  if (val > 32)
    return 64; // just to indicate that we need to split/factor the matrix!
  else if (val > 16)
    return 32;
  else if (val > 8)
    return 16;
  else if (val > 4)
    return 8;
  else if (val > 2)
    return 4;
  else if (val > 1)
    return 2;
  else
    return 1;
}

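// Stage 1 kernel: each thread block (work group) scans its share of the rows of A and determines
//  (i)   an estimate of the subwarp size needed to merge the rows of B contributing to one row of C
//        (for rows of A with more than 32 entries a sqrt-based estimate is used, since a subwarp of
//        size s can merge s rows of B per pass and combine up to s partial results in a final pass),
//  (ii)  the longest row of A in the block's row range, and
//  (iii) the longest row of B referenced from that range.
// The per-thread results are combined by a shared-memory maximum reduction; thread 0 writes them out,
// with the subwarp size rounded up to the next power of two (64 indicating that A must be decomposed).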
template<typename IndexT>
__global__ void compressed_matrix_gemm_stage_1(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          IndexT A_size1,
          const IndexT * B_row_indices,
          IndexT *subwarpsize_per_group,
          IndexT *max_nnz_row_A_per_group,
          IndexT *max_nnz_row_B_per_group)
{
  unsigned int subwarpsize_in_thread = 0;
  unsigned int max_nnz_row_A = 0;
  unsigned int max_nnz_row_B = 0;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + threadIdx.x; row < row_per_group_end; row += blockDim.x)
  {
    unsigned int A_row_start = A_row_indices[row];
    unsigned int A_row_end   = A_row_indices[row+1];
    unsigned int row_num     = A_row_end - A_row_start;
    if (row_num > 32) // too many rows in B need to be merged for a single pass
    {
      unsigned int subwarp_sqrt = (unsigned int)sqrt(double(row_num)) + 1;
      subwarpsize_in_thread = max(subwarp_sqrt, subwarpsize_in_thread);
    }
    else
      subwarpsize_in_thread = max(A_row_end - A_row_start, subwarpsize_in_thread);
    max_nnz_row_A = max(max_nnz_row_A, row_num);
    for (unsigned int j = A_row_start; j < A_row_end; ++j)
    {
      unsigned int col = A_col_indices[j];
      unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col];
      max_nnz_row_B = max(row_len_B, max_nnz_row_B);
    }
  }

  // reduction to obtain maximum in thread block
  __shared__ unsigned int shared_subwarpsize[256];
  __shared__ unsigned int shared_max_nnz_row_A[256];
  __shared__ unsigned int shared_max_nnz_row_B[256];

  shared_subwarpsize[threadIdx.x]   = subwarpsize_in_thread;
  shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A;
  shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B;
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_subwarpsize[threadIdx.x]   = max(  shared_subwarpsize[threadIdx.x],   shared_subwarpsize[threadIdx.x + stride]);
      shared_max_nnz_row_A[threadIdx.x] = max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x + stride]);
      shared_max_nnz_row_B[threadIdx.x] = max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x + stride]);
    }
  }

  if (threadIdx.x == 0)
  {
    subwarpsize_per_group[blockIdx.x]   = round_to_next_power_of_2(shared_subwarpsize[0]);
    max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0];
    max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0];
  }
}

//
// Stage 2: Determine sparsity pattern of C
//
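// Subwarp merge primitive (symbolic stage): each lane of a subwarp walks through one row of B.
// In every iteration the lanes exchange their current column index (the 'front') via warp shuffles
// and take the minimum, which is the next column index of the merged output row; lanes whose front
// equals that minimum advance to their next entry. B_size2 serves as the sentinel for exhausted rows.
// The return value is the number of distinct column indices, i.e. the nonzero count of the merged row.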
inline __device__ unsigned int merge_subwarp_symbolic(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2, unsigned int subwarpsize)
{
  unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;

  unsigned int num_nnz = 0;
  while (1)
  {
    // determine current minimum (warp shuffle)
    unsigned int min_index = current_front_index;
    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
      min_index = min(min_index, __shfl_xor((int)min_index, (int)i));

    if (min_index == B_size2)
      break;

    // update front:
    current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2)
                                                             : current_front_index;
    ++num_nnz;
  }

  return num_nnz;
}

inline __device__ unsigned int merge_subwarp_symbolic_double(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2,
                                                             unsigned int *output_array, unsigned int id_in_warp, unsigned int subwarpsize)
{
  unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;

  unsigned int num_nnz = 0;
  unsigned int index_buffer = 0;
  unsigned int buffer_size = 0;
  while (1)
  {
    // determine current minimum (warp shuffle)
    unsigned int min_index = current_front_index;
    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
      min_index = min(min_index, __shfl_xor((int)min_index, (int)i));

    if (min_index == B_size2)
      break;

    // update front:
    current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2)
                                                             : current_front_index;

    // write output
    index_buffer = (id_in_warp == buffer_size) ? min_index : index_buffer;
    ++buffer_size;

    if (buffer_size == subwarpsize) // register buffer full?
    {
      output_array[id_in_warp] = index_buffer;
      output_array += subwarpsize;
      buffer_size = 0;
    }

    ++num_nnz;
  }

  // write remaining entries from register buffer:
  if (id_in_warp < buffer_size)
    output_array[id_in_warp] = index_buffer;

  return num_nnz;
}

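// Stage 2 kernel: computes the number of nonzeros of each row of C and stores it in C_row_indices
// (the host converts these counts into CSR row offsets afterwards). Each subwarp processes one row
// of A. If that row holds more entries than the subwarp size, the merge runs in two levels: chunks
// of 'subwarpsize' rows of B are merged into scratchpad rows (merge_subwarp_symbolic_double also
// writes the merged column indices), and each lane remembers the extent of the scratchpad row
// produced in 'its' iteration, so the final merge_subwarp_symbolic() call can merge the scratchpad
// rows just like rows of B.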
template<typename IndexT>
__global__ void compressed_matrix_gemm_stage_2(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          IndexT A_size1,
          const IndexT * B_row_indices,
          const IndexT * B_col_indices,
          IndexT B_size2,
          IndexT * C_row_indices,
          unsigned int *subwarpsize_array,
          unsigned int *max_row_size_A,
          unsigned int *max_row_size_B,
          unsigned int *scratchpad_offsets,
          unsigned int *scratchpad_indices)
{
  unsigned int subwarpsize = subwarpsize_array[blockIdx.x];

  unsigned int num_warps  = blockDim.x / subwarpsize;
  unsigned int warp_id    = threadIdx.x / subwarpsize;
  unsigned int id_in_warp = threadIdx.x % subwarpsize;

  unsigned int scratchpad_rowlength     = max_row_size_B[blockIdx.x] * subwarpsize;
  unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1;
  unsigned int *subwarp_scratchpad_start = scratchpad_indices + scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
  {
    unsigned int row_A_start = A_row_indices[row];
    unsigned int row_A_end   = A_row_indices[row+1];

    if (row_A_end - row_A_start > subwarpsize)
    {
      unsigned int final_merge_start = 0;
      unsigned int final_merge_end   = 0;

      // merge to temporary scratchpad memory:
      unsigned int *subwarp_scratchpad = subwarp_scratchpad_start;
      unsigned int iter = 0;
      while (row_A_end > row_A_start)
      {
        unsigned int my_row_B    = row_A_start + id_in_warp;
        unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
        unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
        unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;

        unsigned int nnz_in_merge = merge_subwarp_symbolic_double(row_B_start, row_B_end, B_col_indices, B_size2,
                                                                  subwarp_scratchpad, id_in_warp, subwarpsize);

        final_merge_start = (iter == id_in_warp) ? subwarp_scratchpad - scratchpad_indices : final_merge_start;
        final_merge_end   = (iter == id_in_warp) ? final_merge_start + nnz_in_merge        : final_merge_end;
        ++iter;

        row_A_start += subwarpsize;
        subwarp_scratchpad += scratchpad_rowlength; // write to next row in scratchpad
      }

      // final merge:
      unsigned int num_nnz = merge_subwarp_symbolic(final_merge_start, final_merge_end, scratchpad_indices, B_size2, subwarpsize);

      if (id_in_warp == 0)
        C_row_indices[row] = num_nnz;
    }
    else
    {
      // single merge
      unsigned int my_row_B    = row_A_start + id_in_warp;
      unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
      unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
      unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;

      unsigned int num_nnz = merge_subwarp_symbolic(row_B_start, row_B_end, B_col_indices, B_size2, subwarpsize);

      if (id_in_warp == 0)
        C_row_indices[row] = num_nnz;
    }
  }

}


//
// Stage 3: Fill C with values
//
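// Subwarp merge primitive (numeric stage): the same front-minimum scheme as in the symbolic stage,
// but lanes whose front matches the current minimum additionally contribute scaling_factor times
// their value, and the contributions are summed across the subwarp with a shuffle-based reduction.
// Column indices and values are staged in a per-lane register buffer and flushed to the output
// arrays with coalesced writes whenever the buffer holds 'subwarpsize' entries.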
template<typename NumericT>
__device__ unsigned int merge_subwarp_numeric(NumericT scaling_factor,
                                              unsigned int input_start, unsigned int input_end, const unsigned int *input_indices, const NumericT *input_values, unsigned int invalid_token,
                                              unsigned int *output_indices, NumericT *output_values,
                                              unsigned int id_in_warp, unsigned int subwarpsize)
{
  unsigned int current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token;
  NumericT     current_front_value = (input_start < input_end) ? load_and_cache(input_values  + input_start) : 0;

  unsigned int index_buffer = 0;
  NumericT     value_buffer = 0;
  unsigned int buffer_size  = 0;
  unsigned int nnz_written  = 0;
  while (1)
  {
    // determine current minimum:
    unsigned int min_index = current_front_index;
    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
      min_index = min(min_index, __shfl_xor((int)min_index, (int)i));

    if (min_index == invalid_token) // done
      break;

    // compute entry in C:
    NumericT output_value = (current_front_index == min_index) ? scaling_factor * current_front_value : 0;
    for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
      output_value += __shfl_xor((int)output_value, (int)i);

    // update front:
    if (current_front_index == min_index)
    {
      ++input_start;
      current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token;
      current_front_value = (input_start < input_end) ? load_and_cache(input_values  + input_start) : 0;
    }

    // write current front to register buffer:
    index_buffer = (id_in_warp == buffer_size) ? min_index    : index_buffer;
    value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer;
    ++buffer_size;

    // flush register buffer via a coalesced write once full:
    if (buffer_size == subwarpsize)
    {
      output_indices[id_in_warp] = index_buffer; output_indices += subwarpsize;
      output_values[id_in_warp]  = value_buffer; output_values  += subwarpsize;
      buffer_size = 0;
    }

    ++nnz_written;
  }

  // write remaining entries in register buffer to C:
  if (id_in_warp < buffer_size)
  {
    output_indices[id_in_warp] = index_buffer;
    output_values[id_in_warp]  = value_buffer;
  }

  return nnz_written;
}

template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_stage_3(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          const NumericT * A_elements,
          IndexT A_size1,
          const IndexT * B_row_indices,
          const IndexT * B_col_indices,
          const NumericT * B_elements,
          IndexT B_size2,
          IndexT const * C_row_indices,
          IndexT * C_col_indices,
          NumericT * C_elements,
          unsigned int *subwarpsize_array,
          unsigned int *max_row_size_A,
          unsigned int *max_row_size_B,
          unsigned int *scratchpad_offsets,
          unsigned int *scratchpad_indices,
          NumericT *scratchpad_values)
{
  unsigned int subwarpsize = subwarpsize_array[blockIdx.x];

  unsigned int num_warps  = blockDim.x / subwarpsize;
  unsigned int warp_id    = threadIdx.x / subwarpsize;
  unsigned int id_in_warp = threadIdx.x % subwarpsize;

  unsigned int scratchpad_rowlength     = max_row_size_B[blockIdx.x] * subwarpsize;
  unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1;
  unsigned int subwarp_scratchpad_shift = scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
  {
    unsigned int row_A_start = A_row_indices[row];
    unsigned int row_A_end   = A_row_indices[row+1];

    if (row_A_end - row_A_start > subwarpsize)
    {
      // first merge stage:
      unsigned int final_merge_start = 0;
      unsigned int final_merge_end   = 0;
      unsigned int iter = 0;
      unsigned int *scratchpad_indices_ptr = scratchpad_indices + subwarp_scratchpad_shift;
      NumericT     *scratchpad_values_ptr  = scratchpad_values  + subwarp_scratchpad_shift;

      while (row_A_start < row_A_end)
      {
        unsigned int my_row_B    = row_A_start + id_in_warp;
        unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
        unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
        unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
        NumericT val_A           = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;

        unsigned int nnz_written = merge_subwarp_numeric(val_A,
                                                         row_B_start, row_B_end, B_col_indices, B_elements, B_size2,
                                                         scratchpad_indices_ptr, scratchpad_values_ptr,
                                                         id_in_warp, subwarpsize);

        if (iter == id_in_warp)
        {
          final_merge_start = scratchpad_indices_ptr - scratchpad_indices;
          final_merge_end   = final_merge_start + nnz_written;
        }
        ++iter;

        row_A_start += subwarpsize;
        scratchpad_indices_ptr += scratchpad_rowlength;
        scratchpad_values_ptr  += scratchpad_rowlength;
      }

      // second merge stage:
      unsigned int index_in_C = C_row_indices[row];
      merge_subwarp_numeric(NumericT(1),
                            final_merge_start, final_merge_end, scratchpad_indices, scratchpad_values, B_size2,
                            C_col_indices + index_in_C, C_elements + index_in_C,
                            id_in_warp, subwarpsize);
    }
    else
    {
      unsigned int my_row_B    = row_A_start + id_in_warp;
      unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
      unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
      unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
      NumericT val_A           = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;

      unsigned int index_in_C = C_row_indices[row];

      merge_subwarp_numeric(val_A,
                            row_B_start, row_B_end, B_col_indices, B_elements, B_size2,
                            C_col_indices + index_in_C, C_elements + index_in_C,
                            id_in_warp, subwarpsize);
    }
  }

}



//
// Decomposition kernels:
//
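// If stage 1 requests a subwarp size larger than 32 (i.e. some row of A is too long even for the
// two-level merge), A is factored as A = A2 * G1 before multiplying: G1 contains the entries of A,
// but each row of A is split into chunks of at most 'max_per_row' entries, so all rows of G1 are
// short; A2 is a selector matrix with unit entries that re-assembles the chunk rows. The product is
// then computed recursively as C = A2 * (G1 * B). The kernels below set up this decomposition.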
template<typename IndexT>
__global__ void compressed_matrix_gemm_decompose_1(
          const IndexT * A_row_indices,
          IndexT A_size1,
          IndexT max_per_row,
          IndexT *chunks_per_row)
{
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
  {
    IndexT num_entries = A_row_indices[i+1] - A_row_indices[i];
    chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1);
  }
}


template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_A2(
          IndexT * A2_row_indices,
          IndexT * A2_col_indices,
          NumericT * A2_elements,
          IndexT A2_size1,
          IndexT *new_row_buffer)
{
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x)
  {
    unsigned int index_start = new_row_buffer[i];
    unsigned int index_stop  = new_row_buffer[i+1];

    A2_row_indices[i] = index_start;

    for (IndexT j = index_start; j < index_stop; ++j)
    {
      A2_col_indices[j] = j;
      A2_elements[j] = NumericT(1);
    }
  }

  // write last entry in row_buffer with global thread 0:
  if (threadIdx.x == 0 && blockIdx.x == 0)
    A2_row_indices[A2_size1] = new_row_buffer[A2_size1];
}

template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_G1(
          IndexT * G1_row_indices,
          IndexT * G1_col_indices,
          NumericT * G1_elements,
          IndexT G1_size1,
          IndexT const *A_row_indices,
          IndexT const *A_col_indices,
          NumericT const *A_elements,
          IndexT A_size1,
          IndexT A_nnz,
          IndexT max_per_row,
          IndexT *new_row_buffer)
{
  // Part 1: Copy column indices and entries:
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x)
  {
    G1_col_indices[i] = A_col_indices[i];
    G1_elements[i]    = A_elements[i];
  }

  // Part 2: Derive new row indices:
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
  {
    unsigned int old_start  = A_row_indices[i];
    unsigned int new_start  = new_row_buffer[i];
    unsigned int row_chunks = new_row_buffer[i+1] - new_start;

    for (IndexT j=0; j<row_chunks; ++j)
      G1_row_indices[new_start + j] = old_start + j * max_per_row;
  }

  // write last entry in row_buffer with global thread 0:
  if (threadIdx.x == 0 && blockIdx.x == 0)
    G1_row_indices[G1_size1] = A_row_indices[A_size1];
}

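// Host driver for the sparse-sparse product C = A * B of two compressed_matrix objects: it launches
// stage 1, picks the subwarp size, decomposes A if necessary, launches stage 2, converts the per-row
// counts of C into CSR row offsets on the host, allocates C, and finally launches stage 3.
//
// Typical call path (sketch only; user code goes through the generic prod() interface rather than
// calling this backend function directly):
//
//   viennacl::compressed_matrix<float> A, B, C;
//   // ... fill A and B ...
//   C = viennacl::linalg::prod(A, B);   // dispatches to this prod_impl() when the CUDA backend is active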
template<class NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
               viennacl::compressed_matrix<NumericT, AlignmentV> & C)
{
  C.resize(A.size1(), B.size2(), false);

  unsigned int blocknum  = 256;
  unsigned int threadnum = 128;

  viennacl::vector<unsigned int> subwarp_sizes(blocknum, viennacl::traits::context(A)); // subwarp size selected by each work group (power of two; 64 indicates that A has to be decomposed)
  viennacl::vector<unsigned int> max_nnz_row_A(blocknum, viennacl::traits::context(A)); // maximum number of nonzeros per row of A encountered in each work group
  viennacl::vector<unsigned int> max_nnz_row_B(blocknum, viennacl::traits::context(A)); // maximum number of nonzeros per row of B encountered in each work group

#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  viennacl::tools::timer timer;
#endif

  //
  // Stage 1: Determine upper bound for number of nonzeros
  //
#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  cudaDeviceSynchronize();
  timer.start();
#endif

  compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                          static_cast<unsigned int>(A.size1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                          viennacl::cuda_arg(subwarp_sizes),
                                                          viennacl::cuda_arg(max_nnz_row_A),
                                                          viennacl::cuda_arg(max_nnz_row_B)
                                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_1");
#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  cudaDeviceSynchronize();
  std::cout << "Stage 1 device: " << timer.get() << std::endl;
  timer.start();
#endif

  subwarp_sizes.switch_memory_context(viennacl::context(MAIN_MEMORY));
  unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.handle());

  max_nnz_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY));
  unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.handle());

  max_nnz_row_B.switch_memory_context(viennacl::context(MAIN_MEMORY));
  unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.handle());

  //std::cout << "Subwarp sizes: " << subwarp_sizes << std::endl;

  viennacl::vector<unsigned int> scratchpad_offsets(blocknum, viennacl::context(MAIN_MEMORY)); // start offset of the scratchpad for each work group
  unsigned int * scratchpad_offsets_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(scratchpad_offsets.handle());

  unsigned int max_subwarp_size = 0;
  unsigned int A_max_nnz_per_row = 0;
  unsigned int scratchpad_offset = 0;
  //std::cout << "Scratchpad offsets: " << std::endl;
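  // Determine the scratchpad size required by the two-level merges in stages 2 and 3: for each work
  // group, a subwarp may have to reload rows of B up to (max_nnz_row_A / subwarp_size + 1) times,
  // each pass produces a merged row of at most subwarp_size * max_nnz_row_B entries, and
  // threadnum / subwarp_size subwarps are active per group. The running sum doubles as the
  // per-group offset into the shared scratchpad.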
  for (std::size_t i=0; i<subwarp_sizes.size(); ++i)
  {
    max_subwarp_size  = std::max(max_subwarp_size,  subwarp_sizes_ptr[i]);
    A_max_nnz_per_row = std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]);

    scratchpad_offsets_ptr[i] = scratchpad_offset;
    //std::cout << scratchpad_offset << " (with " << (max_nnz_row_A_ptr[i] / subwarp_sizes_ptr[i] + 1) << " warp reloads per group at " << max_nnz_row_A_ptr[i] << " max rows, "
    //          << upper_bound_nonzeros_per_row_C_ptr[i] << " row length, "
    //          << (256 / subwarp_sizes_ptr[i]) << " warps per group " << std::endl;
    unsigned int max_warp_reloads = max_nnz_row_A_ptr[i] / subwarp_sizes_ptr[i] + 1;
    unsigned int max_row_length_after_warp_merge = subwarp_sizes_ptr[i] * max_nnz_row_B_ptr[i];
    unsigned int warps_in_group = threadnum / subwarp_sizes_ptr[i];
    scratchpad_offset +=  max_warp_reloads
                        * max_row_length_after_warp_merge
                        * warps_in_group;
  }
  //std::cout << "Scratchpad memory for indices: " << scratchpad_offset << " entries (" << scratchpad_offset * sizeof(unsigned int) * 1e-6 << " MB)" << std::endl;

  if (max_subwarp_size > 32)
  {
    // determine augmented size:
    unsigned int max_entries_in_G = 1024;
    if (A_max_nnz_per_row <= 512*512)
      max_entries_in_G = 512;
    if (A_max_nnz_per_row <= 256*256)
      max_entries_in_G = 256;
    if (A_max_nnz_per_row <= 128*128)
      max_entries_in_G = 128;
    if (A_max_nnz_per_row <= 64*64)
      max_entries_in_G = 64;

    viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
    compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                static_cast<unsigned int>(max_entries_in_G),
                                                                viennacl::cuda_arg(exclusive_scan_helper)
                                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_decompose_1");

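    // The exclusive scan converts the per-row chunk counts into row offsets for G1; its last entry
    // then equals the total number of chunks, i.e. the number of rows of G1 (the 'augmented' size).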
    thrust::exclusive_scan(thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper)),
                           thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper) + exclusive_scan_helper.size()),
                           thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper)));

    unsigned int augmented_size = exclusive_scan_helper[A.size1()];

    // split A = A2 * G1
    viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
    viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(), A.nnz(), viennacl::traits::context(A));

    // fill A2:
    compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A2.handle2()),
                                                       viennacl::cuda_arg<NumericT>(A2.handle()),
                                                       static_cast<unsigned int>(A2.size1()),
                                                       viennacl::cuda_arg(exclusive_scan_helper)
                                                      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_A2");

    // fill G1:
    compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(G1.handle2()),
                                                       viennacl::cuda_arg<NumericT>(G1.handle()),
                                                       static_cast<unsigned int>(G1.size1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                       viennacl::cuda_arg<NumericT>(A.handle()),
                                                       static_cast<unsigned int>(A.size1()),
                                                       static_cast<unsigned int>(A.nnz()),
                                                       static_cast<unsigned int>(max_entries_in_G),
                                                       viennacl::cuda_arg(exclusive_scan_helper)
                                                      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_G1");

    // compute tmp = G1 * B;
    // C = A2 * tmp;
    viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
    prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1
    prod_impl(A2, tmp, C); // this may split A2 again
    return;
  }

  subwarp_sizes.switch_memory_context(viennacl::traits::context(A));
  max_nnz_row_A.switch_memory_context(viennacl::traits::context(A));
  max_nnz_row_B.switch_memory_context(viennacl::traits::context(A));
  scratchpad_offsets.switch_memory_context(viennacl::traits::context(A));

  viennacl::vector<unsigned int> scratchpad_indices(scratchpad_offset, viennacl::traits::context(A)); // scratchpad for the column indices produced by the first merge stage

#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  std::cout << "Intermediate host stage: " << timer.get() << std::endl;
  timer.start();
#endif

  //
  // Stage 2: Determine pattern of C
  //

  compressed_matrix_gemm_stage_2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                          static_cast<unsigned int>(A.size1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                          static_cast<unsigned int>(B.size2()),
                                                          viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                          viennacl::cuda_arg(subwarp_sizes),
                                                          viennacl::cuda_arg(max_nnz_row_A),
                                                          viennacl::cuda_arg(max_nnz_row_B),
                                                          viennacl::cuda_arg(scratchpad_offsets),
                                                          viennacl::cuda_arg(scratchpad_indices)
                                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  cudaDeviceSynchronize();
  std::cout << "Stage 2: " << timer.get() << std::endl;
  timer.start();
#endif


  // exclusive scan on C.handle1(), ultimately allowing to allocate remaining memory for C
  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
  viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
  unsigned int current_offset = 0;
  for (std::size_t i=0; i<C.size1(); ++i)
  {
    unsigned int tmp = row_buffer[i];
    row_buffer.set(i, current_offset);
    current_offset += tmp;
  }
  row_buffer.set(C.size1(), current_offset);
  viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
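  // row_buffer now holds the CSR row offsets of C and current_offset equals nnz(C); for example,
  // per-row counts {2, 1, 3} become offsets {0, 2, 3, 6} with current_offset == 6. This is exactly
  // the amount of memory reserved for the column indices and values of C below.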


  //
  // Stage 3: Compute entries in C
  //
  C.reserve(current_offset, false);

  viennacl::vector<NumericT> scratchpad_values(scratchpad_offset, viennacl::traits::context(A)); // scratchpad for the values produced by the first merge stage

#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  std::cout << "Intermediate stage 2->3: " << timer.get() << std::endl;
  timer.start();
#endif

  compressed_matrix_gemm_stage_3<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                          viennacl::cuda_arg<NumericT>(A.handle()),
                                                          static_cast<unsigned int>(A.size1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                          viennacl::cuda_arg<NumericT>(B.handle()),
                                                          static_cast<unsigned int>(B.size2()),
                                                          viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                          viennacl::cuda_arg<NumericT>(C.handle()),
                                                          viennacl::cuda_arg(subwarp_sizes),
                                                          viennacl::cuda_arg(max_nnz_row_A),
                                                          viennacl::cuda_arg(max_nnz_row_B),
                                                          viennacl::cuda_arg(scratchpad_offsets),
                                                          viennacl::cuda_arg(scratchpad_indices),
                                                          viennacl::cuda_arg(scratchpad_values)
                                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
  cudaDeviceSynchronize();
  std::cout << "Stage 3: " << timer.get() << std::endl;
  std::cout << "----------" << std::endl;
#endif

}

} // namespace cuda
} // namespace linalg
} // namespace viennacl


#endif