ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
iterative_operations.hpp
1 #ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2015, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/scalar.hpp"
27 #include "viennacl/vector.hpp"
28 #include "viennacl/tools/tools.hpp"
29 #include "viennacl/linalg/cuda/common.hpp"
30 
31 namespace viennacl
32 {
33 namespace linalg
34 {
35 namespace cuda
36 {
37 
38 //
39 // CG vector update:
40 //
41 
42 // cpu scalar
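// Fused vector update for the pipelined CG method (alpha and beta are passed as host scalars):
//   result += alpha * p;   r -= alpha * Ap;   p = r + beta * p;
// Each block additionally reduces its partial sum of <r,r> and stores it in inner_prod_buffer[blockIdx.x].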
43 template<typename NumericT>
44 __global__ void pipelined_cg_vector_kernel(NumericT * result,
45  NumericT alpha,
46  NumericT * p,
47  NumericT * r,
48  NumericT const * Ap,
49  NumericT beta,
50  NumericT * inner_prod_buffer,
51  unsigned int size)
52 {
53  NumericT inner_prod_contrib = 0;
54  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
55  {
56  NumericT value_p = p[i];
57  NumericT value_r = r[i];
58 
59  result[i] += alpha * value_p;
60  value_r -= alpha * Ap[i];
61  value_p = value_r + beta * value_p;
62 
63  p[i] = value_p;
64  r[i] = value_r;
65  inner_prod_contrib += value_r * value_r;
66  }
67 
68  // parallel reduction in work group
69  __shared__ NumericT shared_array[256];
70  shared_array[threadIdx.x] = inner_prod_contrib;
71  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
72  {
73  __syncthreads();
74  if (threadIdx.x < stride)
75  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
76  }
77 
78  // write results to result array
79  if (threadIdx.x == 0)
80  inner_prod_buffer[blockIdx.x] = shared_array[0];
81 }
82 
83 
84 template<typename NumericT>
85 void pipelined_cg_vector_update(vector_base<NumericT> & result,
86  NumericT alpha,
87  vector_base<NumericT> & p,
88  vector_base<NumericT> & r,
89  vector_base<NumericT> const & Ap,
90  NumericT beta,
91  vector_base<NumericT> & inner_prod_buffer)
92 {
93  unsigned int size = result.size();
94  pipelined_cg_vector_kernel<<<128, 128>>>(viennacl::cuda_arg(result),
95  alpha,
96  viennacl::cuda_arg(p),
97  viennacl::cuda_arg(r),
98  viennacl::cuda_arg(Ap),
99  beta,
100  viennacl::cuda_arg(inner_prod_buffer),
101  size);
102  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_vector_kernel");
103 }
104 
105 
106 
107 
108 //
109 // Compressed matrix
110 //
111 
112 
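// Sparse matrix-vector product Ap = A * p for CSR matrices, one sub-warp of SubWarpSizeV threads per row.
// Per-thread partial sums are combined via an XOR-pattern exchange in shared memory; each block also
// accumulates partial sums of <Ap,Ap> and <p,Ap> into inner_prod_buffer at offsets buffer_size and 2*buffer_size.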
113 template<unsigned int SubWarpSizeV, typename NumericT>
114 __global__ void pipelined_cg_csr_vec_mul_blocked_kernel(
115  const unsigned int * row_indices,
116  const unsigned int * column_indices,
117  const NumericT * elements,
118  const NumericT * p,
119  NumericT * Ap,
120  unsigned int size,
121  NumericT * inner_prod_buffer,
122  unsigned int buffer_size)
123 {
124  __shared__ NumericT shared_elements[256];
125  NumericT inner_prod_ApAp = 0;
126  NumericT inner_prod_pAp = 0;
127 
128  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
129  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
130  const unsigned int block_start = blockIdx.x * block_increment;
131  const unsigned int block_stop = min(block_start + block_increment, size);
132 
133  for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV;
134  row < block_stop;
135  row += blockDim.x / SubWarpSizeV)
136  {
137  NumericT dot_prod = NumericT(0);
138  unsigned int row_end = row_indices[row+1];
139  for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
140  dot_prod += elements[i] * p[column_indices[i]];
141 
142  shared_elements[threadIdx.x] = dot_prod;
143  if (1 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 1];
144  if (2 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 2];
145  if (4 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 4];
146  if (8 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 8];
147  if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
148 
149  if (id_in_row == 0)
150  {
151  Ap[row] = shared_elements[threadIdx.x];
152  inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x];
153  inner_prod_pAp += p[row] * shared_elements[threadIdx.x];
154  }
155  }
156 
158  __shared__ NumericT shared_array_ApAp[256];
159  __shared__ NumericT shared_array_pAp[256];
160  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
161  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
162  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
163  {
164  __syncthreads();
165  if (threadIdx.x < stride)
166  {
167  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
168  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
169  }
170  }
171 
172  // write results to result array
173  if (threadIdx.x == 0) {
174  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
175  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
176  }
177 
178 }
179 
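// Adaptive CSR kernel: row_blocks (precomputed on the host) partitions the rows into groups whose nonzeros
// fit into the shared buffer. Groups with several rows use CSR-stream (stage the products, then one thread
// per row sums them); a group holding a single heavy row falls back to CSR-vector (the whole block reduces it).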
180 template<typename NumericT>
181 __global__ void pipelined_cg_csr_vec_mul_adaptive_kernel(
182  const unsigned int * row_indices,
183  const unsigned int * column_indices,
184  const unsigned int * row_blocks,
185  const NumericT * elements,
186  unsigned int num_blocks,
187  const NumericT * p,
188  NumericT * Ap,
189  unsigned int size,
190  NumericT * inner_prod_buffer,
191  unsigned int buffer_size)
192 {
193  NumericT inner_prod_ApAp = 0;
194  NumericT inner_prod_pAp = 0;
195 
196  __shared__ NumericT shared_elements[1024];
197 
198  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
199  {
200  unsigned int row_start = row_blocks[block_id];
201  unsigned int row_stop = row_blocks[block_id + 1];
202  unsigned int element_start = row_indices[row_start];
203  unsigned int element_stop = row_indices[row_stop];
204  unsigned int rows_to_process = row_stop - row_start;
205 
206  if (rows_to_process > 1) // CSR stream with one thread per row
207  {
208  // load to shared buffer:
209  for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
210  shared_elements[i - element_start] = elements[i] * p[column_indices[i]];
211 
212  __syncthreads();
213 
214  // use one thread per row to sum:
215  for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
216  {
217  NumericT dot_prod = 0;
218  unsigned int thread_row_start = row_indices[row] - element_start;
219  unsigned int thread_row_stop = row_indices[row + 1] - element_start;
220  for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
221  dot_prod += shared_elements[i];
222  Ap[row] = dot_prod;
223  inner_prod_ApAp += dot_prod * dot_prod;
224  inner_prod_pAp += p[row] * dot_prod;
225  }
226  }
227  // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
228  else // CSR vector for a single row
229  {
230  // load and sum to shared buffer:
231  shared_elements[threadIdx.x] = 0;
232  for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
233  shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
234 
235  // reduction to obtain final result
236  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
237  {
238  __syncthreads();
239  if (threadIdx.x < stride)
240  shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
241  }
242 
243  if (threadIdx.x == 0)
244  {
245  Ap[row_start] = shared_elements[0];
246  inner_prod_ApAp += shared_elements[0] * shared_elements[0];
247  inner_prod_pAp += p[row_start] * shared_elements[0];
248  }
249  }
250 
251  __syncthreads(); // avoid race conditions
252  }
253 
255  __shared__ NumericT shared_array_ApAp[256];
256  __shared__ NumericT shared_array_pAp[256];
257  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
258  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
259  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
260  {
261  __syncthreads();
262  if (threadIdx.x < stride)
263  {
264  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
265  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
266  }
267  }
268 
269  // write results to result array
270  if (threadIdx.x == 0) {
271  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
272  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
273  }
274 }
275 
276 
277 
278 
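// Dispatch for Ap = A * p with A in CSR format: matrices with a high enough average number of nonzeros per
// row use the blocked (sub-warp per row) kernel, all others the adaptive row-blocks kernel. The first third
// of inner_prod_buffer holds the <r,r> partial sums; the partial sums of <Ap,Ap> and <p,Ap> follow.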
279 template<typename NumericT>
280 void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
281  vector_base<NumericT> const & p,
282  vector_base<NumericT> & Ap,
283  vector_base<NumericT> & inner_prod_buffer)
284 {
285  unsigned int size = p.size();
286  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
287 
288 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
289  if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads expected to idle
290  {
291  pipelined_cg_csr_vec_mul_blocked_kernel<8, NumericT><<<256, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
292 #else
293  if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle
294  {
295  pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp)
296 #endif
297  viennacl::cuda_arg<unsigned int>(A.handle1()),
298  viennacl::cuda_arg<unsigned int>(A.handle2()),
299  viennacl::cuda_arg<NumericT>(A.handle()),
300  viennacl::cuda_arg(p),
301  viennacl::cuda_arg(Ap),
302  size,
303  viennacl::cuda_arg(inner_prod_buffer),
304  buffer_size_per_vector
305  );
306  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
307  }
308  else
309  {
310  pipelined_cg_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
311  viennacl::cuda_arg<unsigned int>(A.handle2()),
312  viennacl::cuda_arg<unsigned int>(A.handle3()),
313  viennacl::cuda_arg<NumericT>(A.handle()),
314  static_cast<unsigned int>(A.blocks1()),
315  viennacl::cuda_arg(p),
316  viennacl::cuda_arg(Ap),
317  size,
318  viennacl::cuda_arg(inner_prod_buffer),
319  buffer_size_per_vector);
320  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
321  }
322 }
323 
324 
325 //
326 // Coordinate Matrix
327 //
328 
329 
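// Ap = A * p for COO matrices via segmented reduction: each block processes the nonzeros between two
// group_boundaries entries, staging (row index, product) pairs in shared memory. A segmented scan sums the
// products belonging to the same row; a carry is passed between passes through the last shared-memory entry.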
330 template<typename NumericT>
331 __global__ void pipelined_cg_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
332  const NumericT * elements,
333  const unsigned int * group_boundaries,
334  const NumericT * p,
335  NumericT * Ap,
336  unsigned int size,
337  NumericT * inner_prod_buffer,
338  unsigned int buffer_size)
339 {
340  NumericT inner_prod_ApAp = 0;
341  NumericT inner_prod_pAp = 0;
342  __shared__ unsigned int shared_rows[128];
343  __shared__ NumericT inter_results[128];
344 
345  uint2 tmp;
346  NumericT val;
347  unsigned int group_start = group_boundaries[blockIdx.x];
348  unsigned int group_end = group_boundaries[blockIdx.x + 1];
349  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
350 
351  unsigned int local_index = 0;
352 
353  for (unsigned int k = 0; k < k_end; ++k)
354  {
355  local_index = group_start + k * blockDim.x + threadIdx.x;
356 
357  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
358  val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
359 
360  //check for carry from previous loop run:
361  if (threadIdx.x == 0 && k > 0)
362  {
363  if (tmp.x == shared_rows[blockDim.x-1])
364  val += inter_results[blockDim.x-1];
365  else
366  {
367  NumericT Ap_entry = inter_results[blockDim.x-1];
368  Ap[shared_rows[blockDim.x-1]] = Ap_entry;
369  inner_prod_ApAp += Ap_entry * Ap_entry;
370  inner_prod_pAp += Ap_entry * p[shared_rows[blockDim.x-1]];
371  }
372  }
373 
374  //segmented parallel reduction begin
375  __syncthreads();
376  shared_rows[threadIdx.x] = tmp.x;
377  inter_results[threadIdx.x] = val;
378  NumericT left = 0;
379  __syncthreads();
380 
381  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
382  {
383  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
384  __syncthreads();
385  inter_results[threadIdx.x] += left;
386  __syncthreads();
387  }
388  //segmented parallel reduction end
389 
390  if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
391  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
392  {
393  NumericT Ap_entry = inter_results[threadIdx.x];
394  Ap[tmp.x] = Ap_entry;
395  inner_prod_ApAp += Ap_entry * Ap_entry;
396  inner_prod_pAp += Ap_entry * p[tmp.x];
397  }
398 
399  __syncthreads();
400  } //for k
401 
402  if (local_index + 1 == group_end)
403  {
404  NumericT Ap_entry = inter_results[threadIdx.x];
405  Ap[tmp.x] = Ap_entry;
406  inner_prod_ApAp += Ap_entry * Ap_entry;
407  inner_prod_pAp += Ap_entry * p[tmp.x];
408  }
409 
411  __shared__ NumericT shared_array_ApAp[256];
412  __shared__ NumericT shared_array_pAp[256];
413  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
414  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
415  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
416  {
417  __syncthreads();
418  if (threadIdx.x < stride)
419  {
420  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
421  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
422  }
423  }
424 
425  // write results to result array
426  if (threadIdx.x == 0) {
427  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
428  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
429  }
430 
431 }
432 
433 
434 template<typename NumericT>
435 void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
436  vector_base<NumericT> const & p,
437  vector_base<NumericT> & Ap,
438  vector_base<NumericT> & inner_prod_buffer)
439 {
440  unsigned int size = p.size();
441  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
442 
443  Ap.clear();
444 
445  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
446  viennacl::cuda_arg<NumericT>(A.handle()),
447  viennacl::cuda_arg<unsigned int>(A.handle3()),
448  viennacl::cuda_arg(p),
449  viennacl::cuda_arg(Ap),
450  size,
451  viennacl::cuda_arg(inner_prod_buffer),
452  buffer_size_per_vector);
453  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
454 }
455 
456 
457 
458 //
459 // ELL Matrix
460 //
461 
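// Ap = A * p for ELL matrices (column-major storage with internal_row_num padding), one thread per row.
// Zero padding entries are skipped; partial sums of <Ap,Ap> and <p,Ap> are reduced per block as above.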
462 template<typename NumericT>
463 __global__ void pipelined_cg_ell_vec_mul_kernel(const unsigned int * coords,
464  const NumericT * elements,
465  unsigned int internal_row_num,
466  unsigned int items_per_row,
467  const NumericT * p,
468  NumericT * Ap,
469  unsigned int size,
470  NumericT * inner_prod_buffer,
471  unsigned int buffer_size)
472 {
473  NumericT inner_prod_ApAp = 0;
474  NumericT inner_prod_pAp = 0;
475  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
476  unsigned int glb_sz = gridDim.x * blockDim.x;
477 
478  for (unsigned int row = glb_id; row < size; row += glb_sz)
479  {
480  NumericT sum = 0;
481 
482  unsigned int offset = row;
483  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
484  {
485  NumericT val = elements[offset];
486  sum += val ? p[coords[offset]] * val : NumericT(0);
487  }
488 
489  Ap[row] = sum;
490  inner_prod_ApAp += sum * sum;
491  inner_prod_pAp += sum * p[row];
492  }
493 
495  __shared__ NumericT shared_array_ApAp[256];
496  __shared__ NumericT shared_array_pAp[256];
497  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
498  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
499  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
500  {
501  __syncthreads();
502  if (threadIdx.x < stride)
503  {
504  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
505  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
506  }
507  }
508 
509  // write results to result array
510  if (threadIdx.x == 0) {
511  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
512  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
513  }
514 }
515 
516 
517 template<typename NumericT>
518 void pipelined_cg_prod(ell_matrix<NumericT> const & A,
519  vector_base<NumericT> const & p,
520  vector_base<NumericT> & Ap,
521  vector_base<NumericT> & inner_prod_buffer)
522 {
523  unsigned int size = p.size();
524  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
525 
526  pipelined_cg_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
527  viennacl::cuda_arg<NumericT>(A.handle()),
528  static_cast<unsigned int>(A.internal_size1()),
529  static_cast<unsigned int>(A.maxnnz()),
530  viennacl::cuda_arg(p),
531  viennacl::cuda_arg(Ap),
532  size,
533  viennacl::cuda_arg(inner_prod_buffer),
534  buffer_size_per_vector);
535  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
536 }
537 
538 
539 //
540 // SELL-C-\sigma Matrix
541 //
542 
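// Ap = A * p for SELL-C-sigma matrices: threads are grouped into chunks of block_size (= C) and each chunk
// processes one slice of rows; columns_per_block gives the slice width, block_start the offset into elements.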
543 template<typename NumericT>
544 __global__ void pipelined_cg_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
545  const unsigned int * column_indices,
546  const unsigned int * block_start,
547  const NumericT * elements,
548  const NumericT * p,
549  NumericT * Ap,
550  unsigned int size,
551  unsigned int block_size,
552  NumericT * inner_prod_buffer,
553  unsigned int buffer_size)
554 {
555  NumericT inner_prod_ApAp = 0;
556  NumericT inner_prod_pAp = 0;
557 
558  unsigned int blocks_per_threadblock = blockDim.x / block_size;
559  unsigned int id_in_block = threadIdx.x % block_size;
560  unsigned int num_blocks = (size - 1) / block_size + 1;
561  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
562  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;
563 
564  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
565  {
566  unsigned int row = block_idx * block_size + id_in_block;
567  unsigned int offset = block_start[block_idx];
568  unsigned int num_columns = columns_per_block[block_idx];
569 
570  NumericT sum = 0;
571  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
572  {
573  unsigned int index = offset + item_id * block_size + id_in_block;
574  NumericT val = elements[index];
575 
576  sum += val ? (p[column_indices[index]] * val) : 0;
577  }
578 
579  if (row < size)
580  {
581  Ap[row] = sum;
582  inner_prod_ApAp += sum * sum;
583  inner_prod_pAp += sum * p[row];
584  }
585  }
586 
588  __shared__ NumericT shared_array_ApAp[256];
589  __shared__ NumericT shared_array_pAp[256];
590  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
591  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
592  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
593  {
594  __syncthreads();
595  if (threadIdx.x < stride)
596  {
597  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
598  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
599  }
600  }
601 
602  // write results to result array
603  if (threadIdx.x == 0) {
604  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
605  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
606  }
607 }
608 
609 template<typename NumericT>
610 void pipelined_cg_prod(sliced_ell_matrix<NumericT, unsigned int> const & A,
611  vector_base<NumericT> const & p,
612  vector_base<NumericT> & Ap,
613  vector_base<NumericT> & inner_prod_buffer)
614 {
615  unsigned int size = p.size();
616  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
617 
618  pipelined_cg_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
619  viennacl::cuda_arg<unsigned int>(A.handle2()),
620  viennacl::cuda_arg<unsigned int>(A.handle3()),
621  viennacl::cuda_arg<NumericT>(A.handle()),
622  viennacl::cuda_arg(p),
623  viennacl::cuda_arg(Ap),
624  size,
625  static_cast<unsigned int>(A.rows_per_block()),
626  viennacl::cuda_arg(inner_prod_buffer),
627  buffer_size_per_vector);
628  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
629 }
630 
631 
632 //
633 // Hybrid Matrix
634 //
635 
636 
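// Ap = A * p for HYB matrices: the regular part of each row is traversed in ELL fashion (one thread per row),
// the overflow entries of that row are then accumulated from the CSR part.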
637 template<typename NumericT>
638 __global__ void pipelined_cg_hyb_vec_mul_kernel(const unsigned int * ell_coords,
639  const NumericT * ell_elements,
640  const unsigned int * csr_rows,
641  const unsigned int * csr_cols,
642  const NumericT * csr_elements,
643  unsigned int internal_row_num,
644  unsigned int items_per_row,
645  const NumericT * p,
646  NumericT * Ap,
647  unsigned int size,
648  NumericT * inner_prod_buffer,
649  unsigned int buffer_size)
650 {
651  NumericT inner_prod_ApAp = 0;
652  NumericT inner_prod_pAp = 0;
653  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
654  unsigned int glb_sz = gridDim.x * blockDim.x;
655 
656  for (unsigned int row = glb_id; row < size; row += glb_sz)
657  {
658  NumericT sum = 0;
659 
660  unsigned int offset = row;
661  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
662  {
663  NumericT val = ell_elements[offset];
664 
665  sum += val ? p[ell_coords[offset]] * val : NumericT(0);
666  }
667 
668  unsigned int col_begin = csr_rows[row];
669  unsigned int col_end = csr_rows[row + 1];
670 
671  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
672  {
673  sum += p[csr_cols[item_id]] * csr_elements[item_id];
674  }
675 
676  Ap[row] = sum;
677  inner_prod_ApAp += sum * sum;
678  inner_prod_pAp += sum * p[row];
679  }
680 
682  __shared__ NumericT shared_array_ApAp[256];
683  __shared__ NumericT shared_array_pAp[256];
684  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
685  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
686  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
687  {
688  __syncthreads();
689  if (threadIdx.x < stride)
690  {
691  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
692  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
693  }
694  }
695 
696  // write results to result array
697  if (threadIdx.x == 0) {
698  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
699  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
700  }
701 }
702 
703 
704 
705 template<typename NumericT>
706 void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
707  vector_base<NumericT> const & p,
708  vector_base<NumericT> & Ap,
709  vector_base<NumericT> & inner_prod_buffer)
710 {
711  unsigned int size = p.size();
712  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
713 
714  pipelined_cg_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
715  viennacl::cuda_arg<NumericT>(A.handle()),
716  viennacl::cuda_arg<unsigned int>(A.handle3()),
717  viennacl::cuda_arg<unsigned int>(A.handle4()),
718  viennacl::cuda_arg<NumericT>(A.handle5()),
719  static_cast<unsigned int>(A.internal_size1()),
720  static_cast<unsigned int>(A.ell_nnz()),
721  viennacl::cuda_arg(p),
722  viennacl::cuda_arg(Ap),
723  size,
724  viennacl::cuda_arg(inner_prod_buffer),
725  buffer_size_per_vector);
726  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
727 }
728 
729 
730 
732 
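// Pipelined BiCGStab: each block finishes the reductions of the partial sums for <r,r0*> (first chunk of
// inner_prod_buffer) and <Ap,r0*> (chunk starting at 3*chunk_size) to form alpha, then computes
// s = r - alpha * Ap and writes its partial sum of <s,s> to inner_prod_buffer[blockIdx.x + chunk_offset].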
733 template<typename NumericT>
734 __global__ void pipelined_bicgstab_update_s_kernel(NumericT * s,
735  NumericT const * residual,
736  NumericT const * Ap,
737  unsigned int size,
738  NumericT * inner_prod_buffer,
739  unsigned int chunk_size,
740  unsigned int chunk_offset)
741 {
742  NumericT alpha = 0;
743 
744  // parallel reduction in work group to compute <r, r0> / <Ap, r0>
745  __shared__ NumericT shared_array[256];
746  __shared__ NumericT shared_array_Ap_in_r0[256];
747 
748  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x];
749  shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * chunk_size];
750  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
751  {
752  __syncthreads();
753  if (threadIdx.x < stride) {
754  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
755  shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x + stride];
756  }
757  }
758 
759  // compute alpha from reduced values:
760  __syncthreads();
761  alpha = shared_array[0] / shared_array_Ap_in_r0[0];
762 
763  // run vector update and compute first stage of <s, s>
764  NumericT inner_prod_contrib = 0;
765  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
766  {
767  NumericT value_s = s[i];
768 
769  value_s = residual[i] - alpha * Ap[i];
770  inner_prod_contrib += value_s * value_s;
771 
772  s[i] = value_s;
773  }
774  __syncthreads();
775 
776  // parallel reduction in work group
777  shared_array[threadIdx.x] = inner_prod_contrib;
778  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
779  {
780  __syncthreads();
781  if (threadIdx.x < stride)
782  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
783  }
784 
785  // write results to inner_prod_buffer
786  if (threadIdx.x == 0)
787  inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0];
788 }
789 
790 template<typename NumericT>
791 void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
792  vector_base<NumericT> & r,
793  vector_base<NumericT> const & Ap,
794  vector_base<NumericT> & inner_prod_buffer,
795  vcl_size_t buffer_chunk_size,
796  vcl_size_t buffer_chunk_offset)
797 {
798  unsigned int size = static_cast<unsigned int>(s.size());
799  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
800  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
801 
802  pipelined_bicgstab_update_s_kernel<<<256, 256>>>(viennacl::cuda_arg(s),
803  viennacl::cuda_arg(r),
804  viennacl::cuda_arg(Ap),
805  size,
806  viennacl::cuda_arg(inner_prod_buffer),
807  chunk_size,
808  chunk_offset);
809  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_update_s_kernel");
810 }
811 
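// Fused vector updates for pipelined BiCGStab:
//   result += alpha * p + omega * s;   r = s - omega * As;   p = r + beta * (p - omega * Ap);
// together with a block-wise partial reduction of <r, r0*> into inner_prod_buffer[blockIdx.x].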
812 template<typename NumericT>
813 __global__ void pipelined_bicgstab_vector_kernel(NumericT * result,
814  NumericT alpha,
815  NumericT * p,
816  NumericT omega,
817  NumericT const * s,
818  NumericT * residual,
819  NumericT const * As,
820  NumericT beta,
821  NumericT const * Ap,
822  NumericT const * r0star,
823  NumericT * inner_prod_buffer,
824  unsigned int size)
825 {
826  NumericT inner_prod_r_r0star = 0;
827  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
828  {
829  NumericT value_result = result[i];
830  NumericT value_p = p[i];
831  NumericT value_s = s[i];
832  NumericT value_residual = residual[i];
833  NumericT value_As = As[i];
834  NumericT value_Ap = Ap[i];
835  NumericT value_r0star = r0star[i];
836 
837  value_result += alpha * value_p + omega * value_s;
838  value_residual = value_s - omega * value_As;
839  value_p = value_residual + beta * (value_p - omega * value_Ap);
840 
841  result[i] = value_result;
842  residual[i] = value_residual;
843  p[i] = value_p;
844  inner_prod_r_r0star += value_residual * value_r0star;
845  }
846 
847  // parallel reduction in work group
848  __shared__ NumericT shared_array[256];
849  shared_array[threadIdx.x] = inner_prod_r_r0star;
850  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
851  {
852  __syncthreads();
853  if (threadIdx.x < stride)
854  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
855  }
856 
857  // write results to result array
858  if (threadIdx.x == 0)
859  inner_prod_buffer[blockIdx.x] = shared_array[0];
860 }
861 
862 
863 template<typename NumericT>
864 void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
865  vector_base<NumericT> & residual, vector_base<NumericT> const & As,
866  NumericT beta, vector_base<NumericT> const & Ap,
867  vector_base<NumericT> const & r0star,
868  vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
869 {
870  (void)buffer_chunk_size;
871  unsigned int size = static_cast<unsigned int>(result.size());
872 
873  pipelined_bicgstab_vector_kernel<<<256, 256>>>(viennacl::cuda_arg(result),
874  alpha,
875  viennacl::cuda_arg(p),
876  omega,
877  viennacl::cuda_arg(s),
878  viennacl::cuda_arg(residual),
879  viennacl::cuda_arg(As),
880  beta,
881  viennacl::cuda_arg(Ap),
882  viennacl::cuda_arg(r0star),
883  viennacl::cuda_arg(inner_prod_buffer),
884  size);
885  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_vector_kernel");
886 }
887 
888 
889 
890 //
891 // Compressed matrix
892 //
893 
894 
895 template<unsigned int SubWarpSizeV, typename NumericT>
896 __global__ void pipelined_bicgstab_csr_vec_mul_blocked_kernel(
897  const unsigned int * row_indices,
898  const unsigned int * column_indices,
899  const NumericT * elements,
900  const NumericT * p,
901  NumericT * Ap,
902  const NumericT * r0star,
903  unsigned int size,
904  NumericT * inner_prod_buffer,
905  unsigned int buffer_size,
906  unsigned int buffer_offset)
907 {
908  __shared__ NumericT shared_elements[256];
909  NumericT inner_prod_ApAp = 0;
910  NumericT inner_prod_pAp = 0;
911  NumericT inner_prod_r0Ap = 0;
912 
913  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
914  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
915  const unsigned int block_start = blockIdx.x * block_increment;
916  const unsigned int block_stop = min(block_start + block_increment, size);
917 
918  for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV;
919  row < block_stop;
920  row += blockDim.x / SubWarpSizeV)
921  {
922  NumericT dot_prod = NumericT(0);
923  unsigned int row_end = row_indices[row+1];
924  for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
925  dot_prod += elements[i] * p[column_indices[i]];
926 
927  shared_elements[threadIdx.x] = dot_prod;
928  if (1 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 1];
929  if (2 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 2];
930  if (4 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 4];
931  if (8 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 8];
932  if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
933 
934  if (id_in_row == 0)
935  {
936  Ap[row] = shared_elements[threadIdx.x];
937  inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x];
938  inner_prod_pAp += p[row] * shared_elements[threadIdx.x];
939  inner_prod_r0Ap += r0star[row] * shared_elements[threadIdx.x];
940  }
941  }
942 
944  __shared__ NumericT shared_array_ApAp[256];
945  __shared__ NumericT shared_array_pAp[256];
946  __shared__ NumericT shared_array_r0Ap[256];
947  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
948  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
949  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
950  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
951  {
952  __syncthreads();
953  if (threadIdx.x < stride)
954  {
955  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
956  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
957  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
958  }
959  }
960 
961  // write results to result array
962  if (threadIdx.x == 0) {
963  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
964  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
965  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
966  }
967 
968 }
969 
970 
971 template<typename NumericT>
972 __global__ void pipelined_bicgstab_csr_vec_mul_adaptive_kernel(
973  const unsigned int * row_indices,
974  const unsigned int * column_indices,
975  const unsigned int * row_blocks,
976  const NumericT * elements,
977  unsigned int num_blocks,
978  const NumericT * p,
979  NumericT * Ap,
980  const NumericT * r0star,
981  unsigned int size,
982  NumericT * inner_prod_buffer,
983  unsigned int buffer_size,
984  unsigned int buffer_offset)
985 {
986  NumericT inner_prod_ApAp = 0;
987  NumericT inner_prod_pAp = 0;
988  NumericT inner_prod_r0Ap = 0;
989 
990  __shared__ NumericT shared_elements[1024];
991 
992  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
993  {
994  unsigned int row_start = row_blocks[block_id];
995  unsigned int row_stop = row_blocks[block_id + 1];
996  unsigned int element_start = row_indices[row_start];
997  unsigned int element_stop = row_indices[row_stop];
998  unsigned int rows_to_process = row_stop - row_start;
999 
1000  if (rows_to_process > 1) // CSR stream with one thread per row
1001  {
1002  // load to shared buffer:
1003  for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
1004  shared_elements[i - element_start] = elements[i] * p[column_indices[i]];
1005 
1006  __syncthreads();
1007 
1008  // use one thread per row to sum:
1009  for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
1010  {
1011  NumericT dot_prod = 0;
1012  unsigned int thread_row_start = row_indices[row] - element_start;
1013  unsigned int thread_row_stop = row_indices[row + 1] - element_start;
1014  for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
1015  dot_prod += shared_elements[i];
1016  Ap[row] = dot_prod;
1017  inner_prod_ApAp += dot_prod * dot_prod;
1018  inner_prod_pAp += p[row] * dot_prod;
1019  inner_prod_r0Ap += r0star[row] * dot_prod;
1020  }
1021  }
1022  // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
1023  else // CSR vector for a single row
1024  {
1025  // load and sum to shared buffer:
1026  shared_elements[threadIdx.x] = 0;
1027  for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
1028  shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
1029 
1030  // reduction to obtain final result
1031  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1032  {
1033  __syncthreads();
1034  if (threadIdx.x < stride)
1035  shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
1036  }
1037 
1038  if (threadIdx.x == 0)
1039  {
1040  Ap[row_start] = shared_elements[0];
1041  inner_prod_ApAp += shared_elements[0] * shared_elements[0];
1042  inner_prod_pAp += p[row_start] * shared_elements[0];
1043  inner_prod_r0Ap += r0star[row_start] * shared_elements[0];
1044  }
1045  }
1046 
1047  __syncthreads(); // avoid race conditions
1048  }
1049 
1051  __shared__ NumericT shared_array_ApAp[256];
1052  __shared__ NumericT shared_array_pAp[256];
1053  __shared__ NumericT shared_array_r0Ap[256];
1054  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1055  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1056  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1057  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1058  {
1059  __syncthreads();
1060  if (threadIdx.x < stride)
1061  {
1062  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1063  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1064  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1065  }
1066  }
1067 
1068  // write results to result array
1069  if (threadIdx.x == 0) {
1070  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1071  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1072  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1073  }
1074 }
1075 
1076 
1077 
1078 
1079 template<typename NumericT>
1080 void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
1081  vector_base<NumericT> const & p,
1082  vector_base<NumericT> & Ap,
1083  vector_base<NumericT> const & r0star,
1084  vector_base<NumericT> & inner_prod_buffer,
1085  vcl_size_t buffer_chunk_size,
1086  vcl_size_t buffer_chunk_offset)
1087 {
1088  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1089  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1090  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1091 
1092 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
1093  if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads expected to idle
1094  {
1095  pipelined_bicgstab_csr_vec_mul_blocked_kernel<8, NumericT><<<256, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
1096 #else
1097  if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle
1098  {
1099  pipelined_bicgstab_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp)
1100 #endif
1101  viennacl::cuda_arg<unsigned int>(A.handle1()),
1102  viennacl::cuda_arg<unsigned int>(A.handle2()),
1103  viennacl::cuda_arg<NumericT>(A.handle()),
1104  viennacl::cuda_arg(p),
1105  viennacl::cuda_arg(Ap),
1106  viennacl::cuda_arg(r0star),
1107  vec_size,
1108  viennacl::cuda_arg(inner_prod_buffer),
1109  chunk_size,
1110  chunk_offset
1111  );
1112  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
1113  }
1114  else
1115  {
1116  pipelined_bicgstab_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
1117  viennacl::cuda_arg<unsigned int>(A.handle2()),
1118  viennacl::cuda_arg<unsigned int>(A.handle3()),
1119  viennacl::cuda_arg<NumericT>(A.handle()),
1120  static_cast<unsigned int>(A.blocks1()),
1121  viennacl::cuda_arg(p),
1122  viennacl::cuda_arg(Ap),
1123  viennacl::cuda_arg(r0star),
1124  vec_size,
1125  viennacl::cuda_arg(inner_prod_buffer),
1126  chunk_size,
1127  chunk_offset);
1128  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_csr_vec_mul_adaptive_kernel");
1129  }
1130 }
1131 
1132 
1133 //
1134 // Coordinate Matrix
1135 //
1136 
1137 
1138 template<typename NumericT>
1139 __global__ void pipelined_bicgstab_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
1140  const NumericT * elements,
1141  const unsigned int * group_boundaries,
1142  const NumericT * p,
1143  NumericT * Ap,
1144  const NumericT * r0star,
1145  unsigned int size,
1146  NumericT * inner_prod_buffer,
1147  unsigned int buffer_size,
1148  unsigned int buffer_offset)
1149 {
1150  NumericT inner_prod_ApAp = 0;
1151  NumericT inner_prod_pAp = 0;
1152  NumericT inner_prod_r0Ap = 0;
1153  __shared__ unsigned int shared_rows[128];
1154  __shared__ NumericT inter_results[128];
1155 
1156  uint2 tmp;
1157  NumericT val;
1158  unsigned int group_start = group_boundaries[blockIdx.x];
1159  unsigned int group_end = group_boundaries[blockIdx.x + 1];
1160  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
1161 
1162  unsigned int local_index = 0;
1163 
1164  for (unsigned int k = 0; k < k_end; ++k)
1165  {
1166  local_index = group_start + k * blockDim.x + threadIdx.x;
1167 
1168  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
1169  val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
1170 
1171  //check for carry from previous loop run:
1172  if (threadIdx.x == 0 && k > 0)
1173  {
1174  if (tmp.x == shared_rows[blockDim.x-1])
1175  val += inter_results[blockDim.x-1];
1176  else
1177  {
1178  NumericT Ap_entry = inter_results[blockDim.x-1];
1179  Ap[shared_rows[blockDim.x-1]] = Ap_entry;
1180  inner_prod_ApAp += Ap_entry * Ap_entry;
1181  inner_prod_pAp += Ap_entry * p[shared_rows[blockDim.x-1]];
1182  inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry;
1183  }
1184  }
1185 
1186  //segmented parallel reduction begin
1187  __syncthreads();
1188  shared_rows[threadIdx.x] = tmp.x;
1189  inter_results[threadIdx.x] = val;
1190  NumericT left = 0;
1191  __syncthreads();
1192 
1193  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
1194  {
1195  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
1196  __syncthreads();
1197  inter_results[threadIdx.x] += left;
1198  __syncthreads();
1199  }
1200  //segmented parallel reduction end
1201 
1202  if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1203  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1204  {
1205  NumericT Ap_entry = inter_results[threadIdx.x];
1206  Ap[tmp.x] = Ap_entry;
1207  inner_prod_ApAp += Ap_entry * Ap_entry;
1208  inner_prod_pAp += Ap_entry * p[tmp.x];
1209  inner_prod_r0Ap += r0star[tmp.x] * Ap_entry;
1210  }
1211 
1212  __syncthreads();
1213  } //for k
1214 
1215  if (local_index + 1 == group_end)
1216  {
1217  NumericT Ap_entry = inter_results[threadIdx.x];
1218  Ap[tmp.x] = Ap_entry;
1219  inner_prod_ApAp += Ap_entry * Ap_entry;
1220  inner_prod_pAp += Ap_entry * p[tmp.x];
1221  inner_prod_r0Ap += Ap_entry * r0star[tmp.x];
1222  }
1223 
1225  __shared__ NumericT shared_array_ApAp[256];
1226  __shared__ NumericT shared_array_pAp[256];
1227  __shared__ NumericT shared_array_r0Ap[256];
1228  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1229  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1230  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1231  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1232  {
1233  __syncthreads();
1234  if (threadIdx.x < stride)
1235  {
1236  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1237  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1238  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1239  }
1240  }
1241 
1242  // write results to result array
1243  if (threadIdx.x == 0) {
1244  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1245  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1246  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1247  }
1248 
1249 }
1250 
1251 
1252 template<typename NumericT>
1253 void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
1254  vector_base<NumericT> const & p,
1255  vector_base<NumericT> & Ap,
1256  vector_base<NumericT> const & r0star,
1257  vector_base<NumericT> & inner_prod_buffer,
1258  vcl_size_t buffer_chunk_size,
1259  vcl_size_t buffer_chunk_offset)
1260 {
1261  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1262  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1263  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1264 
1265  Ap.clear();
1266 
1267  pipelined_bicgstab_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
1268  viennacl::cuda_arg<NumericT>(A.handle()),
1269  viennacl::cuda_arg<unsigned int>(A.handle3()),
1270  viennacl::cuda_arg(p),
1271  viennacl::cuda_arg(Ap),
1272  viennacl::cuda_arg(r0star),
1273  vec_size,
1274  viennacl::cuda_arg(inner_prod_buffer),
1275  chunk_size,
1276  chunk_offset);
1277  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_coo_vec_mul_kernel");
1278 }
1279 
1280 
1281 
1282 //
1283 // ELL Matrix
1284 //
1285 
1286 template<typename NumericT>
1287 __global__ void pipelined_bicgstab_ell_vec_mul_kernel(const unsigned int * coords,
1288  const NumericT * elements,
1289  unsigned int internal_row_num,
1290  unsigned int items_per_row,
1291  const NumericT * p,
1292  NumericT * Ap,
1293  const NumericT * r0star,
1294  unsigned int size,
1295  NumericT * inner_prod_buffer,
1296  unsigned int buffer_size,
1297  unsigned int buffer_offset)
1298 {
1299  NumericT inner_prod_ApAp = 0;
1300  NumericT inner_prod_pAp = 0;
1301  NumericT inner_prod_r0Ap = 0;
1302  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1303  unsigned int glb_sz = gridDim.x * blockDim.x;
1304 
1305  for (unsigned int row = glb_id; row < size; row += glb_sz)
1306  {
1307  NumericT sum = 0;
1308 
1309  unsigned int offset = row;
1310  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1311  {
1312  NumericT val = elements[offset];
1313  sum += val ? p[coords[offset]] * val : NumericT(0);
1314  }
1315 
1316  Ap[row] = sum;
1317  inner_prod_ApAp += sum * sum;
1318  inner_prod_pAp += sum * p[row];
1319  inner_prod_r0Ap += sum * r0star[row];
1320  }
1321 
1323  __shared__ NumericT shared_array_ApAp[256];
1324  __shared__ NumericT shared_array_pAp[256];
1325  __shared__ NumericT shared_array_r0Ap[256];
1326  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1327  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1328  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1329  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1330  {
1331  __syncthreads();
1332  if (threadIdx.x < stride)
1333  {
1334  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1335  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1336  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1337  }
1338  }
1339 
1340  // write results to result array
1341  if (threadIdx.x == 0) {
1342  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1343  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1344  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1345  }
1346 }
1347 
1348 
1349 template<typename NumericT>
1350 void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
1351  vector_base<NumericT> const & p,
1352  vector_base<NumericT> & Ap,
1353  vector_base<NumericT> const & r0star,
1354  vector_base<NumericT> & inner_prod_buffer,
1355  vcl_size_t buffer_chunk_size,
1356  vcl_size_t buffer_chunk_offset)
1357 {
1358  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1359  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1360  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1361 
1362  pipelined_bicgstab_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
1363  viennacl::cuda_arg<NumericT>(A.handle()),
1364  static_cast<unsigned int>(A.internal_size1()),
1365  static_cast<unsigned int>(A.maxnnz()),
1366  viennacl::cuda_arg(p),
1367  viennacl::cuda_arg(Ap),
1368  viennacl::cuda_arg(r0star),
1369  vec_size,
1370  viennacl::cuda_arg(inner_prod_buffer),
1371  chunk_size,
1372  chunk_offset);
1373  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_ell_vec_mul_kernel");
1374 }
1375 
1376 
1377 //
1378 // SELL-C-\sigma Matrix
1379 //
1380 
1381 template<typename NumericT>
1382 __global__ void pipelined_bicgstab_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
1383  const unsigned int * column_indices,
1384  const unsigned int * block_start,
1385  const NumericT * elements,
1386  const NumericT * p,
1387  NumericT * Ap,
1388  const NumericT * r0star,
1389  unsigned int size,
1390  unsigned int block_size,
1391  NumericT * inner_prod_buffer,
1392  unsigned int buffer_size,
1393  unsigned int buffer_offset)
1394 {
1395  NumericT inner_prod_ApAp = 0;
1396  NumericT inner_prod_pAp = 0;
1397  NumericT inner_prod_r0Ap = 0;
1398 
1399  unsigned int blocks_per_threadblock = blockDim.x / block_size;
1400  unsigned int id_in_block = threadIdx.x % block_size;
1401  unsigned int num_blocks = (size - 1) / block_size + 1;
1402  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
1403  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;
1404 
1405  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
1406  {
1407  unsigned int row = block_idx * block_size + id_in_block;
1408  unsigned int offset = block_start[block_idx];
1409  unsigned int num_columns = columns_per_block[block_idx];
1410 
1411  NumericT sum = 0;
1412  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
1413  {
1414  unsigned int index = offset + item_id * block_size + id_in_block;
1415  NumericT val = elements[index];
1416 
1417  sum += val ? (p[column_indices[index]] * val) : 0;
1418  }
1419 
1420  if (row < size)
1421  {
1422  Ap[row] = sum;
1423  inner_prod_ApAp += sum * sum;
1424  inner_prod_pAp += sum * p[row];
1425  inner_prod_r0Ap += sum * r0star[row];
1426  }
1427  }
1428 
1430  __shared__ NumericT shared_array_ApAp[256];
1431  __shared__ NumericT shared_array_pAp[256];
1432  __shared__ NumericT shared_array_r0Ap[256];
1433  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1434  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1435  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1436  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1437  {
1438  __syncthreads();
1439  if (threadIdx.x < stride)
1440  {
1441  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1442  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1443  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1444  }
1445  }
1446 
1447  // write results to result array
1448  if (threadIdx.x == 0) {
1449  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1450  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1451  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1452  }
1453 }
1454 
1455 template<typename NumericT>
1456 void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT, unsigned int> const & A,
1457  vector_base<NumericT> const & p,
1458  vector_base<NumericT> & Ap,
1459  vector_base<NumericT> const & r0star,
1460  vector_base<NumericT> & inner_prod_buffer,
1461  vcl_size_t buffer_chunk_size,
1462  vcl_size_t buffer_chunk_offset)
1463 {
1464  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1465  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1466  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1467 
1468  pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
1469  viennacl::cuda_arg<unsigned int>(A.handle2()),
1470  viennacl::cuda_arg<unsigned int>(A.handle3()),
1471  viennacl::cuda_arg<NumericT>(A.handle()),
1472  viennacl::cuda_arg(p),
1473  viennacl::cuda_arg(Ap),
1474  viennacl::cuda_arg(r0star),
1475  vec_size,
1476  static_cast<unsigned int>(A.rows_per_block()),
1477  viennacl::cuda_arg(inner_prod_buffer),
1478  chunk_size,
1479  chunk_offset);
1480  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_sliced_ell_vec_mul_kernel");
1481 }
1482 
1483 
1484 //
1485 // Hybrid Matrix
1486 //
1487 
1488 
1489 template<typename NumericT>
1490 __global__ void pipelined_bicgstab_hyb_vec_mul_kernel(const unsigned int * ell_coords,
1491  const NumericT * ell_elements,
1492  const unsigned int * csr_rows,
1493  const unsigned int * csr_cols,
1494  const NumericT * csr_elements,
1495  unsigned int internal_row_num,
1496  unsigned int items_per_row,
1497  const NumericT * p,
1498  NumericT * Ap,
1499  const NumericT * r0star,
1500  unsigned int size,
1501  NumericT * inner_prod_buffer,
1502  unsigned int buffer_size,
1503  unsigned int buffer_offset)
1504 {
1505  NumericT inner_prod_ApAp = 0;
1506  NumericT inner_prod_pAp = 0;
1507  NumericT inner_prod_r0Ap = 0;
1508  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1509  unsigned int glb_sz = gridDim.x * blockDim.x;
1510 
1511  for (unsigned int row = glb_id; row < size; row += glb_sz)
1512  {
1513  NumericT sum = 0;
1514 
1515  unsigned int offset = row;
1516  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1517  {
1518  NumericT val = ell_elements[offset];
1519 
1520  sum += val ? p[ell_coords[offset]] * val : NumericT(0);
1521  }
1522 
1523  unsigned int col_begin = csr_rows[row];
1524  unsigned int col_end = csr_rows[row + 1];
1525 
1526  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
1527  {
1528  sum += p[csr_cols[item_id]] * csr_elements[item_id];
1529  }
1530 
1531  Ap[row] = sum;
1532  inner_prod_ApAp += sum * sum;
1533  inner_prod_pAp += sum * p[row];
1534  inner_prod_r0Ap += sum * r0star[row];
1535  }
1536 
1538  __shared__ NumericT shared_array_ApAp[256];
1539  __shared__ NumericT shared_array_pAp[256];
1540  __shared__ NumericT shared_array_r0Ap[256];
1541  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1542  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1543  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1544  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1545  {
1546  __syncthreads();
1547  if (threadIdx.x < stride)
1548  {
1549  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1550  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1551  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1552  }
1553  }
1554 
1555  // write results to result array
1556  if (threadIdx.x == 0) {
1557  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1558  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1559  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1560  }
1561 }
1562 
1563 
1564 
1565 template<typename NumericT>
1566 void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
1567  vector_base<NumericT> const & p,
1568  vector_base<NumericT> & Ap,
1569  vector_base<NumericT> const & r0star,
1570  vector_base<NumericT> & inner_prod_buffer,
1571  vcl_size_t buffer_chunk_size,
1572  vcl_size_t buffer_chunk_offset)
1573 {
1574  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1575  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1576  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1577 
1578  pipelined_bicgstab_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
1579  viennacl::cuda_arg<NumericT>(A.handle()),
1580  viennacl::cuda_arg<unsigned int>(A.handle3()),
1581  viennacl::cuda_arg<unsigned int>(A.handle4()),
1582  viennacl::cuda_arg<NumericT>(A.handle5()),
1583  static_cast<unsigned int>(A.internal_size1()),
1584  static_cast<unsigned int>(A.ell_nnz()),
1585  viennacl::cuda_arg(p),
1586  viennacl::cuda_arg(Ap),
1587  viennacl::cuda_arg(r0star),
1588  vec_size,
1589  viennacl::cuda_arg(inner_prod_buffer),
1590  chunk_size,
1591  chunk_offset);
1592  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_hyb_vec_mul_kernel");
1593 }
1594 
1596 
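// Pipelined GMRES: each block finishes the reduction of <v_k,v_k> (second chunk of inner_prod_buffer),
// normalizes v_k by its norm, accumulates first-stage partial sums of <r, v_k> into r_dot_vk_buffer,
// and the first thread overall stores ||v_k|| in R_buffer[R_offset].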
1597 template <typename T>
1598 __global__ void pipelined_gmres_normalize_vk_kernel(T * vk,
1599  unsigned int vk_offset,
1600  T const * residual,
1601  T * R_buffer,
1602  unsigned int R_offset,
1603  T const * inner_prod_buffer,
1604  unsigned int chunk_size,
1605  T * r_dot_vk_buffer,
1606  unsigned int chunk_offset,
1607  unsigned int size)
1608 {
1609  __shared__ T shared_array[128];
1610  T norm_vk = 0;
1611 
1612  // parallel reduction in work group to compute <vk, vk>
1613  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size];
1614  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1615  {
1616  __syncthreads();
1617  if (threadIdx.x < stride)
1618  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
1619  }
1620 
1621  // compute alpha from reduced values:
1622  __syncthreads();
1623  norm_vk = sqrt(shared_array[0]);
1624 
1625  T inner_prod_contrib = 0;
1626  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
1627  T value_vk = vk[i + vk_offset] / norm_vk;
1628 
1629  inner_prod_contrib += residual[i] * value_vk;
1630 
1631  vk[i + vk_offset] = value_vk;
1632  }
1633  __syncthreads();
1634 
1635  // parallel reduction in work group
1636  shared_array[threadIdx.x] = inner_prod_contrib;
1637  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1638  {
1639  __syncthreads();
1640  if (threadIdx.x < stride)
1641  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
1642  }
1643 
1644  // write results of first reduction stage:
1645  if (threadIdx.x == 0)
1646  r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0];
1647  // store norm:
1648  if (blockDim.x * blockIdx.x + threadIdx.x == 0)
1649  R_buffer[R_offset] = norm_vk;
1650 }
1651 
1659 template <typename T>
1660 void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
1661  vector_base<T> const & residual,
1662  vector_base<T> & R_buffer,
1663  vcl_size_t offset_in_R,
1664  vector_base<T> const & inner_prod_buffer,
1665  vector_base<T> & r_dot_vk_buffer,
1666  vcl_size_t buffer_chunk_size,
1667  vcl_size_t buffer_chunk_offset)
1668 {
1669  unsigned int vk_offset = viennacl::traits::start(v_k);
1670  unsigned int R_offset = offset_in_R;
1671  unsigned int chunk_size = buffer_chunk_size;
1672  unsigned int chunk_offset = buffer_chunk_offset;
1673  unsigned int size = v_k.size();
1674 
1675  pipelined_gmres_normalize_vk_kernel<<<128, 128>>>(viennacl::cuda_arg(v_k),
1676  vk_offset,
1677  viennacl::cuda_arg(residual),
1678  viennacl::cuda_arg(R_buffer),
1679  R_offset,
1680  viennacl::cuda_arg(inner_prod_buffer),
1681  chunk_size,
1682  viennacl::cuda_arg(r_dot_vk_buffer),
1683  chunk_offset,
1684  size);
1685  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_normalize_vk_kernel");
1686 }
1687 
1688 
1689 
1690 template <typename T>
1691 __global__ void pipelined_gmres_gram_schmidt_stage1_kernel(T const * krylov_basis,
1692  unsigned int size,
1693  unsigned int internal_size,
1694  unsigned int k,
1695  T * vi_in_vk_buffer,
1696  unsigned int chunk_size)
1697 {
1698  __shared__ T shared_array[7*128];
1699  T value_vk = 0;
1700 
1701  unsigned int k_base = 0;
1702  while (k_base < k)
1703  {
1704  unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
1705 
1706  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1707  shared_array[threadIdx.x + j*chunk_size] = 0;
1708 
1709  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
1710  {
1711  value_vk = krylov_basis[i + k * internal_size];
1712 
1713  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1714  shared_array[threadIdx.x + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size];
1715  }
1716 
1717  // parallel reduction in work group
1718  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1719  {
1720  __syncthreads();
1721  if (threadIdx.x < stride) {
1722  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1723  shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
1724  }
1725  }
1726 
1727  // write results to result array
1728  if (threadIdx.x == 0)
1729  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1730  vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = shared_array[j*chunk_size];
1731 
1732  k_base += vecs_in_iteration;
1733  }
1734 
1735 }
1736 
1737 template <typename T>
1738 void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
1739  vcl_size_t v_k_size,
1740  vcl_size_t v_k_internal_size,
1741  vcl_size_t param_k,
1742  vector_base<T> & vi_in_vk_buffer,
1743  vcl_size_t buffer_chunk_size)
1744 {
1745  unsigned int chunk_size = buffer_chunk_size;
1746  unsigned int size = v_k_size;
1747  unsigned int internal_size = v_k_internal_size;
1748  unsigned int k = param_k;
1749 
1750  pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis),
1751  size,
1752  internal_size,
1753  k,
1754  viennacl::cuda_arg(vi_in_vk_buffer),
1755  chunk_size);
1756  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage1_kernel");
1757 }
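Stage 1 stores one partial sum per thread block and per previously orthogonalized basis vector into vi_in_vk_buffer, at index blockIdx.x + i * chunk_size. A hypothetical sizing helper (not part of the ViennaCL API) that a caller could use when allocating this scratch buffer for a full restart cycle of dimension krylov_dim:

#include <cstddef>

// chunk_size partial sums are needed for each of up to krylov_dim basis vectors.
inline std::size_t vi_in_vk_buffer_length(std::size_t chunk_size, std::size_t krylov_dim)
{
  return chunk_size * krylov_dim;
}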
1758 
1759 
1760 
1761 
1762 template <typename T>
1763 __global__ void pipelined_gmres_gram_schmidt_stage2_kernel(T * krylov_basis,
1764  unsigned int size,
1765  unsigned int internal_size,
1766  unsigned int k,
1767  T const * vi_in_vk_buffer,
1768  unsigned int chunk_size,
1769  T * R_buffer,
1770  unsigned int krylov_dim,
1771  T * inner_prod_buffer)
1772 {
1773  __shared__ T shared_array[7*128];
1774  T vk_dot_vk = 0;
1775  T value_vk = 0;
1776 
1777  unsigned int k_base = 0;
1778  while (k_base < k)
1779  {
1780  unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
1781 
1782  // parallel reduction in work group for <v_i, v_k>
1783  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1784  shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + (k_base + j) * chunk_size];
1785  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1786  {
1787  __syncthreads();
1788  if (threadIdx.x < stride) {
1789  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1790  shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
1791  }
1792  }
1793  __syncthreads();
1794 
1795  // v_k -= <v_i, v_k> v_i:
1796  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
1797  {
1798  value_vk = krylov_basis[i + k * internal_size];
1799 
1800  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1801  value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size];
1802  vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;
1803  krylov_basis[i + k * internal_size] = value_vk;
1804  }
1805 
1806  // write to R: (to avoid thread divergence, all threads write the same value)
1807  if (blockIdx.x == 0)
1808  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1809  R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size];
1810  __syncthreads();
1811 
1812  k_base += vecs_in_iteration;
1813  }
1814 
1815  // parallel reduction in work group for <v_k, v_k>
1816  shared_array[threadIdx.x] = vk_dot_vk;
1817  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1818  {
1819  __syncthreads();
1820  if (threadIdx.x < stride)
1821  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
1822  }
1823 
1824  // write results to result array
1825  if (threadIdx.x == 0)
1826  inner_prod_buffer[chunk_size+blockIdx.x] = shared_array[0];
1827 }
1828 
1829 template <typename T>
1830 void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
1831  vcl_size_t v_k_size,
1832  vcl_size_t v_k_internal_size,
1833  vcl_size_t param_k,
1834  vector_base<T> const & vi_in_vk_buffer,
1835  vector_base<T> & R_buffer,
1836  vcl_size_t krylov_dim,
1837  vector_base<T> & inner_prod_buffer,
1838  vcl_size_t buffer_chunk_size)
1839 {
1840  unsigned int chunk_size = buffer_chunk_size;
1841  unsigned int size = v_k_size;
1842  unsigned int internal_size = v_k_internal_size;
1843  unsigned int k = param_k;
1844  unsigned int krylov = krylov_dim;
1845 
1846  pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis),
1847  size,
1848  internal_size,
1849  k,
1850  viennacl::cuda_arg(vi_in_vk_buffer),
1851  chunk_size,
1852  viennacl::cuda_arg(R_buffer),
1853  krylov,
1854  viennacl::cuda_arg(inner_prod_buffer));
1855  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage2_kernel");
1856 }
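Taken together, stages 1 and 2 perform one step of classical Gram-Schmidt on column k of the Krylov basis. The following CPU reference (illustrative only, not ViennaCL API; names are hypothetical) spells out the same computation: gather all <v_i, v_k>, subtract the projections, record the coefficients into column k of R, and return <v_k, v_k> for the subsequent normalization.

#include <cstddef>
#include <vector>

template<typename T>
T classical_gram_schmidt_step(std::vector<T> & krylov_basis,  // column-major, leading dimension internal_size
                              std::size_t size, std::size_t internal_size,
                              std::size_t k,                  // column to orthogonalize against columns 0..k-1
                              std::vector<T> & R,             // krylov_dim x krylov_dim, column-major
                              std::size_t krylov_dim)
{
  std::vector<T> vi_dot_vk(k, T(0));

  // stage 1: inner products of v_k with all previous basis vectors
  for (std::size_t i = 0; i < size; ++i)
    for (std::size_t j = 0; j < k; ++j)
      vi_dot_vk[j] += krylov_basis[i + k * internal_size] * krylov_basis[i + j * internal_size];

  // stage 2: v_k -= <v_i, v_k> v_i, accumulate <v_k, v_k>
  T vk_dot_vk = 0;
  for (std::size_t i = 0; i < size; ++i)
  {
    T value_vk = krylov_basis[i + k * internal_size];
    for (std::size_t j = 0; j < k; ++j)
      value_vk -= vi_dot_vk[j] * krylov_basis[i + j * internal_size];
    vk_dot_vk += value_vk * value_vk;
    krylov_basis[i + k * internal_size] = value_vk;
  }

  // record the Gram-Schmidt coefficients in column k of R
  for (std::size_t j = 0; j < k; ++j)
    R[j + k * krylov_dim] = vi_dot_vk[j];

  return vk_dot_vk;
}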
1857 
1858 
1859 
1860 
1861 template <typename T>
1862 __global__ void pipelined_gmres_update_result_kernel(T * result,
1863  T const * residual,
1864  T const * krylov_basis,
1865  unsigned int size,
1866  unsigned int internal_size,
1867  T const * coefficients,
1868  unsigned int k)
1869 {
1870  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
1871  {
1872  T value_result = result[i] + coefficients[0] * residual[i];
1873 
1874  for (unsigned int j = 1; j < k; ++j)
1875  value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size];
1876 
1877  result[i] = value_result;
1878  }
1879 }
1880 
1881 template <typename T>
1882 void pipelined_gmres_update_result(vector_base<T> & result,
1883  vector_base<T> const & residual,
1884  vector_base<T> const & krylov_basis,
1885  vcl_size_t v_k_size,
1886  vcl_size_t v_k_internal_size,
1887  vector_base<T> const & coefficients,
1888  vcl_size_t param_k)
1889 {
1890  unsigned int size = v_k_size;
1891  unsigned int internal_size = v_k_internal_size;
1892  unsigned int k = param_k;
1893 
1894  pipelined_gmres_update_result_kernel<<<128, 128>>>(viennacl::cuda_arg(result),
1895  viennacl::cuda_arg(residual),
1896  viennacl::cuda_arg(krylov_basis),
1897  size,
1898  internal_size,
1899  viennacl::cuda_arg(coefficients),
1900  k);
1901  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_update_result_kernel");
1902 }
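The update kernel applies the correction x += eta[0] * r + sum_{j=1..k-1} eta[j] * v_{j-1}, where the Krylov vectors are stored column-wise with leading dimension internal_size. An equivalent CPU formulation, illustrative only (names are hypothetical):

#include <cstddef>
#include <vector>

template<typename T>
void gmres_update_result_reference(std::vector<T>       & result,
                                   std::vector<T> const & residual,
                                   std::vector<T> const & krylov_basis, // column-major, leading dimension internal_size
                                   std::size_t size, std::size_t internal_size,
                                   std::vector<T> const & coefficients, // eta[0..k-1]
                                   std::size_t k)
{
  for (std::size_t i = 0; i < size; ++i)
  {
    T value = result[i] + coefficients[0] * residual[i];
    for (std::size_t j = 1; j < k; ++j)
      value += coefficients[j] * krylov_basis[i + (j - 1) * internal_size];
    result[i] = value;
  }
}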
1903 
1904 
1905 
1906 template <typename NumericT>
1907 void pipelined_gmres_prod(compressed_matrix<NumericT> const & A,
1908  vector_base<NumericT> const & p,
1909  vector_base<NumericT> & Ap,
1910  vector_base<NumericT> & inner_prod_buffer)
1911 {
1912  unsigned int size = p.size();
1913  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1914 
1915 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
1916  if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 20% of threads expected to idle
1917  {
1918  pipelined_cg_csr_vec_mul_blocked_kernel<8, NumericT><<<256, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
1919 #else
1920  if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle
1921  {
1922  pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<128, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp)
1923 #endif
1924  viennacl::cuda_arg<unsigned int>(A.handle1()),
1925  viennacl::cuda_arg<unsigned int>(A.handle2()),
1926  viennacl::cuda_arg<NumericT>(A.handle()),
1927  viennacl::cuda_arg(p) + viennacl::traits::start(p),
1928  viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
1929  size,
1930  viennacl::cuda_arg(inner_prod_buffer),
1931  buffer_size_per_vector
1932  );
1933  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
1934  }
1935  else
1936  {
1937  pipelined_cg_csr_vec_mul_adaptive_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
1938  viennacl::cuda_arg<unsigned int>(A.handle2()),
1939  viennacl::cuda_arg<unsigned int>(A.handle3()),
1940  viennacl::cuda_arg<NumericT>(A.handle()),
1941  static_cast<unsigned int>(A.blocks1()),
1942  viennacl::cuda_arg(p) + viennacl::traits::start(p),
1943  viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
1944  size,
1945  viennacl::cuda_arg(inner_prod_buffer),
1946  buffer_size_per_vector);
1947  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_adaptive_kernel");
1948  }
1949 
1950 }
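The wrapper above picks a CSR kernel based on the average number of nonzeros per row: sufficiently dense rows go to the blocked kernel with a fixed number of threads per row, the rest to the adaptive kernel driven by the A.blocks1() row blocks. Note that __CUDA_ARCH__ is only defined during device-side compilation passes, so the host code that actually performs the launch is ordinarily compiled with the #else branch (16 threads per row). A hypothetical host-side helper mirroring this rule (not part of ViennaCL):

#include <cstddef>

// Returns the number of threads assigned to each row for the blocked kernel,
// or 0 to indicate that the adaptive (row-block based) kernel should be used.
inline unsigned int csr_threads_per_row(std::size_t nnz, std::size_t rows)
{
  double const avg_nnz_per_row = static_cast<double>(nnz) / static_cast<double>(rows);
  return (avg_nnz_per_row > 12.0) ? 16u  // dense rows: sub-warp of 16 threads per row
                                  : 0u;  // sparse rows: adaptive kernel
}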
1951 
1952 template <typename T>
1953 void pipelined_gmres_prod(coordinate_matrix<T> const & A,
1954  vector_base<T> const & p,
1955  vector_base<T> & Ap,
1956  vector_base<T> & inner_prod_buffer)
1957 {
1958  unsigned int size = p.size();
1959  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1960 
1961  Ap.clear();
1962 
1963  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
1964  viennacl::cuda_arg<T>(A.handle()),
1965  viennacl::cuda_arg<unsigned int>(A.handle3()),
1966  viennacl::cuda_arg(p) + viennacl::traits::start(p),
1967  viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
1968  size,
1969  viennacl::cuda_arg(inner_prod_buffer),
1970  buffer_size_per_vector);
1971  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
1972 }
1973 
1974 template <typename T>
1975 void pipelined_gmres_prod(ell_matrix<T> const & A,
1976  vector_base<T> const & p,
1977  vector_base<T> & Ap,
1978  vector_base<T> & inner_prod_buffer)
1979 {
1980  unsigned int size = p.size();
1981  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1982 
1983  pipelined_cg_ell_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
1984  viennacl::cuda_arg<T>(A.handle()),
1985  static_cast<unsigned int>(A.internal_size1()),
1986  static_cast<unsigned int>(A.maxnnz()),
1987  viennacl::cuda_arg(p) + viennacl::traits::start(p),
1988  viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
1989  size,
1990  viennacl::cuda_arg(inner_prod_buffer),
1991  buffer_size_per_vector);
1992  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
1993 }
1994 
1995 template <typename T>
1996 void pipelined_gmres_prod(sliced_ell_matrix<T, unsigned int> const & A,
1997  vector_base<T> const & p,
1998  vector_base<T> & Ap,
1999  vector_base<T> & inner_prod_buffer)
2000 {
2001  unsigned int size = p.size();
2002  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
2003 
2004  pipelined_cg_sliced_ell_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
2005  viennacl::cuda_arg<unsigned int>(A.handle2()),
2006  viennacl::cuda_arg<unsigned int>(A.handle3()),
2007  viennacl::cuda_arg<T>(A.handle()),
2008  viennacl::cuda_arg(p) + viennacl::traits::start(p),
2009  viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
2010  size,
2011  A.rows_per_block(),
2012  viennacl::cuda_arg(inner_prod_buffer),
2013  buffer_size_per_vector);
2014  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
2015 }
2016 
2017 
2018 template <typename T>
2019 void pipelined_gmres_prod(hyb_matrix<T> const & A,
2020  vector_base<T> const & p,
2021  vector_base<T> & Ap,
2022  vector_base<T> & inner_prod_buffer)
2023 {
2024  unsigned int size = p.size();
2025  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
2026 
2027  pipelined_cg_hyb_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
2028  viennacl::cuda_arg<T>(A.handle()),
2029  viennacl::cuda_arg<unsigned int>(A.handle3()),
2030  viennacl::cuda_arg<unsigned int>(A.handle4()),
2031  viennacl::cuda_arg<T>(A.handle5()),
2032  static_cast<unsigned int>(A.internal_size1()),
2033  static_cast<unsigned int>(A.ell_nnz()),
2034  viennacl::cuda_arg(p) + viennacl::traits::start(p),
2035  viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
2036  size,
2037  viennacl::cuda_arg(inner_prod_buffer),
2038  buffer_size_per_vector);
2039  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
2040 }
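A usage sketch for the wrappers defined in this file, assuming ViennaCL was built with VIENNACL_WITH_CUDA and all objects already reside on the active CUDA device; the function name and buffer sizing are placeholders. The actual pipelined GMRES driver that orchestrates these calls lives in the solver layer (e.g. viennacl/linalg/gmres.hpp).

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/cuda/iterative_operations.hpp"

// Illustrative only: one fused sparse product step as used by pipelined GMRES.
// inner_prod_buffer is expected to provide three chunks of per-block partial
// sums (hence its length is divided by 3 in the wrappers above).
void gmres_product_step(viennacl::compressed_matrix<double> const & A,
                        viennacl::vector<double> const & p,
                        viennacl::vector<double>       & Ap,
                        viennacl::vector<double>       & inner_prod_buffer)
{
  // Ap = A * p plus the partial inner products, all in a single kernel launch
  viennacl::linalg::cuda::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);
}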
2041 
2042 
2043 
2044 } // namespace cuda
2045 } //namespace linalg
2046 } //namespace viennacl
2047 
2048 
2049 #endif