#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

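/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
    @brief Implementations of operations using sparse matrices using CUDA
*/
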
#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"

#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

//#ifdef VIENNACL_WITH_SPGEMM_RMERGE
 #include "viennacl/linalg/cuda/spgemm_rmerge.hpp"
//#else
// #include "viennacl/linalg/cuda/spgemm.hpp"
//#endif

namespace viennacl
{
namespace linalg
{
namespace cuda
{
//
// Compressed matrix
//

namespace detail
{

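  /** @brief Kernel extracting per-row information from a CSR matrix, one thread per row: the inf-norm, 1-norm, 2-norm, or diagonal entry of each row, selected at runtime via 'option'. */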
  template<typename NumericT>
  __global__ void csr_row_info_extractor_kernel(const unsigned int * row_indices,
                                                const unsigned int * column_indices,
                                                const NumericT * elements,
                                                NumericT * result,
                                                unsigned int size,
                                                unsigned int option)
  {
    for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                      row  < size;
                      row += gridDim.x * blockDim.x)
    {
      NumericT value = 0;
      unsigned int row_end = row_indices[row+1];

      switch (option)
      {
      case 0: //inf-norm
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
          value = max(value, fabs(elements[i]));
        break;

      case 1: //1-norm
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
          value += fabs(elements[i]);
        break;

      case 2: //2-norm
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
          value += elements[i] * elements[i];
        value = sqrt(value);
        break;

      case 3: //diagonal entry
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
        {
          if (column_indices[i] == row)
          {
            value = elements[i];
            break;
          }
        }
        break;

      default:
        break;
      }
      result[row] = value;
    }
  }

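  /** @brief Dispatches csr_row_info_extractor_kernel for a compressed_matrix and checks for launch errors. */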
  template<typename NumericT, unsigned int AlignmentV>
  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    csr_row_info_extractor_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.handle()),
                                                viennacl::cuda_arg(vec),
                                                static_cast<unsigned int>(mat.size1()),
                                                static_cast<unsigned int>(info_selector)
                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
  }

} //namespace detail

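/** @brief CSR sparse matrix-vector product kernel: each row is processed cooperatively by SubWarpSizeV threads, whose partial sums are combined by an XOR-based butterfly reduction in shared memory. */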
template<unsigned int SubWarpSizeV, typename NumericT>
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int * row_indices,
                                                 const unsigned int * column_indices,
                                                 const NumericT * elements,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result,
                                                 unsigned int size_result)
{
  __shared__ NumericT shared_elements[512];

  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
  const unsigned int block_increment = blockDim.x * ((size_result - 1) / (gridDim.x * blockDim.x) + 1);
  const unsigned int block_start = blockIdx.x * block_increment;
  const unsigned int block_stop = min(block_start + block_increment, size_result);

  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;
                    row  < block_stop;
                    row += blockDim.x / SubWarpSizeV)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];

    // sum the partial results of the sub-warp via an XOR butterfly in shared memory:
    shared_elements[threadIdx.x] = dot_prod;
    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];

    if (id_in_row == 0)
      result[row * inc_result + start_result] = shared_elements[threadIdx.x];
  }
}

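/** @brief Adaptive CSR sparse matrix-vector product kernel (in the spirit of CSR-adaptive): each block processes one row-block; a block covering several rows uses one thread per row over a shared staging buffer, while a block covering a single heavy row performs a block-wide tree reduction. */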
template<typename NumericT>
__global__ void compressed_matrix_vec_mul_adaptive_kernel(const unsigned int * row_indices,
                                                          const unsigned int * column_indices,
                                                          const unsigned int * row_blocks,
                                                          const NumericT * elements,
                                                          unsigned int num_blocks,
                                                          const NumericT * x,
                                                          unsigned int start_x,
                                                          unsigned int inc_x,
                                                          NumericT * result,
                                                          unsigned int start_result,
                                                          unsigned int inc_result,
                                                          unsigned int size_result)
{
  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
  {
    unsigned int row_start = row_blocks[block_id];
    unsigned int row_stop  = row_blocks[block_id + 1];
    unsigned int element_start = row_indices[row_start];
    unsigned int element_stop = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // CSR stream with one thread per row
    {
      // load to shared buffer:
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];

      __syncthreads();

      // use one thread per row to sum:
      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
      {
        NumericT dot_prod = 0;
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];
        result[row * inc_result + start_result] = dot_prod;
      }
    }
    // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
    else  // CSR vector for a single row
    {
      // load and sum to shared buffer:
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];

      // reduction to obtain final result
      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
      {
        __syncthreads();
        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
      }

      if (threadIdx.x == 0)
        result[row_start * inc_result + start_result] = shared_elements[0];
    }

    __syncthreads();  // avoid race conditions
  }
}

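/** @brief Carries out matrix-vector multiplication with a compressed_matrix, i.e. the convenience expression result = viennacl::linalg::prod(mat, vec);
*
* Rows with many nonzeros on average are handled by the sub-warp kernel, otherwise the adaptive kernel is used.
*
* @param mat     The sparse matrix (CSR format)
* @param vec     The right-hand-side vector
* @param result  The result vector
*/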
template<class NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
  if (double(mat.nnz()) / double(mat.size1()) > 6.4)   // less than 10% of threads expected to idle
  {
    compressed_matrix_vec_mul_kernel<8, NumericT><<<512, 256>>>(   // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
#else
  if (double(mat.nnz()) / double(mat.size1()) > 12.0)  // less than 25% of threads expected to idle
  {
    compressed_matrix_vec_mul_kernel<16, NumericT><<<512, 256>>>(  // Fermi and Kepler prefer 16 threads per row (half-warp)
#endif
                                     viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                     viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                     viennacl::cuda_arg<NumericT>(mat.handle()),
                                     viennacl::cuda_arg(vec),
                                     static_cast<unsigned int>(vec.start()),
                                     static_cast<unsigned int>(vec.stride()),
                                     viennacl::cuda_arg(result),
                                     static_cast<unsigned int>(result.start()),
                                     static_cast<unsigned int>(result.stride()),
                                     static_cast<unsigned int>(result.size())
                                    );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
  }
  else
  {
    compressed_matrix_vec_mul_adaptive_kernel<<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                            viennacl::cuda_arg<NumericT>(mat.handle()),
                                                            static_cast<unsigned int>(mat.blocks1()),
                                                            viennacl::cuda_arg(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            viennacl::cuda_arg(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size())
                                                           );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
  }
}

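/** @brief Helper struct mapping a (row, column) index pair to the linear memory offset within a (sub-)matrix; the primary template implements row-major layout. */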
template<typename LayoutT>
struct mat_mult_matrix_index
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
  }
};

template<>
struct mat_mult_matrix_index<viennacl::column_major>
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
  }
};
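
/** @brief Kernel for the product of a CSR matrix with a dense matrix: each block walks over rows of the sparse factor, and the threads of a block walk over the columns of the result. */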
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_row_indices,
                                                   const unsigned int * sp_mat_col_indices,
                                                   const NumericT * sp_mat_elements,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start,
                                                   unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,
                                                   unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,
                                                   unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows,
                                                   unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start,
                                                   unsigned int result_col_start,
                                                   unsigned int result_row_inc,
                                                   unsigned int result_col_inc,
                                                   unsigned int result_row_size,
                                                   unsigned int result_col_size,
                                                   unsigned int result_internal_rows,
                                                   unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = r;
    }
  }
}

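/** @brief Carries out the product of a compressed_matrix with a dense matrix, i.e. the convenience expression result = viennacl::linalg::prod(sp_mat, d_mat);
*
* The kernel is instantiated for each combination of row-major/column-major layouts of d_mat and result.
*
* @param sp_mat  The sparse matrix (CSR format)
* @param d_mat   The dense matrix
* @param result  The dense result matrix
*/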
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
}

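/** @brief Kernel for the product of a CSR matrix with a transposed dense matrix; d_mat is accessed with swapped index roles, so no explicit transpose is ever formed. */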
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_row_indices,
                                                      const unsigned int * sp_mat_col_indices,
                                                      const NumericT * sp_mat_elements,
                                                      const NumericT * d_mat,
                                                      unsigned int d_mat_row_start,
                                                      unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,
                                                      unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,
                                                      unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows,
                                                      unsigned int d_mat_internal_cols,
                                                      NumericT * result,
                                                      unsigned int result_row_start,
                                                      unsigned int result_col_start,
                                                      unsigned int result_row_inc,
                                                      unsigned int result_col_inc,
                                                      unsigned int result_row_size,
                                                      unsigned int result_col_size,
                                                      unsigned int result_internal_rows,
                                                      unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ ResultIndexT::apply(row, col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = r;
    }
  }
}

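/** @brief Carries out the product of a compressed_matrix with a transposed dense matrix, i.e. the convenience expression result = viennacl::linalg::prod(sp_mat, trans(d_mat));
*
* @param sp_mat  The sparse matrix (CSR format)
* @param d_mat   The transposed dense matrix proxy
* @param result  The dense result matrix
*/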
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT> & result)
{

  if (d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
}


//
// triangular solves for compressed_matrix
//

template<typename NumericT>
__global__ void compressed_matrix_diagonal_kernel(const unsigned int * row_indices,
                                                  const unsigned int * column_indices,
                                                  const NumericT * elements,
                                                  NumericT * result,
                                                  unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT diag = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
    {
      unsigned int col_index = column_indices[i];
      if (col_index == row)
      {
        diag = elements[i];
        break;
      }
    }
    result[row] = diag;
  }
}

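/** @brief Carries out an in-place triangular solve with a unit lower triangular sparse matrix, i.e. vec is overwritten with the solution of mat * x = vec.
*
* @param mat  The (unit lower triangular) sparse matrix
* @param vec  The right-hand-side vector, overwritten with the solution
*/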
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                         viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                         viennacl::cuda_arg<NumericT>(mat.handle()),
                                         viennacl::cuda_arg(vec),
                                         static_cast<unsigned int>(mat.size1())
                                        );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
}

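/** @brief Carries out an in-place triangular solve with a lower triangular sparse matrix (non-unit diagonal). */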
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  csr_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                    viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                    viennacl::cuda_arg<NumericT>(mat.handle()),
                                    viennacl::cuda_arg(vec),
                                    static_cast<unsigned int>(mat.size1())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
}

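/** @brief Carries out an in-place triangular solve with a unit upper triangular sparse matrix. */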
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                          viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                          viennacl::cuda_arg<NumericT>(mat.handle()),
                                          viennacl::cuda_arg(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
}

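/** @brief Carries out an in-place triangular solve with an upper triangular sparse matrix (non-unit diagonal). */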
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  csr_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                     viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                     viennacl::cuda_arg<NumericT>(mat.handle()),
                                     viennacl::cuda_arg(vec),
                                     static_cast<unsigned int>(mat.size1())
                                    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
}


// transposed

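/** @brief Carries out an in-place solve of trans(mat) * x = vec, where trans(mat) is unit lower triangular. */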
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                               viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                               viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                               viennacl::cuda_arg(vec),
                                               static_cast<unsigned int>(mat.lhs().size1())
                                              );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
}

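/** @brief Carries out an in-place solve of trans(mat) * x = vec for lower triangular trans(mat); the diagonal is extracted first via compressed_matrix_diagonal_kernel. */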
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                          viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                          viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                          viennacl::cuda_arg(diagonal),
                                          viennacl::cuda_arg(vec),
                                          static_cast<unsigned int>(mat.lhs().size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
}

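/** @brief Carries out an in-place solve of trans(mat) * x = vec, where trans(mat) is unit upper triangular. */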
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(vec),
                                                static_cast<unsigned int>(mat.lhs().size1())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
}

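/** @brief Carries out an in-place solve of trans(mat) * x = vec for upper triangular trans(mat); the diagonal is extracted first via compressed_matrix_diagonal_kernel. */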
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                           viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                           viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                           viennacl::cuda_arg(diagonal),
                                           viennacl::cuda_arg(vec),
                                           static_cast<unsigned int>(mat.lhs().size1())
                                          );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
}

namespace detail
{
  //
  // block solves
  //
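  /** @brief Block triangular solve with the transpose of a unit lower triangular CSR matrix: one CUDA block processes one diagonal block, as delimited by block_indices. */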
  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & L,
                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                           vector_base<NumericT> const & /* L_diagonal */,  //ignored
                           vector_base<NumericT> & vec,
                           viennacl::linalg::unit_lower_tag)
  {
    csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(L.lhs().handle1()),
                                                         viennacl::cuda_arg<unsigned int>(L.lhs().handle2()),
                                                         viennacl::cuda_arg<NumericT>(L.lhs().handle()),
                                                         viennacl::cuda_arg<unsigned int>(block_indices),
                                                         viennacl::cuda_arg(vec),
                                                         static_cast<unsigned int>(L.lhs().size1())
                                                        );
  }

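  /** @brief Block triangular solve with the transpose of an upper triangular CSR matrix, using the precomputed diagonal entries in U_diagonal. */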
  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & U,
                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                           vector_base<NumericT> const & U_diagonal,
                           vector_base<NumericT> & vec,
                           viennacl::linalg::upper_tag)
  {
    csr_block_trans_lu_backward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(U.lhs().handle1()),
                                                     viennacl::cuda_arg<unsigned int>(U.lhs().handle2()),
                                                     viennacl::cuda_arg<NumericT>(U.lhs().handle()),
                                                     viennacl::cuda_arg(U_diagonal),
                                                     viennacl::cuda_arg<unsigned int>(block_indices),
                                                     viennacl::cuda_arg(vec),
                                                     static_cast<unsigned int>(U.lhs().size1())
                                                    );
  }

} //namespace detail


//
// Compressed Compressed Matrix
//

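/** @brief Kernel for the product of a compressed_compressed_matrix (CSR storing only nonzero rows) with a vector: the result is zeroed first, then each stored row writes its dot product to its original row index. */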
template<typename NumericT>
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int * row_jumper,
                                                            const unsigned int * row_indices,
                                                            const unsigned int * column_indices,
                                                            const NumericT * elements,
                                                            unsigned int nonzero_rows,
                                                            const NumericT * x,
                                                            unsigned int start_x,
                                                            unsigned int inc_x,
                                                            NumericT * result,
                                                            unsigned int start_result,
                                                            unsigned int inc_result,
                                                            unsigned int size_result)
{
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < size_result;
                    i += gridDim.x * blockDim.x)
  {
    result[i * inc_result + start_result] = 0;
  }

  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < nonzero_rows;
                    i += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_jumper[i+1];
    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
    result[row_indices[i] * inc_result + start_result] = dot_prod;
  }
}

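/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix, i.e. the convenience expression result = viennacl::linalg::prod(mat, vec);
*
* @param mat     The sparse matrix (CSR format, only nonzero rows stored)
* @param vec     The right-hand-side vector
* @param result  The result vector
*/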
template<typename NumericT>
void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                            viennacl::cuda_arg<NumericT>(mat.handle()),
                                                            static_cast<unsigned int>(mat.nnz1()),
                                                            viennacl::cuda_arg(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            viennacl::cuda_arg(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size())
                                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
}

//
// Coordinate Matrix
//


namespace detail
{

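  /** @brief Kernel extracting per-row information (norms or diagonal, selected via 'option') from a coordinate_matrix, using a segmented parallel reduction over the groups of nonzeros. */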
  template<typename NumericT>
  __global__ void coo_row_info_extractor(const unsigned int * coords, //(row_index, column_index)
                                         const NumericT * elements,
                                         const unsigned int * group_boundaries,
                                         NumericT * result,
                                         unsigned int option)
  {
    __shared__ unsigned int shared_rows[128];
    __shared__ NumericT inter_results[128];

    uint2 tmp;
    NumericT val;
    unsigned int last_index  = blockDim.x - 1;
    unsigned int group_start = group_boundaries[blockIdx.x];
    unsigned int group_end   = group_boundaries[blockIdx.x + 1];
    unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

    unsigned int local_index = 0;

    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[last_index])
        {
          switch (option)
          {
          case 0: //inf-norm
          case 3: //diagonal entry
            val = max(val, fabs(inter_results[last_index]));
            break;

          case 1: //1-norm
            val = fabs(val) + inter_results[last_index];
            break;

          case 2: //2-norm
            val = sqrt(val * val + inter_results[last_index]);
            break;

          default:
            break;
          }
        }
        else
        {
          switch (option)
          {
          case 0: //inf-norm
          case 1: //1-norm
          case 3: //diagonal entry
            result[shared_rows[last_index]] = inter_results[last_index];
            break;

          case 2: //2-norm
            result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
          default:
            break;
          }
        }
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      switch (option)
      {
      case 0:
      case 3:
        inter_results[threadIdx.x] = val;
        break;
      case 1:
        inter_results[threadIdx.x] = fabs(val);
        break;
      case 2:
        inter_results[threadIdx.x] = val * val;
      default:
        break;
      }
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        NumericT left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        switch (option)
        {
        case 0: //inf-norm
        case 3: //diagonal entry
          inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
          break;

        case 1: //1-norm
          inter_results[threadIdx.x] += left;
          break;

        case 2: //2-norm
          inter_results[threadIdx.x] += left;
          break;

        default:
          break;
        }
        __syncthreads();
      }
      //segmented parallel reduction end

      if (threadIdx.x != last_index &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
          inter_results[threadIdx.x] != 0)
      {
        result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
      result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
  }

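  /** @brief Dispatches coo_row_info_extractor for a coordinate_matrix and checks for launch errors. */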
  template<typename NumericT, unsigned int AlignmentV>
  void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    coo_row_info_extractor<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
                                        viennacl::cuda_arg<NumericT>(mat.handle()),
                                        viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                        viennacl::cuda_arg(vec),
                                        static_cast<unsigned int>(info_selector)
                                       );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
  }

} //namespace detail

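/** @brief COO sparse matrix-vector product kernel using a segmented parallel reduction: each block processes one group of nonzeros and carries partial row sums across loop iterations. */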
template<typename NumericT>
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                 const NumericT * elements,
                                                 const unsigned int * group_boundaries,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result
                                                )
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;

    //check for carry from previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
        result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
    }

    //segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    //segmented parallel reduction end

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
    }

    __syncthreads();
  } //for k

  if (local_index + 1 == group_end)
    result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
}

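/** @brief Carries out matrix-vector multiplication with a coordinate_matrix, i.e. the convenience expression result = viennacl::linalg::prod(mat, vec);
*
* The result vector is cleared first, since the kernel only writes to rows carrying nonzeros.
*
* @param mat     The sparse matrix (COO format)
* @param vec     The right-hand-side vector
* @param result  The result vector
*/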
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  result.clear();

  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
                                                viennacl::cuda_arg<NumericT>(mat.handle()),
                                                viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                viennacl::cuda_arg(vec),
                                                static_cast<unsigned int>(vec.start()),
                                                static_cast<unsigned int>(vec.stride()),
                                                viennacl::cuda_arg(result),
                                                static_cast<unsigned int>(result.start()),
                                                static_cast<unsigned int>(result.stride())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
}

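/** @brief Kernel for the product of a coordinate_matrix with a dense matrix: the segmented reduction over the sparse nonzeros is repeated for every column of the result. */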
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                   const NumericT * elements,
                                                   const unsigned int * group_boundaries,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start,
                                                   unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,
                                                   unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,
                                                   unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows,
                                                   unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start,
                                                   unsigned int result_col_start,
                                                   unsigned int result_row_inc,
                                                   unsigned int result_col_inc,
                                                   unsigned int result_row_size,
                                                   unsigned int result_col_size,
                                                   unsigned int result_internal_rows,
                                                   unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ResultIndexT::apply(tmp.x, result_col,
                                   result_row_start, result_row_inc,
                                   result_col_start, result_col_inc,
                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ResultIndexT::apply(tmp.x, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
  }
}

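/** @brief Carries out the product of a coordinate_matrix with a dense matrix, i.e. the convenience expression result = viennacl::linalg::prod(sp_mat, d_mat);
*
* @param sp_mat  The sparse matrix (COO format)
* @param d_mat   The dense matrix
* @param result  The dense result matrix
*/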
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }

}

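/** @brief Kernel for the product of a coordinate_matrix with a transposed dense matrix; d_mat is accessed with swapped index roles, so no explicit transpose is ever formed. */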
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                      const NumericT * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * d_mat,
                                                      unsigned int d_mat_row_start,
                                                      unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,
                                                      unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,
                                                      unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows,
                                                      unsigned int d_mat_internal_cols,
                                                      NumericT * result,
                                                      unsigned int result_row_start,
                                                      unsigned int result_col_start,
                                                      unsigned int result_row_inc,
                                                      unsigned int result_col_inc,
                                                      unsigned int result_row_size,
                                                      unsigned int result_col_size,
                                                      unsigned int result_internal_rows,
                                                      unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols)] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ ResultIndexT::apply(tmp.x, result_col,
                                    result_row_start, result_row_inc,
                                    result_col_start, result_col_inc,
                                    result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ ResultIndexT::apply(tmp.x, result_col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
  }
}

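/** @brief Carries out the product of a coordinate_matrix with a transposed dense matrix, i.e. the convenience expression result = viennacl::linalg::prod(sp_mat, trans(d_mat));
*
* @param sp_mat  The sparse matrix (COO format)
* @param d_mat   The transposed dense matrix proxy
* @param result  The dense result matrix
*/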
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1585  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1586  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1587  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1588  );
1589  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1590  }
1591  else
1592  {
1593  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
1594  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
1595  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1596  viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
1597 
1598  viennacl::cuda_arg(d_mat.lhs()),
1599  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1600  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1601  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1602  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1603 
1604  viennacl::cuda_arg(result),
1605  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1606  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1607  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1608  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1609  );
1610  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1611  }
1612 }
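From user code this overload is reached via the convenience expression result = prod(sp_mat, trans(d_mat)). A minimal usage sketch, assuming the usual ViennaCL host headers and an active CUDA backend:

    #include "viennacl/coordinate_matrix.hpp"
    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/prod.hpp"

    // C = A * B^T with A sparse (COO) and B dense; dispatches to the
    // row-major/column-major branches above.
    void multiply_transposed(viennacl::coordinate_matrix<float> const & A,
                             viennacl::matrix<float> const & B,
                             viennacl::matrix<float> & C)
    {
      C = viennacl::linalg::prod(A, viennacl::trans(B));
    }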
1613 
1614 
1615 //
1616 // ELL Matrix
1617 //
1618 
1619 template<typename NumericT>
1620 __global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
1621  const NumericT * elements,
1622  const NumericT * x,
1623  unsigned int start_x,
1624  unsigned int inc_x,
1625  NumericT * result,
1626  unsigned int start_result,
1627  unsigned int inc_result,
1628  unsigned int row_num,
1629  unsigned int col_num,
1630  unsigned int internal_row_num,
1631  unsigned int items_per_row,
1632  unsigned int aligned_items_per_row
1633  )
1634 {
1635  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1636  unsigned int glb_sz = gridDim.x * blockDim.x;
1637 
1638  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1639  {
1640  NumericT sum = 0;
1641 
1642  unsigned int offset = row_id;
1643  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1644  {
1645  NumericT val = elements[offset];
1646 
1647  if (val != NumericT(0))
1648  {
1649  unsigned int col = coords[offset];
1650  sum += x[col * inc_x + start_x] * val;
1651  }
1652  }
1653 
1654  result[row_id * inc_result + start_result] = sum;
1655  }
1656 }
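The inner loop above walks the padded, column-major ELL storage: entry k of row r lives at offset r + k * internal_row_num, and rows with fewer than items_per_row nonzeros are padded with explicit zeros, which the val != 0 test skips. A sequential sketch of the same indexing (the helper is hypothetical, for illustration only):

    #include <vector>

    template<typename T>
    void ell_spmv_reference(const std::vector<unsigned int> & coords,
                            const std::vector<T> & elements,
                            unsigned int row_num,
                            unsigned int internal_row_num,
                            unsigned int items_per_row,
                            const std::vector<T> & x,
                            std::vector<T> & y)
    {
      for (unsigned int r = 0; r < row_num; ++r)
      {
        T sum = 0;
        for (unsigned int k = 0; k < items_per_row; ++k)
        {
          T val = elements[r + k * internal_row_num];
          if (val != T(0))                              // skip padding slots
            sum += x[coords[r + k * internal_row_num]] * val;
        }
        y[r] = sum;
      }
    }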
1657 
1658 
1659 /** @brief Carries out matrix-vector multiplication with an ell_matrix
1660 *
1661 * Implementation of the convenience expression result = prod(mat, vec);
1662 *
1663 * @param mat The matrix
1664 * @param vec The vector
1665 * @param result The result vector
1666 */
1667 template<typename NumericT, unsigned int AlignmentV>
1668 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
1669  const viennacl::vector_base<NumericT> & vec,
1670  viennacl::vector_base<NumericT> & result)
1671 {
1672  ell_matrix_vec_mul_kernel<<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
1673  viennacl::cuda_arg<NumericT>(mat.handle()),
1674  viennacl::cuda_arg(vec),
1675  static_cast<unsigned int>(vec.start()),
1676  static_cast<unsigned int>(vec.stride()),
1677  viennacl::cuda_arg(result),
1678  static_cast<unsigned int>(result.start()),
1679  static_cast<unsigned int>(result.stride()),
1680  static_cast<unsigned int>(mat.size1()),
1681  static_cast<unsigned int>(mat.size2()),
1682  static_cast<unsigned int>(mat.internal_size1()),
1683  static_cast<unsigned int>(mat.maxnnz()),
1684  static_cast<unsigned int>(mat.internal_maxnnz())
1685  );
1686  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
1687 }
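User code triggers this overload through result = prod(mat, vec); the same expression covers the sliced-ELL and hybrid overloads further below. A minimal usage sketch, assuming the standard ViennaCL headers:

    #include "viennacl/ell_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/prod.hpp"

    // y = A * x on the device; forwards to prod_impl() above on the CUDA backend.
    void multiply(viennacl::ell_matrix<float> const & A,
                  viennacl::vector<float> const & x,
                  viennacl::vector<float> & y)
    {
      y = viennacl::linalg::prod(A, x);
    }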
1688 
1689 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
1690 __global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
1691  const NumericT * sp_mat_elements,
1692  unsigned int sp_mat_row_num,
1693  unsigned int sp_mat_col_num,
1694  unsigned int sp_mat_internal_row_num,
1695  unsigned int sp_mat_items_per_row,
1696  unsigned int sp_mat_aligned_items_per_row,
1697  const NumericT * d_mat,
1698  unsigned int d_mat_row_start,
1699  unsigned int d_mat_col_start,
1700  unsigned int d_mat_row_inc,
1701  unsigned int d_mat_col_inc,
1702  unsigned int d_mat_row_size,
1703  unsigned int d_mat_col_size,
1704  unsigned int d_mat_internal_rows,
1705  unsigned int d_mat_internal_cols,
1706  NumericT * result,
1707  unsigned int result_row_start,
1708  unsigned int result_col_start,
1709  unsigned int result_row_inc,
1710  unsigned int result_col_inc,
1711  unsigned int result_row_size,
1712  unsigned int result_col_size,
1713  unsigned int result_internal_rows,
1714  unsigned int result_internal_cols)
1715 {
1716  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1717  unsigned int glb_sz = gridDim.x * blockDim.x;
1718 
1719  for ( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
1720  {
1721  unsigned int row = rc % sp_mat_row_num;
1722  unsigned int col = rc / sp_mat_row_num;
1723 
1724  unsigned int offset = row;
1725  NumericT r = (NumericT)0;
1726 
1727  for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1728  {
1729  unsigned int j = sp_mat_coords[offset];
1730  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1731 
1732  if (x != (NumericT)0)
1733  {
1734  NumericT y = d_mat[ DMatIndexT::apply(j, col,
1735  d_mat_row_start, d_mat_row_inc,
1736  d_mat_col_start, d_mat_col_inc,
1737  d_mat_internal_rows, d_mat_internal_cols) ];
1738 
1739  r += x*y;
1740  }
1741  }
1742  result [ ResultIndexT::apply(row, col,
1743  result_row_start, result_row_inc,
1744  result_col_start, result_col_inc,
1745  result_internal_rows, result_internal_cols) ] = r;
1746  }
1747 
1748 }
1749 
1750 /** @brief Carries out Sparse Matrix(ELL)-Dense Matrix multiplication
1751 *
1752 * Implementation of the convenience expression result = prod(sp_mat, d_mat);
1753 * sp_mat being in ELL format
1754 *
1755 * @param sp_mat The sparse matrix (ELL)
1756 * @param d_mat The dense matrix
1757 * @param result The result matrix
1758 */
1759 template<typename NumericT, unsigned int AlignmentV>
1760 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
1761  const viennacl::matrix_base<NumericT> & d_mat,
1762  viennacl::matrix_base<NumericT> & result)
1763 {
1764  if (d_mat.row_major() && result.row_major())
1765  {
1766  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1767  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1768  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1769  static_cast<unsigned int>(sp_mat.size1()),
1770  static_cast<unsigned int>(sp_mat.size2()),
1771  static_cast<unsigned int>(sp_mat.internal_size1()),
1772  static_cast<unsigned int>(sp_mat.maxnnz()),
1773  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1774  viennacl::cuda_arg(d_mat),
1775  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1776  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1777  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1778  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1779 
1780  viennacl::cuda_arg(result),
1781  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1782  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1783  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1784  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1785  );
1786  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1787  }
1788  else if (d_mat.row_major() && !result.row_major())
1789  {
1790  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1791  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1792  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1793  static_cast<unsigned int>(sp_mat.size1()),
1794  static_cast<unsigned int>(sp_mat.size2()),
1795  static_cast<unsigned int>(sp_mat.internal_size1()),
1796  static_cast<unsigned int>(sp_mat.maxnnz()),
1797  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1798  viennacl::cuda_arg(d_mat),
1799  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1800  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1801  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1802  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1803 
1804  viennacl::cuda_arg(result),
1805  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1806  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1807  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1808  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1809  );
1810  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1811  }
1812  else if (!d_mat.row_major() && result.row_major())
1813  {
1814  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1815  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1816  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1817  static_cast<unsigned int>(sp_mat.size1()),
1818  static_cast<unsigned int>(sp_mat.size2()),
1819  static_cast<unsigned int>(sp_mat.internal_size1()),
1820  static_cast<unsigned int>(sp_mat.maxnnz()),
1821  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1822  viennacl::cuda_arg(d_mat),
1823  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1824  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1825  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1826  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1827 
1828  viennacl::cuda_arg(result),
1829  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1830  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1831  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1832  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1833  );
1834  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1835  }
1836  else
1837  {
1838  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1839  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1840  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1841  static_cast<unsigned int>(sp_mat.size1()),
1842  static_cast<unsigned int>(sp_mat.size2()),
1843  static_cast<unsigned int>(sp_mat.internal_size1()),
1844  static_cast<unsigned int>(sp_mat.maxnnz()),
1845  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1846  viennacl::cuda_arg(d_mat),
1847  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1848  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1849  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1850  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1851 
1852  viennacl::cuda_arg(result),
1853  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1854  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1855  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1856  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1857  );
1858  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1859  }
1860 }
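All four branches differ only in the two mat_mult_matrix_index instantiations, which map a logical (row, column) pair to a linear buffer offset given the start, stride, and padded internal sizes of a row- or column-major matrix. A simplified sketch of the assumed index arithmetic (the real helper delegates to the layout's mem_index; these structs are illustrative only):

    struct row_major_index_sketch
    {
      static unsigned int apply(unsigned int i, unsigned int j,
                                unsigned int row_start, unsigned int row_inc,
                                unsigned int col_start, unsigned int col_inc,
                                unsigned int /*internal_rows*/, unsigned int internal_cols)
      {
        // consecutive columns of a row are adjacent in memory
        return (row_start + i * row_inc) * internal_cols + (col_start + j * col_inc);
      }
    };

    struct column_major_index_sketch
    {
      static unsigned int apply(unsigned int i, unsigned int j,
                                unsigned int row_start, unsigned int row_inc,
                                unsigned int col_start, unsigned int col_inc,
                                unsigned int internal_rows, unsigned int /*internal_cols*/)
      {
        // consecutive rows of a column are adjacent in memory
        return (col_start + j * col_inc) * internal_rows + (row_start + i * row_inc);
      }
    };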
1861 
1862 template<typename DMatIndexT, typename ResultIndexT, typename NumericT >
1863 __global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
1864  const NumericT * sp_mat_elements,
1865  unsigned int sp_mat_row_num,
1866  unsigned int sp_mat_col_num,
1867  unsigned int sp_mat_internal_row_num,
1868  unsigned int sp_mat_items_per_row,
1869  unsigned int sp_mat_aligned_items_per_row,
1870  const NumericT * d_mat,
1871  unsigned int d_mat_row_start,
1872  unsigned int d_mat_col_start,
1873  unsigned int d_mat_row_inc,
1874  unsigned int d_mat_col_inc,
1875  unsigned int d_mat_row_size,
1876  unsigned int d_mat_col_size,
1877  unsigned int d_mat_internal_rows,
1878  unsigned int d_mat_internal_cols,
1879  NumericT * result,
1880  unsigned int result_row_start,
1881  unsigned int result_col_start,
1882  unsigned int result_row_inc,
1883  unsigned int result_col_inc,
1884  unsigned int result_row_size,
1885  unsigned int result_col_size,
1886  unsigned int result_internal_rows,
1887  unsigned int result_internal_cols)
1888 {
1889  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1890  unsigned int glb_sz = gridDim.x * blockDim.x;
1891 
1892  for ( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
1893  {
1894  unsigned int row = rc % sp_mat_row_num;
1895  unsigned int col = rc / sp_mat_row_num;
1896 
1897  unsigned int offset = row;
1898  NumericT r = (NumericT)0;
1899 
1900  for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1901  {
1902  unsigned int j = sp_mat_coords[offset];
1903  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1904 
1905  if (x != (NumericT)0)
1906  {
1907  NumericT y = d_mat[ DMatIndexT::apply(col, j,
1908  d_mat_row_start, d_mat_row_inc,
1909  d_mat_col_start, d_mat_col_inc,
1910  d_mat_internal_rows, d_mat_internal_cols) ];
1911 
1912  r += x*y;
1913  }
1914  }
1915  result [ ResultIndexT::apply(row, col,
1916  result_row_start, result_row_inc,
1917  result_col_start, result_col_inc,
1918  result_internal_rows, result_internal_cols) ] = r;
1919  }
1920 
1921 }
1922 
1923 /** @brief Carries out Sparse Matrix(ELL)-Dense Transposed Matrix multiplication
1924 *
1925 * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
1926 * sp_mat being in ELL format
1927 *
1928 * @param sp_mat The sparse matrix (ELL)
1929 * @param d_mat The transposed dense matrix
1930 * @param result The result matrix
1931 */
1932 template<typename NumericT, unsigned int AlignmentV>
1933 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
1934  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
1935  const viennacl::matrix_base<NumericT>,
1936  viennacl::op_trans > & d_mat,
1937  viennacl::matrix_base<NumericT> & result)
1938 {
1939  if (d_mat.lhs().row_major() && result.row_major())
1940  {
1941  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1942  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1943  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1944  static_cast<unsigned int>(sp_mat.size1()),
1945  static_cast<unsigned int>(sp_mat.size2()),
1946  static_cast<unsigned int>(sp_mat.internal_size1()),
1947  static_cast<unsigned int>(sp_mat.maxnnz()),
1948  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1949 
1950  viennacl::cuda_arg(d_mat.lhs()),
1951  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1952  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1953  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1954  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1955 
1956  viennacl::cuda_arg(result),
1957  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1958  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1959  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1960  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1961  );
1962  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1963  }
1964  else if (d_mat.lhs().row_major() && !result.row_major())
1965  {
1966  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1967  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1968  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1969  static_cast<unsigned int>(sp_mat.size1()),
1970  static_cast<unsigned int>(sp_mat.size2()),
1971  static_cast<unsigned int>(sp_mat.internal_size1()),
1972  static_cast<unsigned int>(sp_mat.maxnnz()),
1973  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1974 
1975  viennacl::cuda_arg(d_mat.lhs()),
1976  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1977  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1978  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1979  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1980 
1981  viennacl::cuda_arg(result),
1982  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1983  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1984  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1985  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1986  );
1987  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1988  }
1989  else if (!d_mat.lhs().row_major() && result.row_major())
1990  {
1991  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1992  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1993  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1994  static_cast<unsigned int>(sp_mat.size1()),
1995  static_cast<unsigned int>(sp_mat.size2()),
1996  static_cast<unsigned int>(sp_mat.internal_size1()),
1997  static_cast<unsigned int>(sp_mat.maxnnz()),
1998  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1999 
2000  viennacl::cuda_arg(d_mat.lhs()),
2001  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2002  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2003  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2004  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2005 
2006  viennacl::cuda_arg(result),
2007  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2008  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2009  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2010  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2011  );
2012  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
2013  }
2014  else
2015  {
2016  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
2017  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
2018  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
2019  static_cast<unsigned int>(sp_mat.size1()),
2020  static_cast<unsigned int>(sp_mat.size2()),
2021  static_cast<unsigned int>(sp_mat.internal_size1()),
2022  static_cast<unsigned int>(sp_mat.maxnnz()),
2023  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
2024 
2025  viennacl::cuda_arg(d_mat.lhs()),
2026  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2027  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2028  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2029  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2030 
2031  viennacl::cuda_arg(result),
2032  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2033  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2034  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2035  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2036  );
2037  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
2038  }
2039 }
2040 
2041 //
2042 // SELL-C-\sigma Matrix
2043 //
2044 
2045 template<typename NumericT>
2046 __global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int * columns_per_block,
2047  const unsigned int * column_indices,
2048  const unsigned int * block_start,
2049  const NumericT * elements,
2050  const NumericT * x,
2051  unsigned int start_x,
2052  unsigned int inc_x,
2053  unsigned int size_x,
2054  NumericT * result,
2055  unsigned int start_result,
2056  unsigned int inc_result,
2057  unsigned int size_result,
2058  unsigned int block_size)
2059 {
2060  unsigned int blocks_per_threadblock = blockDim.x / block_size;
2061  unsigned int id_in_block = threadIdx.x % block_size;
2062  unsigned int num_blocks = (size_result - 1) / block_size + 1;
2063  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
2064  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;
2065 
2066  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
2067  {
2068  unsigned int row = block_idx * block_size + id_in_block;
2069  unsigned int offset = block_start[block_idx];
2070  unsigned int num_columns = columns_per_block[block_idx];
2071 
2072  NumericT sum = 0;
2073  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
2074  {
2075  unsigned int index = offset + item_id * block_size + id_in_block;
2076  NumericT val = elements[index];
2077 
2078  sum += (val != NumericT(0)) ? (x[column_indices[index] * inc_x + start_x] * val) : NumericT(0);
2079  }
2080 
2081  if (row < size_result)
2082  result[row * inc_result + start_result] = sum;
2083  }
2084 }
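The kernel above reads the SELL-C-sigma layout: each block of block_size consecutive rows stores its own number of padded columns (columns_per_block) starting at block_start, interleaved so that adjacent rows access adjacent addresses. A sequential sketch of the same indexing (hypothetical helper, for illustration):

    #include <vector>

    // Within block b, lane l of padded column c sits at block_start[b] + c * block_size + l.
    template<typename T>
    void sliced_ell_spmv_reference(const std::vector<unsigned int> & columns_per_block,
                                   const std::vector<unsigned int> & column_indices,
                                   const std::vector<unsigned int> & block_start,
                                   const std::vector<T> & elements,
                                   unsigned int block_size,
                                   const std::vector<T> & x,
                                   std::vector<T> & y)
    {
      for (unsigned int row = 0; row < y.size(); ++row)
      {
        unsigned int b = row / block_size;       // block containing this row
        unsigned int l = row % block_size;       // lane within the block
        T sum = 0;
        for (unsigned int c = 0; c < columns_per_block[b]; ++c)
        {
          unsigned int idx = block_start[b] + c * block_size + l;
          if (elements[idx] != T(0))             // skip padding slots
            sum += x[column_indices[idx]] * elements[idx];
        }
        y[row] = sum;
      }
    }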
2085 
2086 /** @brief Carries out matrix-vector multiplication with a sliced_ell_matrix
2087 *
2088 * Implementation of the convenience expression result = prod(mat, vec);
2089 *
2090 * @param mat The matrix
2091 * @param vec The vector
2092 * @param result The result vector
2093 */
2094 template<typename NumericT, typename IndexT>
2095 void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
2096  const viennacl::vector_base<NumericT> & vec,
2097  viennacl::vector_base<NumericT> & result)
2098 {
2099  sliced_ell_matrix_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
2100  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2101  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2102  viennacl::cuda_arg<NumericT>(mat.handle()),
2103  viennacl::cuda_arg(vec),
2104  static_cast<unsigned int>(vec.start()),
2105  static_cast<unsigned int>(vec.stride()),
2106  static_cast<unsigned int>(vec.size()),
2107  viennacl::cuda_arg(result),
2108  static_cast<unsigned int>(result.start()),
2109  static_cast<unsigned int>(result.stride()),
2110  static_cast<unsigned int>(result.size()),
2111  static_cast<unsigned int>(mat.rows_per_block())
2112  );
2113  VIENNACL_CUDA_LAST_ERROR_CHECK("sliced_ell_matrix_vec_mul_kernel");
2114 }
2115 
2116 
2117 //
2118 // Hybrid Matrix
2119 //
2120 
2121 
2122 template<typename NumericT>
2123 __global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
2124  const NumericT * ell_elements,
2125  const unsigned int * csr_rows,
2126  const unsigned int * csr_cols,
2127  const NumericT * csr_elements,
2128  const NumericT * x,
2129  unsigned int start_x,
2130  unsigned int inc_x,
2131  NumericT * result,
2132  unsigned int start_result,
2133  unsigned int inc_result,
2134  unsigned int row_num,
2135  unsigned int internal_row_num,
2136  unsigned int items_per_row,
2137  unsigned int aligned_items_per_row
2138  )
2139 {
2140  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2141  unsigned int glb_sz = gridDim.x * blockDim.x;
2142 
2143  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2144  {
2145  NumericT sum = 0;
2146 
2147  unsigned int offset = row_id;
2148  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2149  {
2150  NumericT val = ell_elements[offset];
2151 
2152 
2153  if (val != NumericT(0))
2154  {
2155  unsigned int col = ell_coords[offset];
2156  sum += (x[col * inc_x + start_x] * val);
2157  }
2158  }
2159 
2160  unsigned int col_begin = csr_rows[row_id];
2161  unsigned int col_end = csr_rows[row_id + 1];
2162 
2163  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2164  sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];
2165 
2166  result[row_id * inc_result + start_result] = sum;
2167  }
2168 }
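Each row of the hybrid format is accumulated in two stages: the regular part from the padded ELL arrays, then the per-row overflow from a small CSR structure. A sequential sketch (hypothetical helper, for illustration):

    #include <vector>

    template<typename T>
    void hyb_spmv_reference(const std::vector<unsigned int> & ell_coords,
                            const std::vector<T> & ell_elements,
                            const std::vector<unsigned int> & csr_rows,
                            const std::vector<unsigned int> & csr_cols,
                            const std::vector<T> & csr_elements,
                            unsigned int internal_row_num,
                            unsigned int items_per_row,
                            const std::vector<T> & x,
                            std::vector<T> & y)
    {
      for (unsigned int r = 0; r < y.size(); ++r)
      {
        T sum = 0;
        for (unsigned int k = 0; k < items_per_row; ++k)              // ELL part
        {
          T val = ell_elements[r + k * internal_row_num];
          if (val != T(0))
            sum += x[ell_coords[r + k * internal_row_num]] * val;
        }
        for (unsigned int i = csr_rows[r]; i < csr_rows[r + 1]; ++i)  // CSR overflow
          sum += x[csr_cols[i]] * csr_elements[i];
        y[r] = sum;
      }
    }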
2169 
2170 
2171 
2172 /** @brief Carries out matrix-vector multiplication with a hyb_matrix
2173 *
2174 * Implementation of the convenience expression result = prod(mat, vec);
2175 *
2176 * @param mat The matrix
2177 * @param vec The vector
2178 * @param result The result vector
2179 */
2180 template<typename NumericT, unsigned int AlignmentV>
2181 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2182  const viennacl::vector_base<NumericT> & vec,
2183  viennacl::vector_base<NumericT> & result)
2184 {
2185  hyb_matrix_vec_mul_kernel<<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
2186  viennacl::cuda_arg<NumericT>(mat.handle()),
2187  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2188  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2189  viennacl::cuda_arg<NumericT>(mat.handle5()),
2190  viennacl::cuda_arg(vec),
2191  static_cast<unsigned int>(vec.start()),
2192  static_cast<unsigned int>(vec.stride()),
2193  viennacl::cuda_arg(result),
2194  static_cast<unsigned int>(result.start()),
2195  static_cast<unsigned int>(result.stride()),
2196  static_cast<unsigned int>(mat.size1()),
2197  static_cast<unsigned int>(mat.internal_size1()),
2198  static_cast<unsigned int>(mat.ell_nnz()),
2199  static_cast<unsigned int>(mat.internal_ellnnz())
2200  );
2201  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
2202 }
2203 
2204 
2205 
2206 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
2207 __global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
2208  const NumericT * ell_elements,
2209  const unsigned int * csr_rows,
2210  const unsigned int * csr_cols,
2211  const NumericT * csr_elements,
2212  unsigned int row_num,
2213  unsigned int internal_row_num,
2214  unsigned int items_per_row,
2215  unsigned int aligned_items_per_row,
2216  const NumericT * d_mat,
2217  unsigned int d_mat_row_start,
2218  unsigned int d_mat_col_start,
2219  unsigned int d_mat_row_inc,
2220  unsigned int d_mat_col_inc,
2221  unsigned int d_mat_row_size,
2222  unsigned int d_mat_col_size,
2223  unsigned int d_mat_internal_rows,
2224  unsigned int d_mat_internal_cols,
2225  NumericT * result,
2226  unsigned int result_row_start,
2227  unsigned int result_col_start,
2228  unsigned int result_row_inc,
2229  unsigned int result_col_inc,
2230  unsigned int result_row_size,
2231  unsigned int result_col_size,
2232  unsigned int result_internal_rows,
2233  unsigned int result_internal_cols)
2234 {
2235  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2236  unsigned int glb_sz = gridDim.x * blockDim.x;
2237 
2238  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2239  {
2240  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2241  {
2242  NumericT sum = 0;
2243 
2244  unsigned int offset = row_id;
2245  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2246  {
2247  NumericT val = ell_elements[offset];
2248 
2249  if (val != NumericT(0))
2250  {
2251  sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
2252  d_mat_row_start, d_mat_row_inc,
2253  d_mat_col_start, d_mat_col_inc,
2254  d_mat_internal_rows, d_mat_internal_cols)] * val;
2255  }
2256  }
2257 
2258  unsigned int col_begin = csr_rows[row_id];
2259  unsigned int col_end = csr_rows[row_id + 1];
2260 
2261  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2262  {
2263  sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
2264  d_mat_row_start, d_mat_row_inc,
2265  d_mat_col_start, d_mat_col_inc,
2266  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2267  }
2268 
2269  result[ResultIndexT::apply(row_id, result_col,
2270  result_row_start, result_row_inc,
2271  result_col_start, result_col_inc,
2272  result_internal_rows, result_internal_cols)] = sum;
2273  }
2274  }
2275 }
2276 
2277 
2278 
2279 /** @brief Carries out sparse-matrix-dense-matrix multiplication with a hyb_matrix
2280 *
2281 * Implementation of the convenience expression result = prod(mat, d_mat);
2282 *
2283 * @param mat The sparse matrix (HYB format)
2284 * @param d_mat The dense matrix
2285 * @param result The result matrix
2286 */
2287 template<typename NumericT, unsigned int AlignmentV>
2288 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2289  const viennacl::matrix_base<NumericT> & d_mat,
2290  viennacl::matrix_base<NumericT> & result)
2291 {
2292  if (d_mat.row_major() && result.row_major())
2293  {
2294  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2295  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2296  viennacl::cuda_arg<NumericT>(mat.handle()),
2297  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2298  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2299  viennacl::cuda_arg<NumericT>(mat.handle5()),
2300  static_cast<unsigned int>(mat.size1()),
2301  static_cast<unsigned int>(mat.internal_size1()),
2302  static_cast<unsigned int>(mat.ell_nnz()),
2303  static_cast<unsigned int>(mat.internal_ellnnz()),
2304 
2305  viennacl::cuda_arg(d_mat),
2306  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2307  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2308  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2309  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2310 
2311  viennacl::cuda_arg(result),
2312  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2313  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2314  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2315  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2316  );
2317  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2318  }
2319  else if (d_mat.row_major() && !result.row_major())
2320  {
2321  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2322  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2323  viennacl::cuda_arg<NumericT>(mat.handle()),
2324  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2325  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2326  viennacl::cuda_arg<NumericT>(mat.handle5()),
2327  static_cast<unsigned int>(mat.size1()),
2328  static_cast<unsigned int>(mat.internal_size1()),
2329  static_cast<unsigned int>(mat.ell_nnz()),
2330  static_cast<unsigned int>(mat.internal_ellnnz()),
2331 
2332  viennacl::cuda_arg(d_mat),
2333  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2334  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2335  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2336  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2337 
2338  viennacl::cuda_arg(result),
2339  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2340  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2341  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2342  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2343  );
2344  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2345  }
2346  else if (!d_mat.row_major() && result.row_major())
2347  {
2348  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2349  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2350  viennacl::cuda_arg<NumericT>(mat.handle()),
2351  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2352  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2353  viennacl::cuda_arg<NumericT>(mat.handle5()),
2354  static_cast<unsigned int>(mat.size1()),
2355  static_cast<unsigned int>(mat.internal_size1()),
2356  static_cast<unsigned int>(mat.ell_nnz()),
2357  static_cast<unsigned int>(mat.internal_ellnnz()),
2358 
2359  viennacl::cuda_arg(d_mat),
2360  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2361  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2362  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2363  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2364 
2365  viennacl::cuda_arg(result),
2366  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2367  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2368  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2369  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2370  );
2371  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2372  }
2373  else
2374  {
2375  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2376  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2377  viennacl::cuda_arg<NumericT>(mat.handle()),
2378  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2379  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2380  viennacl::cuda_arg<NumericT>(mat.handle5()),
2381  static_cast<unsigned int>(mat.size1()),
2382  static_cast<unsigned int>(mat.internal_size1()),
2383  static_cast<unsigned int>(mat.ell_nnz()),
2384  static_cast<unsigned int>(mat.internal_ellnnz()),
2385 
2386  viennacl::cuda_arg(d_mat),
2387  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2388  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2389  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2390  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2391 
2392  viennacl::cuda_arg(result),
2393  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2394  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2395  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2396  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2397  );
2398  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2399  }
2400 }
2401 
2402 
2403 
2404 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
2405 __global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
2406  const NumericT * ell_elements,
2407  const unsigned int * csr_rows,
2408  const unsigned int * csr_cols,
2409  const NumericT * csr_elements,
2410  unsigned int row_num,
2411  unsigned int internal_row_num,
2412  unsigned int items_per_row,
2413  unsigned int aligned_items_per_row,
2414  const NumericT * d_mat,
2415  unsigned int d_mat_row_start,
2416  unsigned int d_mat_col_start,
2417  unsigned int d_mat_row_inc,
2418  unsigned int d_mat_col_inc,
2419  unsigned int d_mat_row_size,
2420  unsigned int d_mat_col_size,
2421  unsigned int d_mat_internal_rows,
2422  unsigned int d_mat_internal_cols,
2423  NumericT * result,
2424  unsigned int result_row_start,
2425  unsigned int result_col_start,
2426  unsigned int result_row_inc,
2427  unsigned int result_col_inc,
2428  unsigned int result_row_size,
2429  unsigned int result_col_size,
2430  unsigned int result_internal_rows,
2431  unsigned int result_internal_cols)
2432 {
2433  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2434  unsigned int glb_sz = gridDim.x * blockDim.x;
2435 
2436  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2437  {
2438  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2439  {
2440  NumericT sum = 0;
2441 
2442  unsigned int offset = row_id;
2443  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2444  {
2445  NumericT val = ell_elements[offset];
2446 
2447  if (val != NumericT(0))
2448  {
2449  sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
2450  d_mat_row_start, d_mat_row_inc,
2451  d_mat_col_start, d_mat_col_inc,
2452  d_mat_internal_rows, d_mat_internal_cols)] * val;
2453  }
2454  }
2455 
2456  unsigned int col_begin = csr_rows[row_id];
2457  unsigned int col_end = csr_rows[row_id + 1];
2458 
2459  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2460  {
2461  sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
2462  d_mat_row_start, d_mat_row_inc,
2463  d_mat_col_start, d_mat_col_inc,
2464  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2465  }
2466 
2467  result[ResultIndexT::apply(row_id, result_col,
2468  result_row_start, result_row_inc,
2469  result_col_start, result_col_inc,
2470  result_internal_rows, result_internal_cols)] = sum;
2471  }
2472  }
2473 }
2474 
2475 
2476 
2477 /** @brief Carries out sparse-matrix-transposed-dense-matrix multiplication with a hyb_matrix
2478 *
2479 * Implementation of the convenience expression result = prod(mat, trans(d_mat));
2480 *
2481 * @param mat The sparse matrix (HYB format)
2482 * @param d_mat The transposed dense matrix
2483 * @param result The result matrix
2484 */
2485 template<typename NumericT, unsigned int AlignmentV>
2486 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2487  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
2488  const viennacl::matrix_base<NumericT>,
2489  viennacl::op_trans > & d_mat,
2490  viennacl::matrix_base<NumericT> & result)
2491 {
2492  if (d_mat.lhs().row_major() && result.row_major())
2493  {
2494  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2495  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2496  viennacl::cuda_arg<NumericT>(mat.handle()),
2497  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2498  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2499  viennacl::cuda_arg<NumericT>(mat.handle5()),
2500  static_cast<unsigned int>(mat.size1()),
2501  static_cast<unsigned int>(mat.internal_size1()),
2502  static_cast<unsigned int>(mat.ell_nnz()),
2503  static_cast<unsigned int>(mat.internal_ellnnz()),
2504 
2505  viennacl::cuda_arg(d_mat.lhs()),
2506  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2507  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2508  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2509  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2510 
2511  viennacl::cuda_arg(result),
2512  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2513  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2514  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2515  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2516  );
2517  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2518  }
2519  else if (d_mat.lhs().row_major() && !result.row_major())
2520  {
2521  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2522  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2523  viennacl::cuda_arg<NumericT>(mat.handle()),
2524  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2525  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2526  viennacl::cuda_arg<NumericT>(mat.handle5()),
2527  static_cast<unsigned int>(mat.size1()),
2528  static_cast<unsigned int>(mat.internal_size1()),
2529  static_cast<unsigned int>(mat.ell_nnz()),
2530  static_cast<unsigned int>(mat.internal_ellnnz()),
2531 
2532  viennacl::cuda_arg(d_mat.lhs()),
2533  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2534  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2535  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2536  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2537 
2538  viennacl::cuda_arg(result),
2539  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2540  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2541  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2542  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2543  );
2544  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2545  }
2546  else if (!d_mat.lhs().row_major() && result.row_major())
2547  {
2548  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2549  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2550  viennacl::cuda_arg<NumericT>(mat.handle()),
2551  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2552  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2553  viennacl::cuda_arg<NumericT>(mat.handle5()),
2554  static_cast<unsigned int>(mat.size1()),
2555  static_cast<unsigned int>(mat.internal_size1()),
2556  static_cast<unsigned int>(mat.ell_nnz()),
2557  static_cast<unsigned int>(mat.internal_ellnnz()),
2558 
2559  viennacl::cuda_arg(d_mat.lhs()),
2560  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2561  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2562  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2563  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2564 
2565  viennacl::cuda_arg(result),
2566  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2567  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2568  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2569  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2570  );
2571  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2572  }
2573  else
2574  {
2575  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2576  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2577  viennacl::cuda_arg<NumericT>(mat.handle()),
2578  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2579  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2580  viennacl::cuda_arg<NumericT>(mat.handle5()),
2581  static_cast<unsigned int>(mat.size1()),
2582  static_cast<unsigned int>(mat.internal_size1()),
2583  static_cast<unsigned int>(mat.ell_nnz()),
2584  static_cast<unsigned int>(mat.internal_ellnnz()),
2585 
2586  viennacl::cuda_arg(d_mat.lhs()),
2587  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2588  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2589  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2590  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2591 
2592  viennacl::cuda_arg(result),
2593  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2594  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2595  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2596  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2597  );
2598  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2599  }
2600 }
2601 
2602 
2603 } // namespace cuda
2604 } //namespace linalg
2605 } //namespace viennacl
2606 
2607 
2608 #endif
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Definition: forwards.h:406
Simple enable-if variant that uses the SFINAE pattern.
Definition: enable_if.hpp:30
vcl_size_t size1() const
Definition: ell_matrix.hpp:91
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
handle_type & handle2()
Definition: ell_matrix.hpp:103
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
viennacl::scalar_expression< const viennacl::vector_base< NumericT >, const viennacl::vector_base< NumericT >, viennacl::op_sum > sum(viennacl::vector_base< NumericT > const &x)
User interface function for computing the sum of all elements of a vector.
Definition: sum.hpp:45
const handle_type & handle4() const
Definition: hyb_matrix.hpp:108
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
__global__ void compressed_matrix_vec_mul_adaptive_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const unsigned int *row_blocks, const NumericT *elements, unsigned int num_blocks, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
const vcl_size_t & size1() const
Returns the number of rows.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Various little tools used here and there in ViennaCL.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:382
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:163
A tag class representing a lower triangular matrix.
Definition: forwards.h:849
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:390
vcl_size_t internal_ellnnz() const
Definition: hyb_matrix.hpp:101
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:341
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
This file provides the forward declarations for the main types used within ViennaCL.
vcl_size_t size2() const
Definition: ell_matrix.hpp:92
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
void row_info(compressed_matrix< NumericT, AligmentV > const &mat, vector_base< NumericT > &vec, viennacl::linalg::detail::row_info_types info_selector)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
Definition: qr.hpp:182
vcl_size_t rows_per_block() const
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
__global__ void compressed_matrix_diagonal_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:201
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int *columns_per_block, const unsigned int *column_indices, const unsigned int *block_start, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, unsigned int size_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result, unsigned int block_size)
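A hedged sketch of the sliced-ELL traversal: rows are grouped into slices of block_size rows; each slice stores its entries column-major starting at block_start[slice], padded to columns_per_block[slice] entries per row. One thread per row and unit strides for x and result are assumed here; the shipped kernel also honors the start/inc parameters.

template<typename NumericT>
__global__ void sliced_ell_spmv_sketch(const unsigned int * columns_per_block,
                                       const unsigned int * column_indices,
                                       const unsigned int * block_start,
                                       const NumericT * elements,
                                       const NumericT * x,
                                       NumericT * result,
                                       unsigned int size_result,
                                       unsigned int block_size)
{
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
       row < size_result;
       row += gridDim.x * blockDim.x)
  {
    unsigned int slice       = row / block_size;
    unsigned int id_in_slice = row % block_size;
    unsigned int offset      = block_start[slice];

    NumericT sum = 0;
    for (unsigned int item = 0; item < columns_per_block[slice]; ++item)
    {
      unsigned int index = offset + item * block_size + id_in_slice;
      NumericT val = elements[index];
      if (val != NumericT(0))
        sum += val * x[column_indices[index]];
    }
    result[row] = sum;
  }
}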
vcl_size_t size1() const
Returns the size of the result vector.
Definition: matrix.hpp:72
const handle_type & handle() const
Returns the memory handle to the matrix entry array.
const handle_type & handle1() const
Returns the memory handle to the row index array.
Helper struct for accessing an element of a row- or column-major matrix.
vcl_size_t internal_size1() const
Definition: ell_matrix.hpp:88
const vcl_size_t & nnz() const
Returns the number of nonzero entries.
float NumericT
Definition: bisect.cpp:40
__global__ void csr_row_info_extractor_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size, unsigned int option)
size_type stride() const
Returns the stride within the buffer (in multiples of sizeof(NumericT))
Definition: vector_def.hpp:124
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:34
vcl_size_t ell_nnz() const
Definition: hyb_matrix.hpp:102
vcl_size_t size1() const
Definition: hyb_matrix.hpp:98
const handle_type & handle() const
Returns the memory handle to the matrix entry array.
__global__ void coo_row_info_extractor(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, NumericT *result, unsigned int option)
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:235
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
const handle_type & handle2() const
Definition: hyb_matrix.hpp:106
A tag class representing an upper triangular matrix.
Definition: forwards.h:854
vcl_size_t internal_size1() const
Definition: hyb_matrix.hpp:95
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int *row_jumper, const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, unsigned int nonzero_rows, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
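Here only the rows listed in row_indices carry entries, so row_jumper (with nonzero_rows + 1 entries) plays the role of the usual CSR row-offset array and results are scattered back into the full-length output. A minimal sketch with unit strides, assuming result has been zero-initialized beforehand (cf. clear() below):

template<typename NumericT>
__global__ void ccm_spmv_sketch(const unsigned int * row_jumper,
                                const unsigned int * row_indices,
                                const unsigned int * column_indices,
                                const NumericT * elements,
                                unsigned int nonzero_rows,
                                const NumericT * x,
                                NumericT * result)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
       i < nonzero_rows;
       i += gridDim.x * blockDim.x)
  {
    NumericT sum = 0;
    for (unsigned int j = row_jumper[i]; j < row_jumper[i + 1]; ++j)
      sum += elements[j] * x[column_indices[j]];
    result[row_indices[i]] = sum;  // scatter to the actual row index
  }
}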
Sparse matrix class using the sliced ELLPACK format with parameters C, σ.
Definition: forwards.h:403
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
const handle_type & handle3() const
Returns the memory handle to the row index array.
A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows carry nonzero entries.
const handle_type & handle2() const
Returns the memory handle to the column index array.
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
std::size_t vcl_size_t
Definition: forwards.h:75
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
Definition: matrix.hpp:885
static __device__ unsigned int apply(unsigned int i, unsigned int j, unsigned int row_start, unsigned int row_inc, unsigned int col_start, unsigned int col_inc, unsigned int internal_rows, unsigned int internal_cols)
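The apply() helper maps logical (i, j) coordinates of a strided dense operand onto a linear buffer index. A hedged reimplementation of the row-major case (the column-major variant swaps the roles of the two dimensions):

__device__ unsigned int row_major_index(unsigned int i, unsigned int j,
                                        unsigned int row_start, unsigned int row_inc,
                                        unsigned int col_start, unsigned int col_inc,
                                        unsigned int internal_cols)
{
  // operand occupies a padded internal_rows x internal_cols buffer,
  // addressed through start/inc offsets in both dimensions
  return (i * row_inc + row_start) * internal_cols + (j * col_inc + col_start);
}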
vcl_size_t maxnnz() const
Definition: ell_matrix.hpp:95
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &vec, viennacl::linalg::unit_lower_tag)
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
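A minimal one-thread-per-row rendition of this CSR product; the start/inc parameters implement vector ranges and slices, and a production kernel would typically assign several threads per row:

template<typename NumericT>
__global__ void csr_spmv_sketch(const unsigned int * row_indices,
                                const unsigned int * column_indices,
                                const NumericT * elements,
                                const NumericT * x,
                                unsigned int start_x, unsigned int inc_x,
                                NumericT * result,
                                unsigned int start_result, unsigned int inc_result,
                                unsigned int size_result)
{
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
       row < size_result;
       row += gridDim.x * blockDim.x)
  {
    NumericT sum = 0;
    for (unsigned int i = row_indices[row]; i < row_indices[row + 1]; ++i)
      sum += elements[i] * x[column_indices[i] * inc_x + start_x];
    result[row * inc_result + start_result] = sum;
  }
}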
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:900
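Both row() and diag() (listed above) return lightweight expression objects that can be assigned to vectors; a brief usage sketch with an illustrative dense matrix M:

#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"

void proxy_usage_sketch(vcl_size_t n)
{
  viennacl::matrix<float> M(n, n);
  viennacl::vector<float> d(n), r(n);
  d = viennacl::diag(M, 0);  // main diagonal
  r = viennacl::row(M, 2);   // third row of M
}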
const handle_type & handle3() const
Returns the memory handle to the group start index array.
Implementations of direct triangular solvers for sparse matrices using CUDA.
handle_type & handle()
Definition: ell_matrix.hpp:100
__global__ void ell_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int col_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
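A hedged launch sketch connecting the ell_matrix accessors listed on this page to the kernel parameters above; the wrapper name and the 256/256 configuration are illustrative, and handle2() is assumed to hold the coordinate array:

template<typename NumericT>
void ell_spmv_launch_sketch(viennacl::ell_matrix<NumericT> const & mat,
                            viennacl::vector_base<NumericT> const & vec,
                            viennacl::vector_base<NumericT> & result)
{
  ell_matrix_vec_mul_kernel<<<256, 256>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),    // coords
      viennacl::cuda_arg<NumericT>(mat.handle()),         // elements
      viennacl::cuda_arg(vec),
      static_cast<unsigned int>(vec.start()),
      static_cast<unsigned int>(vec.stride()),
      viennacl::cuda_arg(result),
      static_cast<unsigned int>(result.start()),
      static_cast<unsigned int>(result.stride()),
      static_cast<unsigned int>(mat.size1()),             // row_num
      static_cast<unsigned int>(mat.size2()),             // col_num
      static_cast<unsigned int>(mat.internal_size1()),    // padded row count
      static_cast<unsigned int>(mat.maxnnz()),            // items_per_row
      static_cast<unsigned int>(mat.internal_maxnnz()));  // aligned_items_per_row
  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
}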
const handle_type & handle3() const
Returns the memory handle to the row block array.
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result)
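Hedged simplification: the shipped kernel performs a cooperative segmented reduction per work group, with group_boundaries delimiting each group's range of nonzeros. The sketch below keeps only the arithmetic, using atomics instead (float assumed so atomicAdd is available on all architectures); the (row, column) pairs are assumed interleaved in coords, cf. handle12() above.

__global__ void coo_spmv_atomic_sketch(const unsigned int * coords,
                                       const float * elements,
                                       unsigned int nnz,
                                       const float * x,
                                       float * result)  // must be zero-initialized
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
       i < nnz;
       i += gridDim.x * blockDim.x)
    atomicAdd(result + coords[2 * i], elements[i] * x[coords[2 * i + 1]]);
}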
void clear()
Resets all entries to zero. Does not change the size of the vector.
Definition: vector.hpp:875
Common routines for CUDA execution.
const handle_type & handle() const
Returns the memory handle to the matrix entry array.
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
bool row_major() const
Definition: matrix_def.hpp:248
NumericT max(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:47
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
const vcl_size_t & nnz1() const
Returns the number of rows carrying nonzero entries.
const handle_type & handle() const
Definition: hyb_matrix.hpp:105
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:859
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
Definition: mem_handle.hpp:89
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
A tag class representing transposed matrices.
Definition: forwards.h:220
A sparse square matrix in compressed sparse rows format.
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
Definition: common.hpp:30
A tag for column-major storage of a dense matrix.
Definition: forwards.h:321
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version.
Definition: common.hpp:39
LHS & lhs() const
Get left hand side operand.
Definition: matrix.hpp:66
size_type start() const
Returns the offset within the buffer.
Definition: vector_def.hpp:122
const vcl_size_t & blocks1() const
Returns the internal number of row blocks for an adaptive SpMV.
vcl_size_t internal_maxnnz() const
Definition: ell_matrix.hpp:94
Implementation of the ViennaCL scalar class.
const handle_type & handle3() const
Definition: hyb_matrix.hpp:107
A tag class representing an upper triangular matrix with unit diagonal.
Definition: forwards.h:864
NumericT min(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:91
const handle_type & handle5() const
Definition: hyb_matrix.hpp:109
A sparse square matrix, where entries are stored as triplets (i, j, val), where i and j are the row and column indices, respectively.
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)