ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
sparse_matrix_operations.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2015, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/ocl/device.hpp"
27 #include "viennacl/ocl/handle.hpp"
28 #include "viennacl/ocl/kernel.hpp"
29 #include "viennacl/scalar.hpp"
30 #include "viennacl/vector.hpp"
31 #include "viennacl/tools/tools.hpp"
40 
41 namespace viennacl
42 {
43 namespace linalg
44 {
45 namespace opencl
46 {
47 
48 //
49 // Compressed matrix
50 //
51 
52 namespace detail
53 {
54  template<typename NumericT, unsigned int AlignmentV>
58  {
59  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
62 
63  viennacl::ocl::enqueue(row_info_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
64  viennacl::traits::opencl_handle(x),
65  cl_uint(A.size1()),
66  cl_uint(info_selector)
67  )
68  );
69  }
70 }
71 
80 template<typename NumericT, unsigned int AlignmentV>
84 {
85  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
87  bool use_nvidia_specific = AlignmentV == 1 && ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0);
88 
89  std::stringstream ss;
90  ss << "vec_mul";
91  unsigned int alignment = AlignmentV; //prevent unreachable code warnings below
92  if (use_nvidia_specific)
93  ss << "_nvidia";
94  else
95  {
96  if (alignment == 4)
97  ss << "4";
98  if (alignment == 8)
99  ss << "8";
100  }
101 
103 
105  layout_x.start = cl_uint(viennacl::traits::start(x));
106  layout_x.stride = cl_uint(viennacl::traits::stride(x));
107  layout_x.size = cl_uint(viennacl::traits::size(x));
108  layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
109 
111  layout_y.start = cl_uint(viennacl::traits::start(y));
112  layout_y.stride = cl_uint(viennacl::traits::stride(y));
113  layout_y.size = cl_uint(viennacl::traits::size(y));
114  layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
115 
116  if (alignment == 4 || alignment == 8)
117  {
118  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
119  x, layout_x,
120  y, layout_y
121  ));
122  }
123  else
124  {
125  if (ctx.current_device().max_work_group_size() >= 256)
126  k.local_work_size(0, 256);
127 
128  if (use_nvidia_specific)
129  {
130  k.global_work_size(0, 512 * k.local_work_size(0));
131 
132  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
133  x, layout_x,
134  y, layout_y
135  ));
136  }
137  else // use CSR adaptive:
138  {
139  k.global_work_size(0, A.blocks1() * k.local_work_size(0));
140 
141  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
142  x, layout_x,
143  y, layout_y
144  ));
145  }
146  }
147 }
148 
149 
158 template< typename NumericT, unsigned int AlignmentV>
162 
163  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
167 
168  viennacl::ocl::enqueue(k(sp_A.handle1().opencl_handle(), sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
169  viennacl::traits::opencl_handle(d_A),
170  cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
171  cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
172  cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
174  viennacl::traits::opencl_handle(y),
175  cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
176  cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
177  cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
179 }
180 
190 template<typename NumericT, unsigned int AlignmentV>
194  viennacl::op_trans > const & d_A,
196 
197  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
200  detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
201 
202  viennacl::ocl::enqueue(k(sp_A.handle1().opencl_handle(), sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
203  viennacl::traits::opencl_handle(d_A.lhs()),
204  cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
205  cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
206  cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
207  cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
208  viennacl::traits::opencl_handle(y),
209  cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
210  cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
211  cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
213 }
214 
224 template<typename NumericT, unsigned int AlignmentV>
228 {
229 
230  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
232 
233  /*
234  * Stage 1: Analyze sparsity pattern in order to properly allocate temporary arrays
235  *
236  * - Upper bound for the row lengths in C
237  */
238  viennacl::vector<unsigned int> upper_bound_nonzeros_per_row_A(256, ctx); // upper bound for the nonzeros per row encountered for each work group
239 
241  viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
242  viennacl::traits::opencl_handle(upper_bound_nonzeros_per_row_A)
243  ) );
244 
245  upper_bound_nonzeros_per_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY));
246  unsigned int * upper_bound_nonzeros_per_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(upper_bound_nonzeros_per_row_A.handle());
247 
248  unsigned int max_nnz_per_row_A = 0;
249  for (std::size_t i=0; i<upper_bound_nonzeros_per_row_A.size(); ++i)
250  max_nnz_per_row_A = std::max(max_nnz_per_row_A, upper_bound_nonzeros_per_row_A_ptr[i]);
251 
252  if (max_nnz_per_row_A > 32)
253  {
254  // determine augmented size:
255  unsigned int max_entries_in_G = 32;
256  if (max_nnz_per_row_A <= 256)
257  max_entries_in_G = 16;
258  if (max_nnz_per_row_A <= 64)
259  max_entries_in_G = 8;
260 
261  viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
263  viennacl::ocl::enqueue(k_decompose_1(A.handle1().opencl_handle(), cl_uint(A.size1()),
264  cl_uint(max_entries_in_G),
265  viennacl::traits::opencl_handle(exclusive_scan_helper)
266  ) );
267 
268  // exclusive scan of helper array to find new size:
269  viennacl::linalg::exclusive_scan(exclusive_scan_helper);
270  unsigned int augmented_size = exclusive_scan_helper[A.size1()];
271 
272  // split A = A2 * G1
273  viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
275 
276  // fill A2:
278  viennacl::ocl::enqueue(k_fill_A2(A2.handle1().opencl_handle(), A2.handle2().opencl_handle(), A2.handle().opencl_handle(), cl_uint(A2.size1()),
279  viennacl::traits::opencl_handle(exclusive_scan_helper)
280  ) );
281 
282  // fill G1:
284  viennacl::ocl::enqueue(k_fill_G1(G1.handle1().opencl_handle(), G1.handle2().opencl_handle(), G1.handle().opencl_handle(), cl_uint(G1.size1()),
285  A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), cl_uint(A.nnz()),
286  cl_uint(max_entries_in_G),
287  viennacl::traits::opencl_handle(exclusive_scan_helper)
288  ) );
289 
290  // compute tmp = G1 * B;
291  // C = A2 * tmp;
293  prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1
294  prod_impl(A2, tmp, C); // this may split A2 again
295  return;
296  }
297 
298 
299  /*
300  * Stage 2: Determine sparsity pattern of C
301  */
302  C.resize(A.size1(), B.size2(), false);
303 
305  k2.local_work_size(0, 32); // run with one warp/wavefront
306  k2.global_work_size(0, 256*256*32); // make sure enough warps/wavefronts are in flight
307  viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
308  B.handle1().opencl_handle(), B.handle2().opencl_handle(), cl_uint(B.size2()),
309  C.handle1().opencl_handle()
310  ) );
311 
312  // exclusive scan on host to obtain row start indices:
314  viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
315  unsigned int current_offset = 0;
316  for (std::size_t i=0; i<C.size1(); ++i)
317  {
318  unsigned int tmp = row_buffer[i];
319  row_buffer.set(i, current_offset);
320  current_offset += tmp;
321  }
322  row_buffer.set(C.size1(), current_offset);
323  viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
324 
325 
326  /*
327  * Stage 3: Compute entries in C
328  */
329 
330  C.reserve(current_offset, false);
331 
333  k3.local_work_size(0, 32); // run with one warp/wavefront
334  k3.global_work_size(0, 256*256*32); // make sure enough warps/wavefronts are in flight
335  viennacl::ocl::enqueue(k3(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()),
336  B.handle1().opencl_handle(), B.handle2().opencl_handle(), B.handle().opencl_handle(), cl_uint(B.size2()),
337  C.handle1().opencl_handle(), C.handle2().opencl_handle(), C.handle().opencl_handle()
338  ) );
339 
340 }
341 
342 // triangular solvers
343 
349 template<typename NumericT, unsigned int MAT_AlignmentV>
353 {
354  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
357 
358  k.local_work_size(0, 128);
360  viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
361  viennacl::traits::opencl_handle(x),
362  cl_uint(L.size1())
363  )
364  );
365 }
366 
372 template<typename NumericT, unsigned int AlignmentV>
376 {
377  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
379 
381 
382  k.local_work_size(0, 128);
384  viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
385  viennacl::traits::opencl_handle(x),
386  cl_uint(L.size1())
387  )
388  );
389 }
390 
391 
397 template<typename NumericT, unsigned int AlignmentV>
401 {
402  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());
405 
406  k.local_work_size(0, 128);
408  viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
409  viennacl::traits::opencl_handle(x),
410  cl_uint(U.size1())
411  )
412  );
413 }
414 
420 template<typename NumericT, unsigned int AlignmentV>
424 {
425  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());
427 
429 
430  k.local_work_size(0, 128);
432  viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
433  viennacl::traits::opencl_handle(x),
434  cl_uint(U.size1())
435  )
436  );
437 }
438 
439 
440 
441 
442 
443 // transposed triangular solvers
444 
445 namespace detail
446 {
447  //
448  // block solves
449  //
450  template<typename NumericT, unsigned int AlignmentV>
453  op_trans> & L,
454  viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
455  vector_base<NumericT> const & /* L_diagonal */, //ignored
458  {
459  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L.lhs()).context());
462  block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));
463 
464  viennacl::ocl::enqueue(block_solve_kernel(L.lhs().handle1().opencl_handle(),
465  L.lhs().handle2().opencl_handle(),
466  L.lhs().handle().opencl_handle(),
467  block_indices.opencl_handle(),
468  x,
469  static_cast<cl_uint>(x.size())));
470  }
471 
472 
473  template<typename NumericT, unsigned int AlignmentV>
476  op_trans> const & U,
477  viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
478  vector_base<NumericT> const & U_diagonal,
481  {
482  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U.lhs()).context());
485  block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));
486 
487  viennacl::ocl::enqueue(block_solve_kernel(U.lhs().handle1().opencl_handle(),
488  U.lhs().handle2().opencl_handle(),
489  U.lhs().handle().opencl_handle(),
490  U_diagonal,
491  block_indices.opencl_handle(),
492  x,
493  static_cast<cl_uint>(x.size())));
494  }
495 
496 
497 }
498 
499 
505 template<typename NumericT, unsigned int AlignmentV>
508  op_trans> const & proxy_L,
511 {
512  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());
515 
516  k.local_work_size(0, 128);
518  viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
519  viennacl::traits::opencl_handle(x),
520  cl_uint(proxy_L.lhs().size1())
521  )
522  );
523 }
524 
525 
531 template<typename NumericT, unsigned int AlignmentV>
534  op_trans> const & proxy_L,
537 {
538  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());
540 
541  viennacl::vector<NumericT> diagonal(x.size());
543 
545 
546  k.local_work_size(0, 128);
547  k.global_work_size(0, k.local_work_size());
548  viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
549  viennacl::traits::opencl_handle(diagonal),
550  viennacl::traits::opencl_handle(x),
551  cl_uint(proxy_L.lhs().size1())
552  )
553  );
554 }
555 
561 template<typename NumericT, unsigned int AlignmentV>
564  op_trans> const & proxy_U,
567 {
568  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());
571 
572  k.local_work_size(0, 128);
574  viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
575  viennacl::traits::opencl_handle(x),
576  cl_uint(proxy_U.lhs().size1())
577  )
578  );
579 }
580 
581 
587 template<typename NumericT, unsigned int AlignmentV>
590  op_trans> const & proxy_U,
593 {
594  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());
596 
597  viennacl::vector<NumericT> diagonal(x.size());
599 
601 
602  k.local_work_size(0, 128);
603  k.global_work_size(0, k.local_work_size());
604  viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
605  viennacl::traits::opencl_handle(diagonal),
606  viennacl::traits::opencl_handle(x),
607  cl_uint(proxy_U.lhs().size1())
608  )
609  );
610 }
611 
612 
613 //
614 // Compressed Compressed matrix
615 //
616 
625 template<typename NumericT>
629 {
630  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
633 
634  y.clear();
635 
637  layout_x.start = cl_uint(viennacl::traits::start(x));
638  layout_x.stride = cl_uint(viennacl::traits::stride(x));
639  layout_x.size = cl_uint(viennacl::traits::size(x));
640  layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
641 
643  layout_y.start = cl_uint(viennacl::traits::start(y));
644  layout_y.stride = cl_uint(viennacl::traits::stride(y));
645  layout_y.size = cl_uint(viennacl::traits::size(y));
646  layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
647 
648  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle3().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.nnz1()),
649  x, layout_x,
650  y, layout_y
651  ));
652 }
653 
654 
655 //
656 // Coordinate matrix
657 //
658 
659 namespace detail
660 {
661  template<typename NumericT, unsigned int AlignmentV>
665  {
666  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
669  unsigned int thread_num = 128; //k.local_work_size(0);
670 
671  row_info_kernel.local_work_size(0, thread_num);
672 
673  row_info_kernel.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
674  viennacl::ocl::enqueue(row_info_kernel(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
675  viennacl::traits::opencl_handle(x),
676  cl_uint(info_selector),
677  viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
678  viennacl::ocl::local_mem(sizeof(NumericT)*thread_num)) );
679  }
680 }
681 
690 template<typename NumericT, unsigned int AlignmentV>
694 {
695  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
697 
698  y.clear();
699 
701  layout_x.start = cl_uint(viennacl::traits::start(x));
702  layout_x.stride = cl_uint(viennacl::traits::stride(x));
703  layout_x.size = cl_uint(viennacl::traits::size(x));
704  layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
705 
707  layout_y.start = cl_uint(viennacl::traits::start(y));
708  layout_y.stride = cl_uint(viennacl::traits::stride(y));
709  layout_y.size = cl_uint(viennacl::traits::size(y));
710  layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
711 
712  //std::cout << "prod(coordinate_matrix" << AlignmentV << ", vector) called with internal_nnz=" << A.internal_nnz() << std::endl;
713 
715  unsigned int thread_num = 128; //k.local_work_size(0);
716 
717  k.local_work_size(0, thread_num);
718 
719  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
720  //k.global_work_size(0, thread_num); //Only one work group
721  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
722  viennacl::traits::opencl_handle(x),
723  layout_x,
724  viennacl::traits::opencl_handle(y),
725  layout_y,
726  viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
727  viennacl::ocl::local_mem(sizeof(NumericT)*thread_num)) );
728 
729 }
730 
731 
740 template<typename NumericT, unsigned int AlignmentV>
744 {
745  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
747 
750 
751  y.clear();
752 
753  unsigned int thread_num = 128; //k.local_work_size(0);
754  k.local_work_size(0, thread_num);
755  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
756 
757  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
758  viennacl::traits::opencl_handle(d_A),
759  cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
760  cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
761  cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
763  viennacl::traits::opencl_handle(y),
764  cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
765  cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
766  cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
768  viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),
770 
771 }
772 
781 template<typename NumericT, unsigned int AlignmentV>
785  viennacl::op_trans > const & d_A,
787 {
788  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
790 
792  detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
793 
794  y.clear();
795 
796  unsigned int thread_num = 128; //k.local_work_size(0);
797  k.local_work_size(0, thread_num);
798  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
799 
800  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
801  viennacl::traits::opencl_handle(d_A),
802  cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
803  cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
804  cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
805  cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
806  viennacl::traits::opencl_handle(y),
807  cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
808  cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
809  cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
811  viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),
813 
814 }
815 
816 
817 //
818 // ELL Matrix
819 //
820 
821 template<typename NumericT, unsigned int AlignmentV>
825 {
826  assert(A.size1() == y.size());
827  assert(A.size2() == x.size());
828 
829  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
831  y.clear();
832 
834  layout_x.start = cl_uint(viennacl::traits::start(x));
835  layout_x.stride = cl_uint(viennacl::traits::stride(x));
836  layout_x.size = cl_uint(viennacl::traits::size(x));
837  layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
838 
840  layout_y.start = cl_uint(viennacl::traits::start(y));
841  layout_y.stride = cl_uint(viennacl::traits::stride(y));
842  layout_y.size = cl_uint(viennacl::traits::size(y));
843  layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
844 
845  std::stringstream ss;
846  ss << "vec_mul_" << 1;//(AlignmentV != 1?4:1);
848 
849  unsigned int thread_num = 128;
850  unsigned int group_num = 256;
851 
852  k.local_work_size(0, thread_num);
853  k.global_work_size(0, thread_num * group_num);
854 
855  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
856  A.handle().opencl_handle(),
857  viennacl::traits::opencl_handle(x),
858  layout_x,
859  viennacl::traits::opencl_handle(y),
860  layout_y,
861  cl_uint(A.size1()),
862  cl_uint(A.size2()),
863  cl_uint(A.internal_size1()),
864  cl_uint(A.maxnnz()),
865  cl_uint(A.internal_maxnnz())
866  )
867  );
868 
869 
870 }
871 
881 template<typename NumericT, unsigned int AlignmentV>
885 
886  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
890 
891  //unsigned int thread_num = 128;
892  //unsigned int group_num = 256;
893  //
894  //k.local_work_size(0, thread_num);
895  //k.global_work_size(0, thread_num * group_num);
896 
897  viennacl::ocl::enqueue(k(sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
898  cl_uint(sp_A.size1()),
899  cl_uint(sp_A.size2()),
900  cl_uint(sp_A.internal_size1()),
901  cl_uint(sp_A.maxnnz()),
902  cl_uint(sp_A.internal_maxnnz()),
903  viennacl::traits::opencl_handle(d_A),
904  cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
905  cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
906  cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
908  viennacl::traits::opencl_handle(y),
909  cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
910  cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
911  cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
913  )
914  );
915 }
916 
926 template<typename NumericT, unsigned int AlignmentV>
930  viennacl::op_trans > const & d_A,
932 
933  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
936  detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
937 
938  //unsigned int thread_num = 128;
939  //unsigned int group_num = 256;
940  //
941  //k.local_work_size(0, thread_num);
942  //k.global_work_size(0, thread_num * group_num);
943 
944  viennacl::ocl::enqueue(k(sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
945  cl_uint(sp_A.size1()),
946  cl_uint(sp_A.size2()),
947  cl_uint(sp_A.internal_size1()),
948  cl_uint(sp_A.maxnnz()),
949  cl_uint(sp_A.internal_maxnnz()),
950  viennacl::traits::opencl_handle(d_A.lhs()),
951  cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
952  cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
953  cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
954  cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
955  viennacl::traits::opencl_handle(y),
956  cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
957  cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
958  cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
960  )
961  );
962 }
963 
964 //
965 // SELL-C-\sigma Matrix
966 //
967 
968 template<typename ScalarT, typename IndexT>
972 {
973  assert(A.size1() == y.size());
974  assert(A.size2() == x.size());
975 
976  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
978  y.clear();
979 
981  layout_x.start = cl_uint(viennacl::traits::start(x));
982  layout_x.stride = cl_uint(viennacl::traits::stride(x));
983  layout_x.size = cl_uint(viennacl::traits::size(x));
984  layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
985 
987  layout_y.start = cl_uint(viennacl::traits::start(y));
988  layout_y.stride = cl_uint(viennacl::traits::stride(y));
989  layout_y.size = cl_uint(viennacl::traits::size(y));
990  layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
991 
992  std::stringstream ss;
993  ss << "vec_mul_" << 1;//(AlignmentV != 1?4:1);
995 
996  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
997  unsigned int group_num = 256;
998 
999  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
1000  thread_num = 256;
1001 
1002  k.local_work_size(0, thread_num);
1003  k.global_work_size(0, thread_num * group_num);
1004 
1005  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
1006  A.handle2().opencl_handle(),
1007  A.handle3().opencl_handle(),
1008  A.handle().opencl_handle(),
1009  viennacl::traits::opencl_handle(x),
1010  layout_x,
1011  viennacl::traits::opencl_handle(y),
1012  layout_y,
1013  cl_uint(A.rows_per_block()))
1014  );
1015 }
1016 
1017 
1018 //
1019 // Hybrid Matrix
1020 //
1021 
1022 template<typename NumericT, unsigned int AlignmentV>
1026 {
1027  assert(A.size1() == y.size());
1028  assert(A.size2() == x.size());
1029 
1030  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
1032 
1034  layout_x.start = cl_uint(viennacl::traits::start(x));
1035  layout_x.stride = cl_uint(viennacl::traits::stride(x));
1036  layout_x.size = cl_uint(viennacl::traits::size(x));
1037  layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
1038 
1040  layout_y.start = cl_uint(viennacl::traits::start(y));
1041  layout_y.stride = cl_uint(viennacl::traits::stride(y));
1042  layout_y.size = cl_uint(viennacl::traits::size(y));
1043  layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
1044 
1046 
1047  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
1048  A.handle().opencl_handle(),
1049  A.handle3().opencl_handle(),
1050  A.handle4().opencl_handle(),
1051  A.handle5().opencl_handle(),
1052  viennacl::traits::opencl_handle(x),
1053  layout_x,
1054  viennacl::traits::opencl_handle(y),
1055  layout_y,
1056  cl_uint(A.size1()),
1057  cl_uint(A.internal_size1()),
1058  cl_uint(A.ell_nnz()),
1059  cl_uint(A.internal_ellnnz())
1060  )
1061  );
1062 }
1063 
1064 template<typename NumericT, unsigned int AlignmentV>
1066  viennacl::matrix_base<NumericT> const & d_A,
1068 {
1069  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
1073 
1074  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
1075  A.handle().opencl_handle(),
1076  A.handle3().opencl_handle(),
1077  A.handle4().opencl_handle(),
1078  A.handle5().opencl_handle(),
1079  cl_uint(A.size1()),
1080  cl_uint(A.internal_size1()),
1081  cl_uint(A.ell_nnz()),
1082  cl_uint(A.internal_ellnnz()),
1083  viennacl::traits::opencl_handle(d_A),
1084  cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
1085  cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
1086  cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
1088  viennacl::traits::opencl_handle(y),
1089  cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
1090  cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
1091  cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
1093  )
1094  );
1095 }
1096 
1097 template<typename NumericT, unsigned int AlignmentV>
1101  viennacl::op_trans > const & d_A,
1103 {
1104  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
1107  detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
1108 
1109  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
1110  A.handle().opencl_handle(),
1111  A.handle3().opencl_handle(),
1112  A.handle4().opencl_handle(),
1113  A.handle5().opencl_handle(),
1114  cl_uint(A.size1()),
1115  cl_uint(A.internal_size1()),
1116  cl_uint(A.ell_nnz()),
1117  cl_uint(A.internal_ellnnz()),
1118  viennacl::traits::opencl_handle(d_A.lhs()),
1119  cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
1120  cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
1121  cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
1122  cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
1123  viennacl::traits::opencl_handle(y),
1124  cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
1125  cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
1126  cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
1128  )
1129  );
1130 }
1131 
1132 
1133 } // namespace opencl
1134 } //namespace linalg
1135 } //namespace viennacl
1136 
1137 
1138 #endif
const vcl_size_t & size2() const
Returns the number of columns.
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Definition: forwards.h:406
cl_uint stride
Increment between integers.
Definition: kernel.hpp:50
static void init(viennacl::ocl::context &ctx)
vcl_size_t size1() const
Definition: ell_matrix.hpp:91
viennacl::ocl::device const & current_device() const
Returns the current device.
Definition: context.hpp:112
Helper class implementing an array on the host. Default case: No conversion necessary.
Definition: util.hpp:92
Helper class for packing four cl_uint numbers into a uint4 type for access inside an OpenCL kernel...
Definition: kernel.hpp:45
void memory_write(mem_handle &dst_buffer, vcl_size_t dst_offset, vcl_size_t bytes_to_write, const void *ptr, bool async=false)
Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'.
Definition: memory.hpp:220
handle_type & handle2()
Definition: ell_matrix.hpp:103
Represents an OpenCL device within ViennaCL.
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
const handle_type & handle4() const
Definition: hyb_matrix.hpp:108
const vcl_size_t & size1() const
Returns the number of rows.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:58
cl_uint start
Starting value of the integer stride.
Definition: kernel.hpp:48
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Various little tools used here and there in ViennaCL.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:382
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
Definition: kernel.hpp:742
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:163
std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices...
Definition: common.hpp:49
A tag class representing a lower triangular matrix.
Definition: forwards.h:849
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55
Main kernel class for generating OpenCL kernels for coordinate_matrix.
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:390
vcl_size_t internal_ellnnz() const
Definition: hyb_matrix.hpp:101
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:341
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
This file provides the forward declarations for the main types used within ViennaCL.
vcl_size_t size2() const
Definition: ell_matrix.hpp:92
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
void memory_read(mem_handle const &src_buffer, vcl_size_t src_offset, vcl_size_t bytes_to_read, void *ptr, bool async=false)
Reads data from a buffer back to main RAM.
Definition: memory.hpp:261
cl_uint vendor_id() const
A unique device vendor identifier. An example of a unique device identifier could be the PCIe ID...
Definition: device.hpp:917
vcl_size_t internal_size(vector_base< NumericT > const &vec)
Helper routine for obtaining the buffer length of a ViennaCL vector.
Definition: size.hpp:371
T max(const T &lhs, const T &rhs)
Maximum.
Definition: util.hpp:59
vcl_size_t rows_per_block() const
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:201
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
cl_uint internal_size
Internal length of the buffer. Might be larger than 'size' due to padding.
Definition: kernel.hpp:54
vcl_size_t internal_size1() const
Definition: ell_matrix.hpp:88
Common implementations shared by OpenCL-based operations.
const vcl_size_t & nnz() const
Returns the number of nonzero entries.
float NumericT
Definition: bisect.cpp:40
Main kernel class for generating OpenCL kernels for ell_matrix.
Definition: ell_matrix.hpp:173
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
Definition: context.hpp:39
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:34
vcl_size_t ell_nnz() const
Definition: hyb_matrix.hpp:102
vcl_size_t size1() const
Definition: hyb_matrix.hpp:98
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:235
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
A class representing local (shared) OpenCL memory. Typically used as kernel argument.
Definition: local_mem.hpp:33
Definition: blas3.hpp:36
Main kernel class for generating OpenCL kernels for compressed_matrix.
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
viennacl::ocl::kernel & get_kernel(std::string const &program_name, std::string const &kernel_name)
Convenience function for retrieving the kernel of a program directly from the context.
Definition: context.hpp:605
const handle_type & handle2() const
Definition: hyb_matrix.hpp:106
OpenCL kernel file for compressed_matrix operations.
A tag class representing an upper triangular matrix.
Definition: forwards.h:854
vcl_size_t internal_size1() const
Definition: hyb_matrix.hpp:95
OpenCL kernel file for ell_matrix operations.
Sparse matrix class using the sliced ELLPACK format with parameters C, sigma.
Definition: forwards.h:403
void clear()
Resets all entries to zero.
Definition: matrix.hpp:634
const handle_type & handle3() const
Returns the OpenCL handle to the row index array.
Implementation of a smart-pointer-like class for handling OpenCL handles.
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows carry nonzero entries.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
std::size_t vcl_size_t
Definition: forwards.h:75
Main kernel class for generating OpenCL kernels for ell_matrix.
Common routines for single-threaded or OpenMP-enabled execution on CPU.
OpenCL kernel file for sliced_ell_matrix operations.
vcl_size_t maxnnz() const
Definition: ell_matrix.hpp:95
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
const handle_type & handle3() const
Returns the OpenCL handle to the group start index array.
OpenCL kernel file for hyb_matrix operations.
handle_type & handle()
Definition: ell_matrix.hpp:100
void reserve(vcl_size_t new_nonzeros, bool preserve=true)
Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved...
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT)
Direct inplace solver for dense triangular systems. Matlab notation: A \ B.
const handle_type & handle3() const
Returns the OpenCL handle to the row block array.
void clear()
Resets all entries to zero. Does not change the size of the vector.
Definition: vector.hpp:875
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object.
Definition: context.hpp:40
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
Definition: enqueue.hpp:50
Representation of an OpenCL kernel in ViennaCL.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
bool row_major() const
Definition: matrix_def.hpp:248
static void init(viennacl::ocl::context &ctx)
OpenCL kernel file for vector operations.
void set(vcl_size_t index, U value)
Definition: util.hpp:115
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
const vcl_size_t & nnz1() const
Returns the number of nonzero entries.
const handle_type & handle() const
Definition: hyb_matrix.hpp:105
void switch_memory_context(viennacl::context new_ctx)
Definition: vector.hpp:1064
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:859
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:751
OpenCL kernel file for coordinate_matrix operations.
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM...
Definition: mem_handle.hpp:89
A tag class representing transposed matrices.
Definition: forwards.h:220
vcl_size_t raw_size() const
Returns the number of bytes of the currently active buffer.
Definition: mem_handle.hpp:230
A sparse square matrix in compressed sparse rows format.
void exclusive_scan(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
This function implements an exclusive scan.
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &x, viennacl::linalg::unit_lower_tag)
static void init(viennacl::ocl::context &ctx)
Definition: hyb_matrix.hpp:200
const vcl_size_t & blocks1() const
Returns the internal number of row blocks for an adaptive SpMV.
vcl_size_t internal_maxnnz() const
Definition: ell_matrix.hpp:94
Implementation of the ViennaCL scalar class.
void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve=true)
Resize the matrix.
const handle_type & handle() const
Returns the memory handle.
Definition: vector_def.hpp:128
static void init(viennacl::ocl::context &ctx)
Definition: ell_matrix.hpp:180
void prod_impl(const matrix_base< NumericT > &A, bool trans_A, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
size_t max_work_group_size() const
Maximum number of work-items in a work-group executing a kernel using the data parallel execution mod...
Definition: device.hpp:483
const handle_type & handle3() const
Definition: hyb_matrix.hpp:107
A tag class representing an upper triangular matrix with unit diagonal.
Definition: forwards.h:864
Main kernel class for generating OpenCL kernels for compressed_compressed_matrix. ...
cl_uint size
Number of values in the stride.
Definition: kernel.hpp:52
Main kernel class for generating OpenCL kernels for hyb_matrix.
Definition: hyb_matrix.hpp:193
const handle_type & handle5() const
Definition: hyb_matrix.hpp:109
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...
vcl_size_t size2() const
Definition: hyb_matrix.hpp:99
void row_info(compressed_matrix< NumericT, AlignmentV > const &A, vector_base< NumericT > &x, viennacl::linalg::detail::row_info_types info_selector)