iterative_operations.hpp
#ifndef VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_
#define VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/opencl/iterative_operations.hpp
    @brief Implementations of specialized routines for the iterative solvers using OpenCL.
*/

#include <cmath>

#include "viennacl/forwards.h"
#include "viennacl/detail/vector_def.hpp"
#include "viennacl/ocl/device.hpp"
#include "viennacl/ocl/handle.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/opencl/common.hpp"
#include "viennacl/linalg/opencl/kernels/iterative.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/handle.hpp"
#include "viennacl/traits/stride.hpp"

namespace viennacl
{
namespace linalg
{
namespace opencl
{

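// Each routine below launches a single fused OpenCL kernel that performs one
// step of a pipelined Krylov solver and, in the same launch, accumulates
// work-group partial sums of the inner products required by the next
// iteration into inner_prod_buffer. This avoids separate reduction kernels
// and extra host-device synchronization points.
//
// Hypothetical usage sketch: these routines are normally dispatched to by the
// high-level solver interface rather than called directly, e.g.
//
//   viennacl::compressed_matrix<double> A;    // system matrix, assembled elsewhere
//   viennacl::vector<double> b;               // right-hand side
//   viennacl::linalg::cg_tag tag(1e-8, 300);  // relative tolerance, max. iterations
//   viennacl::vector<double> x = viennacl::linalg::solve(A, b, tag);
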
template<typename NumericT>
void pipelined_cg_vector_update(vector_base<NumericT> & result,
                                NumericT alpha,
                                vector_base<NumericT> & p,
                                vector_base<NumericT> & r,
                                vector_base<NumericT> const & Ap,
                                NumericT beta,
                                vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_vector_update");
  cl_uint vec_size = cl_uint(viennacl::traits::size(result));

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(result, alpha, p, r, Ap, beta, inner_prod_buffer, vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
}

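// Fused sparse matrix-vector product for pipelined CG: computes Ap = A * p for
// a CSR matrix and reduces the partial results of the inner products needed
// next (<Ap, p> and <Ap, Ap>, one third of inner_prod_buffer each) via the two
// __local scratch buffers. On NVIDIA hardware a blocked CSR kernel is used
// once the average number of nonzeros per row exceeds 12.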
template<typename NumericT>
void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "cg_csr_blocked_prod" : "cg_csr_prod");

  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  if (use_nvidia_blocked)
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
                             p,
                             Ap,
                             vec_size,
                             inner_prod_buffer,
                             buffer_size_per_vector,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                            ));
  }
  else
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
                             p,
                             Ap,
                             vec_size,
                             inner_prod_buffer,
                             buffer_size_per_vector,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(1024 * sizeof(NumericT))
                            ));
  }

}

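// Variant for coordinate_matrix (COO): the kernel accumulates group-wise row
// contributions into Ap, so Ap is cleared up front; the two extra __local
// buffers hold the (index, value) scratch data used by the COO reduction.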
template<typename NumericT>
void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);

  Ap.clear();

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_coo_prod");
  unsigned int thread_num = 256; //k.local_work_size(0);

  k.local_work_size(0, thread_num);

  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases

  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
                           p,
                           Ap,
                           vec_size,
                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
                           viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          ));
}

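// Variant for ell_matrix: ELL pads every row to a fixed number of entries, so
// the kernel additionally receives maxnnz, the padded internal_maxnnz, and the
// padded row count internal_size1.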
template<typename NumericT>
void pipelined_cg_prod(ell_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_ell_prod");

  unsigned int thread_num = 128;
  unsigned int group_num = 256;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.maxnnz()),
                           cl_uint(A.internal_maxnnz()),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           vec_size,
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

template<typename NumericT>
void pipelined_cg_prod(sliced_ell_matrix<NumericT, unsigned int> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_sliced_ell_prod");

  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
  unsigned int group_num = 256;

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
    thread_num = 256;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
                           A.handle2().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle().opencl_handle(),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           vec_size,
                           cl_uint(A.rows_per_block()),
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

template<typename NumericT>
void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_hyb_prod");

  unsigned int thread_num = 128;
  unsigned int group_num = 128;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle4().opencl_handle(),
                           A.handle5().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.ell_nnz()),
                           cl_uint(A.internal_ellnnz()),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           vec_size,
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

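// ---------------------------- BiCGStab ----------------------------
//
// The BiCGStab routines accumulate several inner products in a single buffer;
// buffer_chunk_size and buffer_chunk_offset select the chunk a kernel writes
// to, so consecutive pipeline stages can share one inner_prod_buffer.
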
template<typename NumericT>
void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
                                 vector_base<NumericT> & r,
                                 vector_base<NumericT> const & Ap,
                                 vector_base<NumericT> & inner_prod_buffer,
                                 vcl_size_t buffer_chunk_size,
                                 vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_update_s");
  cl_uint vec_size = cl_uint(viennacl::traits::size(s));

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
  viennacl::ocl::enqueue(k(s, r, Ap,
                           inner_prod_buffer, chunk_size, chunk_offset, vec_size,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
}

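// Fused vector update for pipelined BiCGStab. Judging by the argument list,
// one kernel updates result, residual and p from s, As, Ap and r0star using
// the scalars alpha, omega and beta, and writes partial sums for the next
// round of inner products into inner_prod_buffer.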
template<typename NumericT>
void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
                                      vector_base<NumericT> & residual, vector_base<NumericT> const & As,
                                      NumericT beta, vector_base<NumericT> const & Ap,
                                      vector_base<NumericT> const & r0star,
                                      vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
{
  (void)buffer_chunk_size;

  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_vector_update");
  cl_uint vec_size = cl_uint(viennacl::traits::size(result));

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(result, alpha, p, omega, s,
                           residual, As,
                           beta, Ap,
                           r0star,
                           inner_prod_buffer,
                           vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

template<typename NumericT>
void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "bicgstab_csr_blocked_prod" : "bicgstab_csr_prod");

  cl_uint vec_size     = cl_uint(viennacl::traits::size(p));
  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  if (use_nvidia_blocked)
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
                             p,
                             Ap,
                             r0star,
                             vec_size,
                             inner_prod_buffer, chunk_size, chunk_offset,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                            ));
  }
  else
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
                             p,
                             Ap,
                             r0star,
                             vec_size,
                             inner_prod_buffer, chunk_size, chunk_offset,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                            ));
  }

}

template<typename NumericT>
void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size     = cl_uint(viennacl::traits::size(p));
  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);

  Ap.clear();

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_coo_prod");
  unsigned int thread_num = 256; //k.local_work_size(0);

  k.local_work_size(0, thread_num);

  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases

  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
                           p,
                           Ap,
                           r0star,
                           vec_size,
                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
                           viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
                           inner_prod_buffer, chunk_size, chunk_offset,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          ));
}

template<typename NumericT>
void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size     = cl_uint(viennacl::traits::size(p));
  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_ell_prod");

  unsigned int thread_num = 128;
  unsigned int group_num = 128;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.maxnnz()),
                           cl_uint(A.internal_maxnnz()),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           r0star,
                           vec_size,
                           inner_prod_buffer, chunk_size, chunk_offset,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

template<typename NumericT>
void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT, unsigned int> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size     = cl_uint(viennacl::traits::size(p));
  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_sliced_ell_prod");

  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
  unsigned int group_num = 256;

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
    thread_num = 256;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
                           A.handle2().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle().opencl_handle(),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           r0star,
                           vec_size,
                           cl_uint(A.rows_per_block()),
                           inner_prod_buffer, chunk_size, chunk_offset,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

template<typename NumericT>
void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size     = cl_uint(viennacl::traits::size(p));
  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_hyb_prod");

  unsigned int thread_num = 256;
  unsigned int group_num = 128;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle4().opencl_handle(),
                           A.handle5().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.ell_nnz()),
                           cl_uint(A.internal_ellnnz()),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           r0star,
                           vec_size,
                           inner_prod_buffer, chunk_size, chunk_offset,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
  *
  * This routine computes for vectors 'r', 'v_k':
  *   Second reduction step for ||v_k||
  *   v_k /= ||v_k||
  *   First reduction step for <r, v_k>
  */
template <typename T>
void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
                                  vector_base<T> const & residual,
                                  vector_base<T> & R_buffer,
                                  vcl_size_t offset_in_R,
                                  vector_base<T> const & inner_prod_buffer,
                                  vector_base<T> & r_dot_vk_buffer,
                                  vcl_size_t buffer_chunk_size,
                                  vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(v_k).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_normalize_vk");

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  cl_uint size_vk      = cl_uint(v_k.size());
  cl_uint vk_offset    = cl_uint(viennacl::traits::start(v_k));
  cl_uint R_offset     = cl_uint(offset_in_R);
  cl_uint chunk_size   = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
  viennacl::ocl::enqueue(k(v_k, vk_offset,
                           residual,
                           R_buffer, R_offset,
                           inner_prod_buffer, chunk_size,
                           r_dot_vk_buffer, chunk_offset,
                           size_vk,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                          ));
}

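// Two-stage classical Gram-Schmidt for the pipelined GMRES implementation:
// stage 1 computes the partial inner products <v_i, v_k> of the new Krylov
// vector against the existing basis (written to vi_in_vk_buffer); stage 2
// finishes the reduction, subtracts the projections from v_k, and records the
// coefficients in R_buffer.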
template <typename T>
void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t param_k,
                                         vector_base<T> & vi_in_vk_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_1");

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  cl_uint size_vk          = cl_uint(v_k_size);
  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
  cl_uint ocl_k            = cl_uint(param_k);
  cl_uint chunk_size       = cl_uint(buffer_chunk_size);
  viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
                           vi_in_vk_buffer, chunk_size
                          ));
}

template <typename T>
void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t param_k,
                                         vector_base<T> const & vi_in_vk_buffer,
                                         vector_base<T> & R_buffer,
                                         vcl_size_t krylov_dim,
                                         vector_base<T> & inner_prod_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_2");

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  cl_uint size_vk          = cl_uint(v_k_size);
  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
  cl_uint ocl_k            = cl_uint(param_k);
  cl_uint chunk_size       = cl_uint(buffer_chunk_size);
  cl_uint ocl_krylov_dim   = cl_uint(krylov_dim);
  viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
                           vi_in_vk_buffer, chunk_size,
                           R_buffer, ocl_krylov_dim,
                           inner_prod_buffer,
                           viennacl::ocl::local_mem(7 * k.local_work_size() * sizeof(T))
                          ));
}

template <typename T>
void pipelined_gmres_update_result(vector_base<T> & result,
                                   vector_base<T> const & residual,
                                   vector_base<T> const & krylov_basis,
                                   vcl_size_t v_k_size,
                                   vcl_size_t v_k_internal_size,
                                   vector_base<T> const & coefficients,
                                   vcl_size_t param_k)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_update_result");

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  cl_uint size_vk          = cl_uint(v_k_size);
  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
  cl_uint ocl_k            = cl_uint(param_k);
  viennacl::ocl::enqueue(k(result,
                           residual,
                           krylov_basis, size_vk, internal_size_vk,
                           coefficients, ocl_k
                          ));
}

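// The pipelined_gmres_prod overloads mirror the CG versions above, but also
// pass explicit start offsets (start_p, start_Ap) obtained from
// viennacl::traits::start(), presumably because the Krylov vectors are stored
// as sub-ranges within larger buffers.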
template <typename T>
void pipelined_gmres_prod(compressed_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), use_nvidia_blocked ? "gmres_csr_blocked_prod" : "gmres_csr_prod");

  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
  cl_uint start_p                = cl_uint(viennacl::traits::start(p));
  cl_uint start_Ap               = cl_uint(viennacl::traits::start(Ap));

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*128);
  }

  if (use_nvidia_blocked)
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
                             p, start_p,
                             Ap, start_Ap,
                             vec_size,
                             inner_prod_buffer,
                             buffer_size_per_vector,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                            ));
  }
  else
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
                             p, start_p,
                             Ap, start_Ap,
                             vec_size,
                             inner_prod_buffer,
                             buffer_size_per_vector,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                             viennacl::ocl::local_mem(1024 * sizeof(T))
                            ));
  }
}

template <typename T>
void pipelined_gmres_prod(coordinate_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
  cl_uint start_p                = cl_uint(viennacl::traits::start(p));
  cl_uint start_Ap               = cl_uint(viennacl::traits::start(Ap));

  Ap.clear();
  inner_prod_buffer.clear();

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_coo_prod");
  unsigned int thread_num = 128; //k.local_work_size(0);

  k.local_work_size(0, thread_num);

  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases

  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
                           p, start_p,
                           Ap, start_Ap,
                           vec_size,
                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
                           viennacl::ocl::local_mem(sizeof(T)*thread_num),
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                          ));
}

template <typename T>
void pipelined_gmres_prod(ell_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
  cl_uint start_p                = cl_uint(viennacl::traits::start(p));
  cl_uint start_Ap               = cl_uint(viennacl::traits::start(Ap));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_ell_prod");

  unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
  unsigned int group_num = 128;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.maxnnz()),
                           cl_uint(A.internal_maxnnz()),
                           viennacl::traits::opencl_handle(p), start_p,
                           viennacl::traits::opencl_handle(Ap), start_Ap,
                           vec_size,
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                          )
                        );
}

template <typename T>
void pipelined_gmres_prod(sliced_ell_matrix<T, unsigned int> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
  cl_uint start_p                = cl_uint(viennacl::traits::start(p));
  cl_uint start_Ap               = cl_uint(viennacl::traits::start(Ap));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_sliced_ell_prod");

  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
  unsigned int group_num = 128;

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
    thread_num = 256;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
                           A.handle2().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle().opencl_handle(),
                           viennacl::traits::opencl_handle(p), start_p,
                           viennacl::traits::opencl_handle(Ap), start_Ap,
                           vec_size,
                           cl_uint(A.rows_per_block()),
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                          )
                        );
}

template <typename T>
void pipelined_gmres_prod(hyb_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  cl_uint vec_size               = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
  cl_uint start_p                = cl_uint(viennacl::traits::start(p));
  cl_uint start_Ap               = cl_uint(viennacl::traits::start(Ap));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_hyb_prod");

  unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
  unsigned int group_num = 128;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle4().opencl_handle(),
                           A.handle5().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.ell_nnz()),
                           cl_uint(A.internal_ellnnz()),
                           viennacl::traits::opencl_handle(p), start_p,
                           viennacl::traits::opencl_handle(Ap), start_Ap,
                           vec_size,
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                          )
                        );
}


} //namespace opencl
} //namespace linalg
} //namespace viennacl

#endif