ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
iterative_operations.hpp
#ifndef VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
#define VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file  viennacl/linalg/host_based/iterative_operations.hpp
    @brief Implementations of specialized routines for the iterative solvers (CG, BiCGStab, GMRES) using the host-based execution backend.
*/

#include <cmath>
#include <algorithm>  // for std::max and std::min

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/stride.hpp"
#include "viennacl/linalg/host_based/common.hpp"


// Minimum vector size for using OpenMP on vector operations:
#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
  #define VIENNACL_OPENMP_VECTOR_MIN_SIZE 5000
#endif

namespace viennacl
{
namespace linalg
{
namespace host_based
{

namespace detail
{

/** @brief Implementation of a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_prod_impl(compressed_matrix<NumericT> const & A,
                         vector_base<NumericT> const & p,
                         vector_base<NumericT> & Ap,
                         NumericT const * r0star,
                         vector_base<NumericT> & inner_prod_buffer,
                         vcl_size_t buffer_chunk_size,
                         vcl_size_t buffer_chunk_offset)
{
  typedef NumericT value_type;

  value_type       * Ap_buf      = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
  value_type const * p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
  value_type const * elements    = detail::extract_raw_pointer<value_type>(A.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  value_type inner_prod_ApAp = 0;
  value_type inner_prod_pAp = 0;
  value_type inner_prod_Ap_r0star = 0;
  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
  {
    value_type dot_prod = 0;
    value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; // likely to be loaded from cache if required again in this row

    vcl_size_t row_end = row_buffer[row+1];
    for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
      dot_prod += elements[i] * p_buf[col_buffer[i]];

    // update contributions for the inner products (Ap, Ap) and (p, Ap)
    Ap_buf[static_cast<vcl_size_t>(row)] = dot_prod;
    inner_prod_ApAp += dot_prod * dot_prod;
    inner_prod_pAp  += val_p_diag * dot_prod;
    inner_prod_Ap_r0star += r0star ? dot_prod * r0star[static_cast<vcl_size_t>(row)] : value_type(0);
  }

  data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
  data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
  if (r0star)
    data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
}
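
// Note on the reduction buffer shared by all pipelined_prod_impl() overloads below: the buffer is
// partitioned into chunks of length buffer_chunk_size. The partial sums for <Ap, Ap> go to chunk 1
// and those for <p, Ap> to chunk 2 (cf. the offsets used above); the <Ap, r0star> sums needed by
// BiCGStab are written at buffer_chunk_offset, and passing r0star == NULL (the CG case) skips them.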

/** @brief Implementation of a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_prod_impl(coordinate_matrix<NumericT> const & A,
                         vector_base<NumericT> const & p,
                         vector_base<NumericT> & Ap,
                         NumericT const * r0star,
                         vector_base<NumericT> & inner_prod_buffer,
                         vcl_size_t buffer_chunk_size,
                         vcl_size_t buffer_chunk_offset)
{
  typedef NumericT value_type;

  value_type       * Ap_buf       = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
  value_type const * p_buf        = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
  value_type const * elements     = detail::extract_raw_pointer<value_type>(A.handle());
  unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(A.handle12());
  value_type       * data_buffer  = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  // flush result buffer (cannot be expected to be zero)
  for (vcl_size_t i = 0; i < Ap.size(); ++i)
    Ap_buf[i] = 0;

  // matrix-vector product with a general COO format
  for (vcl_size_t i = 0; i < A.nnz(); ++i)
    Ap_buf[coord_buffer[2*i]] += elements[i] * p_buf[coord_buffer[2*i+1]];

  // computing the inner products (Ap, Ap) and (p, Ap):
  // Note: The COO format does not allow to inject the subsequent operations into the matrix-vector product,
  //       because the row and column ordering assumptions are too weak.
  value_type inner_prod_ApAp = 0;
  value_type inner_prod_pAp = 0;
  value_type inner_prod_Ap_r0star = 0;
  for (vcl_size_t i = 0; i < Ap.size(); ++i)
  {
    NumericT value_Ap = Ap_buf[i];
    NumericT value_p  = p_buf[i];

    inner_prod_ApAp += value_Ap * value_Ap;
    inner_prod_pAp  += value_Ap * value_p;
    inner_prod_Ap_r0star += r0star ? value_Ap * r0star[i] : value_type(0);
  }

  data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
  data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
  if (r0star)
    data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
}


/** @brief Implementation of a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_prod_impl(ell_matrix<NumericT> const & A,
                         vector_base<NumericT> const & p,
                         vector_base<NumericT> & Ap,
                         NumericT const * r0star,
                         vector_base<NumericT> & inner_prod_buffer,
                         vcl_size_t buffer_chunk_size,
                         vcl_size_t buffer_chunk_offset)
{
  typedef NumericT value_type;

  value_type       * Ap_buf      = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
  value_type const * p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
  value_type const * elements    = detail::extract_raw_pointer<value_type>(A.handle());
  unsigned int const * coords    = detail::extract_raw_pointer<unsigned int>(A.handle2());
  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  value_type inner_prod_ApAp = 0;
  value_type inner_prod_pAp = 0;
  value_type inner_prod_Ap_r0star = 0;
  for (vcl_size_t row = 0; row < A.size1(); ++row)
  {
    value_type sum = 0;
    value_type val_p_diag = p_buf[row]; // likely to be loaded from cache if required again in this row

    for (unsigned int item_id = 0; item_id < A.internal_maxnnz(); ++item_id)
    {
      vcl_size_t offset = row + item_id * A.internal_size1();
      value_type val = elements[offset];

      if (val)
        sum += p_buf[coords[offset]] * val;
    }

    Ap_buf[row] = sum;
    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += val_p_diag * sum;
    inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
  }

  data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
  data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
  if (r0star)
    data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
}


/** @brief Implementation of a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT, typename IndexT>
void pipelined_prod_impl(sliced_ell_matrix<NumericT, IndexT> const & A,
                         vector_base<NumericT> const & p,
                         vector_base<NumericT> & Ap,
                         NumericT const * r0star,
                         vector_base<NumericT> & inner_prod_buffer,
                         vcl_size_t buffer_chunk_size,
                         vcl_size_t buffer_chunk_offset)
{
  typedef NumericT value_type;

  value_type       * Ap_buf      = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
  value_type const * p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
  value_type const * elements    = detail::extract_raw_pointer<value_type>(A.handle());
  IndexT const * columns_per_block = detail::extract_raw_pointer<IndexT>(A.handle1());
  IndexT const * column_indices    = detail::extract_raw_pointer<IndexT>(A.handle2());
  IndexT const * block_start       = detail::extract_raw_pointer<IndexT>(A.handle3());
  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  vcl_size_t num_blocks = A.size1() / A.rows_per_block() + 1;
  std::vector<value_type> result_values(A.rows_per_block());

  value_type inner_prod_ApAp = 0;
  value_type inner_prod_pAp = 0;
  value_type inner_prod_Ap_r0star = 0;
  for (vcl_size_t block_idx = 0; block_idx < num_blocks; ++block_idx)
  {
    vcl_size_t current_columns_per_block = columns_per_block[block_idx];

    for (vcl_size_t i = 0; i < result_values.size(); ++i)
      result_values[i] = 0;

    for (IndexT column_entry_index = 0;
                column_entry_index < current_columns_per_block;
              ++column_entry_index)
    {
      vcl_size_t stride_start = block_start[block_idx] + column_entry_index * A.rows_per_block();
      // Note: This for-loop may be unrolled by hand to exploit vectorization.
      //       Careful benchmarking is recommended first, because the memory channels may already be saturated!
      for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
      {
        value_type val = elements[stride_start + row_in_block];

        result_values[row_in_block] += val ? p_buf[column_indices[stride_start + row_in_block]] * val : 0;
      }
    }

    vcl_size_t first_row_in_matrix = block_idx * A.rows_per_block();
    for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
    {
      vcl_size_t row = first_row_in_matrix + row_in_block;
      if (row < Ap.size())
      {
        value_type row_result = result_values[row_in_block];

        Ap_buf[row] = row_result;
        inner_prod_ApAp += row_result * row_result;
        inner_prod_pAp  += p_buf[row] * row_result;
        inner_prod_Ap_r0star += r0star ? row_result * r0star[row] : value_type(0);
      }
    }
  }

  data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
  data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
  if (r0star)
    data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
}
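
// Note: num_blocks above rounds up, so the last block may reach past the final matrix row; the guard
// 'row < Ap.size()' in the write-back loop keeps those padding rows from contributing.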

/** @brief Implementation of a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_prod_impl(hyb_matrix<NumericT> const & A,
                         vector_base<NumericT> const & p,
                         vector_base<NumericT> & Ap,
                         NumericT const * r0star,
                         vector_base<NumericT> & inner_prod_buffer,
                         vcl_size_t buffer_chunk_size,
                         vcl_size_t buffer_chunk_offset)
{
  typedef NumericT value_type;
  typedef unsigned int index_type;

  value_type       * Ap_buf       = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
  value_type const * p_buf        = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
  value_type const * elements     = detail::extract_raw_pointer<value_type>(A.handle());
  index_type const * coords       = detail::extract_raw_pointer<index_type>(A.handle2());
  value_type const * csr_elements = detail::extract_raw_pointer<value_type>(A.handle5());
  index_type const * csr_row_buffer = detail::extract_raw_pointer<index_type>(A.handle3());
  index_type const * csr_col_buffer = detail::extract_raw_pointer<index_type>(A.handle4());
  value_type       * data_buffer  = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  value_type inner_prod_ApAp = 0;
  value_type inner_prod_pAp = 0;
  value_type inner_prod_Ap_r0star = 0;
  for (vcl_size_t row = 0; row < A.size1(); ++row)
  {
    value_type val_p_diag = p_buf[row]; // likely to be loaded from cache if required again in this row
    value_type sum = 0;

    //
    // Part 1: Process ELL part
    //
    for (index_type item_id = 0; item_id < A.internal_ellnnz(); ++item_id)
    {
      vcl_size_t offset = row + item_id * A.internal_size1();
      value_type val = elements[offset];

      if (val)
        sum += p_buf[coords[offset]] * val;
    }

    //
    // Part 2: Process CSR part (the remainder of the HYB format not covered by the ELL part)
    //
    vcl_size_t col_begin = csr_row_buffer[row];
    vcl_size_t col_end   = csr_row_buffer[row + 1];

    for (vcl_size_t item_id = col_begin; item_id < col_end; ++item_id)
      sum += p_buf[csr_col_buffer[item_id]] * csr_elements[item_id];

    Ap_buf[row] = sum;
    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += val_p_diag * sum;
    inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
  }

  data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
  data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
  if (r0star)
    data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
}

} // namespace detail


/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm:
  *        result += alpha * p;  r -= alpha * Ap;  p = r + beta * p;
  *        and computes the first reduction stage for the inner product <r, r>.
  */
template<typename NumericT>
void pipelined_cg_vector_update(vector_base<NumericT> & result,
                                NumericT alpha,
                                vector_base<NumericT> & p,
                                vector_base<NumericT> & r,
                                vector_base<NumericT> const & Ap,
                                NumericT beta,
                                vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT value_type;

  value_type       * data_result = detail::extract_raw_pointer<value_type>(result);
  value_type       * data_p      = detail::extract_raw_pointer<value_type>(p);
  value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
  value_type const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  // Note: Due to the special setting in CG, there is no need to check for sizes and strides
  vcl_size_t size = viennacl::traits::size(result);

  value_type inner_prod_r = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    value_type value_p = data_p[static_cast<vcl_size_t>(i)];
    value_type value_r = data_r[static_cast<vcl_size_t>(i)];

    data_result[static_cast<vcl_size_t>(i)] += alpha * value_p;
    value_r -= alpha * data_Ap[static_cast<vcl_size_t>(i)];
    value_p  = value_r + beta * value_p;
    inner_prod_r += value_r * value_r;

    data_p[static_cast<vcl_size_t>(i)] = value_p;
    data_r[static_cast<vcl_size_t>(i)] = value_r;
  }

  data_buffer[0] = inner_prod_r;
}
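
// A minimal sketch (illustration only, not part of this header) of how the two CG kernels compose into
// one pipelined iteration. The names x, r, p, Ap, buf and chunk_size, and the host-side recovery of
// alpha and beta from the reduction buffer, are assumptions of this example rather than fixed API:
//
//   viennacl::vector<double> buf(3 * chunk_size);               // chunks: <r,r>, <Ap,Ap>, <p,Ap>
//   pipelined_cg_prod(A, p, Ap, buf);                           // Ap = A*p, plus partial reductions
//   // ... sum each chunk of buf on the host to obtain <r,r>, <Ap,Ap> and <p,Ap>,
//   //     then alpha = <r,r> / <p,Ap>, and beta from the updated <r,r> ...
//   pipelined_cg_vector_update(x, alpha, p, r, Ap, beta, buf);  // updates x, r, p; reduces <r,r>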

/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}
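
// Note: inner_prod_buffer.size() / 3 reflects the three-chunk layout (<r,r>, <Ap,Ap>, <p,Ap>) used by
// the CG kernels; the NULL r0star pointer disables the BiCGStab-only <Ap, r0star> reduction.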

/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}

/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(ell_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}

/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT, typename IndexT>
void pipelined_cg_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}

/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm:
  *        s = r - alpha * Ap, with alpha obtained from the reduction buffer, plus the first reduction stage for <s, s>.
  */
template<typename NumericT>
void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
                                 vector_base<NumericT> & r,
                                 vector_base<NumericT> const & Ap,
                                 vector_base<NumericT> & inner_prod_buffer,
                                 vcl_size_t buffer_chunk_size,
                                 vcl_size_t buffer_chunk_offset)
{
  typedef NumericT value_type;

  value_type       * data_s      = detail::extract_raw_pointer<value_type>(s);
  value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
  value_type const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  // Note: Due to the special setting in BiCGStab, there is no need to check for sizes and strides
  vcl_size_t size = viennacl::traits::size(s);
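
  // Buffer layout assumed by the reads below: chunk 0 holds the partial results of <r, r0star>,
  // chunk 3 those of <Ap, r0star> (written by pipelined_bicgstab_prod() with a suitable buffer_chunk_offset).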

  // part 1: compute alpha from the two finished reductions:
  value_type r_in_r0 = 0;
  value_type Ap_in_r0 = 0;
  for (vcl_size_t i=0; i<buffer_chunk_size; ++i)
  {
    r_in_r0  += data_buffer[i];
    Ap_in_r0 += data_buffer[i + 3 * buffer_chunk_size];
  }
  value_type alpha = r_in_r0 / Ap_in_r0;

  // part 2: s = r - alpha * Ap and first reduction stage for <s, s>:
  value_type inner_prod_s = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    value_type value_s = data_r[static_cast<vcl_size_t>(i)] - alpha * data_Ap[static_cast<vcl_size_t>(i)];

    inner_prod_s += value_s * value_s;

    data_s[static_cast<vcl_size_t>(i)] = value_s;
  }

  data_buffer[buffer_chunk_offset] = inner_prod_s;
}

/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm:
  *        result += alpha * p + omega * s;  residual = s - omega * As;  p = residual + beta * (p - omega * Ap);
  *        and computes the first reduction stage for the inner product <residual, r0star>.
  */
template<typename NumericT>
void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
                                      vector_base<NumericT> & residual, vector_base<NumericT> const & As,
                                      NumericT beta, vector_base<NumericT> const & Ap,
                                      vector_base<NumericT> const & r0star,
                                      vector_base<NumericT> & inner_prod_buffer,
                                      vcl_size_t buffer_chunk_size)
{
  typedef NumericT value_type;

  value_type       * data_result   = detail::extract_raw_pointer<value_type>(result);
  value_type       * data_p        = detail::extract_raw_pointer<value_type>(p);
  value_type const * data_s        = detail::extract_raw_pointer<value_type>(s);
  value_type       * data_residual = detail::extract_raw_pointer<value_type>(residual);
  value_type const * data_As       = detail::extract_raw_pointer<value_type>(As);
  value_type const * data_Ap       = detail::extract_raw_pointer<value_type>(Ap);
  value_type const * data_r0star   = detail::extract_raw_pointer<value_type>(r0star);
  value_type       * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  vcl_size_t size = viennacl::traits::size(result);

  value_type inner_prod_r_r0star = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    vcl_size_t index = static_cast<vcl_size_t>(i);
    value_type value_result   = data_result[index];
    value_type value_p        = data_p[index];
    value_type value_s        = data_s[index];
    value_type value_residual = data_residual[index];
    value_type value_As       = data_As[index];
    value_type value_Ap       = data_Ap[index];
    value_type value_r0star   = data_r0star[index];

    value_result  += alpha * value_p + omega * value_s;
    value_residual = value_s - omega * value_As;
    value_p        = value_residual + beta * (value_p - omega * value_Ap);
    inner_prod_r_r0star += value_residual * value_r0star;

    data_result[index]   = value_result;
    data_residual[index] = value_residual;
    data_p[index]        = value_p;
  }

  (void)buffer_chunk_size; // not needed here, just silence compiler warning (unused variable)
  data_buffer[0] = inner_prod_r_r0star;
}

/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT, typename IndexT>
void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}


/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm. */
template<typename T>
void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
                                  vector_base<T> const & residual,
                                  vector_base<T> & R_buffer,
                                  vcl_size_t offset_in_R,
                                  vector_base<T> const & inner_prod_buffer,
                                  vector_base<T> & r_dot_vk_buffer,
                                  vcl_size_t buffer_chunk_size,
                                  vcl_size_t buffer_chunk_offset)
{
  typedef T value_type;

  value_type       * data_v_k      = detail::extract_raw_pointer<value_type>(v_k);
  value_type const * data_residual = detail::extract_raw_pointer<value_type>(residual);
  value_type       * data_R        = detail::extract_raw_pointer<value_type>(R_buffer);
  value_type const * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
  value_type       * data_r_dot_vk = detail::extract_raw_pointer<value_type>(r_dot_vk_buffer);

  // Note: Due to the special setting in GMRES, there is no need to check for sizes and strides
  vcl_size_t size     = viennacl::traits::size(v_k);
  vcl_size_t vk_start = viennacl::traits::start(v_k);

  // part 1: compute the norm of v_k from the partial results in chunk 1:
  value_type norm_vk = 0;
  for (vcl_size_t i=0; i<buffer_chunk_size; ++i)
    norm_vk += data_buffer[i + buffer_chunk_size];
  norm_vk = std::sqrt(norm_vk);
  data_R[offset_in_R] = norm_vk;

  // part 2: compute <r, v_k> after normalization of v_k:
  value_type inner_prod_r_dot_vk = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    value_type value_vk = data_v_k[static_cast<vcl_size_t>(i) + vk_start] / norm_vk;

    inner_prod_r_dot_vk += data_residual[static_cast<vcl_size_t>(i)] * value_vk;

    data_v_k[static_cast<vcl_size_t>(i) + vk_start] = value_vk;
  }

  data_r_dot_vk[buffer_chunk_offset] = inner_prod_r_dot_vk;
}
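
// Note: the partial results for ||v_k||^2 consumed in 'part 1' above live in chunk 1 of the reduction
// buffer because pipelined_gmres_prod() at the end of this file reuses pipelined_cg_prod(), which
// writes the partial sums of <Ap, Ap> (with Ap = v_k) at offset buffer_chunk_size.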

/** @brief Computes the first reduction stage for multiple inner products <v_i, v_k>, i = 0..k-1. */
template<typename T>
void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t k,
                                         vector_base<T> & vi_in_vk_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  typedef T value_type;

  value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
  value_type       * data_inner_prod   = detail::extract_raw_pointer<value_type>(vi_in_vk_buffer);

  // reset buffer:
  for (vcl_size_t j = 0; j < k; ++j)
    data_inner_prod[j * buffer_chunk_size] = value_type(0);

  // compute inner products:
  for (vcl_size_t i = 0; i < v_k_size; ++i)
  {
    value_type value_vk = data_krylov_basis[i + k * v_k_internal_size];

    for (vcl_size_t j = 0; j < k; ++j)
      data_inner_prod[j * buffer_chunk_size] += data_krylov_basis[i + j * v_k_internal_size] * value_vk;
  }
}


/** @brief Computes the second reduction stage for multiple inner products <v_i, v_k>, i = 0..k-1,
  *        then updates v_k -= <v_i, v_k> v_i and computes the first reduction stage for ||v_k||.
  */
template<typename T>
void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t k,
                                         vector_base<T> const & vi_in_vk_buffer,
                                         vector_base<T> & R_buffer,
                                         vcl_size_t krylov_dim,
                                         vector_base<T> & inner_prod_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  typedef T value_type;

  value_type * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);

  std::vector<T> values_vi_in_vk(k);

  // Step 1: Finish the reduction of <v_i, v_k> to obtain scalars:
  for (std::size_t i=0; i<k; ++i)
    for (vcl_size_t j=0; j<buffer_chunk_size; ++j)
      values_vi_in_vk[i] += vi_in_vk_buffer[i * buffer_chunk_size + j];

  // Step 2: Compute v_k -= <v_i, v_k> v_i and the first reduction stage for ||v_k||:
  value_type norm_vk = 0;
  for (vcl_size_t i = 0; i < v_k_size; ++i)
  {
    value_type value_vk = data_krylov_basis[i + k * v_k_internal_size];

    for (vcl_size_t j = 0; j < k; ++j)
      value_vk -= values_vi_in_vk[j] * data_krylov_basis[i + j * v_k_internal_size];

    norm_vk += value_vk * value_vk;
    data_krylov_basis[i + k * v_k_internal_size] = value_vk;
  }

  // Step 3: Write values to R_buffer:
  for (std::size_t i=0; i<k; ++i)
    R_buffer[i + k * krylov_dim] = values_vi_in_vk[i];

  inner_prod_buffer[buffer_chunk_size] = norm_vk;
}
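
// Note: Step 2 above stores the *squared* norm ||v_k||^2 in inner_prod_buffer[buffer_chunk_size];
// pipelined_gmres_normalize_vk() sums the entries of that chunk and takes the square root.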

/** @brief Computes x += eta_0 * r + sum_{i=1}^{k-1} eta_i * v_{i-1}, i.e. the solution update from the residual and the Krylov vectors. */
template<typename T>
void pipelined_gmres_update_result(vector_base<T> & result,
                                   vector_base<T> const & residual,
                                   vector_base<T> const & krylov_basis,
                                   vcl_size_t v_k_size,
                                   vcl_size_t v_k_internal_size,
                                   vector_base<T> const & coefficients,
                                   vcl_size_t k)
{
  typedef T value_type;

  value_type       * data_result       = detail::extract_raw_pointer<value_type>(result);
  value_type const * data_residual     = detail::extract_raw_pointer<value_type>(residual);
  value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(krylov_basis);
  value_type const * data_coefficients = detail::extract_raw_pointer<value_type>(coefficients);

  for (vcl_size_t i = 0; i < v_k_size; ++i)
  {
    value_type value_result = data_result[i];

    value_result += data_coefficients[0] * data_residual[i];
    for (vcl_size_t j = 1; j < k; ++j)
      value_result += data_coefficients[j] * data_krylov_basis[i + (j-1) * v_k_internal_size];

    data_result[i] = value_result;
  }
}

// Reuse the matrix-vector product implementations from CG:
template<typename MatrixType, typename T>
void pipelined_gmres_prod(MatrixType const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
}


} //namespace host_based
} //namespace linalg
} //namespace viennacl


#endif