1 #ifndef VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
41 #ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
42 #define VIENNACL_OPENMP_VECTOR_MIN_SIZE 5000
60 template<
typename NumericT>
73 value_type
const * elements = detail::extract_raw_pointer<value_type>(A.
handle());
74 unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(A.
handle1());
75 unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(A.
handle2());
76 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
78 value_type inner_prod_ApAp = 0;
79 value_type inner_prod_pAp = 0;
80 value_type inner_prod_Ap_r0star = 0;
81 for (
long row = 0; row < static_cast<long>(A.
size1()); ++
row)
88 dot_prod += elements[i] * p_buf[col_buffer[i]];
92 inner_prod_ApAp += dot_prod *
dot_prod;
93 inner_prod_pAp += val_p_diag *
dot_prod;
94 inner_prod_Ap_r0star += r0star ? dot_prod * r0star[
static_cast<vcl_size_t>(
row)] : value_type(0);
97 data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
98 data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
100 data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
111 template<
typename NumericT>
124 value_type
const * elements = detail::extract_raw_pointer<value_type>(A.
handle());
125 unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(A.
handle12());
126 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
134 Ap_buf[coord_buffer[2*i]] += elements[i] * p_buf[coord_buffer[2*i+1]];
138 value_type inner_prod_ApAp = 0;
139 value_type inner_prod_pAp = 0;
140 value_type inner_prod_Ap_r0star = 0;
146 inner_prod_ApAp += value_Ap * value_Ap;
147 inner_prod_pAp += value_Ap * value_p;
148 inner_prod_Ap_r0star += r0star ? value_Ap * r0star[i] : value_type(0);
151 data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
152 data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
154 data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
164 template<
typename NumericT>
177 value_type
const * elements = detail::extract_raw_pointer<value_type>(A.
handle());
178 unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(A.
handle2());
179 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
181 value_type inner_prod_ApAp = 0;
182 value_type inner_prod_pAp = 0;
183 value_type inner_prod_Ap_r0star = 0;
189 for (
unsigned int item_id = 0; item_id < A.
internal_maxnnz(); ++item_id)
192 value_type val = elements[offset];
195 sum += (p_buf[coords[offset]] * val);
199 inner_prod_ApAp += sum *
sum;
200 inner_prod_pAp += val_p_diag *
sum;
201 inner_prod_Ap_r0star += r0star ? sum * r0star[
row] : value_type(0);
204 data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
205 data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
207 data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
217 template<
typename NumericT,
typename IndexT>
230 value_type
const * elements = detail::extract_raw_pointer<value_type>(A.
handle());
231 IndexT
const * columns_per_block = detail::extract_raw_pointer<IndexT>(A.
handle1());
232 IndexT
const * column_indices = detail::extract_raw_pointer<IndexT>(A.
handle2());
233 IndexT
const * block_start = detail::extract_raw_pointer<IndexT>(A.
handle3());
234 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
239 value_type inner_prod_ApAp = 0;
240 value_type inner_prod_pAp = 0;
241 value_type inner_prod_Ap_r0star = 0;
242 for (
vcl_size_t block_idx = 0; block_idx < num_blocks; ++block_idx)
244 vcl_size_t current_columns_per_block = columns_per_block[block_idx];
246 for (
vcl_size_t i=0; i<result_values.size(); ++i)
247 result_values[i] = 0;
249 for (IndexT column_entry_index = 0;
250 column_entry_index < current_columns_per_block;
251 ++column_entry_index)
256 for (IndexT row_in_block = 0; row_in_block < A.
rows_per_block(); ++row_in_block)
258 value_type val = elements[stride_start + row_in_block];
260 result_values[row_in_block] += val ? p_buf[column_indices[stride_start + row_in_block]] * val : 0;
265 for (IndexT row_in_block = 0; row_in_block < A.
rows_per_block(); ++row_in_block)
270 value_type row_result = result_values[row_in_block];
272 Ap_buf[
row] = row_result;
273 inner_prod_ApAp += row_result * row_result;
274 inner_prod_pAp += p_buf[
row] * row_result;
275 inner_prod_Ap_r0star += r0star ? row_result * r0star[
row] : value_type(0);
280 data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
281 data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
283 data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
293 template<
typename NumericT>
303 typedef unsigned int index_type;
307 value_type
const * elements = detail::extract_raw_pointer<value_type>(A.
handle());
308 index_type
const * coords = detail::extract_raw_pointer<index_type>(A.
handle2());
309 value_type
const * csr_elements = detail::extract_raw_pointer<value_type>(A.
handle5());
310 index_type
const * csr_row_buffer = detail::extract_raw_pointer<index_type>(A.
handle3());
311 index_type
const * csr_col_buffer = detail::extract_raw_pointer<index_type>(A.
handle4());
312 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
314 value_type inner_prod_ApAp = 0;
315 value_type inner_prod_pAp = 0;
316 value_type inner_prod_Ap_r0star = 0;
328 value_type val = elements[offset];
331 sum += p_buf[coords[offset]] * val;
340 for (
vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
341 sum += p_buf[csr_col_buffer[item_id]] * csr_elements[item_id];
344 inner_prod_ApAp += sum *
sum;
345 inner_prod_pAp += val_p_diag *
sum;
346 inner_prod_Ap_r0star += r0star ? sum * r0star[
row] : value_type(0);
349 data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
350 data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
352 data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
366 template<
typename NumericT>
377 value_type * data_result = detail::extract_raw_pointer<value_type>(result);
378 value_type * data_p = detail::extract_raw_pointer<value_type>(p);
379 value_type * data_r = detail::extract_raw_pointer<value_type>(r);
380 value_type
const * data_Ap = detail::extract_raw_pointer<value_type>(Ap);
381 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
386 value_type inner_prod_r = 0;
387 for (
long i = 0; i < static_cast<long>(
size); ++i)
389 value_type value_p = data_p[
static_cast<vcl_size_t>(i)];
390 value_type value_r = data_r[
static_cast<vcl_size_t>(i)];
393 data_result[
static_cast<vcl_size_t>(i)] += alpha * value_p;
394 value_r -= alpha * data_Ap[
static_cast<vcl_size_t>(i)];
395 value_p = value_r + beta * value_p;
396 inner_prod_r += value_r * value_r;
402 data_buffer[0] = inner_prod_r;
412 template<
typename NumericT>
430 template<
typename NumericT>
447 template<
typename NumericT>
464 template<
typename NumericT,
typename IndexT>
483 template<
typename NumericT>
503 template<
typename NumericT>
513 value_type * data_s = detail::extract_raw_pointer<value_type>(s);
514 value_type * data_r = detail::extract_raw_pointer<value_type>(r);
515 value_type
const * data_Ap = detail::extract_raw_pointer<value_type>(Ap);
516 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
522 value_type r_in_r0 = 0;
523 value_type Ap_in_r0 = 0;
524 for (
vcl_size_t i=0; i<buffer_chunk_size; ++i)
526 r_in_r0 += data_buffer[i];
527 Ap_in_r0 += data_buffer[i + 3 * buffer_chunk_size];
529 value_type alpha = r_in_r0 / Ap_in_r0;
532 value_type inner_prod_s = 0;
533 for (
long i = 0; i < static_cast<long>(
size); ++i)
535 value_type value_s = data_s[
static_cast<vcl_size_t>(i)];
537 value_s = data_r[
static_cast<vcl_size_t>(i)] - alpha * data_Ap[static_cast<vcl_size_t>(i)];
538 inner_prod_s += value_s * value_s;
543 data_buffer[buffer_chunk_offset] = inner_prod_s;
553 template<
typename NumericT>
563 value_type * data_result = detail::extract_raw_pointer<value_type>(result);
564 value_type * data_p = detail::extract_raw_pointer<value_type>(p);
565 value_type
const * data_s = detail::extract_raw_pointer<value_type>(s);
566 value_type * data_residual = detail::extract_raw_pointer<value_type>(residual);
567 value_type
const * data_As = detail::extract_raw_pointer<value_type>(As);
568 value_type
const * data_Ap = detail::extract_raw_pointer<value_type>(Ap);
569 value_type
const * data_r0star = detail::extract_raw_pointer<value_type>(r0star);
570 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
574 value_type inner_prod_r_r0star = 0;
575 for (
long i = 0; i < static_cast<long>(
size); ++i)
578 value_type value_result = data_result[index];
579 value_type value_p = data_p[index];
580 value_type value_s = data_s[index];
581 value_type value_residual = data_residual[index];
582 value_type value_As = data_As[index];
583 value_type value_Ap = data_Ap[index];
584 value_type value_r0star = data_r0star[index];
586 value_result += alpha * value_p + omega * value_s;
587 value_residual = value_s - omega * value_As;
588 value_p = value_residual + beta * (value_p - omega * value_Ap);
589 inner_prod_r_r0star += value_residual * value_r0star;
591 data_result[index] = value_result;
592 data_residual[index] = value_residual;
593 data_p[index] = value_p;
596 (void)buffer_chunk_size;
597 data_buffer[0] = inner_prod_r_r0star;
606 template<
typename NumericT>
615 NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
626 template<
typename NumericT>
635 NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
646 template<
typename NumericT>
655 NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
666 template<
typename NumericT,
typename IndexT>
675 NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
686 template<
typename NumericT>
695 NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
710 template <
typename T>
720 typedef T value_type;
722 value_type * data_v_k = detail::extract_raw_pointer<value_type>(v_k);
723 value_type
const * data_residual = detail::extract_raw_pointer<value_type>(residual);
724 value_type * data_R = detail::extract_raw_pointer<value_type>(R_buffer);
725 value_type
const * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
726 value_type * data_r_dot_vk = detail::extract_raw_pointer<value_type>(r_dot_vk_buffer);
733 value_type norm_vk = 0;
734 for (
vcl_size_t i=0; i<buffer_chunk_size; ++i)
735 norm_vk += data_buffer[i + buffer_chunk_size];
736 norm_vk = std::sqrt(norm_vk);
737 data_R[offset_in_R] = norm_vk;
740 value_type inner_prod_r_dot_vk = 0;
741 for (
long i = 0; i < static_cast<long>(
size); ++i)
743 value_type value_vk = data_v_k[
static_cast<vcl_size_t>(i) + vk_start] / norm_vk;
745 inner_prod_r_dot_vk += data_residual[
static_cast<vcl_size_t>(i)] * value_vk;
747 data_v_k[
static_cast<vcl_size_t>(i) + vk_start] = value_vk;
750 data_r_dot_vk[buffer_chunk_offset] = inner_prod_r_dot_vk;
759 template <
typename T>
767 typedef T value_type;
769 value_type
const * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
770 value_type * data_inner_prod = detail::extract_raw_pointer<value_type>(vi_in_vk_buffer);
774 data_inner_prod[j*buffer_chunk_size] = value_type(0);
779 value_type value_vk = data_krylov_basis[
static_cast<vcl_size_t>(i) + k * v_k_internal_size];
782 data_inner_prod[j*buffer_chunk_size] += data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size] * value_vk;
791 template <
typename T>
802 typedef T value_type;
804 value_type * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
806 std::vector<T> values_vi_in_vk(k);
809 for (std::size_t i=0; i<k; ++i)
810 for (
vcl_size_t j=0; j<buffer_chunk_size; ++j)
811 values_vi_in_vk[i] += vi_in_vk_buffer[i*buffer_chunk_size + j];
815 value_type norm_vk = 0;
818 value_type value_vk = data_krylov_basis[
static_cast<vcl_size_t>(i) + k * v_k_internal_size];
821 value_vk -= values_vi_in_vk[j] * data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size];
823 norm_vk += value_vk * value_vk;
824 data_krylov_basis[
static_cast<vcl_size_t>(i) + k * v_k_internal_size] = value_vk;
828 for (std::size_t i=0; i<k; ++i)
829 R_buffer[i + k * krylov_dim] = values_vi_in_vk[i];
831 inner_prod_buffer[buffer_chunk_size] = norm_vk;
835 template <
typename T>
844 typedef T value_type;
846 value_type * data_result = detail::extract_raw_pointer<value_type>(result);
847 value_type
const * data_residual = detail::extract_raw_pointer<value_type>(residual);
848 value_type
const * data_krylov_basis = detail::extract_raw_pointer<value_type>(krylov_basis);
849 value_type
const * data_coefficients = detail::extract_raw_pointer<value_type>(coefficients);
853 value_type value_result = data_result[i];
855 value_result += data_coefficients[0] * data_residual[i];
857 value_result += data_coefficients[j] * data_krylov_basis[i + (j-1) * v_k_internal_size];
859 data_result[i] = value_result;
865 template <
typename MatrixType,
typename T>
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
void pipelined_gmres_normalize_vk(vector_base< T > &v_k, vector_base< T > const &residual, vector_base< T > &R_buffer, vcl_size_t offset_in_R, vector_base< T > const &inner_prod_buffer, vector_base< T > &r_dot_vk_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
viennacl::scalar_expression< const viennacl::vector_base< NumericT >, const viennacl::vector_base< NumericT >, viennacl::op_sum > sum(viennacl::vector_base< NumericT > const &x)
User interface function for computing the sum of all elements of a vector.
const handle_type & handle4() const
Generic size and resize functionality for different vector and matrix types.
const vcl_size_t & size1() const
Returns the number of rows.
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
void pipelined_cg_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, NumericT beta, vector_base< NumericT > &inner_prod_buffer)
Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
void pipelined_gmres_gram_schmidt_stage2(vector_base< T > &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t k, vector_base< T > const &vi_in_vk_buffer, vector_base< T > &R_buffer, vcl_size_t krylov_dim, vector_base< T > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
Computes the second reduction stage for multiple inner products (v_i, v_k), i=0..k-1, then updates v_k -= (v_i, v_k) v_i and computes the first reduction stage for ||v_k||.
vcl_size_t internal_ellnnz() const
This file provides the forward declarations for the main types used within ViennaCL.
vcl_size_t nnz() const
Returns the number of nonzero entries.
Determines row and column increments for matrices and matrix proxies.
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
vcl_size_t rows_per_block() const
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
void pipelined_bicgstab_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined BiCGStab a...
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
vcl_size_t internal_size1() const
void pipelined_bicgstab_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, NumericT omega, vector_base< NumericT > const &s, vector_base< NumericT > &residual, vector_base< NumericT > const &As, NumericT beta, vector_base< NumericT > const &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm...
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Sparse matrix class using the ELLPACK format for storing the nonzeros.
const handle_type & handle2() const
vcl_size_t internal_size1() const
Sparse matrix class using the sliced ELLPACK with parameters C, sigma.
result_of::size_type< T >::type start(T const &obj)
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
Common routines for single-threaded or OpenMP-enabled execution on CPU.
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
void pipelined_cg_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > &inner_prod_buffer)
Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorit...
void pipelined_prod_impl(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, NumericT const *r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Implementation of a fused matrix-vector product with a compressed_matrix for an efficient pipelined C...
void pipelined_bicgstab_update_s(vector_base< NumericT > &s, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm...
size_type size() const
Returns the length of the vector (cf. std::vector)
void pipelined_gmres_update_result(vector_base< T > &result, vector_base< T > const &residual, vector_base< T > const &krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vector_base< T > const &coefficients, vcl_size_t k)
Computes x += eta_0 r + sum_{i=1}^{k-1} eta_i v_{i-1}.
const handle_type & handle() const
vcl_size_t internal_maxnnz() const
Defines the action of certain unary and binary operators and its arguments (for host execution)...
void pipelined_gmres_gram_schmidt_stage1(vector_base< T > const &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t k, vector_base< T > &vi_in_vk_buffer, vcl_size_t buffer_chunk_size)
Computes the first reduction stage for multiple inner products (v_i, v_k), i=0..k-1.
Implementation of the ViennaCL scalar class.
const handle_type & handle() const
Returns the memory handle.
void pipelined_gmres_prod(MatrixType const &A, vector_base< T > const &p, vector_base< T > &Ap, vector_base< T > &inner_prod_buffer)
const handle_type & handle3() const
Simple enable-if variant that uses the SFINAE pattern.
const handle_type & handle5() const
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...