ViennaCL - The Vienna Computing Library 1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
sparse_matrix_operations.hpp
#ifndef VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/host_based/sparse_matrix_operations.hpp
    @brief Implementations of operations using sparse matrices on the CPU using a single thread or OpenMP.
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/host_based/common.hpp"
#include "viennacl/linalg/host_based/vector_operations.hpp"

#include "viennacl/linalg/host_based/spgemm_vector.hpp"

#include <vector>

#ifdef VIENNACL_WITH_OPENMP
#include <omp.h>
#endif
namespace viennacl
{
namespace linalg
{
namespace host_based
{
//
// Compressed matrix
//

namespace detail
{
  template<typename NumericT, unsigned int AlignmentV>
  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
    NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
    unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
    unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());

    for (vcl_size_t row = 0; row < mat.size1(); ++row)
    {
      NumericT value = 0;
      unsigned int row_end = row_buffer[row+1];

      switch (info_selector)
      {
      case viennacl::linalg::detail::SPARSE_ROW_NORM_INF: //inf-norm
        for (unsigned int i = row_buffer[row]; i < row_end; ++i)
          value = std::max<NumericT>(value, std::fabs(elements[i]));
        break;

      case viennacl::linalg::detail::SPARSE_ROW_NORM_1: //1-norm
        for (unsigned int i = row_buffer[row]; i < row_end; ++i)
          value += std::fabs(elements[i]);
        break;

      case viennacl::linalg::detail::SPARSE_ROW_NORM_2: //2-norm
        for (unsigned int i = row_buffer[row]; i < row_end; ++i)
          value += elements[i] * elements[i];
        value = std::sqrt(value);
        break;

      case viennacl::linalg::detail::SPARSE_ROW_DIAGONAL: //diagonal entry
        for (unsigned int i = row_buffer[row]; i < row_end; ++i)
        {
          if (col_buffer[i] == row)
          {
            value = elements[i];
            break;
          }
        }
        break;
      }
      result_buf[row] = value;
    }
  }
}

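// A minimal usage sketch: row_info() above is an internal helper and is not
// called directly. It is exercised, for example, when a row-scaling
// preconditioner queries per-row norms of a CSR matrix. Assumes a filled
// n x n matrix A:
//
//   viennacl::compressed_matrix<double> A(n, n);
//   viennacl::linalg::row_scaling_tag tag(2);  // 2: scale by row-wise 2-norms
//   viennacl::linalg::row_scaling< viennacl::compressed_matrix<double> > precond(A, tag);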

/** @brief Carries out matrix-vector multiplication with a compressed_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long row = 0; row < static_cast<long>(mat.size1()); ++row)
  {
    NumericT dot_prod = 0;
    vcl_size_t row_end = row_buffer[row+1];
    for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
      dot_prod += elements[i] * vec_buf[col_buffer[i] * vec.stride() + vec.start()];
    result_buf[static_cast<vcl_size_t>(row) * result.stride() + result.start()] = dot_prod;
  }

}

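// A minimal usage sketch, assuming a filled n x n matrix A and vector x: the
// public entry point viennacl::linalg::prod() dispatches to the prod_impl()
// above when the host backend is active.
//
//   viennacl::compressed_matrix<double> A(n, n);
//   viennacl::vector<double> x(n), y(n);
//   y = viennacl::linalg::prod(A, x);   // y = A * x, CSR kernel above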
/** @brief Carries out sparse-matrix times dense-matrix multiplication, the sparse factor being a compressed_matrix
*
* Implementation of the convenience expression result = prod(sp_mat, d_mat);
*
* @param sp_mat   The sparse matrix (CSR format)
* @param d_mat    The dense matrix
* @param result   The result matrix (dense)
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result) {

  NumericT     const * sp_mat_elements   = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
  unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);

  vcl_size_t result_start1 = viennacl::traits::start1(result);
  vcl_size_t result_start2 = viennacl::traits::start2(result);
  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);

  detail::matrix_array_wrapper<NumericT const, row_major, false>
      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  detail::matrix_array_wrapper<NumericT const, column_major, false>
      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);

  detail::matrix_array_wrapper<NumericT, row_major, false>
      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  detail::matrix_array_wrapper<NumericT, column_major, false>
      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.row_major() ) {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
      vcl_size_t row_start = sp_mat_row_buffer[row];
      vcl_size_t row_end   = sp_mat_row_buffer[row+1];
      for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
        NumericT temp = 0;
        for (vcl_size_t k = row_start; k < row_end; ++k) {
          temp += sp_mat_elements[k] * d_mat_wrapper_row(static_cast<vcl_size_t>(sp_mat_col_buffer[k]), col);
        }
        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
      }
    }
  }
  else {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
      for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
        vcl_size_t row_start = sp_mat_row_buffer[row];
        vcl_size_t row_end   = sp_mat_row_buffer[row+1];
        NumericT temp = 0;
        for (vcl_size_t k = row_start; k < row_end; ++k) {
          temp += sp_mat_elements[k] * d_mat_wrapper_col(static_cast<vcl_size_t>(sp_mat_col_buffer[k]), static_cast<vcl_size_t>(col));
        }
        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
      }
    }
  }

}

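// A minimal usage sketch, assuming a filled sparse A of size m x k and a
// dense B of size k x n:
//
//   viennacl::compressed_matrix<double> A(m, k);
//   viennacl::matrix<double> B(k, n), C(m, n);
//   C = viennacl::linalg::prod(A, B);   // sparse-times-dense, kernel above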
/** @brief Carries out sparse-matrix times transposed dense-matrix multiplication, the sparse factor being a compressed_matrix
*
* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
*
* @param sp_mat   The sparse matrix (CSR format)
* @param d_mat    The transposed dense matrix proxy
* @param result   The result matrix (dense)
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
                     viennacl::matrix_base<NumericT> & result) {

  NumericT     const * sp_mat_elements   = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
  unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());

  vcl_size_t result_start1 = viennacl::traits::start1(result);
  vcl_size_t result_start2 = viennacl::traits::start2(result);
  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);

  detail::matrix_array_wrapper<NumericT const, row_major, false>
      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  detail::matrix_array_wrapper<NumericT const, column_major, false>
      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);

  detail::matrix_array_wrapper<NumericT, row_major, false>
      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  detail::matrix_array_wrapper<NumericT, column_major, false>
      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.lhs().row_major() ) {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
      vcl_size_t row_start = sp_mat_row_buffer[row];
      vcl_size_t row_end   = sp_mat_row_buffer[row+1];
      for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
        NumericT temp = 0;
        for (vcl_size_t k = row_start; k < row_end; ++k) {
          temp += sp_mat_elements[k] * d_mat_wrapper_row(col, static_cast<vcl_size_t>(sp_mat_col_buffer[k]));
        }
        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
      }
    }
  }
  else {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
      for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
        vcl_size_t row_start = sp_mat_row_buffer[row];
        vcl_size_t row_end   = sp_mat_row_buffer[row+1];
        NumericT temp = 0;
        for (vcl_size_t k = row_start; k < row_end; ++k) {
          temp += sp_mat_elements[k] * d_mat_wrapper_col(col, static_cast<vcl_size_t>(sp_mat_col_buffer[k]));
        }
        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
      }
    }
  }

}

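// A minimal usage sketch, with the dense factor stored transposed, i.e. B of
// size n x k so that trans(B) conforms with A:
//
//   C = viennacl::linalg::prod(A, viennacl::trans(B));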

/** @brief Carries out sparse-matrix times sparse-matrix multiplication for CSR matrices
*
* Implementation of the convenience expression C = prod(A, B);
* Based on computing C(i, :) = A(i, :) * B by merging the respective rows of B.
*
* @param A     Left factor
* @param B     Right factor
* @param C     Result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
               viennacl::compressed_matrix<NumericT, AlignmentV>       & C)
{

  NumericT     const * A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
  unsigned int const * A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
  unsigned int const * A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());

  NumericT     const * B_elements   = detail::extract_raw_pointer<NumericT>(B.handle());
  unsigned int const * B_row_buffer = detail::extract_raw_pointer<unsigned int>(B.handle1());
  unsigned int const * B_col_buffer = detail::extract_raw_pointer<unsigned int>(B.handle2());

  C.resize(A.size1(), B.size2(), false);
  unsigned int * C_row_buffer = detail::extract_raw_pointer<unsigned int>(C.handle1());

#if defined(VIENNACL_WITH_OPENMP)
  unsigned int block_factor = 10;
  unsigned int max_threads  = omp_get_max_threads();
#else
  unsigned int max_threads  = 1;
#endif
  std::vector<unsigned int>   max_length_row_C(max_threads);
  std::vector<unsigned int *> row_C_temp_index_buffers(max_threads);
  std::vector<NumericT *>     row_C_temp_value_buffers(max_threads);


  /*
   * Stage 1: Determine maximum length of work buffers:
   */

#if defined(VIENNACL_WITH_OPENMP)
  #pragma omp parallel for schedule(dynamic, A.size1() / (block_factor * max_threads) + 1)
#endif
  for (long i=0; i<long(A.size1()); ++i)
  {
    unsigned int row_start_A = A_row_buffer[i];
    unsigned int row_end_A   = A_row_buffer[i+1];

    unsigned int row_C_upper_bound_row = 0;
    for (unsigned int j = row_start_A; j<row_end_A; ++j)
    {
      unsigned int row_B = A_col_buffer[j];

      unsigned int entries_in_row = B_row_buffer[row_B+1] - B_row_buffer[row_B];
      row_C_upper_bound_row += entries_in_row;
    }

#ifdef VIENNACL_WITH_OPENMP
    unsigned int thread_id = omp_get_thread_num();
#else
    unsigned int thread_id = 0;
#endif

    max_length_row_C[thread_id] = std::max(max_length_row_C[thread_id], std::min(row_C_upper_bound_row, static_cast<unsigned int>(B.size2())));
  }

  // determine global maximum row length
  for (std::size_t i=1; i<max_length_row_C.size(); ++i)
    max_length_row_C[0] = std::max(max_length_row_C[0], max_length_row_C[i]);

  // allocate work vectors:
  for (unsigned int i=0; i<max_threads; ++i)
    row_C_temp_index_buffers[i] = (unsigned int *)malloc(sizeof(unsigned int)*3*max_length_row_C[0]);


  /*
   * Stage 2: Determine sparsity pattern of C
   */

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for schedule(dynamic, A.size1() / (block_factor * max_threads) + 1)
#endif
  for (long i=0; i<long(A.size1()); ++i)
  {
    unsigned int thread_id = 0;
#ifdef VIENNACL_WITH_OPENMP
    thread_id = omp_get_thread_num();
#endif
    unsigned int buffer_len = max_length_row_C[0];

    unsigned int *row_C_vector_1 = row_C_temp_index_buffers[thread_id];
    unsigned int *row_C_vector_2 = row_C_vector_1 + buffer_len;
    unsigned int *row_C_vector_3 = row_C_vector_2 + buffer_len;

    unsigned int row_start_A = A_row_buffer[i];
    unsigned int row_end_A   = A_row_buffer[i+1];

    C_row_buffer[i] = row_C_scan_symbolic_vector(row_start_A, row_end_A, A_col_buffer,
                                                 B_row_buffer, B_col_buffer, static_cast<unsigned int>(B.size2()),
                                                 row_C_vector_1, row_C_vector_2, row_C_vector_3);
  }

  // exclusive scan to obtain row start indices:
  unsigned int current_offset = 0;
  for (std::size_t i=0; i<C.size1(); ++i)
  {
    unsigned int tmp = C_row_buffer[i];
    C_row_buffer[i]  = current_offset;
    current_offset  += tmp;
  }
  C_row_buffer[C.size1()] = current_offset;
  C.reserve(current_offset, false);

  // allocate work vectors:
  for (unsigned int i=0; i<max_threads; ++i)
    row_C_temp_value_buffers[i] = (NumericT *)malloc(sizeof(NumericT)*3*max_length_row_C[0]);

  /*
   * Stage 3: Compute product (code similar, maybe pull out into a separate function to avoid code duplication?)
   */
  NumericT     * C_elements   = detail::extract_raw_pointer<NumericT>(C.handle());
  unsigned int * C_col_buffer = detail::extract_raw_pointer<unsigned int>(C.handle2());

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for schedule(dynamic, A.size1() / (block_factor * max_threads) + 1)
#endif
  for (long i = 0; i < long(A.size1()); ++i)
  {
    unsigned int row_start_A = A_row_buffer[i];
    unsigned int row_end_A   = A_row_buffer[i+1];

    unsigned int row_C_buffer_start = C_row_buffer[i];
    unsigned int row_C_buffer_end   = C_row_buffer[i+1];

#ifdef VIENNACL_WITH_OPENMP
    unsigned int thread_id = omp_get_thread_num();
#else
    unsigned int thread_id = 0;
#endif

    unsigned int *row_C_vector_1 = row_C_temp_index_buffers[thread_id];
    unsigned int *row_C_vector_2 = row_C_vector_1 + max_length_row_C[0];
    unsigned int *row_C_vector_3 = row_C_vector_2 + max_length_row_C[0];

    NumericT *row_C_vector_1_values = row_C_temp_value_buffers[thread_id];
    NumericT *row_C_vector_2_values = row_C_vector_1_values + max_length_row_C[0];
    NumericT *row_C_vector_3_values = row_C_vector_2_values + max_length_row_C[0];

    row_C_scan_numeric_vector(row_start_A, row_end_A, A_col_buffer, A_elements,
                              B_row_buffer, B_col_buffer, B_elements, static_cast<unsigned int>(B.size2()),
                              row_C_buffer_start, row_C_buffer_end, C_col_buffer, C_elements,
                              row_C_vector_1, row_C_vector_1_values,
                              row_C_vector_2, row_C_vector_2_values,
                              row_C_vector_3, row_C_vector_3_values);
  }

  // clean up at the end:
  for (unsigned int i=0; i<max_threads; ++i)
  {
    free(row_C_temp_index_buffers[i]);
    free(row_C_temp_value_buffers[i]);
  }

}

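// A minimal usage sketch, assuming conforming sparse factors A (m x k) and
// B (k x n); C is resized by the kernel above:
//
//   viennacl::compressed_matrix<double> A(m, k), B(k, n), C;
//   C = viennacl::linalg::prod(A, B);   // sparse-times-sparse (spGEMM)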



//
// Triangular solve for compressed_matrix, A \ b
//
namespace detail
{
  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_inplace_solve(IndexArrayT const & row_buffer,
                         IndexArrayT const & col_buffer,
                         ConstScalarArrayT const & element_buffer,
                         ScalarArrayT & vec_buffer,
                         vcl_size_t num_cols,
                         viennacl::linalg::unit_lower_tag)
  {
    vcl_size_t row_begin = row_buffer[1];
    for (vcl_size_t row = 1; row < num_cols; ++row)
    {
      NumericT vec_entry = vec_buffer[row];
      vcl_size_t row_end = row_buffer[row+1];
      for (vcl_size_t i = row_begin; i < row_end; ++i)
      {
        vcl_size_t col_index = col_buffer[i];
        if (col_index < row)
          vec_entry -= vec_buffer[col_index] * element_buffer[i];
      }
      vec_buffer[row] = vec_entry;
      row_begin = row_end;
    }
  }

  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_inplace_solve(IndexArrayT const & row_buffer,
                         IndexArrayT const & col_buffer,
                         ConstScalarArrayT const & element_buffer,
                         ScalarArrayT & vec_buffer,
                         vcl_size_t num_cols,
                         viennacl::linalg::lower_tag)
  {
    vcl_size_t row_begin = row_buffer[0];
    for (vcl_size_t row = 0; row < num_cols; ++row)
    {
      NumericT vec_entry = vec_buffer[row];

      // substitute and remember diagonal entry
      vcl_size_t row_end = row_buffer[row+1];
      NumericT diagonal_entry = 0;
      for (vcl_size_t i = row_begin; i < row_end; ++i)
      {
        vcl_size_t col_index = col_buffer[i];
        if (col_index < row)
          vec_entry -= vec_buffer[col_index] * element_buffer[i];
        else if (col_index == row)
          diagonal_entry = element_buffer[i];
      }

      vec_buffer[row] = vec_entry / diagonal_entry;
      row_begin = row_end;
    }
  }


  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_inplace_solve(IndexArrayT const & row_buffer,
                         IndexArrayT const & col_buffer,
                         ConstScalarArrayT const & element_buffer,
                         ScalarArrayT & vec_buffer,
                         vcl_size_t num_cols,
                         viennacl::linalg::unit_upper_tag)
  {
    for (vcl_size_t row2 = 1; row2 < num_cols; ++row2)
    {
      vcl_size_t row = (num_cols - row2) - 1;
      NumericT vec_entry = vec_buffer[row];
      vcl_size_t row_begin = row_buffer[row];
      vcl_size_t row_end   = row_buffer[row+1];
      for (vcl_size_t i = row_begin; i < row_end; ++i)
      {
        vcl_size_t col_index = col_buffer[i];
        if (col_index > row)
          vec_entry -= vec_buffer[col_index] * element_buffer[i];
      }
      vec_buffer[row] = vec_entry;
    }
  }

  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_inplace_solve(IndexArrayT const & row_buffer,
                         IndexArrayT const & col_buffer,
                         ConstScalarArrayT const & element_buffer,
                         ScalarArrayT & vec_buffer,
                         vcl_size_t num_cols,
                         viennacl::linalg::upper_tag)
  {
    for (vcl_size_t row2 = 0; row2 < num_cols; ++row2)
    {
      vcl_size_t row = (num_cols - row2) - 1;
      NumericT vec_entry = vec_buffer[row];

      // substitute and remember diagonal entry
      vcl_size_t row_begin = row_buffer[row];
      vcl_size_t row_end   = row_buffer[row+1];
      NumericT diagonal_entry = 0;
      for (vcl_size_t i = row_begin; i < row_end; ++i)
      {
        vcl_size_t col_index = col_buffer[i];
        if (col_index > row)
          vec_entry -= vec_buffer[col_index] * element_buffer[i];
        else if (col_index == row)
          diagonal_entry = element_buffer[i];
      }

      vec_buffer[row] = vec_entry / diagonal_entry;
    }
  }

} //namespace detail


/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions.
*
* @param L    The matrix
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
* @param tag  The solver tag identifying the respective triangular solver
*/
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & L,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::unit_lower_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(L.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, L.size2(), tag);
}

/** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions.
*
* @param L    The matrix
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
* @param tag  The solver tag identifying the respective triangular solver
*/
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & L,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::lower_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(L.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, L.size2(), tag);
}


/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions.
*
* @param U    The matrix
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
* @param tag  The solver tag identifying the respective triangular solver
*/
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::unit_upper_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(U.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, U.size2(), tag);
}

/** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions.
*
* @param U    The matrix
* @param vec  The vector holding the right hand side. Is overwritten by the solution.
* @param tag  The solver tag identifying the respective triangular solver
*/
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::upper_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(U.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, U.size2(), tag);
}

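// A minimal usage sketch: the four overloads above are reached through
// viennacl::linalg::inplace_solve() with the matching solver tag. Assumes a
// filled lower triangular CSR matrix L and right-hand side b, which is
// overwritten by the solution:
//
//   viennacl::linalg::inplace_solve(L, b, viennacl::linalg::lower_tag());
//   // unit_lower_tag(), upper_tag() and unit_upper_tag() select the other variants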




//
// Triangular solve for compressed_matrix, A^T \ b
//

namespace detail
{
  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                               IndexArrayT const & col_buffer,
                               ConstScalarArrayT const & element_buffer,
                               ScalarArrayT & vec_buffer,
                               vcl_size_t num_cols,
                               viennacl::linalg::unit_lower_tag)
  {
    vcl_size_t col_begin = row_buffer[0];
    for (vcl_size_t col = 0; col < num_cols; ++col)
    {
      NumericT vec_entry = vec_buffer[col];
      vcl_size_t col_end = row_buffer[col+1];
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        unsigned int row_index = col_buffer[i];
        if (row_index > col)
          vec_buffer[row_index] -= vec_entry * element_buffer[i];
      }
      col_begin = col_end;
    }
  }

  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                               IndexArrayT const & col_buffer,
                               ConstScalarArrayT const & element_buffer,
                               ScalarArrayT & vec_buffer,
                               vcl_size_t num_cols,
                               viennacl::linalg::lower_tag)
  {
    vcl_size_t col_begin = row_buffer[0];
    for (vcl_size_t col = 0; col < num_cols; ++col)
    {
      vcl_size_t col_end = row_buffer[col+1];

      // Stage 1: Find diagonal entry:
      NumericT diagonal_entry = 0;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index == col)
        {
          diagonal_entry = element_buffer[i];
          break;
        }
      }

      // Stage 2: Substitute
      NumericT vec_entry = vec_buffer[col] / diagonal_entry;
      vec_buffer[col] = vec_entry;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index > col)
          vec_buffer[row_index] -= vec_entry * element_buffer[i];
      }
      col_begin = col_end;
    }
  }

  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                               IndexArrayT const & col_buffer,
                               ConstScalarArrayT const & element_buffer,
                               ScalarArrayT & vec_buffer,
                               vcl_size_t num_cols,
                               viennacl::linalg::unit_upper_tag)
  {
    for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
    {
      vcl_size_t col = (num_cols - col2) - 1;

      NumericT vec_entry = vec_buffer[col];
      vcl_size_t col_begin = row_buffer[col];
      vcl_size_t col_end   = row_buffer[col+1];
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index < col)
          vec_buffer[row_index] -= vec_entry * element_buffer[i];
      }

    }
  }

  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                               IndexArrayT const & col_buffer,
                               ConstScalarArrayT const & element_buffer,
                               ScalarArrayT & vec_buffer,
                               vcl_size_t num_cols,
                               viennacl::linalg::upper_tag)
  {
    for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
    {
      vcl_size_t col = (num_cols - col2) - 1;
      vcl_size_t col_begin = row_buffer[col];
      vcl_size_t col_end   = row_buffer[col+1];

      // Stage 1: Find diagonal entry:
      NumericT diagonal_entry = 0;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index == col)
        {
          diagonal_entry = element_buffer[i];
          break;
        }
      }

      // Stage 2: Substitute
      NumericT vec_entry = vec_buffer[col] / diagonal_entry;
      vec_buffer[col] = vec_entry;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index < col)
          vec_buffer[row_index] -= vec_entry * element_buffer[i];
      }
    }
  }


  //
  // block solves
  //
  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & L,
                           viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */,
                           vector_base<NumericT> const & /* L_diagonal */, //ignored
                           vector_base<NumericT> & vec,
                           viennacl::linalg::unit_lower_tag)
  {
    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.

    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
    NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.lhs().handle());
    NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());

    vcl_size_t col_begin = row_buffer[0];
    for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
    {
      NumericT vec_entry = vec_buffer[col];
      vcl_size_t col_end = row_buffer[col+1];
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        unsigned int row_index = col_buffer[i];
        if (row_index > col)
          vec_buffer[row_index] -= vec_entry * elements[i];
      }
      col_begin = col_end;
    }
  }

  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & L,
                           viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */,
                           vector_base<NumericT> const & L_diagonal,
                           vector_base<NumericT> & vec,
                           viennacl::linalg::lower_tag)
  {
    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.

    unsigned int const * row_buffer      = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
    unsigned int const * col_buffer      = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
    NumericT     const * elements        = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.lhs().handle());
    NumericT     const * diagonal_buffer = detail::extract_raw_pointer<NumericT>(L_diagonal.handle());
    NumericT           * vec_buffer      = detail::extract_raw_pointer<NumericT>(vec.handle());

    vcl_size_t col_begin = row_buffer[0];
    for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
    {
      vcl_size_t col_end = row_buffer[col+1];

      NumericT vec_entry = vec_buffer[col] / diagonal_buffer[col];
      vec_buffer[col] = vec_entry;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index > col)
          vec_buffer[row_index] -= vec_entry * elements[i];
      }
      col_begin = col_end;
    }
  }



  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & U,
                           viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */,
                           vector_base<NumericT> const & /* U_diagonal */, //ignored
                           vector_base<NumericT> & vec,
                           viennacl::linalg::unit_upper_tag)
  {
    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.

    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
    NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.lhs().handle());
    NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());

    for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
    {
      vcl_size_t col = (U.lhs().size1() - col2) - 1;

      NumericT vec_entry = vec_buffer[col];
      vcl_size_t col_begin = row_buffer[col];
      vcl_size_t col_end   = row_buffer[col+1];
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index < col)
          vec_buffer[row_index] -= vec_entry * elements[i];
      }

    }
  }

  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & U,
                           viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */,
                           vector_base<NumericT> const & U_diagonal,
                           vector_base<NumericT> & vec,
                           viennacl::linalg::upper_tag)
  {
    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.

    unsigned int const * row_buffer      = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
    unsigned int const * col_buffer      = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
    NumericT     const * elements        = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.lhs().handle());
    NumericT     const * diagonal_buffer = detail::extract_raw_pointer<NumericT>(U_diagonal.handle());
    NumericT           * vec_buffer      = detail::extract_raw_pointer<NumericT>(vec.handle());

    for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
    {
      vcl_size_t col = (U.lhs().size1() - col2) - 1;
      vcl_size_t col_begin = row_buffer[col];
      vcl_size_t col_end   = row_buffer[col+1];

      // Stage 2: Substitute
      NumericT vec_entry = vec_buffer[col] / diagonal_buffer[col];
      vec_buffer[col] = vec_entry;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index < col)
          vec_buffer[row_index] -= vec_entry * elements[i];
      }
    }
  }


} //namespace detail

/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal (A^T \ b). Typically used for LU substitutions.
*
* @param proxy  Proxy object for the transposed CSR matrix
* @param vec    The right hand side vector. Is overwritten by the solution.
* @param tag    The solver tag identifying the respective triangular solver
*/
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
                                      const compressed_matrix<NumericT, AlignmentV>,
                                      op_trans> const & proxy,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::unit_lower_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}

/** @brief Inplace solution of a lower triangular compressed_matrix (A^T \ b). Typically used for LU substitutions.
*
* @param proxy  Proxy object for the transposed CSR matrix
* @param vec    The right hand side vector. Is overwritten by the solution.
* @param tag    The solver tag identifying the respective triangular solver
*/
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
                                      const compressed_matrix<NumericT, AlignmentV>,
                                      op_trans> const & proxy,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::lower_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}


/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal (A^T \ b). Typically used for LU substitutions.
*
* @param proxy  Proxy object for the transposed CSR matrix
* @param vec    The right hand side vector. Is overwritten by the solution.
* @param tag    The solver tag identifying the respective triangular solver
*/
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
                                      const compressed_matrix<NumericT, AlignmentV>,
                                      op_trans> const & proxy,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::unit_upper_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}


/** @brief Inplace solution of an upper triangular compressed_matrix (A^T \ b). Typically used for LU substitutions.
*
* @param proxy  Proxy object for the transposed CSR matrix
* @param vec    The right hand side vector. Is overwritten by the solution.
* @param tag    The solver tag identifying the respective triangular solver
*/
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
                                      const compressed_matrix<NumericT, AlignmentV>,
                                      op_trans> const & proxy,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::upper_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}

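// A minimal usage sketch for the transposed solves, assuming a filled CSR
// matrix A and right-hand side b (overwritten in place):
//
//   viennacl::linalg::inplace_solve(viennacl::trans(A), b, viennacl::linalg::upper_tag());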

//
// Compressed Compressed Matrix
//

/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT>
void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  NumericT           * result_buf  = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf     = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements    = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * row_buffer  = detail::extract_raw_pointer<unsigned int>(mat.handle1());
  unsigned int const * row_indices = detail::extract_raw_pointer<unsigned int>(mat.handle3());
  unsigned int const * col_buffer  = detail::extract_raw_pointer<unsigned int>(mat.handle2());

  vector_assign(result, NumericT(0));

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(mat.nnz1()); ++i)
  {
    NumericT dot_prod = 0;
    vcl_size_t row_end = row_buffer[i+1];
    for (vcl_size_t j = row_buffer[i]; j < row_end; ++j)
      dot_prod += elements[j] * vec_buf[col_buffer[j] * vec.stride() + vec.start()];
    result_buf[row_indices[i] * result.stride() + result.start()] = dot_prod;
  }

}

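// A minimal usage sketch: a compressed_compressed_matrix stores only rows with
// at least one nonzero (doubly-compressed CSR), which the kernel above exploits
// by iterating over mat.nnz1() nonzero rows only. The product interface is
// unchanged; assumes a filled matrix A and vector x:
//
//   viennacl::compressed_compressed_matrix<double> A;
//   viennacl::vector<double> x(n), y(n);
//   y = viennacl::linalg::prod(A, x);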


//
// Coordinate Matrix
//

namespace detail
{
  template<typename NumericT, unsigned int AlignmentV>
  void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    NumericT           * result_buf   = detail::extract_raw_pointer<NumericT>(vec.handle());
    NumericT     const * elements     = detail::extract_raw_pointer<NumericT>(mat.handle());
    unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());

    NumericT value = 0;
    unsigned int last_row = 0;

    for (vcl_size_t i = 0; i < mat.nnz(); ++i)
    {
      unsigned int current_row = coord_buffer[2*i];

      if (current_row != last_row)
      {
        if (info_selector == viennacl::linalg::detail::SPARSE_ROW_NORM_2)
          value = std::sqrt(value);

        result_buf[last_row] = value;
        value = 0;
        last_row = current_row;
      }

      switch (info_selector)
      {
      case viennacl::linalg::detail::SPARSE_ROW_NORM_INF: //inf-norm
        value = std::max<NumericT>(value, std::fabs(elements[i]));
        break;

      case viennacl::linalg::detail::SPARSE_ROW_NORM_1: //1-norm
        value += std::fabs(elements[i]);
        break;

      case viennacl::linalg::detail::SPARSE_ROW_NORM_2: //2-norm
        value += elements[i] * elements[i];
        break;

      case viennacl::linalg::detail::SPARSE_ROW_DIAGONAL: //diagonal entry
        if (coord_buffer[2*i+1] == current_row)
          value = elements[i];
        break;

      //default:
      //  break;
      }
    }

    if (info_selector == viennacl::linalg::detail::SPARSE_ROW_NORM_2)
      value = std::sqrt(value);

    result_buf[last_row] = value;
  }
}

/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  NumericT           * result_buf   = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf      = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements     = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());

  for (vcl_size_t i = 0; i < result.size(); ++i)
    result_buf[i * result.stride() + result.start()] = 0;

  for (vcl_size_t i = 0; i < mat.nnz(); ++i)
    result_buf[coord_buffer[2*i] * result.stride() + result.start()]
      += elements[i] * vec_buf[coord_buffer[2*i+1] * vec.stride() + vec.start()];
}

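// A minimal usage sketch, assuming a coordinate_matrix A filled via
// viennacl::copy() from host data, and a vector x:
//
//   viennacl::coordinate_matrix<double> A;
//   viennacl::vector<double> x(n), y(n);
//   y = viennacl::linalg::prod(A, x);   // y = A * x via the COO kernel above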
/** @brief Carries out sparse-matrix times dense-matrix multiplication, the sparse factor being a coordinate_matrix
*
* Implementation of the convenience expression result = prod(sp_mat, d_mat);
*
* @param sp_mat   The sparse matrix (COO format)
* @param d_mat    The dense matrix
* @param result   The result matrix (dense)
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result) {

  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);

  vcl_size_t result_start1 = viennacl::traits::start1(result);
  vcl_size_t result_start2 = viennacl::traits::start2(result);
  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);

  detail::matrix_array_wrapper<NumericT const, row_major, false>
      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  detail::matrix_array_wrapper<NumericT const, column_major, false>
      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);

  detail::matrix_array_wrapper<NumericT, row_major, false>
      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  detail::matrix_array_wrapper<NumericT, column_major, false>
      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.row_major() ) {

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      if (result.row_major())
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_row(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_col(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
      NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
      vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
      vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
      for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
        NumericT y = d_mat_wrapper_row( c, col);
        if (result.row_major())
          result_wrapper_row(r, col) += x * y;
        else
          result_wrapper_col(r, col) += x * y;
      }
    }
  }

  else {

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      if (result.row_major())
        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {

      for (vcl_size_t i = 0; i < sp_mat.nnz(); ++i) {

        NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
        vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
        vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
        NumericT y = d_mat_wrapper_col( c, col);

        if (result.row_major())
          result_wrapper_row( r, col) += x*y;
        else
          result_wrapper_col( r, col) += x*y;
      }

    }
  }

}


/** @brief Carries out sparse-matrix times transposed dense-matrix multiplication, the sparse factor being a coordinate_matrix
*
* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
*
* @param sp_mat   The sparse matrix (COO format)
* @param d_mat    The transposed dense matrix proxy
* @param result   The result matrix (dense)
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
                     viennacl::matrix_base<NumericT> & result) {

  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());

  vcl_size_t result_start1 = viennacl::traits::start1(result);
  vcl_size_t result_start2 = viennacl::traits::start2(result);
  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);

  detail::matrix_array_wrapper<NumericT const, row_major, false>
      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  detail::matrix_array_wrapper<NumericT const, column_major, false>
      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);

  detail::matrix_array_wrapper<NumericT, row_major, false>
      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  detail::matrix_array_wrapper<NumericT, column_major, false>
      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.lhs().row_major() )
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      if (result.row_major())
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
      NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
      vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
      vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
      if (result.row_major())
      {
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_row( col, c);
          result_wrapper_row(r, col) += x * y;
        }
      }
      else
      {
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_row( col, c);
          result_wrapper_col(r, col) += x * y;
        }
      }
    }


  }
  else
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      if (result.row_major())
        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
      NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
      vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
      vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
      if (result.row_major())
      {
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_col( col, c);
          result_wrapper_row(r, col) += x * y;
        }
      }
      else
      {
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_col( col, c);
          result_wrapper_col(r, col) += x * y;
        }
      }
    }
  }

}
//
// ELL Matrix
//

/** @brief Carries out matrix-vector multiplication with an ell_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * coords     = detail::extract_raw_pointer<unsigned int>(mat.handle2());

  for (vcl_size_t row = 0; row < mat.size1(); ++row)
  {
    NumericT sum = 0;

    for (unsigned int item_id = 0; item_id < mat.internal_maxnnz(); ++item_id)
    {
      vcl_size_t offset = row + item_id * mat.internal_size1();
      NumericT val = elements[offset];

      if (val > 0 || val < 0) // val != 0 without compiler warnings
      {
        unsigned int col = coords[offset];
        sum += (vec_buf[col * vec.stride() + vec.start()] * val);
      }
    }

    result_buf[row * result.stride() + result.start()] = sum;
  }
}

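// A minimal usage sketch, assuming an ell_matrix A filled via viennacl::copy()
// from host data, and a vector x:
//
//   viennacl::ell_matrix<double> A;
//   viennacl::vector<double> x(n), y(n);
//   y = viennacl::linalg::prod(A, x);   // ELL kernel above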
/** @brief Carries out ell_matrix times dense-matrix multiplication
*
* Implementation of the convenience expression result = prod(sp_mat, d_mat);
*
* @param sp_mat   The sparse matrix (ELL format)
* @param d_mat    The dense matrix
* @param result   The result matrix (dense)
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);

  vcl_size_t result_start1 = viennacl::traits::start1(result);
  vcl_size_t result_start2 = viennacl::traits::start2(result);
  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);

  detail::matrix_array_wrapper<NumericT const, row_major, false>
      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  detail::matrix_array_wrapper<NumericT const, column_major, false>
      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);

  detail::matrix_array_wrapper<NumericT, row_major, false>
      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  detail::matrix_array_wrapper<NumericT, column_major, false>
      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.row_major() ) {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      if (result.row_major())
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      for (long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id)
      {
        vcl_size_t offset = static_cast<vcl_size_t>(row) + static_cast<vcl_size_t>(item_id) * sp_mat.internal_size1();
        NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
        vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);

        if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
        {
          if (result.row_major())
            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
              result_wrapper_row(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper_row( sp_mat_col, col);
          else
            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
              result_wrapper_col(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper_row( sp_mat_col, col);
        }
      }
    }
  }
  else {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      if (result.row_major())
        for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
          result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
          result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {

      for (unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {

        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {

          vcl_size_t offset = row + item_id * sp_mat.internal_size1();
          NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
          vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);

          if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
          {
            if (result.row_major())
              result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_col( sp_mat_col, col);
            else
              result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_col( sp_mat_col, col);
          }
        }
      }
    }
  }

}

1580 template<typename NumericT, unsigned int AlignmentV>
1584  viennacl::op_trans > & d_mat,
1586 
1587  NumericT const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
1588  unsigned int const * sp_mat_coords = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
1589 
1590  NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
1591  NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
1592 
1593  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
1594  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
1595  vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat.lhs());
1596  vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat.lhs());
1597  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
1598  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());
1599 
1600  vcl_size_t result_start1 = viennacl::traits::start1(result);
1601  vcl_size_t result_start2 = viennacl::traits::start2(result);
1602  vcl_size_t result_inc1 = viennacl::traits::stride1(result);
1603  vcl_size_t result_inc2 = viennacl::traits::stride2(result);
1604  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
1605  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
1606 
1608  d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
1610  d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
1611 
1613  result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
1615  result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
1616 
1617  if ( d_mat.lhs().row_major() )
1618  {
1619 #ifdef VIENNACL_WITH_OPENMP
1620  #pragma omp parallel for
1621 #endif
1622  for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
1623  {
1624  if (result.row_major())
1625  for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
1626  result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
1627  else
1628  for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
1629  result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
1630  }
1631 
1632  for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
1633 
1634  for (unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
1635 
1636  for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
1637 
1638  vcl_size_t offset = row + item_id * sp_mat.internal_size1();
1639  NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
1640  vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
1641 
1642  if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
1643  {
1644  if (result.row_major())
1645  result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_row( col, sp_mat_col);
1646  else
1647  result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_row( col, sp_mat_col);
1648  }
1649  }
1650  }
1651  }
1652  }
1653  else
1654  {
1655 #ifdef VIENNACL_WITH_OPENMP
1656  #pragma omp parallel for
1657 #endif
1658  for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
1659  {
1660  if (result.row_major())
1661  for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
1662  result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
1663  else
1664  for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
1665  result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
1666  }
1667 
1668  for (vcl_size_t item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
1669 
1670 #ifdef VIENNACL_WITH_OPENMP
1671  #pragma omp parallel for // over rows, not item_id: threads then update disjoint rows of 'result', so the += accumulation cannot race
1672 #endif
1673  for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
1674 
1675  vcl_size_t offset = static_cast<vcl_size_t>(row) + item_id * sp_mat.internal_size1();
1676  NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
1677  vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
1678 
1679  if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
1680  {
1681  if (result.row_major())
1682  for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
1683  result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_col( col, sp_mat_col);
1684  else
1685  for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
1686  result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_col( col, sp_mat_col);
1687  }
1688  }
1689  }
1690  }
1691 
1692 }
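The kernel above is not called directly; it is reached through the generic product
interface. A minimal usage sketch for C = A * B^T (hypothetical data and sizes; assumes
the viennacl::copy overload taking an STL sparse matrix, as used in the ViennaCL tests):

  #include "viennacl/ell_matrix.hpp"
  #include "viennacl/matrix.hpp"
  #include "viennacl/linalg/prod.hpp"
  #include <map>
  #include <vector>

  std::vector< std::map<unsigned int, double> > cpu_A(4);      // 4x4 sparse factor
  cpu_A[0][0] = 2.0; cpu_A[1][1] = 3.0; cpu_A[2][0] = -1.0; cpu_A[3][3] = 5.0;

  viennacl::ell_matrix<double> A;
  viennacl::copy(cpu_A, A);                                    // pads each row to maxnnz() entries

  viennacl::matrix<double> B(7, 4), C(4, 7);
  C = viennacl::linalg::prod(A, viennacl::trans(B));           // dispatches to the kernel above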
1693 
1694 
1695 //
1696 // SELL-C-σ Matrix
1697 //
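In this format, rows are grouped into blocks of C = rows_per_block() consecutive rows,
and entries within block b are stored column-by-column: the j-th stored entry of global
row r (with b = r / C) sits at

  elements[ block_start[b] + j * C + (r % C) ],

with column_indices[] holding the matching column index and short rows zero-padded up to
columns_per_block[b] entries. This is exactly the indexing performed by the kernel below;
see viennacl/sliced_ell_matrix.hpp for the authoritative definition of the layout.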
1706 template<typename NumericT, typename IndexT>
1707 void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
1708                const viennacl::vector_base<NumericT> & vec,
1709                viennacl::vector_base<NumericT> & result)
1710 {
1711  NumericT * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
1712  NumericT const * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
1713  NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
1714  IndexT const * columns_per_block = detail::extract_raw_pointer<IndexT>(mat.handle1());
1715  IndexT const * column_indices = detail::extract_raw_pointer<IndexT>(mat.handle2());
1716  IndexT const * block_start = detail::extract_raw_pointer<IndexT>(mat.handle3());
1717 
1718  vcl_size_t num_blocks = mat.size1() / mat.rows_per_block() + 1; // upper bound on the number of row blocks; rows beyond size1() are skipped when writing the result
1719 
1720 #ifdef VIENNACL_WITH_OPENMP
1721  #pragma omp parallel for
1722 #endif
1723  for (long block_idx2 = 0; block_idx2 < static_cast<long>(num_blocks); ++block_idx2)
1724  {
1725  vcl_size_t block_idx = static_cast<vcl_size_t>(block_idx2);
1726  vcl_size_t current_columns_per_block = columns_per_block[block_idx];
1727 
1728  std::vector<NumericT> result_values(mat.rows_per_block()); // zero-initialized accumulators for the rows of this block
1729 
1730  for (IndexT column_entry_index = 0;
1731  column_entry_index < current_columns_per_block;
1732  ++column_entry_index)
1733  {
1734  vcl_size_t stride_start = block_start[block_idx] + column_entry_index * mat.rows_per_block();
1735  // Note: This for-loop may be unrolled by hand for exploiting vectorization
1736  // Careful benchmarking recommended first, memory channels may be saturated already!
1737  for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
1738  {
1739  NumericT val = elements[stride_start + row_in_block];
1740 
1741  result_values[row_in_block] += (val > 0 || val < 0) ? vec_buf[column_indices[stride_start + row_in_block] * vec.stride() + vec.start()] * val : 0; // val != 0 without compiler warnings
1742  }
1743  }
1744 
1745  vcl_size_t first_row_in_matrix = block_idx * mat.rows_per_block();
1746  for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
1747  {
1748  if (first_row_in_matrix + row_in_block < result.size())
1749  result_buf[(first_row_in_matrix + row_in_block) * result.stride() + result.start()] = result_values[row_in_block];
1750  }
1751  }
1752 }
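A usage sketch for the kernel above (hypothetical data; assumes the viennacl::copy
overload taking an STL sparse matrix):

  #include "viennacl/sliced_ell_matrix.hpp"
  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/prod.hpp"
  #include <map>
  #include <vector>

  std::vector< std::map<unsigned int, double> > cpu_A(4);
  cpu_A[0][1] = 1.0; cpu_A[1][2] = 2.0; cpu_A[2][3] = 3.0; cpu_A[3][0] = 4.0;

  viennacl::sliced_ell_matrix<double> A;
  viennacl::copy(cpu_A, A);

  viennacl::vector<double> x = viennacl::scalar_vector<double>(4, 1.0);
  viennacl::vector<double> y = viennacl::linalg::prod(A, x);   // calls the SELL-C-σ kernel above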
1753 
1754 
1755 //
1756 // Hybrid Matrix
1757 //
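The hybrid format splits the matrix into a regular ELL part with a fixed number of
entries per row (stored column-wise and zero-padded) plus a CSR part holding the
overflow of longer rows. For example, with two ELL entries per row (sketch):

  row 0: [2 7 0 0 1]  ->  ELL: (col 0, 2.0), (col 1, 7.0)   CSR overflow: (col 4, 1.0)
  row 1: [0 3 0 0 0]  ->  ELL: (col 1, 3.0), zero padding   CSR overflow: none

The kernels below accumulate both contributions for every row.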
1766 template<typename NumericT, unsigned int AlignmentV>
1767 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
1768                const viennacl::vector_base<NumericT> & vec,
1769                viennacl::vector_base<NumericT> & result)
1770 {
1771  NumericT * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
1772  NumericT const * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
1773  NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
1774  unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(mat.handle2());
1775  NumericT const * csr_elements = detail::extract_raw_pointer<NumericT>(mat.handle5());
1776  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
1777  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
1778 
1779 
1780  for (vcl_size_t row = 0; row < mat.size1(); ++row)
1781  {
1782  NumericT sum = 0;
1783 
1784  //
1785  // Part 1: Process ELL part
1786  //
1787  for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
1788  {
1789  vcl_size_t offset = row + item_id * mat.internal_size1();
1790  NumericT val = elements[offset];
1791 
1792  if (val > 0 || val < 0) // val != 0 without compiler warnings
1793  {
1794  unsigned int col = coords[offset];
1795  sum += (vec_buf[col * vec.stride() + vec.start()] * val);
1796  }
1797  }
1798 
1799  //
1800  // Part 2: Process CSR part
1801  //
1802  vcl_size_t col_begin = csr_row_buffer[row];
1803  vcl_size_t col_end = csr_row_buffer[row + 1];
1804 
1805  for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
1806  {
1807  sum += (vec_buf[csr_col_buffer[item_id] * vec.stride() + vec.start()] * csr_elements[item_id]);
1808  }
1809 
1810  result_buf[row * result.stride() + result.start()] = sum;
1811  }
1812 
1813 }
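Usage sketch for the matrix-vector kernel above (hypothetical data; assumes the
viennacl::copy overload taking an STL sparse matrix):

  #include "viennacl/hyb_matrix.hpp"
  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/prod.hpp"
  #include <map>
  #include <vector>

  std::vector< std::map<unsigned int, double> > cpu_A(4);
  cpu_A[0][0] = 2.0; cpu_A[1][1] = 3.0; cpu_A[2][2] = 4.0; cpu_A[3][3] = 5.0;

  viennacl::hyb_matrix<double> A;
  viennacl::copy(cpu_A, A);                                    // splits entries into ELL + CSR parts

  viennacl::vector<double> x = viennacl::scalar_vector<double>(4, 1.0);
  viennacl::vector<double> y = viennacl::linalg::prod(A, x);   // ELL part plus CSR remainder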
1814 
1815 //
1816 // Hybrid Matrix
1817 //
1826 template<typename NumericT, unsigned int AlignmentV>
1827 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
1828                const viennacl::matrix_base<NumericT> & d_mat,
1829                viennacl::matrix_base<NumericT> & result)
1830 {
1831  NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
1832  NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
1833 
1834  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
1835  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
1836  vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat);
1837  vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat);
1838  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
1839  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);
1840 
1841  vcl_size_t result_start1 = viennacl::traits::start1(result);
1842  vcl_size_t result_start2 = viennacl::traits::start2(result);
1843  vcl_size_t result_inc1 = viennacl::traits::stride1(result);
1844  vcl_size_t result_inc2 = viennacl::traits::stride2(result);
1845  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
1846  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
1847 
1848  detail::matrix_array_wrapper<NumericT const, row_major, false>
1849      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
1850  detail::matrix_array_wrapper<NumericT const, column_major, false>
1851      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
1852 
1853  detail::matrix_array_wrapper<NumericT, row_major, false>
1854      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
1855  detail::matrix_array_wrapper<NumericT, column_major, false>
1856      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
1857 
1858  NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
1859  unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(mat.handle2());
1860  NumericT const * csr_elements = detail::extract_raw_pointer<NumericT>(mat.handle5());
1861  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
1862  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
1863 
1864 
1865  for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
1866  {
1867  for (vcl_size_t row = 0; row < mat.size1(); ++row)
1868  {
1869  NumericT sum = 0;
1870 
1871  //
1872  // Part 1: Process ELL part
1873  //
1874  for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
1875  {
1876  vcl_size_t offset = row + item_id * mat.internal_size1();
1877  NumericT val = elements[offset];
1878 
1879  if (val < 0 || val > 0) // val != 0 without compiler warnings
1880  {
1881  vcl_size_t col = static_cast<vcl_size_t>(coords[offset]);
1882  if (d_mat.row_major())
1883  sum += d_mat_wrapper_row(col, result_col) * val;
1884  else
1885  sum += d_mat_wrapper_col(col, result_col) * val;
1886  }
1887  }
1888 
1889  //
1890  // Part 2: Process HYB/CSR part
1891  //
1892  vcl_size_t col_begin = csr_row_buffer[row];
1893  vcl_size_t col_end = csr_row_buffer[row + 1];
1894 
1895  if (d_mat.row_major())
1896  for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
1897  sum += d_mat_wrapper_row(static_cast<vcl_size_t>(csr_col_buffer[item_id]), result_col) * csr_elements[item_id];
1898  else
1899  for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
1900  sum += d_mat_wrapper_col(static_cast<vcl_size_t>(csr_col_buffer[item_id]), result_col) * csr_elements[item_id];
1901 
1902  if (result.row_major())
1903  result_wrapper_row(row, result_col) = sum;
1904  else
1905  result_wrapper_col(row, result_col) = sum;
1906  }
1907  } // for result_col
1908 }
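A corresponding usage sketch for this sparse-times-dense kernel (hypothetical sizes):

  viennacl::hyb_matrix<double> A;              // 4x4, filled via viennacl::copy() as in the sketch above
  viennacl::matrix<double>     B(4, 3), C(4, 3);
  C = viennacl::linalg::prod(A, B);            // dispatches to the kernel above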
1909 
1910 
1919 template<typename NumericT, unsigned int AlignmentV>
1920 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
1921                const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
1922                                                   const viennacl::matrix_base<NumericT>,
1923                                                   viennacl::op_trans > & d_mat,
1924                viennacl::matrix_base<NumericT> & result)
1925 {
1926  NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
1927  NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
1928 
1929  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
1930  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
1931  vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat.lhs());
1932  vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat.lhs());
1933  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
1934  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());
1935 
1936  vcl_size_t result_start1 = viennacl::traits::start1(result);
1937  vcl_size_t result_start2 = viennacl::traits::start2(result);
1938  vcl_size_t result_inc1 = viennacl::traits::stride1(result);
1939  vcl_size_t result_inc2 = viennacl::traits::stride2(result);
1940  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
1941  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
1942 
1943  detail::matrix_array_wrapper<NumericT const, row_major, false>
1944      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
1945  detail::matrix_array_wrapper<NumericT const, column_major, false>
1946      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
1947 
1948  detail::matrix_array_wrapper<NumericT, row_major, false>
1949      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
1950  detail::matrix_array_wrapper<NumericT, column_major, false>
1951      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
1952 
1953  NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
1954  unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(mat.handle2());
1955  NumericT const * csr_elements = detail::extract_raw_pointer<NumericT>(mat.handle5());
1956  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
1957  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
1958 
1959 
1960  for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
1961  {
1962  for (vcl_size_t row = 0; row < mat.size1(); ++row)
1963  {
1964  NumericT sum = 0;
1965 
1966  //
1967  // Part 1: Process ELL part
1968  //
1969  for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
1970  {
1971  vcl_size_t offset = row + item_id * mat.internal_size1();
1972  NumericT val = elements[offset];
1973 
1974  if (val < 0 || val > 0) // val != 0 without compiler warnings
1975  {
1976  vcl_size_t col = static_cast<vcl_size_t>(coords[offset]);
1977  if (d_mat.lhs().row_major())
1978  sum += d_mat_wrapper_row(result_col, col) * val;
1979  else
1980  sum += d_mat_wrapper_col(result_col, col) * val;
1981  }
1982  }
1983 
1984  //
1985  // Part 2: Process HYB/CSR part
1986  //
1987  vcl_size_t col_begin = csr_row_buffer[row];
1988  vcl_size_t col_end = csr_row_buffer[row + 1];
1989 
1990  if (d_mat.lhs().row_major())
1991  for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
1992  sum += d_mat_wrapper_row(result_col, static_cast<vcl_size_t>(csr_col_buffer[item_id])) * csr_elements[item_id];
1993  else
1994  for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
1995  sum += d_mat_wrapper_col(result_col, static_cast<vcl_size_t>(csr_col_buffer[item_id])) * csr_elements[item_id];
1996 
1997  if (result.row_major())
1998  result_wrapper_row(row, result_col) = sum;
1999  else
2000  result_wrapper_col(row, result_col) = sum;
2001  }
2002  } // for result_col
2003 }
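And for the transposed-dense variant above, C = A * B^T (hypothetical sizes):

  viennacl::matrix<double> B(3, 4), C(4, 3);   // A is 4x4 as before
  C = viennacl::linalg::prod(A, viennacl::trans(B));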
2004 
2005 
2006 } // namespace host_based
2007 } //namespace linalg
2008 } //namespace viennacl
2009 
2010 
2011 #endif