ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
self_assign.cpp
Go to the documentation of this file.
1 /* =========================================================================
2  Copyright (c) 2010-2015, Institute for Microelectronics,
3  Institute for Analysis and Scientific Computing,
4  TU Wien.
5  Portions of this software are copyright by UChicago Argonne, LLC.
6 
7  -----------------
8  ViennaCL - The Vienna Computing Library
9  -----------------
10 
11  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
12 
13  (A list of authors and contributors can be found in the PDF manual)
14 
15  License: MIT (X11), see file LICENSE in the base directory
16 ============================================================================= */
17 
18 
19 
24 //
25 // *** System
26 //
27 #include <iostream>
28 
29 
30 //
31 // *** ViennaCL
32 //
33 
34 //#define VIENNACL_DEBUG_ALL
35 
36 #include "viennacl/scalar.hpp"
37 #include "viennacl/vector.hpp"
39 #include "viennacl/matrix.hpp"
44 #include "viennacl/ell_matrix.hpp"
46 #include "viennacl/hyb_matrix.hpp"
47 #include "viennacl/linalg/prod.hpp"
49 #include "viennacl/linalg/ilu.hpp"
53 
54 
55 //
56 // -------------------------------------------------------------
57 //
58 template<typename NumericT>
60 {
61  if (std::fabs(s1 - s2) > 0)
62  return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
63  return 0;
64 }
65 
66 template<typename NumericT>
67 NumericT diff(std::vector<NumericT> const & v1, viennacl::vector<NumericT> const & v2)
68 {
69  std::vector<NumericT> v2_cpu(v2.size());
71  viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
72 
73  for (std::size_t i=0;i<v1.size(); ++i)
74  {
75  if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
76  v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
77  else
78  v2_cpu[i] = 0.0;
79 
80  if (v2_cpu[i] > 0.0001)
81  {
82  //std::cout << "Neighbor: " << i-1 << ": " << v1[i-1] << " vs. " << v2_cpu[i-1] << std::endl;
83  std::cout << "Error at entry " << i << ": " << v1[i] << " vs. " << v2[i] << std::endl;
84  //std::cout << "Neighbor: " << i+1 << ": " << v1[i+1] << " vs. " << v2_cpu[i+1] << std::endl;
85  exit(EXIT_FAILURE);
86  }
87  }
88 
89  NumericT inf_norm = 0;
90  for (std::size_t i=0;i<v2_cpu.size(); ++i)
91  inf_norm = std::max<NumericT>(inf_norm, std::fabs(v2_cpu[i]));
92 
93  return inf_norm;
94 }
95 
96 template<typename NumericT>
97 NumericT diff(std::vector<std::vector<NumericT> > const & A1, viennacl::matrix<NumericT> const & A2)
98 {
99  std::vector<NumericT> host_values(A2.internal_size());
100  for (std::size_t i=0; i<A2.size1(); ++i)
101  for (std::size_t j=0; j<A2.size2(); ++j)
102  host_values[i*A2.internal_size2() + j] = A1[i][j];
103 
104  std::vector<NumericT> device_values(A2.internal_size());
105  viennacl::fast_copy(A2, &device_values[0]);
106  viennacl::vector<NumericT> vcl_device_values(A2.internal_size()); // workaround to avoid code duplication
107  viennacl::copy(device_values, vcl_device_values);
108 
109  return diff(host_values, vcl_device_values);
110 }
111 
112 
113 template<typename HostContainerT, typename DeviceContainerT, typename NumericT>
114 void check(HostContainerT const & host_container, DeviceContainerT const & device_container,
115  std::string current_stage, NumericT epsilon)
116 {
117  current_stage.resize(25, ' ');
118  std::cout << "Testing operation: " << current_stage;
119  NumericT rel_error = std::fabs(diff(host_container, device_container));
120 
121  if (rel_error > epsilon)
122  {
123  std::cout << std::endl;
124  std::cout << "# Error at operation: " << current_stage << std::endl;
125  std::cout << " diff: " << rel_error << std::endl;
126  exit(EXIT_FAILURE);
127  }
128  std::cout << "PASS" << std::endl;
129 }
130 
131 
132 struct op_assign
133 {
134  template<typename LHS, typename RHS>
135  static void apply(LHS & lhs, RHS const & rhs) { lhs = rhs; }
136 
137  static std::string str() { return "="; }
138 };
139 
141 {
142  template<typename LHS, typename RHS>
143  static void apply(LHS & lhs, RHS const & rhs) { lhs += rhs; }
144 
145  static std::string str() { return "+="; }
146 };
147 
149 {
150  template<typename LHS, typename RHS>
151  static void apply(LHS & lhs, RHS const & rhs) { lhs -= rhs; }
152 
153  static std::string str() { return "-="; }
154 };
155 
156 
157 // compute C = A * B on host and device and compare results.
158 // Note that the reference uses three distinct matrices A, B, C,
159 // whereas C on the device is the same as either A, B, or both.
160 template<typename OpT, typename NumericT, typename HostMatrixT, typename DeviceMatrixT>
161 void test_gemm(NumericT epsilon,
162  HostMatrixT & host_A, HostMatrixT & host_B, HostMatrixT & host_C,
163  DeviceMatrixT & device_A, std::string name_A,
164  DeviceMatrixT & device_B, std::string name_B,
165  DeviceMatrixT & device_C, bool copy_from_A,
166  bool trans_first, bool trans_second)
167 {
169 
170  for (std::size_t i = 0; i<host_A.size(); ++i)
171  for (std::size_t j = 0; j<host_A[i].size(); ++j)
172  {
173  host_A[i][j] = randomNumber();
174  host_B[i][j] = randomNumber();
175  }
176 
177  viennacl::copy(host_A, device_A);
178  viennacl::copy(host_B, device_B);
179  if (copy_from_A)
180  host_C = host_A;
181  else
182  host_C = host_B;
183 
184  for (std::size_t i = 0; i<host_A.size(); ++i)
185  for (std::size_t j = 0; j<host_A[i].size(); ++j)
186  {
187  NumericT tmp = 0;
188  for (std::size_t k = 0; k<host_A[i].size(); ++k)
189  tmp += (trans_first ? host_A[k][i] : host_A[i][k])
190  * (trans_second ? host_B[j][k] : host_B[k][j]);
191  OpT::apply(host_C[i][j], tmp);
192  }
193 
194  if (trans_first && trans_second)
195  {
196  OpT::apply(device_C, viennacl::linalg::prod(trans(device_A), trans(device_B)));
197  check(host_C, device_C, std::string("A ") + OpT::str() + std::string(" ") + name_A + std::string("^T*") + name_B + std::string("^T"), epsilon);
198  }
199  else if (trans_first && !trans_second)
200  {
201  OpT::apply(device_C, viennacl::linalg::prod(trans(device_A), device_B));
202  check(host_C, device_C, std::string("A ") + OpT::str() + std::string(" ") + name_A + std::string("^T*") + name_B + std::string(""), epsilon);
203  }
204  else if (!trans_first && trans_second)
205  {
206  OpT::apply(device_C, viennacl::linalg::prod(device_A, trans(device_B)));
207  check(host_C, device_C, std::string("A ") + OpT::str() + std::string(" ") + name_A + std::string("*") + name_B + std::string("^T"), epsilon);
208  }
209  else
210  {
211  OpT::apply(device_C, viennacl::linalg::prod(device_A, device_B));
212  check(host_C, device_C, std::string("A ") + OpT::str() + std::string(" ") + name_A + std::string("*") + name_B + std::string(""), epsilon);
213  }
214 }
215 
216 // dispatch routine for all combinations of transpositions:
217 // C = A * B, C = A * B^T, C = A^T * B, C = A^T * B^T
218 template<typename OpT, typename NumericT, typename HostMatrixT, typename DeviceMatrixT>
219 void test_gemm(NumericT epsilon,
220  HostMatrixT & host_A, HostMatrixT & host_B, HostMatrixT & host_C,
221  DeviceMatrixT & device_A, std::string name_A,
222  DeviceMatrixT & device_B, std::string name_B,
223  DeviceMatrixT & device_C, bool copy_from_A)
224 {
225  test_gemm<OpT>(epsilon, host_A, host_B, host_C, device_A, name_A, device_B, name_B, device_C, copy_from_A, false, false);
226  test_gemm<OpT>(epsilon, host_A, host_B, host_C, device_A, name_A, device_B, name_B, device_C, copy_from_A, false, true);
227  test_gemm<OpT>(epsilon, host_A, host_B, host_C, device_A, name_A, device_B, name_B, device_C, copy_from_A, true, false);
228  test_gemm<OpT>(epsilon, host_A, host_B, host_C, device_A, name_A, device_B, name_B, device_C, copy_from_A, true, true);
229 }
230 
231 // The actual testing routine.
232 // Sets of vectors and matrices using STL types and uses these for reference calculations.
233 // ViennaCL operations are carried out as usual and then compared against the reference.
234 template<typename NumericT>
235 int test(NumericT epsilon)
236 {
237  std::size_t N = 142; // should be larger than 128 in order to avoid false negatives due to blocking
238 
240 
241  //
242  // Vector setup and test:
243  //
244 
245  std::vector<NumericT> std_x(N);
246  std::vector<NumericT> std_y(N);
247  std::vector<NumericT> std_z(N);
248 
249  for (std::size_t i=0; i<std_x.size(); ++i)
250  std_x[i] = NumericT(i + 1);
251  for (std::size_t i=0; i<std_y.size(); ++i)
252  std_y[i] = NumericT(i*i + 1);
253  for (std::size_t i=0; i<std_z.size(); ++i)
254  std_z[i] = NumericT(2 * i + 1);
255 
259 
260  viennacl::copy(std_x, vcl_x);
261  viennacl::copy(std_y, vcl_y);
262  viennacl::copy(std_z, vcl_z);
263 
264  // This shouldn't do anything bad:
265  vcl_x = vcl_x;
266  check(std_x, vcl_x, "x = x", epsilon);
267 
268  // This should work, even though we are dealing with the same buffer:
269  std_x[0] = std_x[2]; std_x[1] = std_x[3];
271  check(std_x, vcl_x, "x = x (range)", epsilon);
272 
273  //
274  // Matrix-Vector
275  //
276 
277  std::vector<std::vector<NumericT> > std_A(N, std::vector<NumericT>(N, NumericT(1)));
278  std::vector<std::vector<NumericT> > std_B(N, std::vector<NumericT>(N, NumericT(2)));
279  std::vector<std::vector<NumericT> > std_C(N, std::vector<NumericT>(N, NumericT(3)));
280 
284 
285  viennacl::copy(std_A, vcl_A);
286  viennacl::copy(std_B, vcl_B);
287  viennacl::copy(std_C, vcl_C);
288 
289  // This shouldn't do anything bad:
290  vcl_A = vcl_A;
291  check(std_A, vcl_A, "A = A", epsilon);
292 
293  // This should work, even though we are dealing with the same buffer:
294  std_A[0][0] = std_A[0][2]; std_A[0][1] = std_A[0][3];
296  check(std_A, vcl_A, "A = A (range)", epsilon);
297 
298  // check x <- A * x;
299  for (std::size_t i = 0; i<std_y.size(); ++i)
300  {
301  NumericT val = 0;
302  for (std::size_t j = 0; j<std_x.size(); ++j)
303  val += std_A[i][j] * std_x[j];
304  std_y[i] = val;
305  }
306  vcl_x = viennacl::linalg::prod(vcl_A, vcl_x);
307  check(std_y, vcl_x, "x = A*x", epsilon);
308 
309  typedef unsigned int KeyType;
310  std::vector< std::map<KeyType, NumericT> > std_Asparse(N);
311 
312  for (std::size_t i=0; i<std_Asparse.size(); ++i)
313  {
314  if (i > 0)
315  std_Asparse[i][KeyType(i-1)] = randomNumber();
316  std_Asparse[i][KeyType(i)] = NumericT(1) + randomNumber();
317  if (i < std_Asparse.size() - 1)
318  std_Asparse[i][KeyType(i+1)] = randomNumber();
319  }
320 
321  // Sparse
327 
328  viennacl::copy(std_Asparse, vcl_A_csr);
329  viennacl::copy(std_Asparse, vcl_A_coo);
330  viennacl::copy(std_Asparse, vcl_A_ell);
331  viennacl::copy(std_Asparse, vcl_A_sell);
332  viennacl::copy(std_Asparse, vcl_A_hyb);
333 
334  for (std::size_t i=0; i<std_Asparse.size(); ++i)
335  {
336  NumericT val = 0;
337  for (typename std::map<unsigned int, NumericT>::const_iterator it = std_Asparse[i].begin(); it != std_Asparse[i].end(); ++it)
338  val += it->second * std_x[it->first];
339  std_y[i] = val;
340  }
341 
342  viennacl::copy(std_x, vcl_x);
343  vcl_x = viennacl::linalg::prod(vcl_A_csr, vcl_x);
344  check(std_y, vcl_x, "x = A*x (sparse, csr)", epsilon);
345 
346  viennacl::copy(std_x, vcl_x);
347  vcl_x = viennacl::linalg::prod(vcl_A_coo, vcl_x);
348  check(std_y, vcl_x, "x = A*x (sparse, coo)", epsilon);
349 
350  viennacl::copy(std_x, vcl_x);
351  vcl_x = viennacl::linalg::prod(vcl_A_ell, vcl_x);
352  check(std_y, vcl_x, "x = A*x (sparse, ell)", epsilon);
353 
354  viennacl::copy(std_x, vcl_x);
355  vcl_x = viennacl::linalg::prod(vcl_A_sell, vcl_x);
356  check(std_y, vcl_x, "x = A*x (sparse, sell)", epsilon);
357 
358  viennacl::copy(std_x, vcl_x);
359  vcl_x = viennacl::linalg::prod(vcl_A_hyb, vcl_x);
360  check(std_y, vcl_x, "x = A*x (sparse, hyb)", epsilon);
361  std::cout << std::endl;
362 
363 
364  //
365  // Matrix-Matrix (dense times dense):
366  //
367  test_gemm<op_assign>(epsilon, std_A, std_B, std_C, vcl_A, "A", vcl_B, "B", vcl_A, true);
368  test_gemm<op_assign>(epsilon, std_B, std_A, std_C, vcl_B, "B", vcl_A, "A", vcl_A, false);
369  test_gemm<op_assign>(epsilon, std_A, std_A, std_C, vcl_A, "A", vcl_A, "A", vcl_A, true);
370  std::cout << std::endl;
371 
372  test_gemm<op_plus_assign>(epsilon, std_A, std_B, std_C, vcl_A, "A", vcl_B, "B", vcl_A, true);
373  test_gemm<op_plus_assign>(epsilon, std_B, std_A, std_C, vcl_B, "B", vcl_A, "A", vcl_A, false);
374  test_gemm<op_plus_assign>(epsilon, std_A, std_A, std_C, vcl_A, "A", vcl_A, "A", vcl_A, true);
375  std::cout << std::endl;
376 
377  test_gemm<op_minus_assign>(epsilon, std_A, std_B, std_C, vcl_A, "A", vcl_B, "B", vcl_A, true);
378  test_gemm<op_minus_assign>(epsilon, std_B, std_A, std_C, vcl_B, "B", vcl_A, "A", vcl_A, false);
379  test_gemm<op_minus_assign>(epsilon, std_A, std_A, std_C, vcl_A, "A", vcl_A, "A", vcl_A, true);
380  std::cout << std::endl;
381 
382 
383 
384  //
385  // Matrix-Matrix (sparse times dense)
386  //
387  // A = sparse * A
388  viennacl::copy(std_A, vcl_A);
389  for (std::size_t i = 0; i<std_A.size(); ++i)
390  for (std::size_t j = 0; j<std_A[i].size(); ++j)
391  {
392  NumericT tmp = 0;
393  for (std::size_t k = 0; k<std_A[i].size(); ++k)
394  tmp += std_Asparse[i][KeyType(k)] * std_A[k][j];
395  std_C[i][j] = tmp;
396  }
397 
398  viennacl::copy(std_A, vcl_A);
399  vcl_A = viennacl::linalg::prod(vcl_A_csr, vcl_A);
400  check(std_C, vcl_A, "A = csr*A", epsilon);
401 
402  viennacl::copy(std_A, vcl_A);
403  vcl_A = viennacl::linalg::prod(vcl_A_coo, vcl_A);
404  check(std_C, vcl_A, "A = coo*A", epsilon);
405 
406  viennacl::copy(std_A, vcl_A);
407  vcl_A = viennacl::linalg::prod(vcl_A_ell, vcl_A);
408  check(std_C, vcl_A, "A = ell*A", epsilon);
409 
410  viennacl::copy(std_A, vcl_A);
411  //vcl_A = viennacl::linalg::prod(vcl_A_sell, vcl_A);
412  //check(std_C, vcl_A, "A = sell*A", epsilon);
413 
414  viennacl::copy(std_A, vcl_A);
415  vcl_A = viennacl::linalg::prod(vcl_A_hyb, vcl_A);
416  check(std_C, vcl_A, "A = hyb*A", epsilon);
417 
418  // A = sparse * A^T
419  viennacl::copy(std_A, vcl_A);
420  for (std::size_t i = 0; i<std_A.size(); ++i)
421  for (std::size_t j = 0; j<std_A[i].size(); ++j)
422  {
423  NumericT tmp = 0;
424  for (std::size_t k = 0; k<std_A[i].size(); ++k)
425  tmp += std_Asparse[i][KeyType(k)] * std_A[j][k];
426  std_C[i][j] = tmp;
427  }
428 
429  viennacl::copy(std_A, vcl_A);
430  vcl_A = viennacl::linalg::prod(vcl_A_csr, trans(vcl_A));
431  check(std_C, vcl_A, "A = csr*A^T", epsilon);
432 
433  viennacl::copy(std_A, vcl_A);
434  vcl_A = viennacl::linalg::prod(vcl_A_coo, trans(vcl_A));
435  check(std_C, vcl_A, "A = coo*A^T", epsilon);
436 
437  viennacl::copy(std_A, vcl_A);
438  vcl_A = viennacl::linalg::prod(vcl_A_ell, trans(vcl_A));
439  check(std_C, vcl_A, "A = ell*A^T", epsilon);
440 
441  viennacl::copy(std_A, vcl_A);
442  //vcl_A = viennacl::linalg::prod(vcl_A_sell, trans(vcl_A));
443  //check(std_C, vcl_A, "A = sell*A^T", epsilon);
444 
445  viennacl::copy(std_A, vcl_A);
446  vcl_A = viennacl::linalg::prod(vcl_A_hyb, trans(vcl_A));
447  check(std_C, vcl_A, "A = hyb*A^T", epsilon);
448 
449  return EXIT_SUCCESS;
450 }
451 
452 
453 //
454 // -------------------------------------------------------------
455 //
456 int main()
457 {
458  std::cout << std::endl;
459  std::cout << "----------------------------------------------" << std::endl;
460  std::cout << "----------------------------------------------" << std::endl;
461  std::cout << "## Test :: Self-Assignment" << std::endl;
462  std::cout << "----------------------------------------------" << std::endl;
463  std::cout << "----------------------------------------------" << std::endl;
464  std::cout << std::endl;
465 
466  int retval = EXIT_SUCCESS;
467 
468  std::cout << std::endl;
469  std::cout << "----------------------------------------------" << std::endl;
470  std::cout << std::endl;
471  {
472  typedef float NumericT;
473  NumericT epsilon = static_cast<NumericT>(1E-4);
474  std::cout << "# Testing setup:" << std::endl;
475  std::cout << " eps: " << epsilon << std::endl;
476  std::cout << " numeric: float" << std::endl;
477  retval = test<NumericT>(epsilon);
478  if ( retval == EXIT_SUCCESS )
479  std::cout << "# Test passed" << std::endl;
480  else
481  return retval;
482  }
483  std::cout << std::endl;
484  std::cout << "----------------------------------------------" << std::endl;
485  std::cout << std::endl;
486 
487  // Note: No need for double precision check, self-assignments are handled in a numeric-type agnostic manner.
488 
489  std::cout << std::endl;
490  std::cout << "------- Test completed --------" << std::endl;
491  std::cout << std::endl;
492 
493  return retval;
494 }
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Definition: forwards.h:406
void test_gemm(NumericT epsilon, HostMatrixT &host_A, HostMatrixT &host_B, HostMatrixT &host_C, DeviceMatrixT &device_A, std::string name_A, DeviceMatrixT &device_B, std::string name_B, DeviceMatrixT &device_C, bool copy_from_A, bool trans_first, bool trans_second)
A reader and writer for the matrix market format is implemented here.
NumericT diff(NumericT const &s1, viennacl::scalar< NumericT > const &s2)
Definition: self_assign.cpp:59
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
Definition: forwards.h:227
Generic interface for the l^2-norm. See viennacl/linalg/vector_operations.hpp for implementations...
int main()
size_type internal_size() const
Returns the total amount of allocated memory in multiples of sizeof(NumericT)
Definition: matrix_def.hpp:242
void trans(matrix_expression< const matrix_base< NumericT, SizeT, DistanceT >, const matrix_base< NumericT, SizeT, DistanceT >, op_trans > const &proxy, matrix_base< NumericT > &temp_trans)
Generic interface for matrix-vector and matrix-matrix products. See viennacl/linalg/vector_operations...
Implementation of the dense matrix class.
void finish()
Synchronizes the execution. finish() will only return after all compute kernels (CUDA, OpenCL) have completed.
Definition: memory.hpp:54
viennacl::scalar< int > s2
viennacl::scalar< float > s1
T max(const T &lhs, const T &rhs)
Maximum.
Definition: util.hpp:59
Implementation of the coordinate_matrix class.
float NumericT
Definition: bisect.cpp:40
static void apply(LHS &lhs, RHS const &rhs)
viennacl::vector< float > v1
Implementation of the hyb_matrix class.
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
Definition: prod.hpp:102
Random number generator for returning uniformly distributed values in the closed interval [0...
Definition: random.hpp:44
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
iterator begin()
Returns an iterator pointing to the beginning of the vector (STL like)
Definition: vector.hpp:841
Implementations of incomplete factorization preconditioners. Convenience header file.
Sparse matrix class using the sliced ELLPACK with parameters C, .
Definition: forwards.h:403
Implementation of the compressed_matrix class.
Implementation of the sliced_ell_matrix class.
static std::string str()
int test(NumericT epsilon)
matrix_range< MatrixType > project(MatrixType const &A, viennacl::range const &r1, viennacl::range const &r2)
size_type size2() const
Returns the number of columns.
Definition: matrix_def.hpp:226
Implementation of the ell_matrix class.
size_type size1() const
Returns the number of rows.
Definition: matrix_def.hpp:224
Proxy classes for vectors.
static std::string str()
Implementation of the compressed_compressed_matrix class (CSR format with a relatively small number o...
Proxy classes for matrices.
viennacl::vector< int > v2
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
static void apply(LHS &lhs, RHS const &rhs)
void copy(std::vector< NumericT > &cpu_vec, circulant_matrix< NumericT, AlignmentV > &gpu_mat)
Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU) ...
A small collection of sequential random number generators.
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
static std::string str()
A range class that refers to an interval [start, stop), where 'start' is included, and 'stop' is excluded.
Definition: forwards.h:424
static void apply(LHS &lhs, RHS const &rhs)
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:240
iterator end()
Returns an iterator pointing to the end of the vector (STL like)
Definition: vector.hpp:848
void check(HostContainerT const &host_container, DeviceContainerT const &device_container, std::string current_stage, NumericT epsilon)
Common routines used within ILU-type preconditioners.
Implementation of the ViennaCL scalar class.
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...
void fast_copy(const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_begin, const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_end, CPU_ITERATOR cpu_begin)