ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
sparse.cpp
Go to the documentation of this file.
1 /* =========================================================================
2  Copyright (c) 2010-2015, Institute for Microelectronics,
3  Institute for Analysis and Scientific Computing,
4  TU Wien.
5  Portions of this software are copyright by UChicago Argonne, LLC.
6 
7  -----------------
8  ViennaCL - The Vienna Computing Library
9  -----------------
10 
11  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
12 
13  (A list of authors and contributors can be found in the PDF manual)
14 
15  License: MIT (X11), see file LICENSE in the base directory
16 ============================================================================= */
17 
18 
19 /*
20 * Benchmark: Sparse matrix operations, i.e. matrix-vector products (sparse.cpp and sparse.cu are identical, the latter being required for compilation using CUDA nvcc)
21 *
22 */
23 
24 //#define VIENNACL_BUILD_INFO
25 #ifndef NDEBUG
26  #define NDEBUG
27 #endif
28 
#define VIENNACL_WITH_UBLAS 1

#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/vector_proxy.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/operation_sparse.hpp>
#include <boost/numeric/ublas/lu.hpp>


#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/coordinate_matrix.hpp"
#include "viennacl/ell_matrix.hpp"
#include "viennacl/hyb_matrix.hpp"
#include "viennacl/sliced_ell_matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/linalg/norm_2.hpp"
#include "viennacl/io/matrix_market.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/tools/timer.hpp"


#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
55 
56 
57 #define BENCHMARK_RUNS 10
58 
59 
/** @brief Prints the sustained GFLOP rate.
*
* @param num_ops    Total number of floating point operations performed
* @param exec_time  Wall-clock time for those operations, in seconds (must be nonzero)
*/
inline void printOps(double num_ops, double exec_time)
{
  // 1.0e9 FLOPs per GFLOP (the original spelled this as 1000000 * ... * 1000)
  std::cout << "GFLOPs: " << num_ops / (exec_time * 1.0e9) << std::endl;
}
64 
65 
66 template<typename ScalarType>
68 {
70  double exec_time;
71 
72  ScalarType std_factor1 = ScalarType(3.1415);
73  ScalarType std_factor2 = ScalarType(42.0);
74  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
75  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
76 
77  boost::numeric::ublas::vector<ScalarType> ublas_vec1;
78  boost::numeric::ublas::vector<ScalarType> ublas_vec2;
79 
80  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
81  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
82  {
83  std::cout << "Error reading Matrix file" << std::endl;
84  return 0;
85  }
86  //unsigned int cg_mat_size = cg_mat.size();
87  std::cout << "done reading matrix" << std::endl;
88 
89  ublas_vec1 = boost::numeric::ublas::scalar_vector<ScalarType>(ublas_matrix.size1(), ScalarType(1.0));
90  ublas_vec2 = ublas_vec1;
91 
92  viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
93  viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
94  viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
95 
96  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
97 
98  viennacl::ell_matrix<ScalarType, 1> vcl_ell_matrix_1;
99  viennacl::hyb_matrix<ScalarType, 1> vcl_hyb_matrix_1;
100  viennacl::sliced_ell_matrix<ScalarType> vcl_sliced_ell_matrix_1;
101 
102  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
103  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size());
104 
105  //cpu to gpu:
106  viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
107  #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
108  viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
109  viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
110  #endif
111  viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
112  viennacl::copy(ublas_matrix, vcl_ell_matrix_1);
113  viennacl::copy(ublas_matrix, vcl_hyb_matrix_1);
114  viennacl::copy(ublas_matrix, vcl_sliced_ell_matrix_1);
115  viennacl::copy(ublas_vec1, vcl_vec1);
116  viennacl::copy(ublas_vec2, vcl_vec2);
117 
118 
120 
121  std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
122  timer.start();
123  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
124  {
125  //ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
126  boost::numeric::ublas::axpy_prod(ublas_matrix, ublas_vec2, ublas_vec1, true);
127  }
128  exec_time = timer.get();
129  std::cout << "CPU time: " << exec_time << std::endl;
130  std::cout << "CPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
131  std::cout << ublas_vec1[0] << std::endl;
132 
133 
134  std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
135 
136 
137  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
138  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
139  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
140  //std_result = 0.0;
141 
143  timer.start();
144  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
145  {
146  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
147  }
149  exec_time = timer.get();
150  std::cout << "GPU time align1: " << exec_time << std::endl;
151  std::cout << "GPU align1 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
152  std::cout << vcl_vec1[0] << std::endl;
153 
154  std::cout << "Testing triangular solves: compressed_matrix" << std::endl;
155 
156  viennacl::copy(ublas_vec1, vcl_vec1);
157  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
158  viennacl::copy(ublas_vec1, vcl_vec1);
159  std::cout << "ublas..." << std::endl;
160  timer.start();
161  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), ublas_vec1, boost::numeric::ublas::unit_lower_tag());
162  std::cout << "Time elapsed: " << timer.get() << std::endl;
163  std::cout << "ViennaCL..." << std::endl;
165  timer.start();
166  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
168  std::cout << "Time elapsed: " << timer.get() << std::endl;
169 
170  ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
171 
173  timer.start();
174  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
175  {
176  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
177  }
179  exec_time = timer.get();
180  std::cout << "GPU time align4: " << exec_time << std::endl;
181  std::cout << "GPU align4 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
182  std::cout << vcl_vec1[0] << std::endl;
183 
185  timer.start();
186  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
187  {
188  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
189  }
191  exec_time = timer.get();
192  std::cout << "GPU time align8: " << exec_time << std::endl;
193  std::cout << "GPU align8 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
194  std::cout << vcl_vec1[0] << std::endl;
195 
196 
197  std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
198  vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
200 
201  viennacl::copy(vcl_vec1, ublas_vec2);
202  long err_cnt = 0;
203  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
204  {
205  if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
206  {
207  std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
208  ++err_cnt;
209  if (err_cnt > 5)
210  break;
211  }
212  }
213 
215  timer.start();
216  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
217  {
218  vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
219  }
221  exec_time = timer.get();
222  std::cout << "GPU time: " << exec_time << std::endl;
223  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
224  std::cout << vcl_vec1[0] << std::endl;
225 
226 
227  std::cout << "------- Matrix-Vector product with ell_matrix ----------" << std::endl;
228  vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2); //startup calculation
230 
231  viennacl::copy(vcl_vec1, ublas_vec2);
232  err_cnt = 0;
233  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
234  {
235  if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
236  {
237  std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
238  ++err_cnt;
239  if (err_cnt > 5)
240  break;
241  }
242  }
243 
245  timer.start();
246  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
247  {
248  vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2);
249  }
251  exec_time = timer.get();
252  std::cout << "GPU time: " << exec_time << std::endl;
253  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
254  std::cout << vcl_vec1[0] << std::endl;
255 
256 
257  std::cout << "------- Matrix-Vector product with hyb_matrix ----------" << std::endl;
258  vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2); //startup calculation
260 
261  viennacl::copy(vcl_vec1, ublas_vec2);
262  err_cnt = 0;
263  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
264  {
265  if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
266  {
267  std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
268  ++err_cnt;
269  if (err_cnt > 5)
270  break;
271  }
272  }
273 
275  timer.start();
276  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
277  {
278  vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2);
279  }
281  exec_time = timer.get();
282  std::cout << "GPU time: " << exec_time << std::endl;
283  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
284  std::cout << vcl_vec1[0] << std::endl;
285 
286 
287  std::cout << "------- Matrix-Vector product with sliced_ell_matrix ----------" << std::endl;
288  vcl_vec1 = viennacl::linalg::prod(vcl_sliced_ell_matrix_1, vcl_vec2); //startup calculation
290 
291  viennacl::copy(vcl_vec1, ublas_vec2);
292  err_cnt = 0;
293  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
294  {
295  if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
296  {
297  std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
298  ++err_cnt;
299  if (err_cnt > 5)
300  break;
301  }
302  }
303 
305  timer.start();
306  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
307  {
308  vcl_vec1 = viennacl::linalg::prod(vcl_sliced_ell_matrix_1, vcl_vec2);
309  }
311  exec_time = timer.get();
312  std::cout << "GPU time: " << exec_time << std::endl;
313  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
314  std::cout << vcl_vec1[0] << std::endl;
315 
316  return EXIT_SUCCESS;
317 }
318 
319 
320 int main()
321 {
322  std::cout << std::endl;
323  std::cout << "----------------------------------------------" << std::endl;
324  std::cout << " Device Info" << std::endl;
325  std::cout << "----------------------------------------------" << std::endl;
326 
327 #ifdef VIENNACL_WITH_OPENCL
328  std::cout << viennacl::ocl::current_device().info() << std::endl;
329 #endif
330  std::cout << std::endl;
331  std::cout << "----------------------------------------------" << std::endl;
332  std::cout << "----------------------------------------------" << std::endl;
333  std::cout << "## Benchmark :: Sparse" << std::endl;
334  std::cout << "----------------------------------------------" << std::endl;
335  std::cout << std::endl;
336  std::cout << " -------------------------------" << std::endl;
337  std::cout << " # benchmarking single-precision" << std::endl;
338  std::cout << " -------------------------------" << std::endl;
339  run_benchmark<float>();
340 #ifdef VIENNACL_WITH_OPENCL
342 #endif
343  {
344  std::cout << std::endl;
345  std::cout << " -------------------------------" << std::endl;
346  std::cout << " # benchmarking double-precision" << std::endl;
347  std::cout << " -------------------------------" << std::endl;
348  run_benchmark<double>();
349  }
350  return 0;
351 }
352 
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Definition: forwards.h:406
Simple timer class based on gettimeofday (POSIX) or QueryPerformanceCounter (Windows).
Definition: timer.hpp:90
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
void inplace_solve(const matrix_base< NumericT > &A, matrix_base< NumericT > &B, SolverTagT)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
A reader and writer for the matrix market format is implemented here.
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
Definition: forwards.h:227
Generic interface for the l^2-norm. See viennacl/linalg/vector_operations.hpp for implementations...
int run_benchmark()
Definition: sparse.cpp:67
void trans(matrix_expression< const matrix_base< NumericT, SizeT, DistanceT >, const matrix_base< NumericT, SizeT, DistanceT >, op_trans > const &proxy, matrix_base< NumericT > &temp_trans)
Generic interface for matrix-vector and matrix-matrix products. See viennacl/linalg/vector_operations...
int main()
Definition: sparse.cpp:943
void finish()
Synchronizes the execution. finish() will only return after all compute kernels (CUDA, OpenCL) have completed.
Definition: memory.hpp:54
T max(const T &lhs, const T &rhs)
Maximum.
Definition: util.hpp:59
viennacl::ocl::device const & current_device()
Convenience function for returning the active device in the current context.
Definition: backend.hpp:351
Implementation of the coordinate_matrix class.
std::string info(vcl_size_t indent=0, char indent_char= ' ') const
Returns an info string with a few properties of the device. Use full_info() to get all details...
Definition: device.hpp:995
Implementation of the hyb_matrix class.
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
Definition: prod.hpp:102
#define BENCHMARK_RUNS
Definition: sparse.cpp:57
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
void printOps(double num_ops, double exec_time)
Definition: sparse.cpp:60
Implementations of incomplete factorization preconditioners. Convenience header file.
Sparse matrix class using the sliced ELLPACK format with parameters C and sigma.
Definition: forwards.h:403
Implementation of the compressed_matrix class.
Implementation of the sliced_ell_matrix class.
bool double_support() const
ViennaCL convenience function: Returns true if the device supports double precision.
Definition: device.hpp:956
Implementation of the ell_matrix class.
A simple, yet (mostly) sufficiently accurate timer for benchmarking and profiling.
void prod(std::vector< std::map< IndexT, NumericT > > const &stl_A, std::vector< std::map< IndexT, NumericT > > const &stl_B, std::vector< std::map< IndexT, NumericT > > &stl_C)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
void copy(std::vector< NumericT > &cpu_vec, circulant_matrix< NumericT, AlignmentV > &gpu_mat)
Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU) ...
float ScalarType
Definition: fft_1d.cpp:42
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:859
A sparse square matrix in compressed sparse rows format.
double get() const
Definition: timer.hpp:104
long read_matrix_market_file(MatrixT &mat, const char *file, long index_base=1)
Reads a sparse matrix from a file (MatrixMarket format)
Implementation of the ViennaCL scalar class.
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...