#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */
#include <sstream>
#include <string>

#include "viennacl/tools/tools.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/ocl/utils.hpp"
#include "viennacl/linalg/opencl/common.hpp"

/** @file viennacl/linalg/opencl/kernels/compressed_matrix.hpp
 *  @brief OpenCL kernel source generators for operations on compressed_matrix (sparse CSR format) */

namespace viennacl
{
namespace linalg
{
namespace opencl
{
namespace kernels
{

template<typename StringT>
void generate_compressed_matrix_block_trans_lu_backward(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void block_trans_lu_backward( \n");
  source.append(" __global const unsigned int * row_jumper_U, \n"); //U part (note that U is transposed in memory)
  source.append(" __global const unsigned int * column_indices_U, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements_U, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_U, \n");
  source.append(" __global const unsigned int * block_offsets, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
  source.append(" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
  source.append(" unsigned int row_start; \n");
  source.append(" unsigned int row_stop; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = 0; \n");

  source.append(" if (col_start >= col_stop) \n");
  source.append(" return; \n");

  //backward elimination, using U and diagonal_U
  source.append(" for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
  source.append(" { \n");
  source.append(" unsigned int col = (col_stop - iter) - 1; \n");
  source.append(" result_entry = result[col] / diagonal_U[col]; \n");
  source.append(" row_start = row_jumper_U[col]; \n");
  source.append(" row_stop = row_jumper_U[col + 1]; \n");
  source.append(" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
  source.append(" result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  //divide result vector by diagonal:
  source.append(" for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
  source.append(" result[col] /= diagonal_U[col]; \n");
  source.append("} \n");
}
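
// Per work-group, the kernel generated above performs backward substitution on
// one diagonal block [col_start, col_stop) taken from block_offsets: for each
// column (processed back to front) it forms result[col] / diagonal_U[col] on
// the fly, lets all threads eliminate that entry from the remaining rows, and
// defers the actual division by the diagonal to the final loop. Since U is
// transposed in memory, row_jumper_U[col] enumerates exactly the stored
// entries of column col, which is what makes the scatter update parallel.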

template<typename StringT>
void generate_compressed_matrix_block_trans_unit_lu_forward(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void block_trans_unit_lu_forward( \n");
  source.append(" __global const unsigned int * row_jumper_L, \n"); //L part (note that L is transposed in memory)
  source.append(" __global const unsigned int * column_indices_L, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements_L, \n");
  source.append(" __global const unsigned int * block_offsets, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
  source.append(" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
  source.append(" unsigned int row_start = row_jumper_L[col_start]; \n");
  source.append(" unsigned int row_stop; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = 0; \n");

  source.append(" if (col_start >= col_stop) \n");
  source.append(" return; \n");

  //forward elimination, using L:
  source.append(" for (unsigned int col = col_start; col < col_stop; ++col) \n");
  source.append(" { \n");
  source.append(" result_entry = result[col]; \n");
  source.append(" row_stop = row_jumper_L[col + 1]; \n");
  source.append(" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
  source.append(" result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
  source.append(" row_start = row_stop; \n"); //for next iteration (avoid unnecessary loads from GPU RAM)
  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append("} \n");
}
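
// Usage sketch (illustrative only, not a fixed ViennaCL entry point): these
// generators just append OpenCL C to a string; compiling is a separate step,
// typically done once per context. The program name "example_ilu_kernels" and
// the double-only path are placeholders chosen for this sketch:
//
//   std::string src;
//   src.reserve(8192);
//   viennacl::ocl::context & ctx = viennacl::ocl::current_context();
//   viennacl::ocl::append_double_precision_pragma<double>(ctx, src); // fp64 pragma
//   generate_compressed_matrix_block_trans_unit_lu_forward(src, "double");
//   generate_compressed_matrix_block_trans_lu_backward(src, "double");
//   ctx.add_program(src, "example_ilu_kernels");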

namespace detail
{
  template<typename StringT>
  void generate_compressed_matrix_dense_matrix_mult(StringT & source, std::string const & numeric_string,
                                                    bool B_transposed, bool B_row_major, bool C_row_major)
  {
    source.append("__kernel void ");
    source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
    source.append("( \n");
    source.append(" __global const unsigned int * sp_mat_row_indices, \n");
    source.append(" __global const unsigned int * sp_mat_col_indices, \n");
    source.append(" __global const "); source.append(numeric_string); source.append(" * sp_mat_elements, \n");
    source.append(" __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
    source.append(" unsigned int d_mat_row_start, \n");
    source.append(" unsigned int d_mat_col_start, \n");
    source.append(" unsigned int d_mat_row_inc, \n");
    source.append(" unsigned int d_mat_col_inc, \n");
    source.append(" unsigned int d_mat_row_size, \n");
    source.append(" unsigned int d_mat_col_size, \n");
    source.append(" unsigned int d_mat_internal_rows, \n");
    source.append(" unsigned int d_mat_internal_cols, \n");
    source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
    source.append(" unsigned int result_row_start, \n");
    source.append(" unsigned int result_col_start, \n");
    source.append(" unsigned int result_row_inc, \n");
    source.append(" unsigned int result_col_inc, \n");
    source.append(" unsigned int result_row_size, \n");
    source.append(" unsigned int result_col_size, \n");
    source.append(" unsigned int result_internal_rows, \n");
    source.append(" unsigned int result_internal_cols) { \n");

    // split work rows (sparse matrix rows) to thread groups
    source.append(" for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");

    source.append(" unsigned int row_start = sp_mat_row_indices[row]; \n");
    source.append(" unsigned int row_end = sp_mat_row_indices[row+1]; \n");

    // split result cols between threads in a thread group
    source.append(" for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");

    source.append(" "); source.append(numeric_string); source.append(" r = 0; \n");

    source.append(" for (unsigned int k = row_start; k < row_end; k ++) { \n");

    source.append(" unsigned int j = sp_mat_col_indices[k]; \n");
    source.append(" "); source.append(numeric_string); source.append(" x = sp_mat_elements[k]; \n");

    source.append(" "); source.append(numeric_string);
    if (B_transposed && B_row_major)
      source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + j * d_mat_col_inc ]; \n");
    else if (B_transposed && !B_row_major)
      source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) + (d_mat_col_start + j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
    else if (!B_transposed && B_row_major)
      source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
    else
      source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
    source.append(" r += x * y; \n");
    source.append(" } \n");

    if (C_row_major)
      source.append(" result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
    else
      source.append(" result[ (result_row_start + row * result_row_inc) + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
    source.append(" } \n");
    source.append(" } \n");

    source.append("} \n");

  }
}

template<typename StringT>
void generate_compressed_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
{
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, false);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, true);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, true, false);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, true, true);

  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, false);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, true);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, true, false);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, true, true);
}
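
// The dispatcher above emits all eight kernel variants of C = A * B for sparse
// A and dense B: B plain or transposed, B row- or column-major, and C row- or
// column-major. Each variant gets its name from
// viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(), so the
// host can pick the kernel matching the runtime layouts out of one program.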

template<typename StringT>
void generate_compressed_matrix_jacobi(StringT & source, std::string const & numeric_string)
{

  source.append(" __kernel void jacobi( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" "); source.append(numeric_string); source.append(" weight, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * old_result, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * new_result, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * rhs, \n");
  source.append(" unsigned int size) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum, diag=1; \n");
  source.append(" int col; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
  source.append(" { \n");
  source.append(" sum = 0; \n");
  source.append(" for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
  source.append(" { \n");
  source.append(" col = column_indices[j]; \n");
  source.append(" if (i == col) \n");
  source.append(" diag = elements[j]; \n");
  source.append(" else \n");
  source.append(" sum += elements[j] * old_result[col]; \n");
  source.append(" } \n");
  source.append(" new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
  source.append(" } \n");
  source.append(" } \n");

}
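
// Row i of the generated jacobi kernel implements one damped Jacobi sweep,
//
//   new_result[i] = weight * (rhs[i] - sum_{j != i} A(i,j) * old_result[j]) / A(i,i)
//                 + (1 - weight) * old_result[i],
//
// with rows distributed cyclically over all global work items, so the kernel
// is valid for any global work size.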


template<typename StringT>
void generate_compressed_matrix_lu_backward(StringT & source, std::string const & numeric_string)
{
  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int col_index_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = size-1; \n");
  source.append(" unsigned int row_at_window_start = size-1; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
  source.append(" "); source.append(numeric_string); source.append(" diagonal_entry = 0; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[size-1]; \n");

  source.append(" unsigned int i = loop_end + get_local_id(0); \n");
  source.append(" while (1) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data from back to front:
  source.append(" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
  source.append(" { \n");
  source.append(" unsigned int k = (get_local_size(0) - k2) - 1; \n");

  source.append(" if (i+k >= nnz) \n");
  source.append(" continue; \n");

  source.append(" if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] > current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
  source.append(" else if (col_index_buffer[k] == current_row) \n");
  source.append(" diagonal_entry = element_buffer[k]; \n");

  source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry / diagonal_entry; \n");
  source.append(" if (current_row > 0) //load next row's data \n");
  source.append(" { \n");
  source.append(" --current_row; \n");
  source.append(" next_row = row_indices[current_row]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");


  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");

  source.append(" if (i < get_local_size(0)) \n");
  source.append(" break; \n");

  source.append(" i -= get_local_size(0); \n");
  source.append(" } \n"); //for i
  source.append("} \n");

}
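
// Note on the structure of lu_backward (shared by lu_forward and the unit_*
// variants below): the kernel is written for a single work-group. All threads
// cooperatively stage a window of get_local_size(0) nonzeros (values, column
// indices, and the referenced vector entries) in local memory with coalesced
// reads; thread 0 then performs the inherently sequential substitution on the
// buffered window. The barriers separate the parallel load phase from the
// serial update phase.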

template<typename StringT>
void generate_compressed_matrix_lu_forward(StringT & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int col_index_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = 0; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
  source.append(" "); source.append(numeric_string); source.append(" diagonal_entry; \n");
  source.append(" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[1]; \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data:
  source.append(" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
  source.append(" { \n");
  source.append(" if (current_row < size && i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry / diagonal_entry; \n");
  source.append(" ++current_row; \n");
  source.append(" if (current_row < size) \n"); //load next row's data
  source.append(" { \n");
  source.append(" next_row = row_indices[current_row+1]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
  source.append(" { \n");
  source.append(" if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] < current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
  source.append(" } \n");
  source.append(" else if (col_index_buffer[k] == current_row) \n");
  source.append(" diagonal_entry = element_buffer[k]; \n");

  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n"); //for i
  source.append("} \n");

}

template<typename StringT>
void generate_compressed_matrix_row_info_extractor(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void row_info_extractor( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int option \n");
  source.append(" ) \n");
  source.append("{ \n");
  source.append(" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" value = 0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");

  source.append(" switch (option) \n");
  source.append(" { \n");
  source.append(" case 0: \n"); //inf-norm
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" value = max(value, fabs(elements[i])); \n");
  source.append(" break; \n");

  source.append(" case 1: \n"); //1-norm
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" value += fabs(elements[i]); \n");
  source.append(" break; \n");

  source.append(" case 2: \n"); //2-norm
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" value += elements[i] * elements[i]; \n");
  source.append(" value = sqrt(value); \n");
  source.append(" break; \n");

  source.append(" case 3: \n"); //diagonal entry
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" { \n");
  source.append(" if (column_indices[i] == row) \n");
  source.append(" { \n");
  source.append(" value = elements[i]; \n");
  source.append(" break; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" break; \n");

  source.append(" default: \n");
  source.append(" break; \n");
  source.append(" } \n");
  source.append(" result[row] = value; \n");
  source.append(" } \n");
  source.append("} \n");

}
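
// Summary of the 'option' argument of row_info_extractor:
//
//   0 ... row-wise infinity norm:  max_j |a_ij|
//   1 ... row-wise 1-norm:         sum_j |a_ij|
//   2 ... row-wise 2-norm:         sqrt(sum_j a_ij^2)
//   3 ... diagonal entry a_ii (stays 0 if the row stores no diagonal)
//
// Any other code leaves 'value' at 0 for the row.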

template<typename StringT>
void generate_compressed_matrix_trans_lu_backward(StringT & source, std::string const & numeric_string)
{

  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format (U is transposed in memory)
  source.append("__kernel void trans_lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = size; \n");
  source.append(" unsigned int row_at_window_end; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" unsigned int i = (nnz - i2) - 1; \n");
  source.append(" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" if (i2 < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_dec = 0; \n");
  source.append(" while (row_index_lookahead[row_index_dec] > i) \n");
  source.append(" ++row_index_dec; \n");
  source.append(" row_index = row_at_window_start - row_index_dec; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = 0; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //backward elimination
  source.append(" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
  source.append(" { \n");
  source.append(" unsigned int row = row_at_window_start - row2; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

  source.append(" if ( (row_index == row) && (col_index < row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");

  // final step: Divide vector by diagonal entries:
  source.append(" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
  source.append(" vector[i] /= diagonal_entries[i]; \n");
  source.append("} \n");

}
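
// How the trans_* kernels recover the row of each nonzero: CSR stores only row
// start offsets, so for a nonzero index i each thread scans the
// row_index_lookahead window (one staged row_indices entry per thread) until
// it passes i. This short linear scan within the current window of rows
// replaces a per-thread binary search over the full row_indices array.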

template<typename StringT>
void generate_compressed_matrix_trans_lu_forward(StringT & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format (L is transposed in memory)
  source.append("__kernel void trans_lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" unsigned int row_at_window_end = 0; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" col_index = (i < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_inc = 0; \n");
  source.append(" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
  source.append(" ++row_index_inc; \n");
  source.append(" row_index = row_at_window_start + row_index_inc; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = size - 1; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //forward elimination
  source.append(" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

  source.append(" if ( (row_index == row) && (col_index > row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");

  // final step: Divide vector by diagonal entries:
  source.append(" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
  source.append(" vector[i] /= diagonal_entries[i]; \n");
  source.append("} \n");

}

template<typename StringT>
void generate_compressed_matrix_trans_unit_lu_backward(StringT & source, std::string const & numeric_string)
{

  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format (U is transposed in memory and has unit diagonal)
  source.append("__kernel void trans_unit_lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = size; \n");
  source.append(" unsigned int row_at_window_end; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" unsigned int i = (nnz - i2) - 1; \n");
  source.append(" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" if (i2 < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_dec = 0; \n");
  source.append(" while (row_index_lookahead[row_index_dec] > i) \n");
  source.append(" ++row_index_dec; \n");
  source.append(" row_index = row_at_window_start - row_index_dec; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = 0; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //backward elimination
  source.append(" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
  source.append(" { \n");
  source.append(" unsigned int row = row_at_window_start - row2; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append(" if ( (row_index == row) && (col_index < row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");
  source.append("} \n");

}


template<typename StringT>
void generate_compressed_matrix_trans_unit_lu_forward(StringT & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format (L is transposed in memory and has unit diagonal)
  source.append("__kernel void trans_unit_lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" unsigned int row_at_window_end = 0; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" col_index = (i < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_inc = 0; \n");
  source.append(" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
  source.append(" ++row_index_inc; \n");
  source.append(" row_index = row_at_window_start + row_index_inc; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = size - 1; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //forward elimination
  source.append(" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append(" if ( (row_index == row) && (col_index > row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");
  source.append("} \n");

}

template<typename StringT>
void generate_compressed_matrix_trans_unit_lu_forward_slow(StringT & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format (L is transposed in memory and has unit diagonal)
  source.append("__kernel void trans_unit_lu_forward_slow( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" for (unsigned int row = 0; row < size; ++row) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append(" unsigned int row_start = row_indices[row]; \n");
  source.append(" unsigned int row_stop = row_indices[row + 1]; \n");
  source.append(" for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" unsigned int col_index = column_indices[entry_index]; \n");
  source.append(" if (col_index > row) \n");
  source.append(" vector[col_index] -= result_entry * elements[entry_index]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");
  source.append("} \n");

}
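
// trans_unit_lu_forward_slow is the straightforward variant of
// trans_unit_lu_forward: it walks the rows serially and only parallelizes the
// scatter updates within each row, at the cost of one barrier per row. Its
// simplicity makes it a plausible reference for validating the windowed
// kernel above, though the "_slow" suffix is the only hint the file gives.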

template<typename StringT>
void generate_compressed_matrix_unit_lu_backward(StringT & source, std::string const & numeric_string)
{

  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void unit_lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int col_index_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = size-1; \n");
  source.append(" unsigned int row_at_window_start = size-1; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[size-1]; \n");

  source.append(" unsigned int i = loop_end + get_local_id(0); \n");
  source.append(" while (1) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data from back to front:
  source.append(" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
  source.append(" { \n");
  source.append(" unsigned int k = (get_local_size(0) - k2) - 1; \n");

  source.append(" if (i+k >= nnz) \n");
  source.append(" continue; \n");

  source.append(" if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] > current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");

  source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry; \n");
  source.append(" if (current_row > 0) \n"); //load next row's data
  source.append(" { \n");
  source.append(" --current_row; \n");
  source.append(" next_row = row_indices[current_row]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");


  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");

  source.append(" if (i < get_local_size(0)) \n");
  source.append(" break; \n");

  source.append(" i -= get_local_size(0); \n");
  source.append(" } \n"); //for i
  source.append("} \n");

}

template<typename StringT>
void generate_compressed_matrix_unit_lu_forward(StringT & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void unit_lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int col_index_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = 0; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
  source.append(" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[1]; \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data:
  source.append(" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
  source.append(" { \n");
  source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry; \n");
  source.append(" ++current_row; \n");
  source.append(" if (current_row < size) //load next row's data \n");
  source.append(" { \n");
  source.append(" next_row = row_indices[current_row+1]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
  source.append(" { \n");
  source.append(" if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] < current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
  source.append(" } \n");

  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } //for i \n");
  source.append("} \n");

}

template<typename StringT>
void generate_compressed_matrix_vec_mul_nvidia(StringT & source, std::string const & numeric_string, unsigned int subwarp_size)
{
  std::stringstream ss;
  ss << subwarp_size;

  source.append("__kernel void vec_mul_nvidia( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * row_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int num_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append(" uint4 layout_x, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" uint4 layout_result) \n");
  source.append("{ \n");
  source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");

  source.append(" const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
  source.append(" const unsigned int block_increment = get_local_size(0) * ((layout_result.z - 1) / (get_global_size(0)) + 1); \n");
  source.append(" const unsigned int block_start = get_group_id(0) * block_increment; \n");
  source.append(" const unsigned int block_stop = min(block_start + block_increment, layout_result.z); \n");

  source.append(" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() + "; \n");
  source.append(" row < block_stop; \n");
  source.append(" row += get_local_size(0) / " + ss.str() + ") \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
  source.append(" dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");

  source.append(" shared_elements[get_local_id(0)] = dot_prod; \n");
  source.append(" #pragma unroll \n");
  source.append(" for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");

  source.append(" if (id_in_row == 0) \n");
  source.append(" result[row * layout_result.y + layout_result.x] = shared_elements[get_local_id(0)]; \n");
  source.append(" } \n");
  source.append("} \n");

}
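
// vec_mul_nvidia assigns 'subwarp_size' consecutive threads to each row and
// merges their partial dot products with a butterfly (XOR) reduction in local
// memory:
//
//   for k = 1, 2, 4, ..., subwarp_size/2:  shared[tid] += shared[tid ^ k];
//
// e.g. for subwarp_size = 4, thread 0 picks up thread 1 in the first step and
// the pair (2,3) in the second, ending with the full row sum. The absence of
// barriers relies on the lock-step execution of threads within an NVIDIA warp,
// hence the kernel name.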

template<typename StringT>
void generate_compressed_matrix_vec_mul(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void vec_mul( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * row_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int num_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append(" uint4 layout_x, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" uint4 layout_result) \n");
  source.append("{ \n");
  source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[1024]; \n");

  source.append(" unsigned int row_start = row_blocks[get_group_id(0)]; \n");
  source.append(" unsigned int row_stop = row_blocks[get_group_id(0) + 1]; \n");
  source.append(" unsigned int rows_to_process = row_stop - row_start; \n");
  source.append(" unsigned int element_start = row_indices[row_start]; \n");
  source.append(" unsigned int element_stop = row_indices[row_stop]; \n");

  source.append(" if (rows_to_process > 4) { \n"); // CSR stream
  // load to shared buffer:
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
  source.append(" shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // use one thread per row to sum:
  source.append(" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int thread_row_start = row_indices[row] - element_start; \n");
  source.append(" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
  source.append(" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
  source.append(" dot_prod += shared_elements[i]; \n");
  source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
  source.append(" } \n");
  source.append(" } \n");

  // use multiple threads for the summation
  source.append(" else if (rows_to_process > 1) \n"); // CSR stream with local reduction
  source.append(" {\n");
  // load to shared buffer:
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0))\n");
  source.append(" shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x];\n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // sum each row separately using a reduction:
  source.append(" for (unsigned int row = row_start; row < row_stop; ++row)\n");
  source.append(" {\n");
  source.append(" unsigned int current_row_start = row_indices[row] - element_start;\n");
  source.append(" unsigned int current_row_stop = row_indices[row + 1] - element_start;\n");
  source.append(" unsigned int thread_base_id = current_row_start + get_local_id(0);\n");

  // sum whatever exceeds the current buffer:
  source.append(" for (unsigned int j = thread_base_id + get_local_size(0); j < current_row_stop; j += get_local_size(0))\n");
  source.append(" shared_elements[thread_base_id] += shared_elements[j];\n");

  // reduction
  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n");
  source.append(" {\n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE);\n");
  source.append(" if (get_local_id(0) < stride && thread_base_id < current_row_stop)\n");
  source.append(" shared_elements[thread_base_id] += (thread_base_id + stride < current_row_stop) ? shared_elements[thread_base_id+stride] : 0;\n");
  source.append(" }\n");
  source.append(" if (get_local_id(0) == 0)\n");
  source.append(" result[row * layout_result.y + layout_result.x] = shared_elements[current_row_start];\n");
  source.append(" }\n");
  source.append(" }\n");


  source.append(" else \n"); // CSR vector for a single row
  source.append(" { \n");
  // load and sum to shared buffer:
  source.append(" shared_elements[get_local_id(0)] = 0; \n");
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
  source.append(" shared_elements[get_local_id(0)] += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");

  // reduction to obtain final result
  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
  source.append(" } \n");

  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" result[row_start * layout_result.y + layout_result.x] = shared_elements[0]; \n");
  source.append(" } \n");

  source.append("} \n");

}
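
// vec_mul follows a CSR-adaptive scheme: the host-side row_blocks array groups
// consecutive rows so that each group's nonzeros fit into the 1024-entry local
// buffer. The kernel then picks one of three paths per group: one thread per
// row (more than 4 rows), a per-row local reduction (2 to 4 rows), or all
// threads cooperating on a single long row (CSR-vector).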
1052 
1053 
1054 template<typename StringT>
1055 void generate_compressed_matrix_vec_mul4(StringT & source, std::string const & numeric_string)
1056 {
1057  source.append("__kernel void vec_mul4( \n");
1058  source.append(" __global const unsigned int * row_indices, \n");
1059  source.append(" __global const uint4 * column_indices, \n");
1060  source.append(" __global const "); source.append(numeric_string); source.append("4 * elements, \n");
1061  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
1062  source.append(" uint4 layout_x, \n");
1063  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
1064  source.append(" uint4 layout_result) \n");
1065  source.append("{ \n");
1066  source.append(" "); source.append(numeric_string); source.append(" dot_prod; \n");
1067  source.append(" unsigned int start, next_stop; \n");
1068  source.append(" uint4 col_idx; \n");
1069  source.append(" "); source.append(numeric_string); source.append("4 tmp_vec; \n");
1070  source.append(" "); source.append(numeric_string); source.append("4 tmp_entries; \n");
1071 
1072  source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
1073  source.append(" { \n");
1074  source.append(" dot_prod = 0; \n");
1075  source.append(" start = row_indices[row] / 4; \n");
1076  source.append(" next_stop = row_indices[row+1] / 4; \n");
1077 
1078  source.append(" for (unsigned int i = start; i < next_stop; ++i) \n");
1079  source.append(" { \n");
1080  source.append(" col_idx = column_indices[i]; \n");
1081 
1082  source.append(" tmp_entries = elements[i]; \n");
1083  source.append(" tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
1084  source.append(" tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
1085  source.append(" tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
1086  source.append(" tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");
1087 
1088  source.append(" dot_prod += dot(tmp_entries, tmp_vec); \n");
1089  source.append(" } \n");
1090  source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
1091  source.append(" } \n");
1092  source.append("} \n");
1093 }
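[Editorial note] vec_mul4 reads column_indices and elements through 4-wide vector types and divides the row offsets by 4, so it assumes every row offset is a multiple of 4, i.e. rows padded with explicit zero entries; vec_mul8 below makes the same assumption with a block size of 8. A hedged sketch of such padding (hypothetical helper, not ViennaCL's actual layout code):

#include <cstddef>
#include <vector>

// Pad each CSR row to a multiple of `block` entries (value zero, column 0),
// so kernels reading `block`-wide vectors never cross a row boundary.
// The zero-valued dummies contribute 0 * x[0] and keep the result exact.
template<typename T>
void pad_csr_rows(std::vector<unsigned int> & row_indices,
                  std::vector<unsigned int> & column_indices,
                  std::vector<T> & elements,
                  unsigned int block)
{
  std::vector<unsigned int> new_rows(row_indices.size());
  std::vector<unsigned int> new_cols;
  std::vector<T> new_vals;
  new_rows[0] = 0;
  for (std::size_t row = 0; row + 1 < row_indices.size(); ++row)
  {
    unsigned int nnz = row_indices[row + 1] - row_indices[row];
    unsigned int padded = ((nnz + block - 1) / block) * block;
    for (unsigned int i = row_indices[row]; i < row_indices[row + 1]; ++i)
    {
      new_cols.push_back(column_indices[i]);
      new_vals.push_back(elements[i]);
    }
    for (unsigned int i = nnz; i < padded; ++i)
    {
      new_cols.push_back(0);     // dummy column index
      new_vals.push_back(T(0));  // dummy value, does not change the product
    }
    new_rows[row + 1] = new_rows[row] + padded;
  }
  row_indices.swap(new_rows);
  column_indices.swap(new_cols);
  elements.swap(new_vals);
}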
1094 
1095 template<typename StringT>
1096 void generate_compressed_matrix_vec_mul8(StringT & source, std::string const & numeric_string)
1097 {
1098  source.append("__kernel void vec_mul8( \n");
1099  source.append(" __global const unsigned int * row_indices, \n");
1100  source.append(" __global const uint8 * column_indices, \n");
1101  source.append(" __global const "); source.append(numeric_string); source.append("8 * elements, \n");
1102  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
1103  source.append(" uint4 layout_x, \n");
1104  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
1105  source.append(" uint4 layout_result) \n");
1106  source.append("{ \n");
1107  source.append(" "); source.append(numeric_string); source.append(" dot_prod; \n");
1108  source.append(" unsigned int start, next_stop; \n");
1109  source.append(" uint8 col_idx; \n");
1110  source.append(" "); source.append(numeric_string); source.append("8 tmp_vec; \n");
1111  source.append(" "); source.append(numeric_string); source.append("8 tmp_entries; \n");
1112 
1113  source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
1114  source.append(" { \n");
1115  source.append(" dot_prod = 0; \n");
1116  source.append(" start = row_indices[row] / 8; \n");
1117  source.append(" next_stop = row_indices[row+1] / 8; \n");
1118 
1119  source.append(" for (unsigned int i = start; i < next_stop; ++i) \n");
1120  source.append(" { \n");
1121  source.append(" col_idx = column_indices[i]; \n");
1122 
1123  source.append(" tmp_entries = elements[i]; \n");
1124  source.append(" tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
1125  source.append(" tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
1126  source.append(" tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
1127  source.append(" tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
1128  source.append(" tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
1129  source.append(" tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
1130  source.append(" tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
1131  source.append(" tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");
1132 
1133  source.append(" dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
1134  source.append(" dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
1135  source.append(" } \n");
1136  source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
1137  source.append(" } \n");
1138  source.append("} \n");
1139 }
1140 
1141 template<typename StringT>
1142 void generate_compressed_matrix_vec_mul_cpu(StringT & source, std::string const & numeric_string)
1143 {
1144  source.append("__kernel void vec_mul_cpu( \n");
1145  source.append(" __global const unsigned int * row_indices, \n");
1146  source.append(" __global const unsigned int * column_indices, \n");
1147  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1148  source.append(" __global const "); source.append(numeric_string); source.append(" * vector, \n");
1149  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
1150  source.append(" unsigned int size) \n");
1151  source.append("{ \n");
1152  source.append("  unsigned int work_per_item = max((uint) ((size - 1) / get_global_size(0) + 1), (uint) 1); \n");
1153  source.append(" unsigned int row_start = get_global_id(0) * work_per_item; \n");
1154  source.append(" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
1155  source.append(" for (unsigned int row = row_start; row < row_stop; ++row) \n");
1156  source.append(" { \n");
1157  source.append(" "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n");
1158  source.append(" unsigned int row_end = row_indices[row+1]; \n");
1159  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
1160  source.append(" dot_prod += elements[i] * vector[column_indices[i]]; \n");
1161  source.append(" result[row] = dot_prod; \n");
1162  source.append(" } \n");
1163  source.append("} \n");
1164 
1165 }
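[Editorial note] The work partition above uses the rounded-up division (size - 1) / N + 1, the same idiom the spgemm kernels below use; a plain size / N would drop the trailing rows whenever N does not divide size. A small editorial sketch verifying coverage:

#include <algorithm>
#include <cassert>

// Rounded-up work partitioning: every row is covered even when `size`
// is not a multiple of the number of work items.
unsigned int work_per_item(unsigned int size, unsigned int global_size)
{
  return (size > 0) ? (size - 1) / global_size + 1 : 1;
}

int main()
{
  unsigned int size = 1000, items = 256;
  unsigned int w = work_per_item(size, items);   // 4 rows per work item
  assert(std::min(items * w, size) == size);     // the last row is reached
}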
1166 
1167 
1168 
1169 
1170 /** @brief OpenCL kernel for the first stage of sparse matrix-matrix multiplication.
1171   * Each work group computes the maximum number of nonzeros per row of A encountered in its row range.
1172   */
1173 template<typename StringT>
1174 void generate_compressed_matrix_compressed_matrix_prod_1(StringT & source)
1175 {
1176  source.append("__kernel void spgemm_stage1( \n");
1177  source.append(" __global const unsigned int * A_row_indices, \n");
1178  source.append(" __global const unsigned int * A_column_indices, \n");
1179  source.append(" unsigned int A_size1, \n");
1180  source.append(" __global unsigned int * group_nnz_array) \n");
1181  source.append("{ \n");
1182  source.append(" unsigned int work_per_item = max((uint) ((A_size1 - 1) / get_global_size(0) + 1), (uint) 1); \n");
1183  source.append(" unsigned int row_start = get_global_id(0) * work_per_item; \n");
1184  source.append(" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) A_size1); \n");
1185  source.append(" unsigned int max_A_nnz = 0; \n");
1186  source.append(" for (unsigned int row = row_start; row < row_stop; ++row) \n");
1187  source.append(" max_A_nnz = max(max_A_nnz, A_row_indices[row + 1] - A_row_indices[row]); \n");
1188 
1189  // load and sum to shared buffer:
1190  source.append(" __local unsigned int shared_nnz[256]; \n");
1191  source.append(" shared_nnz[get_local_id(0)] = max_A_nnz; \n");
1192 
1193  // reduction to obtain final result
1194  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
1195  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1196  source.append(" if (get_local_id(0) < stride) \n");
1197  source.append(" shared_nnz[get_local_id(0)] = max(shared_nnz[get_local_id(0)], shared_nnz[get_local_id(0) + stride]); \n");
1198  source.append(" } \n");
1199 
1200  source.append(" if (get_local_id(0) == 0) \n");
1201  source.append(" group_nnz_array[get_group_id(0)] = shared_nnz[0]; \n");
1202  source.append("} \n");
1203 }
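[Editorial note] spgemm_stage1 leaves one partial maximum per work group in group_nnz_array; the host still has to reduce those partials to the overall maximum row length of A. A one-line host-side sketch (hypothetical helper name):

#include <algorithm>
#include <vector>

// Final host-side reduction after spgemm_stage1 (assumes a non-empty array):
unsigned int max_row_nnz(std::vector<unsigned int> const & group_nnz_array)
{
  return *std::max_element(group_nnz_array.begin(), group_nnz_array.end());
}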
1204 
1205 
1206 
1207 /** @brief OpenCL kernel for decomposing A in C = A * B, such that A = A_2 * G_1 with G_1 containing at most 32 nonzeros per row.
1208   *
1209   */
1210 template<typename StringT>
1211 void generate_compressed_matrix_compressed_matrix_prod_decompose_1(StringT & source)
1212 {
1213  source.append("__kernel void spgemm_decompose_1( \n");
1214  source.append(" __global const unsigned int * A_row_indices, \n");
1215  source.append(" unsigned int A_size1, \n");
1216  source.append(" unsigned int max_per_row, \n");
1217  source.append(" __global unsigned int * chunks_per_row) \n");
1218  source.append("{ \n");
1219  source.append(" for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) {\n");
1220  source.append(" unsigned int num_entries = A_row_indices[row+1] - A_row_indices[row]; \n");
1221  source.append(" chunks_per_row[row] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1); \n");
1222  source.append(" } \n");
1223  source.append("} \n");
1224 }
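[Editorial note] The chunk count per row is a rounded-up division with a floor of one chunk, so empty rows still occupy a chunk. The same expression in isolation (editorial sketch):

// Number of chunks a row with `num_entries` nonzeros occupies when no chunk
// may exceed `max_per_row` entries, e.g. 70 entries at max_per_row = 32
// yield (70 - 1) / 32 + 1 = 3 chunks; empty rows still occupy one chunk.
unsigned int chunks_for_row(unsigned int num_entries, unsigned int max_per_row)
{
  return (num_entries < max_per_row) ? 1 : (num_entries - 1) / max_per_row + 1;
}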
1225 
1226 
1227 
1228 /** @brief OpenCL kernel for filling A_2 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row.
1229   *
1230   */
1231 template<typename StringT>
1232 void generate_compressed_matrix_compressed_matrix_prod_A2(StringT & source, std::string const & numeric_string)
1233 {
1234  source.append("__kernel void spgemm_A2( \n");
1235  source.append(" __global unsigned int *A2_row_indices, \n");
1236  source.append(" __global unsigned int *A2_col_indices, \n");
1237  source.append(" __global "); source.append(numeric_string); source.append(" *A2_elements, \n");
1238  source.append(" unsigned int A2_size1, \n");
1239  source.append(" __global const unsigned int *new_row_buffer) \n");
1240  source.append("{ \n");
1241  source.append(" for (unsigned int i = get_global_id(0); i < A2_size1; i += get_global_size(0)) {\n");
1242  source.append(" unsigned int index_start = new_row_buffer[i]; \n");
1243  source.append(" unsigned int index_stop = new_row_buffer[i+1]; \n");
1244 
1245  source.append(" A2_row_indices[i] = index_start; \n");
1246 
1247  source.append(" for (unsigned int j = index_start; j < index_stop; ++j) { \n");
1248  source.append(" A2_col_indices[j] = j; \n");
1249  source.append(" A2_elements[j] = 1; \n");
1250  source.append(" } \n");
1251  source.append(" } \n");
1252 
1253  source.append(" if (get_global_id(0) == 0) \n");
1254  source.append(" A2_row_indices[A2_size1] = new_row_buffer[A2_size1]; \n");
1255  source.append("} \n");
1256 }
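[Editorial note] A_2 is a selector matrix: its row pointers are exactly new_row_buffer (the prefix sum over chunks per row), every stored value is 1, and entry j carries column index j, so row i of A_2 picks out precisely the chunks of row i of A. A host-side analogue (hypothetical helper):

#include <cstddef>
#include <vector>

// Host-side analogue of spgemm_A2: build A2 in CSR form from the prefix sum
// over chunks per row; A2(i, j) = 1 for every chunk j belonging to row i.
template<typename T>
void build_A2(std::vector<unsigned int> const & new_row_buffer,
              std::vector<unsigned int> & A2_row_indices,
              std::vector<unsigned int> & A2_col_indices,
              std::vector<T> & A2_elements)
{
  std::size_t size1 = new_row_buffer.size() - 1;
  A2_row_indices.assign(new_row_buffer.begin(), new_row_buffer.end());
  A2_col_indices.resize(new_row_buffer[size1]);
  A2_elements.assign(new_row_buffer[size1], T(1));
  for (std::size_t j = 0; j < A2_col_indices.size(); ++j)
    A2_col_indices[j] = static_cast<unsigned int>(j);  // chunk j of G1 feeds entry j
}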
1257 
1258 
1259 /** @brief OpenCL kernel for filling G_1 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row.
1260   *
1261   */
1262 template<typename StringT>
1263 void generate_compressed_matrix_compressed_matrix_prod_G1(StringT & source, std::string const & numeric_string)
1264 {
1265  source.append("__kernel void spgemm_G1( \n");
1266  source.append(" __global unsigned int *G1_row_indices, \n");
1267  source.append(" __global unsigned int *G1_col_indices, \n");
1268  source.append(" __global "); source.append(numeric_string); source.append(" *G1_elements, \n");
1269  source.append(" unsigned int G1_size1, \n");
1270  source.append(" __global const unsigned int *A_row_indices, \n");
1271  source.append(" __global const unsigned int *A_col_indices, \n");
1272  source.append(" __global const "); source.append(numeric_string); source.append(" *A_elements, \n");
1273  source.append(" unsigned int A_size1, \n");
1274  source.append(" unsigned int A_nnz, \n");
1275  source.append(" unsigned int max_per_row, \n");
1276  source.append(" __global const unsigned int *new_row_buffer) \n");
1277  source.append("{ \n");
1278 
1279  // Part 1: Copy column indices and entries:
1280  source.append(" for (unsigned int i = get_global_id(0); i < A_nnz; i += get_global_size(0)) {\n");
1281  source.append(" G1_col_indices[i] = A_col_indices[i]; \n");
1282  source.append(" G1_elements[i] = A_elements[i]; \n");
1283  source.append(" } \n");
1284 
1285  // Part 2: Derive new row indices:
1286  source.append(" for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) {\n");
1287  source.append(" unsigned int old_start = A_row_indices[i]; \n");
1288  source.append(" unsigned int new_start = new_row_buffer[i]; \n");
1289  source.append(" unsigned int row_chunks = new_row_buffer[i+1] - new_start; \n");
1290 
1291  source.append(" for (unsigned int j=0; j<row_chunks; ++j) \n");
1292  source.append(" G1_row_indices[new_start + j] = old_start + j * max_per_row; \n");
1293  source.append(" } \n");
1294 
1295  // write last entry in row_buffer with thread 0:
1296  source.append(" if (get_global_id(0) == 0) \n");
1297  source.append(" G1_row_indices[G1_size1] = A_row_indices[A_size1]; \n");
1298  source.append("} \n");
1299 }
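[Editorial note] G_1 reuses A's column indices and values unchanged; only new row boundaries are introduced every max_per_row entries. For example, a row of A with 70 nonzeros and max_per_row = 32 becomes three G_1 rows holding 32, 32, and 6 entries. A host-side analogue of the row-pointer construction (hypothetical helper):

#include <cstddef>
#include <vector>

// Host-side analogue of the row-pointer part of spgemm_G1: each chunk of a
// row of A becomes its own row in G1, starting max_per_row entries apart.
std::vector<unsigned int>
build_G1_row_indices(std::vector<unsigned int> const & A_row_indices,
                     std::vector<unsigned int> const & new_row_buffer,
                     unsigned int max_per_row)
{
  std::size_t A_size1 = A_row_indices.size() - 1;
  std::vector<unsigned int> G1_rows(new_row_buffer[A_size1] + 1);
  for (std::size_t i = 0; i < A_size1; ++i)
  {
    unsigned int old_start  = A_row_indices[i];
    unsigned int new_start  = new_row_buffer[i];
    unsigned int row_chunks = new_row_buffer[i + 1] - new_start;
    for (unsigned int j = 0; j < row_chunks; ++j)
      G1_rows[new_start + j] = old_start + j * max_per_row;
  }
  G1_rows.back() = A_row_indices[A_size1];  // total nnz closes the last row
  return G1_rows;
}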
1300 
1301 
1302 
1303 
1304 /** @brief OpenCL kernel for the second stage of sparse matrix-matrix multiplication.
1305   *
1306   * Computes the number of nonzero entries in each row of C = A * B.
1307   */
1308 template<typename StringT>
1309 void generate_compressed_matrix_compressed_matrix_prod_2(StringT & source)
1310 {
1311  source.append("__attribute__((reqd_work_group_size(32, 1, 1))) \n");
1312  source.append("__kernel void spgemm_stage2( \n");
1313  source.append(" __global const unsigned int * A_row_indices, \n");
1314  source.append(" __global const unsigned int * A_col_indices, \n");
1315  source.append(" unsigned int A_size1, \n");
1316  source.append(" __global const unsigned int * B_row_indices, \n");
1317  source.append(" __global const unsigned int * B_col_indices, \n");
1318  source.append(" unsigned int B_size2, \n");
1319  source.append(" __global unsigned int * C_row_indices) \n");
1320  source.append("{ \n");
1321  source.append(" unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
1322  source.append(" unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
1323  source.append(" unsigned int row_C_stop = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
1324  source.append(" __local unsigned int shared_front[32]; \n");
1325 
1326  source.append(" for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
1327  source.append(" { \n");
1328  source.append(" unsigned int row_A_start = A_row_indices[row_C]; \n");
1329  source.append(" unsigned int row_A_end = A_row_indices[row_C+1]; \n");
1330 
1331  source.append(" unsigned int my_row_B = row_A_start + get_local_id(0); \n");
1332  source.append(" unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
1333  source.append(" unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
1334  source.append(" unsigned int row_B_end = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
1335 
1336  source.append(" unsigned int num_nnz = 0; \n");
1337  source.append("   if (row_A_end - row_A_start > 1) { \n"); // rows with zero or one entry can be handled faster
1338 
1339  source.append(" unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1340  source.append(" while (1) { \n");
1341 
1342  // determine minimum index via reduction:
1343  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1344  source.append(" shared_front[get_local_id(0)] = current_front_index; \n");
1345  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1346  source.append(" if (get_local_id(0) < 16) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 16]); \n");
1347  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1348  source.append(" if (get_local_id(0) < 8) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 8]); \n");
1349  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1350  source.append(" if (get_local_id(0) < 4) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 4]); \n");
1351  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1352  source.append(" if (get_local_id(0) < 2) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 2]); \n");
1353  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1354  source.append(" if (get_local_id(0) < 1) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 1]); \n");
1355  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1356 
1357  source.append(" if (shared_front[0] == B_size2) break; \n");
1358 
1359  // update front:
1360  source.append(" if (current_front_index == shared_front[0]) { \n");
1361  source.append(" ++row_B_start; \n");
1362  source.append(" current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1363  source.append(" } \n");
1364 
1365  source.append(" ++num_nnz; \n");
1366  source.append(" } \n");
1367  source.append(" } else { num_nnz = row_B_end - row_B_start; }\n");
1368 
1369  // write number of entries found:
1370  source.append(" if (get_local_id(0) == 0) \n");
1371  source.append(" C_row_indices[row_C] = num_nnz; \n");
1372 
1373  source.append(" } \n");
1374 
1375  source.append("} \n");
1376 
1377 }
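[Editorial note] Each work item of spgemm_stage2 walks one of the B rows selected by row_C of A; the while loop repeatedly extracts the smallest column index across all fronts, so the group counts the distinct columns in the union of those B rows. A sequential analogue for a single row of C (editorial sketch, hypothetical names):

#include <algorithm>
#include <cstddef>
#include <vector>

// Count the distinct column indices in the union of the B rows picked out
// by row `row_C` of A; `B_size2` acts as the sentinel for exhausted rows.
unsigned int count_row_nnz(std::vector<unsigned int> const & A_row_indices,
                           std::vector<unsigned int> const & A_col_indices,
                           std::vector<unsigned int> const & B_row_indices,
                           std::vector<unsigned int> const & B_col_indices,
                           unsigned int B_size2,
                           unsigned int row_C)
{
  std::vector<unsigned int> front;  // current position within each B row
  for (unsigned int i = A_row_indices[row_C]; i < A_row_indices[row_C + 1]; ++i)
    front.push_back(B_row_indices[A_col_indices[i]]);

  unsigned int num_nnz = 0;
  while (true)
  {
    // minimum column index over all fronts (sentinel once a row is exhausted):
    unsigned int min_index = B_size2;
    for (std::size_t k = 0; k < front.size(); ++k)
    {
      unsigned int row_B = A_col_indices[A_row_indices[row_C] + k];
      if (front[k] < B_row_indices[row_B + 1])
        min_index = std::min(min_index, B_col_indices[front[k]]);
    }
    if (min_index == B_size2)
      break;

    // advance every front sitting on the minimum:
    for (std::size_t k = 0; k < front.size(); ++k)
    {
      unsigned int row_B = A_col_indices[A_row_indices[row_C] + k];
      if (front[k] < B_row_indices[row_B + 1] && B_col_indices[front[k]] == min_index)
        ++front[k];
    }
    ++num_nnz;
  }
  return num_nnz;
}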
1378 
1379 
1380 
1381 /** @brief OpenCL kernel for the third stage of sparse matrix-matrix multiplication.
1382   *
1383   */
1384 template<typename StringT>
1385 void generate_compressed_matrix_compressed_matrix_prod_3(StringT & source, std::string const & numeric_string)
1386 {
1387  source.append("__attribute__((reqd_work_group_size(32, 1, 1))) \n");
1388  source.append("__kernel void spgemm_stage3( \n");
1389  source.append(" __global const unsigned int * A_row_indices, \n");
1390  source.append(" __global const unsigned int * A_col_indices, \n");
1391  source.append(" __global const "); source.append(numeric_string); source.append(" * A_elements, \n");
1392  source.append(" unsigned int A_size1, \n");
1393  source.append(" __global const unsigned int * B_row_indices, \n");
1394  source.append(" __global const unsigned int * B_col_indices, \n");
1395  source.append(" __global const "); source.append(numeric_string); source.append(" * B_elements, \n");
1396  source.append(" unsigned int B_size2, \n");
1397  source.append(" __global unsigned int * C_row_indices, \n");
1398  source.append(" __global unsigned int * C_col_indices, \n");
1399  source.append(" __global "); source.append(numeric_string); source.append(" * C_elements) \n");
1400  source.append("{ \n");
1401  source.append(" unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
1402  source.append(" unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
1403  source.append(" unsigned int row_C_stop = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
1404  source.append(" __local unsigned int shared_front[32]; \n");
1405  source.append(" __local "); source.append(numeric_string); source.append(" shared_front_values[32]; \n");
1406  source.append(" unsigned int local_id = get_local_id(0); \n");
1407 
1408  source.append(" for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
1409  source.append(" { \n");
1410  source.append(" unsigned int row_A_start = A_row_indices[row_C]; \n");
1411  source.append(" unsigned int row_A_end = A_row_indices[row_C+1]; \n");
1412 
1413  source.append(" unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? local_id : 0); \n"); // single row is a special case
1414  source.append(" unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
1415  source.append(" unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
1416  source.append(" unsigned int row_B_end = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
1417 
1418  source.append(" "); source.append(numeric_string); source.append(" val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0; \n");
1419  source.append(" unsigned int index_in_C = C_row_indices[row_C] + local_id; \n");
1420 
1421  source.append("   if (row_A_end - row_A_start > 1) { \n"); // rows with zero or one entry can be handled faster
1422 
1423  source.append(" unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1424  source.append(" "); source.append(numeric_string); source.append(" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start] : 0; \n");
1425 
1426  source.append(" unsigned int index_buffer = 0; \n");
1427  source.append(" "); source.append(numeric_string); source.append(" value_buffer = 0; \n");
1428  source.append(" unsigned int buffer_size = 0; \n");
1429 
1430  source.append(" while (1) { \n");
1431 
1432  // determine minimum index via reduction:
1433  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1434  source.append(" shared_front[local_id] = current_front_index; \n");
1435  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1436  source.append(" if (local_id < 16) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 16]); \n");
1437  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1438  source.append(" if (local_id < 8) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 8]); \n");
1439  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1440  source.append(" if (local_id < 4) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 4]); \n");
1441  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1442  source.append(" if (local_id < 2) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 2]); \n");
1443  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1444  source.append(" if (local_id < 1) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 1]); \n");
1445  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1446 
1447  source.append(" if (shared_front[0] == B_size2) break; \n");
1448 
1449  // compute output value via reduction:
1450  source.append(" shared_front_values[local_id] = (current_front_index == shared_front[0]) ? val_A * current_front_value : 0; \n");
1451  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1452  source.append(" if (local_id < 16) shared_front_values[local_id] += shared_front_values[local_id + 16]; \n");
1453  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1454  source.append(" if (local_id < 8) shared_front_values[local_id] += shared_front_values[local_id + 8]; \n");
1455  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1456  source.append(" if (local_id < 4) shared_front_values[local_id] += shared_front_values[local_id + 4]; \n");
1457  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1458  source.append(" if (local_id < 2) shared_front_values[local_id] += shared_front_values[local_id + 2]; \n");
1459  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1460  source.append(" if (local_id < 1) shared_front_values[local_id] += shared_front_values[local_id + 1]; \n");
1461  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1462 
1463  // update front:
1464  source.append(" if (current_front_index == shared_front[0]) { \n");
1465  source.append(" ++row_B_start; \n");
1466  source.append(" current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1467  source.append(" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start] : 0; \n");
1468  source.append(" } \n");
1469 
1470  // write current front to register buffer:
1471  source.append(" index_buffer = (local_id == buffer_size) ? shared_front[0] : index_buffer; \n");
1472  source.append(" value_buffer = (local_id == buffer_size) ? shared_front_values[0] : value_buffer; \n");
1473  source.append(" ++buffer_size; \n");
1474 
1475  // flush register buffer via a coalesced write once full:
1476  source.append(" if (buffer_size == get_local_size(0)) { \n");
1477  source.append(" C_col_indices[index_in_C] = index_buffer; \n");
1478  source.append(" C_elements[index_in_C] = value_buffer; \n");
1479  source.append(" } \n");
1480 
1481  // the following should be in the previous if-conditional, but a bug in NVIDIA drivers 34x.yz requires this slight rewrite
1482  source.append(" index_in_C += (buffer_size == get_local_size(0)) ? get_local_size(0) : 0; \n");
1483  source.append(" buffer_size = (buffer_size == get_local_size(0)) ? 0 : buffer_size; \n");
1484 
1485  source.append(" } \n");
1486 
1487  // write remaining entries in register buffer:
1488  source.append(" if (local_id < buffer_size) { \n");
1489  source.append(" C_col_indices[index_in_C] = index_buffer; \n");
1490  source.append(" C_elements[index_in_C] = value_buffer; \n");
1491  source.append(" } \n");
1492 
1493  // copy to C in coalesced manner:
1494  source.append(" } else { \n");
1495  source.append(" for (unsigned int i = row_B_start + local_id; i < row_B_end; i += get_local_size(0)) { \n");
1496  source.append(" C_col_indices[index_in_C] = B_col_indices[i]; \n");
1497  source.append(" C_elements[index_in_C] = val_A * B_elements[i]; \n");
1498  source.append(" index_in_C += get_local_size(0); \n");
1499  source.append(" } \n");
1500  source.append(" } \n");
1501 
1502  source.append(" } \n");
1503 
1504  source.append("} \n");
1505 
1506 }
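[Editorial note] Stage 3 repeats the merge of stage 2, but whenever a front sits on the winning column it also accumulates val_A times the B value, streaming the merged row through a register buffer that is flushed in coalesced blocks of 32. A sequential analogue for one row (editorial sketch; a std::map stands in for the sorted merged output the kernel produces):

#include <map>
#include <vector>

// Accumulate one row of C = A * B: for each entry (row_C, row_B) of A with
// value val_A, add val_A times row row_B of B into the output row.
template<typename T>
std::map<unsigned int, T>
accumulate_row(std::vector<unsigned int> const & A_row_indices,
               std::vector<unsigned int> const & A_col_indices,
               std::vector<T> const & A_elements,
               std::vector<unsigned int> const & B_row_indices,
               std::vector<unsigned int> const & B_col_indices,
               std::vector<T> const & B_elements,
               unsigned int row_C)
{
  std::map<unsigned int, T> row;  // column -> accumulated value, kept sorted
  for (unsigned int i = A_row_indices[row_C]; i < A_row_indices[row_C + 1]; ++i)
  {
    unsigned int row_B = A_col_indices[i];
    T val_A = A_elements[i];
    for (unsigned int j = B_row_indices[row_B]; j < B_row_indices[row_B + 1]; ++j)
      row[B_col_indices[j]] += val_A * B_elements[j];
  }
  return row;
}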
1507 
1508 
1509 template<typename StringT>
1510 void generate_compressed_matrix_compressed_matrix_prod(StringT & source, std::string const & numeric_string)
1511 {
1512  generate_compressed_matrix_compressed_matrix_prod_1(source);
1513  generate_compressed_matrix_compressed_matrix_prod_decompose_1(source);
1514  generate_compressed_matrix_compressed_matrix_prod_A2(source, numeric_string);
1515  generate_compressed_matrix_compressed_matrix_prod_G1(source, numeric_string);
1516  generate_compressed_matrix_compressed_matrix_prod_2(source);
1517  generate_compressed_matrix_compressed_matrix_prod_3(source, numeric_string);
1518 }
1519 
1520 template<typename StringT>
1521 void generate_compressed_matrix_assign_to_dense(StringT & source, std::string const & numeric_string)
1522 {
1523 
1524  source.append(" __kernel void assign_to_dense( \n");
1525  source.append(" __global const unsigned int * row_indices, \n");
1526  source.append(" __global const unsigned int * column_indices, \n");
1527  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1528  source.append(" __global "); source.append(numeric_string); source.append(" * B, \n");
1529  source.append(" unsigned int B_row_start, \n");
1530  source.append(" unsigned int B_col_start, \n");
1531  source.append(" unsigned int B_row_inc, \n");
1532  source.append(" unsigned int B_col_inc, \n");
1533  source.append(" unsigned int B_row_size, \n");
1534  source.append(" unsigned int B_col_size, \n");
1535  source.append(" unsigned int B_internal_rows, \n");
1536  source.append(" unsigned int B_internal_cols) { \n");
1537 
1538  source.append(" for (unsigned int i = get_global_id(0); i < B_row_size; i += get_global_size(0)) \n");
1539  source.append(" { \n");
1540  source.append(" unsigned int row_end = row_indices[i+1]; \n");
1541  source.append(" for (unsigned int j = row_indices[i]; j<row_end; j++) \n");
1542  source.append(" { \n");
1543  source.append(" B[(B_row_start + i * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j]; \n");
1544  source.append(" } \n");
1545  source.append(" } \n");
1546  source.append(" } \n");
1547 
1548 }
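[Editorial note] assign_to_dense addresses the dense target with the usual offset/stride layout used for ViennaCL matrix ranges and slices: entry (i, j) of the sparse matrix lands at internal row B_row_start + i * B_row_inc and internal column B_col_start + j * B_col_inc of a row-major buffer with B_internal_cols entries per internal row. A host-side analogue (editorial sketch):

#include <vector>

// Scatter a CSR matrix into a row-major dense buffer with offset/stride
// addressing and padded internal dimensions.
template<typename T>
void assign_to_dense_reference(std::vector<unsigned int> const & row_indices,
                               std::vector<unsigned int> const & column_indices,
                               std::vector<T> const & elements,
                               std::vector<T> & B,
                               unsigned int B_row_start, unsigned int B_col_start,
                               unsigned int B_row_inc,   unsigned int B_col_inc,
                               unsigned int B_row_size,  unsigned int B_internal_cols)
{
  for (unsigned int i = 0; i < B_row_size; ++i)
    for (unsigned int j = row_indices[i]; j < row_indices[i + 1]; ++j)
      B[(B_row_start + i * B_row_inc) * B_internal_cols
        + B_col_start + column_indices[j] * B_col_inc] = elements[j];
}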
1549 
1550 
1552 
1553 // main kernel class
1554 /** @brief Main kernel class for generating OpenCL kernels for compressed_matrix. */
1555 template<typename NumericT>
1556 struct compressed_matrix
1557 {
1558  static std::string program_name()
1559  {
1560  return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_matrix";
1561  }
1562 
1563  static void init(viennacl::ocl::context & ctx)
1564  {
1565  static std::map<cl_context, bool> init_done;
1566  if (!init_done[ctx.handle().get()])
1567  {
1568  viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
1569  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
1570 
1571  std::string source;
1572  source.reserve(1024);
1573 
1574  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1575 
1576  if (numeric_string == "float" || numeric_string == "double")
1577  {
1578  generate_compressed_matrix_jacobi(source, numeric_string);
1579  generate_compressed_matrix_block_trans_lu_backward(source, numeric_string);
1580  generate_compressed_matrix_block_trans_unit_lu_forward(source, numeric_string);
1581  generate_compressed_matrix_lu_backward(source, numeric_string);
1582  generate_compressed_matrix_lu_forward(source, numeric_string);
1583  generate_compressed_matrix_trans_lu_backward(source, numeric_string);
1584  generate_compressed_matrix_trans_lu_forward(source, numeric_string);
1585  generate_compressed_matrix_trans_unit_lu_backward(source, numeric_string);
1586  generate_compressed_matrix_trans_unit_lu_forward(source, numeric_string);
1587  generate_compressed_matrix_trans_unit_lu_forward_slow(source, numeric_string);
1588  generate_compressed_matrix_unit_lu_backward(source, numeric_string);
1589  generate_compressed_matrix_unit_lu_forward(source, numeric_string);
1590  }
1591  generate_compressed_matrix_dense_matrix_multiplication(source, numeric_string);
1592  generate_compressed_matrix_row_info_extractor(source, numeric_string);
1593  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
1594  generate_compressed_matrix_vec_mul_nvidia(source, numeric_string, 16);
1595  generate_compressed_matrix_vec_mul(source, numeric_string);
1596  generate_compressed_matrix_vec_mul4(source, numeric_string);
1597  generate_compressed_matrix_vec_mul8(source, numeric_string);
1598  generate_compressed_matrix_vec_mul_cpu(source, numeric_string);
1599  generate_compressed_matrix_compressed_matrix_prod(source, numeric_string);
1600  generate_compressed_matrix_assign_to_dense(source, numeric_string);
1601 
1602  std::string prog_name = program_name();
1603  #ifdef VIENNACL_BUILD_INFO
1604  std::cout << "Creating program " << prog_name << std::endl;
1605  #endif
1606  ctx.add_program(source, prog_name);
1607  init_done[ctx.handle().get()] = true;
1608  } //if
1609  } //init
1610 };
1611 
1612 } // namespace kernels
1613 } // namespace opencl
1614 } // namespace linalg
1615 } // namespace viennacl
1616 #endif
1617 
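[Editorial note] A typical call site first makes sure the program is compiled for the active context and then fetches one of the generated kernels by name. Sketched below under the assumption of an already-initialized OpenCL context; this mirrors the pattern used throughout ViennaCL's OpenCL operations, but is an editorial example, not part of this header:

#include "viennacl/ocl/backend.hpp"
#include "viennacl/linalg/opencl/kernels/compressed_matrix.hpp"

// Compile the compressed_matrix program for the current context (a no-op
// after the first call) and fetch the plain CSR SpMV kernel by name.
void example_fetch_vec_mul()
{
  viennacl::ocl::context & ctx = viennacl::ocl::current_context();
  viennacl::linalg::opencl::kernels::compressed_matrix<float>::init(ctx);
  viennacl::ocl::kernel & k
    = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<float>::program_name(),
                     "vec_mul");
  (void)k;  // set arguments and enqueue in real code
}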