ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
coordinate_matrix.hpp
#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
#define VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

#include "viennacl/tools/tools.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/ocl/platform.hpp"
#include "viennacl/ocl/utils.hpp"

#include "viennacl/linalg/opencl/common.hpp"

/** @file viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
 *  @brief OpenCL kernel file for coordinate_matrix operations */
namespace viennacl
{
namespace linalg
{
namespace opencl
{
namespace kernels
{

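// Generates the OpenCL source of the vec_mul kernel (sparse matrix-vector product for the
// coordinate/COO format). In the generated kernel, each work-group handles the nonzeros between
// group_boundaries[gid] and group_boundaries[gid+1]; per-row partial sums are formed in local
// memory via a segmented parallel reduction, and a row spanning two chunks is stitched together
// by the carry check at the start of the next chunk.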
template<typename StringT>
void generate_coordinate_matrix_vec_mul(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void vec_mul( \n");
  source.append("  __global const uint2 * coords, \n");//(row_index, column_index)
  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("  __global const uint * group_boundaries, \n");
  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append("  uint4 layout_x, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("  uint4 layout_result, \n");
  source.append("  __local unsigned int * shared_rows, \n");
  source.append("  __local "); source.append(numeric_string); source.append(" * inter_results) \n");
  source.append("{ \n");
  source.append("  uint2 tmp; \n");
  source.append("  "); source.append(numeric_string); source.append(" val; \n");
  source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
  source.append("  uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
  source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");  // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
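  // Example: for get_local_size(0) == 128 and a group of 256 nonzeros, k_end = 1 + 255/128 = 2,
  // i.e. exactly two passes over the chunk; without the -1, a nonzero count that is an exact
  // multiple of the work-group size would trigger a superfluous third pass.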

  source.append("  uint local_index = 0; \n");

  source.append("  for (uint k = 0; k < k_end; ++k) { \n");
  source.append("    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

  source.append("    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
  source.append("    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * layout_x.y + layout_x.x] : 0; \n");

  //check for carry from previous loop run:
  source.append("    if (get_local_id(0) == 0 && k > 0) { \n");
  source.append("      if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
  source.append("        val += inter_results[get_local_size(0)-1]; \n");
  source.append("      else \n");
  source.append("        result[shared_rows[get_local_size(0)-1] * layout_result.y + layout_result.x] = inter_results[get_local_size(0)-1]; \n");
  source.append("    } \n");

  //segmented parallel reduction begin
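  // The generated loop below is a segmented inclusive scan in local memory: a work-item adds the
  // value of its neighbor 'stride' positions to the left only if that neighbor carries the same
  // row index, so after log2(local_size) doubling steps the last work-item of each row segment
  // holds the complete partial sum of that row within the current chunk.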
  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append("    shared_rows[get_local_id(0)] = tmp.x; \n");
  source.append("    inter_results[get_local_id(0)] = val; \n");
  source.append("    "); source.append(numeric_string); source.append(" left = 0; \n");
  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append("    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
  source.append("      left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append("      inter_results[get_local_id(0)] += left; \n");
  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append("    } \n");
  //segmented parallel reduction end

  source.append("    if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
  source.append("        shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
  source.append("      result[tmp.x * layout_result.y + layout_result.x] = inter_results[get_local_id(0)]; \n");
  source.append("    } \n");

  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append("  } \n"); //for k

  source.append("  if (local_index + 1 == group_end) \n"); //write results of last active entry (this may not necessarily be the case already)
  source.append("    result[tmp.x * layout_result.y + layout_result.x] = inter_results[get_local_id(0)]; \n");
  source.append("} \n");

}
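// Illustration only (hypothetical helper, not part of ViennaCL): since the generator above merely
// appends to a std::string, its output can be inspected in isolation, e.g. for single precision:
inline std::string example_vec_mul_source_for_float()
{
  std::string src;
  generate_coordinate_matrix_vec_mul(src, "float");
  return src;   // complete "__kernel void vec_mul(...)" OpenCL source text
}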

namespace detail
{
  /** @brief Generate kernel for C = A * B with A being a compressed_matrix, B and C dense */
  template<typename StringT>
  void generate_coordinate_matrix_dense_matrix_mul(StringT & source, std::string const & numeric_string,
                                                   bool B_transposed, bool B_row_major, bool C_row_major)
  {
    source.append("__kernel void ");
    source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
    source.append("( \n");
    source.append("  __global const uint2 * coords, \n");//(row_index, column_index)
    source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
    source.append("  __global const uint * group_boundaries, \n");
    source.append("  __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
    source.append("  unsigned int d_mat_row_start, \n");
    source.append("  unsigned int d_mat_col_start, \n");
    source.append("  unsigned int d_mat_row_inc, \n");
    source.append("  unsigned int d_mat_col_inc, \n");
    source.append("  unsigned int d_mat_row_size, \n");
    source.append("  unsigned int d_mat_col_size, \n");
    source.append("  unsigned int d_mat_internal_rows, \n");
    source.append("  unsigned int d_mat_internal_cols, \n");
    source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
    source.append("  unsigned int result_row_start, \n");
    source.append("  unsigned int result_col_start, \n");
    source.append("  unsigned int result_row_inc, \n");
    source.append("  unsigned int result_col_inc, \n");
    source.append("  unsigned int result_row_size, \n");
    source.append("  unsigned int result_col_size, \n");
    source.append("  unsigned int result_internal_rows, \n");
    source.append("  unsigned int result_internal_cols, \n");
    source.append("  __local unsigned int * shared_rows, \n");
    source.append("  __local "); source.append(numeric_string); source.append(" * inter_results) \n");
    source.append("{ \n");
    source.append("  uint2 tmp; \n");
    source.append("  "); source.append(numeric_string); source.append(" val; \n");
    source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
    source.append("  uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
    source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");  // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)

    source.append("  uint local_index = 0; \n");

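    // The generated kernel iterates over the columns of the result: each result column reuses the
    // same chunk of COO entries, with the dense factor d_mat addressed according to the
    // B_transposed / B_row_major layout selected at generation time below.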
    source.append("  for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
    source.append("    for (uint k = 0; k < k_end; ++k) { \n");
    source.append("      local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

    source.append("      tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
    if (B_transposed && B_row_major)
      source.append("      val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + tmp.y * d_mat_col_inc ] : 0; \n");
    else if (B_transposed && !B_row_major)
      source.append("      val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) + (d_mat_col_start + tmp.y * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
    else if (!B_transposed && B_row_major)
      source.append("      val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + tmp.y * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] : 0; \n");
    else
      source.append("      val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + tmp.y * d_mat_row_inc) + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");

    //check for carry from previous loop run:
    source.append("      if (get_local_id(0) == 0 && k > 0) { \n");
    source.append("        if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
    source.append("          val += inter_results[get_local_size(0)-1]; \n");
    source.append("        else \n");
    if (C_row_major)
      source.append("          result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_size(0)-1]; \n");
    else
      source.append("          result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_size(0)-1]; \n");
    source.append("      } \n");

    //segmented parallel reduction begin
    source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
    source.append("      shared_rows[get_local_id(0)] = tmp.x; \n");
    source.append("      inter_results[get_local_id(0)] = val; \n");
    source.append("      "); source.append(numeric_string); source.append(" left = 0; \n");
    source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");

    source.append("      for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
    source.append("        left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
    source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
    source.append("        inter_results[get_local_id(0)] += left; \n");
    source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
    source.append("      } \n");
    //segmented parallel reduction end

    source.append("      if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
    source.append("          shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
    if (C_row_major)
      source.append("        result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
    else
      source.append("        result[(tmp.x * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
    source.append("      } \n");

    source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
    source.append("    } \n"); //for k

    source.append("    if (local_index + 1 == group_end) \n"); //write results of last active entry (this may not necessarily be the case already)
    if (C_row_major)
      source.append("      result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
    else
      source.append("      result[(tmp.x * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
    source.append("  } \n"); //for result_col
    source.append("} \n");

  }
}

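// Emits all eight layout variants of the COO-times-dense-matrix kernel: (B transposed or not)
// x (B row-major or column-major) x (C row-major or column-major), each under the name returned
// by sparse_dense_matmult_kernel_name().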
template<typename StringT>
void generate_coordinate_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
{
  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false, true);
  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, true, false);
  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, true, true);

  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false, true);
  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, true, false);
  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, true, true);
}

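// Generates the row_info_extractor kernel; the 'option' argument of the generated kernel selects
// the per-row quantity: 0 = inf-norm, 1 = 1-norm, 2 = 2-norm, 3 = diagonal entry (see the case
// labels below).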
template<typename StringT>
void generate_coordinate_matrix_row_info_extractor(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void row_info_extractor( \n");
  source.append("  __global const uint2 * coords, \n");//(row_index, column_index)
  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("  __global const uint * group_boundaries, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("  unsigned int option, \n");
  source.append("  __local unsigned int * shared_rows, \n");
  source.append("  __local "); source.append(numeric_string); source.append(" * inter_results) \n");
  source.append("{ \n");
  source.append("  uint2 tmp; \n");
  source.append("  "); source.append(numeric_string); source.append(" val; \n");
  source.append("  uint last_index = get_local_size(0) - 1; \n");
  source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
  source.append("  uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
  source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : ("); source.append(numeric_string); source.append(")0; \n");  // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)

  source.append("  uint local_index = 0; \n");

  source.append("  for (uint k = 0; k < k_end; ++k) \n");
  source.append("  { \n");
  source.append("    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

  source.append("    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
  source.append("    val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0; \n");

  //check for carry from previous loop run:
  source.append("    if (get_local_id(0) == 0 && k > 0) \n");
  source.append("    { \n");
  source.append("      if (tmp.x == shared_rows[last_index]) \n");
  source.append("      { \n");
  source.append("        switch (option) \n");
  source.append("        { \n");
  source.append("          case 0: \n"); //inf-norm
  source.append("          case 3: \n"); //diagonal entry
  source.append("            val = max(val, fabs(inter_results[last_index])); \n");
  source.append("            break; \n");

  source.append("          case 1: \n"); //1-norm
  source.append("            val = fabs(val) + inter_results[last_index]; \n");
  source.append("            break; \n");

  source.append("          case 2: \n"); //2-norm
  source.append("            val = sqrt(val * val + inter_results[last_index]); \n");
  source.append("            break; \n");

  source.append("          default: \n");
  source.append("            break; \n");
  source.append("        } \n");
  source.append("      } \n");
  source.append("      else \n");
  source.append("      { \n");
  source.append("        switch (option) \n");
  source.append("        { \n");
  source.append("          case 0: \n"); //inf-norm
  source.append("          case 1: \n"); //1-norm
  source.append("          case 3: \n"); //diagonal entry
  source.append("            result[shared_rows[last_index]] = inter_results[last_index]; \n");
  source.append("            break; \n");

  source.append("          case 2: \n"); //2-norm
  source.append("            result[shared_rows[last_index]] = sqrt(inter_results[last_index]); \n");
  source.append("          default: \n");
  source.append("            break; \n");
  source.append("        } \n");
  source.append("      } \n");
  source.append("    } \n");

  //segmented parallel reduction begin
  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append("    shared_rows[get_local_id(0)] = tmp.x; \n");
  source.append("    switch (option) \n");
  source.append("    { \n");
  source.append("      case 0: \n");
  source.append("      case 3: \n");
  source.append("        inter_results[get_local_id(0)] = val; \n");
  source.append("        break; \n");
  source.append("      case 1: \n");
  source.append("        inter_results[get_local_id(0)] = fabs(val); \n");
  source.append("        break; \n");
  source.append("      case 2: \n");
  source.append("        inter_results[get_local_id(0)] = val * val; \n");
  source.append("      default: \n");
  source.append("        break; \n");
  source.append("    } \n");
  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append("    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) \n");
  source.append("    { \n");
  source.append("      "); source.append(numeric_string); source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : ("); source.append(numeric_string); source.append(")0; \n");
  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append("      switch (option) \n");
  source.append("      { \n");
  source.append("        case 0: \n"); //inf-norm
  source.append("        case 3: \n"); //diagonal entry
  source.append("          inter_results[get_local_id(0)] = max(inter_results[get_local_id(0)], left); \n");
  source.append("          break; \n");

  source.append("        case 1: \n"); //1-norm
  source.append("          inter_results[get_local_id(0)] += left; \n");
  source.append("          break; \n");

  source.append("        case 2: \n"); //2-norm
  source.append("          inter_results[get_local_id(0)] += left; \n");
  source.append("          break; \n");

  source.append("        default: \n");
  source.append("          break; \n");
  source.append("      } \n");
  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append("    } \n");
  //segmented parallel reduction end

  source.append("    if (get_local_id(0) != last_index && \n");
  source.append("        shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] && \n");
  source.append("        inter_results[get_local_id(0)] != 0) \n");
  source.append("    { \n");
  source.append("      result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
  source.append("    } \n");

  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append("  } \n"); //for k

  source.append("  if (local_index + 1 == group_end && inter_results[get_local_id(0)] != 0) \n");
  source.append("    result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
  source.append("} \n");
}

// main kernel class
/** @brief Main kernel class for generating OpenCL kernels for coordinate_matrix. */
template<typename NumericT>
struct coordinate_matrix
{
  static std::string program_name()
  {
    return viennacl::ocl::type_to_string<NumericT>::apply() + "_coordinate_matrix";
  }

  static void init(viennacl::ocl::context & ctx)
  {
    static std::map<cl_context, bool> init_done;
    if (!init_done[ctx.handle().get()])
    {
      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();

      std::string source;
      source.reserve(1024);

      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);

      generate_coordinate_matrix_vec_mul(source, numeric_string);
      generate_coordinate_matrix_dense_matrix_multiplication(source, numeric_string);
      generate_coordinate_matrix_row_info_extractor(source, numeric_string);

      std::string prog_name = program_name();
      #ifdef VIENNACL_BUILD_INFO
      std::cout << "Creating program " << prog_name << std::endl;
      #endif
      ctx.add_program(source, prog_name);
      init_done[ctx.handle().get()] = true;
    } //if
  } //init
};

} // namespace kernels
} // namespace opencl
} // namespace linalg
} // namespace viennacl
#endif

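Minimal usage sketch (not part of the header above; it assumes a default OpenCL context can be
set up and that the device supports the requested precision). The kernel class is initialized
once per context, after which the generated kernels can be retrieved by name through the public
viennacl::ocl context API:

#include <iostream>

#include "viennacl/ocl/backend.hpp"
#include "viennacl/linalg/opencl/kernels/coordinate_matrix.hpp"

int main()
{
  // Active ViennaCL OpenCL context (set up lazily by the backend).
  viennacl::ocl::context & ctx = viennacl::ocl::current_context();

  // Compiles the program containing vec_mul, the eight dense-product variants,
  // and row_info_extractor for single precision.
  viennacl::linalg::opencl::kernels::coordinate_matrix<float>::init(ctx);

  // Retrieve one of the generated kernels by program and kernel name.
  viennacl::ocl::kernel & k =
      ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<float>::program_name(),
                     "vec_mul");
  (void)k;   // ready to be enqueued with the argument list shown in the generator above

  std::cout << "Compiled program: "
            << viennacl::linalg::opencl::kernels::coordinate_matrix<float>::program_name()
            << std::endl;
  return 0;
}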