ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
matrix.hpp
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2015, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
22 #include "viennacl/tools/tools.hpp"
23 #include "viennacl/ocl/kernel.hpp"
25 #include "viennacl/ocl/utils.hpp"
26 
32 
35 namespace viennacl
36 {
37 namespace linalg
38 {
39 namespace opencl
40 {
41 namespace kernels
42 {
43 
45 
47 enum ambm_scalar_type
48 {
49  VIENNACL_AMBM_NONE = 0, // matrix does not exist/contribute
50  VIENNACL_AMBM_CPU,
51  VIENNACL_AMBM_GPU
52 };
53 
55 struct ambm_config
56 {
58 
61  std::string assign_op;
64 };
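// "ambm" refers to linear combinations of matrices of the form A = a * B [+ b * C], where
// each of the scalars a and b may reside on the host (CPU) or on the device (GPU); the
// enumeration and the configuration struct above parametrize kernel generation for these cases.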
65 
66 
67 
68 
69 template<typename StringT>
70 void generate_fft(StringT & source, std::string const & numeric_string, bool is_row_major)
71 {
72  // naive Fourier transform (quadratic complexity, use for reference only)
73  source.append("__kernel void fft_direct(__global "); source.append(numeric_string); source.append("2 *input, \n");
74  source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n");
75  source.append(" unsigned int size, \n");
76  source.append(" unsigned int stride, \n");
77  source.append(" unsigned int batch_num, \n");
78  source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
79  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
80  source.append(" \n");
81  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
82  source.append(" for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
83  source.append(" "); source.append(numeric_string); source.append("2 f = 0.0f; \n");
84  source.append(" \n");
85  source.append(" for (unsigned int n = 0; n < size; n++) { \n");
86  source.append(" "); source.append(numeric_string); source.append("2 in = ");
87  if (is_row_major)
88  source.append("input[batch_id * stride + n]; \n"); //input index here
89  else
90  source.append("input[n * stride + batch_id]; \n"); //input index here
91  source.append(" \n");
92  source.append(" "); source.append(numeric_string); source.append(" sn, cs; \n");
93  source.append(" "); source.append(numeric_string); source.append(" arg = sign * 2 * NUM_PI * k / size * n; \n");
94  source.append(" sn = sincos(arg, &cs); \n");
95  source.append(" \n");
96  source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
97  source.append(" f = f + ("); source.append(numeric_string); source.append("2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
98  source.append(" } \n");
99  source.append(" \n");
100  if (is_row_major)
101  source.append(" output[batch_id * stride + k] = f; \n"); // output index here
102  else
103  source.append(" output[k * stride + batch_id] = f; \n"); // output index here
104  source.append(" } \n");
105  source.append(" } \n");
106  source.append("} \n");
107 
108  source.append(" \n");
109 
110  source.append("__kernel void fft_radix2(__global "); source.append(numeric_string); source.append("2* input, \n");
111  source.append(" unsigned int s, \n");
112  source.append(" unsigned int bit_size, \n");
113  source.append(" unsigned int size, \n");
114  source.append(" unsigned int stride, \n");
115  source.append(" unsigned int batch_num, \n");
116  source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
117  source.append(" \n");
118  source.append(" unsigned int ss = 1 << s; \n");
119  source.append(" unsigned int half_size = size >> 1; \n");
120  source.append(" \n");
121  source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n");
122  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
123  source.append(" \n");
124  source.append(" unsigned int glb_id = get_global_id(0); \n");
125  source.append(" unsigned int glb_sz = get_global_size(0); \n");
126 
127  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
128  source.append(" for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
129  source.append(" unsigned int group = (tid & (ss - 1)); \n");
130  source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
131 
132  if (is_row_major)
133  {
134  source.append(" unsigned int offset = batch_id * stride + pos; \n");
135  source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
136  source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss]; \n");//index
137  }
138  else
139  {
140  source.append(" unsigned int offset = pos * stride + batch_id; \n");
141  source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
142  source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss * stride]; \n");//index
143  }
144 
145  source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
146 
147  source.append(" sn = sincos(arg, &cs); \n");
148 
149  source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
150 
151  source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
152 
153  if (is_row_major)
154  source.append(" input[offset + ss] = in1 - tmp; \n");//index
155  else
156  source.append(" input[offset + ss * stride] = in1 - tmp; \n");//index
157  source.append(" input[offset] = in1 + tmp; \n");//index
158  source.append(" } \n");
159  source.append(" } \n");
160  source.append("} \n");
161 
162  source.append(" \n");
163 
164  source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
165  source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
166  source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
167  source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
168  source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
169  source.append(" v = (v >> 16) | (v << 16); \n");
170  source.append(" \n");
171  source.append(" v = v >> (32 - bit_size); \n");
172  source.append(" \n");
173  source.append(" return v; \n");
174  source.append(" } \n");
175 
176  source.append(" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append("2* input, \n");
177  source.append(" __local "); source.append(numeric_string); source.append("2* lcl_input, \n");
178  source.append(" unsigned int bit_size, \n");
179  source.append(" unsigned int size, \n");
180  source.append(" unsigned int stride, \n");
181  source.append(" unsigned int batch_num, \n");
182  source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
183 
184  source.append(" unsigned int grp_id = get_group_id(0); \n");
185  source.append(" unsigned int grp_num = get_num_groups(0); \n");
186 
187  source.append(" unsigned int lcl_sz = get_local_size(0); \n");
188  source.append(" unsigned int lcl_id = get_local_id(0); \n");
189  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
190 
191  source.append(" for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
192  //unsigned int base_offset = stride * batch_id;
193  //copy a chunk of global memory to local memory
194  source.append(" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
195  source.append(" unsigned int v = get_reorder_num(p, bit_size); \n");
196  if (is_row_major)
197  source.append(" lcl_input[v] = input[batch_id * stride + p]; \n"); //index
198  else
199  source.append(" lcl_input[v] = input[p * stride + batch_id]; \n"); //index
200  source.append(" } \n");
201 
202  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
203 
204  //performs Cooley-Tukey FFT on local array
205  source.append(" for (unsigned int s = 0; s < bit_size; s++) { \n");
206  source.append(" unsigned int ss = 1 << s; \n");
207 
208  source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n");
209 
210  source.append(" for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
211  source.append(" unsigned int group = (tid & (ss - 1)); \n");
212  source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
213 
214  source.append(" "); source.append(numeric_string); source.append("2 in1 = lcl_input[pos]; \n");
215  source.append(" "); source.append(numeric_string); source.append("2 in2 = lcl_input[pos + ss]; \n");
216 
217  source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
218 
219  source.append(" sn = sincos(arg, &cs); \n");
220  source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
221 
222  source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
223 
224  source.append(" lcl_input[pos + ss] = in1 - tmp; \n");
225  source.append(" lcl_input[pos] = in1 + tmp; \n");
226  source.append(" } \n");
227 
228  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
229  source.append(" } \n");
230 
231  //copy local array back to global memory
232  source.append(" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
233  if (is_row_major)
234  source.append(" input[batch_id * stride + p] = lcl_input[p]; \n");//index
235  else
236  source.append(" input[p * stride + batch_id] = lcl_input[p]; \n");//index
237  source.append(" } \n");
238  source.append(" } \n");
239  source.append(" } \n");
240 
241  source.append(" \n");
242 
243  //
244  // Performs reordering of input data in bit-reversal order
245  // It would probably be better to do this on the host side.
246  //
247  source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
248  source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
249  source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
250  source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
251  source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
252  source.append(" v = (v >> 16) | (v << 16); \n");
253 
254  source.append(" v = v >> (32 - bit_size); \n");
255 
256  source.append(" return v; \n");
257  source.append("} \n");
258 
259  source.append("__kernel void fft_reorder(__global "); source.append(numeric_string); source.append("2* input, \n");
260  source.append(" unsigned int bit_size, \n");
261  source.append(" unsigned int size, \n");
262  source.append(" unsigned int stride, \n");
263  source.append(" int batch_num) { \n");
264 
265  source.append(" unsigned int glb_id = get_global_id(0); \n");
266  source.append(" unsigned int glb_sz = get_global_size(0); \n");
267 
268  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
269  source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
270  source.append(" unsigned int v = get_reorder_num_2(i, bit_size); \n");
271 
272  source.append(" if (i < v) {\n");
273  if (is_row_major)
274  {
275  source.append(" "); source.append(numeric_string); source.append("2 tmp = input[batch_id * stride + i]; \n"); // index
276  source.append(" input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); //index
277  source.append(" input[batch_id * stride + v] = tmp; \n"); //index
278  }
279  else
280  {
281  source.append(" "); source.append(numeric_string); source.append("2 tmp = input[i * stride + batch_id]; \n"); // index
282  source.append(" input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); //index
283  source.append(" input[v * stride + batch_id] = tmp; \n"); //index
284  }
285  source.append(" } \n");
286  source.append(" } \n");
287  source.append(" } \n");
288  source.append("} \n");
289 }
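// For reference, with numeric_string == "float" the appends above emit kernels with the
// following signatures (whitespace condensed); the bodies are exactly the strings built above:
//
//   __kernel void fft_direct(__global float2 *input, __global float2 *output,
//                            unsigned int size, unsigned int stride,
//                            unsigned int batch_num, float sign);
//   __kernel void fft_radix2(__global float2 *input, unsigned int s, unsigned int bit_size,
//                            unsigned int size, unsigned int stride,
//                            unsigned int batch_num, float sign);
//   __kernel void fft_radix2_local(__global float2 *input, __local float2 *lcl_input,
//                                  unsigned int bit_size, unsigned int size, unsigned int stride,
//                                  unsigned int batch_num, float sign);
//   __kernel void fft_reorder(__global float2 *input, unsigned int bit_size, unsigned int size,
//                             unsigned int stride, int batch_num);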
290 
291 template<typename StringT>
292 void generate_lu(StringT & source, std::string const & numeric_string, bool is_row_major)
293 {
294  source.append("__kernel void lu_factorize( \n");
295  source.append(" __global "); source.append(numeric_string); source.append(" * matrix, \n");
296  source.append(" unsigned int matrix_rows, \n");
297  source.append(" unsigned int matrix_cols, \n");
298  source.append(" unsigned int matrix_internal_rows, \n");
299  source.append(" unsigned int matrix_internal_cols) \n");
300  source.append("{ \n");
301  source.append(" "); source.append(numeric_string); source.append(" temp; \n");
302 
303  if (is_row_major)
304  {
305  source.append(" unsigned rowi; \n");
306  source.append(" unsigned rowk; \n");
307  source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n");
308  source.append(" { \n");
309  source.append(" rowi = i * matrix_internal_cols; \n");
310  source.append(" for (unsigned int k=0; k<i; ++k) \n");
311  source.append(" { \n");
312  source.append(" rowk = k * matrix_internal_cols; \n");
313  source.append(" if (get_global_id(0) == 0) \n");
314  source.append(" matrix[rowi + k] /= matrix[rowk + k]; \n");
315 
316  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
317  source.append(" temp = matrix[rowi + k]; \n");
318 
319  //parallel subtraction:
320  source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n");
321  source.append(" matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
322  }
323  else
324  {
325  source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n");
326  source.append(" { \n");
327  source.append(" for (unsigned int k=0; k<i; ++k) \n");
328  source.append(" { \n");
329 
330  source.append(" if (get_global_id(0) == 0) \n");
331  source.append(" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");
332 
333  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
334  source.append(" temp = matrix[i + k*matrix_internal_rows]; \n");
335 
336  //parallel subtraction:
337  source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
338  source.append(" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
339  }
340  source.append(" }");
341  source.append(" }");
342  source.append("}");
343 }
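// A minimal usage sketch for the generator above, assuming an already-initialized OpenCL
// context; the helper and program names are made up for illustration (inside ViennaCL this
// wiring is normally performed by the kernel classes further below in this file):
inline void example_build_lu_program(viennacl::ocl::context & ctx)
{
  std::string src;
  src.reserve(4096);
  generate_lu(src, "float", true);              // appends "__kernel void lu_factorize(...)" for row-major float data
  ctx.add_program(src, "example_lu_program");   // compiles the source; the kernel is then available as "lu_factorize"
}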
344 
345 
346 template<typename StringT>
347 void generate_scaled_rank1_update(StringT & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu)
348 {
349  source.append("__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append("cpu") : source.append("gpu"); source.append("( \n");
350  source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
351  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
352  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
353  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
354  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
355 
356  if (alpha_on_cpu) {
357  source.append(" "); source.append(numeric_string); source.append(" val, \n");
358  } else {
359  source.append(" __global const "); source.append(numeric_string); source.append(" *val, \n");
360  }
361  source.append(" unsigned int options2, \n");
362 
363  source.append(" __global const "); source.append(numeric_string); source.append(" * vec1, \n");
364  source.append(" unsigned int start1, \n");
365  source.append(" unsigned int inc1, \n");
366  source.append(" unsigned int size1, \n");
367 
368  source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n");
369  source.append(" unsigned int start2, \n");
370  source.append(" unsigned int inc2, \n");
371  source.append(" unsigned int size2) \n");
372  source.append("{ \n");
373 
374  if (alpha_on_cpu) {
375  source.append(" "); source.append(numeric_string); source.append(" alpha = val; \n");
376  } else {
377  source.append(" "); source.append(numeric_string); source.append(" alpha = val[0]; \n");
378  }
379  source.append(" if (options2 & (1 << 0)) \n");
380  source.append(" alpha = -alpha; \n");
381 
382  source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
383  source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
384 
385  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
386  source.append(" { \n");
387  source.append(" "); source.append(numeric_string); source.append(" tmp = vec1[row * inc1 + start1];");
388  source.append(" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
389  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
390  if (is_row_major)
391  source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
392  else
393  source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
394  source.append(" } \n");
395  source.append("} \n");
396 }
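// The generator above emits two kernel variants: scaled_rank1_update_cpu (alpha passed by
// value) and scaled_rank1_update_gpu (alpha read from a buffer as val[0]). In both cases the
// low bits of 'options2' modify alpha before the rank-1 update A += alpha * vec1 * vec2^T:
// bit 0 negates alpha, and bit 1 divides by alpha instead of multiplying (see the
// "alpha = -alpha" and "tmp / alpha : tmp * alpha" lines above).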
397 
398 template<typename StringT>
399 void generate_triangular_substitute_inplace(StringT & source, std::string const & numeric_string, bool is_row_major)
400 {
401  source.append("__kernel void triangular_substitute_inplace( \n");
402  source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
403  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
404  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
405  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
406  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
407  source.append(" __global "); source.append(numeric_string); source.append(" * v, \n");
408  source.append(" unsigned int v_start, \n");
409  source.append(" unsigned int v_inc, \n");
410  source.append(" unsigned int v_size, \n");
411  source.append(" unsigned int options) \n");
412  source.append("{ \n");
413  source.append(" "); source.append(numeric_string); source.append(" temp; \n");
414  source.append(" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n");
415  source.append(" unsigned int transposed_access_A = (options & (1 << 1)); \n");
416  source.append(" unsigned int is_lower_solve = (options & (1 << 2)); \n");
417  source.append(" unsigned int row; \n");
418  source.append(" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n"); //Note: A required to be square
419  source.append(" { \n");
420  source.append(" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
421  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
422  source.append(" if (!unit_diagonal_flag) \n");
423  source.append(" { \n");
424  source.append(" if (get_global_id(0) == 0) \n");
425  if (is_row_major)
426  source.append(" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
427  else
428  source.append(" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
429  source.append(" } \n");
430 
431  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
432 
433  source.append(" temp = v[row * v_inc + v_start]; \n");
434 
435  source.append(" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
436  source.append(" elim < (is_lower_solve ? A_size1 : row); \n");
437  source.append(" elim += get_global_size(0)) \n");
438  if (is_row_major)
439  {
440  source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n");
441  source.append(" : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; \n");
442  }
443  else
444  {
445  source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n");
446  source.append(" : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; \n");
447  }
448  source.append(" } \n");
449  source.append("} \n");
450 }
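// Decoding of the 'options' bitfield of triangular_substitute_inplace, taken from the flag
// tests above: bit 0 = unit diagonal (skip the division by the diagonal entry), bit 1 =
// access A as its transpose, bit 2 = lower (forward) substitution instead of upper (backward)
// substitution. For example, a forward solve with unit diagonal passes
// options = (1 << 0) | (1 << 2) = 5.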
451 
452 template <typename StringT>
453 void generate_trans_kernel(StringT & source, std::string const & numeric_string, bool is_row_major)
454 {
455  source.append("__kernel void trans_kernel(\n");
456  source.append(" __global const ");source.append(numeric_string);source.append(" * A, \n");
457  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
458  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
459  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
460  source.append(" unsigned int A_stride1, unsigned int A_stride2, \n");
461  source.append(" __global ");source.append(numeric_string);source.append(" * B, \n");
462  source.append(" unsigned int B_start1, unsigned int B_start2, \n");
463  source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
464  source.append(" unsigned int B_stride1, unsigned int B_stride2) \n");
465  source.append("{ \n");
466  source.append(" for(unsigned int row = get_group_id(0); row < A_size1; row += get_num_groups(0))\n");
467  source.append(" { \n");
468  source.append(" for(unsigned int col = get_local_id(0); col < A_size2; col += get_local_size(0))\n");
469  source.append(" { \n");
470  if(is_row_major)
471  source.append(" B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)]; \n");
472  else
473  source.append(" B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1]; \n");
474  source.append(" } \n");
475  source.append(" } \n");
476  source.append("} \n");
477 }
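// trans_kernel above performs an out-of-place transpose B = A^T on sub-ranges: each
// work-group iterates over the rows of A and each work-item over its columns, writing
// element (row, col) of A to position (col, row) of B with the respective start/stride
// offsets applied on both sides.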
478 
479 namespace detail
480 {
481  inline std::string type_to_string(viennacl::row_major) { return "row"; }
482  inline std::string type_to_string(viennacl::column_major) { return "col"; }
483 }
484 
486 
488 template<typename NumericT>
489 class matrix
490 {
491 private:
492 
493  template<typename ScalarT1, typename ScalarT2>
494  static void generate_ambm_impl2(device_specific::execution_handler & handler, std::string const & prefix, device_specific::matrix_axpy_template::parameters_type const & parameters, scheduler::operation_node_type ASSIGN_OP,
495  viennacl::matrix_base<NumericT> const * x, viennacl::matrix_base<NumericT> const * y, ScalarT1 const * a,
496  viennacl::matrix_base<NumericT> const * z, ScalarT2 const * b)
497  {
498  namespace ds = viennacl::device_specific;
499 
500  handler.add(prefix + "0000", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, false, false));
501  handler.add(prefix + "1000", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, false, false));
502  handler.add(prefix + "0100", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, false, false));
503  handler.add(prefix + "1100", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, false, false));
504  if (b)
505  {
506  handler.add(prefix + "0010", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, true, false));
507  handler.add(prefix + "1010", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, true, false));
508  handler.add(prefix + "0110", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, true, false));
509  handler.add(prefix + "1110", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, true, false));
510 
511  handler.add(prefix + "0001", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, false, true));
512  handler.add(prefix + "1001", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, false, true));
513  handler.add(prefix + "0101", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, false, true));
514  handler.add(prefix + "1101", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, false, true));
515 
516  handler.add(prefix + "0011", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, false, z, b, true, true));
517  handler.add(prefix + "1011", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, false, z, b, true, true));
518  handler.add(prefix + "0111", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, false, true, z, b, true, true));
519  handler.add(prefix + "1111", ds::matrix_axpy_template(parameters), scheduler::preset::avbv(ASSIGN_OP, x, y, a, true, true, z, b, true, true));
520  }
521  }
522 
523  template<typename ScalarT>
524  static void generate_ambm_impl(device_specific::execution_handler & handler, std::string const & prefix, device_specific::matrix_axpy_template::parameters_type const & parameters, scheduler::operation_node_type ASSIGN_OP,
525  viennacl::matrix_base<NumericT> const * x, viennacl::matrix_base<NumericT> const * y, ScalarT const * ha, viennacl::scalar<ScalarT> const * da,
526  viennacl::matrix_base<NumericT> const * z, ScalarT const * hb, viennacl::scalar<ScalarT> const * db)
527  {
528  //x ASSIGN_OP a*y
529  generate_ambm_impl2(handler, prefix + "hm_", parameters, ASSIGN_OP, x, y, ha, (viennacl::matrix_base<NumericT>*)NULL, (NumericT*)NULL);
530  generate_ambm_impl2(handler, prefix + "dm_", parameters, ASSIGN_OP, x, y, da, (viennacl::matrix_base<NumericT>*)NULL, (NumericT*)NULL);
531 
532  //x ASSIGN_OP a*y + b*z
533  generate_ambm_impl2(handler, prefix + "hmhm_", parameters, ASSIGN_OP, x, y, ha, z, hb);
534  generate_ambm_impl2(handler, prefix + "dmhm_", parameters, ASSIGN_OP, x, y, da, z, hb);
535  generate_ambm_impl2(handler, prefix + "hmdm_", parameters, ASSIGN_OP, x, y, ha, z, db);
536  generate_ambm_impl2(handler, prefix + "dmdm_", parameters, ASSIGN_OP, x, y, da, z, db);
537  }
538 
539 
540 public:
541  static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context & ctx)
542  {
543  static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
544  cl_context h = ctx.handle().get();
545  std::pair<bool, cl_context> key(is_row_major, h);
546  if (handlers_map.find(key) == handlers_map.end())
547  {
549 
550  namespace ds = viennacl::device_specific;
551  viennacl::ocl::device const & device = ctx.current_device();
552  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"matrix_row":"matrix_col");
553  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
554  ds::execution_handler & handler = viennacl::device_specific::at(handlers_map, key);
555 
556  ds::matrix_axpy_template::parameters_type matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
557  ds::vector_axpy_template::parameters_type vector_axpy_params = ds::builtin_database::vector_axpy_params<NumericT>(device);
558 
560  if (is_row_major)
561  {
565  }
566  else
567  {
571  }
572 
582  NumericT ha;
583  NumericT hb;
584  int hi = 0;
585  unsigned int hui = 0;
586 
587  // fully parametrized kernels:
588  generate_ambm_impl(handler, "assign_", matrix_axpy_params, scheduler::OPERATION_BINARY_ASSIGN_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
589  generate_ambm_impl(handler, "ip_add_", matrix_axpy_params, scheduler::OPERATION_BINARY_INPLACE_ADD_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
590 
591  handler.add("assign_cpu", ds::matrix_axpy_template(matrix_axpy_params), scheduler::preset::assign_cpu(&A, &M));
592  handler.add("matrix_diag_from_vector", ds::matrix_axpy_template(matrix_axpy_params), scheduler::preset::matrix_diag_from_vector(&x, &A, hi));
593  handler.add("matrix_row", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::matrix_row(&x, &A, hui));
594  handler.add("matrix_column", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::matrix_column(&x, &A, hui));
595  handler.add("matrix_diag_to_vector", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::matrix_diag_to_vector(&x, &A, hi));
596  handler.add("diagonal_assign_cpu", ds::vector_axpy_template(vector_axpy_params), scheduler::preset::diagonal_assign_cpu(&A, &sx));
597  }
598  return viennacl::device_specific::at(handlers_map, key);
599  }
600 };
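// Kernel-name scheme used by generate_ambm_impl() / generate_ambm_impl2() above: the prefix
// selects the operation ("assign_", "ip_add_") and the scalar sources ("hm" = host scalar,
// "dm" = device scalar), while the four trailing digits are the flip_a, reciprocal_a, flip_b
// and reciprocal_b flags passed to scheduler::preset::avbv(), in that order. For example,
// "assign_hmdm_0100" is the variant with a host scalar a used as 1/a and a device scalar b.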
601 
602 // main kernel class
604 template<typename NumericT>
605 class matrix_element
606 {
607 
608 public:
609  static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context & ctx)
610  {
611  static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
612  cl_context h = ctx.handle().get();
613  std::pair<bool, cl_context> key(is_row_major, h);
614  if (handlers_map.find(key) == handlers_map.end())
615  {
617 
618  namespace ds = viennacl::device_specific;
619  using namespace scheduler;
621 
622  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
623  viennacl::ocl::device const & device = ctx.current_device();
624  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"matrix_element_row":"matrix_element_col");
625  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
626  ds::execution_handler & handler = viennacl::device_specific::at(handlers_map, key);
627  ds::matrix_axpy_template::parameters_type matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
628 
630  if (is_row_major)
631  {
635  }
636  else
637  {
641  }
642 
646 
647 
648  // unary operations
649 #define VIENNACL_ADD_UNARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::unary_element_op(&A, &B, OPTYPE))
650  if (numeric_string == "float" || numeric_string == "double")
651  {
668  }
669  else
670  {
672  }
673 #undef VIENNACL_ADD_UNARY
674 
675  // binary operations
676 #define VIENNACL_ADD_BINARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::binary_element_op(&A, &B, &C, OPTYPE))
679  if (numeric_string == "float" || numeric_string == "double")
680  {
682  }
683 #undef VIENNACL_ADD_BINARY
684 
685  }
686  return viennacl::device_specific::at(handlers_map, key);
687  }
688 };
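// The VIENNACL_ADD_UNARY / VIENNACL_ADD_BINARY helpers above register one element-wise
// kernel per operation, keyed by operator_string(OPTYPE); which operations get registered
// depends on whether the numeric type is floating-point, as selected by the surrounding
// if (numeric_string == "float" || numeric_string == "double") branches.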
689 
690 
692 template<typename NumericT>
693 class row_wise_reduction
694 {
695 public:
696  static device_specific::execution_handler & execution_handler(viennacl::ocl::context & ctx)
697  {
698  static std::map<cl_context, device_specific::execution_handler> handlers_map;
699  cl_context key = ctx.handle().get();
700  if (handlers_map.find(key) == handlers_map.end())
701  {
703 
704  namespace ds = viennacl::device_specific;
705  viennacl::ocl::device const & device = ctx.current_device();
706  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_row_wise";
707  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
708  ds::execution_handler & handler = viennacl::device_specific::at(handlers_map, key);
709 
713  handler.add("mat_vec_T", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device, 'T'), 'T'), scheduler::preset::mat_vec_prod(&A, true, &x, &y));
714  handler.add("mat_vec_N", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device, 'N'), 'N'), scheduler::preset::mat_vec_prod(&A, false, &x, &y));
715 
716  }
717  return viennacl::device_specific::at(handlers_map, key);
718  }
719 };
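// The handler above registers the two matrix-vector product kernels of this program:
// "mat_vec_N" for the non-transposed and "mat_vec_T" for the transposed case (the bool
// argument of scheduler::preset::mat_vec_prod() is the transpose flag), each with its own
// tuned parameter set from row_wise_reduction_params<NumericT>(device, 'N'/'T').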
720 
722 template<typename NumericT>
723 class matrix_prod
724 {
725 public:
726  static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context & ctx)
727  {
728  static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
729  cl_context h = ctx.handle().get();
730  std::pair<bool, cl_context> key(is_row_major, h);
731  if (handlers_map.find(key) == handlers_map.end())
732  {
734 
735  namespace ds = viennacl::device_specific;
736  viennacl::ocl::device const & device = ctx.current_device();
737  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"_matrix_prod_row":"_matrix_prod_col");
738  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
739  ds::execution_handler & handler = viennacl::device_specific::at(handlers_map, key);
740 
741  ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'N');
742  ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'N');
743  ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'T');
744  ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'T');
745 
747  if (is_row_major)
749  else
751 
752  //Dummy types. The values don't matter for the kernel generation.
756  NumericT alpha = 1;
757  NumericT beta = 0;
758 
759  handler.add("prod_NN", ds::matrix_product_template(matrix_product_params_NN, 'N', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, false, beta, &C));
760  handler.add("prod_TN", ds::matrix_product_template(matrix_product_params_TN, 'T', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, false, beta, &C));
761  handler.add("prod_NT", ds::matrix_product_template(matrix_product_params_NT, 'N', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, true, beta, &C));
762  handler.add("prod_TT", ds::matrix_product_template(matrix_product_params_TT, 'T', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, true, beta, &C));
763 
764  }
765  return viennacl::device_specific::at(handlers_map, key);
766  }
767 };
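// The four GEMM kernels registered above, "prod_NN", "prod_TN", "prod_NT" and "prod_TT",
// compute C = alpha * op(A) * op(B) + beta * C with op() being the identity ('N') or the
// transpose ('T') of the respective operand, each using its own tuned parameter set from
// matrix_product_params<NumericT>(device, 'N'/'T', 'N'/'T').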
768 
769 // main kernel class
771 template<typename NumericT, typename LayoutT>
772 struct matrix_legacy
773 {
774  static std::string program_name()
775  {
776  return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_" + detail::type_to_string(LayoutT());
777  }
778 
779  static void init(viennacl::ocl::context & ctx)
780  {
781  static std::map<cl_context, bool> init_done;
782  if (!init_done[ctx.handle().get()])
783  {
784  viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
785  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
786  bool is_row_major = viennacl::is_row_major<LayoutT>::value;
787 
788  std::string source;
789  source.reserve(8192);
790 
791  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
792 
793  // kernels with mostly predetermined skeleton:
794  generate_scaled_rank1_update(source, numeric_string, is_row_major, true);
795  generate_scaled_rank1_update(source, numeric_string, is_row_major, false);
796 
797  if (numeric_string == "float" || numeric_string == "double")
798  {
799  generate_fft(source, numeric_string, is_row_major);
800  generate_lu(source, numeric_string, is_row_major);
801  generate_triangular_substitute_inplace(source, numeric_string, is_row_major);
802  generate_trans_kernel(source, numeric_string, is_row_major);
803  }
804 
805  std::string prog_name = program_name();
806  #ifdef VIENNACL_BUILD_INFO
807  std::cout << "Creating program " << prog_name << std::endl;
808  #endif
809  ctx.add_program(source, prog_name);
810  init_done[ctx.handle().get()] = true;
811  } //if
812  } //init
813 };
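// Sketch of how the legacy kernels above are typically consumed (variable and type choices
// here are illustrative): init() compiles the program at most once per context, after which
// individual kernels can be fetched by name through the context, e.g.
//
//   matrix_legacy<float, viennacl::row_major>::init(ctx);
//   viennacl::ocl::kernel & k = ctx.get_kernel(
//       matrix_legacy<float, viennacl::row_major>::program_name(), "lu_factorize");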
814 
815 
816 
817 
818 template<typename StringT>
819 void generate_matrix_convert_row(StringT & source, std::string const & dest_type, std::string const & src_type)
820 {
821  source.append(" __kernel void convert_row_" + dest_type + "_" + src_type + "( \n");
822  source.append(" __global " + dest_type + " * dest, \n");
823  source.append(" unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
824  source.append(" unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
825  source.append(" __global const " + src_type + " * src, \n");
826  source.append(" unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
827  source.append(" unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
828  source.append(" { \n");
829  source.append(" for (unsigned int i = get_group_id(0); i < size1_dest; i += get_num_groups(0)) \n");
830  source.append(" for (unsigned int j = get_local_id(0); j < size2_dest; j += get_local_size(0)) \n");
831  source.append(" dest[(start1_dest + i * inc1_dest) * internal_size2_dest + (start2_dest + j * inc2_dest)] = src[(start1_src + i * inc1_src) * internal_size2_src + (start2_src + j * inc2_src)]; \n");
832  source.append(" } \n");
833 }
834 
835 template<typename StringT>
836 void generate_matrix_convert_col(StringT & source, std::string const & dest_type, std::string const & src_type)
837 {
838  source.append(" __kernel void convert_col_" + dest_type + "_" + src_type + "( \n");
839  source.append(" __global " + dest_type + " * dest, \n");
840  source.append(" unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
841  source.append(" unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
842  source.append(" __global const " + src_type + " * src, \n");
843  source.append(" unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
844  source.append(" unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
845  source.append(" { \n");
846  source.append(" for (unsigned int j = get_group_id(0); j < size2_dest; j += get_num_groups(0)) \n");
847  source.append(" for (unsigned int i = get_local_id(0); i < size1_dest; i += get_local_size(0)) \n");
848  source.append(" dest[(start1_dest + i * inc1_dest) + (start2_dest + j * inc2_dest) * internal_size1_dest] = src[(start1_src + i * inc1_src) + (start2_src + j * inc2_src) * internal_size1_src]; \n");
849  source.append(" } \n");
850 }
851 
852 template<typename StringT>
853 void generate_matrix_convert(StringT & source, std::string const & dest_type, std::string const & src_type)
854 {
855  generate_matrix_convert_row(source, dest_type, src_type);
856  generate_matrix_convert_col(source, dest_type, src_type);
857 }
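// Each call to generate_matrix_convert() thus contributes a row-major and a column-major
// conversion kernel, named convert_row_<dest>_<src> and convert_col_<dest>_<src>; the copy
// is element-wise and relies on the implicit conversion from src_type to dest_type in the
// assignment. For instance, generate_matrix_convert(source, "float", "int") emits
// convert_row_float_int and convert_col_float_int.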
858 
860 class matrix_convert
861 {
862 
863 public:
864  static std::string program_name()
865  {
866  return "matrix_convert";
867  }
868 
869  static void init(viennacl::ocl::context & ctx)
870  {
871  static std::map<cl_context, bool> init_done;
872  if (!init_done[ctx.handle().get()])
873  {
874  std::string source;
875  source.reserve(4096);
876 
877  // int
883 
884  // unsigned int
890 
891  // long
897 
898  // unsigned long
904 
905  // float
911 
912  if (ctx.current_device().double_support())
913  {
915 
921 
928  }
929 
930  std::string prog_name = program_name();
931  #ifdef VIENNACL_BUILD_INFO
932  std::cout << "Creating program " << prog_name << std::endl;
933  #endif
934  ctx.add_program(source, prog_name);
935  init_done[ctx.handle().get()] = true;
936  } //if
937  } //init
938 
939 };
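// The resulting "matrix_convert" program bundles one convert_row_* / convert_col_* kernel
// per supported (destination, source) type pair; conversions involving "double" are only
// generated when ctx.current_device().double_support() reports double-precision support,
// as checked above.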
940 
941 
942 } // namespace kernels
943 } // namespace opencl
944 } // namespace linalg
945 } // namespace viennacl
946 #endif
947 