1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
69 template<
typename StringT>
73 source.append(
"__kernel void fft_direct(__global "); source.append(numeric_string); source.append(
"2 *input, \n");
74 source.append(
" __global "); source.append(numeric_string); source.append(
"2 *output, \n");
75 source.append(
" unsigned int size, \n");
76 source.append(
" unsigned int stride, \n");
77 source.append(
" unsigned int batch_num, \n");
78 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
79 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
81 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
82 source.append(
" for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
83 source.append(
" "); source.append(numeric_string); source.append(
"2 f = 0.0f; \n");
85 source.append(
" for (unsigned int n = 0; n < size; n++) { \n");
86 source.append(
" "); source.append(numeric_string); source.append(
"2 in = ");
88 source.append(
"input[batch_id * stride + n]; \n");
90 source.append(
"input[n * stride + batch_id]; \n");
92 source.append(
" "); source.append(numeric_string); source.append(
" sn, cs; \n");
93 source.append(
" "); source.append(numeric_string); source.append(
" arg = sign * 2 * NUM_PI * k / size * n; \n");
94 source.append(
" sn = sincos(arg, &cs); \n");
96 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
97 source.append(
" f = f + ("); source.append(numeric_string); source.append(
"2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
98 source.append(
" } \n");
101 source.append(
" output[batch_id * stride + k] = f; \n");
103 source.append(
" output[k * stride + batch_id] = f; \n");
104 source.append(
" } \n");
105 source.append(
" } \n");
106 source.append(
"} \n");
108 source.append(
" \n");
110 source.append(
"__kernel void fft_radix2(__global "); source.append(numeric_string); source.append(
"2* input, \n");
111 source.append(
" unsigned int s, \n");
112 source.append(
" unsigned int bit_size, \n");
113 source.append(
" unsigned int size, \n");
114 source.append(
" unsigned int stride, \n");
115 source.append(
" unsigned int batch_num, \n");
116 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
117 source.append(
" \n");
118 source.append(
" unsigned int ss = 1 << s; \n");
119 source.append(
" unsigned int half_size = size >> 1; \n");
120 source.append(
" \n");
121 source.append(
" "); source.append(numeric_string); source.append(
" cs, sn; \n");
122 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
123 source.append(
" \n");
124 source.append(
" unsigned int glb_id = get_global_id(0); \n");
125 source.append(
" unsigned int glb_sz = get_global_size(0); \n");
127 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
128 source.append(
" for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
129 source.append(
" unsigned int group = (tid & (ss - 1)); \n");
130 source.append(
" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
134 source.append(
" unsigned int offset = batch_id * stride + pos; \n");
135 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = input[offset]; \n");
136 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = input[offset + ss]; \n");
140 source.append(
" unsigned int offset = pos * stride + batch_id; \n");
141 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = input[offset]; \n");
142 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = input[offset + ss * stride]; \n");
145 source.append(
" "); source.append(numeric_string); source.append(
" arg = group * sign * NUM_PI / ss; \n");
147 source.append(
" sn = sincos(arg, &cs); \n");
149 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
151 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = ("); source.append(numeric_string); source.append(
"2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
154 source.append(
" input[offset + ss] = in1 - tmp; \n");
156 source.append(
" input[offset + ss * stride] = in1 - tmp; \n");
157 source.append(
" input[offset] = in1 + tmp; \n");
158 source.append(
" } \n");
159 source.append(
" } \n");
160 source.append(
"} \n");
162 source.append(
" \n");
164 source.append(
" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
165 source.append(
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
166 source.append(
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
167 source.append(
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
168 source.append(
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
169 source.append(
" v = (v >> 16) | (v << 16); \n");
170 source.append(
" \n");
171 source.append(
" v = v >> (32 - bit_size); \n");
172 source.append(
" \n");
173 source.append(
" return v; \n");
174 source.append(
" } \n");
176 source.append(
" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append(
"2* input, \n");
177 source.append(
" __local "); source.append(numeric_string); source.append(
"2* lcl_input, \n");
178 source.append(
" unsigned int bit_size, \n");
179 source.append(
" unsigned int size, \n");
180 source.append(
" unsigned int stride, \n");
181 source.append(
" unsigned int batch_num, \n");
182 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
184 source.append(
" unsigned int grp_id = get_group_id(0); \n");
185 source.append(
" unsigned int grp_num = get_num_groups(0); \n");
187 source.append(
" unsigned int lcl_sz = get_local_size(0); \n");
188 source.append(
" unsigned int lcl_id = get_local_id(0); \n");
189 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
191 source.append(
" for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
194 source.append(
" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
195 source.append(
" unsigned int v = get_reorder_num(p, bit_size); \n");
197 source.append(
" lcl_input[v] = input[batch_id * stride + p]; \n");
199 source.append(
" lcl_input[v] = input[p * stride + batch_id]; \n");
200 source.append(
" } \n");
202 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
205 source.append(
" for (unsigned int s = 0; s < bit_size; s++) { \n");
206 source.append(
" unsigned int ss = 1 << s; \n");
208 source.append(
" "); source.append(numeric_string); source.append(
" cs, sn; \n");
210 source.append(
" for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
211 source.append(
" unsigned int group = (tid & (ss - 1)); \n");
212 source.append(
" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
214 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = lcl_input[pos]; \n");
215 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = lcl_input[pos + ss]; \n");
217 source.append(
" "); source.append(numeric_string); source.append(
" arg = group * sign * NUM_PI / ss; \n");
219 source.append(
" sn = sincos(arg, &cs); \n");
220 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
222 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = ("); source.append(numeric_string); source.append(
"2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
224 source.append(
" lcl_input[pos + ss] = in1 - tmp; \n");
225 source.append(
" lcl_input[pos] = in1 + tmp; \n");
226 source.append(
" } \n");
228 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
229 source.append(
" } \n");
232 source.append(
" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
234 source.append(
" input[batch_id * stride + p] = lcl_input[p]; \n");
236 source.append(
" input[p * stride + batch_id] = lcl_input[p]; \n");
237 source.append(
" } \n");
238 source.append(
" } \n");
239 source.append(
" } \n");
241 source.append(
" \n");
247 source.append(
"unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
248 source.append(
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
249 source.append(
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
250 source.append(
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
251 source.append(
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
252 source.append(
" v = (v >> 16) | (v << 16); \n");
254 source.append(
" v = v >> (32 - bit_size); \n");
256 source.append(
" return v; \n");
257 source.append(
"} \n");
259 source.append(
"__kernel void fft_reorder(__global "); source.append(numeric_string); source.append(
"2* input, \n");
260 source.append(
" unsigned int bit_size, \n");
261 source.append(
" unsigned int size, \n");
262 source.append(
" unsigned int stride, \n");
263 source.append(
" int batch_num) { \n");
265 source.append(
" unsigned int glb_id = get_global_id(0); \n");
266 source.append(
" unsigned int glb_sz = get_global_size(0); \n");
268 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
269 source.append(
" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
270 source.append(
" unsigned int v = get_reorder_num_2(i, bit_size); \n");
272 source.append(
" if (i < v) {\n");
275 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = input[batch_id * stride + i]; \n");
276 source.append(
" input[batch_id * stride + i] = input[batch_id * stride + v]; \n");
277 source.append(
" input[batch_id * stride + v] = tmp; \n");
281 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = input[i * stride + batch_id]; \n");
282 source.append(
" input[i * stride + batch_id] = input[v * stride + batch_id]; \n");
283 source.append(
" input[v * stride + batch_id] = tmp; \n");
285 source.append(
" } \n");
286 source.append(
" } \n");
287 source.append(
" } \n");
288 source.append(
"} \n");
291 template<
typename StringT>
294 source.append(
"__kernel void lu_factorize( \n");
295 source.append(
" __global "); source.append(numeric_string); source.append(
" * matrix, \n");
296 source.append(
" unsigned int matrix_rows, \n");
297 source.append(
" unsigned int matrix_cols, \n");
298 source.append(
" unsigned int matrix_internal_rows, \n");
299 source.append(
" unsigned int matrix_internal_cols) \n");
300 source.append(
"{ \n");
301 source.append(
" "); source.append(numeric_string); source.append(
" temp; \n");
305 source.append(
" unsigned rowi; \n");
306 source.append(
" unsigned rowk; \n");
307 source.append(
" for (unsigned int i=1; i<matrix_rows; ++i) \n");
308 source.append(
" { \n");
309 source.append(
" rowi = i * matrix_internal_cols; \n");
310 source.append(
" for (unsigned int k=0; k<i; ++k) \n");
311 source.append(
" { \n");
312 source.append(
" rowk = k * matrix_internal_cols; \n");
313 source.append(
" if (get_global_id(0) == 0) \n");
314 source.append(
" matrix[rowi + k] /= matrix[rowk + k]; \n");
316 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
317 source.append(
" temp = matrix[rowi + k]; \n");
320 source.append(
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n");
321 source.append(
" matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
325 source.append(
" for (unsigned int i=1; i<matrix_rows; ++i) \n");
326 source.append(
" { \n");
327 source.append(
" for (unsigned int k=0; k<i; ++k) \n");
328 source.append(
" { \n");
330 source.append(
" if (get_global_id(0) == 0) \n");
331 source.append(
" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");
333 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
334 source.append(
" temp = matrix[i + k*matrix_internal_rows]; \n");
337 source.append(
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
338 source.append(
" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
346 template<
typename StringT>
349 source.append(
"__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append(
"cpu") : source.append(
"gpu"); source.append(
"( \n");
350 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
351 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
352 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
353 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
354 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
357 source.append(
" "); source.append(numeric_string); source.append(
" val, \n");
359 source.append(
" __global const "); source.append(numeric_string); source.append(
" *val, \n");
361 source.append(
" unsigned int options2, \n");
363 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec1, \n");
364 source.append(
" unsigned int start1, \n");
365 source.append(
" unsigned int inc1, \n");
366 source.append(
" unsigned int size1, \n");
368 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec2, \n");
369 source.append(
" unsigned int start2, \n");
370 source.append(
" unsigned int inc2, \n");
371 source.append(
" unsigned int size2) \n");
372 source.append(
"{ \n");
375 source.append(
" "); source.append(numeric_string); source.append(
" alpha = val; \n");
377 source.append(
" "); source.append(numeric_string); source.append(
" alpha = val[0]; \n");
379 source.append(
" if (options2 & (1 << 0)) \n");
380 source.append(
" alpha = -alpha; \n");
382 source.append(
" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
383 source.append(
" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
385 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
386 source.append(
" { \n");
387 source.append(
" "); source.append(numeric_string); source.append(
" tmp = vec1[row * inc1 + start1];");
388 source.append(
" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
389 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
391 source.append(
" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
393 source.append(
" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
394 source.append(
" } \n");
395 source.append(
"} \n");
398 template<
typename StringT>
401 source.append(
"__kernel void triangular_substitute_inplace( \n");
402 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
403 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
404 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
405 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
406 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
407 source.append(
" __global "); source.append(numeric_string); source.append(
" * v, \n");
408 source.append(
" unsigned int v_start, \n");
409 source.append(
" unsigned int v_inc, \n");
410 source.append(
" unsigned int v_size, \n");
411 source.append(
" unsigned int options) \n");
412 source.append(
"{ \n");
413 source.append(
" "); source.append(numeric_string); source.append(
" temp; \n");
414 source.append(
" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n");
415 source.append(
" unsigned int transposed_access_A = (options & (1 << 1)); \n");
416 source.append(
" unsigned int is_lower_solve = (options & (1 << 2)); \n");
417 source.append(
" unsigned int row; \n");
418 source.append(
" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n");
419 source.append(
" { \n");
420 source.append(
" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
421 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
422 source.append(
" if (!unit_diagonal_flag) \n");
423 source.append(
" { \n");
424 source.append(
" if (get_global_id(0) == 0) \n");
426 source.append(
" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
428 source.append(
" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
429 source.append(
" } \n");
431 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
433 source.append(
" temp = v[row * v_inc + v_start]; \n");
435 source.append(
" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
436 source.append(
" elim < (is_lower_solve ? A_size1 : row); \n");
437 source.append(
" elim += get_global_size(0)) \n");
440 source.append(
" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n");
441 source.append(
" : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; \n");
445 source.append(
" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n");
446 source.append(
" : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; \n");
448 source.append(
" } \n");
449 source.append(
"} \n");
452 template <
typename StringT>
455 source.append(
"__kernel void trans_kernel(\n");
456 source.append(
" __global const ");source.append(numeric_string);source.append(
" * A, \n");
457 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
458 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
459 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
460 source.append(
" unsigned int A_stride1, unsigned int A_stride2, \n");
461 source.append(
" __global ");source.append(numeric_string);source.append(
" * B, \n");
462 source.append(
" unsigned int B_start1, unsigned int B_start2, \n");
463 source.append(
" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
464 source.append(
" unsigned int B_stride1, unsigned int B_stride2) \n");
465 source.append(
"{ \n");
466 source.append(
" for(unsigned int row = get_group_id(0); row < A_size1; row += get_num_groups(0))\n");
467 source.append(
" { \n");
468 source.append(
" for(unsigned int col = get_local_id(0); col < A_size2; col += get_local_size(0))\n");
469 source.append(
" { \n");
471 source.append(
" B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)]; \n");
473 source.append(
" B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1]; \n");
474 source.append(
" } \n");
475 source.append(
" } \n");
476 source.append(
"} \n");
488 template<
typename NumericT>
493 template<
typename ScalarT1,
typename ScalarT2>
500 handler.
add(prefix +
"0000", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
false,
false));
501 handler.
add(prefix +
"1000", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
false,
false));
502 handler.
add(prefix +
"0100", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
false,
false));
503 handler.
add(prefix +
"1100", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
false,
false));
506 handler.
add(prefix +
"0010", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
true,
false));
507 handler.
add(prefix +
"1010", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
true,
false));
508 handler.
add(prefix +
"0110", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
true,
false));
509 handler.
add(prefix +
"1110", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
true,
false));
511 handler.
add(prefix +
"0001", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
false,
true));
512 handler.
add(prefix +
"1001", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
false,
true));
513 handler.
add(prefix +
"0101", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
false,
true));
514 handler.
add(prefix +
"1101", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
false,
true));
516 handler.
add(prefix +
"0011", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
false, z, b,
true,
true));
517 handler.
add(prefix +
"1011", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
false, z, b,
true,
true));
518 handler.
add(prefix +
"0111", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
false,
true, z, b,
true,
true));
519 handler.
add(prefix +
"1111", ds::matrix_axpy_template(parameters),
scheduler::preset::avbv(ASSIGN_OP, x, y, a,
true,
true, z, b,
true,
true));
523 template<
typename ScalarT>
533 generate_ambm_impl2(handler, prefix +
"hmhm_", parameters, ASSIGN_OP, x, y, ha, z, hb);
534 generate_ambm_impl2(handler, prefix +
"dmhm_", parameters, ASSIGN_OP, x, y, da, z, hb);
535 generate_ambm_impl2(handler, prefix +
"hmdm_", parameters, ASSIGN_OP, x, y, ha, z, db);
536 generate_ambm_impl2(handler, prefix +
"dmdm_", parameters, ASSIGN_OP, x, y, da, z, db);
545 std::pair<bool, cl_context> key(is_row_major, h);
546 if (handlers_map.find(key) == handlers_map.end())
553 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
556 ds::matrix_axpy_template::parameters_type
matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
557 ds::vector_axpy_template::parameters_type
vector_axpy_params = ds::builtin_database::vector_axpy_params<NumericT>(device);
585 unsigned int hui = 0;
588 generate_ambm_impl(handler,
"assign_", matrix_axpy_params,
scheduler::OPERATION_BINARY_ASSIGN_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
589 generate_ambm_impl(handler,
"ip_add_", matrix_axpy_params,
scheduler::OPERATION_BINARY_INPLACE_ADD_TYPE, &A, &B, &ha, &da, &C, &hb, &db);
604 template<
typename NumericT>
613 std::pair<bool, cl_context> key(is_row_major, h);
614 if (handlers_map.find(key) == handlers_map.end())
619 using namespace scheduler;
625 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
627 ds::matrix_axpy_template::parameters_type
matrix_axpy_params = ds::builtin_database::matrix_axpy_params<NumericT>(device);
649 #define VIENNACL_ADD_UNARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::unary_element_op(&A, &B, OPTYPE))
650 if (numeric_string ==
"float" || numeric_string ==
"double")
673 #undef VIENNACL_ADD_UNARY
676 #define VIENNACL_ADD_BINARY(OPTYPE) handler.add(operator_string(OPTYPE), ds::matrix_axpy_template(matrix_axpy_params),scheduler::preset::binary_element_op(&A, &B, &C, OPTYPE))
679 if (numeric_string ==
"float" || numeric_string ==
"double")
683 #undef VIENNACL_ADD_BINARY
692 template<
typename NumericT>
698 static std::map<cl_context, device_specific::execution_handler> handlers_map;
700 if (handlers_map.find(key) == handlers_map.end())
707 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
713 handler.add(
"mat_vec_T", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device,
'T'),
'T'),
scheduler::preset::mat_vec_prod(&A,
true, &x, &y));
714 handler.add(
"mat_vec_N", ds::row_wise_reduction_template(ds::builtin_database::row_wise_reduction_params<NumericT>(device,
'N'),
'N'),
scheduler::preset::mat_vec_prod(&A,
false, &x, &y));
722 template<
typename NumericT>
730 std::pair<bool, cl_context> key(is_row_major, h);
731 if (handlers_map.find(key) == handlers_map.end())
738 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
741 ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device,
'N',
'N');
742 ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device,
'T',
'N');
743 ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device,
'N',
'T');
744 ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device,
'T',
'T');
759 handler.add(
"prod_NN", ds::matrix_product_template(matrix_product_params_NN,
'N',
'N'),
scheduler::preset::mat_mat_prod(alpha, &A,
false, &B,
false, beta, &C));
760 handler.add(
"prod_TN", ds::matrix_product_template(matrix_product_params_TN,
'T',
'N'),
scheduler::preset::mat_mat_prod(alpha, &A,
true, &B,
false, beta, &C));
761 handler.add(
"prod_NT", ds::matrix_product_template(matrix_product_params_NT,
'N',
'T'),
scheduler::preset::mat_mat_prod(alpha, &A,
false, &B,
true, beta, &C));
762 handler.add(
"prod_TT", ds::matrix_product_template(matrix_product_params_TT,
'T',
'T'),
scheduler::preset::mat_mat_prod(alpha, &A,
true, &B,
true, beta, &C));
771 template<
typename NumericT,
typename LayoutT>
781 static std::map<cl_context, bool> init_done;
789 source.reserve(8192);
791 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
797 if (numeric_string ==
"float" || numeric_string ==
"double")
805 std::string prog_name = program_name();
806 #ifdef VIENNACL_BUILD_INFO
807 std::cout <<
"Creating program " << prog_name << std::endl;
809 ctx.add_program(source, prog_name);
810 init_done[ctx.handle().get()] =
true;
818 template<
typename StringT>
821 source.append(
" __kernel void convert_row_" + dest_type +
"_" + src_type +
"( \n");
822 source.append(
" __global " + dest_type +
" * dest, \n");
823 source.append(
" unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
824 source.append(
" unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
825 source.append(
" __global const " + src_type +
" * src, \n");
826 source.append(
" unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
827 source.append(
" unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
828 source.append(
" { \n");
829 source.append(
" for (unsigned int i = get_group_id(0); i < size1_dest; i += get_num_groups(0)) \n");
830 source.append(
" for (unsigned int j = get_local_id(0); j < size2_dest; j += get_local_size(0)) \n");
831 source.append(
" dest[(start1_dest + i * inc1_dest) * internal_size2_dest + (start2_dest + j * inc2_dest)] = src[(start1_src + i * inc1_src) * internal_size2_src + (start2_src + j * inc2_src)]; \n");
832 source.append(
" } \n");
835 template<
typename StringT>
838 source.append(
" __kernel void convert_col_" + dest_type +
"_" + src_type +
"( \n");
839 source.append(
" __global " + dest_type +
" * dest, \n");
840 source.append(
" unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
841 source.append(
" unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
842 source.append(
" __global const " + src_type +
" * src, \n");
843 source.append(
" unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
844 source.append(
" unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
845 source.append(
" { \n");
846 source.append(
" for (unsigned int j = get_group_id(0); j < size2_dest; j += get_num_groups(0)) \n");
847 source.append(
" for (unsigned int i = get_local_id(0); i < size1_dest; i += get_local_size(0)) \n");
848 source.append(
" dest[(start1_dest + i * inc1_dest) + (start2_dest + j * inc2_dest) * internal_size1_dest] = src[(start1_src + i * inc1_src) + (start2_src + j * inc2_src) * internal_size1_src]; \n");
849 source.append(
" } \n");
852 template<
typename StringT>
866 return "matrix_convert";
871 static std::map<cl_context, bool> init_done;
875 source.reserve(4096);
930 std::string prog_name = program_name();
931 #ifdef VIENNACL_BUILD_INFO
932 std::cout <<
"Creating program " << prog_name << std::endl;
viennacl::ocl::device const & current_device() const
Returns the current device.
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
void generate_fft(StringT &source, std::string const &numeric_string, bool is_row_major)
void generate_triangular_substitute_inplace(StringT &source, std::string const &numeric_string, bool is_row_major)
#define VIENNACL_ADD_UNARY(OPTYPE)
statement matrix_diag_from_vector(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, int id)
Helper class for checking whether a matrix has a row-major layout.
matrix_axpy_template::parameters_type const & matrix_axpy_params(ocl::device const &device)
void append_double_precision_pragma< double >(viennacl::ocl::context const &ctx, std::string &source)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities.
A class representing a compute device (e.g. a GPU)
void add(std::string const &key, template_base const &T, statements_container const &statements)
static device_specific::execution_handler & execution_handler(viennacl::ocl::context &ctx)
scheduler::statement avbv(scheduler::operation_node_type ASSIGN_OP, NumericT const *x, NumericT const *y, ScalarT1 const *a, bool flip_a, bool reciprocal_a, NumericT const *z, ScalarT2 const *b, bool flip_b, bool reciprocal_b)
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
Main kernel class for generating OpenCL kernels for elementwise operations other than addition and su...
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
static void apply(viennacl::ocl::context const &)
bool with_stride_and_range
viennacl::ocl::program & add_program(cl_program p, std::string const &prog_name)
Adds a program to the context.
const OCL_TYPE & get() const
#define VIENNACL_ADD_BINARY(OPTYPE)
statement mat_vec_prod(viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::vector_base< NumericT > const *x, viennacl::vector_base< NumericT > const *y)
vector_axpy_template::parameters_type const & vector_axpy_params(ocl::device const &device)
static void init(viennacl::ocl::context &ctx)
Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type v...
void generate_trans_kernel(StringT &source, std::string const &numeric_string, bool is_row_major)
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initial...
bool double_support() const
ViennaCL convenience function: Returns true if the device supports double precision.
static std::string program_name()
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Configuration struct for generating OpenCL kernels for linear combinations of matrices.
operation_node_type
Enumeration for identifying the possible operations.
void generate_lu(StringT &source, std::string const &numeric_string, bool is_row_major)
statement mat_mat_prod(NumericT alpha, viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::matrix_base< NumericT > const *B, bool B_trans, NumericT beta, viennacl::matrix_base< NumericT > const *C)
statement matrix_diag_to_vector(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, int id)
Main kernel class for vector conversion routines (e.g. convert vector to vector).
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
void generate_scaled_rank1_update(StringT &source, std::string const &numeric_string, bool is_row_major, bool alpha_on_cpu)
Representation of an OpenCL kernel in ViennaCL.
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initial...
scheduler::statement diagonal_assign_cpu(matrix_base< NumericT > const *x, implicit_vector_base< NumericT > const *y)
std::string type_to_string(viennacl::row_major)
Provides an OpenCL kernel generator.
statement matrix_row(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, unsigned int id)
void generate_matrix_convert_col(StringT &source, std::string const &dest_type, std::string const &src_type)
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
statement matrix_column(viennacl::vector_base< NumericT > const *x, viennacl::matrix_base< NumericT > const *A, unsigned int id)
A tag for column-major storage of a dense matrix.
ambm_scalar_type
Enumeration for the scalar type in ambm-like operations.
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
scheduler::statement assign_cpu(vector_base< NumericT > const *x, implicit_vector_base< NumericT > const *y)
const char * operator_string(scheduler::operation_node_type type)
void generate_matrix_convert(StringT &source, std::string const &dest_type, std::string const &src_type)
ValueT const & at(std::map< KeyT, ValueT > const &map, KeyT const &key)
Emulation of C++11's .at() member for std::map<>, const-version.
static void init(viennacl::ocl::context &ctx)
Helper class for converting a type to its string representation.
A tag for row-major storage of a dense matrix.
static std::string program_name()
Helper for handling fallbacks, lazy compilation, input-dependent kernels, etc.
void generate_matrix_convert_row(StringT &source, std::string const &dest_type, std::string const &src_type)