1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
41 template<
typename StringT>
44 source.append(
"__kernel void block_trans_lu_backward( \n");
45 source.append(
" __global const unsigned int * row_jumper_U, \n");
46 source.append(
" __global const unsigned int * column_indices_U, \n");
47 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements_U, \n");
48 source.append(
" __global const "); source.append(numeric_string); source.append(
" * diagonal_U, \n");
49 source.append(
" __global const unsigned int * block_offsets, \n");
50 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
51 source.append(
" unsigned int size) \n");
52 source.append(
"{ \n");
53 source.append(
" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
54 source.append(
" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
55 source.append(
" unsigned int row_start; \n");
56 source.append(
" unsigned int row_stop; \n");
57 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = 0; \n");
59 source.append(
" if (col_start >= col_stop) \n");
60 source.append(
" return; \n");
63 source.append(
" for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
64 source.append(
" { \n");
65 source.append(
" unsigned int col = (col_stop - iter) - 1; \n");
66 source.append(
" result_entry = result[col] / diagonal_U[col]; \n");
67 source.append(
" row_start = row_jumper_U[col]; \n");
68 source.append(
" row_stop = row_jumper_U[col + 1]; \n");
69 source.append(
" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
70 source.append(
" result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
71 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
72 source.append(
" } \n");
75 source.append(
" for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
76 source.append(
" result[col] /= diagonal_U[col]; \n");
77 source.append(
"} \n");
80 template<
typename StringT>
83 source.append(
"__kernel void block_trans_unit_lu_forward( \n");
84 source.append(
" __global const unsigned int * row_jumper_L, \n");
85 source.append(
" __global const unsigned int * column_indices_L, \n");
86 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements_L, \n");
87 source.append(
" __global const unsigned int * block_offsets, \n");
88 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
89 source.append(
" unsigned int size) \n");
90 source.append(
"{ \n");
91 source.append(
" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
92 source.append(
" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
93 source.append(
" unsigned int row_start = row_jumper_L[col_start]; \n");
94 source.append(
" unsigned int row_stop; \n");
95 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = 0; \n");
97 source.append(
" if (col_start >= col_stop) \n");
98 source.append(
" return; \n");
101 source.append(
" for (unsigned int col = col_start; col < col_stop; ++col) \n");
102 source.append(
" { \n");
103 source.append(
" result_entry = result[col]; \n");
104 source.append(
" row_stop = row_jumper_L[col + 1]; \n");
105 source.append(
" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
106 source.append(
" result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
107 source.append(
" row_start = row_stop; \n");
108 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
109 source.append(
" } \n");
111 source.append(
"}; \n");
117 template<
typename StringT>
119 bool B_transposed,
bool B_row_major,
bool C_row_major)
121 source.append(
"__kernel void ");
123 source.append(
"( \n");
124 source.append(
" __global const unsigned int * sp_mat_row_indices, \n");
125 source.append(
" __global const unsigned int * sp_mat_col_indices, \n");
126 source.append(
" __global const "); source.append(numeric_string); source.append(
" * sp_mat_elements, \n");
127 source.append(
" __global const "); source.append(numeric_string); source.append(
" * d_mat, \n");
128 source.append(
" unsigned int d_mat_row_start, \n");
129 source.append(
" unsigned int d_mat_col_start, \n");
130 source.append(
" unsigned int d_mat_row_inc, \n");
131 source.append(
" unsigned int d_mat_col_inc, \n");
132 source.append(
" unsigned int d_mat_row_size, \n");
133 source.append(
" unsigned int d_mat_col_size, \n");
134 source.append(
" unsigned int d_mat_internal_rows, \n");
135 source.append(
" unsigned int d_mat_internal_cols, \n");
136 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
137 source.append(
" unsigned int result_row_start, \n");
138 source.append(
" unsigned int result_col_start, \n");
139 source.append(
" unsigned int result_row_inc, \n");
140 source.append(
" unsigned int result_col_inc, \n");
141 source.append(
" unsigned int result_row_size, \n");
142 source.append(
" unsigned int result_col_size, \n");
143 source.append(
" unsigned int result_internal_rows, \n");
144 source.append(
" unsigned int result_internal_cols) { \n");
147 source.append(
" for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");
149 source.append(
" unsigned int row_start = sp_mat_row_indices[row]; \n");
150 source.append(
" unsigned int row_end = sp_mat_row_indices[row+1]; \n");
153 source.append(
" for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");
155 source.append(
" "); source.append(numeric_string); source.append(
" r = 0; \n");
157 source.append(
" for (unsigned int k = row_start; k < row_end; k ++) { \n");
159 source.append(
" unsigned int j = sp_mat_col_indices[k]; \n");
160 source.append(
" "); source.append(numeric_string); source.append(
" x = sp_mat_elements[k]; \n");
162 source.append(
" "); source.append(numeric_string);
163 if (B_transposed && B_row_major)
164 source.append(
" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + j * d_mat_col_inc ]; \n");
165 else if (B_transposed && !B_row_major)
166 source.append(
" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) + (d_mat_col_start + j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
167 else if (!B_transposed && B_row_major)
168 source.append(
" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
170 source.append(
" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
171 source.append(
" r += x * y; \n");
172 source.append(
" } \n");
175 source.append(
" result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
177 source.append(
" result[ (result_row_start + row * result_row_inc) + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
178 source.append(
" } \n");
179 source.append(
" } \n");
181 source.append(
"} \n");
185 template<
typename StringT>
199 template<
typename StringT>
203 source.append(
" __kernel void jacobi( \n");
204 source.append(
" __global const unsigned int * row_indices, \n");
205 source.append(
" __global const unsigned int * column_indices, \n");
206 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
207 source.append(
" "); source.append(numeric_string); source.append(
" weight, \n");
208 source.append(
" __global const "); source.append(numeric_string); source.append(
" * old_result, \n");
209 source.append(
" __global "); source.append(numeric_string); source.append(
" * new_result, \n");
210 source.append(
" __global const "); source.append(numeric_string); source.append(
" * rhs, \n");
211 source.append(
" unsigned int size) \n");
212 source.append(
" { \n");
213 source.append(
" "); source.append(numeric_string); source.append(
" sum, diag=1; \n");
214 source.append(
" int col; \n");
215 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
216 source.append(
" { \n");
217 source.append(
" sum = 0; \n");
218 source.append(
" for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
219 source.append(
" { \n");
220 source.append(
" col = column_indices[j]; \n");
221 source.append(
" if (i == col) \n");
222 source.append(
" diag = elements[j]; \n");
223 source.append(
" else \n");
224 source.append(
" sum += elements[j] * old_result[col]; \n");
225 source.append(
" } \n");
226 source.append(
" new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
227 source.append(
" } \n");
228 source.append(
" } \n");
233 template<
typename StringT>
237 source.append(
"__kernel void lu_backward( \n");
238 source.append(
" __global const unsigned int * row_indices, \n");
239 source.append(
" __global const unsigned int * column_indices, \n");
240 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
241 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
242 source.append(
" unsigned int size) \n");
243 source.append(
"{ \n");
244 source.append(
" __local unsigned int col_index_buffer[128]; \n");
245 source.append(
" __local "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
246 source.append(
" __local "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
248 source.append(
" unsigned int nnz = row_indices[size]; \n");
249 source.append(
" unsigned int current_row = size-1; \n");
250 source.append(
" unsigned int row_at_window_start = size-1; \n");
251 source.append(
" "); source.append(numeric_string); source.append(
" current_vector_entry = vector[size-1]; \n");
252 source.append(
" "); source.append(numeric_string); source.append(
" diagonal_entry = 0; \n");
253 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
254 source.append(
" unsigned int next_row = row_indices[size-1]; \n");
256 source.append(
" unsigned int i = loop_end + get_local_id(0); \n");
257 source.append(
" while (1) \n");
258 source.append(
" { \n");
260 source.append(
" if (i < nnz) \n");
261 source.append(
" { \n");
262 source.append(
" element_buffer[get_local_id(0)] = elements[i]; \n");
263 source.append(
" unsigned int tmp = column_indices[i]; \n");
264 source.append(
" col_index_buffer[get_local_id(0)] = tmp; \n");
265 source.append(
" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
266 source.append(
" } \n");
268 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
271 source.append(
" if (get_local_id(0) == 0) \n");
272 source.append(
" { \n");
274 source.append(
" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
275 source.append(
" { \n");
276 source.append(
" unsigned int k = (get_local_size(0) - k2) - 1; \n");
278 source.append(
" if (i+k >= nnz) \n");
279 source.append(
" continue; \n");
281 source.append(
" if (col_index_buffer[k] > row_at_window_start) \n");
282 source.append(
" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
283 source.append(
" else if (col_index_buffer[k] > current_row) \n");
284 source.append(
" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
285 source.append(
" else if (col_index_buffer[k] == current_row) \n");
286 source.append(
" diagonal_entry = element_buffer[k]; \n");
288 source.append(
" if (i+k == next_row) \n");
289 source.append(
" { \n");
290 source.append(
" vector[current_row] = current_vector_entry / diagonal_entry; \n");
291 source.append(
" if (current_row > 0) //load next row's data \n");
292 source.append(
" { \n");
293 source.append(
" --current_row; \n");
294 source.append(
" next_row = row_indices[current_row]; \n");
295 source.append(
" current_vector_entry = vector[current_row]; \n");
296 source.append(
" } \n");
297 source.append(
" } \n");
300 source.append(
" } \n");
302 source.append(
" row_at_window_start = current_row; \n");
303 source.append(
" } \n");
305 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
307 source.append(
" if (i < get_local_size(0)) \n");
308 source.append(
" break; \n");
310 source.append(
" i -= get_local_size(0); \n");
311 source.append(
" } \n");
312 source.append(
"} \n");
316 template<
typename StringT>
321 source.append(
"__kernel void lu_forward( \n");
322 source.append(
" __global const unsigned int * row_indices, \n");
323 source.append(
" __global const unsigned int * column_indices, \n");
324 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
325 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
326 source.append(
" unsigned int size) \n");
327 source.append(
"{ \n");
328 source.append(
" __local unsigned int col_index_buffer[128]; \n");
329 source.append(
" __local "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
330 source.append(
" __local "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
332 source.append(
" unsigned int nnz = row_indices[size]; \n");
333 source.append(
" unsigned int current_row = 0; \n");
334 source.append(
" unsigned int row_at_window_start = 0; \n");
335 source.append(
" "); source.append(numeric_string); source.append(
" current_vector_entry = vector[0]; \n");
336 source.append(
" "); source.append(numeric_string); source.append(
" diagonal_entry; \n");
337 source.append(
" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
338 source.append(
" unsigned int next_row = row_indices[1]; \n");
340 source.append(
" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
341 source.append(
" { \n");
343 source.append(
" if (i < nnz) \n");
344 source.append(
" { \n");
345 source.append(
" element_buffer[get_local_id(0)] = elements[i]; \n");
346 source.append(
" unsigned int tmp = column_indices[i]; \n");
347 source.append(
" col_index_buffer[get_local_id(0)] = tmp; \n");
348 source.append(
" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
349 source.append(
" } \n");
351 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
354 source.append(
" if (get_local_id(0) == 0) \n");
355 source.append(
" { \n");
357 source.append(
" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
358 source.append(
" { \n");
359 source.append(
" if (current_row < size && i+k == next_row) \n");
360 source.append(
" { \n");
361 source.append(
" vector[current_row] = current_vector_entry / diagonal_entry; \n");
362 source.append(
" ++current_row; \n");
363 source.append(
" if (current_row < size) \n");
364 source.append(
" { \n");
365 source.append(
" next_row = row_indices[current_row+1]; \n");
366 source.append(
" current_vector_entry = vector[current_row]; \n");
367 source.append(
" } \n");
368 source.append(
" } \n");
370 source.append(
" if (current_row < size && col_index_buffer[k] < current_row) \n");
371 source.append(
" { \n");
372 source.append(
" if (col_index_buffer[k] < row_at_window_start) \n");
373 source.append(
" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
374 source.append(
" else if (col_index_buffer[k] < current_row) \n");
375 source.append(
" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
376 source.append(
" } \n");
377 source.append(
" else if (col_index_buffer[k] == current_row) \n");
378 source.append(
" diagonal_entry = element_buffer[k]; \n");
380 source.append(
" } \n");
382 source.append(
" row_at_window_start = current_row; \n");
383 source.append(
" } \n");
385 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
386 source.append(
" } \n");
387 source.append(
"} \n");
391 template<
typename StringT>
394 source.append(
"__kernel void row_info_extractor( \n");
395 source.append(
" __global const unsigned int * row_indices, \n");
396 source.append(
" __global const unsigned int * column_indices, \n");
397 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
398 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
399 source.append(
" unsigned int size, \n");
400 source.append(
" unsigned int option \n");
401 source.append(
" ) \n");
402 source.append(
"{ \n");
403 source.append(
" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
404 source.append(
" { \n");
405 source.append(
" "); source.append(numeric_string); source.append(
" value = 0; \n");
406 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
408 source.append(
" switch (option) \n");
409 source.append(
" { \n");
410 source.append(
" case 0: \n");
411 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
412 source.append(
" value = max(value, fabs(elements[i])); \n");
413 source.append(
" break; \n");
415 source.append(
" case 1: \n");
416 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
417 source.append(
" value += fabs(elements[i]); \n");
418 source.append(
" break; \n");
420 source.append(
" case 2: \n");
421 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
422 source.append(
" value += elements[i] * elements[i]; \n");
423 source.append(
" value = sqrt(value); \n");
424 source.append(
" break; \n");
426 source.append(
" case 3: \n");
427 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
428 source.append(
" { \n");
429 source.append(
" if (column_indices[i] == row) \n");
430 source.append(
" { \n");
431 source.append(
" value = elements[i]; \n");
432 source.append(
" break; \n");
433 source.append(
" } \n");
434 source.append(
" } \n");
435 source.append(
" break; \n");
437 source.append(
" default: \n");
438 source.append(
" break; \n");
439 source.append(
" } \n");
440 source.append(
" result[row] = value; \n");
441 source.append(
" } \n");
442 source.append(
"} \n");
446 template<
typename StringT>
451 source.append(
"__kernel void trans_lu_backward( \n");
452 source.append(
" __global const unsigned int * row_indices, \n");
453 source.append(
" __global const unsigned int * column_indices, \n");
454 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
455 source.append(
" __global const "); source.append(numeric_string); source.append(
" * diagonal_entries, \n");
456 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
457 source.append(
" unsigned int size) \n");
458 source.append(
"{ \n");
459 source.append(
" __local unsigned int row_index_lookahead[256]; \n");
460 source.append(
" __local unsigned int row_index_buffer[256]; \n");
462 source.append(
" unsigned int row_index; \n");
463 source.append(
" unsigned int col_index; \n");
464 source.append(
" "); source.append(numeric_string); source.append(
" matrix_entry; \n");
465 source.append(
" unsigned int nnz = row_indices[size]; \n");
466 source.append(
" unsigned int row_at_window_start = size; \n");
467 source.append(
" unsigned int row_at_window_end; \n");
468 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
470 source.append(
" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
471 source.append(
" { \n");
472 source.append(
" unsigned int i = (nnz - i2) - 1; \n");
473 source.append(
" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
474 source.append(
" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
475 source.append(
" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
477 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
479 source.append(
" if (i2 < nnz) \n");
480 source.append(
" { \n");
481 source.append(
" unsigned int row_index_dec = 0; \n");
482 source.append(
" while (row_index_lookahead[row_index_dec] > i) \n");
483 source.append(
" ++row_index_dec; \n");
484 source.append(
" row_index = row_at_window_start - row_index_dec; \n");
485 source.append(
" row_index_buffer[get_local_id(0)] = row_index; \n");
486 source.append(
" } \n");
487 source.append(
" else \n");
488 source.append(
" { \n");
489 source.append(
" row_index = size+1; \n");
490 source.append(
" row_index_buffer[get_local_id(0)] = 0; \n");
491 source.append(
" } \n");
493 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
495 source.append(
" row_at_window_start = row_index_buffer[0]; \n");
496 source.append(
" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
499 source.append(
" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
500 source.append(
" { \n");
501 source.append(
" unsigned int row = row_at_window_start - row2; \n");
502 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = vector[row] / diagonal_entries[row]; \n");
504 source.append(
" if ( (row_index == row) && (col_index < row) ) \n");
505 source.append(
" vector[col_index] -= result_entry * matrix_entry; \n");
507 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
508 source.append(
" } \n");
510 source.append(
" row_at_window_start = row_at_window_end; \n");
511 source.append(
" } \n");
514 source.append(
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
515 source.append(
" vector[i] /= diagonal_entries[i]; \n");
516 source.append(
"} \n");
520 template<
typename StringT>
525 source.append(
"__kernel void trans_lu_forward( \n");
526 source.append(
" __global const unsigned int * row_indices, \n");
527 source.append(
" __global const unsigned int * column_indices, \n");
528 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
529 source.append(
" __global const "); source.append(numeric_string); source.append(
" * diagonal_entries, \n");
530 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
531 source.append(
" unsigned int size) \n");
532 source.append(
"{ \n");
533 source.append(
" __local unsigned int row_index_lookahead[256]; \n");
534 source.append(
" __local unsigned int row_index_buffer[256]; \n");
536 source.append(
" unsigned int row_index; \n");
537 source.append(
" unsigned int col_index; \n");
538 source.append(
" "); source.append(numeric_string); source.append(
" matrix_entry; \n");
539 source.append(
" unsigned int nnz = row_indices[size]; \n");
540 source.append(
" unsigned int row_at_window_start = 0; \n");
541 source.append(
" unsigned int row_at_window_end = 0; \n");
542 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
544 source.append(
" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
545 source.append(
" { \n");
546 source.append(
" col_index = (i < nnz) ? column_indices[i] : 0; \n");
547 source.append(
" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
548 source.append(
" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");
550 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
552 source.append(
" if (i < nnz) \n");
553 source.append(
" { \n");
554 source.append(
" unsigned int row_index_inc = 0; \n");
555 source.append(
" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
556 source.append(
" ++row_index_inc; \n");
557 source.append(
" row_index = row_at_window_start + row_index_inc; \n");
558 source.append(
" row_index_buffer[get_local_id(0)] = row_index; \n");
559 source.append(
" } \n");
560 source.append(
" else \n");
561 source.append(
" { \n");
562 source.append(
" row_index = size+1; \n");
563 source.append(
" row_index_buffer[get_local_id(0)] = size - 1; \n");
564 source.append(
" } \n");
566 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
568 source.append(
" row_at_window_start = row_index_buffer[0]; \n");
569 source.append(
" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
572 source.append(
" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
573 source.append(
" { \n");
574 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = vector[row] / diagonal_entries[row]; \n");
576 source.append(
" if ( (row_index == row) && (col_index > row) ) \n");
577 source.append(
" vector[col_index] -= result_entry * matrix_entry; \n");
579 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
580 source.append(
" } \n");
582 source.append(
" row_at_window_start = row_at_window_end; \n");
583 source.append(
" } \n");
586 source.append(
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
587 source.append(
" vector[i] /= diagonal_entries[i]; \n");
588 source.append(
"} \n");
592 template<
typename StringT>
597 source.append(
"__kernel void trans_unit_lu_backward( \n");
598 source.append(
" __global const unsigned int * row_indices, \n");
599 source.append(
" __global const unsigned int * column_indices, \n");
600 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
601 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
602 source.append(
" unsigned int size) \n");
603 source.append(
"{ \n");
604 source.append(
" __local unsigned int row_index_lookahead[256]; \n");
605 source.append(
" __local unsigned int row_index_buffer[256]; \n");
607 source.append(
" unsigned int row_index; \n");
608 source.append(
" unsigned int col_index; \n");
609 source.append(
" "); source.append(numeric_string); source.append(
" matrix_entry; \n");
610 source.append(
" unsigned int nnz = row_indices[size]; \n");
611 source.append(
" unsigned int row_at_window_start = size; \n");
612 source.append(
" unsigned int row_at_window_end; \n");
613 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
615 source.append(
" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
616 source.append(
" { \n");
617 source.append(
" unsigned int i = (nnz - i2) - 1; \n");
618 source.append(
" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
619 source.append(
" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
620 source.append(
" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
622 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
624 source.append(
" if (i2 < nnz) \n");
625 source.append(
" { \n");
626 source.append(
" unsigned int row_index_dec = 0; \n");
627 source.append(
" while (row_index_lookahead[row_index_dec] > i) \n");
628 source.append(
" ++row_index_dec; \n");
629 source.append(
" row_index = row_at_window_start - row_index_dec; \n");
630 source.append(
" row_index_buffer[get_local_id(0)] = row_index; \n");
631 source.append(
" } \n");
632 source.append(
" else \n");
633 source.append(
" { \n");
634 source.append(
" row_index = size+1; \n");
635 source.append(
" row_index_buffer[get_local_id(0)] = 0; \n");
636 source.append(
" } \n");
638 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
640 source.append(
" row_at_window_start = row_index_buffer[0]; \n");
641 source.append(
" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
644 source.append(
" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
645 source.append(
" { \n");
646 source.append(
" unsigned int row = row_at_window_start - row2; \n");
647 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = vector[row]; \n");
649 source.append(
" if ( (row_index == row) && (col_index < row) ) \n");
650 source.append(
" vector[col_index] -= result_entry * matrix_entry; \n");
652 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
653 source.append(
" } \n");
655 source.append(
" row_at_window_start = row_at_window_end; \n");
656 source.append(
" } \n");
657 source.append(
"} \n");
662 template<
typename StringT>
667 source.append(
"__kernel void trans_unit_lu_forward( \n");
668 source.append(
" __global const unsigned int * row_indices, \n");
669 source.append(
" __global const unsigned int * column_indices, \n");
670 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
671 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
672 source.append(
" unsigned int size) \n");
673 source.append(
"{ \n");
674 source.append(
" __local unsigned int row_index_lookahead[256]; \n");
675 source.append(
" __local unsigned int row_index_buffer[256]; \n");
677 source.append(
" unsigned int row_index; \n");
678 source.append(
" unsigned int col_index; \n");
679 source.append(
" "); source.append(numeric_string); source.append(
" matrix_entry; \n");
680 source.append(
" unsigned int nnz = row_indices[size]; \n");
681 source.append(
" unsigned int row_at_window_start = 0; \n");
682 source.append(
" unsigned int row_at_window_end = 0; \n");
683 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
685 source.append(
" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
686 source.append(
" { \n");
687 source.append(
" col_index = (i < nnz) ? column_indices[i] : 0; \n");
688 source.append(
" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
689 source.append(
" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");
691 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
693 source.append(
" if (i < nnz) \n");
694 source.append(
" { \n");
695 source.append(
" unsigned int row_index_inc = 0; \n");
696 source.append(
" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
697 source.append(
" ++row_index_inc; \n");
698 source.append(
" row_index = row_at_window_start + row_index_inc; \n");
699 source.append(
" row_index_buffer[get_local_id(0)] = row_index; \n");
700 source.append(
" } \n");
701 source.append(
" else \n");
702 source.append(
" { \n");
703 source.append(
" row_index = size+1; \n");
704 source.append(
" row_index_buffer[get_local_id(0)] = size - 1; \n");
705 source.append(
" } \n");
707 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
709 source.append(
" row_at_window_start = row_index_buffer[0]; \n");
710 source.append(
" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
713 source.append(
" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
714 source.append(
" { \n");
715 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = vector[row]; \n");
717 source.append(
" if ( (row_index == row) && (col_index > row) ) \n");
718 source.append(
" vector[col_index] -= result_entry * matrix_entry; \n");
720 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
721 source.append(
" } \n");
723 source.append(
" row_at_window_start = row_at_window_end; \n");
724 source.append(
" } \n");
725 source.append(
"} \n");
729 template<
typename StringT>
734 source.append(
"__kernel void trans_unit_lu_forward_slow( \n");
735 source.append(
" __global const unsigned int * row_indices, \n");
736 source.append(
" __global const unsigned int * column_indices, \n");
737 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
738 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
739 source.append(
" unsigned int size) \n");
740 source.append(
"{ \n");
741 source.append(
" for (unsigned int row = 0; row < size; ++row) \n");
742 source.append(
" { \n");
743 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = vector[row]; \n");
745 source.append(
" unsigned int row_start = row_indices[row]; \n");
746 source.append(
" unsigned int row_stop = row_indices[row + 1]; \n");
747 source.append(
" for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
748 source.append(
" { \n");
749 source.append(
" unsigned int col_index = column_indices[entry_index]; \n");
750 source.append(
" if (col_index > row) \n");
751 source.append(
" vector[col_index] -= result_entry * elements[entry_index]; \n");
752 source.append(
" } \n");
754 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
755 source.append(
" } \n");
756 source.append(
"} \n");
760 template<
typename StringT>
765 source.append(
"__kernel void unit_lu_backward( \n");
766 source.append(
" __global const unsigned int * row_indices, \n");
767 source.append(
" __global const unsigned int * column_indices, \n");
768 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
769 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
770 source.append(
" unsigned int size) \n");
771 source.append(
"{ \n");
772 source.append(
" __local unsigned int col_index_buffer[128]; \n");
773 source.append(
" __local "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
774 source.append(
" __local "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
776 source.append(
" unsigned int nnz = row_indices[size]; \n");
777 source.append(
" unsigned int current_row = size-1; \n");
778 source.append(
" unsigned int row_at_window_start = size-1; \n");
779 source.append(
" "); source.append(numeric_string); source.append(
" current_vector_entry = vector[size-1]; \n");
780 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
781 source.append(
" unsigned int next_row = row_indices[size-1]; \n");
783 source.append(
" unsigned int i = loop_end + get_local_id(0); \n");
784 source.append(
" while (1) \n");
785 source.append(
" { \n");
787 source.append(
" if (i < nnz) \n");
788 source.append(
" { \n");
789 source.append(
" element_buffer[get_local_id(0)] = elements[i]; \n");
790 source.append(
" unsigned int tmp = column_indices[i]; \n");
791 source.append(
" col_index_buffer[get_local_id(0)] = tmp; \n");
792 source.append(
" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
793 source.append(
" } \n");
795 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
798 source.append(
" if (get_local_id(0) == 0) \n");
799 source.append(
" { \n");
801 source.append(
" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
802 source.append(
" { \n");
803 source.append(
" unsigned int k = (get_local_size(0) - k2) - 1; \n");
805 source.append(
" if (i+k >= nnz) \n");
806 source.append(
" continue; \n");
808 source.append(
" if (col_index_buffer[k] > row_at_window_start) \n");
809 source.append(
" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
810 source.append(
" else if (col_index_buffer[k] > current_row) \n");
811 source.append(
" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
813 source.append(
" if (i+k == next_row) \n");
814 source.append(
" { \n");
815 source.append(
" vector[current_row] = current_vector_entry; \n");
816 source.append(
" if (current_row > 0) \n");
817 source.append(
" { \n");
818 source.append(
" --current_row; \n");
819 source.append(
" next_row = row_indices[current_row]; \n");
820 source.append(
" current_vector_entry = vector[current_row]; \n");
821 source.append(
" } \n");
822 source.append(
" } \n");
825 source.append(
" } \n");
827 source.append(
" row_at_window_start = current_row; \n");
828 source.append(
" } \n");
830 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
832 source.append(
" if (i < get_local_size(0)) \n");
833 source.append(
" break; \n");
835 source.append(
" i -= get_local_size(0); \n");
836 source.append(
" } \n");
837 source.append(
"} \n");
841 template<
typename StringT>
846 source.append(
"__kernel void unit_lu_forward( \n");
847 source.append(
" __global const unsigned int * row_indices, \n");
848 source.append(
" __global const unsigned int * column_indices, \n");
849 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
850 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
851 source.append(
" unsigned int size) \n");
852 source.append(
"{ \n");
853 source.append(
" __local unsigned int col_index_buffer[128]; \n");
854 source.append(
" __local "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
855 source.append(
" __local "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
857 source.append(
" unsigned int nnz = row_indices[size]; \n");
858 source.append(
" unsigned int current_row = 0; \n");
859 source.append(
" unsigned int row_at_window_start = 0; \n");
860 source.append(
" "); source.append(numeric_string); source.append(
" current_vector_entry = vector[0]; \n");
861 source.append(
" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
862 source.append(
" unsigned int next_row = row_indices[1]; \n");
864 source.append(
" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
865 source.append(
" { \n");
867 source.append(
" if (i < nnz) \n");
868 source.append(
" { \n");
869 source.append(
" element_buffer[get_local_id(0)] = elements[i]; \n");
870 source.append(
" unsigned int tmp = column_indices[i]; \n");
871 source.append(
" col_index_buffer[get_local_id(0)] = tmp; \n");
872 source.append(
" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
873 source.append(
" } \n");
875 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
878 source.append(
" if (get_local_id(0) == 0) \n");
879 source.append(
" { \n");
881 source.append(
" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
882 source.append(
" { \n");
883 source.append(
" if (i+k == next_row) \n");
884 source.append(
" { \n");
885 source.append(
" vector[current_row] = current_vector_entry; \n");
886 source.append(
" ++current_row; \n");
887 source.append(
" if (current_row < size) //load next row's data \n");
888 source.append(
" { \n");
889 source.append(
" next_row = row_indices[current_row+1]; \n");
890 source.append(
" current_vector_entry = vector[current_row]; \n");
891 source.append(
" } \n");
892 source.append(
" } \n");
894 source.append(
" if (current_row < size && col_index_buffer[k] < current_row) \n");
895 source.append(
" { \n");
896 source.append(
" if (col_index_buffer[k] < row_at_window_start) \n");
897 source.append(
" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
898 source.append(
" else if (col_index_buffer[k] < current_row) \n");
899 source.append(
" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
900 source.append(
" } \n");
902 source.append(
" } \n");
904 source.append(
" row_at_window_start = current_row; \n");
905 source.append(
" } \n");
907 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
908 source.append(
" } //for i \n");
909 source.append(
"} \n");
913 template<
typename StringT>
916 std::stringstream ss;
919 source.append(
"__kernel void vec_mul_nvidia( \n");
920 source.append(
" __global const unsigned int * row_indices, \n");
921 source.append(
" __global const unsigned int * column_indices, \n");
922 source.append(
" __global const unsigned int * row_blocks, \n");
923 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
924 source.append(
" unsigned int num_blocks, \n");
925 source.append(
" __global const "); source.append(numeric_string); source.append(
" * x, \n");
926 source.append(
" uint4 layout_x, \n");
927 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
928 source.append(
" uint4 layout_result) \n");
929 source.append(
"{ \n");
930 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_elements[256]; \n");
932 source.append(
" const unsigned int id_in_row = get_local_id(0) % " + ss.str() +
"; \n");
933 source.append(
" const unsigned int block_increment = get_local_size(0) * ((layout_result.z - 1) / (get_global_size(0)) + 1); \n");
934 source.append(
" const unsigned int block_start = get_group_id(0) * block_increment; \n");
935 source.append(
" const unsigned int block_stop = min(block_start + block_increment, layout_result.z); \n");
937 source.append(
" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() +
"; \n");
938 source.append(
" row < block_stop; \n");
939 source.append(
" row += get_local_size(0) / " + ss.str() +
") \n");
940 source.append(
" { \n");
941 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
942 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
943 source.append(
" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() +
") \n");
944 source.append(
" dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
946 source.append(
" shared_elements[get_local_id(0)] = dot_prod; \n");
947 source.append(
" #pragma unroll \n");
948 source.append(
" for (unsigned int k = 1; k < " + ss.str() +
"; k *= 2) \n");
949 source.append(
" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
951 source.append(
" if (id_in_row == 0) \n");
952 source.append(
" result[row * layout_result.y + layout_result.x] = shared_elements[get_local_id(0)]; \n");
953 source.append(
" } \n");
954 source.append(
"} \n");
958 template<
typename StringT>
961 source.append(
"__kernel void vec_mul( \n");
962 source.append(
" __global const unsigned int * row_indices, \n");
963 source.append(
" __global const unsigned int * column_indices, \n");
964 source.append(
" __global const unsigned int * row_blocks, \n");
965 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
966 source.append(
" unsigned int num_blocks, \n");
967 source.append(
" __global const "); source.append(numeric_string); source.append(
" * x, \n");
968 source.append(
" uint4 layout_x, \n");
969 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
970 source.append(
" uint4 layout_result) \n");
971 source.append(
"{ \n");
972 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_elements[1024]; \n");
974 source.append(
" unsigned int row_start = row_blocks[get_group_id(0)]; \n");
975 source.append(
" unsigned int row_stop = row_blocks[get_group_id(0) + 1]; \n");
976 source.append(
" unsigned int rows_to_process = row_stop - row_start; \n");
977 source.append(
" unsigned int element_start = row_indices[row_start]; \n");
978 source.append(
" unsigned int element_stop = row_indices[row_stop]; \n");
980 source.append(
" if (rows_to_process > 4) { \n");
982 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
983 source.append(
" shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
985 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
988 source.append(
" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
989 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
990 source.append(
" unsigned int thread_row_start = row_indices[row] - element_start; \n");
991 source.append(
" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
992 source.append(
" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
993 source.append(
" dot_prod += shared_elements[i]; \n");
994 source.append(
" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
995 source.append(
" } \n");
996 source.append(
" } \n");
999 source.append(
" else if (rows_to_process > 1) \n");
1000 source.append(
" {\n");
1002 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0))\n");
1003 source.append(
" shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x];\n");
1005 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1008 source.append(
" for (unsigned int row = row_start; row < row_stop; ++row)\n");
1009 source.append(
" {\n");
1010 source.append(
" unsigned int current_row_start = row_indices[row] - element_start;\n");
1011 source.append(
" unsigned int current_row_stop = row_indices[row + 1] - element_start;\n");
1012 source.append(
" unsigned int thread_base_id = current_row_start + get_local_id(0);\n");
1015 source.append(
" for (unsigned int j = thread_base_id + get_local_size(0); j < current_row_stop; j += get_local_size(0))\n");
1016 source.append(
" shared_elements[thread_base_id] += shared_elements[j];\n");
1019 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n");
1020 source.append(
" {\n");
1021 source.append(
" barrier(CLK_LOCAL_MEM_FENCE);\n");
1022 source.append(
" if (get_local_id(0) < stride && thread_base_id < current_row_stop)\n");
1023 source.append(
" shared_elements[thread_base_id] += (thread_base_id + stride < current_row_stop) ? shared_elements[thread_base_id+stride] : 0;\n");
1024 source.append(
" }\n");
1025 source.append(
" if (get_local_id(0) == 0)\n");
1026 source.append(
" result[row * layout_result.y + layout_result.x] = shared_elements[current_row_start];\n");
1027 source.append(
" }\n");
1028 source.append(
" }\n");
1031 source.append(
" else \n");
1032 source.append(
" { \n");
1034 source.append(
" shared_elements[get_local_id(0)] = 0; \n");
1035 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
1036 source.append(
" shared_elements[get_local_id(0)] += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
1039 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
1040 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1041 source.append(
" if (get_local_id(0) < stride) \n");
1042 source.append(
" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
1043 source.append(
" } \n");
1045 source.append(
" if (get_local_id(0) == 0) \n");
1046 source.append(
" result[row_start * layout_result.y + layout_result.x] = shared_elements[0]; \n");
1047 source.append(
" } \n");
1049 source.append(
"} \n");
1054 template<
typename StringT>
1057 source.append(
"__kernel void vec_mul4( \n");
1058 source.append(
" __global const unsigned int * row_indices, \n");
1059 source.append(
" __global const uint4 * column_indices, \n");
1060 source.append(
" __global const "); source.append(numeric_string); source.append(
"4 * elements, \n");
1061 source.append(
" __global const "); source.append(numeric_string); source.append(
" * x, \n");
1062 source.append(
" uint4 layout_x, \n");
1063 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
1064 source.append(
" uint4 layout_result) \n");
1065 source.append(
"{ \n");
1066 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod; \n");
1067 source.append(
" unsigned int start, next_stop; \n");
1068 source.append(
" uint4 col_idx; \n");
1069 source.append(
" "); source.append(numeric_string); source.append(
"4 tmp_vec; \n");
1070 source.append(
" "); source.append(numeric_string); source.append(
"4 tmp_entries; \n");
1072 source.append(
" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
1073 source.append(
" { \n");
1074 source.append(
" dot_prod = 0; \n");
1075 source.append(
" start = row_indices[row] / 4; \n");
1076 source.append(
" next_stop = row_indices[row+1] / 4; \n");
1078 source.append(
" for (unsigned int i = start; i < next_stop; ++i) \n");
1079 source.append(
" { \n");
1080 source.append(
" col_idx = column_indices[i]; \n");
1082 source.append(
" tmp_entries = elements[i]; \n");
1083 source.append(
" tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
1084 source.append(
" tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
1085 source.append(
" tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
1086 source.append(
" tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");
1088 source.append(
" dot_prod += dot(tmp_entries, tmp_vec); \n");
1089 source.append(
" } \n");
1090 source.append(
" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
1091 source.append(
" } \n");
1092 source.append(
"} \n");
1095 template<
typename StringT>
1098 source.append(
"__kernel void vec_mul8( \n");
1099 source.append(
" __global const unsigned int * row_indices, \n");
1100 source.append(
" __global const uint8 * column_indices, \n");
1101 source.append(
" __global const "); source.append(numeric_string); source.append(
"8 * elements, \n");
1102 source.append(
" __global const "); source.append(numeric_string); source.append(
" * x, \n");
1103 source.append(
" uint4 layout_x, \n");
1104 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
1105 source.append(
" uint4 layout_result) \n");
1106 source.append(
"{ \n");
1107 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod; \n");
1108 source.append(
" unsigned int start, next_stop; \n");
1109 source.append(
" uint8 col_idx; \n");
1110 source.append(
" "); source.append(numeric_string); source.append(
"8 tmp_vec; \n");
1111 source.append(
" "); source.append(numeric_string); source.append(
"8 tmp_entries; \n");
1113 source.append(
" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
1114 source.append(
" { \n");
1115 source.append(
" dot_prod = 0; \n");
1116 source.append(
" start = row_indices[row] / 8; \n");
1117 source.append(
" next_stop = row_indices[row+1] / 8; \n");
1119 source.append(
" for (unsigned int i = start; i < next_stop; ++i) \n");
1120 source.append(
" { \n");
1121 source.append(
" col_idx = column_indices[i]; \n");
1123 source.append(
" tmp_entries = elements[i]; \n");
1124 source.append(
" tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
1125 source.append(
" tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
1126 source.append(
" tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
1127 source.append(
" tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
1128 source.append(
" tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
1129 source.append(
" tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
1130 source.append(
" tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
1131 source.append(
" tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");
1133 source.append(
" dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
1134 source.append(
" dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
1135 source.append(
" } \n");
1136 source.append(
" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
1137 source.append(
" } \n");
1138 source.append(
"} \n");
1141 template<
typename StringT>
1144 source.append(
"__kernel void vec_mul_cpu( \n");
1145 source.append(
" __global const unsigned int * row_indices, \n");
1146 source.append(
" __global const unsigned int * column_indices, \n");
1147 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1148 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vector, \n");
1149 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
1150 source.append(
" unsigned int size) \n");
1151 source.append(
"{ \n");
1152 source.append(
" unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1); \n");
1153 source.append(
" unsigned int row_start = get_global_id(0) * work_per_item; \n");
1154 source.append(
" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
1155 source.append(
" for (unsigned int row = row_start; row < row_stop; ++row) \n");
1156 source.append(
" { \n");
1157 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = ("); source.append(numeric_string); source.append(
")0; \n");
1158 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
1159 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
1160 source.append(
" dot_prod += elements[i] * vector[column_indices[i]]; \n");
1161 source.append(
" result[row] = dot_prod; \n");
1162 source.append(
" } \n");
1163 source.append(
"} \n");
1173 template<
typename StringT>
1176 source.append(
"__kernel void spgemm_stage1( \n");
1177 source.append(
" __global const unsigned int * A_row_indices, \n");
1178 source.append(
" __global const unsigned int * A_column_indices, \n");
1179 source.append(
" unsigned int A_size1, \n");
1180 source.append(
" __global unsigned int * group_nnz_array) \n");
1181 source.append(
"{ \n");
1182 source.append(
" unsigned int work_per_item = max((uint) ((A_size1 - 1) / get_global_size(0) + 1), (uint) 1); \n");
1183 source.append(
" unsigned int row_start = get_global_id(0) * work_per_item; \n");
1184 source.append(
" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) A_size1); \n");
1185 source.append(
" unsigned int max_A_nnz = 0; \n");
1186 source.append(
" for (unsigned int row = row_start; row < row_stop; ++row) \n");
1187 source.append(
" max_A_nnz = max(max_A_nnz, A_row_indices[row + 1] - A_row_indices[row]); \n");
1190 source.append(
" __local unsigned int shared_nnz[256]; \n");
1191 source.append(
" shared_nnz[get_local_id(0)] = max_A_nnz; \n");
1194 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
1195 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1196 source.append(
" if (get_local_id(0) < stride) \n");
1197 source.append(
" shared_nnz[get_local_id(0)] = max(shared_nnz[get_local_id(0)], shared_nnz[get_local_id(0) + stride]); \n");
1198 source.append(
" } \n");
1200 source.append(
" if (get_local_id(0) == 0) \n");
1201 source.append(
" group_nnz_array[get_group_id(0)] = shared_nnz[0]; \n");
1202 source.append(
"} \n");
1210 template<
typename StringT>
1213 source.append(
"__kernel void spgemm_decompose_1( \n");
1214 source.append(
" __global const unsigned int * A_row_indices, \n");
1215 source.append(
" unsigned int A_size1, \n");
1216 source.append(
" unsigned int max_per_row, \n");
1217 source.append(
" __global unsigned int * chunks_per_row) \n");
1218 source.append(
"{ \n");
1219 source.append(
" for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) {\n");
1220 source.append(
" unsigned int num_entries = A_row_indices[row+1] - A_row_indices[row]; \n");
1221 source.append(
" chunks_per_row[row] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1); \n");
1222 source.append(
" } \n");
1223 source.append(
"} \n");
1231 template<
typename StringT>
1234 source.append(
"__kernel void spgemm_A2( \n");
1235 source.append(
" __global unsigned int *A2_row_indices, \n");
1236 source.append(
" __global unsigned int *A2_col_indices, \n");
1237 source.append(
" __global "); source.append(numeric_string); source.append(
" *A2_elements, \n");
1238 source.append(
" unsigned int A2_size1, \n");
1239 source.append(
" __global const unsigned int *new_row_buffer) \n");
1240 source.append(
"{ \n");
1241 source.append(
" for (unsigned int i = get_global_id(0); i < A2_size1; i += get_global_size(0)) {\n");
1242 source.append(
" unsigned int index_start = new_row_buffer[i]; \n");
1243 source.append(
" unsigned int index_stop = new_row_buffer[i+1]; \n");
1245 source.append(
" A2_row_indices[i] = index_start; \n");
1247 source.append(
" for (unsigned int j = index_start; j < index_stop; ++j) { \n");
1248 source.append(
" A2_col_indices[j] = j; \n");
1249 source.append(
" A2_elements[j] = 1; \n");
1250 source.append(
" } \n");
1251 source.append(
" } \n");
1253 source.append(
" if (get_global_id(0) == 0) \n");
1254 source.append(
" A2_row_indices[A2_size1] = new_row_buffer[A2_size1]; \n");
1255 source.append(
"} \n");
1262 template<
typename StringT>
1265 source.append(
"__kernel void spgemm_G1( \n");
1266 source.append(
" __global unsigned int *G1_row_indices, \n");
1267 source.append(
" __global unsigned int *G1_col_indices, \n");
1268 source.append(
" __global "); source.append(numeric_string); source.append(
" *G1_elements, \n");
1269 source.append(
" unsigned int G1_size1, \n");
1270 source.append(
" __global const unsigned int *A_row_indices, \n");
1271 source.append(
" __global const unsigned int *A_col_indices, \n");
1272 source.append(
" __global const "); source.append(numeric_string); source.append(
" *A_elements, \n");
1273 source.append(
" unsigned int A_size1, \n");
1274 source.append(
" unsigned int A_nnz, \n");
1275 source.append(
" unsigned int max_per_row, \n");
1276 source.append(
" __global const unsigned int *new_row_buffer) \n");
1277 source.append(
"{ \n");
1280 source.append(
" for (unsigned int i = get_global_id(0); i < A_nnz; i += get_global_size(0)) {\n");
1281 source.append(
" G1_col_indices[i] = A_col_indices[i]; \n");
1282 source.append(
" G1_elements[i] = A_elements[i]; \n");
1283 source.append(
" } \n");
1286 source.append(
" for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) {\n");
1287 source.append(
" unsigned int old_start = A_row_indices[i]; \n");
1288 source.append(
" unsigned int new_start = new_row_buffer[i]; \n");
1289 source.append(
" unsigned int row_chunks = new_row_buffer[i+1] - new_start; \n");
1291 source.append(
" for (unsigned int j=0; j<row_chunks; ++j) \n");
1292 source.append(
" G1_row_indices[new_start + j] = old_start + j * max_per_row; \n");
1293 source.append(
" } \n");
1296 source.append(
" if (get_global_id(0) == 0) \n");
1297 source.append(
" G1_row_indices[G1_size1] = A_row_indices[A_size1]; \n");
1298 source.append(
"} \n");
1308 template<
typename StringT>
1311 source.append(
"__attribute__((reqd_work_group_size(32, 1, 1))) \n");
1312 source.append(
"__kernel void spgemm_stage2( \n");
1313 source.append(
" __global const unsigned int * A_row_indices, \n");
1314 source.append(
" __global const unsigned int * A_col_indices, \n");
1315 source.append(
" unsigned int A_size1, \n");
1316 source.append(
" __global const unsigned int * B_row_indices, \n");
1317 source.append(
" __global const unsigned int * B_col_indices, \n");
1318 source.append(
" unsigned int B_size2, \n");
1319 source.append(
" __global unsigned int * C_row_indices) \n");
1320 source.append(
"{ \n");
1321 source.append(
" unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
1322 source.append(
" unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
1323 source.append(
" unsigned int row_C_stop = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
1324 source.append(
" __local unsigned int shared_front[32]; \n");
1326 source.append(
" for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
1327 source.append(
" { \n");
1328 source.append(
" unsigned int row_A_start = A_row_indices[row_C]; \n");
1329 source.append(
" unsigned int row_A_end = A_row_indices[row_C+1]; \n");
1331 source.append(
" unsigned int my_row_B = row_A_start + get_local_id(0); \n");
1332 source.append(
" unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
1333 source.append(
" unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
1334 source.append(
" unsigned int row_B_end = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
1336 source.append(
" unsigned int num_nnz = 0; \n");
1337 source.append(
" if (row_A_end - row_A_start > 1) { \n");
1339 source.append(
" unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1340 source.append(
" while (1) { \n");
1343 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1344 source.append(
" shared_front[get_local_id(0)] = current_front_index; \n");
1345 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1346 source.append(
" if (get_local_id(0) < 16) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 16]); \n");
1347 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1348 source.append(
" if (get_local_id(0) < 8) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 8]); \n");
1349 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1350 source.append(
" if (get_local_id(0) < 4) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 4]); \n");
1351 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1352 source.append(
" if (get_local_id(0) < 2) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 2]); \n");
1353 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1354 source.append(
" if (get_local_id(0) < 1) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 1]); \n");
1355 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1357 source.append(
" if (shared_front[0] == B_size2) break; \n");
1360 source.append(
" if (current_front_index == shared_front[0]) { \n");
1361 source.append(
" ++row_B_start; \n");
1362 source.append(
" current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1363 source.append(
" } \n");
1365 source.append(
" ++num_nnz; \n");
1366 source.append(
" } \n");
1367 source.append(
" } else { num_nnz = row_B_end - row_B_start; }\n");
1370 source.append(
" if (get_local_id(0) == 0) \n");
1371 source.append(
" C_row_indices[row_C] = num_nnz; \n");
1373 source.append(
" } \n");
1375 source.append(
"} \n");
1384 template<
typename StringT>
1387 source.append(
"__attribute__((reqd_work_group_size(32, 1, 1))) \n");
1388 source.append(
"__kernel void spgemm_stage3( \n");
1389 source.append(
" __global const unsigned int * A_row_indices, \n");
1390 source.append(
" __global const unsigned int * A_col_indices, \n");
1391 source.append(
" __global const "); source.append(numeric_string); source.append(
" * A_elements, \n");
1392 source.append(
" unsigned int A_size1, \n");
1393 source.append(
" __global const unsigned int * B_row_indices, \n");
1394 source.append(
" __global const unsigned int * B_col_indices, \n");
1395 source.append(
" __global const "); source.append(numeric_string); source.append(
" * B_elements, \n");
1396 source.append(
" unsigned int B_size2, \n");
1397 source.append(
" __global unsigned int * C_row_indices, \n");
1398 source.append(
" __global unsigned int * C_col_indices, \n");
1399 source.append(
" __global "); source.append(numeric_string); source.append(
" * C_elements) \n");
1400 source.append(
"{ \n");
1401 source.append(
" unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
1402 source.append(
" unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
1403 source.append(
" unsigned int row_C_stop = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
1404 source.append(
" __local unsigned int shared_front[32]; \n");
1405 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_front_values[32]; \n");
1406 source.append(
" unsigned int local_id = get_local_id(0); \n");
1408 source.append(
" for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
1409 source.append(
" { \n");
1410 source.append(
" unsigned int row_A_start = A_row_indices[row_C]; \n");
1411 source.append(
" unsigned int row_A_end = A_row_indices[row_C+1]; \n");
1413 source.append(
" unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? local_id : 0); \n");
1414 source.append(
" unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
1415 source.append(
" unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
1416 source.append(
" unsigned int row_B_end = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
1418 source.append(
" "); source.append(numeric_string); source.append(
" val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0; \n");
1419 source.append(
" unsigned int index_in_C = C_row_indices[row_C] + local_id; \n");
1421 source.append(
" if (row_A_end - row_A_start > 1) { \n");
1423 source.append(
" unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1424 source.append(
" "); source.append(numeric_string); source.append(
" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start] : 0; \n");
1426 source.append(
" unsigned int index_buffer = 0; \n");
1427 source.append(
" "); source.append(numeric_string); source.append(
" value_buffer = 0; \n");
1428 source.append(
" unsigned int buffer_size = 0; \n");
1430 source.append(
" while (1) { \n");
1433 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1434 source.append(
" shared_front[local_id] = current_front_index; \n");
1435 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1436 source.append(
" if (local_id < 16) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 16]); \n");
1437 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1438 source.append(
" if (local_id < 8) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 8]); \n");
1439 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1440 source.append(
" if (local_id < 4) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 4]); \n");
1441 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1442 source.append(
" if (local_id < 2) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 2]); \n");
1443 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1444 source.append(
" if (local_id < 1) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 1]); \n");
1445 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1447 source.append(
" if (shared_front[0] == B_size2) break; \n");
1450 source.append(
" shared_front_values[local_id] = (current_front_index == shared_front[0]) ? val_A * current_front_value : 0; \n");
1451 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1452 source.append(
" if (local_id < 16) shared_front_values[local_id] += shared_front_values[local_id + 16]; \n");
1453 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1454 source.append(
" if (local_id < 8) shared_front_values[local_id] += shared_front_values[local_id + 8]; \n");
1455 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1456 source.append(
" if (local_id < 4) shared_front_values[local_id] += shared_front_values[local_id + 4]; \n");
1457 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1458 source.append(
" if (local_id < 2) shared_front_values[local_id] += shared_front_values[local_id + 2]; \n");
1459 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1460 source.append(
" if (local_id < 1) shared_front_values[local_id] += shared_front_values[local_id + 1]; \n");
1461 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1464 source.append(
" if (current_front_index == shared_front[0]) { \n");
1465 source.append(
" ++row_B_start; \n");
1466 source.append(
" current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1467 source.append(
" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start] : 0; \n");
1468 source.append(
" } \n");
1471 source.append(
" index_buffer = (local_id == buffer_size) ? shared_front[0] : index_buffer; \n");
1472 source.append(
" value_buffer = (local_id == buffer_size) ? shared_front_values[0] : value_buffer; \n");
1473 source.append(
" ++buffer_size; \n");
1476 source.append(
" if (buffer_size == get_local_size(0)) { \n");
1477 source.append(
" C_col_indices[index_in_C] = index_buffer; \n");
1478 source.append(
" C_elements[index_in_C] = value_buffer; \n");
1479 source.append(
" } \n");
1482 source.append(
" index_in_C += (buffer_size == get_local_size(0)) ? get_local_size(0) : 0; \n");
1483 source.append(
" buffer_size = (buffer_size == get_local_size(0)) ? 0 : buffer_size; \n");
1485 source.append(
" } \n");
1488 source.append(
" if (local_id < buffer_size) { \n");
1489 source.append(
" C_col_indices[index_in_C] = index_buffer; \n");
1490 source.append(
" C_elements[index_in_C] = value_buffer; \n");
1491 source.append(
" } \n");
1494 source.append(
" } else { \n");
1495 source.append(
" for (unsigned int i = row_B_start + local_id; i < row_B_end; i += get_local_size(0)) { \n");
1496 source.append(
" C_col_indices[index_in_C] = B_col_indices[i]; \n");
1497 source.append(
" C_elements[index_in_C] = val_A * B_elements[i]; \n");
1498 source.append(
" index_in_C += get_local_size(0); \n");
1499 source.append(
" } \n");
1500 source.append(
" } \n");
1502 source.append(
" } \n");
1504 source.append(
"} \n");
1509 template<
typename StringT>
1520 template<
typename StringT>
1524 source.append(
" __kernel void assign_to_dense( \n");
1525 source.append(
" __global const unsigned int * row_indices, \n");
1526 source.append(
" __global const unsigned int * column_indices, \n");
1527 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1528 source.append(
" __global "); source.append(numeric_string); source.append(
" * B, \n");
1529 source.append(
" unsigned int B_row_start, \n");
1530 source.append(
" unsigned int B_col_start, \n");
1531 source.append(
" unsigned int B_row_inc, \n");
1532 source.append(
" unsigned int B_col_inc, \n");
1533 source.append(
" unsigned int B_row_size, \n");
1534 source.append(
" unsigned int B_col_size, \n");
1535 source.append(
" unsigned int B_internal_rows, \n");
1536 source.append(
" unsigned int B_internal_cols) { \n");
1538 source.append(
" for (unsigned int i = get_global_id(0); i < B_row_size; i += get_global_size(0)) \n");
1539 source.append(
" { \n");
1540 source.append(
" unsigned int row_end = row_indices[i+1]; \n");
1541 source.append(
" for (unsigned int j = row_indices[i]; j<row_end; j++) \n");
1542 source.append(
" { \n");
1543 source.append(
" B[(B_row_start + i * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j]; \n");
1544 source.append(
" } \n");
1545 source.append(
" } \n");
1546 source.append(
" } \n");
1555 template<
typename NumericT>
1565 static std::map<cl_context, bool> init_done;
1572 source.reserve(1024);
1574 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1576 if (numeric_string ==
"float" || numeric_string ==
"double")
1603 #ifdef VIENNACL_BUILD_INFO
1604 std::cout <<
"Creating program " << prog_name << std::endl;
1606 ctx.add_program(source, prog_name);
1607 init_done[ctx.handle().get()] =
true;
void generate_compressed_matrix_row_info_extractor(StringT &source, std::string const &numeric_string)
std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices...
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
void generate_compressed_matrix_trans_lu_forward(StringT &source, std::string const &numeric_string)
Provides OpenCL-related utilities.
void generate_compressed_matrix_block_trans_lu_backward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_dense_matrix_mult(StringT &source, std::string const &numeric_string, bool B_transposed, bool B_row_major, bool C_row_major)
Generate kernel for C = A * B with A being a compressed_matrix, B and C dense.
void generate_compressed_matrix_vec_mul_cpu(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_jacobi(StringT &source, std::string const &numeric_string)
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
void generate_compressed_matrix_vec_mul(StringT &source, std::string const &numeric_string)
Common implementations shared by OpenCL-based operations.
void generate_compressed_matrix_assign_to_dense(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_block_trans_unit_lu_forward(StringT &source, std::string const &numeric_string)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
void generate_compressed_matrix_compressed_matrix_prod_G1(StringT &source, std::string const &numeric_string)
OpenCL kernel for filling G_1 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonz...
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const
void generate_compressed_matrix_compressed_matrix_prod_3(StringT &source, std::string const &numeric_string)
OpenCL kernel for the third stage of sparse matrix-matrix multiplication.
void generate_compressed_matrix_trans_unit_lu_forward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_vec_mul4(StringT &source, std::string const &numeric_string)
Main kernel class for generating OpenCL kernels for compressed_matrix.
void generate_compressed_matrix_lu_forward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_compressed_matrix_prod(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_vec_mul_nvidia(StringT &source, std::string const &numeric_string, unsigned int subwarp_size)
void generate_compressed_matrix_vec_mul8(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_dense_matrix_multiplication(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_compressed_matrix_prod_2(StringT &source)
OpenCL kernel for the second stage of sparse matrix-matrix multiplication.
void generate_compressed_matrix_trans_unit_lu_backward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_compressed_matrix_prod_1(StringT &source)
OpenCL kernel for the first stage of sparse matrix-matrix multiplication.
void generate_compressed_matrix_lu_backward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_compressed_matrix_prod_A2(StringT &source, std::string const &numeric_string)
OpenCL kernel for filling A_2 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonz...
static std::string program_name()
Representation of an OpenCL kernel in ViennaCL.
static void init(viennacl::ocl::context &ctx)
void generate_compressed_matrix_trans_unit_lu_forward_slow(StringT &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.
void generate_compressed_matrix_trans_lu_backward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_compressed_matrix_prod_decompose_1(StringT &source)
OpenCL kernel for decomposing A in C = A * B, such that A = A_2 * G_1 with G_1 containing at most 32 ...
void generate_compressed_matrix_unit_lu_forward(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_unit_lu_backward(StringT &source, std::string const &numeric_string)