1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
48 template<
typename StringT>
51 source.append(
"__kernel void cg_vector_update( \n");
52 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
53 source.append(
" "); source.append(numeric_string); source.append(
" alpha, \n");
54 source.append(
" __global "); source.append(numeric_string); source.append(
" * p, \n");
55 source.append(
" __global "); source.append(numeric_string); source.append(
" * r, \n");
56 source.append(
" __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
57 source.append(
" "); source.append(numeric_string); source.append(
" beta, \n");
58 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
59 source.append(
" unsigned int size, \n");
60 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
61 source.append(
"{ \n");
62 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
63 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
64 source.append(
" "); source.append(numeric_string); source.append(
" value_p = p[i]; \n");
65 source.append(
" "); source.append(numeric_string); source.append(
" value_r = r[i]; \n");
67 source.append(
" result[i] += alpha * value_p; \n");
68 source.append(
" value_r -= alpha * Ap[i]; \n");
69 source.append(
" value_p = value_r + beta * value_p; \n");
71 source.append(
" p[i] = value_p; \n");
72 source.append(
" r[i] = value_r; \n");
73 source.append(
" inner_prod_contrib += value_r * value_r; \n");
74 source.append(
" } \n");
77 source.append(
" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
78 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
79 source.append(
" { \n");
80 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
81 source.append(
" if (get_local_id(0) < stride) \n");
82 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
86 source.append(
" if (get_local_id(0) == 0) \n ");
87 source.append(
" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
89 source.append(
"} \n");
92 template<
typename StringT>
98 source.append(
"__kernel void cg_csr_blocked_prod( \n");
99 source.append(
" __global const unsigned int * row_indices, \n");
100 source.append(
" __global const unsigned int * column_indices, \n");
101 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
102 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
103 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
104 source.append(
" unsigned int size, \n");
105 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
106 source.append(
" unsigned int buffer_size, \n");
107 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
108 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
109 source.append(
"{ \n");
110 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_elements[256]; \n");
111 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
112 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
114 source.append(
" const unsigned int id_in_row = get_local_id(0) % " + ss.str() +
"; \n");
115 source.append(
" const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
116 source.append(
" const unsigned int block_start = get_group_id(0) * block_increment; \n");
117 source.append(
" const unsigned int block_stop = min(block_start + block_increment, size); \n");
119 source.append(
" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() +
"; \n");
120 source.append(
" row < block_stop; \n");
121 source.append(
" row += get_local_size(0) / " + ss.str() +
") \n");
122 source.append(
" { \n");
123 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
124 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
125 source.append(
" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() +
") \n");
126 source.append(
" dot_prod += elements[i] * p[column_indices[i]]; \n");
128 source.append(
" shared_elements[get_local_id(0)] = dot_prod; \n");
129 source.append(
" #pragma unroll \n");
130 source.append(
" for (unsigned int k = 1; k < " + ss.str() +
"; k *= 2) \n");
131 source.append(
" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
133 source.append(
" if (id_in_row == 0) { \n");
134 source.append(
" Ap[row] = shared_elements[get_local_id(0)]; \n");
135 source.append(
" inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
136 source.append(
" inner_prod_pAp += p[row] * shared_elements[get_local_id(0)]; \n");
137 source.append(
" } \n");
138 source.append(
" } \n");
141 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
142 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
143 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
144 source.append(
" { \n");
145 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
146 source.append(
" if (get_local_id(0) < stride) { \n");
147 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
148 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
149 source.append(
" } ");
150 source.append(
" } ");
153 source.append(
" if (get_local_id(0) == 0) { \n ");
154 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
155 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
156 source.append(
" } \n");
158 source.append(
"} \n");
161 template<
typename StringT>
164 source.append(
"__kernel void cg_csr_prod( \n");
165 source.append(
" __global const unsigned int * row_indices, \n");
166 source.append(
" __global const unsigned int * column_indices, \n");
167 source.append(
" __global const unsigned int * row_blocks, \n");
168 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
169 source.append(
" unsigned int num_blocks, \n");
170 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
171 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
172 source.append(
" unsigned int size, \n");
173 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
174 source.append(
" unsigned int buffer_size, \n");
175 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
176 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
177 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_elements) \n");
178 source.append(
"{ \n");
180 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
181 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
183 source.append(
" for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
184 source.append(
" unsigned int row_start = row_blocks[block_id]; \n");
185 source.append(
" unsigned int row_stop = row_blocks[block_id + 1]; \n");
186 source.append(
" unsigned int rows_to_process = row_stop - row_start; \n");
187 source.append(
" unsigned int element_start = row_indices[row_start]; \n");
188 source.append(
" unsigned int element_stop = row_indices[row_stop]; \n");
190 source.append(
" if (rows_to_process > 1) { \n");
192 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
193 source.append(
" shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");
195 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
198 source.append(
" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
199 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
200 source.append(
" unsigned int thread_row_start = row_indices[row] - element_start; \n");
201 source.append(
" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
202 source.append(
" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
203 source.append(
" dot_prod += shared_elements[i]; \n");
204 source.append(
" Ap[row] = dot_prod; \n");
205 source.append(
" inner_prod_ApAp += dot_prod * dot_prod; \n");
206 source.append(
" inner_prod_pAp += p[row] * dot_prod; \n");
207 source.append(
" } \n");
208 source.append(
" } \n");
210 source.append(
" else \n");
211 source.append(
" { \n");
213 source.append(
" shared_elements[get_local_id(0)] = 0; \n");
214 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
215 source.append(
" shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");
218 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
219 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
220 source.append(
" if (get_local_id(0) < stride) \n");
221 source.append(
" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
222 source.append(
" } \n");
224 source.append(
" if (get_local_id(0) == 0) { \n");
225 source.append(
" Ap[row_start] = shared_elements[0]; \n");
226 source.append(
" inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
227 source.append(
" inner_prod_pAp += p[row_start] * shared_elements[0]; \n");
228 source.append(
" } \n");
229 source.append(
" } \n");
230 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
231 source.append(
" } \n");
234 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
235 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
236 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
237 source.append(
" { \n");
238 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
239 source.append(
" if (get_local_id(0) < stride) { \n");
240 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
241 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
242 source.append(
" } ");
243 source.append(
" } ");
246 source.append(
" if (get_local_id(0) == 0) { \n ");
247 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
248 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
249 source.append(
" } \n");
251 source.append(
"} \n");
256 template<
typename StringT>
259 source.append(
"__kernel void cg_coo_prod( \n");
260 source.append(
" __global const uint2 * coords, \n");
261 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
262 source.append(
" __global const uint * group_boundaries, \n");
263 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
264 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
265 source.append(
" unsigned int size, \n");
266 source.append(
" __local unsigned int * shared_rows, \n");
267 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
268 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
269 source.append(
" unsigned int buffer_size, \n");
270 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
271 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
272 source.append(
"{ \n");
273 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
274 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
277 source.append(
" uint2 tmp; \n");
278 source.append(
" "); source.append(numeric_string); source.append(
" val; \n");
279 source.append(
" uint group_start = group_boundaries[get_group_id(0)]; \n");
280 source.append(
" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
281 source.append(
" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");
283 source.append(
" uint local_index = 0; \n");
285 source.append(
" for (uint k = 0; k < k_end; ++k) { \n");
286 source.append(
" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
288 source.append(
" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
289 source.append(
" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
292 source.append(
" if (get_local_id(0) == 0 && k > 0) { \n");
293 source.append(
" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
294 source.append(
" val += inter_results[get_local_size(0)-1]; \n");
295 source.append(
" else {\n");
296 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_size(0)-1]; \n");
297 source.append(
" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
298 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
299 source.append(
" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
300 source.append(
" } \n");
301 source.append(
" } \n");
304 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
305 source.append(
" shared_rows[get_local_id(0)] = tmp.x; \n");
306 source.append(
" inter_results[get_local_id(0)] = val; \n");
307 source.append(
" "); source.append(numeric_string); source.append(
" left = 0; \n");
308 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
310 source.append(
" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
311 source.append(
" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
312 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
313 source.append(
" inter_results[get_local_id(0)] += left; \n");
314 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
315 source.append(
" } \n");
318 source.append(
" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
319 source.append(
" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
320 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
321 source.append(
" Ap[tmp.x] = Ap_entry; \n");
322 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
323 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
324 source.append(
" } \n");
326 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
327 source.append(
" } \n");
329 source.append(
" if (local_index + 1 == group_end) {\n");
330 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
331 source.append(
" Ap[tmp.x] = Ap_entry; \n");
332 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
333 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
334 source.append(
" } \n");
337 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
338 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
339 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
340 source.append(
" { \n");
341 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
342 source.append(
" if (get_local_id(0) < stride) { \n");
343 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
344 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
345 source.append(
" } ");
346 source.append(
" } ");
349 source.append(
" if (get_local_id(0) == 0) { \n ");
350 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
351 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
352 source.append(
" } \n");
354 source.append(
"} \n \n");
359 template<
typename StringT>
362 source.append(
"__kernel void cg_ell_prod( \n");
363 source.append(
" __global const unsigned int * coords, \n");
364 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
365 source.append(
" unsigned int internal_row_num, \n");
366 source.append(
" unsigned int items_per_row, \n");
367 source.append(
" unsigned int aligned_items_per_row, \n");
368 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
369 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
370 source.append(
" unsigned int size, \n");
371 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
372 source.append(
" unsigned int buffer_size, \n");
373 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
374 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
375 source.append(
"{ \n");
376 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
377 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
378 source.append(
" uint glb_id = get_global_id(0); \n");
379 source.append(
" uint glb_sz = get_global_size(0); \n");
381 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
382 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
384 source.append(
" uint offset = row; \n");
385 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
386 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[offset]; \n");
387 source.append(
" sum += val ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(
")0; \n");
388 source.append(
" } \n");
390 source.append(
" Ap[row] = sum; \n");
391 source.append(
" inner_prod_ApAp += sum * sum; \n");
392 source.append(
" inner_prod_pAp += p[row] * sum; \n");
393 source.append(
" } \n");
396 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
397 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
398 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
399 source.append(
" { \n");
400 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
401 source.append(
" if (get_local_id(0) < stride) { \n");
402 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
403 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
404 source.append(
" } ");
405 source.append(
" } ");
408 source.append(
" if (get_local_id(0) == 0) { \n ");
409 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
410 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
411 source.append(
" } \n");
412 source.append(
"} \n \n");
415 template<
typename StringT>
418 source.append(
"__kernel void cg_sliced_ell_prod( \n");
419 source.append(
" __global const unsigned int * columns_per_block, \n");
420 source.append(
" __global const unsigned int * column_indices, \n");
421 source.append(
" __global const unsigned int * block_start, \n");
422 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
423 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
424 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
425 source.append(
" unsigned int size, \n");
426 source.append(
" unsigned int block_size, \n");
427 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
428 source.append(
" unsigned int buffer_size, \n");
429 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
430 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
431 source.append(
"{ \n");
432 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
433 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
434 source.append(
" uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
435 source.append(
" uint id_in_block = get_local_id(0) % block_size; \n");
436 source.append(
" uint num_blocks = (size - 1) / block_size + 1; \n");
437 source.append(
" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n");
438 source.append(
" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
440 source.append(
" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
441 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
443 source.append(
" uint row = block_idx * block_size + id_in_block; \n");
444 source.append(
" uint offset = block_start[block_idx]; \n");
445 source.append(
" uint num_columns = columns_per_block[block_idx]; \n");
446 source.append(
" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
447 source.append(
" uint index = offset + item_id * block_size + id_in_block; \n");
448 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[index]; \n");
449 source.append(
" sum += val ? (p[column_indices[index]] * val) : 0; \n");
450 source.append(
" } \n");
452 source.append(
" if (row < size) {\n");
453 source.append(
" Ap[row] = sum; \n");
454 source.append(
" inner_prod_ApAp += sum * sum; \n");
455 source.append(
" inner_prod_pAp += p[row] * sum; \n");
456 source.append(
" } \n");
457 source.append(
" } \n");
460 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
461 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
462 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
463 source.append(
" { \n");
464 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
465 source.append(
" if (get_local_id(0) < stride) { \n");
466 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
467 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
468 source.append(
" } ");
469 source.append(
" } ");
472 source.append(
" if (get_local_id(0) == 0) { \n ");
473 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
474 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
475 source.append(
" } \n");
476 source.append(
"} \n \n");
479 template<
typename StringT>
482 source.append(
"__kernel void cg_hyb_prod( \n");
483 source.append(
" const __global int* ell_coords, \n");
484 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
485 source.append(
" const __global uint* csr_rows, \n");
486 source.append(
" const __global uint* csr_cols, \n");
487 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
488 source.append(
" unsigned int internal_row_num, \n");
489 source.append(
" unsigned int items_per_row, \n");
490 source.append(
" unsigned int aligned_items_per_row, \n");
491 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
492 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
493 source.append(
" unsigned int size, \n");
494 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
495 source.append(
" unsigned int buffer_size, \n");
496 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
497 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
498 source.append(
"{ \n");
499 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
500 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
501 source.append(
" uint glb_id = get_global_id(0); \n");
502 source.append(
" uint glb_sz = get_global_size(0); \n");
504 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
505 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
507 source.append(
" uint offset = row; \n");
508 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
509 source.append(
" "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
510 source.append(
" sum += val ? (p[ell_coords[offset]] * val) : 0; \n");
511 source.append(
" } \n");
513 source.append(
" uint col_begin = csr_rows[row]; \n");
514 source.append(
" uint col_end = csr_rows[row + 1]; \n");
516 source.append(
" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
517 source.append(
" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
518 source.append(
" } \n");
520 source.append(
" Ap[row] = sum; \n");
521 source.append(
" inner_prod_ApAp += sum * sum; \n");
522 source.append(
" inner_prod_pAp += p[row] * sum; \n");
523 source.append(
" } \n");
526 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
527 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
528 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
529 source.append(
" { \n");
530 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
531 source.append(
" if (get_local_id(0) < stride) { \n");
532 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
533 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
534 source.append(
" } ");
535 source.append(
" } ");
538 source.append(
" if (get_local_id(0) == 0) { \n ");
539 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
540 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
541 source.append(
" } \n");
542 source.append(
"} \n \n");
549 template<
typename StringT>
552 source.append(
"__kernel void bicgstab_update_s( \n");
553 source.append(
" __global "); source.append(numeric_string); source.append(
" * s, \n");
554 source.append(
" __global "); source.append(numeric_string); source.append(
" const * r, \n");
555 source.append(
" __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
556 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
557 source.append(
" unsigned int chunk_size, \n");
558 source.append(
" unsigned int chunk_offset, \n");
559 source.append(
" unsigned int size, \n");
560 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array, \n");
561 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_Ap_in_r0) \n");
562 source.append(
"{ \n");
564 source.append(
" "); source.append(numeric_string); source.append(
" alpha = 0; \n");
567 source.append(
" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0)]; \n");
568 source.append(
" shared_array_Ap_in_r0[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + 3 * chunk_size]; \n");
569 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
570 source.append(
" { \n");
571 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
572 source.append(
" if (get_local_id(0) < stride) { \n");
573 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
574 source.append(
" shared_array_Ap_in_r0[get_local_id(0)] += shared_array_Ap_in_r0[get_local_id(0) + stride]; \n");
575 source.append(
" } ");
576 source.append(
" } ");
579 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
580 source.append(
" alpha = shared_array[0] / shared_array_Ap_in_r0[0]; ");
582 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
583 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
584 source.append(
" "); source.append(numeric_string); source.append(
" value_s = s[i]; \n");
585 source.append(
" \n");
586 source.append(
" value_s = r[i] - alpha * Ap[i]; \n");
587 source.append(
" inner_prod_contrib += value_s * value_s; \n");
588 source.append(
" \n");
589 source.append(
" s[i] = value_s; \n");
590 source.append(
" } \n");
591 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
594 source.append(
" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
595 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
596 source.append(
" { \n");
597 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
598 source.append(
" if (get_local_id(0) < stride) \n");
599 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
600 source.append(
" } ");
603 source.append(
" if (get_local_id(0) == 0) \n ");
604 source.append(
" inner_prod_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
606 source.append(
"} \n");
612 template<
typename StringT>
615 source.append(
"__kernel void bicgstab_vector_update( \n");
616 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
617 source.append(
" "); source.append(numeric_string); source.append(
" alpha, \n");
618 source.append(
" __global "); source.append(numeric_string); source.append(
" * p, \n");
619 source.append(
" "); source.append(numeric_string); source.append(
" omega, \n");
620 source.append(
" __global "); source.append(numeric_string); source.append(
" const * s, \n");
621 source.append(
" __global "); source.append(numeric_string); source.append(
" * residual, \n");
622 source.append(
" __global "); source.append(numeric_string); source.append(
" const * As, \n");
623 source.append(
" "); source.append(numeric_string); source.append(
" beta, \n");
624 source.append(
" __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
625 source.append(
" __global "); source.append(numeric_string); source.append(
" const * r0star, \n");
626 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
627 source.append(
" unsigned int size, \n");
628 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
629 source.append(
"{ \n");
630 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r_r0star = 0; \n");
631 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
632 source.append(
" "); source.append(numeric_string); source.append(
" value_result = result[i]; \n");
633 source.append(
" "); source.append(numeric_string); source.append(
" value_p = p[i]; \n");
634 source.append(
" "); source.append(numeric_string); source.append(
" value_s = s[i]; \n");
635 source.append(
" "); source.append(numeric_string); source.append(
" value_residual = residual[i]; \n");
636 source.append(
" "); source.append(numeric_string); source.append(
" value_As = As[i]; \n");
637 source.append(
" "); source.append(numeric_string); source.append(
" value_Ap = Ap[i]; \n");
638 source.append(
" "); source.append(numeric_string); source.append(
" value_r0star = r0star[i]; \n");
639 source.append(
" \n");
640 source.append(
" value_result += alpha * value_p + omega * value_s; \n");
641 source.append(
" value_residual = value_s - omega * value_As; \n");
642 source.append(
" value_p = value_residual + beta * (value_p - omega * value_Ap); \n");
643 source.append(
" \n");
644 source.append(
" result[i] = value_result; \n");
645 source.append(
" residual[i] = value_residual; \n");
646 source.append(
" p[i] = value_p; \n");
647 source.append(
" inner_prod_r_r0star += value_residual * value_r0star; \n");
648 source.append(
" } \n");
651 source.append(
" shared_array[get_local_id(0)] = inner_prod_r_r0star; \n");
652 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
653 source.append(
" { \n");
654 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
655 source.append(
" if (get_local_id(0) < stride) \n");
656 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
657 source.append(
" } ");
660 source.append(
" if (get_local_id(0) == 0) \n ");
661 source.append(
" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
663 source.append(
"} \n");
666 template<
typename StringT>
669 std::stringstream ss;
672 source.append(
"__kernel void bicgstab_csr_blocked_prod( \n");
673 source.append(
" __global const unsigned int * row_indices, \n");
674 source.append(
" __global const unsigned int * column_indices, \n");
675 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
676 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
677 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
678 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
679 source.append(
" unsigned int size, \n");
680 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
681 source.append(
" unsigned int buffer_size, \n");
682 source.append(
" unsigned int buffer_offset, \n");
683 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
684 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
685 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
686 source.append(
"{ \n");
687 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_elements[256]; \n");
688 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
689 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
690 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
692 source.append(
" const unsigned int id_in_row = get_local_id(0) % " + ss.str() +
"; \n");
693 source.append(
" const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
694 source.append(
" const unsigned int block_start = get_group_id(0) * block_increment; \n");
695 source.append(
" const unsigned int block_stop = min(block_start + block_increment, size); \n");
697 source.append(
" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() +
"; \n");
698 source.append(
" row < block_stop; \n");
699 source.append(
" row += get_local_size(0) / " + ss.str() +
") \n");
700 source.append(
" { \n");
701 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
702 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
703 source.append(
" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() +
") \n");
704 source.append(
" dot_prod += elements[i] * p[column_indices[i]]; \n");
706 source.append(
" shared_elements[get_local_id(0)] = dot_prod; \n");
707 source.append(
" #pragma unroll \n");
708 source.append(
" for (unsigned int k = 1; k < " + ss.str() +
"; k *= 2) \n");
709 source.append(
" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
711 source.append(
" if (id_in_row == 0) { \n");
712 source.append(
" Ap[row] = shared_elements[get_local_id(0)]; \n");
713 source.append(
" inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
714 source.append(
" inner_prod_pAp += p[row] * shared_elements[get_local_id(0)]; \n");
715 source.append(
" inner_prod_r0Ap += r0star[row] * shared_elements[get_local_id(0)]; \n");
716 source.append(
" } \n");
717 source.append(
" } \n");
720 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
721 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
722 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
723 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
724 source.append(
" { \n");
725 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
726 source.append(
" if (get_local_id(0) < stride) { \n");
727 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
728 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
729 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
730 source.append(
" } ");
731 source.append(
" } ");
734 source.append(
" if (get_local_id(0) == 0) { \n ");
735 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
736 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
737 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
738 source.append(
" } \n");
740 source.append(
"} \n");
743 template<
typename StringT>
746 source.append(
"__kernel void bicgstab_csr_prod( \n");
747 source.append(
" __global const unsigned int * row_indices, \n");
748 source.append(
" __global const unsigned int * column_indices, \n");
749 source.append(
" __global const unsigned int * row_blocks, \n");
750 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
751 source.append(
" unsigned int num_blocks, \n");
752 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
753 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
754 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
755 source.append(
" unsigned int size, \n");
756 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
757 source.append(
" unsigned int buffer_size, \n");
758 source.append(
" unsigned int buffer_offset, \n");
759 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
760 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
761 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
762 source.append(
"{ \n");
763 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_elements[1024]; \n");
764 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
765 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
766 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
768 source.append(
" for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
769 source.append(
" unsigned int row_start = row_blocks[block_id]; \n");
770 source.append(
" unsigned int row_stop = row_blocks[block_id + 1]; \n");
771 source.append(
" unsigned int rows_to_process = row_stop - row_start; \n");
772 source.append(
" unsigned int element_start = row_indices[row_start]; \n");
773 source.append(
" unsigned int element_stop = row_indices[row_stop]; \n");
775 source.append(
" if (rows_to_process > 1) { \n");
777 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
778 source.append(
" shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");
780 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
783 source.append(
" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
784 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
785 source.append(
" unsigned int thread_row_start = row_indices[row] - element_start; \n");
786 source.append(
" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
787 source.append(
" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
788 source.append(
" dot_prod += shared_elements[i]; \n");
789 source.append(
" Ap[row] = dot_prod; \n");
790 source.append(
" inner_prod_ApAp += dot_prod * dot_prod; \n");
791 source.append(
" inner_prod_pAp += p[row] * dot_prod; \n");
792 source.append(
" inner_prod_r0Ap += r0star[row] * dot_prod; \n");
793 source.append(
" } \n");
794 source.append(
" } \n");
796 source.append(
" else \n");
797 source.append(
" { \n");
799 source.append(
" shared_elements[get_local_id(0)] = 0; \n");
800 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
801 source.append(
" shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");
804 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
805 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
806 source.append(
" if (get_local_id(0) < stride) \n");
807 source.append(
" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
808 source.append(
" } \n");
810 source.append(
" if (get_local_id(0) == 0) { \n");
811 source.append(
" Ap[row_start] = shared_elements[0]; \n");
812 source.append(
" inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
813 source.append(
" inner_prod_pAp += p[row_start] * shared_elements[0]; \n");
814 source.append(
" inner_prod_r0Ap += r0star[row_start] * shared_elements[0]; \n");
815 source.append(
" } \n");
816 source.append(
" } \n");
817 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
818 source.append(
" } \n");
821 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
822 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
823 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
824 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
825 source.append(
" { \n");
826 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
827 source.append(
" if (get_local_id(0) < stride) { \n");
828 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
829 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
830 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
831 source.append(
" } ");
832 source.append(
" } ");
835 source.append(
" if (get_local_id(0) == 0) { \n ");
836 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
837 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
838 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
839 source.append(
" } \n");
841 source.append(
"} \n \n");
845 template<
typename StringT>
848 source.append(
"__kernel void bicgstab_coo_prod( \n");
849 source.append(
" __global const uint2 * coords, \n");
850 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
851 source.append(
" __global const uint * group_boundaries, \n");
852 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
853 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
854 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
855 source.append(
" unsigned int size, \n");
856 source.append(
" __local unsigned int * shared_rows, \n");
857 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
858 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
859 source.append(
" unsigned int buffer_size, \n");
860 source.append(
" unsigned int buffer_offset, \n");
861 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
862 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
863 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
864 source.append(
"{ \n");
865 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
866 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
867 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
870 source.append(
" uint2 tmp; \n");
871 source.append(
" "); source.append(numeric_string); source.append(
" val; \n");
872 source.append(
" uint group_start = group_boundaries[get_group_id(0)]; \n");
873 source.append(
" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
874 source.append(
" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");
876 source.append(
" uint local_index = 0; \n");
878 source.append(
" for (uint k = 0; k < k_end; ++k) { \n");
879 source.append(
" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
881 source.append(
" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
882 source.append(
" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
885 source.append(
" if (get_local_id(0) == 0 && k > 0) { \n");
886 source.append(
" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
887 source.append(
" val += inter_results[get_local_size(0)-1]; \n");
888 source.append(
" else {\n");
889 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_size(0)-1]; \n");
890 source.append(
" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
891 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
892 source.append(
" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
893 source.append(
" inner_prod_r0Ap += r0star[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
894 source.append(
" } \n");
895 source.append(
" } \n");
898 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
899 source.append(
" shared_rows[get_local_id(0)] = tmp.x; \n");
900 source.append(
" inter_results[get_local_id(0)] = val; \n");
901 source.append(
" "); source.append(numeric_string); source.append(
" left = 0; \n");
902 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
904 source.append(
" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
905 source.append(
" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
906 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
907 source.append(
" inter_results[get_local_id(0)] += left; \n");
908 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
909 source.append(
" } \n");
912 source.append(
" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
913 source.append(
" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
914 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
915 source.append(
" Ap[tmp.x] = Ap_entry; \n");
916 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
917 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
918 source.append(
" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
919 source.append(
" } \n");
921 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
922 source.append(
" } \n");
924 source.append(
" if (local_index + 1 == group_end) {\n");
925 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
926 source.append(
" Ap[tmp.x] = Ap_entry; \n");
927 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
928 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
929 source.append(
" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
930 source.append(
" } \n");
933 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
934 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
935 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
936 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
937 source.append(
" { \n");
938 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
939 source.append(
" if (get_local_id(0) < stride) { \n");
940 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
941 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
942 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
943 source.append(
" } ");
944 source.append(
" } ");
947 source.append(
" if (get_local_id(0) == 0) { \n ");
948 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
949 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
950 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
951 source.append(
" } \n");
953 source.append(
"} \n \n");
958 template<
typename StringT>
961 source.append(
"__kernel void bicgstab_ell_prod( \n");
962 source.append(
" __global const unsigned int * coords, \n");
963 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
964 source.append(
" unsigned int internal_row_num, \n");
965 source.append(
" unsigned int items_per_row, \n");
966 source.append(
" unsigned int aligned_items_per_row, \n");
967 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
968 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
969 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
970 source.append(
" unsigned int size, \n");
971 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
972 source.append(
" unsigned int buffer_size, \n");
973 source.append(
" unsigned int buffer_offset, \n");
974 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
975 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
976 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
977 source.append(
"{ \n");
978 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
979 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
980 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
981 source.append(
" uint glb_id = get_global_id(0); \n");
982 source.append(
" uint glb_sz = get_global_size(0); \n");
984 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
985 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
987 source.append(
" uint offset = row; \n");
988 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
989 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[offset]; \n");
990 source.append(
" sum += val ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(
")0; \n");
991 source.append(
" } \n");
993 source.append(
" Ap[row] = sum; \n");
994 source.append(
" inner_prod_ApAp += sum * sum; \n");
995 source.append(
" inner_prod_pAp += p[row] * sum; \n");
996 source.append(
" inner_prod_r0Ap += r0star[row] * sum; \n");
997 source.append(
" } \n");
1000 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
1001 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
1002 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
1003 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1004 source.append(
" { \n");
1005 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1006 source.append(
" if (get_local_id(0) < stride) { \n");
1007 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
1008 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
1009 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
1010 source.append(
" } ");
1011 source.append(
" } ");
1014 source.append(
" if (get_local_id(0) == 0) { \n ");
1015 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
1016 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
1017 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
1018 source.append(
" } \n");
1019 source.append(
"} \n \n");
1022 template<
typename StringT>
1025 source.append(
"__kernel void bicgstab_sliced_ell_prod( \n");
1026 source.append(
" __global const unsigned int * columns_per_block, \n");
1027 source.append(
" __global const unsigned int * column_indices, \n");
1028 source.append(
" __global const unsigned int * block_start, \n");
1029 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1030 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1031 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1032 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
1033 source.append(
" unsigned int size, \n");
1034 source.append(
" unsigned int block_size, \n");
1035 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1036 source.append(
" unsigned int buffer_size, \n");
1037 source.append(
" unsigned int buffer_offset, \n");
1038 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1039 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
1040 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
1041 source.append(
"{ \n");
1042 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
1043 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
1044 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
1045 source.append(
" uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
1046 source.append(
" uint id_in_block = get_local_id(0) % block_size; \n");
1047 source.append(
" uint num_blocks = (size - 1) / block_size + 1; \n");
1048 source.append(
" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n");
1049 source.append(
" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
1051 source.append(
" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
1052 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
1054 source.append(
" uint row = block_idx * block_size + id_in_block; \n");
1055 source.append(
" uint offset = block_start[block_idx]; \n");
1056 source.append(
" uint num_columns = columns_per_block[block_idx]; \n");
1057 source.append(
" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
1058 source.append(
" uint index = offset + item_id * block_size + id_in_block; \n");
1059 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[index]; \n");
1060 source.append(
" sum += val ? (p[column_indices[index]] * val) : 0; \n");
1061 source.append(
" } \n");
1063 source.append(
" if (row < size) {\n");
1064 source.append(
" Ap[row] = sum; \n");
1065 source.append(
" inner_prod_ApAp += sum * sum; \n");
1066 source.append(
" inner_prod_pAp += p[row] * sum; \n");
1067 source.append(
" inner_prod_r0Ap += r0star[row] * sum; \n");
1068 source.append(
" } \n");
1069 source.append(
" } \n");
1072 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
1073 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
1074 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
1075 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1076 source.append(
" { \n");
1077 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1078 source.append(
" if (get_local_id(0) < stride) { \n");
1079 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
1080 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
1081 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
1082 source.append(
" } ");
1083 source.append(
" } ");
1086 source.append(
" if (get_local_id(0) == 0) { \n ");
1087 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
1088 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
1089 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
1090 source.append(
" } \n");
1091 source.append(
"} \n \n");
1094 template<
typename StringT>
1097 source.append(
"__kernel void bicgstab_hyb_prod( \n");
1098 source.append(
" const __global int* ell_coords, \n");
1099 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
1100 source.append(
" const __global uint* csr_rows, \n");
1101 source.append(
" const __global uint* csr_cols, \n");
1102 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
1103 source.append(
" unsigned int internal_row_num, \n");
1104 source.append(
" unsigned int items_per_row, \n");
1105 source.append(
" unsigned int aligned_items_per_row, \n");
1106 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1107 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1108 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
1109 source.append(
" unsigned int size, \n");
1110 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1111 source.append(
" unsigned int buffer_size, \n");
1112 source.append(
" unsigned int buffer_offset, \n");
1113 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1114 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
1115 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
1116 source.append(
"{ \n");
1117 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
1118 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
1119 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
1120 source.append(
" uint glb_id = get_global_id(0); \n");
1121 source.append(
" uint glb_sz = get_global_size(0); \n");
1123 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
1124 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
1126 source.append(
" uint offset = row; \n");
1127 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
1128 source.append(
" "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
1129 source.append(
" sum += val ? (p[ell_coords[offset]] * val) : 0; \n");
1130 source.append(
" } \n");
1132 source.append(
" uint col_begin = csr_rows[row]; \n");
1133 source.append(
" uint col_end = csr_rows[row + 1]; \n");
1135 source.append(
" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
1136 source.append(
" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
1137 source.append(
" } \n");
1139 source.append(
" Ap[row] = sum; \n");
1140 source.append(
" inner_prod_ApAp += sum * sum; \n");
1141 source.append(
" inner_prod_pAp += p[row] * sum; \n");
1142 source.append(
" inner_prod_r0Ap += r0star[row] * sum; \n");
1143 source.append(
" } \n");
1146 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
1147 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
1148 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
1149 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1150 source.append(
" { \n");
1151 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1152 source.append(
" if (get_local_id(0) < stride) { \n");
1153 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
1154 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
1155 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
1156 source.append(
" } ");
1157 source.append(
" } ");
1160 source.append(
" if (get_local_id(0) == 0) { \n ");
1161 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
1162 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
1163 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
1164 source.append(
" } \n");
1165 source.append(
"} \n \n");
1171 template <
typename StringType>
1174 source.append(
"__kernel void gmres_gram_schmidt_1( \n");
1175 source.append(
" __global "); source.append(numeric_string); source.append(
" const * krylov_basis, \n");
1176 source.append(
" unsigned int size, \n");
1177 source.append(
" unsigned int internal_size, \n");
1178 source.append(
" unsigned int k, \n");
1179 source.append(
" __global "); source.append(numeric_string); source.append(
" * vi_in_vk_buffer, \n");
1180 source.append(
" unsigned int chunk_size) \n");
1181 source.append(
"{ \n");
1183 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_array[7*128]; \n");
1186 source.append(
" "); source.append(numeric_string); source.append(
" vi_in_vk[7]; \n");
1188 source.append(
" "); source.append(numeric_string); source.append(
" value_vk = 0; \n");
1190 source.append(
" unsigned int k_base = 0; \n");
1191 source.append(
" while (k_base < k) { \n");
1192 source.append(
" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
1196 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1197 source.append(
" shared_array[get_local_id(0) + j*chunk_size] = 0; \n");
1201 source.append(
" vi_in_vk[0] = 0;\n");
1202 source.append(
" vi_in_vk[1] = 0;\n");
1203 source.append(
" vi_in_vk[2] = 0;\n");
1204 source.append(
" vi_in_vk[3] = 0;\n");
1205 source.append(
" vi_in_vk[4] = 0;\n");
1206 source.append(
" vi_in_vk[5] = 0;\n");
1207 source.append(
" vi_in_vk[6] = 0;\n");
1209 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1210 source.append(
" value_vk = krylov_basis[i + k * internal_size]; \n");
1211 source.append(
" \n");
1212 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1214 source.append(
" shared_array[get_local_id(0) + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
1216 source.append(
" vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
1217 source.append(
" } \n");
1222 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1223 source.append(
" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk[j]; \n");
1225 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1226 source.append(
" { \n");
1227 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1228 source.append(
" if (get_local_id(0) < stride) { \n");
1229 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1230 source.append(
" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
1231 source.append(
" } ");
1232 source.append(
" } ");
1235 source.append(
" if (get_local_id(0) == 0) \n ");
1236 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1237 source.append(
" vi_in_vk_buffer[get_group_id(0) + (k_base + j) * chunk_size] = shared_array[j*chunk_size]; ");
1239 source.append(
" k_base += vecs_in_iteration; \n");
1240 source.append(
" } \n");
1242 source.append(
"} \n");
1246 template <
typename StringType>
1249 source.append(
"__kernel void gmres_gram_schmidt_2( \n");
1250 source.append(
" __global "); source.append(numeric_string); source.append(
" * krylov_basis, \n");
1251 source.append(
" unsigned int size, \n");
1252 source.append(
" unsigned int internal_size, \n");
1253 source.append(
" unsigned int k, \n");
1254 source.append(
" __global "); source.append(numeric_string); source.append(
" const * vi_in_vk_buffer, \n");
1255 source.append(
" unsigned int chunk_size, \n");
1256 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_buffer, \n");
1257 source.append(
" unsigned int krylov_dim, \n");
1258 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1259 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
1260 source.append(
"{ \n");
1262 source.append(
" "); source.append(numeric_string); source.append(
" vk_dot_vk = 0; \n");
1263 source.append(
" "); source.append(numeric_string); source.append(
" value_vk = 0; \n");
1265 source.append(
" unsigned int k_base = 0; \n");
1266 source.append(
" while (k_base < k) { \n");
1267 source.append(
" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
1270 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1271 source.append(
" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk_buffer[get_local_id(0) + (k_base + j) * chunk_size]; \n");
1272 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1273 source.append(
" { \n");
1274 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1275 source.append(
" if (get_local_id(0) < stride) { \n");
1276 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1277 source.append(
" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
1278 source.append(
" } ");
1279 source.append(
" } ");
1280 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1283 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1284 source.append(
" value_vk = krylov_basis[i + k * internal_size]; \n");
1285 source.append(
" \n");
1286 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1287 source.append(
" value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size]; \n");
1288 source.append(
" vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0; \n");
1289 source.append(
" krylov_basis[i + k * internal_size] = value_vk; \n");
1290 source.append(
" } \n");
1293 source.append(
" if (get_group_id(0) == 0) \n");
1294 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1295 source.append(
" R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size]; ");
1296 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1298 source.append(
" k_base += vecs_in_iteration; \n");
1299 source.append(
" } \n");
1302 source.append(
" shared_array[get_local_id(0)] = vk_dot_vk; \n");
1303 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1304 source.append(
" { \n");
1305 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1306 source.append(
" if (get_local_id(0) < stride) \n");
1307 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1308 source.append(
" } ");
1311 source.append(
" if (get_local_id(0) == 0) \n ");
1312 source.append(
" inner_prod_buffer[chunk_size+get_group_id(0)] = shared_array[0]; ");
1314 source.append(
"} \n");
1317 template <
typename StringType>
1320 source.append(
"__kernel void gmres_normalize_vk( \n");
1321 source.append(
" __global "); source.append(numeric_string); source.append(
" * vk, \n");
1322 source.append(
" unsigned int vk_offset, \n");
1323 source.append(
" __global "); source.append(numeric_string); source.append(
" const * residual, \n");
1324 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_buffer, \n");
1325 source.append(
" unsigned int R_offset, \n");
1326 source.append(
" __global "); source.append(numeric_string); source.append(
" const * inner_prod_buffer, \n");
1327 source.append(
" unsigned int chunk_size, \n");
1328 source.append(
" __global "); source.append(numeric_string); source.append(
" * r_dot_vk_buffer, \n");
1329 source.append(
" unsigned int chunk_offset, \n");
1330 source.append(
" unsigned int size, \n");
1331 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
1332 source.append(
"{ \n");
1334 source.append(
" "); source.append(numeric_string); source.append(
" norm_vk = 0; \n");
1337 source.append(
" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + chunk_size]; \n");
1338 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1339 source.append(
" { \n");
1340 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1341 source.append(
" if (get_local_id(0) < stride) \n");
1342 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1343 source.append(
" } ");
1346 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1347 source.append(
" norm_vk = sqrt(shared_array[0]); \n");
1349 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
1350 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1351 source.append(
" "); source.append(numeric_string); source.append(
" value_vk = vk[i + vk_offset] / norm_vk; \n");
1352 source.append(
" \n");
1353 source.append(
" inner_prod_contrib += residual[i] * value_vk; \n");
1354 source.append(
" \n");
1355 source.append(
" vk[i + vk_offset] = value_vk; \n");
1356 source.append(
" } \n");
1357 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1360 source.append(
" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
1361 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1362 source.append(
" { \n");
1363 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1364 source.append(
" if (get_local_id(0) < stride) \n");
1365 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1366 source.append(
" } ");
1369 source.append(
" if (get_local_id(0) == 0) \n ");
1370 source.append(
" r_dot_vk_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
1371 source.append(
" if (get_global_id(0) == 0) \n ");
1372 source.append(
" R_buffer[R_offset] = norm_vk; \n");
1374 source.append(
"} \n");
1378 template <
typename StringType>
1381 source.append(
"__kernel void gmres_update_result( \n");
1382 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
1383 source.append(
" __global "); source.append(numeric_string); source.append(
" const * residual, \n");
1384 source.append(
" __global "); source.append(numeric_string); source.append(
" const * krylov_basis, \n");
1385 source.append(
" unsigned int size, \n");
1386 source.append(
" unsigned int internal_size, \n");
1387 source.append(
" __global "); source.append(numeric_string); source.append(
" const * coefficients, \n");
1388 source.append(
" unsigned int k) \n");
1389 source.append(
"{ \n");
1391 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1392 source.append(
" "); source.append(numeric_string); source.append(
" value_result = result[i] + coefficients[0] * residual[i]; \n");
1393 source.append(
" \n");
1394 source.append(
" for (unsigned int j = 1; j < k; ++j) \n");
1395 source.append(
" value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size]; \n");
1396 source.append(
" \n");
1397 source.append(
" result[i] = value_result; \n");
1398 source.append(
" } \n");
1400 source.append(
"} \n");
1404 template <
typename StringType>
1407 source.append(
"__kernel void gmres_csr_blocked_prod( \n");
1408 source.append(
" __global const unsigned int * row_indices, \n");
1409 source.append(
" __global const unsigned int * column_indices, \n");
1410 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1411 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1412 source.append(
" unsigned int offset_p, \n");
1413 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1414 source.append(
" unsigned int offset_Ap, \n");
1415 source.append(
" unsigned int size, \n");
1416 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1417 source.append(
" unsigned int buffer_size, \n");
1418 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1419 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1420 source.append(
"{ \n");
1421 source.append(
" cg_csr_blocked_prod(row_indices, column_indices, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1422 source.append(
"} \n \n");
1426 template <
typename StringType>
1429 source.append(
"__kernel void gmres_csr_prod( \n");
1430 source.append(
" __global const unsigned int * row_indices, \n");
1431 source.append(
" __global const unsigned int * column_indices, \n");
1432 source.append(
" __global const unsigned int * row_blocks, \n");
1433 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1434 source.append(
" unsigned int num_blocks, \n");
1435 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1436 source.append(
" unsigned int offset_p, \n");
1437 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1438 source.append(
" unsigned int offset_Ap, \n");
1439 source.append(
" unsigned int size, \n");
1440 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1441 source.append(
" unsigned int buffer_size, \n");
1442 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1443 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
1444 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_elements) \n");
1445 source.append(
"{ \n");
1446 source.append(
" cg_csr_prod(row_indices, column_indices, row_blocks, elements, num_blocks, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp, shared_elements); \n");
1447 source.append(
"} \n \n");
1451 template <
typename StringType>
1454 source.append(
"__kernel void gmres_coo_prod( \n");
1455 source.append(
" __global const uint2 * coords, \n");
1456 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1457 source.append(
" __global const uint * group_boundaries, \n");
1458 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1459 source.append(
" unsigned int offset_p, \n");
1460 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1461 source.append(
" unsigned int offset_Ap, \n");
1462 source.append(
" unsigned int size, \n");
1463 source.append(
" __local unsigned int * shared_rows, \n");
1464 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
1465 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1466 source.append(
" unsigned int buffer_size, \n");
1467 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1468 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1469 source.append(
"{ \n");
1470 source.append(
" cg_coo_prod(coords, elements, group_boundaries, p + offset_p, Ap + offset_Ap, size, shared_rows, inter_results, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1471 source.append(
"} \n \n");
1476 template <
typename StringType>
1479 source.append(
"__kernel void gmres_ell_prod( \n");
1480 source.append(
" __global const unsigned int * coords, \n");
1481 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1482 source.append(
" unsigned int internal_row_num, \n");
1483 source.append(
" unsigned int items_per_row, \n");
1484 source.append(
" unsigned int aligned_items_per_row, \n");
1485 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1486 source.append(
" unsigned int offset_p, \n");
1487 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1488 source.append(
" unsigned int offset_Ap, \n");
1489 source.append(
" unsigned int size, \n");
1490 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1491 source.append(
" unsigned int buffer_size, \n");
1492 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1493 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1494 source.append(
"{ \n");
1495 source.append(
" cg_ell_prod(coords, elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1496 source.append(
"} \n \n");
1499 template <
typename StringType>
1502 source.append(
"__kernel void gmres_sliced_ell_prod( \n");
1503 source.append(
" __global const unsigned int * columns_per_block, \n");
1504 source.append(
" __global const unsigned int * column_indices, \n");
1505 source.append(
" __global const unsigned int * block_start, \n");
1506 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1507 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1508 source.append(
" unsigned int offset_p, \n");
1509 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1510 source.append(
" unsigned int offset_Ap, \n");
1511 source.append(
" unsigned int size, \n");
1512 source.append(
" unsigned int block_size, \n");
1513 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1514 source.append(
" unsigned int buffer_size, \n");
1515 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1516 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1517 source.append(
"{ \n");
1518 source.append(
" cg_sliced_ell_prod(columns_per_block, column_indices, block_start, elements, p + offset_p, Ap + offset_Ap, size, block_size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1519 source.append(
"} \n \n");
1522 template <
typename StringType>
1525 source.append(
"__kernel void gmres_hyb_prod( \n");
1526 source.append(
" const __global int* ell_coords, \n");
1527 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
1528 source.append(
" const __global uint* csr_rows, \n");
1529 source.append(
" const __global uint* csr_cols, \n");
1530 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
1531 source.append(
" unsigned int internal_row_num, \n");
1532 source.append(
" unsigned int items_per_row, \n");
1533 source.append(
" unsigned int aligned_items_per_row, \n");
1534 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1535 source.append(
" unsigned int offset_p, \n");
1536 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1537 source.append(
" unsigned int offset_Ap, \n");
1538 source.append(
" unsigned int size, \n");
1539 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1540 source.append(
" unsigned int buffer_size, \n");
1541 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1542 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1543 source.append(
"{ \n");
1544 source.append(
" cg_hyb_prod(ell_coords, ell_elements, csr_rows, csr_cols, csr_elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1545 source.append(
"} \n \n");
1555 template<
typename NumericT>
1565 static std::map<cl_context, bool> init_done;
1572 source.reserve(1024);
1574 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1608 #ifdef VIENNACL_BUILD_INFO
1609 std::cout <<
"Creating program " << prog_name << std::endl;
1611 ctx.add_program(source, prog_name);
1612 init_done[ctx.handle().get()] =
true;
Main kernel class for generating specialized OpenCL kernels for fast iterative solvers.
void generate_sliced_ell_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_pipelined_gmres_blocked_prod(StringType &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
Some helper routines for reading/writing/printing scheduler expressions.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities.
void generate_sliced_ell_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
static std::string program_name()
void generate_pipelined_gmres_gram_schmidt_stage1(StringType &source, std::string const &numeric_string, bool is_nvidia)
void generate_ell_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
void generate_hyb_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_gmres_normalize_vk(StringType &source, std::string const &numeric_string)
void generate_pipelined_gmres_gram_schmidt_stage2(StringType &source, std::string const &numeric_string)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
void generate_compressed_matrix_pipelined_cg_blocked_prod(StringT &source, std::string const &numeric_string, unsigned int subwarp_size)
void generate_compressed_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
static void apply(viennacl::ocl::context const &)
void generate_ell_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
const OCL_TYPE & get() const
void generate_compressed_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_bicgstab_vector_update(StringT &source, std::string const &numeric_string)
void generate_coordinate_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_compressed_matrix_pipelined_bicgstab_blocked_prod(StringT &source, std::string const &numeric_string, unsigned int subwarp_size)
void generate_coordinate_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
Proxy classes for vectors.
void generate_hyb_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
void generate_hyb_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_coordinate_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_bicgstab_update_s(StringT &source, std::string const &numeric_string)
void generate_pipelined_cg_vector_update(StringT &source, std::string const &numeric_string)
void generate_pipelined_gmres_update_result(StringType &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.
void generate_compressed_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_sliced_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)