#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
#define VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

#include "viennacl/tools/tools.hpp"

#include "viennacl/ocl/kernel.hpp"
#include "viennacl/ocl/utils.hpp"
/** @file viennacl/linalg/opencl/kernels/iterative.hpp
 *  @brief OpenCL kernel file for specialized iterative solver kernels */
namespace viennacl
{
namespace linalg
{
namespace opencl
{
namespace kernels
{
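
// The generators below assemble the OpenCL sources for the pipelined conjugate gradient (CG)
// kernels; the corresponding BiCGStab generators follow further down in this file.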
template<typename StringT>
void generate_pipelined_cg_vector_update(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_vector_update( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * r, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
  source.append(" "); source.append(numeric_string); source.append(" beta, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" value_p = p[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_r = r[i]; \n");
  source.append(" \n");
  source.append(" result[i] += alpha * value_p; \n");
  source.append(" value_r -= alpha * Ap[i]; \n");
  source.append(" value_p = value_r + beta * value_p; \n");
  source.append(" \n");
  source.append(" p[i] = value_p; \n");
  source.append(" r[i] = value_r; \n");
  source.append(" inner_prod_contrib += value_r * value_r; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) \n ");
  source.append(" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");

  source.append("} \n");
}
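
// A minimal usage sketch (illustrative only, not part of this header): each generator appends
// OpenCL C to a plain string, which can then be compiled through the usual viennacl::ocl
// machinery. The program name "my_iterative_kernels" below is made up for the example.
//
//   std::string src;
//   generate_pipelined_cg_vector_update(src, "float");
//   viennacl::ocl::context & ctx = viennacl::ocl::current_context();
//   ctx.add_program(src, "my_iterative_kernels");
//   viennacl::ocl::kernel & k = ctx.get_program("my_iterative_kernels").get_kernel("cg_vector_update");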

template<typename StringT>
void generate_compressed_matrix_pipelined_cg_blocked_prod(StringT & source, std::string const & numeric_string, unsigned int subwarp_size)
{
  std::stringstream ss;
  ss << subwarp_size;

  source.append("__kernel void cg_csr_blocked_prod( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");

  source.append(" const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
  source.append(" const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
  source.append(" const unsigned int block_start = get_group_id(0) * block_increment; \n");
  source.append(" const unsigned int block_stop = min(block_start + block_increment, size); \n");

  source.append(" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() + "; \n");
  source.append(" row < block_stop; \n");
  source.append(" row += get_local_size(0) / " + ss.str() + ") \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
  source.append(" dot_prod += elements[i] * p[column_indices[i]]; \n");

  source.append(" shared_elements[get_local_id(0)] = dot_prod; \n");
  source.append(" #pragma unroll \n");
  source.append(" for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");

  source.append(" if (id_in_row == 0) { \n");
  source.append(" Ap[row] = shared_elements[get_local_id(0)]; \n");
  source.append(" inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
  source.append(" inner_prod_pAp += p[row] * shared_elements[get_local_id(0)]; \n");
  source.append(" } \n");
  source.append(" } \n");

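  // parallel reduction in work group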
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");

  source.append("} \n");
}
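
// For reference, a sequential sketch (not part of this header) of what cg_csr_blocked_prod
// computes: each subwarp of `subwarp_size` threads handles one CSR row, and the XOR-based
// exchange above reduces the per-thread partial sums within the subwarp. Sequentially:
//
//   for (unsigned int row = 0; row < size; ++row) {
//     T dot_prod = 0;
//     for (unsigned int i = row_indices[row]; i < row_indices[row + 1]; ++i)
//       dot_prod += elements[i] * p[column_indices[i]];
//     Ap[row] = dot_prod;
//     inner_prod_ApAp += dot_prod * dot_prod;  // reduced into inner_prod_buffer[buffer_size + ...]
//     inner_prod_pAp  += p[row] * dot_prod;    // reduced into inner_prod_buffer[2*buffer_size + ...]
//   }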

template<typename StringT>
void generate_compressed_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_csr_prod( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * row_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int num_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_elements) \n");
  source.append("{ \n");

  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");

  source.append(" for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
  source.append(" unsigned int row_start = row_blocks[block_id]; \n");
  source.append(" unsigned int row_stop = row_blocks[block_id + 1]; \n");
  source.append(" unsigned int rows_to_process = row_stop - row_start; \n");
  source.append(" unsigned int element_start = row_indices[row_start]; \n");
  source.append(" unsigned int element_stop = row_indices[row_stop]; \n");

  source.append(" if (rows_to_process > 1) { \n"); // CSR stream
  // load to shared buffer:
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
  source.append(" shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // use one thread per row to sum:
  source.append(" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int thread_row_start = row_indices[row] - element_start; \n");
  source.append(" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
  source.append(" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
  source.append(" dot_prod += shared_elements[i]; \n");
  source.append(" Ap[row] = dot_prod; \n");
  source.append(" inner_prod_ApAp += dot_prod * dot_prod; \n");
  source.append(" inner_prod_pAp += p[row] * dot_prod; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" else \n"); // CSR vector for a single row
  source.append(" { \n");
  // load and sum to shared buffer:
  source.append(" shared_elements[get_local_id(0)] = 0; \n");
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
  source.append(" shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");

  // reduction to obtain final result
  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
  source.append(" } \n");

  source.append(" if (get_local_id(0) == 0) { \n");
  source.append(" Ap[row_start] = shared_elements[0]; \n");
  source.append(" inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
  source.append(" inner_prod_pAp += p[row_start] * shared_elements[0]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");

  source.append("} \n");

}
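
// Note on cg_csr_prod: row_blocks[] partitions the rows such that the nonzeros of a multi-row
// block fit into shared_elements (the CSR-stream path above); a block containing a single row
// falls back to the CSR-vector path, where the whole work group reduces that one row.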


template<typename StringT>
void generate_coordinate_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_coo_prod( \n");
  source.append(" __global const uint2 * coords, \n"); //(row_index, column_index)
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const uint * group_boundaries, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local unsigned int * shared_rows, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");

  source.append(" uint2 tmp; \n");
  source.append(" "); source.append(numeric_string); source.append(" val; \n");
  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start is a multiple of get_local_size(0)

  source.append(" uint local_index = 0; \n");

  source.append(" for (uint k = 0; k < k_end; ++k) { \n");
  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
  source.append(" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");

  // check for carry from previous loop run:
  source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
  source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
  source.append(" val += inter_results[get_local_size(0)-1]; \n");
  source.append(" else {\n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_size(0)-1]; \n");
  source.append(" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
  source.append(" } \n");
  source.append(" } \n");

  // segmented parallel reduction begin
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
  source.append(" inter_results[get_local_id(0)] = val; \n");
  source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
  source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" inter_results[get_local_id(0)] += left; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  // segmented parallel reduction end

  source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n"); //for k

  source.append(" if (local_index + 1 == group_end) {\n"); // write out the result of the last active entry, if the loop above has not flushed it yet
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

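  // parallel reduction in work group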
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");

  source.append("} \n \n");

}
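
// For reference, a sequential sketch (not part of this header) of the segmented reduction
// performed by cg_coo_prod on (row, column) coordinate data, assuming Ap is conceptually
// zero-initialized and the COO entries are sorted by row (the kernel writes each row once):
//
//   for (unsigned int i = 0; i < nnz; ++i)
//     Ap[coords[i].x] += elements[i] * p[coords[i].y];   // entries with equal .x form one segment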

template<typename StringT>
void generate_ell_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_ell_prod( \n");
  source.append(" __global const unsigned int * coords, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int internal_row_num, \n");
  source.append(" unsigned int items_per_row, \n");
  source.append(" unsigned int aligned_items_per_row, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint glb_sz = get_global_size(0); \n");

  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint offset = row; \n");
  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
  source.append(" sum += val ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(")0; \n");
  source.append(" } \n");

  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" } \n");

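  // parallel reduction in work group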
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}
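
// Note on cg_ell_prod: ELL storage is column-major with padding, i.e. the item_id-th entry of
// `row` sits at index item_id * internal_row_num + row, and padded slots hold a zero value,
// which the `val ? ... : 0` test skips.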

template<typename StringT>
void generate_sliced_ell_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_sliced_ell_prod( \n");
  source.append(" __global const unsigned int * columns_per_block, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * block_start, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int block_size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
  source.append(" uint id_in_block = get_local_id(0) % block_size; \n");
  source.append(" uint num_blocks = (size - 1) / block_size + 1; \n");
  source.append(" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n");
  source.append(" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");

  source.append(" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint row = block_idx * block_size + id_in_block; \n");
  source.append(" uint offset = block_start[block_idx]; \n");
  source.append(" uint num_columns = columns_per_block[block_idx]; \n");
  source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
  source.append(" uint index = offset + item_id * block_size + id_in_block; \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n");
  source.append(" sum += val ? (p[column_indices[index]] * val) : 0; \n");
  source.append(" } \n");

  source.append(" if (row < size) {\n");
  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" } \n");
  source.append(" } \n");

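  // parallel reduction in work group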
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}
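
// Note on cg_sliced_ell_prod: rows are grouped into slices of block_size rows each; a slice is
// stored column-major starting at block_start[block_idx] with columns_per_block[block_idx]
// entries per row, so each thread of a slice walks exactly one row.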

template<typename StringT>
void generate_hyb_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_hyb_prod( \n");
  source.append(" const __global int* ell_coords, \n");
  source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
  source.append(" const __global uint* csr_rows, \n");
  source.append(" const __global uint* csr_cols, \n");
  source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
  source.append(" unsigned int internal_row_num, \n");
  source.append(" unsigned int items_per_row, \n");
  source.append(" unsigned int aligned_items_per_row, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint glb_sz = get_global_size(0); \n");

  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint offset = row; \n");
  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
  source.append(" sum += val ? (p[ell_coords[offset]] * val) : 0; \n");
  source.append(" } \n");

  source.append(" uint col_begin = csr_rows[row]; \n");
  source.append(" uint col_end = csr_rows[row + 1]; \n");

  source.append(" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
  source.append(" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
  source.append(" } \n");

  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" } \n");

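  // parallel reduction in work group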
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}
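
// Note on cg_hyb_prod: the HYB format combines a regular ELL part (ell_coords/ell_elements)
// with a CSR remainder (csr_rows/csr_cols/csr_elements) for rows exceeding items_per_row;
// the kernel simply sums both contributions per row.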
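
// The generators below assemble the corresponding kernels for the pipelined stabilized
// bi-conjugate gradient (BiCGStab) solver; they mirror the CG kernels above, with an
// additional inner product <r0*, Ap> accumulated per work group.
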
template<typename StringT>
void generate_pipelined_bicgstab_update_s(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_update_s( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * s, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * r, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int chunk_size, \n");
  source.append(" unsigned int chunk_offset, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_Ap_in_r0) \n");
  source.append("{ \n");

  source.append(" "); source.append(numeric_string); source.append(" alpha = 0; \n");

  // parallel reduction in work group to compute <r, r0> / <Ap, r0>
  source.append(" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0)]; \n");
  source.append(" shared_array_Ap_in_r0[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + 3 * chunk_size]; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" shared_array_Ap_in_r0[get_local_id(0)] += shared_array_Ap_in_r0[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // compute alpha from reduced values:
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" alpha = shared_array[0] / shared_array_Ap_in_r0[0]; ");

  source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" value_s = s[i]; \n");
  source.append(" \n");
  source.append(" value_s = r[i] - alpha * Ap[i]; \n");
  source.append(" inner_prod_contrib += value_s * value_s; \n");
  source.append(" \n");
  source.append(" s[i] = value_s; \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // parallel reduction in work group
  source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) \n ");
  source.append(" inner_prod_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");

  source.append("} \n");

}
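
// In formulas (cf. the code above): the first reduction finishes <r, r0*> and <Ap, r0*> from
// the per-group partial sums already stored in inner_prod_buffer, then each work item applies
//   alpha = <r, r0*> / <Ap, r0*>,   s = r - alpha * Ap,
// and the kernel finally reduces <s, s> into inner_prod_buffer at offset chunk_offset.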

template<typename StringT>
void generate_pipelined_bicgstab_vector_update(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_vector_update( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" "); source.append(numeric_string); source.append(" omega, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * s, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * residual, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * As, \n");
  source.append(" "); source.append(numeric_string); source.append(" beta, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * r0star, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r_r0star = 0; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" value_result = result[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_p = p[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_s = s[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_residual = residual[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_As = As[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_Ap = Ap[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_r0star = r0star[i]; \n");
  source.append(" \n");
  source.append(" value_result += alpha * value_p + omega * value_s; \n");
  source.append(" value_residual = value_s - omega * value_As; \n");
  source.append(" value_p = value_residual + beta * (value_p - omega * value_Ap); \n");
  source.append(" \n");
  source.append(" result[i] = value_result; \n");
  source.append(" residual[i] = value_residual; \n");
  source.append(" p[i] = value_p; \n");
  source.append(" inner_prod_r_r0star += value_residual * value_r0star; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array[get_local_id(0)] = inner_prod_r_r0star; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) \n ");
  source.append(" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");

  source.append("} \n");
}
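
// In formulas: x <- x + alpha * p + omega * s,   r <- s - omega * As,
// p <- r + beta * (p - omega * Ap), and the kernel accumulates <r, r0*> for the next iteration.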

template<typename StringT>
void generate_compressed_matrix_pipelined_bicgstab_blocked_prod(StringT & source, std::string const & numeric_string, unsigned int subwarp_size)
{
  std::stringstream ss;
  ss << subwarp_size;

  source.append("__kernel void bicgstab_csr_blocked_prod( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");

  source.append(" const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
  source.append(" const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
  source.append(" const unsigned int block_start = get_group_id(0) * block_increment; \n");
  source.append(" const unsigned int block_stop = min(block_start + block_increment, size); \n");

  source.append(" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() + "; \n");
  source.append(" row < block_stop; \n");
  source.append(" row += get_local_size(0) / " + ss.str() + ") \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
  source.append(" dot_prod += elements[i] * p[column_indices[i]]; \n");

  source.append(" shared_elements[get_local_id(0)] = dot_prod; \n");
  source.append(" #pragma unroll \n");
  source.append(" for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");

  source.append(" if (id_in_row == 0) { \n");
  source.append(" Ap[row] = shared_elements[get_local_id(0)]; \n");
  source.append(" inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
  source.append(" inner_prod_pAp += p[row] * shared_elements[get_local_id(0)]; \n");
  source.append(" inner_prod_r0Ap += r0star[row] * shared_elements[get_local_id(0)]; \n");
  source.append(" } \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");

  source.append("} \n");
}
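
// Note: bicgstab_csr_blocked_prod (like cg_csr_blocked_prod above) fixes __local
// shared_elements[256], so it assumes work groups of at most 256 work items, one entry
// per work item for the subwarp reduction.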

template<typename StringT>
void generate_compressed_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_csr_prod( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * row_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int num_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[1024]; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");

  source.append(" for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
  source.append(" unsigned int row_start = row_blocks[block_id]; \n");
  source.append(" unsigned int row_stop = row_blocks[block_id + 1]; \n");
  source.append(" unsigned int rows_to_process = row_stop - row_start; \n");
  source.append(" unsigned int element_start = row_indices[row_start]; \n");
  source.append(" unsigned int element_stop = row_indices[row_stop]; \n");

  source.append(" if (rows_to_process > 1) { \n"); // CSR stream
  // load to shared buffer:
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
  source.append(" shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // use one thread per row to sum:
  source.append(" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int thread_row_start = row_indices[row] - element_start; \n");
  source.append(" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
  source.append(" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
  source.append(" dot_prod += shared_elements[i]; \n");
  source.append(" Ap[row] = dot_prod; \n");
  source.append(" inner_prod_ApAp += dot_prod * dot_prod; \n");
  source.append(" inner_prod_pAp += p[row] * dot_prod; \n");
  source.append(" inner_prod_r0Ap += r0star[row] * dot_prod; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" else \n"); // CSR vector for a single row
  source.append(" { \n");
  // load and sum to shared buffer:
  source.append(" shared_elements[get_local_id(0)] = 0; \n");
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
  source.append(" shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");

  // reduction to obtain final result
  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
  source.append(" } \n");

  source.append(" if (get_local_id(0) == 0) { \n");
  source.append(" Ap[row_start] = shared_elements[0]; \n");
  source.append(" inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
  source.append(" inner_prod_pAp += p[row_start] * shared_elements[0]; \n");
  source.append(" inner_prod_r0Ap += r0star[row_start] * shared_elements[0]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");

  source.append("} \n \n");

}
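
// Note: bicgstab_csr_prod fixes __local shared_elements[1024] rather than taking the buffer as a
// kernel argument (as cg_csr_prod does), so each multi-row block handed over via row_blocks[] must
// contain at most 1024 nonzeros; single-row blocks are reduced in a strided fashion and may be longer.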

template<typename StringT>
void generate_coordinate_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_coo_prod( \n");
  source.append(" __global const uint2 * coords, \n"); //(row_index, column_index)
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const uint * group_boundaries, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local unsigned int * shared_rows, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");

  source.append(" uint2 tmp; \n");
  source.append(" "); source.append(numeric_string); source.append(" val; \n");
  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start is a multiple of get_local_size(0)

  source.append(" uint local_index = 0; \n");

  source.append(" for (uint k = 0; k < k_end; ++k) { \n");
  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
  source.append(" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");

  // check for carry from previous loop run:
  source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
  source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
  source.append(" val += inter_results[get_local_size(0)-1]; \n");
  source.append(" else {\n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_size(0)-1]; \n");
  source.append(" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
  source.append(" inner_prod_r0Ap += r0star[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
  source.append(" } \n");
  source.append(" } \n");

  // segmented parallel reduction begin
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
  source.append(" inter_results[get_local_id(0)] = val; \n");
  source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
  source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" inter_results[get_local_id(0)] += left; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  // segmented parallel reduction end

  source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n"); //for k

  source.append(" if (local_index + 1 == group_end) {\n"); // write out the result of the last active entry, if the loop above has not flushed it yet
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");

  source.append("} \n \n");

}


template<typename StringT>
void generate_ell_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_ell_prod( \n");
  source.append(" __global const unsigned int * coords, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int internal_row_num, \n");
  source.append(" unsigned int items_per_row, \n");
  source.append(" unsigned int aligned_items_per_row, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint glb_sz = get_global_size(0); \n");

  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint offset = row; \n");
  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
  source.append(" sum += val ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(")0; \n");
  source.append(" } \n");

  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
1018  source.append(" } \n");
1019  source.append("} \n \n");
1020 }
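The ELL traversal above assumes column-major, zero-padded storage: entry item_id of a row lives at row + item_id * internal_row_num, and the `val ? ... : 0` guard skips the padding entries. A host-side sketch of the same access pattern (assumed types, not library code):

#include <vector>

static std::vector<double> ell_spmv(std::vector<unsigned int> const & coords,
                                    std::vector<double> const & elements,
                                    unsigned int internal_row_num,
                                    unsigned int items_per_row,
                                    std::vector<double> const & p,
                                    unsigned int size)
{
  std::vector<double> Ap(size, 0);
  for (unsigned int row = 0; row < size; ++row)
  {
    double sum = 0;
    unsigned int offset = row;
    for (unsigned int item = 0; item < items_per_row; ++item, offset += internal_row_num)
    {
      double val = elements[offset];
      if (val != 0)                      // padding entries are stored as explicit zeros
        sum += p[coords[offset]] * val;
    }
    Ap[row] = sum;
  }
  return Ap;
}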
1021 
1022 template<typename StringT>
1023 void generate_sliced_ell_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
1024 {
1025  source.append("__kernel void bicgstab_sliced_ell_prod( \n");
1026  source.append(" __global const unsigned int * columns_per_block, \n");
1027  source.append(" __global const unsigned int * column_indices, \n");
1028  source.append(" __global const unsigned int * block_start, \n");
1029  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1030  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1031  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1032  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
1033  source.append(" unsigned int size, \n");
1034  source.append(" unsigned int block_size, \n");
1035  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1036  source.append(" unsigned int buffer_size, \n");
1037  source.append(" unsigned int buffer_offset, \n");
1038  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1039  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
1040  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
1041  source.append("{ \n");
1042  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
1043  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
1044  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
1045  source.append(" uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
1046  source.append(" uint id_in_block = get_local_id(0) % block_size; \n");
1047  source.append(" uint num_blocks = (size - 1) / block_size + 1; \n");
1048  source.append(" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n");
1049  source.append(" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
1050 
1051  source.append(" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
1052  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
1053 
1054  source.append(" uint row = block_idx * block_size + id_in_block; \n");
1055  source.append(" uint offset = block_start[block_idx]; \n");
1056  source.append(" uint num_columns = columns_per_block[block_idx]; \n");
1057  source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
1058  source.append(" uint index = offset + item_id * block_size + id_in_block; \n");
1059  source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n");
1060  source.append(" sum += val ? (p[column_indices[index]] * val) : 0; \n");
1061  source.append(" } \n");
1062 
1063  source.append(" if (row < size) {\n");
1064  source.append(" Ap[row] = sum; \n");
1065  source.append(" inner_prod_ApAp += sum * sum; \n");
1066  source.append(" inner_prod_pAp += p[row] * sum; \n");
1067  source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
1068  source.append(" } \n");
1069  source.append(" } \n");
1070 
1071  // parallel reduction in work group
1072  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
1073  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
1074  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
1075  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1076  source.append(" { \n");
1077  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1078  source.append(" if (get_local_id(0) < stride) { \n");
1079  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
1080  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
1081  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
1082  source.append(" } ");
1083  source.append(" } ");
1084 
1085  // write results to result array
1086  source.append(" if (get_local_id(0) == 0) { \n ");
1087  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
1088  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
1089  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
1090  source.append(" } \n");
1091  source.append("} \n \n");
1092 }
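In the sliced-ELL kernel, rows are grouped into slices of block_size consecutive rows, and each slice is stored column-major from block_start[block_idx], so the work items of one slice read consecutive memory for every item_id. A sequential sketch of the per-row sum (hypothetical helper, assumed types):

#include <vector>

static double sell_row_sum(std::vector<unsigned int> const & columns_per_block,
                           std::vector<unsigned int> const & column_indices,
                           std::vector<unsigned int> const & block_start,
                           std::vector<double> const & elements,
                           std::vector<double> const & p,
                           unsigned int block_size, unsigned int row)
{
  unsigned int block_idx   = row / block_size;
  unsigned int id_in_block = row % block_size;
  double sum = 0;
  for (unsigned int item = 0; item < columns_per_block[block_idx]; ++item)
  {
    // lane id_in_block of slice block_idx, entry 'item':
    unsigned int index = block_start[block_idx] + item * block_size + id_in_block;
    if (double val = elements[index])    // zero-padded like plain ELL
      sum += p[column_indices[index]] * val;
  }
  return sum;
}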
1093 
1094 template<typename StringT>
1095 void generate_hyb_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
1096 {
1097  source.append("__kernel void bicgstab_hyb_prod( \n");
1098  source.append(" const __global int* ell_coords, \n");
1099  source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
1100  source.append(" const __global uint* csr_rows, \n");
1101  source.append(" const __global uint* csr_cols, \n");
1102  source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
1103  source.append(" unsigned int internal_row_num, \n");
1104  source.append(" unsigned int items_per_row, \n");
1105  source.append(" unsigned int aligned_items_per_row, \n");
1106  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1107  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1108  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
1109  source.append(" unsigned int size, \n");
1110  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1111  source.append(" unsigned int buffer_size, \n");
1112  source.append(" unsigned int buffer_offset, \n");
1113  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1114  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
1115  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
1116  source.append("{ \n");
1117  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
1118  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
1119  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
1120  source.append(" uint glb_id = get_global_id(0); \n");
1121  source.append(" uint glb_sz = get_global_size(0); \n");
1122 
1123  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
1124  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
1125 
1126  source.append(" uint offset = row; \n");
1127  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
1128  source.append(" "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
1129  source.append(" sum += val ? (p[ell_coords[offset]] * val) : 0; \n");
1130  source.append(" } \n");
1131 
1132  source.append(" uint col_begin = csr_rows[row]; \n");
1133  source.append(" uint col_end = csr_rows[row + 1]; \n");
1134 
1135  source.append(" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
1136  source.append(" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
1137  source.append(" } \n");
1138 
1139  source.append(" Ap[row] = sum; \n");
1140  source.append(" inner_prod_ApAp += sum * sum; \n");
1141  source.append(" inner_prod_pAp += p[row] * sum; \n");
1142  source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
1143  source.append(" } \n");
1144 
1145  // parallel reduction in work group
1146  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
1147  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
1148  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
1149  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1150  source.append(" { \n");
1151  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1152  source.append(" if (get_local_id(0) < stride) { \n");
1153  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
1154  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
1155  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
1156  source.append(" } ");
1157  source.append(" } ");
1158 
1159  // write results to result array
1160  source.append(" if (get_local_id(0) == 0) { \n ");
1161  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
1162  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
1163  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
1164  source.append(" } \n");
1165  source.append("} \n \n");
1166 }
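The HYB kernel combines the two formats: the regular part of each row (up to items_per_row entries) sits in ELL storage, and overflow entries live in a CSR remainder that is traversed afterwards. A host-side sketch of one row's sum (assumed types):

static double hyb_row_sum(int const * ell_coords, double const * ell_elements,
                          unsigned int const * csr_rows, unsigned int const * csr_cols,
                          double const * csr_elements,
                          unsigned int internal_row_num, unsigned int items_per_row,
                          double const * p, unsigned int row)
{
  double sum = 0;
  unsigned int offset = row;
  for (unsigned int item = 0; item < items_per_row; ++item, offset += internal_row_num)
    if (double val = ell_elements[offset])           // ELL part, zero-padded
      sum += p[ell_coords[offset]] * val;
  for (unsigned int j = csr_rows[row]; j < csr_rows[row + 1]; ++j)
    sum += p[csr_cols[j]] * csr_elements[j];         // CSR remainder
  return sum;
}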
1167 
1169 
1170 
1171 template <typename StringType>
1172 void generate_pipelined_gmres_gram_schmidt_stage1(StringType & source, std::string const & numeric_string, bool is_nvidia)
1173 {
1174  source.append("__kernel void gmres_gram_schmidt_1( \n");
1175  source.append(" __global "); source.append(numeric_string); source.append(" const * krylov_basis, \n");
1176  source.append(" unsigned int size, \n");
1177  source.append(" unsigned int internal_size, \n");
1178  source.append(" unsigned int k, \n");
1179  source.append(" __global "); source.append(numeric_string); source.append(" * vi_in_vk_buffer, \n");
1180  source.append(" unsigned int chunk_size) \n");
1181  source.append("{ \n");
1182 
1183  source.append(" __local "); source.append(numeric_string); source.append(" shared_array[7*128]; \n");
1184  if (!is_nvidia) // use of thread-local variables entails a 2x performance drop on NVIDIA GPUs, but is faster on AMD
1185  {
1186  source.append(" "); source.append(numeric_string); source.append(" vi_in_vk[7]; \n");
1187  }
1188  source.append(" "); source.append(numeric_string); source.append(" value_vk = 0; \n");
1189 
1190  source.append(" unsigned int k_base = 0; \n");
1191  source.append(" while (k_base < k) { \n");
1192  source.append(" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
1193 
1194  if (is_nvidia)
1195  {
1196  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1197  source.append(" shared_array[get_local_id(0) + j*chunk_size] = 0; \n");
1198  }
1199  else
1200  {
1201  source.append(" vi_in_vk[0] = 0;\n");
1202  source.append(" vi_in_vk[1] = 0;\n");
1203  source.append(" vi_in_vk[2] = 0;\n");
1204  source.append(" vi_in_vk[3] = 0;\n");
1205  source.append(" vi_in_vk[4] = 0;\n");
1206  source.append(" vi_in_vk[5] = 0;\n");
1207  source.append(" vi_in_vk[6] = 0;\n");
1208  }
1209  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1210  source.append(" value_vk = krylov_basis[i + k * internal_size]; \n");
1211  source.append(" \n");
1212  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1213  if (is_nvidia)
1214  source.append(" shared_array[get_local_id(0) + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
1215  else
1216  source.append(" vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
1217  source.append(" } \n");
1218 
1219  // parallel reduction in work group
1220  if (!is_nvidia)
1221  {
1222  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1223  source.append(" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk[j]; \n");
1224  }
1225  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1226  source.append(" { \n");
1227  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1228  source.append(" if (get_local_id(0) < stride) { \n");
1229  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1230  source.append(" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
1231  source.append(" } ");
1232  source.append(" } ");
1233 
1234  // write results to result array
1235  source.append(" if (get_local_id(0) == 0) \n ");
1236  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1237  source.append(" vi_in_vk_buffer[get_group_id(0) + (k_base + j) * chunk_size] = shared_array[j*chunk_size]; ");
1238 
1239  source.append(" k_base += vecs_in_iteration; \n");
1240  source.append(" } \n");
1241 
1242  source.append("} \n");
1243 
1244 }
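Stage 1 computes the inner products <v_i, v_k> for i = 0..k-1 in chunks of at most seven per sweep over the vector, which is what keeps the fixed 7*128-entry local buffer sufficient for any k. A sequential analogue of the chunking (a sketch, not library code; v is assumed to hold at least k+1 basis vectors):

#include <cstddef>
#include <vector>

static std::vector<double> vi_dot_vk(std::vector<std::vector<double> > const & v,
                                     unsigned int k)
{
  std::vector<double> result(k, 0);
  for (unsigned int k_base = 0; k_base < k; )
  {
    unsigned int chunk = (k - k_base > 7) ? 7 : (k - k_base);
    for (std::size_t i = 0; i < v[k].size(); ++i)    // one sweep over v_k
      for (unsigned int j = 0; j < chunk; ++j)
        result[k_base + j] += v[k][i] * v[k_base + j][i];
    k_base += chunk;
  }
  return result;
}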
1245 
1246 template <typename StringType>
1247 void generate_pipelined_gmres_gram_schmidt_stage2(StringType & source, std::string const & numeric_string)
1248 {
1249  source.append("__kernel void gmres_gram_schmidt_2( \n");
1250  source.append(" __global "); source.append(numeric_string); source.append(" * krylov_basis, \n");
1251  source.append(" unsigned int size, \n");
1252  source.append(" unsigned int internal_size, \n");
1253  source.append(" unsigned int k, \n");
1254  source.append(" __global "); source.append(numeric_string); source.append(" const * vi_in_vk_buffer, \n");
1255  source.append(" unsigned int chunk_size, \n");
1256  source.append(" __global "); source.append(numeric_string); source.append(" * R_buffer, \n");
1257  source.append(" unsigned int krylov_dim, \n");
1258  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1259  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
1260  source.append("{ \n");
1261 
1262  source.append(" "); source.append(numeric_string); source.append(" vk_dot_vk = 0; \n");
1263  source.append(" "); source.append(numeric_string); source.append(" value_vk = 0; \n");
1264 
1265  source.append(" unsigned int k_base = 0; \n");
1266  source.append(" while (k_base < k) { \n");
1267  source.append(" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
1268 
1269  // parallel reduction in work group for <v_i, v_k>
1270  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1271  source.append(" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk_buffer[get_local_id(0) + (k_base + j) * chunk_size]; \n");
1272  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1273  source.append(" { \n");
1274  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1275  source.append(" if (get_local_id(0) < stride) { \n");
1276  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1277  source.append(" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
1278  source.append(" } ");
1279  source.append(" } ");
1280  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1281 
1282  // v_k -= <v_i, v_k> v_i:
1283  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1284  source.append(" value_vk = krylov_basis[i + k * internal_size]; \n");
1285  source.append(" \n");
1286  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1287  source.append(" value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size]; \n");
1288  source.append(" vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0; \n");
1289  source.append(" krylov_basis[i + k * internal_size] = value_vk; \n");
1290  source.append(" } \n");
1291 
1292  // write to R: (to avoid thread divergence, all threads write the same value)
1293  source.append(" if (get_group_id(0) == 0) \n");
1294  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1295  source.append(" R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size]; ");
1296  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1297 
1298  source.append(" k_base += vecs_in_iteration; \n");
1299  source.append(" } \n");
1300 
1301  // parallel reduction in work group for <v_k, v_k>
1302  source.append(" shared_array[get_local_id(0)] = vk_dot_vk; \n");
1303  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1304  source.append(" { \n");
1305  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1306  source.append(" if (get_local_id(0) < stride) \n");
1307  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1308  source.append(" } ");
1309 
1310  // write results to result array
1311  source.append(" if (get_local_id(0) == 0) \n ");
1312  source.append(" inner_prod_buffer[chunk_size+get_group_id(0)] = shared_array[0]; ");
1313 
1314  source.append("} \n");
1315 }
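Stage 2 then subtracts the projections, v_k -= sum_j <v_j, v_k> v_j, and accumulates <v_k, v_k> only during the final chunk, i.e. once v_k is fully orthogonalized. The per-entry update in sequential form (hypothetical helper, assumed types):

#include <cstddef>

static double orthogonalize_chunk(double * vk, double const * const * vi,
                                  double const * coeffs, unsigned int chunk,
                                  std::size_t n, bool last_chunk)
{
  double vk_dot_vk = 0;
  for (std::size_t i = 0; i < n; ++i)
  {
    double value = vk[i];
    for (unsigned int j = 0; j < chunk; ++j)
      value -= coeffs[j] * vi[j][i];    // subtract the projection onto v_j
    if (last_chunk)
      vk_dot_vk += value * value;       // norm term only after the last chunk
    vk[i] = value;
  }
  return vk_dot_vk;
}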
1316 
1317 template <typename StringType>
1318 void generate_pipelined_gmres_normalize_vk(StringType & source, std::string const & numeric_string)
1319 {
1320  source.append("__kernel void gmres_normalize_vk( \n");
1321  source.append(" __global "); source.append(numeric_string); source.append(" * vk, \n");
1322  source.append(" unsigned int vk_offset, \n");
1323  source.append(" __global "); source.append(numeric_string); source.append(" const * residual, \n");
1324  source.append(" __global "); source.append(numeric_string); source.append(" * R_buffer, \n");
1325  source.append(" unsigned int R_offset, \n");
1326  source.append(" __global "); source.append(numeric_string); source.append(" const * inner_prod_buffer, \n");
1327  source.append(" unsigned int chunk_size, \n");
1328  source.append(" __global "); source.append(numeric_string); source.append(" * r_dot_vk_buffer, \n");
1329  source.append(" unsigned int chunk_offset, \n");
1330  source.append(" unsigned int size, \n");
1331  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
1332  source.append("{ \n");
1333 
1334  source.append(" "); source.append(numeric_string); source.append(" norm_vk = 0; \n");
1335 
1336  // parallel reduction in work group to compute <vk, vk>
1337  source.append(" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + chunk_size]; \n");
1338  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1339  source.append(" { \n");
1340  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1341  source.append(" if (get_local_id(0) < stride) \n");
1342  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1343  source.append(" } ");
1344 
1345  // compute alpha from reduced values:
1346  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1347  source.append(" norm_vk = sqrt(shared_array[0]); \n");
1348 
1349  source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
1350  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1351  source.append(" "); source.append(numeric_string); source.append(" value_vk = vk[i + vk_offset] / norm_vk; \n");
1352  source.append(" \n");
1353  source.append(" inner_prod_contrib += residual[i] * value_vk; \n");
1354  source.append(" \n");
1355  source.append(" vk[i + vk_offset] = value_vk; \n");
1356  source.append(" } \n");
1357  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1358 
1359  // parallel reduction in work group
1360  source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
1361  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1362  source.append(" { \n");
1363  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1364  source.append(" if (get_local_id(0) < stride) \n");
1365  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1366  source.append(" } ");
1367 
1368  // write results to result array
1369  source.append(" if (get_local_id(0) == 0) \n ");
1370  source.append(" r_dot_vk_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
1371  source.append(" if (get_global_id(0) == 0) \n ");
1372  source.append(" R_buffer[R_offset] = norm_vk; \n");
1373 
1374  source.append("} \n");
1375 
1376 }
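gmres_normalize_vk first reduces the <v_k, v_k> partials left behind by stage 2, then scales v_k by 1/||v_k|| while fusing in the inner product with the residual; ||v_k|| itself becomes the new entry of R. A sequential sketch of the fused loop (assumed names, not library code):

#include <cmath>
#include <cstddef>

static double normalize_and_dot(double * vk, double const * residual,
                                std::size_t n, double vk_dot_vk)
{
  double norm_vk = std::sqrt(vk_dot_vk);  // ||v_k|| from the reduced partials
  double r_dot_vk = 0;
  for (std::size_t i = 0; i < n; ++i)
  {
    double value = vk[i] / norm_vk;       // normalized entry
    r_dot_vk += residual[i] * value;      // fused <r, v_k> contribution
    vk[i] = value;
  }
  return r_dot_vk;                        // norm_vk itself is stored into R_buffer[R_offset]
}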
1377 
1378 template <typename StringType>
1379 void generate_pipelined_gmres_update_result(StringType & source, std::string const & numeric_string)
1380 {
1381  source.append("__kernel void gmres_update_result( \n");
1382  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
1383  source.append(" __global "); source.append(numeric_string); source.append(" const * residual, \n");
1384  source.append(" __global "); source.append(numeric_string); source.append(" const * krylov_basis, \n");
1385  source.append(" unsigned int size, \n");
1386  source.append(" unsigned int internal_size, \n");
1387  source.append(" __global "); source.append(numeric_string); source.append(" const * coefficients, \n");
1388  source.append(" unsigned int k) \n");
1389  source.append("{ \n");
1390 
1391  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1392  source.append(" "); source.append(numeric_string); source.append(" value_result = result[i] + coefficients[0] * residual[i]; \n");
1393  source.append(" \n");
1394  source.append(" for (unsigned int j = 1; j < k; ++j) \n");
1395  source.append(" value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size]; \n");
1396  source.append(" \n");
1397  source.append(" result[i] = value_result; \n");
1398  source.append(" } \n");
1399 
1400  source.append("} \n");
1401 }
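Note the indexing convention in the update: coefficients[0] scales the residual, which plays the role of the first Krylov vector, while coefficients[j] for j >= 1 scales column j-1 of krylov_basis (columns are padded to internal_size). A host-side sketch (assumed names):

#include <cstddef>

static void update_result(double * result, double const * residual,
                          double const * krylov_basis,
                          std::size_t size, std::size_t internal_size,
                          double const * coefficients, unsigned int k)
{
  for (std::size_t i = 0; i < size; ++i)
  {
    double value = result[i] + coefficients[0] * residual[i];
    for (unsigned int j = 1; j < k; ++j)
      value += coefficients[j] * krylov_basis[i + (j - 1) * internal_size];
    result[i] = value;
  }
}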
1402 
1403 
1404 template <typename StringType>
1405 void generate_compressed_matrix_pipelined_gmres_blocked_prod(StringType & source, std::string const & numeric_string)
1406 {
1407  source.append("__kernel void gmres_csr_blocked_prod( \n");
1408  source.append(" __global const unsigned int * row_indices, \n");
1409  source.append(" __global const unsigned int * column_indices, \n");
1410  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1411  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1412  source.append(" unsigned int offset_p, \n");
1413  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1414  source.append(" unsigned int offset_Ap, \n");
1415  source.append(" unsigned int size, \n");
1416  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1417  source.append(" unsigned int buffer_size, \n");
1418  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1419  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1420  source.append("{ \n");
1421  source.append(" cg_csr_blocked_prod(row_indices, column_indices, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1422  source.append("} \n \n");
1423 
1424 }
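This wrapper and the gmres_*_prod kernels that follow contain no matrix code of their own: each shifts p and Ap by the given offsets and forwards to the matching cg_* routine emitted earlier into the same program, so CG and GMRES share one matrix-vector implementation per sparse format. The pattern in plain C++ terms (hypothetical names):

#include <cstddef>

// Stand-in body: the real kernels forward to the cg_* SpMV routines.
static void cg_prod_stub(double const * p, double * Ap, std::size_t n)
{
  for (std::size_t i = 0; i < n; ++i)
    Ap[i] = p[i];                                  // placeholder for the actual SpMV
}

static void gmres_prod_stub(double const * p, unsigned int offset_p,
                            double * Ap, unsigned int offset_Ap, std::size_t n)
{
  cg_prod_stub(p + offset_p, Ap + offset_Ap, n);   // same body, offset views
}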
1425 
1426 template <typename StringType>
1427 void generate_compressed_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1428 {
1429  source.append("__kernel void gmres_csr_prod( \n");
1430  source.append(" __global const unsigned int * row_indices, \n");
1431  source.append(" __global const unsigned int * column_indices, \n");
1432  source.append(" __global const unsigned int * row_blocks, \n");
1433  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1434  source.append(" unsigned int num_blocks, \n");
1435  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1436  source.append(" unsigned int offset_p, \n");
1437  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1438  source.append(" unsigned int offset_Ap, \n");
1439  source.append(" unsigned int size, \n");
1440  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1441  source.append(" unsigned int buffer_size, \n");
1442  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1443  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
1444  source.append(" __local "); source.append(numeric_string); source.append(" * shared_elements) \n");
1445  source.append("{ \n");
1446  source.append(" cg_csr_prod(row_indices, column_indices, row_blocks, elements, num_blocks, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp, shared_elements); \n");
1447  source.append("} \n \n");
1448 
1449 }
1450 
1451 template <typename StringType>
1452 void generate_coordinate_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1453 {
1454  source.append("__kernel void gmres_coo_prod( \n");
1455  source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
1456  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1457  source.append(" __global const uint * group_boundaries, \n");
1458  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1459  source.append(" unsigned int offset_p, \n");
1460  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1461  source.append(" unsigned int offset_Ap, \n");
1462  source.append(" unsigned int size, \n");
1463  source.append(" __local unsigned int * shared_rows, \n");
1464  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
1465  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1466  source.append(" unsigned int buffer_size, \n");
1467  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1468  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1469  source.append("{ \n");
1470  source.append(" cg_coo_prod(coords, elements, group_boundaries, p + offset_p, Ap + offset_Ap, size, shared_rows, inter_results, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1471  source.append("} \n \n");
1472 
1473 }
1474 
1475 
1476 template <typename StringType>
1477 void generate_ell_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1478 {
1479  source.append("__kernel void gmres_ell_prod( \n");
1480  source.append(" __global const unsigned int * coords, \n");
1481  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1482  source.append(" unsigned int internal_row_num, \n");
1483  source.append(" unsigned int items_per_row, \n");
1484  source.append(" unsigned int aligned_items_per_row, \n");
1485  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1486  source.append(" unsigned int offset_p, \n");
1487  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1488  source.append(" unsigned int offset_Ap, \n");
1489  source.append(" unsigned int size, \n");
1490  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1491  source.append(" unsigned int buffer_size, \n");
1492  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1493  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1494  source.append("{ \n");
1495  source.append(" cg_ell_prod(coords, elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1496  source.append("} \n \n");
1497 }
1498 
1499 template <typename StringType>
1500 void generate_sliced_ell_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1501 {
1502  source.append("__kernel void gmres_sliced_ell_prod( \n");
1503  source.append(" __global const unsigned int * columns_per_block, \n");
1504  source.append(" __global const unsigned int * column_indices, \n");
1505  source.append(" __global const unsigned int * block_start, \n");
1506  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1507  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1508  source.append(" unsigned int offset_p, \n");
1509  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1510  source.append(" unsigned int offset_Ap, \n");
1511  source.append(" unsigned int size, \n");
1512  source.append(" unsigned int block_size, \n");
1513  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1514  source.append(" unsigned int buffer_size, \n");
1515  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1516  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1517  source.append("{ \n");
1518  source.append(" cg_sliced_ell_prod(columns_per_block, column_indices, block_start, elements, p + offset_p, Ap + offset_Ap, size, block_size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1519  source.append("} \n \n");
1520 }
1521 
1522 template <typename StringType>
1523 void generate_hyb_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1524 {
1525  source.append("__kernel void gmres_hyb_prod( \n");
1526  source.append(" const __global int* ell_coords, \n");
1527  source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
1528  source.append(" const __global uint* csr_rows, \n");
1529  source.append(" const __global uint* csr_cols, \n");
1530  source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
1531  source.append(" unsigned int internal_row_num, \n");
1532  source.append(" unsigned int items_per_row, \n");
1533  source.append(" unsigned int aligned_items_per_row, \n");
1534  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1535  source.append(" unsigned int offset_p, \n");
1536  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1537  source.append(" unsigned int offset_Ap, \n");
1538  source.append(" unsigned int size, \n");
1539  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1540  source.append(" unsigned int buffer_size, \n");
1541  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1542  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1543  source.append("{ \n");
1544  source.append(" cg_hyb_prod(ell_coords, ell_elements, csr_rows, csr_cols, csr_elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1545  source.append("} \n \n");
1546 }
1547 
1548 
1549 
1550 
1552 
1553 // main kernel class
1554 /** @brief Main kernel class for generating specialized OpenCL kernels for fast iterative solvers. */
1555 template<typename NumericT>
1556 struct iterative
1557 {
1558  static std::string program_name()
1559  {
1560  return viennacl::ocl::type_to_string<NumericT>::apply() + "_iterative";
1561  }
1562 
1563  static void init(viennacl::ocl::context & ctx)
1564  {
1565  static std::map<cl_context, bool> init_done;
1566  if (!init_done[ctx.handle().get()])
1567  {
1568  viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
1569  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
1570 
1571  std::string source;
1572  source.reserve(1024);
1573 
1574  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1575 
1576  generate_pipelined_cg_vector_update(source, numeric_string);
1577  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
1578  generate_compressed_matrix_pipelined_cg_blocked_prod(source, numeric_string, 16);
1579  generate_compressed_matrix_pipelined_cg_prod(source, numeric_string);
1580  generate_coordinate_matrix_pipelined_cg_prod(source, numeric_string);
1581  generate_ell_matrix_pipelined_cg_prod(source, numeric_string);
1582  generate_sliced_ell_matrix_pipelined_cg_prod(source, numeric_string);
1583  generate_hyb_matrix_pipelined_cg_prod(source, numeric_string);
1584 
1585  generate_pipelined_bicgstab_update_s(source, numeric_string);
1586  generate_pipelined_bicgstab_vector_update(source, numeric_string);
1587  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
1588  generate_compressed_matrix_pipelined_bicgstab_blocked_prod(source, numeric_string, 16);
1589  generate_compressed_matrix_pipelined_bicgstab_prod(source, numeric_string);
1590  generate_coordinate_matrix_pipelined_bicgstab_prod(source, numeric_string);
1591  generate_ell_matrix_pipelined_bicgstab_prod(source, numeric_string);
1592  generate_sliced_ell_matrix_pipelined_bicgstab_prod(source, numeric_string);
1593  generate_hyb_matrix_pipelined_bicgstab_prod(source, numeric_string);
1594 
1595  generate_pipelined_gmres_gram_schmidt_stage1(source, numeric_string, ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id); // NVIDIA GPUs require special treatment here
1596  generate_pipelined_gmres_gram_schmidt_stage2(source, numeric_string);
1597  generate_pipelined_gmres_normalize_vk(source, numeric_string);
1598  generate_pipelined_gmres_update_result(source, numeric_string);
1599  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
1600  generate_compressed_matrix_pipelined_gmres_blocked_prod(source, numeric_string);
1601  generate_compressed_matrix_pipelined_gmres_prod(source, numeric_string);
1602  generate_coordinate_matrix_pipelined_gmres_prod(source, numeric_string);
1603  generate_ell_matrix_pipelined_gmres_prod(source, numeric_string);
1604  generate_sliced_ell_matrix_pipelined_gmres_prod(source, numeric_string);
1605  generate_hyb_matrix_pipelined_gmres_prod(source, numeric_string);
1606 
1607  std::string prog_name = program_name();
1608  #ifdef VIENNACL_BUILD_INFO
1609  std::cout << "Creating program " << prog_name << std::endl;
1610  #endif
1611  ctx.add_program(source, prog_name);
1612  init_done[ctx.handle().get()] = true;
1613  } //if
1614  } //init
1615 };
1616 
1617 } // namespace kernels
1618 } // namespace opencl
1619 } // namespace linalg
1620 } // namespace viennacl
1621 #endif
1622 
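Typical usage of the kernel class above, as a minimal sketch (assuming the standard ViennaCL OpenCL backend is set up; error handling omitted): init() compiles the program once per context, after which individual kernels can be fetched by name.

#include "viennacl/ocl/backend.hpp"
#include "viennacl/linalg/opencl/kernels/iterative.hpp"

void setup_iterative_kernels()
{
  viennacl::ocl::context & ctx = viennacl::ocl::current_context();
  viennacl::linalg::opencl::kernels::iterative<float>::init(ctx);  // compiled once per context
  viennacl::ocl::kernel & k
    = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<float>::program_name(),
                     "cg_vector_update");
  (void)k;  // set arguments and enqueue via viennacl::ocl::enqueue(...) as needed
}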