1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
39 template<
typename StringT>
42 source.append(
"float get_element(__global const unsigned int * row_indices, \n");
43 source.append(
" __global const unsigned int * column_indices, \n");
44 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
45 source.append(
" unsigned int row, \n");
46 source.append(
" unsigned int col) \n");
47 source.append(
"{ \n");
48 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
49 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i){ \n");
50 source.append(
" if (column_indices[i] == col) \n");
51 source.append(
" return elements[i]; \n");
52 source.append(
" if (column_indices[i] > col) \n");
53 source.append(
" return 0; \n");
54 source.append(
" } \n");
55 source.append(
" return 0; \n");
56 source.append(
"} \n");
58 source.append(
"void block_assembly(__global const unsigned int * row_indices, \n");
59 source.append(
" __global const unsigned int * column_indices, \n");
60 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
61 source.append(
" __global const unsigned int * matrix_dimensions, \n");
62 source.append(
" __global const unsigned int * set_I, \n");
63 source.append(
" __global const unsigned int * set_J, \n");
64 source.append(
" unsigned int matrix_ind, \n");
65 source.append(
" __global "); source.append(numeric_string); source.append(
" * com_A_I_J) \n");
66 source.append(
"{ \n");
67 source.append(
" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
68 source.append(
" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
70 source.append(
" for (unsigned int i = 0; i < col_n; ++i){ \n");
72 source.append(
" for (unsigned int j = 0; j < row_n; j++){ \n");
73 source.append(
" com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]); \n");
74 source.append(
" } \n");
75 source.append(
" } \n");
76 source.append(
"} \n");
78 source.append(
"__kernel void assemble_blocks( \n");
79 source.append(
" __global const unsigned int * row_indices, \n");
80 source.append(
" __global const unsigned int * column_indices, \n");
81 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
82 source.append(
" __global const unsigned int * set_I, \n");
83 source.append(
" __global const unsigned int * set_J, \n");
84 source.append(
" __global const unsigned int * i_ind, \n");
85 source.append(
" __global const unsigned int * j_ind, \n");
86 source.append(
" __global const unsigned int * block_ind, \n");
87 source.append(
" __global const unsigned int * matrix_dimensions, \n");
88 source.append(
" __global "); source.append(numeric_string); source.append(
" * com_A_I_J, \n");
89 source.append(
" __global unsigned int * g_is_update, \n");
90 source.append(
" unsigned int block_elems_num) \n");
91 source.append(
"{ \n");
92 source.append(
" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
93 source.append(
" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
94 source.append(
" block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]); \n");
95 source.append(
" } \n");
96 source.append(
" } \n");
97 source.append(
" } \n");
100 template<
typename StringT>
103 source.append(
" void assemble_bv(__global "); source.append(numeric_string); source.append(
" * g_bv_r, __global "); source.append(numeric_string); source.append(
" * g_bv, unsigned int col_n){ \n");
104 source.append(
" for (unsigned int i = 0; i < col_n; ++i){ \n");
105 source.append(
" g_bv_r[i] = g_bv[ i]; \n");
106 source.append(
" } \n");
107 source.append(
" } \n");
109 source.append(
" void assemble_bv_block(__global "); source.append(numeric_string); source.append(
" * g_bv_r, __global "); source.append(numeric_string); source.append(
" * g_bv, unsigned int col_n, \n");
110 source.append(
" __global "); source.append(numeric_string); source.append(
" * g_bv_u, unsigned int col_n_u) \n");
111 source.append(
" { \n");
112 source.append(
" assemble_bv(g_bv_r, g_bv, col_n); \n");
113 source.append(
" assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u); \n");
114 source.append(
" } \n");
116 source.append(
" __kernel void block_bv_assembly(__global "); source.append(numeric_string); source.append(
" * g_bv, \n");
117 source.append(
" __global unsigned int * start_bv_ind, \n");
118 source.append(
" __global unsigned int * matrix_dimensions, \n");
119 source.append(
" __global "); source.append(numeric_string); source.append(
" * g_bv_u, \n");
120 source.append(
" __global unsigned int * start_bv_u_ind, \n");
121 source.append(
" __global unsigned int * matrix_dimensions_u, \n");
122 source.append(
" __global "); source.append(numeric_string); source.append(
" * g_bv_r, \n");
123 source.append(
" __global unsigned int * start_bv_r_ind, \n");
124 source.append(
" __global unsigned int * matrix_dimensions_r, \n");
125 source.append(
" __global unsigned int * g_is_update, \n");
126 source.append(
" //__local "); source.append(numeric_string); source.append(
" * local_gb, \n");
127 source.append(
" unsigned int block_elems_num) \n");
128 source.append(
" { \n");
129 source.append(
" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
130 source.append(
" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
131 source.append(
" assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]); \n");
132 source.append(
" } \n");
133 source.append(
" } \n");
134 source.append(
" } \n");
137 template<
typename StringT>
140 source.append(
"void custom_dot_prod_ls(__global "); source.append(numeric_string); source.append(
" * A, unsigned int row_n, __global "); source.append(numeric_string); source.append(
" * v, unsigned int ind, "); source.append(numeric_string); source.append(
" *res){ \n");
141 source.append(
" *res = 0.0; \n");
142 source.append(
" for (unsigned int j = ind; j < row_n; ++j){ \n");
143 source.append(
" if (j == ind){ \n");
144 source.append(
" *res += v[ j]; \n");
145 source.append(
" }else{ \n");
146 source.append(
" *res += A[ j + ind*row_n]*v[ j]; \n");
147 source.append(
" } \n");
148 source.append(
" } \n");
149 source.append(
"} \n");
151 source.append(
"void backwardSolve(__global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(
" * y, __global "); source.append(numeric_string); source.append(
" * x){ \n");
152 source.append(
" for (int i = col_n-1; i >= 0; i--) { \n");
153 source.append(
" x[ i] = y[ i]; \n");
154 source.append(
" for (int j = i+1; j < col_n; ++j) { \n");
155 source.append(
" x[ i] -= R[ i + j*row_n]*x[ j]; \n");
156 source.append(
" } \n");
157 source.append(
" x[i] /= R[ i + i*row_n]; \n");
158 source.append(
" } \n");
159 source.append(
"} \n");
162 source.append(
"void apply_q_trans_vec_ls(__global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, unsigned int col_n, __global const "); source.append(numeric_string); source.append(
" * b_v, __global "); source.append(numeric_string); source.append(
" * y){ \n");
163 source.append(
" "); source.append(numeric_string); source.append(
" inn_prod = 0; \n");
164 source.append(
" for (unsigned int i = 0; i < col_n; ++i){ \n");
165 source.append(
" custom_dot_prod_ls(R, row_n, y, i, &inn_prod); \n");
166 source.append(
" for (unsigned int j = i; j < row_n; ++j){ \n");
167 source.append(
" if (i == j){ \n");
168 source.append(
" y[ j] -= b_v[ i]*inn_prod; \n");
169 source.append(
" } \n");
170 source.append(
" else{ \n");
171 source.append(
" y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n]; \n");
172 source.append(
" } \n");
173 source.append(
" } \n");
174 source.append(
" } \n");
175 source.append(
" } \n");
177 source.append(
"void ls(__global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(
" * b_v, __global "); source.append(numeric_string); source.append(
" * m_v, __global "); source.append(numeric_string); source.append(
" * y_v){ \n");
178 source.append(
" apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v); \n");
179 source.append(
" //m_new - is m_v now \n");
180 source.append(
" backwardSolve(R, row_n, col_n, y_v, m_v); \n");
181 source.append(
"} \n");
183 source.append(
"__kernel void block_least_squares( \n");
184 source.append(
" __global "); source.append(numeric_string); source.append(
" * global_R, \n");
185 source.append(
" __global unsigned int * block_ind, \n");
186 source.append(
" __global "); source.append(numeric_string); source.append(
" * b_v, \n");
187 source.append(
" __global unsigned int * start_bv_inds, \n");
188 source.append(
" __global "); source.append(numeric_string); source.append(
" * m_v, \n");
189 source.append(
" __global "); source.append(numeric_string); source.append(
" * y_v, \n");
190 source.append(
" __global unsigned int * start_y_inds, \n");
191 source.append(
" __global unsigned int * matrix_dimensions, \n");
192 source.append(
" __global unsigned int * g_is_update, \n");
193 source.append(
" unsigned int block_elems_num) \n");
194 source.append(
"{ \n");
195 source.append(
" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
196 source.append(
" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
197 source.append(
" ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] ); \n");
198 source.append(
" } \n");
199 source.append(
" } \n");
200 source.append(
"} \n");
203 template<
typename StringT>
206 source.append(
"void custom_dot_prod(__global "); source.append(numeric_string); source.append(
" * A, unsigned int row_n, __local "); source.append(numeric_string); source.append(
" * v, unsigned int ind, "); source.append(numeric_string); source.append(
" *res){ \n");
207 source.append(
" *res = 0.0; \n");
208 source.append(
" for (unsigned int j = ind; j < row_n; ++j){ \n");
209 source.append(
" if (j == ind){ \n");
210 source.append(
" *res += v[j]; \n");
211 source.append(
" }else{ \n");
212 source.append(
" *res += A[j + ind*row_n]*v[j]; \n");
213 source.append(
" } \n");
214 source.append(
" } \n");
215 source.append(
"} \n");
217 source.append(
"void apply_q_trans_vec(__global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(
" * b_v, __local "); source.append(numeric_string); source.append(
" * y){ \n");
218 source.append(
" "); source.append(numeric_string); source.append(
" inn_prod = 0; \n");
219 source.append(
" for (unsigned int i = 0; i < col_n; ++i){ \n");
220 source.append(
" custom_dot_prod(R, row_n, y, i, &inn_prod); \n");
221 source.append(
" for (unsigned int j = i; j < row_n; ++j){ \n");
222 source.append(
" if (i == j){ \n");
223 source.append(
" y[j] -= b_v[ i]*inn_prod; \n");
224 source.append(
" } \n");
225 source.append(
" else{ \n");
226 source.append(
" y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n]; \n");
227 source.append(
" } \n");
228 source.append(
" } \n");
229 source.append(
" } \n");
230 source.append(
"} \n");
232 source.append(
"void q_mult(__global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(
" * b_v, __local "); source.append(numeric_string); source.append(
" * R_u, unsigned int col_n_u){ \n");
233 source.append(
" for (unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){ \n");
234 source.append(
" apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i); \n");
235 source.append(
" } \n");
236 source.append(
"} \n");
238 source.append(
"void matrix_from_global_to_local(__global "); source.append(numeric_string); source.append(
"* g_M, __local "); source.append(numeric_string); source.append(
"* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
239 source.append(
" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
240 source.append(
" for (unsigned int j = 0; j < row_n; ++j){ \n");
241 source.append(
" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
242 source.append(
" } \n");
243 source.append(
" } \n");
244 source.append(
"} \n");
246 source.append(
"void matrix_from_local_to_global(__global "); source.append(numeric_string); source.append(
"* g_M, __local "); source.append(numeric_string); source.append(
"* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
247 source.append(
" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
248 source.append(
" for (unsigned int j = 0; j < row_n; ++j){ \n");
249 source.append(
" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
250 source.append(
" } \n");
251 source.append(
" } \n");
252 source.append(
"} \n");
254 source.append(
"__kernel void block_q_mult(__global "); source.append(numeric_string); source.append(
" * global_R, \n");
255 source.append(
" __global unsigned int * block_ind, \n");
256 source.append(
" __global "); source.append(numeric_string); source.append(
" * global_R_u, \n");
257 source.append(
" __global unsigned int *block_ind_u, \n");
258 source.append(
" __global "); source.append(numeric_string); source.append(
" * b_v, \n");
259 source.append(
" __global unsigned int * start_bv_inds, \n");
260 source.append(
" __global unsigned int * matrix_dimensions, \n");
261 source.append(
" __global unsigned int * matrix_dimensions_u, \n");
262 source.append(
" __global unsigned int * g_is_update, \n");
263 source.append(
" __local "); source.append(numeric_string); source.append(
" * local_R_u, \n");
264 source.append(
" unsigned int block_elems_num){ \n");
265 source.append(
" for (unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
266 source.append(
" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){ \n");
268 source.append(
" matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]); \n");
269 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
270 source.append(
" q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n");
271 source.append(
" matrix_dimensions_u[2*i + 1]); \n");
272 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
273 source.append(
" matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]); \n");
274 source.append(
" } \n");
275 source.append(
" } \n");
276 source.append(
"} \n");
279 template<
typename StringT>
282 source.append(
"void dot_prod(__local const "); source.append(numeric_string); source.append(
"* A, unsigned int n, unsigned int beg_ind, "); source.append(numeric_string); source.append(
"* res){ \n");
283 source.append(
" *res = 0; \n");
284 source.append(
" for (unsigned int i = beg_ind; i < n; ++i){ \n");
285 source.append(
" *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i]; \n");
286 source.append(
" } \n");
287 source.append(
"} \n");
289 source.append(
"void vector_div(__global "); source.append(numeric_string); source.append(
"* v, unsigned int beg_ind, "); source.append(numeric_string); source.append(
" b, unsigned int n){ \n");
290 source.append(
" for (unsigned int i = beg_ind; i < n; ++i){ \n");
291 source.append(
" v[i] /= b; \n");
292 source.append(
" } \n");
293 source.append(
"} \n");
295 source.append(
"void copy_vector(__local const "); source.append(numeric_string); source.append(
"* A, __global "); source.append(numeric_string); source.append(
"* v, const unsigned int beg_ind, const unsigned int n){ \n");
296 source.append(
" for (unsigned int i = beg_ind; i < n; ++i){ \n");
297 source.append(
" v[i] = A[(beg_ind-1)*n + i]; \n");
298 source.append(
" } \n");
299 source.append(
"} \n");
302 source.append(
"void householder_vector(__local const "); source.append(numeric_string); source.append(
"* A, unsigned int j, unsigned int n, __global "); source.append(numeric_string); source.append(
"* v, __global "); source.append(numeric_string); source.append(
"* b){ \n");
303 source.append(
" "); source.append(numeric_string); source.append(
" sg; \n");
304 source.append(
" dot_prod(A, n, j+1, &sg); \n");
305 source.append(
" copy_vector(A, v, j+1, n); \n");
306 source.append(
" "); source.append(numeric_string); source.append(
" mu; \n");
307 source.append(
" v[j] = 1.0; \n");
309 source.append(
" if (sg == 0){ \n");
310 source.append(
" *b = 0; \n");
311 source.append(
" } \n");
312 source.append(
" else{ \n");
313 source.append(
" mu = sqrt(A[j*n + j]*A[ j*n + j] + sg); \n");
314 source.append(
" if (A[ j*n + j] <= 0){ \n");
315 source.append(
" v[j] = A[ j*n + j] - mu; \n");
316 source.append(
" }else{ \n");
317 source.append(
" v[j] = -sg/(A[ j*n + j] + mu); \n");
318 source.append(
" } \n");
319 source.append(
" *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]); \n");
321 source.append(
" vector_div(v, j, v[j], n); \n");
323 source.append(
" } \n");
324 source.append(
"} \n");
326 source.append(
"void custom_inner_prod(__local const "); source.append(numeric_string); source.append(
"* A, __global "); source.append(numeric_string); source.append(
"* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, "); source.append(numeric_string); source.append(
"* res){ \n");
327 source.append(
" for (unsigned int i = start_ind; i < row_num; ++i){ \n");
328 source.append(
" *res += A[col_ind*row_num + i]*v[i]; \n");
329 source.append(
" } \n");
330 source.append(
"} \n");
332 source.append(
"void apply_householder_reflection(__local "); source.append(numeric_string); source.append(
"* A, unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global "); source.append(numeric_string); source.append(
"* v, "); source.append(numeric_string); source.append(
" b){ \n");
333 source.append(
" "); source.append(numeric_string); source.append(
" in_prod_res; \n");
334 source.append(
" for (unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){ \n");
335 source.append(
" in_prod_res = 0.0; \n");
336 source.append(
" custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res); \n");
337 source.append(
" for (unsigned int j = iter_cnt; j < row_n; ++j){ \n");
338 source.append(
" A[ i*row_n + j] -= b*in_prod_res* v[j]; \n");
339 source.append(
" } \n");
340 source.append(
" } \n");
341 source.append(
"} \n");
343 source.append(
"void store_householder_vector(__local "); source.append(numeric_string); source.append(
"* A, unsigned int ind, unsigned int n, __global "); source.append(numeric_string); source.append(
"* v){ \n");
344 source.append(
" for (unsigned int i = ind; i < n; ++i){ \n");
345 source.append(
" A[ (ind-1)*n + i] = v[i]; \n");
346 source.append(
" } \n");
347 source.append(
"} \n");
349 source.append(
"void single_qr( __local "); source.append(numeric_string); source.append(
"* R, __global unsigned int* matrix_dimensions, __global "); source.append(numeric_string); source.append(
"* b_v, __global "); source.append(numeric_string); source.append(
"* v, unsigned int matrix_ind){ \n");
352 source.append(
" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
353 source.append(
" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
355 source.append(
" if ((col_n == row_n)&&(row_n == 1)){ \n");
356 source.append(
" b_v[0] = 0.0; \n");
357 source.append(
" return; \n");
358 source.append(
" } \n");
359 source.append(
" for (unsigned int i = 0; i < col_n; ++i){ \n");
360 source.append(
" if (get_local_id(0) == 0){ \n");
361 source.append(
" householder_vector(R, i, row_n, v, b_v + i); \n");
362 source.append(
" } \n");
363 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
364 source.append(
" apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]); \n");
365 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
366 source.append(
" if (get_local_id(0) == 0){ \n");
367 source.append(
" if (i < matrix_dimensions[2*matrix_ind]){ \n");
368 source.append(
" store_householder_vector(R, i+1, row_n, v); \n");
369 source.append(
" } \n");
370 source.append(
" } \n");
371 source.append(
" } \n");
372 source.append(
"} \n");
374 source.append(
"void matrix_from_global_to_local_qr(__global "); source.append(numeric_string); source.append(
"* g_M, __local "); source.append(numeric_string); source.append(
"* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
375 source.append(
" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
376 source.append(
" for (unsigned int j = 0; j < row_n; ++j){ \n");
377 source.append(
" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
378 source.append(
" } \n");
379 source.append(
" } \n");
380 source.append(
"} \n");
381 source.append(
"void matrix_from_local_to_global_qr(__global "); source.append(numeric_string); source.append(
"* g_M, __local "); source.append(numeric_string); source.append(
"* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
382 source.append(
" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
383 source.append(
" for (unsigned int j = 0; j < row_n; ++j){ \n");
384 source.append(
" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
385 source.append(
" } \n");
386 source.append(
" } \n");
387 source.append(
"} \n");
390 source.append(
"__kernel void block_qr( \n");
391 source.append(
" __global "); source.append(numeric_string); source.append(
"* R, \n");
392 source.append(
" __global unsigned int* matrix_dimensions, \n");
393 source.append(
" __global "); source.append(numeric_string); source.append(
"* b_v, \n");
394 source.append(
" __global "); source.append(numeric_string); source.append(
"* v, \n");
395 source.append(
" __global unsigned int* start_matrix_inds, \n");
396 source.append(
" __global unsigned int* start_bv_inds, \n");
397 source.append(
" __global unsigned int* start_v_inds, \n");
398 source.append(
" __global unsigned int * g_is_update, \n");
399 source.append(
" __local "); source.append(numeric_string); source.append(
"* local_buff_R, \n");
400 source.append(
" unsigned int block_elems_num){ \n");
401 source.append(
" for (unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
402 source.append(
" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
403 source.append(
" matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
404 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
405 source.append(
" single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i); \n");
406 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
407 source.append(
" matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
408 source.append(
" } \n");
409 source.append(
" } \n");
410 source.append(
"} \n");
413 template<
typename StringT>
416 source.append(
"void assemble_upper_part(__global "); source.append(numeric_string); source.append(
" * R_q, \n");
417 source.append(
" unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(
" * R_u, \n");
418 source.append(
" unsigned int row_n_u, unsigned int col_n_u, \n");
419 source.append(
" unsigned int col_n, unsigned int diff){ \n");
420 source.append(
" for (unsigned int i = 0; i < col_n_q; ++i){ \n");
421 source.append(
" for (unsigned int j = 0; j < diff; ++j){ \n");
422 source.append(
" R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ]; \n");
423 source.append(
" } \n");
424 source.append(
" } \n");
425 source.append(
" } \n");
427 source.append(
"void assemble_lower_part(__global "); source.append(numeric_string); source.append(
" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(
" * R_u_u, \n");
428 source.append(
" unsigned int row_n_u_u, unsigned int col_n_u_u, \n");
429 source.append(
" unsigned int diff){ \n");
430 source.append(
" for (unsigned int i = 0; i < col_n_u_u; ++i){ \n");
431 source.append(
" for (unsigned int j = 0; j < row_n_u_u; ++j){ \n");
432 source.append(
" R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]; \n");
433 source.append(
" } \n");
434 source.append(
" } \n");
435 source.append(
"} \n");
437 source.append(
"void assemble_qr_block(__global "); source.append(numeric_string); source.append(
" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(
" * R_u, unsigned int row_n_u, \n");
438 source.append(
" unsigned int col_n_u, __global "); source.append(numeric_string); source.append(
" * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){ \n");
439 source.append(
" unsigned int diff = row_n_u - col_n; \n");
440 source.append(
" assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
441 source.append(
" if (diff > 0){ \n");
442 source.append(
" assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff); \n");
443 source.append(
" } \n");
444 source.append(
"} \n");
446 source.append(
"__kernel void block_qr_assembly( \n");
447 source.append(
" __global unsigned int * matrix_dimensions, \n");
448 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_u, \n");
449 source.append(
" __global unsigned int * block_ind_u, \n");
450 source.append(
" __global unsigned int * matrix_dimensions_u, \n");
451 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_u_u, \n");
452 source.append(
" __global unsigned int * block_ind_u_u, \n");
453 source.append(
" __global unsigned int * matrix_dimensions_u_u, \n");
454 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_q, \n");
455 source.append(
" __global unsigned int * block_ind_q, \n");
456 source.append(
" __global unsigned int * matrix_dimensions_q, \n");
457 source.append(
" __global unsigned int * g_is_update, \n");
458 source.append(
" //__local "); source.append(numeric_string); source.append(
" * local_R_q, \n");
459 source.append(
" unsigned int block_elems_num) \n");
460 source.append(
"{ \n");
461 source.append(
" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
462 source.append(
" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
463 source.append(
" assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
464 source.append(
" matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
465 source.append(
" } \n");
466 source.append(
" } \n");
467 source.append(
"} \n");
470 template<
typename StringT>
473 source.append(
"void assemble_upper_part_1(__global "); source.append(numeric_string); source.append(
" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(
" * R_u, \n");
474 source.append(
" unsigned int row_n_u, unsigned int col_n_u, \n");
475 source.append(
" unsigned int col_n, unsigned int diff){ \n");
476 source.append(
" for (unsigned int i = 0; i < col_n_q; ++i){ \n");
477 source.append(
" for (unsigned int j = 0; j < diff; ++j){ \n");
478 source.append(
" R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ]; \n");
479 source.append(
" } \n");
480 source.append(
" } \n");
481 source.append(
" } \n");
484 source.append(
"void assemble_qr_block_1(__global "); source.append(numeric_string); source.append(
" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(
" * R_u, unsigned int row_n_u, \n");
485 source.append(
" unsigned int col_n_u, unsigned int col_n){ \n");
486 source.append(
" unsigned int diff = row_n_u - col_n; \n");
487 source.append(
" assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
488 source.append(
"} \n");
490 source.append(
"__kernel void block_qr_assembly_1( \n");
491 source.append(
" __global unsigned int * matrix_dimensions, \n");
492 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_u, \n");
493 source.append(
" __global unsigned int * block_ind_u, \n");
494 source.append(
" __global unsigned int * matrix_dimensions_u, \n");
495 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_q, \n");
496 source.append(
" __global unsigned int * block_ind_q, \n");
497 source.append(
" __global unsigned int * matrix_dimensions_q, \n");
498 source.append(
" __global unsigned int * g_is_update, \n");
499 source.append(
" unsigned int block_elems_num) \n");
500 source.append(
"{ \n");
501 source.append(
" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
502 source.append(
" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
503 source.append(
" assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
504 source.append(
" matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
505 source.append(
" } \n");
506 source.append(
" } \n");
507 source.append(
"} \n");
510 template<
typename StringT>
513 source.append(
"void assemble_r(__global "); source.append(numeric_string); source.append(
" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(
" * R, \n");
514 source.append(
" unsigned int row_n, unsigned int col_n) \n");
515 source.append(
"{ \n");
516 source.append(
" for (unsigned int i = 0; i < col_n; ++i){ \n");
517 source.append(
" for (unsigned int j = 0; j < row_n; ++j){ \n");
518 source.append(
" gR[i*row_n_r + j] = R[i*row_n + j ]; \n");
519 source.append(
" } \n");
520 source.append(
" } \n");
521 source.append(
"} \n");
523 source.append(
"void assemble_r_u(__global "); source.append(numeric_string); source.append(
" * gR, \n");
524 source.append(
" unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(
" * R_u, unsigned int row_n_u, unsigned int col_n_u, \n");
525 source.append(
" unsigned int col_n) \n");
526 source.append(
"{ \n");
527 source.append(
" for (unsigned int i = 0; i < col_n_u; ++i){ \n");
528 source.append(
" for (unsigned int j = 0; j < col_n; ++j){ \n");
529 source.append(
" gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j]; \n");
530 source.append(
" } \n");
531 source.append(
" } \n");
532 source.append(
"} \n");
535 source.append(
"void assemble_r_u_u(__global "); source.append(numeric_string); source.append(
" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(
" * R_u_u, unsigned int row_n_u_u, \n");
536 source.append(
" unsigned int col_n_u_u, unsigned int col_n) \n");
537 source.append(
"{ \n");
538 source.append(
" for (unsigned int i = 0; i < col_n_u_u; ++i){ \n");
539 source.append(
" for (unsigned int j = 0; j < row_n_u_u; ++j){ \n");
540 source.append(
" gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]; \n");
541 source.append(
" } \n");
542 source.append(
" } \n");
543 source.append(
"} \n");
545 source.append(
"void assemble_r_block(__global "); source.append(numeric_string); source.append(
" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(
" * R, unsigned int row_n, \n");
546 source.append(
" unsigned int col_n, __global "); source.append(numeric_string); source.append(
" * R_u, unsigned int row_n_u, unsigned int col_n_u, __global "); source.append(numeric_string); source.append(
" * R_u_u, \n");
547 source.append(
" unsigned int row_n_u_u, unsigned int col_n_u_u){ \n");
548 source.append(
" assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n");
549 source.append(
" assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n); \n");
550 source.append(
" assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n); \n");
551 source.append(
"} \n");
554 source.append(
"__kernel void block_r_assembly( \n");
555 source.append(
" __global "); source.append(numeric_string); source.append(
" * R, \n");
556 source.append(
" __global unsigned int * block_ind, \n");
557 source.append(
" __global unsigned int * matrix_dimensions, \n");
558 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_u, \n");
559 source.append(
" __global unsigned int * block_ind_u, \n");
560 source.append(
" __global unsigned int * matrix_dimensions_u, \n");
561 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_u_u, \n");
562 source.append(
" __global unsigned int * block_ind_u_u, \n");
563 source.append(
" __global unsigned int * matrix_dimensions_u_u, \n");
564 source.append(
" __global "); source.append(numeric_string); source.append(
" * g_R, \n");
565 source.append(
" __global unsigned int * block_ind_r, \n");
566 source.append(
" __global unsigned int * matrix_dimensions_r, \n");
567 source.append(
" __global unsigned int * g_is_update, \n");
568 source.append(
" unsigned int block_elems_num) \n");
569 source.append(
"{ \n");
570 source.append(
" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
571 source.append(
" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
573 source.append(
" assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n");
574 source.append(
" matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], \n");
575 source.append(
" R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]); \n");
577 source.append(
" } \n");
578 source.append(
" } \n");
579 source.append(
"} \n");
586 template<
typename NumericT>
596 static std::map<cl_context, bool> init_done;
603 source.reserve(1024);
605 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
617 #ifdef VIENNACL_BUILD_INFO
618 std::cout <<
"Creating program " << prog_name << std::endl;
620 ctx.add_program(source, prog_name);
621 init_done[ctx.handle().get()] =
true;
Main kernel class for generating OpenCL kernels for the sparse approximate inverse preconditioners...
void generate_spai_block_r_assembly(StringT &source, std::string const &numeric_string)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
void generate_spai_block_qr_assembly(StringT &source, std::string const &numeric_string)
Provides OpenCL-related utilities.
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
void generate_spai_block_q_mult(StringT &source, std::string const &numeric_string)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const
void generate_spai_assemble_blocks(StringT &source, std::string const &numeric_string)
static std::string program_name()
void generate_spai_block_qr(StringT &source, std::string const &numeric_string)
void generate_spai_block_least_squares(StringT &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_spai_block_qr_assembly_1(StringT &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
void generate_spai_block_bv_assembly(StringT &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.