ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
spai.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2015, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 #include "viennacl/tools/tools.hpp"
22 #include "viennacl/ocl/kernel.hpp"
24 #include "viennacl/ocl/utils.hpp"
25 
28 namespace viennacl
29 {
30 namespace linalg
31 {
32 namespace opencl
33 {
34 namespace kernels
35 {
36 
38 
39 template<typename StringT>
40 void generate_spai_assemble_blocks(StringT & source, std::string const & numeric_string)
41 {
42  source.append("float get_element(__global const unsigned int * row_indices, \n");
43  source.append(" __global const unsigned int * column_indices, \n");
44  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
45  source.append(" unsigned int row, \n");
46  source.append(" unsigned int col) \n");
47  source.append("{ \n");
48  source.append(" unsigned int row_end = row_indices[row+1]; \n");
49  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i){ \n");
50  source.append(" if (column_indices[i] == col) \n");
51  source.append(" return elements[i]; \n");
52  source.append(" if (column_indices[i] > col) \n");
53  source.append(" return 0; \n");
54  source.append(" } \n");
55  source.append(" return 0; \n");
56  source.append("} \n");
57 
58  source.append("void block_assembly(__global const unsigned int * row_indices, \n");
59  source.append(" __global const unsigned int * column_indices, \n");
60  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
61  source.append(" __global const unsigned int * matrix_dimensions, \n");
62  source.append(" __global const unsigned int * set_I, \n");
63  source.append(" __global const unsigned int * set_J, \n");
64  source.append(" unsigned int matrix_ind, \n");
65  source.append(" __global "); source.append(numeric_string); source.append(" * com_A_I_J) \n");
66  source.append("{ \n");
67  source.append(" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
68  source.append(" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
69 
70  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
71  //start row index
72  source.append(" for (unsigned int j = 0; j < row_n; j++){ \n");
73  source.append(" com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]); \n");
74  source.append(" } \n");
75  source.append(" } \n");
76  source.append("} \n");
77 
78  source.append("__kernel void assemble_blocks( \n");
79  source.append(" __global const unsigned int * row_indices, \n");
80  source.append(" __global const unsigned int * column_indices, \n");
81  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
82  source.append(" __global const unsigned int * set_I, \n");
83  source.append(" __global const unsigned int * set_J, \n");
84  source.append(" __global const unsigned int * i_ind, \n");
85  source.append(" __global const unsigned int * j_ind, \n");
86  source.append(" __global const unsigned int * block_ind, \n");
87  source.append(" __global const unsigned int * matrix_dimensions, \n");
88  source.append(" __global "); source.append(numeric_string); source.append(" * com_A_I_J, \n");
89  source.append(" __global unsigned int * g_is_update, \n");
90  source.append(" unsigned int block_elems_num) \n");
91  source.append("{ \n");
92  source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
93  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
94  source.append(" block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]); \n");
95  source.append(" } \n");
96  source.append(" } \n");
97  source.append(" } \n");
98 }
99 
100 template<typename StringT>
101 void generate_spai_block_bv_assembly(StringT & source, std::string const & numeric_string)
102 {
103  source.append(" void assemble_bv(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n){ \n");
104  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
105  source.append(" g_bv_r[i] = g_bv[ i]; \n");
106  source.append(" } \n");
107  source.append(" } \n");
108 
109  source.append(" void assemble_bv_block(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n, \n");
110  source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_u, unsigned int col_n_u) \n");
111  source.append(" { \n");
112  source.append(" assemble_bv(g_bv_r, g_bv, col_n); \n");
113  source.append(" assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u); \n");
114  source.append(" } \n");
115 
116  source.append(" __kernel void block_bv_assembly(__global "); source.append(numeric_string); source.append(" * g_bv, \n");
117  source.append(" __global unsigned int * start_bv_ind, \n");
118  source.append(" __global unsigned int * matrix_dimensions, \n");
119  source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_u, \n");
120  source.append(" __global unsigned int * start_bv_u_ind, \n");
121  source.append(" __global unsigned int * matrix_dimensions_u, \n");
122  source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_r, \n");
123  source.append(" __global unsigned int * start_bv_r_ind, \n");
124  source.append(" __global unsigned int * matrix_dimensions_r, \n");
125  source.append(" __global unsigned int * g_is_update, \n");
126  source.append(" //__local "); source.append(numeric_string); source.append(" * local_gb, \n");
127  source.append(" unsigned int block_elems_num) \n");
128  source.append(" { \n");
129  source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
130  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
131  source.append(" assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]); \n");
132  source.append(" } \n");
133  source.append(" } \n");
134  source.append(" } \n");
135 }
136 
137 template<typename StringT>
138 void generate_spai_block_least_squares(StringT & source, std::string const & numeric_string)
139 {
140  source.append("void custom_dot_prod_ls(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __global "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
141  source.append(" *res = 0.0; \n");
142  source.append(" for (unsigned int j = ind; j < row_n; ++j){ \n");
143  source.append(" if (j == ind){ \n");
144  source.append(" *res += v[ j]; \n");
145  source.append(" }else{ \n");
146  source.append(" *res += A[ j + ind*row_n]*v[ j]; \n");
147  source.append(" } \n");
148  source.append(" } \n");
149  source.append("} \n");
150 
151  source.append("void backwardSolve(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * y, __global "); source.append(numeric_string); source.append(" * x){ \n");
152  source.append(" for (int i = col_n-1; i >= 0; i--) { \n");
153  source.append(" x[ i] = y[ i]; \n");
154  source.append(" for (int j = i+1; j < col_n; ++j) { \n");
155  source.append(" x[ i] -= R[ i + j*row_n]*x[ j]; \n");
156  source.append(" } \n");
157  source.append(" x[i] /= R[ i + i*row_n]; \n");
158  source.append(" } \n");
159  source.append("} \n");
160 
161 
162  source.append("void apply_q_trans_vec_ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global const "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * y){ \n");
163  source.append(" "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
164  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
165  source.append(" custom_dot_prod_ls(R, row_n, y, i, &inn_prod); \n");
166  source.append(" for (unsigned int j = i; j < row_n; ++j){ \n");
167  source.append(" if (i == j){ \n");
168  source.append(" y[ j] -= b_v[ i]*inn_prod; \n");
169  source.append(" } \n");
170  source.append(" else{ \n");
171  source.append(" y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n]; \n");
172  source.append(" } \n");
173  source.append(" } \n");
174  source.append(" } \n");
175  source.append(" } \n");
176 
177  source.append("void ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * m_v, __global "); source.append(numeric_string); source.append(" * y_v){ \n");
178  source.append(" apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v); \n");
179  source.append(" //m_new - is m_v now \n");
180  source.append(" backwardSolve(R, row_n, col_n, y_v, m_v); \n");
181  source.append("} \n");
182 
183  source.append("__kernel void block_least_squares( \n");
184  source.append(" __global "); source.append(numeric_string); source.append(" * global_R, \n");
185  source.append(" __global unsigned int * block_ind, \n");
186  source.append(" __global "); source.append(numeric_string); source.append(" * b_v, \n");
187  source.append(" __global unsigned int * start_bv_inds, \n");
188  source.append(" __global "); source.append(numeric_string); source.append(" * m_v, \n");
189  source.append(" __global "); source.append(numeric_string); source.append(" * y_v, \n");
190  source.append(" __global unsigned int * start_y_inds, \n");
191  source.append(" __global unsigned int * matrix_dimensions, \n");
192  source.append(" __global unsigned int * g_is_update, \n");
193  source.append(" unsigned int block_elems_num) \n");
194  source.append("{ \n");
195  source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
196  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
197  source.append(" ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] ); \n");
198  source.append(" } \n");
199  source.append(" } \n");
200  source.append("} \n");
201 }
202 
203 template<typename StringT>
204 void generate_spai_block_q_mult(StringT & source, std::string const & numeric_string)
205 {
206  source.append("void custom_dot_prod(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __local "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
207  source.append(" *res = 0.0; \n");
208  source.append(" for (unsigned int j = ind; j < row_n; ++j){ \n");
209  source.append(" if (j == ind){ \n");
210  source.append(" *res += v[j]; \n");
211  source.append(" }else{ \n");
212  source.append(" *res += A[j + ind*row_n]*v[j]; \n");
213  source.append(" } \n");
214  source.append(" } \n");
215  source.append("} \n");
216 
217  source.append("void apply_q_trans_vec(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * y){ \n");
218  source.append(" "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
219  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
220  source.append(" custom_dot_prod(R, row_n, y, i, &inn_prod); \n");
221  source.append(" for (unsigned int j = i; j < row_n; ++j){ \n");
222  source.append(" if (i == j){ \n");
223  source.append(" y[j] -= b_v[ i]*inn_prod; \n");
224  source.append(" } \n");
225  source.append(" else{ \n");
226  source.append(" y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n]; \n");
227  source.append(" } \n");
228  source.append(" } \n");
229  source.append(" } \n");
230  source.append("} \n");
231 
232  source.append("void q_mult(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * R_u, unsigned int col_n_u){ \n");
233  source.append(" for (unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){ \n");
234  source.append(" apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i); \n");
235  source.append(" } \n");
236  source.append("} \n");
237 
238  source.append("void matrix_from_global_to_local(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
239  source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
240  source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
241  source.append(" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
242  source.append(" } \n");
243  source.append(" } \n");
244  source.append("} \n");
245 
246  source.append("void matrix_from_local_to_global(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
247  source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
248  source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
249  source.append(" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
250  source.append(" } \n");
251  source.append(" } \n");
252  source.append("} \n");
253 
254  source.append("__kernel void block_q_mult(__global "); source.append(numeric_string); source.append(" * global_R, \n");
255  source.append(" __global unsigned int * block_ind, \n");
256  source.append(" __global "); source.append(numeric_string); source.append(" * global_R_u, \n");
257  source.append(" __global unsigned int *block_ind_u, \n");
258  source.append(" __global "); source.append(numeric_string); source.append(" * b_v, \n");
259  source.append(" __global unsigned int * start_bv_inds, \n");
260  source.append(" __global unsigned int * matrix_dimensions, \n");
261  source.append(" __global unsigned int * matrix_dimensions_u, \n");
262  source.append(" __global unsigned int * g_is_update, \n");
263  source.append(" __local "); source.append(numeric_string); source.append(" * local_R_u, \n");
264  source.append(" unsigned int block_elems_num){ \n");
265  source.append(" for (unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
266  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){ \n");
267  //matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
268  source.append(" matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]); \n");
269  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
270  source.append(" q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n");
271  source.append(" matrix_dimensions_u[2*i + 1]); \n");
272  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
273  source.append(" matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]); \n");
274  source.append(" } \n");
275  source.append(" } \n");
276  source.append("} \n");
277 }
278 
279 template<typename StringT>
280 void generate_spai_block_qr(StringT & source, std::string const & numeric_string)
281 {
282  source.append("void dot_prod(__local const "); source.append(numeric_string); source.append("* A, unsigned int n, unsigned int beg_ind, "); source.append(numeric_string); source.append("* res){ \n");
283  source.append(" *res = 0; \n");
284  source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n");
285  source.append(" *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i]; \n");
286  source.append(" } \n");
287  source.append("} \n");
288 
289  source.append("void vector_div(__global "); source.append(numeric_string); source.append("* v, unsigned int beg_ind, "); source.append(numeric_string); source.append(" b, unsigned int n){ \n");
290  source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n");
291  source.append(" v[i] /= b; \n");
292  source.append(" } \n");
293  source.append("} \n");
294 
295  source.append("void copy_vector(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, const unsigned int beg_ind, const unsigned int n){ \n");
296  source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n");
297  source.append(" v[i] = A[(beg_ind-1)*n + i]; \n");
298  source.append(" } \n");
299  source.append("} \n");
300 
301 
302  source.append("void householder_vector(__local const "); source.append(numeric_string); source.append("* A, unsigned int j, unsigned int n, __global "); source.append(numeric_string); source.append("* v, __global "); source.append(numeric_string); source.append("* b){ \n");
303  source.append(" "); source.append(numeric_string); source.append(" sg; \n");
304  source.append(" dot_prod(A, n, j+1, &sg); \n");
305  source.append(" copy_vector(A, v, j+1, n); \n");
306  source.append(" "); source.append(numeric_string); source.append(" mu; \n");
307  source.append(" v[j] = 1.0; \n");
308  //print_contigious_vector(v, v_start_ind, n);
309  source.append(" if (sg == 0){ \n");
310  source.append(" *b = 0; \n");
311  source.append(" } \n");
312  source.append(" else{ \n");
313  source.append(" mu = sqrt(A[j*n + j]*A[ j*n + j] + sg); \n");
314  source.append(" if (A[ j*n + j] <= 0){ \n");
315  source.append(" v[j] = A[ j*n + j] - mu; \n");
316  source.append(" }else{ \n");
317  source.append(" v[j] = -sg/(A[ j*n + j] + mu); \n");
318  source.append(" } \n");
319  source.append(" *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]); \n");
320  //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j]));
321  source.append(" vector_div(v, j, v[j], n); \n");
322  //print_contigious_vector(v, v_start_ind, n);
323  source.append(" } \n");
324  source.append("} \n");
325 
326  source.append("void custom_inner_prod(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, "); source.append(numeric_string); source.append("* res){ \n");
327  source.append(" for (unsigned int i = start_ind; i < row_num; ++i){ \n");
328  source.append(" *res += A[col_ind*row_num + i]*v[i]; \n");
329  source.append(" } \n");
330  source.append("} \n");
331  //
332  source.append("void apply_householder_reflection(__local "); source.append(numeric_string); source.append("* A, unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global "); source.append(numeric_string); source.append("* v, "); source.append(numeric_string); source.append(" b){ \n");
333  source.append(" "); source.append(numeric_string); source.append(" in_prod_res; \n");
334  source.append(" for (unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){ \n");
335  source.append(" in_prod_res = 0.0; \n");
336  source.append(" custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res); \n");
337  source.append(" for (unsigned int j = iter_cnt; j < row_n; ++j){ \n");
338  source.append(" A[ i*row_n + j] -= b*in_prod_res* v[j]; \n");
339  source.append(" } \n");
340  source.append(" } \n");
341  source.append("} \n");
342 
343  source.append("void store_householder_vector(__local "); source.append(numeric_string); source.append("* A, unsigned int ind, unsigned int n, __global "); source.append(numeric_string); source.append("* v){ \n");
344  source.append(" for (unsigned int i = ind; i < n; ++i){ \n");
345  source.append(" A[ (ind-1)*n + i] = v[i]; \n");
346  source.append(" } \n");
347  source.append("} \n");
348 
349  source.append("void single_qr( __local "); source.append(numeric_string); source.append("* R, __global unsigned int* matrix_dimensions, __global "); source.append(numeric_string); source.append("* b_v, __global "); source.append(numeric_string); source.append("* v, unsigned int matrix_ind){ \n");
350  //matrix_dimensions[0] - number of rows
351  //matrix_dimensions[1] - number of columns
352  source.append(" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
353  source.append(" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
354 
355  source.append(" if ((col_n == row_n)&&(row_n == 1)){ \n");
356  source.append(" b_v[0] = 0.0; \n");
357  source.append(" return; \n");
358  source.append(" } \n");
359  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
360  source.append(" if (get_local_id(0) == 0){ \n");
361  source.append(" householder_vector(R, i, row_n, v, b_v + i); \n");
362  source.append(" } \n");
363  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
364  source.append(" apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]); \n");
365  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
366  source.append(" if (get_local_id(0) == 0){ \n");
367  source.append(" if (i < matrix_dimensions[2*matrix_ind]){ \n");
368  source.append(" store_householder_vector(R, i+1, row_n, v); \n");
369  source.append(" } \n");
370  source.append(" } \n");
371  source.append(" } \n");
372  source.append("} \n");
373 
374  source.append("void matrix_from_global_to_local_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
375  source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
376  source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
377  source.append(" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
378  source.append(" } \n");
379  source.append(" } \n");
380  source.append("} \n");
381  source.append("void matrix_from_local_to_global_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
382  source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
383  source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
384  source.append(" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
385  source.append(" } \n");
386  source.append(" } \n");
387  source.append("} \n");
388 
389 
390  source.append("__kernel void block_qr( \n");
391  source.append(" __global "); source.append(numeric_string); source.append("* R, \n");
392  source.append(" __global unsigned int* matrix_dimensions, \n");
393  source.append(" __global "); source.append(numeric_string); source.append("* b_v, \n");
394  source.append(" __global "); source.append(numeric_string); source.append("* v, \n");
395  source.append(" __global unsigned int* start_matrix_inds, \n");
396  source.append(" __global unsigned int* start_bv_inds, \n");
397  source.append(" __global unsigned int* start_v_inds, \n");
398  source.append(" __global unsigned int * g_is_update, \n");
399  source.append(" __local "); source.append(numeric_string); source.append("* local_buff_R, \n");
400  source.append(" unsigned int block_elems_num){ \n");
401  source.append(" for (unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
402  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
403  source.append(" matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
404  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
405  source.append(" single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i); \n");
406  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
407  source.append(" matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
408  source.append(" } \n");
409  source.append(" } \n");
410  source.append("} \n");
411 }
412 
413 template<typename StringT>
414 void generate_spai_block_qr_assembly(StringT & source, std::string const & numeric_string)
415 {
416  source.append("void assemble_upper_part(__global "); source.append(numeric_string); source.append(" * R_q, \n");
417  source.append(" unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n");
418  source.append(" unsigned int row_n_u, unsigned int col_n_u, \n");
419  source.append(" unsigned int col_n, unsigned int diff){ \n");
420  source.append(" for (unsigned int i = 0; i < col_n_q; ++i){ \n");
421  source.append(" for (unsigned int j = 0; j < diff; ++j){ \n");
422  source.append(" R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ]; \n");
423  source.append(" } \n");
424  source.append(" } \n");
425  source.append(" } \n");
426 
427  source.append("void assemble_lower_part(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
428  source.append(" unsigned int row_n_u_u, unsigned int col_n_u_u, \n");
429  source.append(" unsigned int diff){ \n");
430  source.append(" for (unsigned int i = 0; i < col_n_u_u; ++i){ \n");
431  source.append(" for (unsigned int j = 0; j < row_n_u_u; ++j){ \n");
432  source.append(" R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]; \n");
433  source.append(" } \n");
434  source.append(" } \n");
435  source.append("} \n");
436 
437  source.append("void assemble_qr_block(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n");
438  source.append(" unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){ \n");
439  source.append(" unsigned int diff = row_n_u - col_n; \n");
440  source.append(" assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
441  source.append(" if (diff > 0){ \n");
442  source.append(" assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff); \n");
443  source.append(" } \n");
444  source.append("} \n");
445 
446  source.append("__kernel void block_qr_assembly( \n");
447  source.append(" __global unsigned int * matrix_dimensions, \n");
448  source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n");
449  source.append(" __global unsigned int * block_ind_u, \n");
450  source.append(" __global unsigned int * matrix_dimensions_u, \n");
451  source.append(" __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
452  source.append(" __global unsigned int * block_ind_u_u, \n");
453  source.append(" __global unsigned int * matrix_dimensions_u_u, \n");
454  source.append(" __global "); source.append(numeric_string); source.append(" * R_q, \n");
455  source.append(" __global unsigned int * block_ind_q, \n");
456  source.append(" __global unsigned int * matrix_dimensions_q, \n");
457  source.append(" __global unsigned int * g_is_update, \n");
458  source.append(" //__local "); source.append(numeric_string); source.append(" * local_R_q, \n");
459  source.append(" unsigned int block_elems_num) \n");
460  source.append("{ \n");
461  source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
462  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
463  source.append(" assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
464  source.append(" matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
465  source.append(" } \n");
466  source.append(" } \n");
467  source.append("} \n");
468 }
469 
470 template<typename StringT>
471 void generate_spai_block_qr_assembly_1(StringT & source, std::string const & numeric_string)
472 {
473  source.append("void assemble_upper_part_1(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n");
474  source.append(" unsigned int row_n_u, unsigned int col_n_u, \n");
475  source.append(" unsigned int col_n, unsigned int diff){ \n");
476  source.append(" for (unsigned int i = 0; i < col_n_q; ++i){ \n");
477  source.append(" for (unsigned int j = 0; j < diff; ++j){ \n");
478  source.append(" R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ]; \n");
479  source.append(" } \n");
480  source.append(" } \n");
481  source.append(" } \n");
482 
483 
484  source.append("void assemble_qr_block_1(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n");
485  source.append(" unsigned int col_n_u, unsigned int col_n){ \n");
486  source.append(" unsigned int diff = row_n_u - col_n; \n");
487  source.append(" assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
488  source.append("} \n");
489 
490  source.append("__kernel void block_qr_assembly_1( \n");
491  source.append(" __global unsigned int * matrix_dimensions, \n");
492  source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n");
493  source.append(" __global unsigned int * block_ind_u, \n");
494  source.append(" __global unsigned int * matrix_dimensions_u, \n");
495  source.append(" __global "); source.append(numeric_string); source.append(" * R_q, \n");
496  source.append(" __global unsigned int * block_ind_q, \n");
497  source.append(" __global unsigned int * matrix_dimensions_q, \n");
498  source.append(" __global unsigned int * g_is_update, \n");
499  source.append(" unsigned int block_elems_num) \n");
500  source.append("{ \n");
501  source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
502  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
503  source.append(" assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
504  source.append(" matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
505  source.append(" } \n");
506  source.append(" } \n");
507  source.append("} \n");
508 }
509 
510 template<typename StringT>
511 void generate_spai_block_r_assembly(StringT & source, std::string const & numeric_string)
512 {
513  source.append("void assemble_r(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, \n");
514  source.append(" unsigned int row_n, unsigned int col_n) \n");
515  source.append("{ \n");
516  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
517  source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
518  source.append(" gR[i*row_n_r + j] = R[i*row_n + j ]; \n");
519  source.append(" } \n");
520  source.append(" } \n");
521  source.append("} \n");
522 
523  source.append("void assemble_r_u(__global "); source.append(numeric_string); source.append(" * gR, \n");
524  source.append(" unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, \n");
525  source.append(" unsigned int col_n) \n");
526  source.append("{ \n");
527  source.append(" for (unsigned int i = 0; i < col_n_u; ++i){ \n");
528  source.append(" for (unsigned int j = 0; j < col_n; ++j){ \n");
529  source.append(" gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j]; \n");
530  source.append(" } \n");
531  source.append(" } \n");
532  source.append("} \n");
533 
534 
535  source.append("void assemble_r_u_u(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, \n");
536  source.append(" unsigned int col_n_u_u, unsigned int col_n) \n");
537  source.append("{ \n");
538  source.append(" for (unsigned int i = 0; i < col_n_u_u; ++i){ \n");
539  source.append(" for (unsigned int j = 0; j < row_n_u_u; ++j){ \n");
540  source.append(" gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]; \n");
541  source.append(" } \n");
542  source.append(" } \n");
543  source.append("} \n");
544 
545  source.append("void assemble_r_block(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, \n");
546  source.append(" unsigned int col_n, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
547  source.append(" unsigned int row_n_u_u, unsigned int col_n_u_u){ \n");
548  source.append(" assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n");
549  source.append(" assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n); \n");
550  source.append(" assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n); \n");
551  source.append("} \n");
552 
553 
554  source.append("__kernel void block_r_assembly( \n");
555  source.append(" __global "); source.append(numeric_string); source.append(" * R, \n");
556  source.append(" __global unsigned int * block_ind, \n");
557  source.append(" __global unsigned int * matrix_dimensions, \n");
558  source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n");
559  source.append(" __global unsigned int * block_ind_u, \n");
560  source.append(" __global unsigned int * matrix_dimensions_u, \n");
561  source.append(" __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
562  source.append(" __global unsigned int * block_ind_u_u, \n");
563  source.append(" __global unsigned int * matrix_dimensions_u_u, \n");
564  source.append(" __global "); source.append(numeric_string); source.append(" * g_R, \n");
565  source.append(" __global unsigned int * block_ind_r, \n");
566  source.append(" __global unsigned int * matrix_dimensions_r, \n");
567  source.append(" __global unsigned int * g_is_update, \n");
568  source.append(" unsigned int block_elems_num) \n");
569  source.append("{ \n");
570  source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
571  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
572 
573  source.append(" assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n");
574  source.append(" matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], \n");
575  source.append(" R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]); \n");
576 
577  source.append(" } \n");
578  source.append(" } \n");
579  source.append("} \n");
580 }
581 
583 
584 // main kernel class
586 template<typename NumericT>
587 struct spai
588 {
589  static std::string program_name()
590  {
592  }
593 
594  static void init(viennacl::ocl::context & ctx)
595  {
596  static std::map<cl_context, bool> init_done;
597  if (!init_done[ctx.handle().get()])
598  {
600  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
601 
602  std::string source;
603  source.reserve(1024);
604 
605  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
606 
607  generate_spai_assemble_blocks(source, numeric_string);
608  generate_spai_block_bv_assembly(source, numeric_string);
609  generate_spai_block_least_squares(source, numeric_string);
610  generate_spai_block_q_mult(source, numeric_string);
611  generate_spai_block_qr(source, numeric_string);
612  generate_spai_block_qr_assembly(source, numeric_string);
613  generate_spai_block_qr_assembly_1(source, numeric_string);
614  generate_spai_block_r_assembly(source, numeric_string);
615 
616  std::string prog_name = program_name();
617  #ifdef VIENNACL_BUILD_INFO
618  std::cout << "Creating program " << prog_name << std::endl;
619  #endif
620  ctx.add_program(source, prog_name);
621  init_done[ctx.handle().get()] = true;
622  } //if
623  } //init
624 };
625 
626 } // namespace kernels
627 } // namespace opencl
628 } // namespace linalg
629 } // namespace viennacl
630 #endif
631 
Main kernel class for generating OpenCL kernels for the sparse approximate inverse preconditioners...
Definition: spai.hpp:587
Implements a OpenCL platform within ViennaCL.
Various little tools used here and there in ViennaCL.
void generate_spai_block_r_assembly(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:511
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55
void generate_spai_block_qr_assembly(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:414
Provides OpenCL-related utilities.
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:611
void generate_spai_block_q_mult(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:204
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:34
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
const OCL_TYPE & get() const
Definition: handle.hpp:189
void generate_spai_assemble_blocks(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:40
static std::string program_name()
Definition: spai.hpp:589
void generate_spai_block_qr(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:280
void generate_spai_block_least_squares(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:138
Representation of an OpenCL kernel in ViennaCL.
void generate_spai_block_qr_assembly_1(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:471
static void init(viennacl::ocl::context &ctx)
Definition: spai.hpp:594
void generate_spai_block_bv_assembly(StringT &source, std::string const &numeric_string)
Definition: spai.hpp:101
Helper class for converting a type to its string representation.
Definition: utils.hpp:57