00001 #ifndef VIENNACL_LINALG_KERNELS_SPAI_SOURCE_HPP_
00002 #define VIENNACL_LINALG_KERNELS_SPAI_SOURCE_HPP_
00003
00004 namespace viennacl
00005 {
00006 namespace linalg
00007 {
00008 namespace kernels
00009 {
00010 const char * const spai_align1_block_qr_assembly_1 =
00011 "void assemble_upper_part_1(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, \n"
00012 " unsigned int row_n_u, unsigned int col_n_u,\n"
00013 " unsigned int col_n, unsigned int diff){\n"
00014 " for(unsigned int i = 0; i < col_n_q; ++i){\n"
00015 " for(unsigned int j = 0; j < diff; ++j){\n"
00016 " R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ];\n"
00017 " }\n"
00018 " }\n"
00019 " }\n"
00020 "void assemble_qr_block_1(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, unsigned int row_n_u,\n"
00021 " unsigned int col_n_u, unsigned int col_n){\n"
00022 " unsigned int diff = row_n_u - col_n;\n"
00023 " assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff);\n"
00024 "}\n"
00025 "__kernel void block_qr_assembly_1(\n"
00026 " __global unsigned int * matrix_dimensions,\n"
00027 " __global float * R_u,\n"
00028 " __global unsigned int * block_ind_u,\n"
00029 " __global unsigned int * matrix_dimensions_u,\n"
00030 " __global float * R_q,\n"
00031 " __global unsigned int * block_ind_q,\n"
00032 " __global unsigned int * matrix_dimensions_q,\n"
00033 " __global unsigned int * g_is_update,\n"
00034 " //__local float * local_R_q,\n"
00035 " unsigned int block_elems_num) \n"
00036 "{ \n"
00037 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
00038 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
00039 " assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"
00040 " matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]);\n"
00041 " }\n"
00042 " }\n"
00043 "}\n"
00044 ;
00045
00046 const char * const spai_align1_block_qr_assembly =
00047 "void assemble_upper_part(__global float * R_q,\n"
00048 " unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, \n"
00049 " unsigned int row_n_u, unsigned int col_n_u,\n"
00050 " unsigned int col_n, unsigned int diff){\n"
00051 " for(unsigned int i = 0; i < col_n_q; ++i){\n"
00052 " for(unsigned int j = 0; j < diff; ++j){\n"
00053 " R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ];\n"
00054 " }\n"
00055 " }\n"
00056 " }\n"
00057 "void assemble_lower_part(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u_u, \n"
00058 " unsigned int row_n_u_u, unsigned int col_n_u_u, \n"
00059 " unsigned int diff){\n"
00060 " for(unsigned int i = 0; i < col_n_u_u; ++i){\n"
00061 " for(unsigned int j = 0; j < row_n_u_u; ++j){\n"
00062 " R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j];\n"
00063 " }\n"
00064 " } \n"
00065 "}\n"
00066 "void assemble_qr_block(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, unsigned int row_n_u,\n"
00067 " unsigned int col_n_u, __global float * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){\n"
00068 " unsigned int diff = row_n_u - col_n;\n"
00069 " assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff);\n"
00070 " if(diff > 0){\n"
00071 " assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff);\n"
00072 " }\n"
00073 "}\n"
00074 "__kernel void block_qr_assembly(\n"
00075 " __global unsigned int * matrix_dimensions,\n"
00076 " __global float * R_u,\n"
00077 " __global unsigned int * block_ind_u,\n"
00078 " __global unsigned int * matrix_dimensions_u,\n"
00079 " __global float * R_u_u,\n"
00080 " __global unsigned int * block_ind_u_u,\n"
00081 " __global unsigned int * matrix_dimensions_u_u,\n"
00082 " __global float * R_q,\n"
00083 " __global unsigned int * block_ind_q,\n"
00084 " __global unsigned int * matrix_dimensions_q,\n"
00085 " __global unsigned int * g_is_update,\n"
00086 " //__local float * local_R_q,\n"
00087 " unsigned int block_elems_num) \n"
00088 "{ \n"
00089 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
00090 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
00091 " //\n"
00092 " assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"
00093 " matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]);\n"
00094 " }\n"
00095 " }\n"
00096 "}\n"
00097 ;
00098
00099 const char * const spai_align1_block_bv_assembly =
00100 "void assemble_bv(__global float * g_bv_r, __global float * g_bv, unsigned int col_n){\n"
00101 " for(unsigned int i = 0; i < col_n; ++i){\n"
00102 " g_bv_r[i] = g_bv[ i];\n"
00103 " }\n"
00104 "}\n"
00105 "void assemble_bv_block(__global float * g_bv_r, __global float * g_bv, unsigned int col_n,\n"
00106 " __global float * g_bv_u, unsigned int col_n_u)\n"
00107 "{\n"
00108 " assemble_bv(g_bv_r, g_bv, col_n);\n"
00109 " assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u);\n"
00110 " \n"
00111 "}\n"
00112 "__kernel void block_bv_assembly(__global float * g_bv,\n"
00113 " __global unsigned int * start_bv_ind,\n"
00114 " __global unsigned int * matrix_dimensions,\n"
00115 " __global float * g_bv_u,\n"
00116 " __global unsigned int * start_bv_u_ind,\n"
00117 " __global unsigned int * matrix_dimensions_u,\n"
00118 " __global float * g_bv_r,\n"
00119 " __global unsigned int * start_bv_r_ind,\n"
00120 " __global unsigned int * matrix_dimensions_r,\n"
00121 " __global unsigned int * g_is_update,\n"
00122 " //__local float * local_gb,\n"
00123 " unsigned int block_elems_num)\n"
00124 "{ \n"
00125 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
00126 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
00127 " assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]);\n"
00128 " }\n"
00129 " }\n"
00130 "}\n"
00131 ;
00132
00133 const char * const spai_align1_block_least_squares =
00134 "void custom_dot_prod_ls(__global float * A, unsigned int row_n, __global float * v, unsigned int ind, float *res){\n"
00135 " *res = 0.0;\n"
00136 " for(unsigned int j = ind; j < row_n; ++j){\n"
00137 " if(j == ind){\n"
00138 " *res += v[ j];\n"
00139 " }else{\n"
00140 " *res += A[ j + ind*row_n]*v[ j];\n"
00141 " }\n"
00142 " }\n"
00143 " }\n"
00144 "void backwardSolve(__global float * R, unsigned int row_n, unsigned int col_n, __global float * y, __global float * x){\n"
00145 " for (int i = col_n-1; i >= 0 ; i--) {\n"
00146 " x[ i] = y[ i];\n"
00147 " for (int j = i+1; j < col_n; ++j) {\n"
00148 " x[ i] -= R[ i + j*row_n]*x[ j];\n"
00149 " }\n"
00150 " x[i] /= R[ i + i*row_n];\n"
00151 " }\n"
00152 " \n"
00153 "}\n"
00154 " \n"
00155 "void apply_q_trans_vec_ls(__global float * R, unsigned int row_n, unsigned int col_n, __global const float * b_v, __global float * y){\n"
00156 " float inn_prod = 0;\n"
00157 " for(unsigned int i = 0; i < col_n; ++i){\n"
00158 " custom_dot_prod_ls(R, row_n, y, i, &inn_prod);\n"
00159 " for(unsigned int j = i; j < row_n; ++j){\n"
00160 " if(i == j){\n"
00161 " y[ j] -= b_v[ i]*inn_prod;\n"
00162 " }\n"
00163 " else{\n"
00164 " y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n];\n"
00165 " }\n"
00166 " }\n"
00167 " //std::cout<<y<<std::endl;\n"
00168 " }\n"
00169 " }\n"
00170 "void ls(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __global float * m_v, __global float * y_v){\n"
00171 " \n"
00172 " apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v);\n"
00173 " //m_new - is m_v now\n"
00174 " backwardSolve(R, row_n, col_n, y_v, m_v);\n"
00175 "}\n"
00176 "__kernel void block_least_squares(\n"
00177 " __global float * global_R,\n"
00178 " __global unsigned int * block_ind,\n"
00179 " __global float * b_v,\n"
00180 " __global unsigned int * start_bv_inds,\n"
00181 " __global float * m_v,\n"
00182 " __global float * y_v,\n"
00183 " __global unsigned int * start_y_inds,\n"
00184 " __global unsigned int * matrix_dimensions,\n"
00185 " __global unsigned int * g_is_update,\n"
00186 " //__local float * local_R,\n"
00187 " unsigned int block_elems_num) \n"
00188 "{ \n"
00189 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
00190 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
00191 " \n"
00192 " ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] );\n"
00193 " \n"
00194 " }\n"
00195 " }\n"
00196 "}\n"
00197 ;
00198
00199 const char * const spai_align1_block_qr =
00200 "void dot_prod(__local const float* A, unsigned int n, unsigned int beg_ind, float* res){\n"
00201 " *res = 0;\n"
00202 " for(unsigned int i = beg_ind; i < n; ++i){\n"
00203 " *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i];\n"
00204 " }\n"
00205 "}\n"
00206 " \n"
00207 "void vector_div(__global float* v, unsigned int beg_ind, float b, unsigned int n){\n"
00208 " for(unsigned int i = beg_ind; i < n; ++i){\n"
00209 " v[i] /= b;\n"
00210 " }\n"
00211 "}\n"
00212 "void copy_vector(__local const float* A, __global float* v, const unsigned int beg_ind, const unsigned int n){\n"
00213 " for(unsigned int i = beg_ind; i < n; ++i){\n"
00214 " v[i] = A[(beg_ind-1)*n + i];\n"
00215 " }\n"
00216 "}\n"
00217 " \n"
00218 " \n"
00219 "void householder_vector(__local const float* A, unsigned int j, unsigned int n, __global float* v, __global float* b){\n"
00220 " float sg;\n"
00221 " dot_prod(A, n, j+1, &sg); \n"
00222 " copy_vector(A, v, j+1, n);\n"
00223 " float mu;\n"
00224 " v[j] = 1.0;\n"
00225 " //print_contigious_vector(v, v_start_ind, n);\n"
00226 " if(sg == 0){\n"
00227 " *b = 0;\n"
00228 " }\n"
00229 " else{\n"
00230 " mu = sqrt(A[j*n + j]*A[ j*n + j] + sg);\n"
00231 " if(A[ j*n + j] <= 0){\n"
00232 " v[j] = A[ j*n + j] - mu;\n"
00233 " }else{\n"
00234 " v[j] = -sg/(A[ j*n + j] + mu);\n"
00235 " }\n"
00236 " *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]);\n"
00237 " //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j]));\n"
00238 " vector_div(v, j, v[j], n);\n"
00239 " //print_contigious_vector(v, v_start_ind, n);\n"
00240 " }\n"
00241 "}\n"
00242 "void custom_inner_prod(__local const float* A, __global float* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, float* res){\n"
00243 " for(unsigned int i = start_ind; i < row_num; ++i){\n"
00244 " *res += A[col_ind*row_num + i]*v[i]; \n"
00245 " }\n"
00246 "}\n"
00247 "// \n"
00248 "void apply_householder_reflection(__local float* A, unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global float* v, float b){\n"
00249 " float in_prod_res;\n"
00250 " for(unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){\n"
00251 " in_prod_res = 0.0;\n"
00252 " custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res);\n"
00253 " for(unsigned int j = iter_cnt; j < row_n; ++j){\n"
00254 " A[ i*row_n + j] -= b*in_prod_res* v[j];\n"
00255 " }\n"
00256 " }\n"
00257 " \n"
00258 "}\n"
00259 "void store_householder_vector(__local float* A, unsigned int ind, unsigned int n, __global float* v){\n"
00260 " for(unsigned int i = ind; i < n; ++i){\n"
00261 " A[ (ind-1)*n + i] = v[i];\n"
00262 " }\n"
00263 "}\n"
00264 "void single_qr( __local float* R, __global unsigned int* matrix_dimensions, __global float* b_v, __global float* v, unsigned int matrix_ind){\n"
00265 " //matrix_dimensions[0] - number of rows\n"
00266 " //matrix_dimensions[1] - number of columns\n"
00267 " unsigned int col_n = matrix_dimensions[2*matrix_ind + 1];\n"
00268 " unsigned int row_n = matrix_dimensions[2*matrix_ind];\n"
00269 " \n"
00270 " if((col_n == row_n)&&(row_n == 1)){\n"
00271 " b_v[0] = 0.0;\n"
00272 " return;\n"
00273 " }\n"
00274 " for(unsigned int i = 0; i < col_n; ++i){\n"
00275 " if(get_local_id(0) == 0){\n"
00276 " householder_vector(R, i, row_n, v, b_v + i);\n"
00277 " }\n"
00278 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00279 " apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]);\n"
00280 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00281 " if(get_local_id(0) == 0){\n"
00282 " if(i < matrix_dimensions[2*matrix_ind]){\n"
00283 " store_householder_vector(R, i+1, row_n, v);\n"
00284 " }\n"
00285 " }\n"
00286 " }\n"
00287 "}\n"
00288 "void matrix_from_global_to_local_qr(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
00289 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
00290 " for(unsigned int j = 0; j < row_n; ++j){\n"
00291 " l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j];\n"
00292 " }\n"
00293 " }\n"
00294 "}\n"
00295 "void matrix_from_local_to_global_qr(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
00296 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
00297 " for(unsigned int j = 0; j < row_n; ++j){\n"
00298 " g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j];\n"
00299 " }\n"
00300 " }\n"
00301 "}\n"
00302 "__kernel void block_qr(\n"
00303 " __global float* R, \n"
00304 " __global unsigned int* matrix_dimensions, \n"
00305 " __global float* b_v, \n"
00306 " __global float* v, \n"
00307 " __global unsigned int* start_matrix_inds, \n"
00308 " __global unsigned int* start_bv_inds, \n"
00309 " __global unsigned int* start_v_inds,\n"
00310 " __global unsigned int * g_is_update, \n"
00311 " __local float* local_buff_R,\n"
00312 " unsigned int block_elems_num){\n"
00313 " for(unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){\n"
00314 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
00315 " matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);\n"
00316 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00317 " single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i);\n"
00318 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00319 " matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);\n"
00320 " }\n"
00321 " }\n"
00322 "}\n"
00323 ;
00324
00325 const char * const spai_align1_block_r_assembly =
00326 "void assemble_r(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R, \n"
00327 " unsigned int row_n, unsigned int col_n)\n"
00328 "{\n"
00329 " for(unsigned int i = 0; i < col_n; ++i){\n"
00330 " for(unsigned int j = 0; j < row_n; ++j){\n"
00331 " gR[i*row_n_r + j] = R[i*row_n + j ];\n"
00332 " }\n"
00333 " }\n"
00334 "}\n"
00335 "void assemble_r_u(__global float * gR,\n"
00336 " unsigned int row_n_r, unsigned int col_n_r, __global float * R_u, unsigned int row_n_u, unsigned int col_n_u, \n"
00337 " unsigned int col_n)\n"
00338 "{\n"
00339 " for(unsigned int i = 0; i < col_n_u; ++i){\n"
00340 " for(unsigned int j = 0; j < col_n; ++j){\n"
00341 " gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j];\n"
00342 " }\n"
00343 " } \n"
00344 "}\n"
00345 "void assemble_r_u_u(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R_u_u, unsigned int row_n_u_u, \n"
00346 " unsigned int col_n_u_u, unsigned int col_n)\n"
00347 "{\n"
00348 " for(unsigned int i = 0; i < col_n_u_u; ++i){\n"
00349 " for(unsigned int j = 0; j < row_n_u_u; ++j){\n"
00350 " gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j];\n"
00351 " }\n"
00352 " } \n"
00353 "}\n"
00354 "void assemble_r_block(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R, unsigned int row_n, \n"
00355 " unsigned int col_n, __global float * R_u, unsigned int row_n_u, unsigned int col_n_u, __global float * R_u_u, \n"
00356 " unsigned int row_n_u_u, unsigned int col_n_u_u){\n"
00357 " assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n"
00358 " assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n);\n"
00359 " assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n);\n"
00360 "}\n"
00361 "__kernel void block_r_assembly(\n"
00362 " __global float * R,\n"
00363 " __global unsigned int * block_ind,\n"
00364 " __global unsigned int * matrix_dimensions,\n"
00365 " __global float * R_u,\n"
00366 " __global unsigned int * block_ind_u,\n"
00367 " __global unsigned int * matrix_dimensions_u,\n"
00368 " __global float * R_u_u,\n"
00369 " __global unsigned int * block_ind_u_u,\n"
00370 " __global unsigned int * matrix_dimensions_u_u,\n"
00371 " __global float * g_R,\n"
00372 " __global unsigned int * block_ind_r,\n"
00373 " __global unsigned int * matrix_dimensions_r,\n"
00374 " __global unsigned int * g_is_update,\n"
00375 " //__local float * local_gR,\n"
00376 " unsigned int block_elems_num) \n"
00377 "{ \n"
00378 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
00379 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
00380 " \n"
00381 " assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n"
00382 " matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1],\n"
00383 " R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]);\n"
00384 " \n"
00385 " }\n"
00386 " }\n"
00387 "}\n"
00388 ;
00389
00390 const char * const spai_align1_block_q_mult =
00391 "void custom_dot_prod(__global float * A, unsigned int row_n, __local float * v, unsigned int ind, float *res){\n"
00392 " *res = 0.0;\n"
00393 " for(unsigned int j = ind; j < row_n; ++j){\n"
00394 " if(j == ind){\n"
00395 " *res += v[j];\n"
00396 " }else{\n"
00397 " *res += A[j + ind*row_n]*v[j];\n"
00398 " }\n"
00399 " }\n"
00400 " }\n"
00401 "void apply_q_trans_vec(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __local float * y){\n"
00402 " float inn_prod = 0;\n"
00403 " for(unsigned int i = 0; i < col_n; ++i){\n"
00404 " custom_dot_prod(R, row_n, y, i, &inn_prod);\n"
00405 " for(unsigned int j = i; j < row_n; ++j){\n"
00406 " if(i == j){\n"
00407 " y[j] -= b_v[ i]*inn_prod;\n"
00408 " }\n"
00409 " else{\n"
00410 " y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n];\n"
00411 " }\n"
00412 " }\n"
00413 " }\n"
00414 " }\n"
00415 "void q_mult(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __local float * R_u, unsigned int col_n_u){\n"
00416 " for(unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){\n"
00417 " apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i);\n"
00418 " } \n"
00419 "}\n"
00420 "void matrix_from_global_to_local(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
00421 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
00422 " for(unsigned int j = 0; j < row_n; ++j){\n"
00423 " l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j];\n"
00424 " }\n"
00425 " }\n"
00426 "}\n"
00427 "void matrix_from_local_to_global(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
00428 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
00429 " for(unsigned int j = 0; j < row_n; ++j){\n"
00430 " g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j];\n"
00431 " }\n"
00432 " }\n"
00433 "}\n"
00434 "__kernel void block_q_mult(__global float * global_R,\n"
00435 " __global unsigned int * block_ind,\n"
00436 " __global float * global_R_u,\n"
00437 " __global unsigned int *block_ind_u,\n"
00438 " __global float * b_v,\n"
00439 " __global unsigned int * start_bv_inds,\n"
00440 " __global unsigned int * matrix_dimensions,\n"
00441 " __global unsigned int * matrix_dimensions_u,\n"
00442 " __global unsigned int * g_is_update,\n"
00443 " __local float * local_R_u,\n"
00444 " unsigned int block_elems_num){\n"
00445 " for(unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){\n"
00446 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){\n"
00447 " //matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);\n"
00448 " matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]);\n"
00449 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00450 " q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n"
00451 " matrix_dimensions_u[2*i + 1]);\n"
00452 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00453 " matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]);\n"
00454 " }\n"
00455 " }\n"
00456 "}\n"
00457 ;
00458
00459 const char * const spai_align1_assemble_blocks =
00460 "float get_element(__global const unsigned int * row_indices,\n"
00461 " __global const unsigned int * column_indices,\n"
00462 " __global const float * elements,\n"
00463 " unsigned int row,\n"
00464 " unsigned int col\n"
00465 " )\n"
00466 "{\n"
00467 " unsigned int row_end = row_indices[row+1];\n"
00468 " for(unsigned int i = row_indices[row]; i < row_end; ++i){\n"
00469 " if(column_indices[i] == col)\n"
00470 " return elements[i];\n"
00471 " if(column_indices[i] > col)\n"
00472 " return 0.0;\n"
00473 " }\n"
00474 " return 0.0; \n"
00475 "}\n"
00476 "void block_assembly(__global const unsigned int * row_indices,\n"
00477 " __global const unsigned int * column_indices, \n"
00478 " __global const float * elements,\n"
00479 " __global const unsigned int * matrix_dimensions,\n"
00480 " __global const unsigned int * set_I,\n"
00481 " __global const unsigned int * set_J, \n"
00482 " unsigned int matrix_ind,\n"
00483 " __global float * com_A_I_J)\n"
00484 "{\n"
00485 " unsigned int row_n = matrix_dimensions[2*matrix_ind];\n"
00486 " unsigned int col_n = matrix_dimensions[2*matrix_ind + 1];\n"
00487 " \n"
00488 " for(unsigned int i = 0; i < col_n; ++i){\n"
00489 " //start row index\n"
00490 " for(unsigned int j = 0; j < row_n; j++){\n"
00491 " com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]);\n"
00492 " }\n"
00493 " }\n"
00494 " \n"
00495 "}\n"
00496 "__kernel void assemble_blocks(\n"
00497 " __global const unsigned int * row_indices,\n"
00498 " __global const unsigned int * column_indices, \n"
00499 " __global const float * elements,\n"
00500 " __global const unsigned int * set_I,\n"
00501 " __global const unsigned int * set_J,\n"
00502 " __global const unsigned int * i_ind,\n"
00503 " __global const unsigned int * j_ind,\n"
00504 " __global const unsigned int * block_ind,\n"
00505 " __global const unsigned int * matrix_dimensions,\n"
00506 " __global float * com_A_I_J,\n"
00507 " __global unsigned int * g_is_update,\n"
00508 " unsigned int block_elems_num) \n"
00509 "{ \n"
00510 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
00511 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
00512 " \n"
00513 " block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]);\n"
00514 " }\n"
00515 " }\n"
00516 "}\n"
00517 ;
00518
00519 }
00520 }
00521 }
00522 #endif