00001 #ifndef VIENNACL_LINALG_KERNELS_MATRIX_COL_SOURCE_HPP_
00002 #define VIENNACL_LINALG_KERNELS_MATRIX_COL_SOURCE_HPP_
00003
00004 namespace viennacl
00005 {
00006 namespace linalg
00007 {
00008 namespace kernels
00009 {
00010 const char * const matrix_col_align1_unit_lower_triangular_substitute_inplace =
00011 "__kernel void unit_lower_triangular_substitute_inplace(\n"
00012 " __global const float * matrix,\n"
00013 " unsigned int matrix_rows,\n"
00014 " unsigned int matrix_cols,\n"
00015 " unsigned int matrix_internal_rows,\n"
00016 " unsigned int matrix_internal_cols,\n"
00017 " __global float * vector)\n"
00018 "{\n"
00019 " float temp;\n"
00020 " for (int row = 0; row < matrix_rows; ++row)\n"
00021 " {\n"
00022 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
00023 " temp = vector[row];\n"
00024 " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
00025 " vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];\n"
00026 " }\n"
00027 "}\n"
00028 ;
00029
00030 const char * const matrix_col_align1_inplace_sub =
00031 "__kernel void inplace_sub(\n"
00032 " __global float * vec1,\n"
00033 " __global const float * vec2,\n"
00034 " unsigned int size) \n"
00035 "{ \n"
00036 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00037 " vec1[i] -= vec2[i];\n"
00038 "}\n"
00039 ;
00040
00041 const char * const matrix_col_align1_lower_triangular_substitute_inplace =
00042 "__kernel void lower_triangular_substitute_inplace(\n"
00043 " __global const float * matrix,\n"
00044 " unsigned int matrix_rows,\n"
00045 " unsigned int matrix_cols,\n"
00046 " unsigned int matrix_internal_rows,\n"
00047 " unsigned int matrix_internal_cols,\n"
00048 " __global float * vector)\n"
00049 "{\n"
00050 " float temp;\n"
00051 " for (int row = 0; row < matrix_rows; ++row)\n"
00052 " {\n"
00053 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
00054 " if (get_global_id(0) == 0)\n"
00055 " vector[row] /= matrix[row+row*matrix_internal_rows];\n"
00056 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
00057 " temp = vector[row];\n"
00058 " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
00059 " vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];\n"
00060 " }\n"
00061 "}\n"
00062 ;
00063
00064 const char * const matrix_col_align1_trans_vec_mul =
00065 "__kernel void trans_vec_mul(\n"
00066 " __global const float * matrix,\n"
00067 " unsigned int matrix_rows,\n"
00068 " unsigned int matrix_cols,\n"
00069 " unsigned int matrix_internal_rows,\n"
00070 " unsigned int matrix_internal_cols,\n"
00071 " __global const float * vector, \n"
00072 " __global float * result) \n"
00073 "{ \n"
00074 " //row and col indicate indices within transposed matrix\n"
00075 " for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))\n"
00076 " {\n"
00077 " float dot_prod2 = 0.0f;\n"
00078 " for (unsigned int col = 0; col < matrix_rows; ++col)\n"
00079 " dot_prod2 += matrix[row * matrix_internal_rows + col] * vector[col];\n"
00080 " result[row] = dot_prod2;\n"
00081 " }\n"
00082 "}\n"
00083 ;
00084
00085 const char * const matrix_col_align1_rank1_update =
00086 "//perform a rank-1 update of the matrix, i.e. A += x * x^T\n"
00087 "__kernel void rank1_update(\n"
00088 " __global float * matrix,\n"
00089 " unsigned int matrix_rows,\n"
00090 " unsigned int matrix_cols,\n"
00091 " unsigned int matrix_internal_rows,\n"
00092 " unsigned int matrix_internal_cols,\n"
00093 " __global const float * vector1, \n"
00094 " __global const float * vector2) \n"
00095 "{ \n"
00096 " float tmp;\n"
00097 " for (unsigned int row= get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
00098 " {\n"
00099 " tmp = vector1[row];\n"
00100 " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
00101 " matrix[row + col * matrix_internal_rows] += tmp * vector2[col];\n"
00102 " }\n"
00103 "}\n"
00104 ;
00105
00106 const char * const matrix_col_align1_sub =
00107 "__kernel void sub(\n"
00108 " __global const float * vec1,\n"
00109 " __global const float * vec2, \n"
00110 " __global float * result,\n"
00111 " unsigned int size)\n"
00112 "{ \n"
00113 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00114 " result[i] = vec1[i] - vec2[i];\n"
00115 "}\n"
00116 ;
00117
00118 const char * const matrix_col_align1_trans_unit_upper_triangular_substitute_inplace =
00119 "//transposed lower triangular matrix\n"
00120 "__kernel void trans_unit_upper_triangular_substitute_inplace(\n"
00121 " __global const float * matrix, \n"
00122 " unsigned int matrix_rows,\n"
00123 " unsigned int matrix_cols,\n"
00124 " unsigned int matrix_internal_rows,\n"
00125 " unsigned int matrix_internal_cols,\n"
00126 " __global float * vector) \n"
00127 "{ \n"
00128 " float temp; \n"
00129 " for (int row = matrix_rows-1; row > -1; --row) \n"
00130 " { \n"
00131 " barrier(CLK_GLOBAL_MEM_FENCE); \n"
00132 " \n"
00133 " temp = vector[row]; \n"
00134 " //eliminate column with index 'row' in parallel: \n"
00135 " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
00136 " vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; \n"
00137 " } \n"
00138 " \n"
00139 "}\n"
00140 ;
00141
00142 const char * const matrix_col_align1_lu_factorize =
00143 "__kernel void lu_factorize(\n"
00144 " __global float * matrix,\n"
00145 " unsigned int matrix_rows,\n"
00146 " unsigned int matrix_cols,\n"
00147 " unsigned int matrix_internal_rows,\n"
00148 " unsigned int matrix_internal_cols) \n"
00149 "{ \n"
00150 " float temp;\n"
00151 " for (unsigned int i=1; i<matrix_rows; ++i)\n"
00152 " {\n"
00153 " for (unsigned int k=0; k<i; ++k)\n"
00154 " {\n"
00155 " if (get_global_id(0) == 0)\n"
00156 " matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows];\n"
00157 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
00158 " temp = matrix[i + k*matrix_internal_rows];\n"
00159 " \n"
00160 " //parallel subtraction:\n"
00161 " for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0))\n"
00162 " matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows];\n"
00163 " }\n"
00164 " }\n"
00165 "} \n"
00166 ;
00167
00168 const char * const matrix_col_align1_add =
00169 "__kernel void add(\n"
00170 " __global const float * vec1,\n"
00171 " __global const float * vec2, \n"
00172 " __global float * result,\n"
00173 " unsigned int size) \n"
00174 "{ \n"
00175 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00176 " result[i] = vec1[i] + vec2[i];\n"
00177 "}\n"
00178 ;
00179
00180 const char * const matrix_col_align1_fft_direct =
00181 "// Direct FFT computation (quadratic complexity - use for reference only)\n"
00182 "__kernel void fft_direct(__global float2* input,\n"
00183 " __global float2* output,\n"
00184 " unsigned int size,\n"
00185 " unsigned int stride,\n"
00186 " unsigned int batch_num,\n"
00187 " float sign) {\n"
00188 " \n"
00189 " const float NUM_PI = 3.14159265358979323846;\n"
00190 " \n"
00191 " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
00192 " for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {\n"
00193 " float2 f = 0.0f;\n"
00194 " for(unsigned int n = 0; n < size; n++) {\n"
00195 " float2 in = input[n * stride + batch_id]; //input index here\n"
00196 " float sn, cs;\n"
00197 " float arg = sign * 2 * NUM_PI * k / size * n;\n"
00198 " sn = sincos(arg, &cs);\n"
00199 " float2 ex = (float2)(cs, sn);\n"
00200 " f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);\n"
00201 " }\n"
00202 " output[k * stride + batch_id] = f;// output index here\n"
00203 " }\n"
00204 " }\n"
00205 "}\n"
00206 ;
00207
00208 const char * const matrix_col_align1_vec_mul =
00209 "__kernel void vec_mul(\n"
00210 " __global const float * matrix,\n"
00211 " unsigned int matrix_rows,\n"
00212 " unsigned int matrix_cols,\n"
00213 " unsigned int matrix_internal_rows,\n"
00214 " unsigned int matrix_internal_cols,\n"
00215 " __global const float * vector, \n"
00216 " __global float * result) \n"
00217 "{ \n"
00218 " for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
00219 " {\n"
00220 " float dot_prod = 0.0f;\n"
00221 " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
00222 " dot_prod += matrix[row + col*matrix_internal_rows] * vector[col];\n"
00223 " result[row] = dot_prod;\n"
00224 " }\n"
00225 "}\n"
00226 ;
00227
00228 const char * const matrix_col_align1_fft_radix2_local =
00229 "unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {\n"
00230 " v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
00231 " v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
00232 " v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
00233 " v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
00234 " v = (v >> 16) | (v << 16);\n"
00235 " v = v >> (32 - bit_size);\n"
00236 " return v;\n"
00237 "}\n"
00238 "__kernel void fft_radix2_local(__global float2* input,\n"
00239 " __local float2* lcl_input,\n"
00240 " unsigned int bit_size,\n"
00241 " unsigned int size,\n"
00242 " unsigned int stride,\n"
00243 " unsigned int batch_num,\n"
00244 " float sign) {\n"
00245 " unsigned int grp_id = get_group_id(0);\n"
00246 " unsigned int grp_num = get_num_groups(0);\n"
00247 " unsigned int lcl_sz = get_local_size(0);\n"
00248 " unsigned int lcl_id = get_local_id(0);\n"
00249 " const float NUM_PI = 3.14159265358979323846;\n"
00250 " for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {\n"
00251 " //unsigned int base_offset = stride * batch_id;\n"
00252 " //copy chunk of global memory to local\n"
00253 " for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
00254 " unsigned int v = get_reorder_num(p, bit_size);\n"
00255 " lcl_input[v] = input[p * stride + batch_id];//index\n"
00256 " }\n"
00257 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00258 " //performs Cooley-Tukey FFT on local array\n"
00259 " for(unsigned int s = 0; s < bit_size; s++) {\n"
00260 " unsigned int ss = 1 << s;\n"
00261 " float cs, sn;\n"
00262 " for(unsigned int tid = lcl_id; tid < size; tid += lcl_sz) {\n"
00263 " unsigned int group = (tid & (ss - 1));\n"
00264 " unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
00265 " float2 in1 = lcl_input[pos];\n"
00266 " float2 in2 = lcl_input[pos + ss];\n"
00267 " float arg = group * sign * NUM_PI / ss;\n"
00268 " sn = sincos(arg, &cs);\n"
00269 " float2 ex = (float2)(cs, sn);\n"
00270 " float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
00271 " lcl_input[pos + ss] = in1 - tmp;\n"
00272 " lcl_input[pos] = in1 + tmp;\n"
00273 " }\n"
00274 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00275 " }\n"
00276 " //copy local array back to global memory\n"
00277 " for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
00278 " input[p * stride + batch_id] = lcl_input[p];//index\n"
00279 " }\n"
00280 " }\n"
00281 "}\n"
00282 ;
00283
00284 const char * const matrix_col_align1_trans_lower_triangular_substitute_inplace =
00285 "__kernel void trans_lower_triangular_substitute_inplace(\n"
00286 " __global const float * matrix,\n"
00287 " unsigned int matrix_rows,\n"
00288 " unsigned int matrix_cols,\n"
00289 " unsigned int matrix_internal_rows,\n"
00290 " unsigned int matrix_internal_cols,\n"
00291 " __global float * vector)\n"
00292 "{\n"
00293 " float temp;\n"
00294 " for (int row = 0; row < matrix_rows; ++row)\n"
00295 " {\n"
00296 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
00297 " if (get_global_id(0) == 0)\n"
00298 " vector[row] /= matrix[row+row*matrix_internal_rows];\n"
00299 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
00300 " temp = vector[row];\n"
00301 " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
00302 " vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];\n"
00303 " }\n"
00304 "}\n"
00305 ;
00306
00307 const char * const matrix_col_align1_inplace_divide =
00308 "__kernel void inplace_divide(\n"
00309 " __global float * vec,\n"
00310 " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
00311 " unsigned int size) \n"
00312 "{ \n"
00313 " float factor = *fac;\n"
00314 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00315 " vec[i] /= factor;\n"
00316 "}\n"
00317 ;
00318
00319 const char * const matrix_col_align1_trans_upper_triangular_substitute_inplace =
00320 "//transposed lower triangular matrix\n"
00321 "__kernel void trans_upper_triangular_substitute_inplace(\n"
00322 " __global const float * matrix, \n"
00323 " unsigned int matrix_rows,\n"
00324 " unsigned int matrix_cols,\n"
00325 " unsigned int matrix_internal_rows,\n"
00326 " unsigned int matrix_internal_cols,\n"
00327 " __global float * vector) \n"
00328 "{ \n"
00329 " float temp; \n"
00330 " for (int row = matrix_rows-1; row > -1; --row) \n"
00331 " { \n"
00332 " barrier(CLK_GLOBAL_MEM_FENCE); \n"
00333 " if (get_global_id(0) == 0) \n"
00334 " vector[row] /= matrix[row + row*matrix_internal_rows]; \n"
00335 " \n"
00336 " barrier(CLK_GLOBAL_MEM_FENCE); \n"
00337 " temp = vector[row]; \n"
00338 " //eliminate column with index 'row' in parallel: \n"
00339 " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
00340 " vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; \n"
00341 " } \n"
00342 " \n"
00343 "}\n"
00344 ;
00345
00346 const char * const matrix_col_align1_unit_upper_triangular_substitute_inplace =
00347 "__kernel void unit_upper_triangular_substitute_inplace( \n"
00348 " __global const float * matrix, \n"
00349 " unsigned int matrix_rows,\n"
00350 " unsigned int matrix_cols,\n"
00351 " unsigned int matrix_internal_rows,\n"
00352 " unsigned int matrix_internal_cols,\n"
00353 " __global float * vector) \n"
00354 "{ \n"
00355 " float temp; \n"
00356 " for (int row = matrix_rows-1; row > -1; --row) \n"
00357 " { \n"
00358 " barrier(CLK_GLOBAL_MEM_FENCE); \n"
00359 " \n"
00360 " temp = vector[row]; \n"
00361 " //eliminate column with index 'row' in parallel: \n"
00362 " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
00363 " vector[elim] -= temp * matrix[elim + row * matrix_internal_rows]; \n"
00364 " } \n"
00365 " \n"
00366 "}\n"
00367 ;
00368
00369 const char * const matrix_col_align1_inplace_add =
00370 "__kernel void inplace_add(\n"
00371 " __global float * A,\n"
00372 " unsigned int A_row_start,\n"
00373 " unsigned int A_col_start,\n"
00374 " unsigned int A_row_size,\n"
00375 " unsigned int A_col_size,\n"
00376 " unsigned int A_internal_rows,\n"
00377 " unsigned int A_internal_cols,\n"
00378 " __global const float * B, \n"
00379 " unsigned int B_row_start,\n"
00380 " unsigned int B_col_start,\n"
00381 " unsigned int B_row_size,\n"
00382 " unsigned int B_col_size,\n"
00383 " unsigned int B_internal_rows,\n"
00384 " unsigned int B_internal_cols)\n"
00385 "{ \n"
00386 " if ( get_global_id(0) < A_row_size\n"
00387 " && get_global_id(1) < A_col_size\n"
00388 " )\n"
00389 " A[ (get_global_id(0) + A_row_start)\n"
00390 " + (get_global_id(1) + A_col_start) * A_internal_rows] \n"
00391 " += B[ (get_global_id(0) + B_row_start)\n"
00392 " + (get_global_id(1) + B_col_start) * B_internal_rows];\n"
00393 "}\n"
00394 ;
00395
00396 const char * const matrix_col_align1_trans_unit_lower_triangular_substitute_inplace =
00397 "\n"
00398 "__kernel void trans_unit_lower_triangular_substitute_inplace(\n"
00399 " __global const float * matrix,\n"
00400 " unsigned int matrix_rows,\n"
00401 " unsigned int matrix_cols,\n"
00402 " unsigned int matrix_internal_rows,\n"
00403 " unsigned int matrix_internal_cols,\n"
00404 " __global float * vector)\n"
00405 "{\n"
00406 " float temp;\n"
00407 " for (int row = 0; row < matrix_rows; ++row)\n"
00408 " {\n"
00409 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
00410 "\n"
00411 " temp = vector[row];\n"
00412 "\n"
00413 " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
00414 " vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];\n"
00415 " }\n"
00416 "}\n"
00417 "\n"
00418 "\n"
00419 ;
00420
00421 const char * const matrix_col_align1_scaled_rank1_update =
00422 "__kernel void scaled_rank1_update(\n"
00423 " __global float * matrix,\n"
00424 " unsigned int matrix_rows,\n"
00425 " unsigned int matrix_cols,\n"
00426 " unsigned int matrix_internal_rows,\n"
00427 " unsigned int matrix_internal_cols,\n"
00428 " float val,\n"
00429 " __global const float * vector1, \n"
00430 " __global const float * vector2) \n"
00431 "{ \n"
00432 " float tmp;\n"
00433 " for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
00434 " {\n"
00435 " tmp = val * vector1[row];\n"
00436 " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
00437 " matrix[row + col*matrix_internal_rows] += tmp * vector2[col];\n"
00438 " }\n"
00439 "}\n"
00440 ;
00441
00442 const char * const matrix_col_align1_clear =
00443 "__kernel void clear(\n"
00444 " __global float * vec,\n"
00445 " unsigned int size) \n"
00446 "{ \n"
00447 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00448 " vec[i] = 0;\n"
00449 "}\n"
00450 ;
00451
00452 const char * const matrix_col_align1_fft_radix2 =
00453 "__kernel void fft_radix2(__global float2* input,\n"
00454 " unsigned int s,\n"
00455 " unsigned int bit_size,\n"
00456 " unsigned int size,\n"
00457 " unsigned int stride,\n"
00458 " unsigned int batch_num,\n"
00459 " float sign) {\n"
00460 " unsigned int ss = 1 << s;\n"
00461 " unsigned int half_size = size >> 1;\n"
00462 " float cs, sn;\n"
00463 " const float NUM_PI = 3.14159265358979323846;\n"
00464 " unsigned int glb_id = get_global_id(0);\n"
00465 " unsigned int glb_sz = get_global_size(0);\n"
00466 " \n"
00467 " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
00468 " for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {\n"
00469 " unsigned int group = (tid & (ss - 1));\n"
00470 " unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
00471 " unsigned int offset = pos * stride + batch_id;\n"
00472 " float2 in1 = input[offset];//index\n"
00473 " float2 in2 = input[offset + ss * stride];//index\n"
00474 " float arg = group * sign * NUM_PI / ss;\n"
00475 " sn = sincos(arg, &cs);\n"
00476 " float2 ex = (float2)(cs, sn);\n"
00477 " float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
00478 " input[offset + ss * stride] = in1 - tmp;//index\n"
00479 " input[offset] = in1 + tmp;//index\n"
00480 " }\n"
00481 " }\n"
00482 "}\n"
00483 ;
00484
00485 const char * const matrix_col_align1_cpu_inplace_mult =
00486 "__kernel void cpu_inplace_mult(\n"
00487 " __global float * vec,\n"
00488 " float factor, \n"
00489 " unsigned int size) \n"
00490 "{ \n"
00491 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00492 " vec[i] *= factor;\n"
00493 "}\n"
00494 ;
00495
00496 const char * const matrix_col_align1_fft_reorder =
00497 "/*\n"
00498 "* Performs reordering of input data in bit-reversal order\n"
00499 "* Probably it's better to do in host side,\n"
00500 "*/\n"
00501 "unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {\n"
00502 " v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
00503 " v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
00504 " v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
00505 " v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
00506 " v = (v >> 16) | (v << 16);\n"
00507 " v = v >> (32 - bit_size);\n"
00508 " return v;\n"
00509 "}\n"
00510 "__kernel void fft_reorder(__global float2* input,\n"
00511 " unsigned int bit_size,\n"
00512 " unsigned int size,\n"
00513 " unsigned int stride,\n"
00514 " int batch_num) {\n"
00515 " unsigned int glb_id = get_global_id(0);\n"
00516 " unsigned int glb_sz = get_global_size(0);\n"
00517 " \n"
00518 " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
00519 " for(unsigned int i = glb_id; i < size; i += glb_sz) {\n"
00520 " unsigned int v = get_reorder_num_2(i, bit_size);\n"
00521 " if(i < v) {\n"
00522 " float2 tmp = input[i * stride + batch_id]; // index\n"
00523 " input[i * stride + batch_id] = input[v * stride + batch_id]; //index\n"
00524 " input[v * stride + batch_id] = tmp; //index\n"
00525 " }\n"
00526 " }\n"
00527 " }\n"
00528 "}\n"
00529 ;
00530
00531 const char * const matrix_col_align1_inplace_mult =
00532 "__kernel void inplace_mult(\n"
00533 " __global float * vec,\n"
00534 " __global const float * fac, \n"
00535 " unsigned int size) \n"
00536 "{ \n"
00537 " float factor = *fac;\n"
00538 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00539 " vec[i] *= factor;\n"
00540 "}\n"
00541 ;
00542
00543 const char * const matrix_col_align1_upper_triangular_substitute_inplace =
00544 "__kernel void upper_triangular_substitute_inplace( \n"
00545 " __global const float * matrix, \n"
00546 " unsigned int matrix_rows,\n"
00547 " unsigned int matrix_cols,\n"
00548 " unsigned int matrix_internal_rows,\n"
00549 " unsigned int matrix_internal_cols,\n"
00550 " __global float * vector) \n"
00551 "{ \n"
00552 " float temp; \n"
00553 " for (int row = matrix_rows-1; row > -1; --row) \n"
00554 " { \n"
00555 " barrier(CLK_GLOBAL_MEM_FENCE); \n"
00556 " if (get_global_id(0) == 0) \n"
00557 " vector[row] /= matrix[row + row*matrix_internal_rows]; \n"
00558 " \n"
00559 " barrier(CLK_GLOBAL_MEM_FENCE); \n"
00560 " temp = vector[row]; \n"
00561 " //eliminate column with index 'row' in parallel: \n"
00562 " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
00563 " vector[elim] -= temp * matrix[elim + row * matrix_internal_rows]; \n"
00564 " } \n"
00565 " \n"
00566 "}\n"
00567 ;
00568
00569 }
00570 }
00571 }
00572 #endif