00001 #ifndef VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
00002 #define VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
00003
00004 namespace viennacl
00005 {
00006 namespace linalg
00007 {
00008 namespace kernels
00009 {
00010 const char * const vector_align16_inplace_sub =
00011 "__kernel void inplace_sub(\n"
00012 " __global float16 * vec1,\n"
00013 " unsigned int start1,\n"
00014 " unsigned int size1,\n"
00015 " __global const float16 * vec2,\n"
00016 " unsigned int start2,\n"
00017 " unsigned int size2) \n"
00018 "{ \n"
00019 " unsigned int i_end = size1/16;\n"
00020 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00021 " vec1[i+start1] -= vec2[i+start2];\n"
00022 "}\n"
00023 ;
00024
00025 const char * const vector_align16_mult =
00026 "__kernel void mult(\n"
00027 " __global const float16 * vec,\n"
00028 " unsigned int start1,\n"
00029 " unsigned int size1,\n"
00030 " __global const float * fac, \n"
00031 " __global float16 * result,\n"
00032 " unsigned int start2,\n"
00033 " unsigned int size2) \n"
00034 "{ \n"
00035 " float factor = *fac;\n"
00036 " unsigned int i_end = size1/16;\n"
00037 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00038 " result[i+start2] = vec[i+start1] * factor;\n"
00039 "}\n"
00040 ;
00041
00042 const char * const vector_align16_sub =
00043 "__kernel void sub(\n"
00044 " __global const float16 * vec1,\n"
00045 " unsigned int start1,\n"
00046 " unsigned int size1,\n"
00047 " __global const float16 * vec2, \n"
00048 " unsigned int start2,\n"
00049 " unsigned int size2,\n"
00050 " __global float16 * result,\n"
00051 " unsigned int start3,\n"
00052 " unsigned int size3)\n"
00053 "{ \n"
00054 " unsigned int i_end = size1 / 16;\n"
00055 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00056 " result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
00057 "}\n"
00058 ;
00059
00060 const char * const vector_align16_cpu_inplace_mul =
00061 "\n"
00062 "__kernel void cpu_inplace_mult(\n"
00063 " __global float16 * vec,\n"
00064 " unsigned int start1,\n"
00065 " unsigned int size1,\n"
00066 " float factor) \n"
00067 "{ \n"
00068 " unsigned int i_end = size1/16;\n"
00069 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00070 " vec[i+start1] *= factor;\n"
00071 "}\n"
00072 "\n"
00073 ;
00074
00075 const char * const vector_align16_add =
00076 "__kernel void add(\n"
00077 " __global const float16 * vec1,\n"
00078 " unsigned int start1,\n"
00079 " unsigned int size1,\n"
00080 " __global const float16 * vec2, \n"
00081 " unsigned int start2,\n"
00082 " unsigned int size2,\n"
00083 " __global float16 * result,\n"
00084 " unsigned int start3,\n"
00085 " unsigned int size3)\n"
00086 "{ \n"
00087 " unsigned int i_end = size/16;\n"
00088 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00089 " result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
00090 "}\n"
00091 ;
00092
00093 const char * const vector_align16_cpu_mult =
00094 "__kernel void cpu_mult(\n"
00095 " __global const float16 * vec,\n"
00096 " unsigned int start1,\n"
00097 " unsigned int size1,\n"
00098 " float factor, \n"
00099 " __global float16 * result,\n"
00100 " unsigned int start2,\n"
00101 " unsigned int size2) \n"
00102 "{ \n"
00103 " unsigned int i_end = size1/16;\n"
00104 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00105 " result[i+start2] = vec[i+start1] * factor;\n"
00106 "}\n"
00107 ;
00108
00109 const char * const vector_align16_inplace_divide =
00110 "__kernel void inplace_divide(\n"
00111 " __global float16 * vec,\n"
00112 " unsigned int start1,\n"
00113 " unsigned int size1,\n"
00114 " __global const float * fac) //note: CPU variant is mapped to prod_scalar\n"
00115 "{ \n"
00116 " float factor = *fac;\n"
00117 " unsigned int i_end = size1/16;\n"
00118 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00119 " vec[i+start1] /= factor;\n"
00120 "}\n"
00121 ;
00122
00123 const char * const vector_align16_inplace_add =
00124 "__kernel void inplace_add(\n"
00125 " __global float16 * vec1,\n"
00126 " unsigned int start1,\n"
00127 " unsigned int size1,\n"
00128 " __global const float16 * vec2,\n"
00129 " unsigned int start2,\n"
00130 " unsigned int size2) \n"
00131 "{ \n"
00132 " unsigned int i_end = size1/16;\n"
00133 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00134 " vec1[i+start1] += vec2[i+start2];\n"
00135 "}\n"
00136 ;
00137
00138 const char * const vector_align16_divide =
00139 "//Note: 'div' cannot be used because of complaints by the jit-compiler\n"
00140 "__kernel void divide(\n"
00141 " __global const float16 * vec,\n"
00142 " unsigned int start1,\n"
00143 " unsigned int size1,\n"
00144 " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
00145 " __global float16 * result,\n"
00146 " unsigned int start2,\n"
00147 " unsigned int size2) \n"
00148 "{ \n"
00149 " float factor = *fac;\n"
00150 " unsigned int i_end = size/16;\n"
00151 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00152 " result[i+start2] = vec[i+start1] / factor;\n"
00153 "}\n"
00154 ;
00155
00156 const char * const vector_align16_inplace_mult =
00157 "__kernel void inplace_mult(\n"
00158 " __global float16 * vec,\n"
00159 " unsigned int start1,\n"
00160 " unsigned int size1,\n"
00161 " __global const float * fac) \n"
00162 "{ \n"
00163 " float factor = *fac;\n"
00164 " unsigned int i_end = size1/16;\n"
00165 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00166 " vec[i+start1] *= factor;\n"
00167 "}\n"
00168 ;
00169
00170 const char * const vector_align4_inplace_div_add =
00171 "__kernel void inplace_div_add(\n"
00172 " __global float4 * vec1,\n"
00173 " unsigned int start1,\n"
00174 " unsigned int size1,\n"
00175 " __global const float4 * vec2,\n"
00176 " unsigned int start2,\n"
00177 " unsigned int size2,\n"
00178 " __global const float * fac) //CPU variant is mapped to mult_add\n"
00179 "{ \n"
00180 " float factor = *fac;\n"
00181 " unsigned int i_end = size1 / 4;\n"
00182 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00183 " vec1[i+start1] -= vec2[i+start2] / factor;\n"
00184 "}\n"
00185 ;
00186
00187 const char * const vector_align4_cpu_mul_add =
00188 "__kernel void cpu_mul_add(\n"
00189 " __global const float4 * vec1,\n"
00190 " unsigned int start1,\n"
00191 " unsigned int size1,\n"
00192 " float factor,\n"
00193 " __global const float4 * vec2,\n"
00194 " unsigned int start2,\n"
00195 " unsigned int size2,\n"
00196 " __global float4 * result,\n"
00197 " unsigned int start3,\n"
00198 " unsigned int size3) \n"
00199 "{ \n"
00200 " unsigned int i_end = size1/4;\n"
00201 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00202 " result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
00203 "}\n"
00204 ;
00205
00206 const char * const vector_align4_inplace_mul_sub =
00207 "__kernel void inplace_mul_sub(\n"
00208 " __global float4 * vec1,\n"
00209 " unsigned int start1,\n"
00210 " unsigned int size1,\n"
00211 " __global const float4 * vec2,\n"
00212 " unsigned int start2,\n"
00213 " unsigned int size2,\n"
00214 " __global const float * fac) //CPU variant is mapped to mult_add\n"
00215 "{ \n"
00216 " float factor = *fac;\n"
00217 " unsigned int i_end = size/4;\n"
00218 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00219 " vec1[i+start1] -= vec2[i+start2] * factor;\n"
00220 "}\n"
00221 ;
00222
00223 const char * const vector_align4_inplace_mul_add =
00224 "__kernel void inplace_mul_add(\n"
00225 " __global float4 * vec1,\n"
00226 " unsigned int start1,\n"
00227 " unsigned int size1,\n"
00228 " __global const float4 * vec2,\n"
00229 " unsigned int start2,\n"
00230 " unsigned int size2,\n"
00231 " __global const float * fac) \n"
00232 "{ \n"
00233 " float factor = *fac;\n"
00234 " unsigned int size_div_4 = size1/4;\n"
00235 " for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))\n"
00236 " vec1[i+start1] += vec2[i+start2] * factor;\n"
00237 "}\n"
00238 ;
00239
00240 const char * const vector_align4_mul_add =
00241 "__kernel void mul_add(\n"
00242 " __global const float4 * vec1,\n"
00243 " unsigned int start1,\n"
00244 " unsigned int size1,\n"
00245 " __global const float * fac,\n"
00246 " __global const float4 * vec2,\n"
00247 " unsigned int start2,\n"
00248 " unsigned int size2,\n"
00249 " __global float4 * result,\n"
00250 " unsigned int start3,\n"
00251 " unsigned int size3) \n"
00252 "{ \n"
00253 " float factor = *fac;\n"
00254 " unsigned int i_end = size1/4;\n"
00255 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00256 " result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
00257 "}\n"
00258 ;
00259
00260 const char * const vector_align4_cpu_inplace_mul_add =
00261 "__kernel void cpu_inplace_mul_add(\n"
00262 " __global float4 * vec1,\n"
00263 " unsigned int start1,\n"
00264 " unsigned int size1,\n"
00265 " __global const float4 * vec2,\n"
00266 " unsigned int start2,\n"
00267 " unsigned int size2,\n"
00268 " float factor) \n"
00269 "{ \n"
00270 " unsigned int i_end = size1/4;\n"
00271 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00272 " vec1[i+start1] += vec2[i+start2] * factor;\n"
00273 "}\n"
00274 ;
00275
00276 const char * const vector_align4_inplace_div_sub =
00277 "__kernel void inplace_div_sub(\n"
00278 " __global float4 * vec1,\n"
00279 " unsigned int start1,\n"
00280 " unsigned int size1,\n"
00281 " __global const float4 * vec2,\n"
00282 " unsigned int start2,\n"
00283 " unsigned int size2,\n"
00284 " __global const float * fac) //CPU variant is mapped to mult_add\n"
00285 "{ \n"
00286 " float factor = *fac;\n"
00287 " unsigned int i_end = size1/4;\n"
00288 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00289 " vec1[i+start1] -= vec2[i+start2] / factor;\n"
00290 "}\n"
00291 ;
00292
00293 const char * const vector_align1_inplace_sub =
00294 "__kernel void inplace_sub(\n"
00295 " __global float * vec1,\n"
00296 " unsigned int start1,\n"
00297 " unsigned int size1,\n"
00298 " __global const float * vec2,\n"
00299 " unsigned int start2,\n"
00300 " unsigned int size2) \n"
00301 "{ \n"
00302 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00303 " vec1[i+start1] -= vec2[i+start2];\n"
00304 "}\n"
00305 ;
00306
00307 const char * const vector_align1_norm_inf =
00308 "\n"
00309 "////// norm_inf\n"
00310 "float impl_norm_inf(\n"
00311 " __global const float * vec,\n"
00312 " unsigned int start_index,\n"
00313 " unsigned int end_index,\n"
00314 " __local float * tmp_buffer)\n"
00315 "{\n"
00316 " float tmp = 0;\n"
00317 " for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00318 " tmp = fmax(fabs(vec[i]), tmp);\n"
00319 " tmp_buffer[get_local_id(0)] = tmp;\n"
00320 " \n"
00321 " //step 2: parallel reduction:\n"
00322 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00323 " {\n"
00324 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00325 " if (get_global_id(0) < stride)\n"
00326 " tmp_buffer[get_global_id(0)] = fmax(tmp_buffer[get_global_id(0)], tmp_buffer[get_global_id(0)+stride]);\n"
00327 " }\n"
00328 " \n"
00329 " return tmp_buffer[0];\n"
00330 "}\n"
00331 "\n"
00332 "__kernel void norm_inf(\n"
00333 " __global const float * vec,\n"
00334 " unsigned int start1,\n"
00335 " unsigned int size1,\n"
00336 " __local float * tmp_buffer,\n"
00337 " global float * group_buffer)\n"
00338 "{\n"
00339 " float tmp = impl_norm_inf(vec,\n"
00340 " ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
00341 " ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
00342 " tmp_buffer);\n"
00343 " \n"
00344 " if (get_local_id(0) == 0)\n"
00345 " group_buffer[get_group_id(0)] = tmp; \n"
00346 "}\n"
00347 ;
00348
00349 const char * const vector_align1_index_norm_inf =
00350 "//index_norm_inf:\n"
00351 "unsigned int float_vector1_index_norm_inf_impl(\n"
00352 " __global const float * vec,\n"
00353 " unsigned int start1,\n"
00354 " unsigned int size1,\n"
00355 " __local float * float_buffer,\n"
00356 " __local unsigned int * index_buffer)\n"
00357 "{\n"
00358 " //step 1: fill buffer:\n"
00359 " float cur_max = 0.0f;\n"
00360 " float tmp;\n"
00361 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00362 " {\n"
00363 " tmp = fabs(vec[i+start1]);\n"
00364 " if (cur_max < tmp)\n"
00365 " {\n"
00366 " float_buffer[get_global_id(0)] = tmp;\n"
00367 " index_buffer[get_global_id(0)] = i;\n"
00368 " cur_max = tmp;\n"
00369 " }\n"
00370 " }\n"
00371 " \n"
00372 " //step 2: parallel reduction:\n"
00373 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00374 " {\n"
00375 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00376 " if (get_global_id(0) < stride)\n"
00377 " {\n"
00378 " //find the first occurring index\n"
00379 " if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])\n"
00380 " {\n"
00381 " index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];\n"
00382 " float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];\n"
00383 " }\n"
00384 " \n"
00385 " //index_buffer[get_global_id(0)] = float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride] ? index_buffer[get_global_id(0)+stride] : index_buffer[get_global_id(0)];\n"
00386 " //float_buffer[get_global_id(0)] = max(float_buffer[get_global_id(0)], float_buffer[get_global_id(0)+stride]);\n"
00387 " }\n"
00388 " }\n"
00389 " \n"
00390 " return index_buffer[0];\n"
00391 "}\n"
00392 "\n"
00393 "__kernel void index_norm_inf(\n"
00394 " __global float * vec,\n"
00395 " unsigned int start1,\n"
00396 " unsigned int size1,\n"
00397 " __local float * float_buffer,\n"
00398 " __local unsigned int * index_buffer,\n"
00399 " global unsigned int * result) \n"
00400 "{ \n"
00401 " unsigned int tmp = float_vector1_index_norm_inf_impl(vec, start1, size1, float_buffer, index_buffer);\n"
00402 " if (get_global_id(0) == 0) *result = tmp;\n"
00403 "}\n"
00404 "\n"
00405 "\n"
00406 ;
00407
00408 const char * const vector_align1_mult =
00409 "__kernel void mult(\n"
00410 " __global const float * vec,\n"
00411 " unsigned int start1,\n"
00412 " unsigned int size1,\n"
00413 " __global const float * fac, \n"
00414 " __global float * result,\n"
00415 " unsigned int start3,\n"
00416 " unsigned int size3) \n"
00417 "{ \n"
00418 " float factor = *fac;\n"
00419 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00420 " result[i+start3] = vec[i+start1] * factor;\n"
00421 "}\n"
00422 ;
00423
00424 const char * const vector_align1_swap =
00425 "////// swap:\n"
00426 "__kernel void swap(\n"
00427 " __global float * vec1,\n"
00428 " unsigned int start1,\n"
00429 " unsigned int size1,\n"
00430 " __global float * vec2,\n"
00431 " unsigned int start2,\n"
00432 " unsigned int size2\n"
00433 " ) \n"
00434 "{ \n"
00435 " float tmp;\n"
00436 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00437 " {\n"
00438 " tmp = vec2[i+start2];\n"
00439 " vec2[i+start2] = vec1[i+start1];\n"
00440 " vec1[i+start1] = tmp;\n"
00441 " }\n"
00442 "}\n"
00443 " \n"
00444 ;
00445
00446 const char * const vector_align1_inplace_div_add =
00447 "///// divide add:\n"
00448 "__kernel void inplace_div_add(\n"
00449 " __global float * vec1,\n"
00450 " unsigned int start1,\n"
00451 " unsigned int size1,\n"
00452 " __global const float * vec2,\n"
00453 " unsigned int start2,\n"
00454 " unsigned int size2,\n"
00455 " __global const float * fac) //CPU variant is mapped to mult_add\n"
00456 "{ \n"
00457 " float factor = *fac;\n"
00458 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00459 " vec1[i+start1] -= vec2[i+start2] / factor;\n"
00460 "}\n"
00461 ;
00462
00463 const char * const vector_align1_norm_2 =
00464 "//helper:\n"
00465 "void helper_norm2_parallel_reduction( __local float * tmp_buffer )\n"
00466 "{\n"
00467 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00468 " {\n"
00469 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00470 " if (get_global_id(0) < stride)\n"
00471 " tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n"
00472 " }\n"
00473 "}\n"
00474 "\n"
00475 "////// norm_2\n"
00476 "float impl_norm_2(\n"
00477 " __global const float * vec,\n"
00478 " unsigned int start_index,\n"
00479 " unsigned int end_index,\n"
00480 " __local float * tmp_buffer)\n"
00481 "{\n"
00482 " float tmp = 0;\n"
00483 " float vec_entry = 0;\n"
00484 " for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00485 " {\n"
00486 " vec_entry = vec[i];\n"
00487 " tmp += vec_entry * vec_entry;\n"
00488 " }\n"
00489 " tmp_buffer[get_local_id(0)] = tmp;\n"
00490 " \n"
00491 " helper_norm2_parallel_reduction(tmp_buffer);\n"
00492 " \n"
00493 " return tmp_buffer[0];\n"
00494 "};\n"
00495 "\n"
00496 "__kernel void norm_2(\n"
00497 " __global const float * vec,\n"
00498 " unsigned int start1,\n"
00499 " unsigned int size1,\n"
00500 " __local float * tmp_buffer,\n"
00501 " global float * group_buffer)\n"
00502 "{\n"
00503 " float tmp = impl_norm_2(vec,\n"
00504 " ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
00505 " ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
00506 " tmp_buffer);\n"
00507 " \n"
00508 " if (get_local_id(0) == 0)\n"
00509 " group_buffer[get_group_id(0)] = tmp; \n"
00510 "}\n"
00511 "\n"
00512 ;
00513
00514 const char * const vector_align1_sub =
00515 "__kernel void sub(\n"
00516 " __global const float * vec1,\n"
00517 " unsigned int start1,\n"
00518 " unsigned int size1,\n"
00519 " __global const float * vec2, \n"
00520 " unsigned int start2,\n"
00521 " unsigned int size2,\n"
00522 " __global float * result,\n"
00523 " unsigned int start3,\n"
00524 " unsigned int size3)\n"
00525 "{ \n"
00526 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00527 " result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
00528 "}\n"
00529 ;
00530
00531 const char * const vector_align1_cpu_mul_add =
00532 "__kernel void cpu_mul_add(\n"
00533 " __global const float * vec1,\n"
00534 " unsigned int start1,\n"
00535 " unsigned int size1,\n"
00536 " float factor,\n"
00537 " __global const float * vec2,\n"
00538 " unsigned int start2,\n"
00539 " unsigned int size2,\n"
00540 " __global float * result,\n"
00541 " unsigned int start3,\n"
00542 " unsigned int size3\n"
00543 " ) \n"
00544 "{ \n"
00545 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00546 " result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
00547 "}\n"
00548 ;
00549
00550 const char * const vector_align1_vmax =
00551 "__kernel void vmax(\n"
00552 " __global float * vec1,\n"
00553 " unsigned int start1,\n"
00554 " unsigned int size1,\n"
00555 " __global float * result) \n"
00556 "{ \n"
00557 " //parallel reduction on global memory (make sure that size is a power of 2)\n"
00558 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00559 " {\n"
00560 " if (get_global_id(0) < stride)\n"
00561 " vec1[get_global_id(0)+start1] = fmax(vec1[get_global_id(0)+start1+stride], vec1[get_global_id(0)+start1]);\n"
00562 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
00563 " }\n"
00564 " \n"
00565 " if (get_global_id(0) == 0)\n"
00566 " *result = vec1[start1];\n"
00567 "}\n"
00568 ;
00569
00570 const char * const vector_align1_inner_prod =
00571 "//helper:\n"
00572 "void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )\n"
00573 "{\n"
00574 " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
00575 " {\n"
00576 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00577 " if (get_local_id(0) < stride)\n"
00578 " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
00579 " }\n"
00580 "}\n"
00581 "//////// inner products:\n"
00582 "float impl_inner_prod(\n"
00583 " __global const float * vec1,\n"
00584 " unsigned int start1,\n"
00585 " unsigned int size1,\n"
00586 " __global const float * vec2,\n"
00587 " unsigned int start2,\n"
00588 " unsigned int size2,\n"
00589 " __local float * tmp_buffer)\n"
00590 "{\n"
00591 " float tmp = 0;\n"
00592 " for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))\n"
00593 " tmp += vec1[i+start1] * vec2[i+start2];\n"
00594 " tmp_buffer[get_local_id(0)] = tmp;\n"
00595 " \n"
00596 " helper_inner_prod_parallel_reduction(tmp_buffer);\n"
00597 " \n"
00598 " return tmp_buffer[0];\n"
00599 "}\n"
00600 "__kernel void inner_prod(\n"
00601 " __global const float * vec1,\n"
00602 " unsigned int start1,\n"
00603 " unsigned int size1,\n"
00604 " __global const float * vec2,\n"
00605 " unsigned int start2,\n"
00606 " unsigned int size2,\n"
00607 " __local float * tmp_buffer,\n"
00608 " global float * group_buffer)\n"
00609 "{\n"
00610 " float tmp = impl_inner_prod(vec1,\n"
00611 " ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
00612 " ((get_group_id(0) + 1) * size1) / get_num_groups(0) - ( get_group_id(0) * size1) / get_num_groups(0),\n"
00613 " vec2,\n"
00614 " ( get_group_id(0) * size2) / get_num_groups(0) + start2,\n"
00615 " ((get_group_id(0) + 1) * size2) / get_num_groups(0) - ( get_group_id(0) * size2) / get_num_groups(0),\n"
00616 " tmp_buffer);\n"
00617 " \n"
00618 " if (get_local_id(0) == 0)\n"
00619 " group_buffer[get_group_id(0)] = tmp;\n"
00620 " \n"
00621 "}\n"
00622 ;
00623
00624 const char * const vector_align1_add =
00625 "__kernel void add(\n"
00626 " __global const float * vec1,\n"
00627 " unsigned int start1,\n"
00628 " unsigned int size1,\n"
00629 " __global const float * vec2,\n"
00630 " unsigned int start2,\n"
00631 " unsigned int size2,\n"
00632 " __global float * result,\n"
00633 " unsigned int start3,\n"
00634 " unsigned int size3) \n"
00635 "{ \n"
00636 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00637 " result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
00638 "}\n"
00639 ;
00640
00641 const char * const vector_align1_plane_rotation =
00642 "////// plane rotation: (x,y) <- (\alpha x + \beta y, -\beta x + \alpha y)\n"
00643 "__kernel void plane_rotation(\n"
00644 " __global float * vec1,\n"
00645 " unsigned int start1,\n"
00646 " unsigned int size1,\n"
00647 " __global float * vec2, \n"
00648 " unsigned int start2,\n"
00649 " unsigned int size2,\n"
00650 " float alpha,\n"
00651 " float beta) \n"
00652 "{ \n"
00653 " float tmp1 = 0;\n"
00654 " float tmp2 = 0;\n"
00655 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00656 " {\n"
00657 " tmp1 = vec1[i+start1];\n"
00658 " tmp2 = vec2[i+start2];\n"
00659 " \n"
00660 " vec1[i+start1] = alpha * tmp1 + beta * tmp2;\n"
00661 " vec2[i+start2] = alpha * tmp2 - beta * tmp1;\n"
00662 " }\n"
00663 "}\n"
00664 ;
00665
00666 const char * const vector_align1_inplace_mul_sub =
00667 "__kernel void inplace_mul_sub(\n"
00668 " __global float * vec1,\n"
00669 " unsigned int start1,\n"
00670 " unsigned int size1,\n"
00671 " __global const float * vec2,\n"
00672 " unsigned int start2,\n"
00673 " unsigned int size2,\n"
00674 " __global const float * fac) //CPU variant is mapped to mult_add\n"
00675 "{ \n"
00676 " float factor = *fac;\n"
00677 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00678 " vec1[i+start1] -= vec2[i+start2] * factor;\n"
00679 "}\n"
00680 ;
00681
00682 const char * const vector_align1_inplace_mul_add =
00683 "__kernel void inplace_mul_add(\n"
00684 " __global float * vec1,\n"
00685 " unsigned int start1,\n"
00686 " unsigned int size1,\n"
00687 " __global const float * vec2,\n"
00688 " unsigned int start2,\n"
00689 " unsigned int size2,\n"
00690 " __global const float * fac) \n"
00691 "{ \n"
00692 " float factor = *fac;\n"
00693 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00694 " vec1[i+start1] += vec2[i+start2] * factor;\n"
00695 "}\n"
00696 ;
00697
00698 const char * const vector_align1_mul_add =
00699 "__kernel void mul_add(\n"
00700 " __global const float * vec1,\n"
00701 " unsigned int start1,\n"
00702 " unsigned int size1,\n"
00703 " __global const float * fac,\n"
00704 " __global const float * vec2,\n"
00705 " unsigned int start2,\n"
00706 " unsigned int size2,\n"
00707 " __global float * result,\n"
00708 " unsigned int start3,\n"
00709 " unsigned int size3\n"
00710 " ) \n"
00711 "{ \n"
00712 " float factor = *fac;\n"
00713 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00714 " result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
00715 "}\n"
00716 ;
00717
00718 const char * const vector_align1_cpu_mult =
00719 "__kernel void cpu_mult(\n"
00720 " __global const float * vec,\n"
00721 " unsigned int start1,\n"
00722 " unsigned int size1,\n"
00723 " float factor, \n"
00724 " __global float * result,\n"
00725 " unsigned int start2,\n"
00726 " unsigned int size2) \n"
00727 "{ \n"
00728 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00729 " result[i+start2] = vec[i+start1] * factor;\n"
00730 "}\n"
00731 ;
00732
00733 const char * const vector_align1_inplace_divide =
00734 "__kernel void inplace_divide(\n"
00735 " __global float * vec,\n"
00736 " unsigned int start1,\n"
00737 " unsigned int size1,\n"
00738 " __global const float * fac) //note: CPU variant is mapped to prod_scalar\n"
00739 "{ \n"
00740 " float factor = *fac;\n"
00741 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00742 " vec[i+start1] /= factor;\n"
00743 "}\n"
00744 ;
00745
00746 const char * const vector_align1_sqrt_sum =
00747 "__kernel void sqrt_sum(\n"
00748 " __global float * vec1,\n"
00749 " unsigned int start1,\n"
00750 " unsigned int size1,\n"
00751 " __global float * result) \n"
00752 "{ \n"
00753 " //parallel reduction on global memory: (make sure get_global_size(0) is a power of 2)\n"
00754 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00755 " {\n"
00756 " if (get_global_id(0) < stride)\n"
00757 " vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
00758 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
00759 " }\n"
00760 " \n"
00761 " if (get_global_id(0) == 0)\n"
00762 " *result = sqrt(vec1[start1]);\n"
00763 " \n"
00764 "}\n"
00765 ;
00766
00767 const char * const vector_align1_cpu_inplace_mul_add =
00768 "__kernel void cpu_inplace_mul_add(\n"
00769 " __global float * vec1,\n"
00770 " unsigned int start1,\n"
00771 " unsigned int size1,\n"
00772 " __global const float * vec2,\n"
00773 " unsigned int start2,\n"
00774 " unsigned int size2,\n"
00775 " float factor) \n"
00776 "{ \n"
00777 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00778 " vec1[i+start1] += vec2[i+start2] * factor;\n"
00779 "}\n"
00780 ;
00781
00782 const char * const vector_align1_inplace_add =
00783 "__kernel void inplace_add(\n"
00784 " __global float * vec1,\n"
00785 " unsigned int start1,\n"
00786 " unsigned int size1,\n"
00787 " __global const float * vec2,\n"
00788 " unsigned int start2,\n"
00789 " unsigned int size2) \n"
00790 "{ \n"
00791 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00792 " vec1[i+start1] += vec2[i+start2];\n"
00793 "}\n"
00794 ;
00795
00796 const char * const vector_align1_divide =
00797 "// Note: name 'div' is not allowed by the jit-compiler\n"
00798 "__kernel void divide(\n"
00799 " __global const float * vec,\n"
00800 " unsigned int start1,\n"
00801 " unsigned int size1,\n"
00802 " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
00803 " __global float * result,\n"
00804 " unsigned int start3,\n"
00805 " unsigned int size3) \n"
00806 "{ \n"
00807 " float factor = *fac;\n"
00808 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00809 " result[i+start3] = vec[i+start1] / factor;\n"
00810 "}\n"
00811 ;
00812
00813 const char * const vector_align1_norm_1 =
00814 "//helper:\n"
00815 "void helper_norm1_parallel_reduction( __local float * tmp_buffer )\n"
00816 "{\n"
00817 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00818 " {\n"
00819 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00820 " if (get_global_id(0) < stride)\n"
00821 " tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n"
00822 " }\n"
00823 "}\n"
00824 "\n"
00825 "////// norm_1\n"
00826 "float impl_norm_1(\n"
00827 " __global const float * vec,\n"
00828 " unsigned int start_index,\n"
00829 " unsigned int end_index,\n"
00830 " __local float * tmp_buffer)\n"
00831 "{\n"
00832 " float tmp = 0;\n"
00833 " for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00834 " tmp += fabs(vec[i]);\n"
00835 " \n"
00836 " tmp_buffer[get_local_id(0)] = tmp;\n"
00837 " \n"
00838 " helper_norm1_parallel_reduction(tmp_buffer);\n"
00839 " \n"
00840 " return tmp_buffer[0];\n"
00841 "};\n"
00842 "\n"
00843 "__kernel void norm_1(\n"
00844 " __global const float * vec,\n"
00845 " unsigned int start1,\n"
00846 " unsigned int size1,\n"
00847 " __local float * tmp_buffer,\n"
00848 " global float * group_buffer)\n"
00849 "{\n"
00850 " float tmp = impl_norm_1(vec,\n"
00851 " ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
00852 " ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
00853 " tmp_buffer);\n"
00854 " \n"
00855 " if (get_local_id(0) == 0)\n"
00856 " group_buffer[get_group_id(0)] = tmp; \n"
00857 "}\n"
00858 "\n"
00859 ;
00860
00861 const char * const vector_align1_clear =
00862 "__kernel void clear(\n"
00863 " __global float * vec,\n"
00864 " unsigned int start1,\n"
00865 " unsigned int size1) \n"
00866 "{ \n"
00867 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00868 " vec[i+start1] = 0;\n"
00869 "}\n"
00870 ;
00871
00872 const char * const vector_align1_cpu_inplace_mult =
00873 "__kernel void cpu_inplace_mult(\n"
00874 " __global float * vec,\n"
00875 " unsigned int start1,\n"
00876 " unsigned int size1,\n"
00877 " float factor) \n"
00878 "{ \n"
00879 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00880 " vec[i+start1] *= factor;\n"
00881 "}\n"
00882 ;
00883
00884 const char * const vector_align1_inplace_mult =
00885 "__kernel void inplace_mult(\n"
00886 " __global float * vec,\n"
00887 " unsigned int start1,\n"
00888 " unsigned int size1,\n"
00889 " __global const float * fac) \n"
00890 "{ \n"
00891 " float factor = *fac;\n"
00892 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00893 " vec[i+start1] *= factor;\n"
00894 "}\n"
00895 ;
00896
00897 const char * const vector_align1_sum =
00898 "__kernel void sum(\n"
00899 " __global float * vec1,\n"
00900 " unsigned int start1,\n"
00901 " unsigned int size1,\n"
00902 " __global float * result) \n"
00903 "{ \n"
00904 " //parallel reduction on global memory (make sure get_global_size(0) is a power of 2)\n"
00905 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00906 " {\n"
00907 " if (get_global_id(0) < stride)\n"
00908 " vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
00909 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
00910 " }\n"
00911 " \n"
00912 " if (get_global_id(0) == 0)\n"
00913 " *result = vec1[0]; \n"
00914 "}\n"
00915 ;
00916
00917 const char * const vector_align1_inplace_div_sub =
00918 "///// divide substract:\n"
00919 "__kernel void inplace_div_sub(\n"
00920 " __global float * vec1,\n"
00921 " unsigned int start1,\n"
00922 " unsigned int size1,\n"
00923 " __global const float * vec2,\n"
00924 " unsigned int start2,\n"
00925 " unsigned int size2,\n"
00926 " __global const float * fac) //CPU variant is mapped to mult_add\n"
00927 "{ \n"
00928 " float factor = *fac;\n"
00929 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00930 " vec1[i+start1] -= vec2[i+start2] / factor;\n"
00931 "}\n"
00932 ;
00933
00934 const char * const vector_align1_diag_precond =
00935 "__kernel void diag_precond(\n"
00936 " __global const float * diag_A_inv, \n"
00937 " unsigned int start1,\n"
00938 " unsigned int size1,\n"
00939 " __global float * x, \n"
00940 " unsigned int start2,\n"
00941 " unsigned int size2) \n"
00942 "{ \n"
00943 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00944 " x[i+start2] *= diag_A_inv[i+start1];\n"
00945 "}\n"
00946 ;
00947
00948 const char * const vector_align1_mul_sub =
00949 "///// multiply subtract:\n"
00950 "__kernel void mul_sub(\n"
00951 " __global const float * vec1,\n"
00952 " unsigned int start1,\n"
00953 " unsigned int size1,\n"
00954 " __global const float * fac,\n"
00955 " __global const float * vec2,\n"
00956 " unsigned int start2,\n"
00957 " unsigned int size2,\n"
00958 " __global float * result,\n"
00959 " unsigned int start3,\n"
00960 " unsigned int size3\n"
00961 " ) \n"
00962 "{ \n"
00963 " float factor = *fac;\n"
00964 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00965 " result[i+start3] = vec1[i+start1] * factor - vec2[i+start2];\n"
00966 "}\n"
00967 ;
00968
00969 }
00970 }
00971 }
00972 #endif