ViennaCL - The Vienna Computing Library: /data/development/ViennaCL/dev/viennacl/linalg/kernels/vector

Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
00002 #define VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
00003 //Automatically generated file from auxiliary-directory, do not edit manually!
00004 namespace viennacl
00005 {
00006  namespace linalg
00007  {
00008   namespace kernels
00009   {
00010 const char * const vector_align16_inplace_sub = 
00011 "__kernel void inplace_sub(\n"
00012 "          __global float16 * vec1,\n"
00013 "          unsigned int start1,\n"
00014 "          unsigned int size1,\n"
00015 "          __global const float16 * vec2,\n"
00016 "          unsigned int start2,\n"
00017 "          unsigned int size2) \n"
00018 "{ \n"
00019 "  unsigned int i_end = size1/16;\n"
00020 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00021 "    vec1[i+start1] -= vec2[i+start2];\n"
00022 "}\n"
00023 ; //vector_align16_inplace_sub
00024 
00025 const char * const vector_align16_mult = 
00026 "__kernel void mult(\n"
00027 "          __global const float16 * vec,\n"
00028 "          unsigned int start1,\n"
00029 "          unsigned int size1,\n"
00030 "          __global const float * fac, \n"
00031 "          __global float16 * result,\n"
00032 "          unsigned int start2,\n"
00033 "          unsigned int size2) \n"
00034 "{ \n"
00035 "  float factor = *fac;\n"
00036 "  unsigned int i_end = size1/16;\n"
00037 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00038 "    result[i+start2] = vec[i+start1] * factor;\n"
00039 "}\n"
00040 ; //vector_align16_mult
00041 
00042 const char * const vector_align16_sub = 
00043 "__kernel void sub(\n"
00044 "          __global const float16 * vec1,\n"
00045 "          unsigned int start1,\n"
00046 "          unsigned int size1,\n"
00047 "          __global const float16 * vec2, \n"
00048 "          unsigned int start2,\n"
00049 "          unsigned int size2,\n"
00050 "          __global float16 * result,\n"
00051 "          unsigned int start3,\n"
00052 "          unsigned int size3)\n"
00053 "{ \n"
00054 "  unsigned int i_end = size1 / 16;\n"
00055 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00056 "    result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
00057 "}\n"
00058 ; //vector_align16_sub
00059 
00060 const char * const vector_align16_cpu_inplace_mul = 
00061 "\n"
00062 "__kernel void cpu_inplace_mult(\n"
00063 "          __global float16 * vec,\n"
00064 "          unsigned int start1,\n"
00065 "          unsigned int size1,\n"
00066 "          float factor) \n"
00067 "{ \n"
00068 "  unsigned int i_end = size1/16;\n"
00069 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00070 "    vec[i+start1] *= factor;\n"
00071 "}\n"
00072 "\n"
00073 ; //vector_align16_cpu_inplace_mul
00074 
00075 const char * const vector_align16_add = 
00076 "__kernel void add(\n"
00077 "          __global const float16 * vec1,\n"
00078 "          unsigned int start1,\n"
00079 "          unsigned int size1,\n"
00080 "          __global const float16 * vec2, \n"
00081 "          unsigned int start2,\n"
00082 "          unsigned int size2,\n"
00083 "          __global float16 * result,\n"
00084 "          unsigned int start3,\n"
00085 "          unsigned int size3)\n"
00086 "{ \n"
00087 "  unsigned int i_end = size/16;\n"
00088 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00089 "    result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
00090 "}\n"
00091 ; //vector_align16_add
00092 
00093 const char * const vector_align16_cpu_mult = 
00094 "__kernel void cpu_mult(\n"
00095 "          __global const float16 * vec,\n"
00096 "          unsigned int start1,\n"
00097 "          unsigned int size1,\n"
00098 "          float factor, \n"
00099 "          __global float16 * result,\n"
00100 "          unsigned int start2,\n"
00101 "          unsigned int size2) \n"
00102 "{ \n"
00103 "  unsigned int i_end = size1/16;\n"
00104 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00105 "    result[i+start2] = vec[i+start1] * factor;\n"
00106 "}\n"
00107 ; //vector_align16_cpu_mult
00108 
00109 const char * const vector_align16_inplace_divide = 
00110 "__kernel void inplace_divide(\n"
00111 "          __global float16 * vec,\n"
00112 "          unsigned int start1,\n"
00113 "          unsigned int size1,\n"
00114 "          __global const float * fac)  //note: CPU variant is mapped to prod_scalar\n"
00115 "{ \n"
00116 "  float factor = *fac;\n"
00117 "  unsigned int i_end = size1/16;\n"
00118 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00119 "    vec[i+start1] /= factor;\n"
00120 "}\n"
00121 ; //vector_align16_inplace_divide
00122 
00123 const char * const vector_align16_inplace_add = 
00124 "__kernel void inplace_add(\n"
00125 "          __global float16 * vec1,\n"
00126 "          unsigned int start1,\n"
00127 "          unsigned int size1,\n"
00128 "          __global const float16 * vec2,\n"
00129 "          unsigned int start2,\n"
00130 "          unsigned int size2) \n"
00131 "{ \n"
00132 "  unsigned int i_end = size1/16;\n"
00133 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00134 "    vec1[i+start1] += vec2[i+start2];\n"
00135 "}\n"
00136 ; //vector_align16_inplace_add
00137 
00138 const char * const vector_align16_divide = 
00139 "//Note: 'div' cannot be used because of complaints by the jit-compiler\n"
00140 "__kernel void divide(\n"
00141 "          __global const float16 * vec,\n"
00142 "          unsigned int start1,\n"
00143 "          unsigned int size1,\n"
00144 "          __global const float * fac,  //note: CPU variant is mapped to prod_scalar\n"
00145 "          __global float16 * result,\n"
00146 "          unsigned int start2,\n"
00147 "          unsigned int size2)  \n"
00148 "{ \n"
00149 "  float factor = *fac;\n"
00150 "  unsigned int i_end = size/16;\n"
00151 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00152 "    result[i+start2] = vec[i+start1] / factor;\n"
00153 "}\n"
00154 ; //vector_align16_divide
00155 
00156 const char * const vector_align16_inplace_mult = 
00157 "__kernel void inplace_mult(\n"
00158 "          __global float16 * vec,\n"
00159 "          unsigned int start1,\n"
00160 "          unsigned int size1,\n"
00161 "          __global const float * fac) \n"
00162 "{ \n"
00163 "  float factor = *fac;\n"
00164 "  unsigned int i_end = size1/16;\n"
00165 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00166 "    vec[i+start1] *= factor;\n"
00167 "}\n"
00168 ; //vector_align16_inplace_mult
00169 
00170 const char * const vector_align4_inplace_div_add = 
00171 "__kernel void inplace_div_add(\n"
00172 "          __global float4 * vec1,\n"
00173 "          unsigned int start1,\n"
00174 "          unsigned int size1,\n"
00175 "          __global const float4 * vec2,\n"
00176 "          unsigned int start2,\n"
00177 "          unsigned int size2,\n"
00178 "          __global const float * fac)   //CPU variant is mapped to mult_add\n"
00179 "{ \n"
00180 "  float factor = *fac;\n"
00181 "  unsigned int i_end = size1 / 4;\n"
00182 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00183 "    vec1[i+start1] -= vec2[i+start2] / factor;\n"
00184 "}\n"
00185 ; //vector_align4_inplace_div_add
00186 
00187 const char * const vector_align4_cpu_mul_add = 
00188 "__kernel void cpu_mul_add(\n"
00189 "          __global const float4 * vec1,\n"
00190 "          unsigned int start1,\n"
00191 "          unsigned int size1,\n"
00192 "          float factor,\n"
00193 "          __global const float4 * vec2,\n"
00194 "          unsigned int start2,\n"
00195 "          unsigned int size2,\n"
00196 "          __global float4 * result,\n"
00197 "          unsigned int start3,\n"
00198 "          unsigned int size3) \n"
00199 "{ \n"
00200 "  unsigned int i_end = size1/4;\n"
00201 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00202 "    result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
00203 "}\n"
00204 ; //vector_align4_cpu_mul_add
00205 
00206 const char * const vector_align4_inplace_mul_sub = 
00207 "__kernel void inplace_mul_sub(\n"
00208 "          __global float4 * vec1,\n"
00209 "          unsigned int start1,\n"
00210 "          unsigned int size1,\n"
00211 "          __global const float4 * vec2,\n"
00212 "          unsigned int start2,\n"
00213 "          unsigned int size2,\n"
00214 "          __global const float * fac)   //CPU variant is mapped to mult_add\n"
00215 "{ \n"
00216 "  float factor = *fac;\n"
00217 "  unsigned int i_end = size/4;\n"
00218 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00219 "    vec1[i+start1] -= vec2[i+start2] * factor;\n"
00220 "}\n"
00221 ; //vector_align4_inplace_mul_sub
00222 
00223 const char * const vector_align4_inplace_mul_add = 
00224 "__kernel void inplace_mul_add(\n"
00225 "          __global float4 * vec1,\n"
00226 "          unsigned int start1,\n"
00227 "          unsigned int size1,\n"
00228 "          __global const float4 * vec2,\n"
00229 "          unsigned int start2,\n"
00230 "          unsigned int size2,\n"
00231 "          __global const float * fac) \n"
00232 "{ \n"
00233 "  float factor = *fac;\n"
00234 "  unsigned int size_div_4 = size1/4;\n"
00235 "  for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))\n"
00236 "    vec1[i+start1] += vec2[i+start2] * factor;\n"
00237 "}\n"
00238 ; //vector_align4_inplace_mul_add
00239 
00240 const char * const vector_align4_mul_add = 
00241 "__kernel void mul_add(\n"
00242 "          __global const float4 * vec1,\n"
00243 "          unsigned int start1,\n"
00244 "          unsigned int size1,\n"
00245 "          __global const float * fac,\n"
00246 "          __global const float4 * vec2,\n"
00247 "          unsigned int start2,\n"
00248 "          unsigned int size2,\n"
00249 "          __global float4 * result,\n"
00250 "          unsigned int start3,\n"
00251 "          unsigned int size3) \n"
00252 "{ \n"
00253 "  float factor = *fac;\n"
00254 "  unsigned int i_end = size1/4;\n"
00255 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00256 "    result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
00257 "}\n"
00258 ; //vector_align4_mul_add
00259 
00260 const char * const vector_align4_cpu_inplace_mul_add = 
00261 "__kernel void cpu_inplace_mul_add(\n"
00262 "          __global float4 * vec1,\n"
00263 "          unsigned int start1,\n"
00264 "          unsigned int size1,\n"
00265 "          __global const float4 * vec2,\n"
00266 "          unsigned int start2,\n"
00267 "          unsigned int size2,\n"
00268 "          float factor) \n"
00269 "{ \n"
00270 "  unsigned int i_end = size1/4;\n"
00271 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00272 "    vec1[i+start1] += vec2[i+start2] * factor;\n"
00273 "}\n"
00274 ; //vector_align4_cpu_inplace_mul_add
00275 
00276 const char * const vector_align4_inplace_div_sub = 
00277 "__kernel void inplace_div_sub(\n"
00278 "          __global float4 * vec1,\n"
00279 "          unsigned int start1,\n"
00280 "          unsigned int size1,\n"
00281 "          __global const float4 * vec2,\n"
00282 "          unsigned int start2,\n"
00283 "          unsigned int size2,\n"
00284 "          __global const float * fac)   //CPU variant is mapped to mult_add\n"
00285 "{ \n"
00286 "  float factor = *fac;\n"
00287 "  unsigned int i_end = size1/4;\n"
00288 "  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
00289 "    vec1[i+start1] -= vec2[i+start2] / factor;\n"
00290 "}\n"
00291 ; //vector_align4_inplace_div_sub
00292 
00293 const char * const vector_align1_inplace_sub = 
00294 "__kernel void inplace_sub(\n"
00295 "          __global float * vec1,\n"
00296 "          unsigned int start1,\n"
00297 "          unsigned int size1,\n"
00298 "          __global const float * vec2,\n"
00299 "          unsigned int start2,\n"
00300 "          unsigned int size2) \n"
00301 "{ \n"
00302 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00303 "    vec1[i+start1] -= vec2[i+start2];\n"
00304 "}\n"
00305 ; //vector_align1_inplace_sub
00306 
00307 const char * const vector_align1_norm_inf = 
00308 "\n"
00309 "////// norm_inf\n"
00310 "float impl_norm_inf(\n"
00311 "          __global const float * vec,\n"
00312 "          unsigned int start_index,\n"
00313 "          unsigned int end_index,\n"
00314 "          __local float * tmp_buffer)\n"
00315 "{\n"
00316 "  float tmp = 0;\n"
00317 "  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00318 "    tmp = fmax(fabs(vec[i]), tmp);\n"
00319 "  tmp_buffer[get_local_id(0)] = tmp;\n"
00320 "  \n"
00321 "  //step 2: parallel reduction:\n"
00322 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00323 "  {\n"
00324 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00325 "    if (get_global_id(0) < stride)\n"
00326 "      tmp_buffer[get_global_id(0)] = fmax(tmp_buffer[get_global_id(0)], tmp_buffer[get_global_id(0)+stride]);\n"
00327 "  }\n"
00328 "  \n"
00329 "  return tmp_buffer[0];\n"
00330 "}\n"
00331 "\n"
00332 "__kernel void norm_inf(\n"
00333 "          __global const float * vec,\n"
00334 "          unsigned int start1,\n"
00335 "          unsigned int size1,\n"
00336 "          __local float * tmp_buffer,\n"
00337 "          global float * group_buffer)\n"
00338 "{\n"
00339 "  float tmp = impl_norm_inf(vec,\n"
00340 "                          (      get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
00341 "                          ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
00342 "                          tmp_buffer);\n"
00343 "  \n"
00344 "  if (get_local_id(0) == 0)\n"
00345 "    group_buffer[get_group_id(0)] = tmp;  \n"
00346 "}\n"
00347 ; //vector_align1_norm_inf
00348 
00349 const char * const vector_align1_index_norm_inf = 
00350 "//index_norm_inf:\n"
00351 "unsigned int float_vector1_index_norm_inf_impl(\n"
00352 "          __global const float * vec,\n"
00353 "          unsigned int start1,\n"
00354 "          unsigned int size1,\n"
00355 "          __local float * float_buffer,\n"
00356 "          __local unsigned int * index_buffer)\n"
00357 "{\n"
00358 "  //step 1: fill buffer:\n"
00359 "  float cur_max = 0.0f;\n"
00360 "  float tmp;\n"
00361 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00362 "  {\n"
00363 "    tmp = fabs(vec[i+start1]);\n"
00364 "    if (cur_max < tmp)\n"
00365 "    {\n"
00366 "      float_buffer[get_global_id(0)] = tmp;\n"
00367 "      index_buffer[get_global_id(0)] = i;\n"
00368 "      cur_max = tmp;\n"
00369 "    }\n"
00370 "  }\n"
00371 "  \n"
00372 "  //step 2: parallel reduction:\n"
00373 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00374 "  {\n"
00375 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00376 "    if (get_global_id(0) < stride)\n"
00377 "    {\n"
00378 "      //find the first occurring index\n"
00379 "      if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])\n"
00380 "      {\n"
00381 "        index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];\n"
00382 "        float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];\n"
00383 "      }\n"
00384 "      \n"
00385 "      //index_buffer[get_global_id(0)] = float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride] ? index_buffer[get_global_id(0)+stride] : index_buffer[get_global_id(0)];\n"
00386 "      //float_buffer[get_global_id(0)] = max(float_buffer[get_global_id(0)], float_buffer[get_global_id(0)+stride]);\n"
00387 "    }\n"
00388 "  }\n"
00389 "  \n"
00390 "  return index_buffer[0];\n"
00391 "}\n"
00392 "\n"
00393 "__kernel void index_norm_inf(\n"
00394 "          __global float * vec,\n"
00395 "          unsigned int start1,\n"
00396 "          unsigned int size1,\n"
00397 "          __local float * float_buffer,\n"
00398 "          __local unsigned int * index_buffer,\n"
00399 "          global unsigned int * result) \n"
00400 "{ \n"
00401 "  unsigned int tmp = float_vector1_index_norm_inf_impl(vec, start1, size1, float_buffer, index_buffer);\n"
00402 "  if (get_global_id(0) == 0) *result = tmp;\n"
00403 "}\n"
00404 "\n"
00405 "\n"
00406 ; //vector_align1_index_norm_inf
00407 
00408 const char * const vector_align1_mult = 
00409 "__kernel void mult(\n"
00410 "          __global const float * vec,\n"
00411 "          unsigned int start1,\n"
00412 "          unsigned int size1,\n"
00413 "          __global const float * fac, \n"
00414 "          __global float * result,\n"
00415 "          unsigned int start3,\n"
00416 "          unsigned int size3) \n"
00417 "{ \n"
00418 "  float factor = *fac;\n"
00419 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00420 "    result[i+start3] = vec[i+start1] * factor;\n"
00421 "}\n"
00422 ; //vector_align1_mult
00423 
00424 const char * const vector_align1_swap = 
00425 "////// swap:\n"
00426 "__kernel void swap(\n"
00427 "          __global float * vec1,\n"
00428 "          unsigned int start1,\n"
00429 "          unsigned int size1,\n"
00430 "          __global float * vec2,\n"
00431 "          unsigned int start2,\n"
00432 "          unsigned int size2\n"
00433 "          ) \n"
00434 "{ \n"
00435 "  float tmp;\n"
00436 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00437 "  {\n"
00438 "    tmp = vec2[i+start2];\n"
00439 "    vec2[i+start2] = vec1[i+start1];\n"
00440 "    vec1[i+start1] = tmp;\n"
00441 "  }\n"
00442 "}\n"
00443 " \n"
00444 ; //vector_align1_swap
00445 
00446 const char * const vector_align1_inplace_div_add = 
00447 "///// divide add:\n"
00448 "__kernel void inplace_div_add(\n"
00449 "          __global float * vec1,\n"
00450 "          unsigned int start1,\n"
00451 "          unsigned int size1,\n"
00452 "          __global const float * vec2,\n"
00453 "          unsigned int start2,\n"
00454 "          unsigned int size2,\n"
00455 "          __global const float * fac)   //CPU variant is mapped to mult_add\n"
00456 "{ \n"
00457 "  float factor = *fac;\n"
00458 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00459 "    vec1[i+start1] -= vec2[i+start2] / factor;\n"
00460 "}\n"
00461 ; //vector_align1_inplace_div_add
00462 
00463 const char * const vector_align1_norm_2 = 
00464 "//helper:\n"
00465 "void helper_norm2_parallel_reduction( __local float * tmp_buffer )\n"
00466 "{\n"
00467 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00468 "  {\n"
00469 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00470 "    if (get_global_id(0) < stride)\n"
00471 "      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n"
00472 "  }\n"
00473 "}\n"
00474 "\n"
00475 "////// norm_2\n"
00476 "float impl_norm_2(\n"
00477 "          __global const float * vec,\n"
00478 "          unsigned int start_index,\n"
00479 "          unsigned int end_index,\n"
00480 "          __local float * tmp_buffer)\n"
00481 "{\n"
00482 "  float tmp = 0;\n"
00483 "  float vec_entry = 0;\n"
00484 "  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00485 "  {\n"
00486 "    vec_entry = vec[i];\n"
00487 "    tmp += vec_entry * vec_entry;\n"
00488 "  }\n"
00489 "  tmp_buffer[get_local_id(0)] = tmp;\n"
00490 "  \n"
00491 "  helper_norm2_parallel_reduction(tmp_buffer);\n"
00492 "  \n"
00493 "  return tmp_buffer[0];\n"
00494 "};\n"
00495 "\n"
00496 "__kernel void norm_2(\n"
00497 "          __global const float * vec,\n"
00498 "          unsigned int start1,\n"
00499 "          unsigned int size1,\n"
00500 "          __local float * tmp_buffer,\n"
00501 "          global float * group_buffer)\n"
00502 "{\n"
00503 "  float tmp = impl_norm_2(vec,\n"
00504 "                          (      get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
00505 "                          ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
00506 "                          tmp_buffer);\n"
00507 "  \n"
00508 "  if (get_local_id(0) == 0)\n"
00509 "    group_buffer[get_group_id(0)] = tmp;  \n"
00510 "}\n"
00511 "\n"
00512 ; //vector_align1_norm_2
00513 
00514 const char * const vector_align1_sub = 
00515 "__kernel void sub(\n"
00516 "          __global const float * vec1,\n"
00517 "          unsigned int start1,\n"
00518 "          unsigned int size1,\n"
00519 "          __global const float * vec2, \n"
00520 "          unsigned int start2,\n"
00521 "          unsigned int size2,\n"
00522 "          __global float * result,\n"
00523 "          unsigned int start3,\n"
00524 "          unsigned int size3)\n"
00525 "{ \n"
00526 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00527 "    result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
00528 "}\n"
00529 ; //vector_align1_sub
00530 
00531 const char * const vector_align1_cpu_mul_add = 
00532 "__kernel void cpu_mul_add(\n"
00533 "          __global const float * vec1,\n"
00534 "          unsigned int start1,\n"
00535 "          unsigned int size1,\n"
00536 "          float factor,\n"
00537 "          __global const float * vec2,\n"
00538 "          unsigned int start2,\n"
00539 "          unsigned int size2,\n"
00540 "          __global float * result,\n"
00541 "          unsigned int start3,\n"
00542 "          unsigned int size3\n"
00543 "          ) \n"
00544 "{ \n"
00545 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00546 "    result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
00547 "}\n"
00548 ; //vector_align1_cpu_mul_add
00549 
00550 const char * const vector_align1_vmax = 
00551 "__kernel void vmax(\n"
00552 "          __global float * vec1,\n"
00553 "          unsigned int start1,\n"
00554 "          unsigned int size1,\n"
00555 "          __global float * result) \n"
00556 "{ \n"
00557 "  //parallel reduction on global memory (make sure that size is a power of 2)\n"
00558 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00559 "  {\n"
00560 "    if (get_global_id(0) < stride)\n"
00561 "      vec1[get_global_id(0)+start1] = fmax(vec1[get_global_id(0)+start1+stride], vec1[get_global_id(0)+start1]);\n"
00562 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00563 "  }\n"
00564 "  \n"
00565 "  if (get_global_id(0) == 0)\n"
00566 "    *result = vec1[start1];\n"
00567 "}\n"
00568 ; //vector_align1_vmax
00569 
00570 const char * const vector_align1_inner_prod = 
00571 "//helper:\n"
00572 "void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )\n"
00573 "{\n"
00574 "  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
00575 "  {\n"
00576 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00577 "    if (get_local_id(0) < stride)\n"
00578 "      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
00579 "  }\n"
00580 "}\n"
00581 "//////// inner products:\n"
00582 "float impl_inner_prod(\n"
00583 "          __global const float * vec1,\n"
00584 "          unsigned int start1,\n"
00585 "          unsigned int size1,\n"
00586 "          __global const float * vec2,\n"
00587 "          unsigned int start2,\n"
00588 "          unsigned int size2,\n"
00589 "          __local float * tmp_buffer)\n"
00590 "{\n"
00591 "  float tmp = 0;\n"
00592 "  for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))\n"
00593 "    tmp += vec1[i+start1] * vec2[i+start2];\n"
00594 "  tmp_buffer[get_local_id(0)] = tmp;\n"
00595 "  \n"
00596 "  helper_inner_prod_parallel_reduction(tmp_buffer);\n"
00597 "  \n"
00598 "  return tmp_buffer[0];\n"
00599 "}\n"
00600 "__kernel void inner_prod(\n"
00601 "          __global const float * vec1,\n"
00602 "          unsigned int start1,\n"
00603 "          unsigned int size1,\n"
00604 "          __global const float * vec2,\n"
00605 "          unsigned int start2,\n"
00606 "          unsigned int size2,\n"
00607 "          __local float * tmp_buffer,\n"
00608 "          global float * group_buffer)\n"
00609 "{\n"
00610 "  float tmp = impl_inner_prod(vec1,\n"
00611 "                              (      get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
00612 "                              ((get_group_id(0) + 1) * size1) / get_num_groups(0) - (      get_group_id(0) * size1) / get_num_groups(0),\n"
00613 "                              vec2,\n"
00614 "                              (      get_group_id(0) * size2) / get_num_groups(0) + start2,\n"
00615 "                              ((get_group_id(0) + 1) * size2) / get_num_groups(0) - (      get_group_id(0) * size2) / get_num_groups(0),\n"
00616 "                              tmp_buffer);\n"
00617 "  \n"
00618 "  if (get_local_id(0) == 0)\n"
00619 "    group_buffer[get_group_id(0)] = tmp;\n"
00620 "  \n"
00621 "}\n"
00622 ; //vector_align1_inner_prod
00623 
00624 const char * const vector_align1_add = 
00625 "__kernel void add(\n"
00626 "          __global const float * vec1,\n"
00627 "          unsigned int start1,\n"
00628 "          unsigned int size1,\n"
00629 "          __global const float * vec2,\n"
00630 "          unsigned int start2,\n"
00631 "          unsigned int size2,\n"
00632 "          __global float * result,\n"
00633 "          unsigned int start3,\n"
00634 "          unsigned int size3) \n"
00635 "{ \n"
00636 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00637 "    result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
00638 "}\n"
00639 ; //vector_align1_add
00640 
00641 const char * const vector_align1_plane_rotation = 
00642 "////// plane rotation: (x,y) <- (\alpha x + \beta y, -\beta x + \alpha y)\n"
00643 "__kernel void plane_rotation(\n"
00644 "          __global float * vec1,\n"
00645 "          unsigned int start1,\n"
00646 "          unsigned int size1,\n"
00647 "          __global float * vec2, \n"
00648 "          unsigned int start2,\n"
00649 "          unsigned int size2,\n"
00650 "          float alpha,\n"
00651 "          float beta) \n"
00652 "{ \n"
00653 "  float tmp1 = 0;\n"
00654 "  float tmp2 = 0;\n"
00655 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00656 "  {\n"
00657 "    tmp1 = vec1[i+start1];\n"
00658 "    tmp2 = vec2[i+start2];\n"
00659 "    \n"
00660 "    vec1[i+start1] = alpha * tmp1 + beta * tmp2;\n"
00661 "    vec2[i+start2] = alpha * tmp2 - beta * tmp1;\n"
00662 "  }\n"
00663 "}\n"
00664 ; //vector_align1_plane_rotation
00665 
00666 const char * const vector_align1_inplace_mul_sub = 
00667 "__kernel void inplace_mul_sub(\n"
00668 "          __global float * vec1,\n"
00669 "          unsigned int start1,\n"
00670 "          unsigned int size1,\n"
00671 "          __global const float * vec2,\n"
00672 "          unsigned int start2,\n"
00673 "          unsigned int size2,\n"
00674 "          __global const float * fac)   //CPU variant is mapped to mult_add\n"
00675 "{ \n"
00676 "  float factor = *fac;\n"
00677 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00678 "    vec1[i+start1] -= vec2[i+start2] * factor;\n"
00679 "}\n"
00680 ; //vector_align1_inplace_mul_sub
00681 
00682 const char * const vector_align1_inplace_mul_add = 
00683 "__kernel void inplace_mul_add(\n"
00684 "          __global float * vec1,\n"
00685 "          unsigned int start1,\n"
00686 "          unsigned int size1,\n"
00687 "          __global const float * vec2,\n"
00688 "          unsigned int start2,\n"
00689 "          unsigned int size2,\n"
00690 "          __global const float * fac) \n"
00691 "{ \n"
00692 "  float factor = *fac;\n"
00693 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00694 "    vec1[i+start1] += vec2[i+start2] * factor;\n"
00695 "}\n"
00696 ; //vector_align1_inplace_mul_add
00697 
00698 const char * const vector_align1_mul_add = 
00699 "__kernel void mul_add(\n"
00700 "          __global const float * vec1,\n"
00701 "          unsigned int start1,\n"
00702 "          unsigned int size1,\n"
00703 "          __global const float * fac,\n"
00704 "          __global const float * vec2,\n"
00705 "          unsigned int start2,\n"
00706 "          unsigned int size2,\n"
00707 "          __global float * result,\n"
00708 "          unsigned int start3,\n"
00709 "          unsigned int size3\n"
00710 "          ) \n"
00711 "{ \n"
00712 "  float factor = *fac;\n"
00713 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00714 "    result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
00715 "}\n"
00716 ; //vector_align1_mul_add
00717 
00718 const char * const vector_align1_cpu_mult = 
00719 "__kernel void cpu_mult(\n"
00720 "          __global const float * vec,\n"
00721 "          unsigned int start1,\n"
00722 "          unsigned int size1,\n"
00723 "          float factor, \n"
00724 "          __global float * result,\n"
00725 "          unsigned int start2,\n"
00726 "          unsigned int size2) \n"
00727 "{ \n"
00728 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00729 "    result[i+start2] = vec[i+start1] * factor;\n"
00730 "}\n"
00731 ; //vector_align1_cpu_mult
00732 
00733 const char * const vector_align1_inplace_divide = 
00734 "__kernel void inplace_divide(\n"
00735 "          __global float * vec,\n"
00736 "          unsigned int start1,\n"
00737 "          unsigned int size1,\n"
00738 "          __global const float * fac)  //note: CPU variant is mapped to prod_scalar\n"
00739 "{ \n"
00740 "  float factor = *fac;\n"
00741 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00742 "    vec[i+start1] /= factor;\n"
00743 "}\n"
00744 ; //vector_align1_inplace_divide
00745 
00746 const char * const vector_align1_sqrt_sum = 
00747 "__kernel void sqrt_sum(\n"
00748 "          __global float * vec1,\n"
00749 "          unsigned int start1,\n"
00750 "          unsigned int size1,\n"
00751 "          __global float * result) \n"
00752 "{ \n"
00753 "  //parallel reduction on global memory: (make sure get_global_size(0) is a power of 2)\n"
00754 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00755 "  {\n"
00756 "    if (get_global_id(0) < stride)\n"
00757 "      vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
00758 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00759 "  }\n"
00760 "  \n"
00761 "  if (get_global_id(0) == 0)\n"
00762 "    *result = sqrt(vec1[start1]);\n"
00763 "  \n"
00764 "}\n"
00765 ; //vector_align1_sqrt_sum
00766 
00767 const char * const vector_align1_cpu_inplace_mul_add = 
00768 "__kernel void cpu_inplace_mul_add(\n"
00769 "          __global float * vec1,\n"
00770 "          unsigned int start1,\n"
00771 "          unsigned int size1,\n"
00772 "          __global const float * vec2,\n"
00773 "          unsigned int start2,\n"
00774 "          unsigned int size2,\n"
00775 "          float factor) \n"
00776 "{ \n"
00777 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00778 "    vec1[i+start1] += vec2[i+start2] * factor;\n"
00779 "}\n"
00780 ; //vector_align1_cpu_inplace_mul_add
00781 
00782 const char * const vector_align1_inplace_add = 
00783 "__kernel void inplace_add(\n"
00784 "          __global float * vec1,\n"
00785 "          unsigned int start1,\n"
00786 "          unsigned int size1,\n"
00787 "          __global const float * vec2,\n"
00788 "          unsigned int start2,\n"
00789 "          unsigned int size2) \n"
00790 "{ \n"
00791 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00792 "    vec1[i+start1] += vec2[i+start2];\n"
00793 "}\n"
00794 ; //vector_align1_inplace_add
00795 
00796 const char * const vector_align1_divide = 
00797 "// Note: name 'div' is not allowed by the jit-compiler\n"
00798 "__kernel void divide(\n"
00799 "          __global const float * vec,\n"
00800 "          unsigned int start1,\n"
00801 "          unsigned int size1,\n"
00802 "          __global const float * fac,  //note: CPU variant is mapped to prod_scalar\n"
00803 "          __global float * result,\n"
00804 "          unsigned int start3,\n"
00805 "          unsigned int size3)  \n"
00806 "{ \n"
00807 "  float factor = *fac;\n"
00808 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00809 "    result[i+start3] = vec[i+start1] / factor;\n"
00810 "}\n"
00811 ; //vector_align1_divide
00812 
00813 const char * const vector_align1_norm_1 = 
00814 "//helper:\n"
00815 "void helper_norm1_parallel_reduction( __local float * tmp_buffer )\n"
00816 "{\n"
00817 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00818 "  {\n"
00819 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00820 "    if (get_global_id(0) < stride)\n"
00821 "      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n"
00822 "  }\n"
00823 "}\n"
00824 "\n"
00825 "////// norm_1\n"
00826 "float impl_norm_1(\n"
00827 "          __global const float * vec,\n"
00828 "          unsigned int start_index,\n"
00829 "          unsigned int end_index,\n"
00830 "          __local float * tmp_buffer)\n"
00831 "{\n"
00832 "  float tmp = 0;\n"
00833 "  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00834 "    tmp += fabs(vec[i]);\n"
00835 "  \n"
00836 "  tmp_buffer[get_local_id(0)] = tmp;\n"
00837 "  \n"
00838 "  helper_norm1_parallel_reduction(tmp_buffer);\n"
00839 "  \n"
00840 "  return tmp_buffer[0];\n"
00841 "};\n"
00842 "\n"
00843 "__kernel void norm_1(\n"
00844 "          __global const float * vec,\n"
00845 "          unsigned int start1,\n"
00846 "          unsigned int size1,\n"
00847 "          __local float * tmp_buffer,\n"
00848 "          global float * group_buffer)\n"
00849 "{\n"
00850 "  float tmp = impl_norm_1(vec,\n"
00851 "                          (      get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
00852 "                          ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
00853 "                          tmp_buffer);\n"
00854 "  \n"
00855 "  if (get_local_id(0) == 0)\n"
00856 "    group_buffer[get_group_id(0)] = tmp;  \n"
00857 "}\n"
00858 "\n"
00859 ; //vector_align1_norm_1
00860 
00861 const char * const vector_align1_clear = 
00862 "__kernel void clear(\n"
00863 "          __global float * vec,\n"
00864 "          unsigned int start1,\n"
00865 "          unsigned int size1) \n"
00866 "{ \n"
00867 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00868 "    vec[i+start1] = 0;\n"
00869 "}\n"
00870 ; //vector_align1_clear
00871 
00872 const char * const vector_align1_cpu_inplace_mult = 
00873 "__kernel void cpu_inplace_mult(\n"
00874 "          __global float * vec,\n"
00875 "          unsigned int start1,\n"
00876 "          unsigned int size1,\n"
00877 "          float factor) \n"
00878 "{ \n"
00879 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00880 "    vec[i+start1] *= factor;\n"
00881 "}\n"
00882 ; //vector_align1_cpu_inplace_mult
00883 
00884 const char * const vector_align1_inplace_mult = 
00885 "__kernel void inplace_mult(\n"
00886 "          __global float * vec,\n"
00887 "          unsigned int start1,\n"
00888 "          unsigned int size1,\n"
00889 "          __global const float * fac) \n"
00890 "{ \n"
00891 "  float factor = *fac;\n"
00892 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00893 "    vec[i+start1] *= factor;\n"
00894 "}\n"
00895 ; //vector_align1_inplace_mult
00896 
00897 const char * const vector_align1_sum = 
00898 "__kernel void sum(\n"
00899 "          __global float * vec1,\n"
00900 "          unsigned int start1,\n"
00901 "          unsigned int size1,\n"
00902 "          __global float * result) \n"
00903 "{ \n"
00904 "  //parallel reduction on global memory (make sure get_global_size(0) is a power of 2)\n"
00905 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00906 "  {\n"
00907 "    if (get_global_id(0) < stride)\n"
00908 "      vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
00909 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00910 "  }\n"
00911 "  \n"
00912 "  if (get_global_id(0) == 0)\n"
00913 "    *result = vec1[0];  \n"
00914 "}\n"
00915 ; //vector_align1_sum
00916 
00917 const char * const vector_align1_inplace_div_sub = 
00918 "///// divide substract:\n"
00919 "__kernel void inplace_div_sub(\n"
00920 "          __global float * vec1,\n"
00921 "          unsigned int start1,\n"
00922 "          unsigned int size1,\n"
00923 "          __global const float * vec2,\n"
00924 "          unsigned int start2,\n"
00925 "          unsigned int size2,\n"
00926 "          __global const float * fac)   //CPU variant is mapped to mult_add\n"
00927 "{ \n"
00928 "  float factor = *fac;\n"
00929 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00930 "    vec1[i+start1] -= vec2[i+start2] / factor;\n"
00931 "}\n"
00932 ; //vector_align1_inplace_div_sub
00933 
00934 const char * const vector_align1_diag_precond = 
00935 "__kernel void diag_precond(\n"
00936 "          __global const float * diag_A_inv, \n"
00937 "          unsigned int start1,\n"
00938 "          unsigned int size1,\n"
00939 "          __global float * x, \n"
00940 "          unsigned int start2,\n"
00941 "          unsigned int size2) \n"
00942 "{ \n"
00943 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00944 "    x[i+start2] *= diag_A_inv[i+start1];\n"
00945 "}\n"
00946 ; //vector_align1_diag_precond
00947 
00948 const char * const vector_align1_mul_sub = 
00949 "///// multiply subtract:\n"
00950 "__kernel void mul_sub(\n"
00951 "          __global const float * vec1,\n"
00952 "          unsigned int start1,\n"
00953 "          unsigned int size1,\n"
00954 "          __global const float * fac,\n"
00955 "          __global const float * vec2,\n"
00956 "          unsigned int start2,\n"
00957 "          unsigned int size2,\n"
00958 "          __global float * result,\n"
00959 "          unsigned int start3,\n"
00960 "          unsigned int size3\n"
00961 "          ) \n"
00962 "{ \n"
00963 "  float factor = *fac;\n"
00964 "  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
00965 "    result[i+start3] = vec1[i+start1] * factor - vec2[i+start2];\n"
00966 "}\n"
00967 ; //vector_align1_mul_sub
00968 
00969   }  //namespace kernels
00970  }  //namespace linalg
00971 }  //namespace viennacl
00972 #endif
/data/development/ViennaCL/dev/viennacl/linalg/kernels/vector_source.h