#ifndef VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_
#define VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_

namespace viennacl
{
namespace linalg
{
namespace kernels
{
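// Sparse matrix-vector product for CSR data padded to a multiple of 4 entries per row:
// column indices and nonzeros are loaded as uint4/float4 and combined via dot().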
const char * const compressed_matrix_align4_vec_mul =
"__kernel void vec_mul(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const uint4 * column_indices,\n"
"  __global const float4 * elements,\n"
"  __global const float * vector,\n"
"  __global float * result,\n"
"  unsigned int size)\n"
"{\n"
"  float dot_prod;\n"
"  unsigned int start, next_stop;\n"
"  uint4 col_idx;\n"
"  float4 tmp_vec;\n"
"  float4 tmp_entries;\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    dot_prod = 0.0f;\n"
"    start = row_indices[row] / 4;\n"
"    next_stop = row_indices[row+1] / 4;\n"
"    for (unsigned int i = start; i < next_stop; ++i)\n"
"    {\n"
"      col_idx = column_indices[i];\n"
"      tmp_entries = elements[i];\n"
"      tmp_vec.x = vector[col_idx.x];\n"
"      tmp_vec.y = vector[col_idx.y];\n"
"      tmp_vec.z = vector[col_idx.z];\n"
"      tmp_vec.w = vector[col_idx.w];\n"
"      dot_prod += dot(tmp_entries, tmp_vec);\n"
"    }\n"
"    result[row] = dot_prod;\n"
"  }\n"
"}\n"
;

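// One damped Jacobi sweep: new_result = weight * (rhs - sum of off-diagonal contributions) / diag
// + (1 - weight) * old_result. Note that 'diag' is only updated when a diagonal entry is found,
// so every row is assumed to store its diagonal.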
const char * const compressed_matrix_align1_jacobi =
"__kernel void jacobi(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  float weight,\n"
"  __global const float * old_result,\n"
"  __global float * new_result,\n"
"  __global const float * rhs,\n"
"  unsigned int size)\n"
"{\n"
"  float sum, diag = 1;\n"
"  int col;\n"
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
"  {\n"
"    sum = 0;\n"
"    for (unsigned int j = row_indices[i]; j < row_indices[i+1]; j++)\n"
"    {\n"
"      col = column_indices[j];\n"
"      if (i == col)\n"
"        diag = elements[j];\n"
"      else\n"
"        sum += elements[j] * old_result[col];\n"
"    }\n"
"    new_result[i] = weight * (rhs[i] - sum) / diag + (1 - weight) * old_result[i];\n"
"  }\n"
"}\n"
;

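// Plain CSR sparse matrix-vector product: each work-item accumulates one row at a time.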
const char * const compressed_matrix_align1_vec_mul =
"__kernel void vec_mul(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __global const float * vector,\n"
"  __global float * result,\n"
"  unsigned int size)\n"
"{\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float dot_prod = 0.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"      dot_prod += elements[i] * vector[column_indices[i]];\n"
"    result[row] = dot_prod;\n"
"  }\n"
"}\n"
;

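// Row scaling setup, variant 2: writes the reciprocal of each row's Euclidean (l2) norm to diag_M_inv.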
const char * const compressed_matrix_align1_row_scaling_2 =
"__kernel void row_scaling_2(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __global float * diag_M_inv,\n"
"  unsigned int size)\n"
"{\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float dot_prod = 0.0f;\n"
"    float temp = 0.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"    {\n"
"      temp = elements[i];\n"
"      dot_prod += temp * temp;\n"
"    }\n"
"    diag_M_inv[row] = 1.0f / sqrt(dot_prod);\n"
"  }\n"
"}\n"
;

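// Jacobi preconditioner setup: writes the reciprocal of each diagonal entry to diag_M_inv
// (rows without a stored diagonal fall back to 1.0f).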
const char * const compressed_matrix_align1_jacobi_precond =
"__kernel void jacobi_precond(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __global float * diag_M_inv,\n"
"  unsigned int size)\n"
"{\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float diag = 1.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"    {\n"
"      if (row == column_indices[i])\n"
"      {\n"
"        diag = elements[i];\n"
"        break;\n"
"      }\n"
"    }\n"
"    diag_M_inv[row] = 1.0f / diag;\n"
"  }\n"
"}\n"
;

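// Row scaling setup, variant 1: writes the reciprocal of each row's l1 norm to diag_M_inv.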
const char * const compressed_matrix_align1_row_scaling_1 =
"__kernel void row_scaling_1(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __global float * diag_M_inv,\n"
"  unsigned int size)\n"
"{\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float dot_prod = 0.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"      dot_prod += fabs(elements[i]);\n"
"    diag_M_inv[row] = 1.0f / dot_prod;\n"
"  }\n"
"}\n"
;

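// Forward substitution Ly = z for an ILU factor stored in CSR format. Rows are processed in blocks
// of get_global_size(0); a flag array in local memory records which rows of the current block are
// already resolved, so dependent rows wait until their predecessors have been substituted.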
const char * const compressed_matrix_align1_lu_forward =
"\n"
"// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
"__kernel void lu_forward(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __local int * buffer,\n"
"  __local float * vec_entries,   //a memory block from vector\n"
"  __global float * vector,\n"
"  unsigned int size)\n"
"{\n"
"  int waiting_for;   //block index that must be finished before the current thread can start\n"
"  unsigned int waiting_for_index;\n"
"  int block_offset;\n"
"  unsigned int col;\n"
"  unsigned int row;\n"
"  unsigned int row_index_end;\n"
"\n"
"  //forward substitution: one thread per row in blocks of get_global_size(0)\n"
"  for (unsigned int block_num = 0; block_num <= size / get_global_size(0); ++block_num)\n"
"  {\n"
"    block_offset = block_num * get_global_size(0);\n"
"    row = block_offset + get_global_id(0);\n"
"    buffer[get_global_id(0)] = 0;   //set flag to 'undone'\n"
"    waiting_for = -1;\n"
"    if (row < size)\n"
"    {\n"
"      vec_entries[get_global_id(0)] = vector[row];\n"
"      waiting_for_index = row_indices[row];\n"
"      row_index_end = row_indices[row+1];\n"
"    }\n"
"\n"
"    if (get_global_id(0) == 0)\n"
"      buffer[get_global_size(0)] = 1;\n"
"    //try to eliminate all lines in the block.\n"
"    //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
"    for (unsigned int k = 0; k < get_global_size(0); ++k)\n"
"    {\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      if (row < size)   //valid index?\n"
"      {\n"
"        if (waiting_for >= 0)\n"
"        {\n"
"          if (buffer[waiting_for] == 1)\n"
"            waiting_for = -1;\n"
"        }\n"
"\n"
"        if (waiting_for == -1)   //substitution not yet done, check whether possible\n"
"        {\n"
"          //check whether reduction is possible:\n"
"          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
"          {\n"
"            col = column_indices[j];\n"
"            if (col < block_offset)   //index valid, but not from current block\n"
"              vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
"            else if (col < row)   //index is from current block\n"
"            {\n"
"              if (buffer[col - block_offset] == 0)   //entry is not yet calculated\n"
"              {\n"
"                waiting_for = col - block_offset;\n"
"                waiting_for_index = j;\n"
"                break;\n"
"              }\n"
"              else   //updated entry is available in shared memory:\n"
"                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
"            }\n"
"          }\n"
"\n"
"          if (waiting_for == -1)   //this row is done\n"
"          {\n"
"            buffer[get_global_id(0)] = 1;\n"
"            waiting_for = -2;   //magic number: thread is finished\n"
"          }\n"
"        }\n"
"      } //row < size\n"
"      else\n"
"        buffer[get_global_id(0)] = 1;   //work done (because there is no work to be done at all...)\n"
"      ///////// check whether all threads are done. If yes, exit loop /////////////\n"
"\n"
"      if (buffer[get_global_id(0)] == 0)\n"
"        buffer[get_global_size(0)] = 0;\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"      if (buffer[get_global_size(0)] > 0)   //all threads break this loop simultaneously\n"
"        break;\n"
"      if (get_global_id(0) == 0)\n"
"        buffer[get_global_size(0)] = 1;\n"
"    } //for k\n"
"\n"
"    //write to vector:\n"
"    if (row < size)\n"
"      vector[row] = vec_entries[get_global_id(0)];\n"
"\n"
"    barrier(CLK_GLOBAL_MEM_FENCE);\n"
"  } //for block_num\n"
"}\n"
;

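// Fused second BiCGStab update: computes omega = <tmp1, s> / <tmp1, tmp1> (tmp1 presumably holding
// the product A*s), updates result, residual and p, and stores the new <residual, r0*> and the
// squared residual norm. Reductions use local memory and the loops stride by get_local_size(0).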
const char * const compressed_matrix_align1_bicgstab_kernel2 =
"void helper_bicgstab_kernel2_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_local_id(0) < stride)\n"
"      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
"  }\n"
"}\n"
"//////// inner products:\n"
"float bicgstab_kernel2_inner_prod(\n"
"  __global const float * vec1,\n"
"  __global const float * vec2,\n"
"  unsigned int size,\n"
"  __local float * tmp_buffer)\n"
"{\n"
"  float tmp = 0;\n"
"  unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
"  for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
"  {\n"
"    if (i < size)\n"
"      tmp += vec1[i] * vec2[i];\n"
"  }\n"
"  tmp_buffer[get_local_id(0)] = tmp;\n"
"\n"
"  helper_bicgstab_kernel2_parallel_reduction(tmp_buffer);\n"
"  barrier(CLK_LOCAL_MEM_FENCE);\n"
"  return tmp_buffer[0];\n"
"}\n"
"__kernel void bicgstab_kernel2(\n"
"  __global const float * tmp0,\n"
"  __global const float * tmp1,\n"
"  __global const float * r0star,\n"
"  __global const float * s,\n"
"  __global float * p,\n"
"  __global float * result,\n"
"  __global float * residual,\n"
"  __global const float * alpha,\n"
"  __global float * ip_rr0star,\n"
"  __global float * error_estimate,\n"
"  __local float * tmp_buffer,\n"
"  unsigned int size)\n"
"{\n"
"  float omega_local = bicgstab_kernel2_inner_prod(tmp1, s, size, tmp_buffer) / bicgstab_kernel2_inner_prod(tmp1, tmp1, size, tmp_buffer);\n"
"  float alpha_local = alpha[0];\n"
"\n"
"  //result += alpha * p + omega * s;\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    result[i] += alpha_local * p[i] + omega_local * s[i];\n"
"  //residual = s - omega * tmp1;\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    residual[i] = s[i] - omega_local * tmp1[i];\n"
"  //new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);\n"
"  float new_ip_rr0star = bicgstab_kernel2_inner_prod(residual, r0star, size, tmp_buffer);\n"
"  float beta = (new_ip_rr0star / ip_rr0star[0]) * (alpha_local / omega_local);\n"
"\n"
"  //p = residual + beta * (p - omega*tmp0);\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    p[i] = residual[i] + beta * (p[i] - omega_local * tmp0[i]);\n"
"  //compute norm of residual:\n"
"  float new_error_estimate = bicgstab_kernel2_inner_prod(residual, residual, size, tmp_buffer);\n"
"  barrier(CLK_GLOBAL_MEM_FENCE);\n"
"  //update values:\n"
"  if (get_global_id(0) == 0)\n"
"  {\n"
"    error_estimate[0] = new_error_estimate;\n"
"    ip_rr0star[0] = new_ip_rr0star;\n"
"  }\n"
"}\n"
;

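// Backward substitution Ux = y for an ILU factor stored in CSR format; mirrors lu_forward, but the
// blocks are traversed from last to first and each finished row is divided by its diagonal entry.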
const char * const compressed_matrix_align1_lu_backward =
"// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n"
"__kernel void lu_backward(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __local int * buffer,\n"
"  __local float * vec_entries,   //a memory block from vector\n"
"  __global float * vector,\n"
"  unsigned int size)\n"
"{\n"
"  int waiting_for;   //block index that must be finished before the current thread can start\n"
"  unsigned int waiting_for_index;\n"
"  unsigned int block_offset;\n"
"  unsigned int col;\n"
"  unsigned int row;\n"
"  unsigned int row_index_end;\n"
"  float diagonal_entry = 42;\n"
"\n"
"  //backward substitution: one thread per row in blocks of get_global_size(0)\n"
"  for (int block_num = size / get_global_size(0); block_num > -1; --block_num)\n"
"  {\n"
"    block_offset = block_num * get_global_size(0);\n"
"    row = block_offset + get_global_id(0);\n"
"    buffer[get_global_id(0)] = 0;   //set flag to 'undone'\n"
"    waiting_for = -1;\n"
"\n"
"    if (row < size)\n"
"    {\n"
"      vec_entries[get_global_id(0)] = vector[row];\n"
"      waiting_for_index = row_indices[row];\n"
"      row_index_end = row_indices[row+1];\n"
"      diagonal_entry = column_indices[waiting_for_index];\n"
"    }\n"
"\n"
"    if (get_global_id(0) == 0)\n"
"      buffer[get_global_size(0)] = 1;\n"
"    //try to eliminate all lines in the block.\n"
"    //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
"    for (unsigned int k = 0; k < get_global_size(0); ++k)\n"
"    {\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      if (row < size)   //valid index?\n"
"      {\n"
"        if (waiting_for >= 0)\n"
"        {\n"
"          if (buffer[waiting_for] == 1)\n"
"            waiting_for = -1;\n"
"        }\n"
"\n"
"        if (waiting_for == -1)   //substitution not yet done, check whether possible\n"
"        {\n"
"          //check whether reduction is possible:\n"
"          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
"          {\n"
"            col = column_indices[j];\n"
"            barrier(CLK_LOCAL_MEM_FENCE);\n"
"            if (col >= block_offset + get_global_size(0))   //index valid, but not from current block\n"
"              vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
"            else if (col > row)   //index is from current block\n"
"            {\n"
"              if (buffer[col - block_offset] == 0)   //entry is not yet calculated\n"
"              {\n"
"                waiting_for = col - block_offset;\n"
"                waiting_for_index = j;\n"
"                break;\n"
"              }\n"
"              else   //updated entry is available in shared memory:\n"
"                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
"            }\n"
"            else if (col == row)\n"
"              diagonal_entry = elements[j];\n"
"          }\n"
"\n"
"          if (waiting_for == -1)   //this row is done\n"
"          {\n"
"            if (row == 0)\n"
"              vec_entries[get_global_id(0)] /= elements[0];\n"
"            else\n"
"              vec_entries[get_global_id(0)] /= diagonal_entry;\n"
"            buffer[get_global_id(0)] = 1;\n"
"            waiting_for = -2;   //magic number: thread is finished\n"
"          }\n"
"        }\n"
"      } //row < size\n"
"      else\n"
"        buffer[get_global_id(0)] = 1;   //work done (because there is no work to be done at all...)\n"
"\n"
"      ///////// check whether all threads are done. If yes, exit loop /////////////\n"
"      if (buffer[get_global_id(0)] == 0)\n"
"        buffer[get_global_size(0)] = 0;\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"      if (buffer[get_global_size(0)] > 0)   //all threads break the loop simultaneously\n"
"        break;\n"
"      if (get_global_id(0) == 0)\n"
"        buffer[get_global_size(0)] = 1;\n"
"    } //for k\n"
"    if (row < size)\n"
"      vector[row] = vec_entries[get_global_id(0)];\n"
"      //vector[row] = diagonal_entry;\n"
"\n"
"    //if (row == 0)\n"
"      //vector[0] = diagonal_entry;\n"
"      //vector[0] = elements[0];\n"
"    barrier(CLK_GLOBAL_MEM_FENCE);\n"
"  } //for block_num\n"
"}\n"
;

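// First BiCGStab update: computes alpha = <r, r0*> / <tmp0, r0*> (tmp0 presumably holding the
// product A*p) and s = residual - alpha * tmp0.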
const char * const compressed_matrix_align1_bicgstab_kernel1 =
"void helper_bicgstab_kernel1_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_local_id(0) < stride)\n"
"      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
"  }\n"
"}\n"
"//////// inner products:\n"
"float bicgstab_kernel1_inner_prod(\n"
"  __global const float * vec1,\n"
"  __global const float * vec2,\n"
"  unsigned int size,\n"
"  __local float * tmp_buffer)\n"
"{\n"
"  float tmp = 0;\n"
"  unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
"  for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
"  {\n"
"    if (i < size)\n"
"      tmp += vec1[i] * vec2[i];\n"
"  }\n"
"  tmp_buffer[get_local_id(0)] = tmp;\n"
"\n"
"  helper_bicgstab_kernel1_parallel_reduction(tmp_buffer);\n"
"  barrier(CLK_LOCAL_MEM_FENCE);\n"
"  return tmp_buffer[0];\n"
"}\n"
"__kernel void bicgstab_kernel1(\n"
"  __global const float * tmp0,\n"
"  __global const float * r0star,\n"
"  __global const float * residual,\n"
"  __global float * s,\n"
"  __global float * alpha,\n"
"  __global const float * ip_rr0star,\n"
"  __local float * tmp_buffer,\n"
"  unsigned int size)\n"
"{\n"
"  float alpha_local = ip_rr0star[0] / bicgstab_kernel1_inner_prod(tmp0, r0star, size, tmp_buffer);\n"
"\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    s[i] = residual[i] - alpha_local * tmp0[i];\n"
"\n"
"  if (get_global_id(0) == 0)\n"
"    alpha[0] = alpha_local;\n"
"}\n"
;

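// Sparse matrix-vector product for CSR data padded to a multiple of 8 entries per row:
// column indices and nonzeros are loaded as uint8/float8 and combined via dot() on the .lo/.hi halves.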
const char * const compressed_matrix_align8_vec_mul =
"__kernel void vec_mul(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const uint8 * column_indices,\n"
"  __global const float8 * elements,\n"
"  __global const float * vector,\n"
"  __global float * result,\n"
"  unsigned int size)\n"
"{\n"
"  float dot_prod;\n"
"  unsigned int start, next_stop;\n"
"  uint8 col_idx;\n"
"  float8 tmp_vec;\n"
"  float8 tmp_entries;\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    dot_prod = 0.0f;\n"
"    start = row_indices[row] / 8;\n"
"    next_stop = row_indices[row+1] / 8;\n"
"    for (unsigned int i = start; i < next_stop; ++i)\n"
"    {\n"
"      col_idx = column_indices[i];\n"
"      tmp_entries = elements[i];\n"
"      tmp_vec.s0 = vector[col_idx.s0];\n"
"      tmp_vec.s1 = vector[col_idx.s1];\n"
"      tmp_vec.s2 = vector[col_idx.s2];\n"
"      tmp_vec.s3 = vector[col_idx.s3];\n"
"      tmp_vec.s4 = vector[col_idx.s4];\n"
"      tmp_vec.s5 = vector[col_idx.s5];\n"
"      tmp_vec.s6 = vector[col_idx.s6];\n"
"      tmp_vec.s7 = vector[col_idx.s7];\n"
"      dot_prod += dot(tmp_entries.lo, tmp_vec.lo);\n"
"      dot_prod += dot(tmp_entries.hi, tmp_vec.hi);\n"
"    }\n"
"    result[row] = dot_prod;\n"
"  }\n"
"}\n"
;

}
}
}
#endif