1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SLICED_ELL_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_SLICED_ELL_MATRIX_HPP
/** @brief Appends the OpenCL source of the sliced-ELL matrix-vector product kernel `vec_mul` to a string.
 *
 * The generated kernel walks the matrix block by block. Each group of `block_size`
 * consecutive work-items handles one block of `block_size` rows; work-item
 * `id_in_block` accumulates the row `block_idx * block_size + id_in_block`.
 * For every block, `block_start[block_idx]` is the offset of its first entry and
 * `columns_per_block[block_idx]` the number of entries stored per row. Entries whose
 * value is zero are skipped without reading `x`.
 *
 * `layout_x.x` / `layout_result.x` are used as start offsets, `.y` as strides, and
 * `layout_result.z` as the number of result rows.
 *
 * The block count is guarded so that a result with zero rows yields zero blocks:
 * the unguarded unsigned expression `(0 - 1) / block_size + 1` wraps around to a
 * huge value and would make the kernel read `block_start` far out of bounds.
 *
 * NOTE(review): the kernel divides by `block_size` and uses
 * `get_local_size(0) / block_size` as loop stride factor; if the local work size is
 * smaller than `block_size` that factor is 0 and the block loop cannot advance.
 * The launch site is presumably responsible for a suitable work size - confirm.
 *
 * @tparam StringT        String-like type providing `append()` for C strings and std::string.
 * @param source          Destination the kernel text is appended to (existing content is kept).
 * @param numeric_string  OpenCL scalar type name spliced into the kernel, e.g. "float".
 */
template<typename StringT>
void generate_sliced_ell_vec_mul(StringT & source, std::string const & numeric_string)
{
  // Kernel signature
  source.append("__kernel void vec_mul( \n");
  source.append(" __global const unsigned int * columns_per_block, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * block_start, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append(" uint4 layout_x, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" uint4 layout_result, \n");
  source.append(" unsigned int block_size) \n");
  source.append("{ \n");

  // Work distribution: one group of block_size work-items per matrix block
  source.append(" uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
  source.append(" uint id_in_block = get_local_id(0) % block_size; \n");
  // Guard against unsigned wrap-around of (z - 1) when the result has no rows.
  source.append(" uint num_blocks = (layout_result.z > 0) ? ((layout_result.z - 1) / block_size + 1) : 0; \n");
  source.append(" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n");
  source.append(" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");

  // Loop over the blocks assigned to this group of work-items
  source.append(" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint row = block_idx * block_size + id_in_block; \n");
  source.append(" uint offset = block_start[block_idx]; \n");
  source.append(" uint num_columns = columns_per_block[block_idx]; \n");
  source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
  source.append(" uint index = offset + item_id * block_size + id_in_block; \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n");
  // Zero-valued entries contribute nothing and x is not read for them.
  source.append(" sum += val ? (x[column_indices[index] * layout_x.y + layout_x.x] * val) : 0; \n");
  source.append(" } \n");

  // Rows past the end of the result (last, partially filled block) are not written.
  source.append(" if (row < layout_result.z) \n");
  source.append(" result[row * layout_result.y + layout_result.x] = sum; \n");
  source.append(" } \n");
  source.append("} \n");
}
// NOTE(review): the lines below are fragments of the kernel class and its init()
// routine. The leading integers appear to be line numbers of the original header,
// and the declarations introduced by these two template headers are not visible
// here - verify against the original file before relying on these notes.
84 template<
typename NumericT,
typename IndexT>
// Second template header with NumericT only; presumably a specialization - TODO confirm.
87 template<
typename NumericT>
// Per-context flag map keyed by the raw cl_context handle; the entry for the
// current context is set to true at the end of this fragment.
97 static std::map<cl_context, bool> init_done;
// Reserve capacity of 1024 in the program source buffer before kernels are appended.
104 source.reserve(1024);
// Library helper that receives the context and the source buffer; presumably
// appends the pragma needed for double precision with NumericT - TODO confirm.
106 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
// Name under which the program is registered with the context below.
111 std::string prog_name = program_name();
// Diagnostic output, compiled in only when VIENNACL_BUILD_INFO is defined.
// NOTE(review): the matching #endif is not visible in this listing.
112 #ifdef VIENNACL_BUILD_INFO
113 std::cout <<
"Creating program " << prog_name << std::endl;
// Hand the assembled source to the context under prog_name.
115 ctx.add_program(source, prog_name);
// Record that this context has been handled.
116 init_done[ctx.handle().get()] =
true;
void generate_sliced_ell_vec_mul(StringT &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
static std::string program_name()
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities.
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Common implementations shared by OpenCL-based operations.
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const
Main kernel class for generating OpenCL kernels for sliced_ell_matrix.
Representation of an OpenCL kernel in ViennaCL.
Helper class for converting a type to its string representation.