1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
41 template<
typename StringT>
44 source.append(
"__kernel void vec_mul( \n");
45 source.append(
" __global const uint2 * coords, \n");
46 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
47 source.append(
" __global const uint * group_boundaries, \n");
48 source.append(
" __global const "); source.append(numeric_string); source.append(
" * x, \n");
49 source.append(
" uint4 layout_x, \n");
50 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
51 source.append(
" uint4 layout_result, \n");
52 source.append(
" __local unsigned int * shared_rows, \n");
53 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results) \n");
54 source.append(
"{ \n");
55 source.append(
" uint2 tmp; \n");
56 source.append(
" "); source.append(numeric_string); source.append(
" val; \n");
57 source.append(
" uint group_start = group_boundaries[get_group_id(0)]; \n");
58 source.append(
" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
59 source.append(
" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");
61 source.append(
" uint local_index = 0; \n");
63 source.append(
" for (uint k = 0; k < k_end; ++k) { \n");
64 source.append(
" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
66 source.append(
" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
67 source.append(
" val = (local_index < group_end) ? elements[local_index] * x[tmp.y * layout_x.y + layout_x.x] : 0; \n");
70 source.append(
" if (get_local_id(0) == 0 && k > 0) { \n");
71 source.append(
" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
72 source.append(
" val += inter_results[get_local_size(0)-1]; \n");
73 source.append(
" else \n");
74 source.append(
" result[shared_rows[get_local_size(0)-1] * layout_result.y + layout_result.x] = inter_results[get_local_size(0)-1]; \n");
75 source.append(
" } \n");
78 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
79 source.append(
" shared_rows[get_local_id(0)] = tmp.x; \n");
80 source.append(
" inter_results[get_local_id(0)] = val; \n");
81 source.append(
" "); source.append(numeric_string); source.append(
" left = 0; \n");
82 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
84 source.append(
" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
85 source.append(
" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
86 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
87 source.append(
" inter_results[get_local_id(0)] += left; \n");
88 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
89 source.append(
" } \n");
92 source.append(
" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
93 source.append(
" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
94 source.append(
" result[tmp.x * layout_result.y + layout_result.x] = inter_results[get_local_id(0)]; \n");
95 source.append(
" } \n");
97 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
98 source.append(
" } \n");
100 source.append(
" if (local_index + 1 == group_end) \n");
101 source.append(
" result[tmp.x * layout_result.y + layout_result.x] = inter_results[get_local_id(0)]; \n");
102 source.append(
"} \n");
109 template<
typename StringT>
111 bool B_transposed,
bool B_row_major,
bool C_row_major)
113 source.append(
"__kernel void ");
115 source.append(
"( \n");
116 source.append(
" __global const uint2 * coords, \n");
117 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
118 source.append(
" __global const uint * group_boundaries, \n");
119 source.append(
" __global const "); source.append(numeric_string); source.append(
" * d_mat, \n");
120 source.append(
" unsigned int d_mat_row_start, \n");
121 source.append(
" unsigned int d_mat_col_start, \n");
122 source.append(
" unsigned int d_mat_row_inc, \n");
123 source.append(
" unsigned int d_mat_col_inc, \n");
124 source.append(
" unsigned int d_mat_row_size, \n");
125 source.append(
" unsigned int d_mat_col_size, \n");
126 source.append(
" unsigned int d_mat_internal_rows, \n");
127 source.append(
" unsigned int d_mat_internal_cols, \n");
128 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
129 source.append(
" unsigned int result_row_start, \n");
130 source.append(
" unsigned int result_col_start, \n");
131 source.append(
" unsigned int result_row_inc, \n");
132 source.append(
" unsigned int result_col_inc, \n");
133 source.append(
" unsigned int result_row_size, \n");
134 source.append(
" unsigned int result_col_size, \n");
135 source.append(
" unsigned int result_internal_rows, \n");
136 source.append(
" unsigned int result_internal_cols, \n");
137 source.append(
" __local unsigned int * shared_rows, \n");
138 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results) \n");
139 source.append(
"{ \n");
140 source.append(
" uint2 tmp; \n");
141 source.append(
" "); source.append(numeric_string); source.append(
" val; \n");
142 source.append(
" uint group_start = group_boundaries[get_group_id(0)]; \n");
143 source.append(
" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
144 source.append(
" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");
146 source.append(
" uint local_index = 0; \n");
148 source.append(
" for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
149 source.append(
" for (uint k = 0; k < k_end; ++k) { \n");
150 source.append(
" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
152 source.append(
" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
153 if (B_transposed && B_row_major)
154 source.append(
" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + tmp.y * d_mat_col_inc ] : 0; \n");
155 if (B_transposed && !B_row_major)
156 source.append(
" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) + (d_mat_col_start + tmp.y * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
157 else if (!B_transposed && B_row_major)
158 source.append(
" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + tmp.y * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] : 0; \n");
160 source.append(
" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + tmp.y * d_mat_row_inc) + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
163 source.append(
" if (get_local_id(0) == 0 && k > 0) { \n");
164 source.append(
" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
165 source.append(
" val += inter_results[get_local_size(0)-1]; \n");
166 source.append(
" else \n");
168 source.append(
" result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_size(0)-1]; \n");
170 source.append(
" result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_size(0)-1]; \n");
171 source.append(
" } \n");
174 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
175 source.append(
" shared_rows[get_local_id(0)] = tmp.x; \n");
176 source.append(
" inter_results[get_local_id(0)] = val; \n");
177 source.append(
" "); source.append(numeric_string); source.append(
" left = 0; \n");
178 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
180 source.append(
" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
181 source.append(
" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
182 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
183 source.append(
" inter_results[get_local_id(0)] += left; \n");
184 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
185 source.append(
" } \n");
188 source.append(
" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
189 source.append(
" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
191 source.append(
" result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
193 source.append(
" result[(tmp.x * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
194 source.append(
" } \n");
196 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
197 source.append(
" } \n");
199 source.append(
" if (local_index + 1 == group_end) \n");
201 source.append(
" result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
203 source.append(
" result[(tmp.x * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
204 source.append(
" } \n");
205 source.append(
"} \n");
210 template<
typename StringT>
224 template<
typename StringT>
227 source.append(
"__kernel void row_info_extractor( \n");
228 source.append(
" __global const uint2 * coords, \n");
229 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
230 source.append(
" __global const uint * group_boundaries, \n");
231 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
232 source.append(
" unsigned int option, \n");
233 source.append(
" __local unsigned int * shared_rows, \n");
234 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results) \n");
235 source.append(
"{ \n");
236 source.append(
" uint2 tmp; \n");
237 source.append(
" "); source.append(numeric_string); source.append(
" val; \n");
238 source.append(
" uint last_index = get_local_size(0) - 1; \n");
239 source.append(
" uint group_start = group_boundaries[get_group_id(0)]; \n");
240 source.append(
" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
241 source.append(
" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : ("); source.append(numeric_string); source.append(
")0; \n");
243 source.append(
" uint local_index = 0; \n");
245 source.append(
" for (uint k = 0; k < k_end; ++k) \n");
246 source.append(
" { \n");
247 source.append(
" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
249 source.append(
" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
250 source.append(
" val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0; \n");
253 source.append(
" if (get_local_id(0) == 0 && k > 0) \n");
254 source.append(
" { \n");
255 source.append(
" if (tmp.x == shared_rows[last_index]) \n");
256 source.append(
" { \n");
257 source.append(
" switch (option) \n");
258 source.append(
" { \n");
259 source.append(
" case 0: \n");
260 source.append(
" case 3: \n");
261 source.append(
" val = max(val, fabs(inter_results[last_index])); \n");
262 source.append(
" break; \n");
264 source.append(
" case 1: \n");
265 source.append(
" val = fabs(val) + inter_results[last_index]; \n");
266 source.append(
" break; \n");
268 source.append(
" case 2: \n");
269 source.append(
" val = sqrt(val * val + inter_results[last_index]); \n");
270 source.append(
" break; \n");
272 source.append(
" default: \n");
273 source.append(
" break; \n");
274 source.append(
" } \n");
275 source.append(
" } \n");
276 source.append(
" else \n");
277 source.append(
" { \n");
278 source.append(
" switch (option) \n");
279 source.append(
" { \n");
280 source.append(
" case 0: \n");
281 source.append(
" case 1: \n");
282 source.append(
" case 3: \n");
283 source.append(
" result[shared_rows[last_index]] = inter_results[last_index]; \n");
284 source.append(
" break; \n");
286 source.append(
" case 2: \n");
287 source.append(
" result[shared_rows[last_index]] = sqrt(inter_results[last_index]); \n");
288 source.append(
" default: \n");
289 source.append(
" break; \n");
290 source.append(
" } \n");
291 source.append(
" } \n");
292 source.append(
" } \n");
295 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
296 source.append(
" shared_rows[get_local_id(0)] = tmp.x; \n");
297 source.append(
" switch (option) \n");
298 source.append(
" { \n");
299 source.append(
" case 0: \n");
300 source.append(
" case 3: \n");
301 source.append(
" inter_results[get_local_id(0)] = val; \n");
302 source.append(
" break; \n");
303 source.append(
" case 1: \n");
304 source.append(
" inter_results[get_local_id(0)] = fabs(val); \n");
305 source.append(
" break; \n");
306 source.append(
" case 2: \n");
307 source.append(
" inter_results[get_local_id(0)] = val * val; \n");
308 source.append(
" default: \n");
309 source.append(
" break; \n");
310 source.append(
" } \n");
311 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
313 source.append(
" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) \n");
314 source.append(
" { \n");
315 source.append(
" "); source.append(numeric_string); source.append(
" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : ("); source.append(numeric_string); source.append(
")0; \n");
316 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
317 source.append(
" switch (option) \n");
318 source.append(
" { \n");
319 source.append(
" case 0: \n");
320 source.append(
" case 3: \n");
321 source.append(
" inter_results[get_local_id(0)] = max(inter_results[get_local_id(0)], left); \n");
322 source.append(
" break; \n");
324 source.append(
" case 1: \n");
325 source.append(
" inter_results[get_local_id(0)] += left; \n");
326 source.append(
" break; \n");
328 source.append(
" case 2: \n");
329 source.append(
" inter_results[get_local_id(0)] += left; \n");
330 source.append(
" break; \n");
332 source.append(
" default: \n");
333 source.append(
" break; \n");
334 source.append(
" } \n");
335 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
336 source.append(
" } \n");
339 source.append(
" if (get_local_id(0) != last_index && \n");
340 source.append(
" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] && \n");
341 source.append(
" inter_results[get_local_id(0)] != 0) \n");
342 source.append(
" { \n");
343 source.append(
" result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
344 source.append(
" } \n");
346 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
347 source.append(
" } \n");
349 source.append(
" if (local_index + 1 == group_end && inter_results[get_local_id(0)] != 0) \n");
350 source.append(
" result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
351 source.append(
"} \n");
358 template<
typename NumericT>
368 static std::map<cl_context, bool> init_done;
375 source.reserve(1024);
377 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
384 #ifdef VIENNACL_BUILD_INFO
385 std::cout <<
"Creating program " << prog_name << std::endl;
387 ctx.add_program(source, prog_name);
388 init_done[ctx.handle().get()] =
true;
static void init(viennacl::ocl::context &ctx)
std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices...
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Main kernel class for generating OpenCL kernels for coordinate_matrix.
Provides OpenCL-related utilities.
void generate_coordinate_matrix_dense_matrix_mul(StringT &source, std::string const &numeric_string, bool B_transposed, bool B_row_major, bool C_row_major)
Generate kernel for C = A * B with A being a compressed_matrix, B and C dense.
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Common implementations shared by OpenCL-based operations.
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const
static std::string program_name()
void generate_coordinate_matrix_dense_matrix_multiplication(StringT &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_coordinate_matrix_vec_mul(StringT &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.
void generate_coordinate_matrix_row_info_extractor(StringT &source, std::string const &numeric_string)