ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
reduction_template.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_REDUCTION_TEMPLATE_HPP
2 #define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_REDUCTION_TEMPLATE_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2015, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 
27 #include <vector>
28 
30 
34 
37 
38 #include "viennacl/tools/tools.hpp"
39 
40 namespace viennacl
41 {
42 namespace device_specific
43 {
44 
46 {
47  reduction_parameters(unsigned int _simd_width,
48  unsigned int _group_size, unsigned int _num_groups,
49  fetching_policy_type _fetching_policy) : template_base::parameters_type(_simd_width, _group_size, 1, 2), num_groups(_num_groups), fetching_policy(_fetching_policy){ }
50 
51  unsigned int num_groups;
53 };
54 
55 class reduction_template : public template_base_impl<reduction_template, reduction_parameters>
56 {
57 
58 private:
59  unsigned int n_lmem_elements() const
60  {
61  return p_.local_size_0;
62  }
63 
64  int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const
65  {
66  if (p_.fetching_policy==FETCH_FROM_LOCAL)
67  return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
68  return TEMPLATE_VALID;
69  }
70 
71  inline void reduce_1d_local_memory(utils::kernel_generation_stream & stream, unsigned int size, std::vector<mapped_scalar_reduction*> exprs,
72  std::string const & buf_str, std::string const & buf_value_str) const
73  {
74  stream << "#pragma unroll" << std::endl;
75  stream << "for(unsigned int stride = " << size/2 << "; stride >0; stride /=2)" << std::endl;
76  stream << "{" << std::endl;
77  stream.inc_tab();
78  stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
79  stream << "if (lid < stride)" << std::endl;
80  stream << "{" << std::endl;
81  stream.inc_tab();
82 
83  for (unsigned int k = 0; k < exprs.size(); k++)
84  if (exprs[k]->is_index_reduction())
85  compute_index_reduction(stream, exprs[k]->process(buf_str+"[lid]"), exprs[k]->process(buf_str+"[lid+stride]")
86  , exprs[k]->process(buf_value_str+"[lid]"), exprs[k]->process(buf_value_str+"[lid+stride]"),
87  exprs[k]->root_op());
88  else
89  compute_reduction(stream, exprs[k]->process(buf_str+"[lid]"), exprs[k]->process(buf_str+"[lid+stride]"), exprs[k]->root_op());
90  stream.dec_tab();
91  stream << "}" << std::endl;
92  stream.dec_tab();
93  stream << "}" << std::endl;
94  }
95 
96  std::string generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings, unsigned int simd_width) const
97  {
99 
100  std::vector<mapped_scalar_reduction*> exprs;
101  for (std::vector<mapping_type>::const_iterator it = mappings.begin(); it != mappings.end(); ++it)
102  for (mapping_type::const_iterator iit = it->begin(); iit != it->end(); ++iit)
103  if (mapped_scalar_reduction * p = dynamic_cast<mapped_scalar_reduction*>(iit->second.get()))
104  exprs.push_back(p);
105  vcl_size_t N = exprs.size();
106 
107  std::string arguments = generate_value_kernel_argument("unsigned int", "N");
108  for (unsigned int k = 0; k < N; ++k)
109  {
110  std::string numeric_type = utils::numeric_type_to_string(lhs_most(exprs[k]->statement().array(),
111  exprs[k]->statement().root()).lhs.numeric_type);
112  if (exprs[k]->is_index_reduction())
113  {
114  arguments += generate_pointer_kernel_argument("__global", "unsigned int", exprs[k]->process("#name_temp"));
115  arguments += generate_pointer_kernel_argument("__global", numeric_type, exprs[k]->process("#name_temp_value"));
116  }
117  else
118  arguments += generate_pointer_kernel_argument("__global", numeric_type, exprs[k]->process("#name_temp"));
119  }
120 
121 
122  /* ------------------------
123  * First Kernel
124  * -----------------------*/
125  stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
126  generate_prototype(stream, kernel_prefix + "_0", arguments, mappings, statements);
127  stream << "{" << std::endl;
128  stream.inc_tab();
129 
130  tree_parsing::process(stream, PARENT_NODE_TYPE, "scalar", "#scalartype #namereg = *#pointer;", statements, mappings);
131  stream << "unsigned int lid = get_local_id(0);" << std::endl;
132  tree_parsing::process(stream, PARENT_NODE_TYPE, "vector", "#pointer += #start;", statements, mappings);
133 
134  for (unsigned int k = 0; k < N; ++k)
135  {
136  if (exprs[k]->is_index_reduction())
137  {
138  stream << exprs[k]->process("__local #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
139  stream << exprs[k]->process("#scalartype #name_acc_value = " + neutral_element(exprs[k]->root_op()) + ";") << std::endl;
140  stream << exprs[k]->process("__local unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
141  stream << exprs[k]->process("unsigned int #name_acc = 0;") << std::endl;
142  }
143  else
144  {
145  stream << exprs[k]->process("__local #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
146  stream << exprs[k]->process("#scalartype #name_acc = " + neutral_element(exprs[k]->root_op()) + ";") << std::endl;
147  }
148  }
149 
150  class loop_body : public loop_body_base
151  {
152  public:
153  loop_body(std::vector<mapped_scalar_reduction*> const & exprs_) : exprs(exprs_){ }
154 
155  void operator()(utils::kernel_generation_stream & kernel_stream, unsigned int loop_simd_width) const
156  {
157  std::string i = (loop_simd_width==1)?"i*#stride":"i";
158  std::string process_str;
159  //Fetch vector entry
160  {
161  std::set<std::string> already_fetched;
162  process_str = utils::append_width("#scalartype",loop_simd_width) + " #namereg = " + vload(loop_simd_width,i,"#pointer")+";";
163  for (std::vector<mapped_scalar_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
164  {
165  (*it)->process_recursive(kernel_stream, PARENT_NODE_TYPE, "vector", process_str, already_fetched);
166  (*it)->process_recursive(kernel_stream, PARENT_NODE_TYPE, "matrix_row", "#scalartype #namereg = #pointer[$OFFSET{#row*#stride1, i*#stride2}];", already_fetched);
167  (*it)->process_recursive(kernel_stream, PARENT_NODE_TYPE, "matrix_column", "#scalartype #namereg = #pointer[$OFFSET{i*#stride1,#column*#stride2}];", already_fetched);
168  (*it)->process_recursive(kernel_stream, PARENT_NODE_TYPE, "matrix_diag", "#scalartype #namereg = #pointer[#diag_offset<0?$OFFSET{(i - #diag_offset)*#stride1, i*#stride2}:$OFFSET{i*#stride1, (i + #diag_offset)*#stride2}];", already_fetched);
169  }
170  }
171 
172  //Update accumulators
173  std::vector<std::string> str(loop_simd_width);
174  if (loop_simd_width==1)
175  str[0] = "#namereg";
176  else
177  for (unsigned int a = 0; a < loop_simd_width; ++a)
178  str[a] = append_simd_suffix("#namereg.s", a);
179 
180  for (unsigned int k = 0; k < exprs.size(); ++k)
181  {
182  for (unsigned int a = 0; a < loop_simd_width; ++a)
183  {
184  std::map<std::string, std::string> accessors;
185  accessors["vector"] = str[a];
186  accessors["matrix_row"] = str[a];
187  accessors["matrix_column"] = str[a];
188  accessors["matrix_diag"] = str[a];
189  accessors["scalar"] = "#namereg";
190  std::string value = exprs[k]->evaluate_recursive(LHS_NODE_TYPE, accessors);
191  if (exprs[k]->root_node().op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
192  value+= "*" + exprs[k]->evaluate_recursive(RHS_NODE_TYPE, accessors);
193 
194  if (exprs[k]->is_index_reduction())
195  compute_index_reduction(kernel_stream, exprs[k]->process("#name_acc"), "i*"+tools::to_string(loop_simd_width) + "+" + tools::to_string(a), exprs[k]->process("#name_acc_value"), value,exprs[k]->root_op());
196  else
197  compute_reduction(kernel_stream, exprs[k]->process("#name_acc"), value,exprs[k]->root_op());
198  }
199  }
200  }
201 
202  private:
203  std::vector<mapped_scalar_reduction*> exprs;
204  };
205 
206  element_wise_loop_1D(stream, loop_body(exprs), p_.fetching_policy, simd_width, "i", "N", "get_global_id(0)", "get_global_size(0)");
207 
208  //Fills local memory
209  for (unsigned int k = 0; k < N; ++k)
210  {
211  if (exprs[k]->is_index_reduction())
212  stream << exprs[k]->process("#name_buf_value[lid] = #name_acc_value;") << std::endl;
213  stream << exprs[k]->process("#name_buf[lid] = #name_acc;") << std::endl;
214  }
215 
216  //Reduce local memory
217  reduce_1d_local_memory(stream, p_.local_size_0, exprs, "#name_buf", "#name_buf_value");
218 
219  //Write to temporary buffers
220  stream << "if (lid==0)" << std::endl;
221  stream << "{" << std::endl;
222  stream.inc_tab();
223  for (unsigned int k = 0; k < N; ++k)
224  {
225  if (exprs[k]->is_index_reduction())
226  stream << exprs[k]->process("#name_temp_value[get_group_id(0)] = #name_buf_value[0];") << std::endl;
227  stream << exprs[k]->process("#name_temp[get_group_id(0)] = #name_buf[0];") << std::endl;
228  }
229  stream.dec_tab();
230  stream << "}" << std::endl;
231 
232  stream.dec_tab();
233  stream << "}" << std::endl;
234 
235  /* ------------------------
236  * Second kernel
237  * -----------------------*/
238  stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
239  generate_prototype(stream, kernel_prefix + "_1", arguments, mappings, statements);
240  stream << "{" << std::endl;
241  stream.inc_tab();
242 
243  stream << "unsigned int lid = get_local_id(0);" << std::endl;
244 
245  for (unsigned int k = 0; k < N; ++k)
246  {
247  if (exprs[k]->is_index_reduction())
248  {
249  stream << exprs[k]->process("__local unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];");
250  stream << exprs[k]->process("unsigned int #name_acc = 0;") << std::endl;
251  stream << exprs[k]->process("__local #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
252  stream << exprs[k]->process("#scalartype #name_acc_value = " + neutral_element(exprs[k]->root_op()) + ";");
253  }
254  else
255  {
256  stream << exprs[k]->process("__local #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
257  stream << exprs[k]->process("#scalartype #name_acc = " + neutral_element(exprs[k]->root_op()) + ";");
258  }
259  }
260 
261  stream << "for(unsigned int i = lid; i < " << p_.num_groups << "; i += get_local_size(0))" << std::endl;
262  stream << "{" << std::endl;
263  stream.inc_tab();
264  for (unsigned int k = 0; k < N; ++k)
265  if (exprs[k]->is_index_reduction())
266  compute_index_reduction(stream, exprs[k]->process("#name_acc"), exprs[k]->process("#name_temp[i]"),
267  exprs[k]->process("#name_acc_value"),exprs[k]->process("#name_temp_value[i]"),exprs[k]->root_op());
268  else
269  compute_reduction(stream, exprs[k]->process("#name_acc"), exprs[k]->process("#name_temp[i]"), exprs[k]->root_op());
270 
271  stream.dec_tab();
272  stream << "}" << std::endl;
273 
274  for (unsigned int k = 0; k < N; ++k)
275  {
276  if (exprs[k]->is_index_reduction())
277  stream << exprs[k]->process("#name_buf_value[lid] = #name_acc_value;") << std::endl;
278  stream << exprs[k]->process("#name_buf[lid] = #name_acc;") << std::endl;
279  }
280 
281 
282  //Reduce and write final result
283  reduce_1d_local_memory(stream, p_.local_size_0, exprs, "#name_buf", "#name_buf_value");
284 
285  stream << "if (lid==0)" << std::endl;
286  stream << "{" << std::endl;
287  stream.inc_tab();
288  std::map<std::string, std::string> accessors;
289  accessors["scalar_reduction"] = "#name_buf[0]";
290  accessors["scalar"] = "*#pointer";
291  accessors["vector"] = "#pointer[#start]";
292  tree_parsing::evaluate(stream, PARENT_NODE_TYPE, accessors, statements, mappings);
293  stream.dec_tab();
294  stream << "}" << std::endl;
295 
296  stream.dec_tab();
297  stream << "}" << std::endl;
298 
299  return stream.str();
300  }
301 
302  std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
303  {
304  std::vector<std::string> result;
305  result.push_back(generate_impl(kernel_prefix + "_strided", statements, mappings, 1));
306  result.push_back(generate_impl(kernel_prefix, statements, mappings, p_.simd_width));
307  return result;
308  }
309 public:
311 
312  void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
313  {
314  std::vector<scheduler::statement_node const *> reductions;
315  cl_uint size = 0;
316  for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
317  {
318  std::vector<vcl_size_t> reductions_idx;
319  tree_parsing::traverse(*it, it->root(), tree_parsing::filter(&utils::is_reduction, reductions_idx), false);
320  size = static_cast<cl_uint>(vector_size(lhs_most(it->array(), reductions_idx[0]), false));
321  for (std::vector<vcl_size_t>::iterator itt = reductions_idx.begin(); itt != reductions_idx.end(); ++itt)
322  reductions.push_back(&it->array()[*itt]);
323  }
324 
325  scheduler::statement const & statement = statements.data().front();
326  unsigned int scalartype_size = utils::size_of(lhs_most(statement.array(), statement.root()).lhs.numeric_type);
327 
328  viennacl::ocl::kernel * kernels[2];
329  if (has_strided_access(statements) && p_.simd_width > 1)
330  {
331  kernels[0] = &programs[0].program().get_kernel(kernel_prefix+"_strided_0");
332  kernels[1] = &programs[0].program().get_kernel(kernel_prefix+"_strided_1");
333  }
334  else
335  {
336  kernels[0] = &programs[1].program().get_kernel(kernel_prefix+"_0");
337  kernels[1] = &programs[1].program().get_kernel(kernel_prefix+"_1");
338  }
339 
340  kernels[0]->local_work_size(0, p_.local_size_0);
341  kernels[0]->global_work_size(0,p_.local_size_0*p_.num_groups);
342 
343  kernels[1]->local_work_size(0, p_.local_size_0);
344  kernels[1]->global_work_size(0,p_.local_size_0);
345 
346  for (unsigned int k = 0; k < 2; k++)
347  {
348  unsigned int n_arg = 0;
349  kernels[k]->arg(n_arg++, size);
350  unsigned int i = 0;
351  unsigned int j = 0;
352  for (std::vector<scheduler::statement_node const *>::const_iterator it = reductions.begin(); it != reductions.end(); ++it)
353  {
354  if (utils::is_index_reduction((*it)->op))
355  {
356  if (tmpidx_.size() <= j)
357  tmpidx_.push_back(kernels[k]->context().create_memory(CL_MEM_READ_WRITE, p_.num_groups*4));
358  kernels[k]->arg(n_arg++, tmpidx_[j]);
359  j++;
360  }
361 
362  if (tmp_.size() <= i)
363  tmp_.push_back(kernels[k]->context().create_memory(CL_MEM_READ_WRITE, p_.num_groups*scalartype_size));
364  kernels[k]->arg(n_arg++, tmp_[i]);
365  i++;
366  }
367  set_arguments(statements, *kernels[k], n_arg);
368  }
369 
370  for (unsigned int k = 0; k < 2; k++)
371  viennacl::ocl::enqueue(*kernels[k]);
372 
373  }
374 
375 private:
376  std::vector< viennacl::ocl::handle<cl_mem> > tmp_;
377  std::vector< viennacl::ocl::handle<cl_mem> > tmpidx_;
378 };
379 
380 }
381 }
382 
383 #endif
void set_arguments(statements_container const &statements, viennacl::ocl::kernel &kernel, unsigned int &current_arg)
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:58
Various little tools used here and there in ViennaCL.
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
Definition: kernel.hpp:742
void traverse(scheduler::statement const &statement, vcl_size_t root_idx, Fun const &fun, bool inspect)
Recursively execute a functor on a statement.
void enqueue(std::string const &kernel_prefix, std::vector< lazy_program_compiler > &programs, statements_container const &statements)
A class representing a compute device (e.g. a GPU)
Definition: device.hpp:49
container_type const & array() const
Definition: forwards.h:528
std::list< scheduler::statement > const & data() const
Definition: forwards.h:282
static std::string append_simd_suffix(std::string const &str, unsigned int i)
scheduler::statement_node const & lhs_most(scheduler::statement::container_type const &array, vcl_size_t root)
Definition: forwards.h:87
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:34
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:235
static bool has_strided_access(statements_container const &statements)
reduction_template(reduction_template::parameters_type const &parameters, binding_policy_t binding_policy=BIND_ALL_UNIQUE)
std::string evaluate(leaf_t leaf, std::map< std::string, std::string > const &accessors, scheduler::statement const &statement, vcl_size_t root_idx, mapping_type const &mapping)
bool is_reduction(scheduler::statement_node const &node)
Definition: utils.hpp:361
std::size_t vcl_size_t
Definition: forwards.h:75
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
static vcl_size_t vector_size(scheduler::statement_node const &node, bool up_to_internal_size)
static void generate_prototype(utils::kernel_generation_stream &stream, std::string const &name, std::string const &first_arguments, std::vector< mapping_type > const &mappings, statements_container const &statements, std::map< std::string, unsigned int > const &widths)
void compute_reduction(utils::kernel_generation_stream &os, std::string acc, std::string cur, scheduler::op_element const &op)
Definition: utils.hpp:40
Code for parsing the expression trees.
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
Definition: enqueue.hpp:50
Internal utils.
viennacl::ocl::context const & context() const
Definition: kernel.hpp:788
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:751
bool is_index_reduction(scheduler::op_element const &op)
Definition: utils.hpp:370
void compute_index_reduction(utils::kernel_generation_stream &os, std::string acc, std::string cur, std::string const &acc_value, std::string const &cur_value, scheduler::op_element const &op)
Definition: utils.hpp:48
static void element_wise_loop_1D(utils::kernel_generation_stream &stream, loop_body_base const &loop_body, fetching_policy_type fetch, unsigned int simd_width, std::string const &i, std::string const &bound, std::string const &domain_id, std::string const &domain_size)
static std::string vload(unsigned int simd_width, std::string const &offset, std::string const &ptr)
size_type root() const
Definition: forwards.h:530
The main class for representing a statement such as x = inner_prod(y,z); at runtime.
Definition: forwards.h:502
void arg(unsigned int pos, cl_char val)
Sets a char argument at the provided position.
Definition: kernel.hpp:116
Implementations for the OpenCL backend functionality.
std::string to_string(T const t)
Definition: tools.hpp:304
unsigned int size_of(scheduler::statement_node_numeric_type type)
Definition: utils.hpp:534
reduction_parameters(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy)
parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels)
void process(utils::kernel_generation_stream &stream, leaf_t leaf, std::string const &type_key, std::string const &to_process, scheduler::statement const &statement, vcl_size_t root_idx, mapping_type const &mapping, std::set< std::string > &already_processed)
std::string neutral_element(scheduler::op_element const &op)
Definition: utils.hpp:82
std::string append_width(std::string const &str, unsigned int width)
Definition: utils.hpp:558
viennacl::ocl::handle< cl_mem > create_memory(cl_mem_flags flags, unsigned int size, void *ptr=NULL) const
Creates a memory buffer within the context.
Definition: context.hpp:216