ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
row_wise_reduction_template.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_ROW_WISE_REDUCTION_HPP
2 #define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_ROW_WISE_REDUCTION_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2015, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 
27 #include <vector>
28 
30 
34 
37 
38 #include "viennacl/tools/tools.hpp"
39 
41 
42 namespace viennacl
43 {
44 namespace device_specific
45 {
46 
48 {
49  row_wise_reduction_parameters(unsigned int _simd_width,
50  unsigned int _local_size_0, unsigned int _local_size_1,
51  unsigned int _num_groups_0, fetching_policy_type _fetch_policy): template_base::parameters_type(_simd_width, _local_size_0, _local_size_1, 1),
52  num_groups_0(_num_groups_0), fetch_policy(_fetch_policy) { }
53 
54  unsigned int num_groups_0;
56 };
57 
58 class row_wise_reduction_template : public template_base_impl<row_wise_reduction_template, row_wise_reduction_parameters>
59 {
60 private:
61  virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const
62  {
63  if (p_.fetch_policy==FETCH_FROM_LOCAL)
64  return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
65  return TEMPLATE_VALID;
66  }
67 
68  unsigned int n_lmem_elements() const
69  {
70  return p_.local_size_0*(p_.local_size_1+1);
71  }
72 
73  static void parse(scheduler::statement const & statement, std::vector<vcl_size_t> & idx, bool & is_trans, scheduler::lhs_rhs_element & matrix)
74  {
75  tree_parsing::traverse(statement, statement.root(), tree_parsing::filter(&utils::is_reduction, idx), false);
76  is_trans = is_node_trans(statement.array(), idx[0], LHS_NODE_TYPE);
77  matrix = lhs_most(statement.array(), idx[0]).lhs;
78  }
79 
80  std::string generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings, unsigned int simd_width, bool is_trans, std::vector<mapped_row_wise_reduction*> const & exprs) const
81  {
82  using tools::to_string;
83 
84  unsigned int lsize0 = p_.local_size_0;
85  unsigned int lsize1 = p_.local_size_1+1;
86  std::string lsize1str = to_string(lsize1);
87 
89 
90  stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl;
91  generate_prototype(stream, kernel_prefix, "unsigned int M, unsigned int N,", mappings, statements);
92  stream << "{" << std::endl;
93  stream.inc_tab();
94 
95  tree_parsing::process(stream, PARENT_NODE_TYPE, "scalar", "#scalartype #namereg = *#pointer;", statements, mappings);
96  tree_parsing::process(stream, PARENT_NODE_TYPE, "matrix", "#pointer += #start1 + #start2*#ld;", statements, mappings);
97  tree_parsing::process(stream, PARENT_NODE_TYPE, "vector", "#pointer += #start;", statements, mappings);
98 
99  tree_parsing::process(stream, PARENT_NODE_TYPE, "matrix", "#ld *= #nldstride;", statements, mappings);
100 
101  for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
102  stream << (*it)->process("__local #scalartype #name_buf[" + to_string(lsize0*lsize1) + "];") << std::endl;
103 
104  stream << "unsigned int lid0 = get_local_id(0);" << std::endl;
105  stream << "unsigned int lid1 = get_local_id(1);" << std::endl;
106  stream << "unsigned int upper_bound_0 = ( M +" << p_.local_size_0 - 1 << ")/" << p_.local_size_0 << "*" << p_.local_size_0 << ";" << std::endl;
107  stream << "for(unsigned int r = get_global_id(0); r < upper_bound_0; r += get_global_size(0)){" << std::endl;
108  stream.inc_tab();
109 
110  for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
111  stream << (*it)->process("#scalartype #name_acc = " + neutral_element((*it)->root_op()) + ";") << std::endl;
112 
113  stream << "if (r < M)" << std::endl;
114  stream << "{" << std::endl;
115  stream.inc_tab();
116 
117  class loop_body : public loop_body_base
118  {
119  public:
120  loop_body(std::vector<mapped_row_wise_reduction*> const & _exprs, bool _is_trans) : exprs(_exprs), is_trans(_is_trans){ }
121  void operator()(utils::kernel_generation_stream & kernel_stream, unsigned int loop_simd_width) const
122  {
123  std::set<std::string> already_fetched;
124  for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
125  {
126  if (is_trans)
127  (*it)->process_recursive(kernel_stream, LHS_NODE_TYPE, "matrix_trans", utils::append_width("#scalartype",loop_simd_width) + " #namereg = " + vload(loop_simd_width, "c*#stride1", "#pointer + r*#ld")+";", already_fetched);
128  else
129  (*it)->process_recursive(kernel_stream, LHS_NODE_TYPE, "matrix", "#scalartype #namereg = #pointer[r*#stride1 + c*#ld];", already_fetched);
130  (*it)->process_recursive(kernel_stream, RHS_NODE_TYPE, "vector", utils::append_width("#scalartype",loop_simd_width) + " #namereg = " + vload(loop_simd_width, "c*#stride", "#pointer")+";", already_fetched);
131  }
132 
133 
134  //Update accumulators
135  std::vector<std::string> str(loop_simd_width);
136  if (loop_simd_width==1)
137  str[0] = "#namereg";
138  else
139  for (unsigned int a = 0; a < loop_simd_width; ++a)
140  str[a] = append_simd_suffix("#namereg.s", a);
141 
142 
143  for (unsigned int k = 0; k < exprs.size(); ++k)
144  {
145  for (unsigned int a = 0; a < loop_simd_width; ++a)
146  {
147  std::map<std::string, std::string> accessors;
148  if (is_trans)
149  accessors["matrix_trans"] = str[a];
150  else
151  accessors["matrix"] = str[a];
152  accessors["vector"] = str[a];
153  accessors["scalar"] = "#namereg";
154  std::string value = exprs[k]->evaluate_recursive(LHS_NODE_TYPE, accessors);
155  if (exprs[k]->root_node().op.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE)
156  value+= "*" + exprs[k]->evaluate_recursive(RHS_NODE_TYPE, accessors);
157 
158  if (exprs[k]->is_index_reduction())
159  compute_index_reduction(kernel_stream, exprs[k]->process("#name_acc"), "c*"+to_string(loop_simd_width) + to_string(a), exprs[k]->process("#name_acc_value"), value,exprs[k]->root_op());
160  else
161  compute_reduction(kernel_stream, exprs[k]->process("#name_acc"), value,exprs[k]->root_op());
162  }
163  }
164  }
165  private:
166  std::vector<mapped_row_wise_reduction*> exprs;
167  bool is_trans;
168  };
169 
170  element_wise_loop_1D(stream, loop_body(exprs, is_trans), p_.fetch_policy, simd_width, "c", "N", "get_local_id(1)", "get_local_size(1)");
171  stream.dec_tab();
172  stream << "}" << std::endl;
173 
174  for (unsigned int k = 0; k < exprs.size(); ++k)
175  stream << exprs[k]->process("#name_buf[lid0*" + lsize1str + "+ lid1] = #name_acc;") << std::endl;
176 
177  stream << "#pragma unroll" << std::endl;
178  stream << "for(unsigned int stride = " << p_.local_size_1/2 << "; stride >0; stride /=2)" << std::endl;
179  stream << "{" << std::endl;
180  stream.inc_tab();
181 
182  stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
183  stream << "if (lid1 < stride)" << std::endl;
184  stream << "{" << std::endl;
185  stream.inc_tab();
186 
187  for (unsigned int k = 0; k < exprs.size(); k++)
188  if (exprs[k]->is_index_reduction())
189  compute_index_reduction(stream, exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]")
190  , exprs[k]->process("#name_buf_value[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf_value[lid0*" + lsize1str + " + lid1 + stride]"),
191  exprs[k]->root_op());
192  else
193  compute_reduction(stream,exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]"), exprs[k]->root_op());
194 
195  stream.dec_tab();
196  stream << "}" << std::endl;
197 
198  stream.dec_tab();
199  stream << "}" << std::endl;
200 
201 
202  stream << "if (lid1 == 0 && r < M)";
203  stream << "{" << std::endl;
204  stream.inc_tab();
205  std::map<std::string, std::string> accessors;
206  accessors["row_wise_reduction"] = "#name_buf[lid0*" + lsize1str + "]";
207  accessors["vector"] = "#pointer[r*#stride]";
208  tree_parsing::evaluate(stream, PARENT_NODE_TYPE, accessors, statements, mappings);
209  stream.dec_tab();
210  stream << "}" << std::endl;
211 
212 
213  stream.dec_tab();
214  stream << "}" << std::endl;
215 
216  stream.dec_tab();
217  stream << "}" << std::endl;
218 
219  return stream.str();
220  }
221 
222  std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
223  {
224  std::vector<mapped_row_wise_reduction*> exprs;
225  bool is_trans = false;
226  bool row_major = false;
227  statements_container::data_type::const_iterator sit;
228  std::vector<mapping_type>::const_iterator mit;
229  for (mit = mappings.begin(), sit = statements.data().begin(); mit != mappings.end(); ++mit, ++sit)
230  {
231  std::vector<vcl_size_t> idx;
233  parse(*sit, idx, is_trans, A);
234  row_major = utils::call_on_matrix(A, utils::row_major_fun());
235  for (unsigned int j = 0; j < idx.size(); ++j)
236  exprs.push_back((mapped_row_wise_reduction*)(at(*mit, mapping_key(idx[j], PARENT_NODE_TYPE)).get()));
237  }
238  is_trans = is_trans ^ row_major;
239 
240  std::vector<std::string> res;
241  if (is_trans && p_.simd_width>1)
242  {
243  res.push_back(generate_impl(kernel_prefix, statements, mappings, p_.simd_width, is_trans, exprs));
244  res.push_back(generate_impl(kernel_prefix, statements, mappings, 1, is_trans, exprs));
245  }
246  else
247  res.push_back(generate_impl(kernel_prefix, statements, mappings, 1, is_trans, exprs));
248 
249  return res;
250  }
251 public:
253 
254  void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
255  {
256  std::vector<vcl_size_t> idx;
258  bool is_trans;
259  parse(statements.data().front(), idx, is_trans, A);
260  bool row_major = utils::call_on_matrix(A, utils::row_major_fun());
261 
262  viennacl::ocl::kernel * kernel;
263  if ((is_trans ^ row_major)&& p_.simd_width>1)
264  {
265  if (has_strided_access(statements))
266  kernel = &programs[1].program().get_kernel(kernel_prefix);
267  else
268  kernel = &programs[0].program().get_kernel(kernel_prefix);
269  }
270  else
271  kernel = &programs[0].program().get_kernel(kernel_prefix);
272 
273  kernel->local_work_size(0,p_.local_size_0);
274  kernel->local_work_size(1,p_.local_size_1);
275  kernel->global_work_size(0,p_.local_size_0*p_.num_groups_0);
276  kernel->global_work_size(1,p_.local_size_1);
277 
278  unsigned int current_arg = 0;
279  if (is_trans)
280  {
281  kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size2_fun())));
282  kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size1_fun())));
283  }
284  else
285  {
286  kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size1_fun())));
287  kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size2_fun())));
288  }
289 
290 
291  set_arguments(statements, *kernel, current_arg);
292  viennacl::ocl::enqueue(*kernel);
293  }
294 
295 private:
296  const char A_trans_;
297 };
298 
299 }
300 }
301 
302 #endif
void set_arguments(statements_container const &statements, viennacl::ocl::kernel &kernel, unsigned int &current_arg)
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:58
Various little tools used here and there in ViennaCL.
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
Definition: kernel.hpp:742
Some helper routines for reading/writing/printing scheduler expressions.
void traverse(scheduler::statement const &statement, vcl_size_t root_idx, Fun const &fun, bool inspect)
Recursively execute a functor on a statement.
static bool is_node_trans(scheduler::statement::container_type const &array, vcl_size_t root_idx, leaf_t leaf_type)
A class representing a compute device (e.g. a GPU)
Definition: device.hpp:49
A dense matrix class.
Definition: forwards.h:375
A class representing the 'data' for the LHS or RHS operand of the respective node.
Definition: forwards.h:337
container_type const & array() const
Definition: forwards.h:528
std::list< scheduler::statement > const & data() const
Definition: forwards.h:282
static std::string append_simd_suffix(std::string const &str, unsigned int i)
scheduler::statement_node const & lhs_most(scheduler::statement::container_type const &array, vcl_size_t root)
Definition: forwards.h:87
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:34
std::string to_string(viennacl::scheduler::op_element op_elem)
Helper routine for converting the operation enums to string.
Definition: io.hpp:42
Map ViennaCL objects to generator wrappers.
static bool has_strided_access(statements_container const &statements)
std::string evaluate(leaf_t leaf, std::map< std::string, std::string > const &accessors, scheduler::statement const &statement, vcl_size_t root_idx, mapping_type const &mapping)
bool is_reduction(scheduler::statement_node const &node)
Definition: utils.hpp:361
void enqueue(std::string const &kernel_prefix, std::vector< lazy_program_compiler > &programs, statements_container const &statements)
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
static void generate_prototype(utils::kernel_generation_stream &stream, std::string const &name, std::string const &first_arguments, std::vector< mapping_type > const &mappings, statements_container const &statements, std::map< std::string, unsigned int > const &widths)
void compute_reduction(utils::kernel_generation_stream &os, std::string acc, std::string cur, scheduler::op_element const &op)
Definition: utils.hpp:40
Code for parsing the expression trees.
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
Definition: enqueue.hpp:50
Internal utils.
bool row_major(T const &)
Definition: row_major.hpp:38
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:751
row_wise_reduction_parameters(unsigned int _simd_width, unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _num_groups_0, fetching_policy_type _fetch_policy)
bool is_index_reduction(scheduler::op_element const &op)
Definition: utils.hpp:370
void compute_index_reduction(utils::kernel_generation_stream &os, std::string acc, std::string cur, std::string const &acc_value, std::string const &cur_value, scheduler::op_element const &op)
Definition: utils.hpp:48
static void element_wise_loop_1D(utils::kernel_generation_stream &stream, loop_body_base const &loop_body, fetching_policy_type fetch, unsigned int simd_width, std::string const &i, std::string const &bound, std::string const &domain_id, std::string const &domain_size)
static std::string vload(unsigned int simd_width, std::string const &offset, std::string const &ptr)
size_type root() const
Definition: forwards.h:530
The main class for representing a statement such as x = inner_prod(y,z); at runtime.
Definition: forwards.h:502
void arg(unsigned int pos, cl_char val)
Sets a char argument at the provided position.
Definition: kernel.hpp:116
ValueT const & at(std::map< KeyT, ValueT > const &map, KeyT const &key)
Emulation of C++11's .at() member for std::map<>, const-version.
Definition: forwards.h:142
std::string to_string(T const t)
Definition: tools.hpp:304
row_wise_reduction_template(row_wise_reduction_template::parameters_type const &parameters, char A_trans, binding_policy_t binding_policy=BIND_ALL_UNIQUE)
A tag for row-major storage of a dense matrix.
Definition: forwards.h:304
std::pair< vcl_size_t, leaf_t > mapping_key
Definition: forwards.h:188
parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels)
void process(utils::kernel_generation_stream &stream, leaf_t leaf, std::string const &type_key, std::string const &to_process, scheduler::statement const &statement, vcl_size_t root_idx, mapping_type const &mapping, std::set< std::string > &already_processed)
std::string neutral_element(scheduler::op_element const &op)
Definition: utils.hpp:82
std::string append_width(std::string const &str, unsigned int width)
Definition: utils.hpp:558