ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
scan.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2015, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 #include "viennacl/tools/tools.hpp"
22 #include "viennacl/ocl/kernel.hpp"
24 #include "viennacl/ocl/utils.hpp"
25 
28 namespace viennacl
29 {
30 namespace linalg
31 {
32 namespace opencl
33 {
34 namespace kernels
35 {
36 
37 
38 template <typename StringType>
39 void generate_scan_kernel_1(StringType & source, std::string const & numeric_string)
40 {
41  source.append("__kernel void scan_1(__global "); source.append(numeric_string); source.append("* X, \n");
42  source.append(" unsigned int startX, \n");
43  source.append(" unsigned int incX, \n");
44  source.append(" unsigned int sizeX, \n");
45 
46  source.append(" __global "); source.append(numeric_string); source.append("* Y, \n");
47  source.append(" unsigned int startY, \n");
48  source.append(" unsigned int incY, \n");
49 
50  source.append(" unsigned int scan_offset, \n"); // 0 for inclusive scan, 1 for exclusive scan
51  source.append(" __global "); source.append(numeric_string); source.append("* carries) { \n");
52 
53  source.append(" __local "); source.append(numeric_string); source.append(" shared_buffer[256]; \n");
54  source.append(" "); source.append(numeric_string); source.append(" my_value; \n");
55 
56  source.append(" unsigned int work_per_thread = (sizeX - 1) / get_global_size(0) + 1; \n");
57  source.append(" unsigned int block_start = work_per_thread * get_local_size(0) * get_group_id(0); \n");
58  source.append(" unsigned int block_stop = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
59  source.append(" unsigned int block_offset = 0; \n");
60 
61  // run scan on each section:
62  source.append(" for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) { \n");
63 
64  // load data
65  source.append(" my_value = (i < sizeX) ? X[i * incX + startX] : 0; \n");
66 
67  // inclusive scan in shared buffer:
68  source.append(" for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
69  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
70  source.append(" shared_buffer[get_local_id(0)] = my_value; \n");
71  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
72  source.append(" if (get_local_id(0) >= stride) \n");
73  source.append(" my_value += shared_buffer[get_local_id(0) - stride]; \n");
74  source.append(" } \n");
75  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
76  source.append(" shared_buffer[get_local_id(0)] = my_value; \n");
77  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
78 
79  // write to output array:
80  source.append(" if (scan_offset > 0) \n");
81  source.append(" my_value = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0; \n");
82 
83  source.append(" if (i < sizeX) \n");
84  source.append(" Y[i * incY + startY] = block_offset + my_value; \n");
85 
86  source.append(" block_offset += shared_buffer[get_local_size(0)-1]; \n");
87  source.append(" } \n");
88 
89  // write carry:
90  source.append(" if (get_local_id(0) == 0) carries[get_group_id(0)] = block_offset; \n");
91 
92  source.append("} \n");
93 }
94 
95 template <typename StringType>
96 void generate_scan_kernel_2(StringType & source, std::string const & numeric_string)
97 {
98  source.append("__kernel void scan_2(__global "); source.append(numeric_string); source.append("* carries) { \n");
99 
100  source.append(" __local "); source.append(numeric_string); source.append(" shared_buffer[256]; \n"); //section size
101 
102  // load data
103  source.append(" "); source.append(numeric_string); source.append(" my_carry = carries[get_local_id(0)]; \n");
104 
105  // scan in shared buffer:
106  source.append(" for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
107  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
108  source.append(" shared_buffer[get_local_id(0)] = my_carry; \n");
109  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
110  source.append(" if (get_local_id(0) >= stride) \n");
111  source.append(" my_carry += shared_buffer[get_local_id(0) - stride]; \n");
112  source.append(" } \n");
113  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
114  source.append(" shared_buffer[get_local_id(0)] = my_carry; \n");
115  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
116 
117  // write to output array:
118  source.append(" carries[get_local_id(0)] = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0; \n");
119 
120  source.append("} \n");
121 }
122 
123 template <typename StringType>
124 void generate_scan_kernel_3(StringType & source, std::string const & numeric_string)
125 {
126  source.append("__kernel void scan_3(__global "); source.append(numeric_string); source.append(" * Y, \n");
127  source.append(" unsigned int startY, \n");
128  source.append(" unsigned int incY, \n");
129  source.append(" unsigned int sizeY, \n");
130 
131  source.append(" __global "); source.append(numeric_string); source.append("* carries) { \n");
132 
133  source.append(" unsigned int work_per_thread = (sizeY - 1) / get_global_size(0) + 1; \n");
134  source.append(" unsigned int block_start = work_per_thread * get_local_size(0) * get_group_id(0); \n");
135  source.append(" unsigned int block_stop = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
136 
137  source.append(" __local "); source.append(numeric_string); source.append(" shared_offset; \n");
138 
139  source.append(" if (get_local_id(0) == 0) shared_offset = carries[get_group_id(0)]; \n");
140  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
141 
142  source.append(" for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) \n");
143  source.append(" if (i < sizeY) \n");
144  source.append(" Y[i * incY + startY] += shared_offset; \n");
145 
146  source.append("} \n");
147 }
148 
149 
150 
151 
152 // main kernel class
154 template<typename NumericT>
155 struct scan
156 {
157  static std::string program_name()
158  {
160  }
161 
162  static void init(viennacl::ocl::context & ctx)
163  {
164  static std::map<cl_context, bool> init_done;
165  if (!init_done[ctx.handle().get()])
166  {
168  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
169 
170  std::string source;
171  source.reserve(1024);
172 
173  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
174 
175  generate_scan_kernel_1(source, numeric_string);
176  generate_scan_kernel_2(source, numeric_string);
177  generate_scan_kernel_3(source, numeric_string);
178 
179  std::string prog_name = program_name();
180  #ifdef VIENNACL_BUILD_INFO
181  std::cout << "Creating program " << prog_name << std::endl;
182  #endif
183  ctx.add_program(source, prog_name);
184  init_done[ctx.handle().get()] = true;
185  } //if
186  } //init
187 };
188 
189 } // namespace kernels
190 } // namespace opencl
191 } // namespace linalg
192 } // namespace viennacl
193 #endif
194 
Implements a OpenCL platform within ViennaCL.
void generate_scan_kernel_3(StringType &source, std::string const &numeric_string)
Definition: scan.hpp:124
Various little tools used here and there in ViennaCL.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55
Provides OpenCL-related utilities.
static void init(viennacl::ocl::context &ctx)
Definition: scan.hpp:162
Main kernel class for generating OpenCL kernels for singular value decomposition of dense matrices...
Definition: scan.hpp:155
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:611
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:34
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
const OCL_TYPE & get() const
Definition: handle.hpp:189
static std::string program_name()
Definition: scan.hpp:157
Representation of an OpenCL kernel in ViennaCL.
void generate_scan_kernel_1(StringType &source, std::string const &numeric_string)
Definition: scan.hpp:39
void generate_scan_kernel_2(StringType &source, std::string const &numeric_string)
Definition: scan.hpp:96
Helper class for converting a type to its string representation.
Definition: utils.hpp:57