ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
svd.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SVD_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_SVD_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2015, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 #include "viennacl/tools/tools.hpp"
22 #include "viennacl/ocl/kernel.hpp"
24 #include "viennacl/ocl/utils.hpp"
25 
28 namespace viennacl
29 {
30 namespace linalg
31 {
32 namespace opencl
33 {
34 namespace kernels
35 {
36 
37 template <typename StringType>
38 void generate_svd_bidiag_pack(StringType & source, std::string const & numeric_string, bool is_row_major)
39 {
40  source.append("__kernel void bidiag_pack(__global "); source.append(numeric_string); source.append("* A, \n");
41  source.append(" __global "); source.append(numeric_string); source.append("* D, \n");
42  source.append(" __global "); source.append(numeric_string); source.append("* S, \n");
43  source.append(" uint size1, \n");
44  source.append(" uint size2, \n");
45  source.append(" uint stride \n");
46  source.append(") { \n");
47  source.append(" uint size = min(size1, size2); \n");
48 
49  source.append(" if(get_global_id(0) == 0) \n");
50  source.append(" S[0] = 0; \n");
51  if(is_row_major)
52  {
53  source.append(" for(uint i = get_global_id(0); i < size ; i += get_global_size(0)) { \n");
54  source.append(" D[i] = A[i*stride + i]; \n");
55  source.append(" S[i + 1] = (i + 1 < size2) ? A[i*stride + (i + 1)] : 0; \n");
56  }
57  else
58  {
59  source.append(" for(uint i = get_global_id(0); i < size ; i += get_global_size(0)) { \n");
60  source.append(" D[i] = A[i*stride + i]; \n");
61  source.append(" S[i + 1] = (i + 1 < size2) ? A[i + (i + 1) * stride] : 0; \n");
62  }
63  source.append(" } \n");
64  source.append("} \n");
65 }
66 
67 template<typename StringT>
68 void generate_svd_col_reduce_lcl_array(StringT & source, std::string const & numeric_string)
69 {
70  // calculates a sum of local array elements
71  source.append("void col_reduce_lcl_array(__local "); source.append(numeric_string); source.append("* sums, uint lcl_id, uint lcl_sz) { \n");
72  source.append(" uint step = lcl_sz >> 1; \n");
73 
74  source.append(" while (step > 0) { \n");
75  source.append(" if (lcl_id < step) { \n");
76  source.append(" sums[lcl_id] += sums[lcl_id + step]; \n");
77  source.append(" } \n");
78  source.append(" step >>= 1; \n");
79  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
80  source.append(" } \n");
81  source.append("} \n");
82 }
83 
84 template <typename StringType>
85 void generate_svd_copy_col(StringType & source, std::string const & numeric_string, bool is_row_major)
86 {
87  // probably, this is a ugly way
88  source.append("__kernel void copy_col(__global "); source.append(numeric_string); source.append("* A, \n");
89  source.append(" __global "); source.append(numeric_string); source.append("* V, \n");
90  source.append(" uint row_start, \n");
91  source.append(" uint col_start, \n");
92  source.append(" uint size, \n");
93  source.append(" uint stride \n");
94  source.append(" ) { \n");
95  source.append(" uint glb_id = get_global_id(0); \n");
96  source.append(" uint glb_sz = get_global_size(0); \n");
97  if(is_row_major)
98  {
99  source.append(" for(uint i = row_start + glb_id; i < size; i += glb_sz) { \n");
100  source.append(" V[i - row_start] = A[i * stride + col_start]; \n");
101  source.append(" } \n");
102  }
103  else
104  {
105  source.append(" for(uint i = row_start + glb_id; i < size; i += glb_sz) { \n");
106  source.append(" V[i - row_start] = A[i + col_start * stride]; \n");
107  source.append(" } \n");
108  }
109 
110  source.append("} \n");
111 }
112 
113 template <typename StringType>
114 void generate_svd_copy_row(StringType & source, std::string const & numeric_string, bool is_row_major)
115 {
116  // probably, this is too
117  source.append("__kernel void copy_row(__global "); source.append(numeric_string); source.append("* A, \n");
118  source.append(" __global "); source.append(numeric_string); source.append("* V, \n");
119  source.append(" uint row_start, \n");
120  source.append(" uint col_start, \n");
121  source.append(" uint size, \n");
122  source.append(" uint stride \n");
123  source.append(" ) { \n");
124  source.append(" uint glb_id = get_global_id(0); \n");
125  source.append(" uint glb_sz = get_global_size(0); \n");
126  if(is_row_major)
127  {
128  source.append(" for(uint i = col_start + glb_id; i < size; i += glb_sz) { \n");
129  source.append(" V[i - col_start] = A[row_start * stride + i]; \n");
130  source.append(" } \n");
131  }
132  else
133  {
134  source.append(" for(uint i = col_start + glb_id; i < size; i += glb_sz) { \n");
135  source.append(" V[i - col_start] = A[row_start + i * stride]; \n");
136  source.append(" } \n");
137  }
138 
139  source.append("} \n");
140 }
141 
142 template<typename StringT>
143 void generate_svd_final_iter_update(StringT & source, std::string const & numeric_string)
144 {
145  source.append("__kernel void final_iter_update(__global "); source.append(numeric_string); source.append("* A, \n");
146  source.append(" uint stride, \n");
147  source.append(" uint n, \n");
148  source.append(" uint last_n, \n");
149  source.append(" "); source.append(numeric_string); source.append(" q, \n");
150  source.append(" "); source.append(numeric_string); source.append(" p \n");
151  source.append(" ) \n");
152  source.append("{ \n");
153  source.append(" uint glb_id = get_global_id(0); \n");
154  source.append(" uint glb_sz = get_global_size(0); \n");
155 
156  source.append(" for (uint px = glb_id; px < last_n; px += glb_sz) \n");
157  source.append(" { \n");
158  source.append(" "); source.append(numeric_string); source.append(" v_in = A[n * stride + px]; \n");
159  source.append(" "); source.append(numeric_string); source.append(" z = A[(n - 1) * stride + px]; \n");
160  source.append(" A[(n - 1) * stride + px] = q * z + p * v_in; \n");
161  source.append(" A[n * stride + px] = q * v_in - p * z; \n");
162  source.append(" } \n");
163  source.append("} \n");
164 }
165 
166 template <typename StringType>
167 void generate_svd_givens_next(StringType & source, std::string const & numeric_string, bool is_row_major)
168 {
169  source.append("__kernel void givens_next(__global "); source.append(numeric_string); source.append("* matr, \n");
170  source.append(" __global "); source.append(numeric_string); source.append("* cs, \n");
171  source.append(" __global "); source.append(numeric_string); source.append("* ss, \n");
172  source.append(" uint size, \n");
173  source.append(" uint stride, \n");
174  source.append(" uint start_i, \n");
175  source.append(" uint end_i \n");
176  source.append(" ) \n");
177  source.append("{ \n");
178  source.append(" uint glb_id = get_global_id(0); \n");
179  source.append(" uint glb_sz = get_global_size(0); \n");
180 
181  source.append(" uint lcl_id = get_local_id(0); \n");
182  source.append(" uint lcl_sz = get_local_size(0); \n");
183 
184  source.append(" uint j = glb_id; \n");
185 
186  source.append(" __local "); source.append(numeric_string); source.append(" cs_lcl[256]; \n");
187  source.append(" __local "); source.append(numeric_string); source.append(" ss_lcl[256]; \n");
188  if(is_row_major)
189  {
190 
191  source.append(" "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(end_i + 1) + j * stride] : 0; \n");
192 
193  source.append(" uint elems_num = end_i - start_i + 1; \n");
194  source.append(" uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
195 
196  source.append(" for(uint block_id = 0; block_id < block_num; block_id++) \n");
197  source.append(" { \n");
198  source.append(" uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
199 
200  source.append(" if(lcl_id < to) \n");
201  source.append(" { \n");
202  source.append(" cs_lcl[lcl_id] = cs[end_i - (lcl_id + block_id * lcl_sz)]; \n");
203  source.append(" ss_lcl[lcl_id] = ss[end_i - (lcl_id + block_id * lcl_sz)]; \n");
204  source.append(" } \n");
205 
206  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
207 
208  source.append(" if(j < size) \n");
209  source.append(" { \n");
210  source.append(" for(uint ind = 0; ind < to; ind++) \n");
211  source.append(" { \n");
212  source.append(" uint i = end_i - (ind + block_id * lcl_sz); \n");
213 
214  source.append(" "); source.append(numeric_string); source.append(" z = matr[i + j * stride]; \n");
215 
216  source.append(" "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind]; \n");
217  source.append(" "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind]; \n");
218 
219  source.append(" matr[(i + 1) + j * stride] = x * cs_val + z * ss_val; \n");
220  source.append(" x = -x * ss_val + z * cs_val; \n");
221  source.append(" } \n");
222  source.append(" } \n");
223  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
224  source.append(" } \n");
225  source.append(" if(j < size) \n");
226  source.append(" matr[(start_i) + j * stride] = x; \n");
227  }
228  else
229  {
230 
231  source.append(" "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(end_i + 1) * stride + j] : 0; \n");
232 
233  source.append(" uint elems_num = end_i - start_i + 1; \n");
234  source.append(" uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
235 
236  source.append(" for(uint block_id = 0; block_id < block_num; block_id++) \n");
237  source.append(" { \n");
238  source.append(" uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
239 
240  source.append(" if(lcl_id < to) \n");
241  source.append(" { \n");
242  source.append(" cs_lcl[lcl_id] = cs[end_i - (lcl_id + block_id * lcl_sz)]; \n");
243  source.append(" ss_lcl[lcl_id] = ss[end_i - (lcl_id + block_id * lcl_sz)]; \n");
244  source.append(" } \n");
245 
246  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
247 
248  source.append(" if(j < size) \n");
249  source.append(" { \n");
250  source.append(" for(uint ind = 0; ind < to; ind++) \n");
251  source.append(" { \n");
252  source.append(" uint i = end_i - (ind + block_id * lcl_sz); \n");
253 
254  source.append(" "); source.append(numeric_string); source.append(" z = matr[i * stride + j]; \n");
255 
256  source.append(" "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind]; \n");
257  source.append(" "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind]; \n");
258 
259  source.append(" matr[(i + 1) * stride + j] = x * cs_val + z * ss_val; \n");
260  source.append(" x = -x * ss_val + z * cs_val; \n");
261  source.append(" } \n");
262  source.append(" } \n");
263  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
264  source.append(" } \n");
265  source.append(" if(j < size) \n");
266  source.append(" matr[(start_i) * stride + j] = x; \n");
267  }
268  source.append("} \n");
269 }
270 
271 template<typename StringT>
272 void generate_svd_givens_prev(StringT & source, std::string const & numeric_string)
273 {
274  source.append("__kernel void givens_prev(__global "); source.append(numeric_string); source.append("* matr, \n");
275  source.append(" __global "); source.append(numeric_string); source.append("* cs, \n");
276  source.append(" __global "); source.append(numeric_string); source.append("* ss, \n");
277  source.append(" uint size, \n");
278  source.append(" uint stride, \n");
279  source.append(" uint start_i, \n");
280  source.append(" uint end_i \n");
281  source.append(" ) \n");
282  source.append("{ \n");
283  source.append(" uint glb_id = get_global_id(0); \n");
284  source.append(" uint glb_sz = get_global_size(0); \n");
285 
286  source.append(" uint lcl_id = get_local_id(0); \n");
287  source.append(" uint lcl_sz = get_local_size(0); \n");
288 
289  source.append(" uint j = glb_id; \n");
290 
291  source.append(" __local "); source.append(numeric_string); source.append(" cs_lcl[256]; \n");
292  source.append(" __local "); source.append(numeric_string); source.append(" ss_lcl[256]; \n");
293 
294  source.append(" "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(start_i - 1) * stride + j] : 0; \n");
295 
296  source.append(" uint elems_num = end_i - start_i; \n");
297  source.append(" uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
298 
299  source.append(" for (uint block_id = 0; block_id < block_num; block_id++) \n");
300  source.append(" { \n");
301  source.append(" uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
302 
303  source.append(" if (lcl_id < to) \n");
304  source.append(" { \n");
305  source.append(" cs_lcl[lcl_id] = cs[lcl_id + start_i + block_id * lcl_sz]; \n");
306  source.append(" ss_lcl[lcl_id] = ss[lcl_id + start_i + block_id * lcl_sz]; \n");
307  source.append(" } \n");
308 
309  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
310 
311  source.append(" if (j < size) \n");
312  source.append(" { \n");
313  source.append(" for (uint ind = 0; ind < to; ind++) \n");
314  source.append(" { \n");
315  source.append(" uint i = ind + start_i + block_id * lcl_sz; \n");
316 
317  source.append(" "); source.append(numeric_string); source.append(" z = matr[i * stride + j]; \n");
318 
319  source.append(" "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind];//cs[i]; \n");
320  source.append(" "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind];//ss[i]; \n");
321 
322  source.append(" matr[(i - 1) * stride + j] = x * cs_val + z * ss_val; \n");
323  source.append(" x = -x * ss_val + z * cs_val; \n");
324  source.append(" } \n");
325  source.append(" } \n");
326  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
327  source.append(" } \n");
328  source.append(" if (j < size) \n");
329  source.append(" matr[(end_i - 1) * stride + j] = x; \n");
330  source.append("} \n");
331 }
332 
333 template <typename StringType>
334 void generate_svd_house_update_A_left(StringType & source, std::string const & numeric_string, bool is_row_major)
335 {
336  source.append("__kernel void house_update_A_left( \n");
337  source.append(" __global "); source.append(numeric_string); source.append("* A, \n");
338  source.append(" __constant "); source.append(numeric_string); source.append("* V, \n"); //householder vector
339  source.append(" uint row_start, \n");
340  source.append(" uint col_start, \n");
341  source.append(" uint size1, \n");
342  source.append(" uint size2, \n");
343  source.append(" uint stride, \n");
344  source.append(" __local "); source.append(numeric_string); source.append("* sums \n");
345  source.append(" ) { \n");
346  source.append(" uint glb_id = get_global_id(0); \n");
347  source.append(" uint glb_sz = get_global_size(0); \n");
348 
349  source.append(" uint grp_id = get_group_id(0); \n");
350  source.append(" uint grp_nm = get_num_groups(0); \n");
351 
352  source.append(" uint lcl_id = get_local_id(0); \n");
353  source.append(" uint lcl_sz = get_local_size(0); \n");
354 
355  source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n");
356 
357  // doing it in slightly different way to avoid cache misses
358  if(is_row_major)
359  {
360  source.append(" for(uint i = glb_id + col_start; i < size2; i += glb_sz) { \n");
361  source.append(" ss = 0; \n");
362  source.append(" for(uint j = row_start; j < size1; j++) ss = ss + (V[j] * A[j * stride + i]); \n");
363 
364  source.append(" for(uint j = row_start; j < size1; j++) \n");
365  source.append(" A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss); \n");
366  source.append(" } \n");
367  }
368  else
369  {
370  source.append(" for(uint i = glb_id + col_start; i < size2; i += glb_sz) { \n");
371  source.append(" ss = 0; \n");
372  source.append(" for(uint j = row_start; j < size1; j++) ss = ss + (V[j] * A[j + i * stride]); \n");
373 
374  source.append(" for(uint j = row_start; j < size1; j++) \n");
375  source.append(" A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss); \n");
376  source.append(" } \n");
377  }
378  source.append("} \n");
379 }
380 
381 template <typename StringType>
382 void generate_svd_house_update_A_right(StringType & source, std::string const & numeric_string, bool is_row_major)
383 {
384 
385  source.append("__kernel void house_update_A_right( \n");
386  source.append(" __global "); source.append(numeric_string); source.append("* A, \n");
387  source.append(" __global "); source.append(numeric_string); source.append("* V, \n"); // householder vector
388  source.append(" uint row_start, \n");
389  source.append(" uint col_start, \n");
390  source.append(" uint size1, \n");
391  source.append(" uint size2, \n");
392  source.append(" uint stride, \n");
393  source.append(" __local "); source.append(numeric_string); source.append("* sums \n");
394  source.append(" ) { \n");
395 
396  source.append(" uint glb_id = get_global_id(0); \n");
397 
398  source.append(" uint grp_id = get_group_id(0); \n");
399  source.append(" uint grp_nm = get_num_groups(0); \n");
400 
401  source.append(" uint lcl_id = get_local_id(0); \n");
402  source.append(" uint lcl_sz = get_local_size(0); \n");
403 
404  source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n");
405 
406  // update of A matrix
407  if(is_row_major)
408  {
409  source.append(" for(uint i = grp_id + row_start; i < size1; i += grp_nm) { \n");
410  source.append(" ss = 0; \n");
411 
412  source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * A[i * stride + j]); \n");
413  source.append(" sums[lcl_id] = ss; \n");
414 
415  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
416  source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
417  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
418 
419  source.append(" "); source.append(numeric_string); source.append(" sum_Av = sums[0]; \n");
420 
421  source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) \n");
422  source.append(" A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av); \n");
423  source.append(" } \n");
424  }
425  else
426  {
427  source.append(" for(uint i = grp_id + row_start; i < size1; i += grp_nm) { \n");
428  source.append(" ss = 0; \n");
429 
430  source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * A[i + j * stride]); \n");
431  source.append(" sums[lcl_id] = ss; \n");
432 
433  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
434  source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
435  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
436 
437  source.append(" "); source.append(numeric_string); source.append(" sum_Av = sums[0]; \n");
438 
439  source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) \n");
440  source.append(" A[i + j * stride] = A[i + j * stride] - (2 * V[j] * sum_Av); \n");
441  source.append(" } \n");
442  }
443 
444  source.append("} \n");
445 
446 }
447 
448 template <typename StringType>
449 void generate_svd_house_update_QL(StringType & source, std::string const & numeric_string, bool is_row_major)
450 {
451  source.append("__kernel void house_update_QL(\n");
452  source.append(" __global "); source.append(numeric_string); source.append("* QL, \n");
453  source.append(" __constant "); source.append(numeric_string); source.append("* V, \n"); //householder vector
454  source.append(" uint size1, \n");
455  source.append(" uint strideQ, \n");
456  source.append(" __local "); source.append(numeric_string); source.append("* sums \n");
457  source.append(" ) { \n");
458  source.append(" uint glb_id = get_global_id(0); \n");
459  source.append(" uint glb_sz = get_global_size(0); \n");
460 
461  source.append(" uint grp_id = get_group_id(0); \n");
462  source.append(" uint grp_nm = get_num_groups(0); \n");
463 
464  source.append(" uint lcl_id = get_local_id(0); \n");
465  source.append(" uint lcl_sz = get_local_size(0); \n");
466 
467  source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n");
468 
469  if(is_row_major)
470  {
471  source.append(" for(uint i = grp_id; i < size1; i += grp_nm) { \n");
472  source.append(" ss = 0; \n");
473  source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) ss = ss + (V[j] * QL[i * strideQ + j]); \n");
474  source.append(" sums[lcl_id] = ss; \n");
475 
476  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
477  source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
478  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
479 
480  source.append(" "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
481 
482  source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) \n");
483  source.append(" QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv); \n");
484  source.append(" } \n");
485  }
486  else
487  {
488  source.append(" for(uint i = grp_id; i < size1; i += grp_nm) { \n");
489  source.append(" ss = 0; \n");
490  source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) ss = ss + (V[j] * QL[i + j * strideQ]); \n");
491  source.append(" sums[lcl_id] = ss; \n");
492 
493  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
494  source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
495  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
496 
497  source.append(" "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
498 
499  source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) \n");
500  source.append(" QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv); \n");
501  source.append(" } \n");
502  }
503  source.append("} \n");
504 
505 }
506 
507 template<typename StringT>
508 void generate_svd_house_update_QR(StringT & source, std::string const & numeric_string)
509 {
510  source.append("__kernel void house_update_QR( \n");
511  source.append(" __global "); source.append(numeric_string); source.append("* QR, \n");
512  source.append(" __global "); source.append(numeric_string); source.append("* V, \n"); // householder vector
513  source.append(" uint size1, \n");
514  source.append(" uint size2, \n");
515  source.append(" uint strideQ, \n");
516  source.append(" __local "); source.append(numeric_string); source.append("* sums \n");
517  source.append(" ) { \n");
518 
519  source.append(" uint glb_id = get_global_id(0); \n");
520 
521  source.append(" uint grp_id = get_group_id(0); \n");
522  source.append(" uint grp_nm = get_num_groups(0); \n");
523 
524  source.append(" uint lcl_id = get_local_id(0); \n");
525  source.append(" uint lcl_sz = get_local_size(0); \n");
526 
527  source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n");
528 
529  // update of QR matrix
530  // Actually, we are calculating a transpose of right matrix. This allows to avoid cache
531  // misses.
532  source.append(" for (uint i = grp_id; i < size2; i += grp_nm) { \n");
533  source.append(" ss = 0; \n");
534  source.append(" for (uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * QR[i * strideQ + j]); \n");
535  source.append(" sums[lcl_id] = ss; \n");
536 
537  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
538  source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
539  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
540 
541  source.append(" "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
542  source.append(" for (uint j = lcl_id; j < size2; j += lcl_sz) \n");
543  source.append(" QR[i * strideQ + j] = QR[i * strideQ + j] - (2 * V[j] * sum_Qv); \n");
544  source.append(" } \n");
545  source.append("} \n");
546 }
547 
548 template<typename StringT>
549 void generate_svd_inverse_signs(StringT & source, std::string const & numeric_string)
550 {
551  source.append("__kernel void inverse_signs(__global "); source.append(numeric_string); source.append("* v, \n");
552  source.append(" __global "); source.append(numeric_string); source.append("* signs, \n");
553  source.append(" uint size, \n");
554  source.append(" uint stride \n");
555  source.append(" ) \n");
556  source.append("{ \n");
557  source.append(" uint glb_id_x = get_global_id(0); \n");
558  source.append(" uint glb_id_y = get_global_id(1); \n");
559 
560  source.append(" if ((glb_id_x < size) && (glb_id_y < size)) \n");
561  source.append(" v[glb_id_x * stride + glb_id_y] *= signs[glb_id_x]; \n");
562  source.append("} \n");
563 
564 }
565 
566 template<typename StringT>
567 void generate_svd_transpose_inplace(StringT & source, std::string const & numeric_string)
568 {
569 
570  source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("* input, \n");
571  source.append(" unsigned int row_num, \n");
572  source.append(" unsigned int col_num) { \n");
573  source.append(" unsigned int size = row_num * col_num; \n");
574  source.append(" for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
575  source.append(" unsigned int row = i / col_num; \n");
576  source.append(" unsigned int col = i - row*col_num; \n");
577 
578  source.append(" unsigned int new_pos = col * row_num + row; \n");
579 
580  //new_pos = (col < row) ? 0 : 1;
581  //input[i] = new_pos;
582 
583  source.append(" if (i < new_pos) { \n");
584  source.append(" "); source.append(numeric_string); source.append(" val = input[i]; \n");
585  source.append(" input[i] = input[new_pos]; \n");
586  source.append(" input[new_pos] = val; \n");
587  source.append(" } \n");
588  source.append(" } \n");
589  source.append("} \n");
590 
591 }
592 
593 template<typename StringT>
594 void generate_svd_update_qr_column(StringT & source, std::string const & numeric_string)
595 {
596  source.append("__kernel void update_qr_column(__global "); source.append(numeric_string); source.append("* A, \n");
597  source.append(" uint stride, \n");
598  source.append(" __global "); source.append(numeric_string); source.append("* buf, \n");
599  source.append(" int m, \n");
600  source.append(" int n, \n");
601  source.append(" int last_n) \n");
602  source.append("{ \n");
603  source.append(" uint glb_id = get_global_id(0); \n");
604  source.append(" uint glb_sz = get_global_size(0); \n");
605 
606  source.append(" for (int i = glb_id; i < last_n; i += glb_sz) \n");
607  source.append(" { \n");
608  source.append(" "); source.append(numeric_string); source.append(" a_ik = A[m * stride + i], a_ik_1, a_ik_2; \n");
609 
610  source.append(" a_ik_1 = A[(m + 1) * stride + i]; \n");
611 
612  source.append(" for (int k = m; k < n; k++) \n");
613  source.append(" { \n");
614  source.append(" bool notlast = (k != n - 1); \n");
615 
616  source.append(" "); source.append(numeric_string); source.append(" p = buf[5 * k] * a_ik + buf[5 * k + 1] * a_ik_1; \n");
617 
618  source.append(" if (notlast) \n");
619  source.append(" { \n");
620  source.append(" a_ik_2 = A[(k + 2) * stride + i]; \n");
621  source.append(" p = p + buf[5 * k + 2] * a_ik_2; \n");
622  source.append(" a_ik_2 = a_ik_2 - p * buf[5 * k + 4]; \n");
623  source.append(" } \n");
624 
625  source.append(" A[k * stride + i] = a_ik - p; \n");
626  source.append(" a_ik_1 = a_ik_1 - p * buf[5 * k + 3]; \n");
627 
628  source.append(" a_ik = a_ik_1; \n");
629  source.append(" a_ik_1 = a_ik_2; \n");
630  source.append(" } \n");
631 
632  source.append(" A[n * stride + i] = a_ik; \n");
633  source.append(" } \n");
634 
635  source.append("} \n");
636 }
637 
638 
639 
640 
641 // main kernel class
643 template<typename NumericT, typename MatrixLayout = row_major>
644 struct svd
645 {
646  static std::string program_name()
647  {
649  return (viennacl::ocl::type_to_string<NumericT>::apply() + "_svd_") + (is_row ? "row" : "col");
650  }
651 
652  static void init(viennacl::ocl::context & ctx)
653  {
654  static std::map<cl_context, bool> init_done;
655  if (!init_done[ctx.handle().get()])
656  {
658  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
660 
661  std::string source;
662  source.reserve(1024);
663 
664  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
665 
666  // only generate for floating points (forces error for integers)
667  if (numeric_string == "float" || numeric_string == "double")
668  {
669  //helper function used by multiple kernels:
670  generate_svd_col_reduce_lcl_array(source, numeric_string);
671 
672  //kernels:
673  generate_svd_bidiag_pack(source, numeric_string, is_row_major);
674  generate_svd_copy_col(source, numeric_string, is_row_major);
675  generate_svd_copy_row(source, numeric_string, is_row_major);
676  generate_svd_final_iter_update(source, numeric_string);
677  generate_svd_givens_next(source, numeric_string, is_row_major);
678  generate_svd_givens_prev(source, numeric_string);
679  generate_svd_house_update_A_left(source, numeric_string, is_row_major);
680  generate_svd_house_update_A_right(source, numeric_string, is_row_major);
681  generate_svd_house_update_QL(source, numeric_string, is_row_major);
682  generate_svd_house_update_QR(source, numeric_string);
683  generate_svd_inverse_signs(source, numeric_string);
684  generate_svd_transpose_inplace(source, numeric_string);
685  generate_svd_update_qr_column(source, numeric_string);
686  }
687 
688  std::string prog_name = program_name();
689  #ifdef VIENNACL_BUILD_INFO
690  std::cout << "Creating program " << prog_name << std::endl;
691  #endif
692  ctx.add_program(source, prog_name);
693  init_done[ctx.handle().get()] = true;
694  } //if
695  } //init
696 };
697 
698 } // namespace kernels
699 } // namespace opencl
700 } // namespace linalg
701 } // namespace viennacl
702 #endif
703 
void generate_svd_copy_row(StringType &source, std::string const &numeric_string, bool is_row_major)
Definition: svd.hpp:114
Implements a OpenCL platform within ViennaCL.
Helper class for checking whether a matrix has a row-major layout.
Definition: forwards.h:484
Various little tools used here and there in ViennaCL.
void generate_svd_bidiag_pack(StringType &source, std::string const &numeric_string, bool is_row_major)
Definition: svd.hpp:38
void generate_svd_final_iter_update(StringT &source, std::string const &numeric_string)
Definition: svd.hpp:143
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55
void generate_svd_update_qr_column(StringT &source, std::string const &numeric_string)
Definition: svd.hpp:594
Provides OpenCL-related utilities.
static std::string program_name()
Definition: svd.hpp:646
void generate_svd_inverse_signs(StringT &source, std::string const &numeric_string)
Definition: svd.hpp:549
static void init(viennacl::ocl::context &ctx)
Definition: svd.hpp:652
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:611
void generate_svd_house_update_QR(StringT &source, std::string const &numeric_string)
Definition: svd.hpp:508
void generate_svd_transpose_inplace(StringT &source, std::string const &numeric_string)
Definition: svd.hpp:567
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:34
void generate_svd_givens_prev(StringT &source, std::string const &numeric_string)
Definition: svd.hpp:272
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
const OCL_TYPE & get() const
Definition: handle.hpp:189
Main kernel class for generating OpenCL kernels for singular value decomposition of dense matrices...
Definition: svd.hpp:644
void generate_svd_house_update_A_right(StringType &source, std::string const &numeric_string, bool is_row_major)
Definition: svd.hpp:382
Representation of an OpenCL kernel in ViennaCL.
void generate_svd_col_reduce_lcl_array(StringT &source, std::string const &numeric_string)
Definition: svd.hpp:68
void generate_svd_copy_col(StringType &source, std::string const &numeric_string, bool is_row_major)
Definition: svd.hpp:85
void generate_svd_house_update_QL(StringType &source, std::string const &numeric_string, bool is_row_major)
Definition: svd.hpp:449
void generate_svd_givens_next(StringType &source, std::string const &numeric_string, bool is_row_major)
Definition: svd.hpp:167
Helper class for converting a type to its string representation.
Definition: utils.hpp:57
void generate_svd_house_update_A_left(StringType &source, std::string const &numeric_string, bool is_row_major)
Definition: svd.hpp:334