ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
bisect_kernel_large.hpp
#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/bisect_kernel_large.hpp
    @brief Determine eigenvalues for large symmetric, tridiagonal matrix. First step of the computation.
*/

// includes, project
#include "viennacl/linalg/detail/bisect/config.hpp"
#include "viennacl/linalg/detail/bisect/util.hpp"
// additional kernel
#include "viennacl/linalg/cuda/bisect_util.hpp"

// declaration, forward

namespace viennacl
{
namespace linalg
{
namespace cuda
{
/** @brief Write data to global memory.
*/
template<typename NumericT>
__device__
void writeToGmem(const unsigned int tid, const unsigned int tid_2,
                 const unsigned int num_threads_active,
                 const unsigned int num_blocks_mult,
                 NumericT *g_left_one, NumericT *g_right_one,
                 unsigned int *g_pos_one,
                 NumericT *g_left_mult, NumericT *g_right_mult,
                 unsigned int *g_left_count_mult,
                 unsigned int *g_right_count_mult,
                 NumericT *s_left, NumericT *s_right,
                 unsigned short *s_left_count, unsigned short *s_right_count,
                 unsigned int *g_blocks_mult,
                 unsigned int *g_blocks_mult_sum,
                 unsigned short *s_compaction_list,
                 unsigned short *s_cl_helper,
                 unsigned int offset_mult_lambda
                 )
{
  if (tid < offset_mult_lambda)
  {
    g_left_one[tid] = s_left[tid];
    g_right_one[tid] = s_right[tid];
    // right count can be used to order eigenvalues without sorting
    g_pos_one[tid] = s_right_count[tid];
  }
  else
  {
    g_left_mult[tid - offset_mult_lambda] = s_left[tid];
    g_right_mult[tid - offset_mult_lambda] = s_right[tid];
    g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
    g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
  }

  if (tid_2 < num_threads_active)
  {
    if (tid_2 < offset_mult_lambda)
    {
      g_left_one[tid_2] = s_left[tid_2];
      g_right_one[tid_2] = s_right[tid_2];
      // right count can be used to order eigenvalues without sorting
      g_pos_one[tid_2] = s_right_count[tid_2];
    }
    else
    {
      g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
      g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
      g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2];
      g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
    }
  } // end writing out data

  __syncthreads();

  // note that s_cl_blocking == s_compaction_list + 1; that is, by writing out
  // s_compaction_list we write the exclusive scan result
  if (tid <= num_blocks_mult)
  {
    g_blocks_mult[tid] = s_compaction_list[tid];
    g_blocks_mult_sum[tid] = s_cl_helper[tid];
  }

  if (tid_2 <= num_blocks_mult)
  {
    g_blocks_mult[tid_2] = s_compaction_list[tid_2];
    g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
  }
}

/** @brief Perform final stream compaction before writing data to global memory.
*/
template<typename NumericT>
__device__
void
compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
                    const unsigned int num_threads_active,
                    unsigned int &offset_mult_lambda,
                    NumericT *s_left, NumericT *s_right,
                    unsigned short *s_left_count, unsigned short *s_right_count,
                    unsigned short *s_cl_one, unsigned short *s_cl_mult,
                    unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
                    unsigned int is_one_lambda, unsigned int is_one_lambda_2,
                    NumericT &left, NumericT &right, NumericT &left_2, NumericT &right_2,
                    unsigned int &left_count, unsigned int &right_count,
                    unsigned int &left_count_2, unsigned int &right_count_2,
                    unsigned int c_block_iend, unsigned int c_sum_block,
                    unsigned int c_block_iend_2, unsigned int c_sum_block_2
                    )
{
  // cache data before performing compaction
  left = s_left[tid];
  right = s_right[tid];

  if (tid_2 < num_threads_active)
  {
    left_2 = s_left[tid_2];
    right_2 = s_right[tid_2];
  }

  __syncthreads();

  // determine addresses for intervals containing multiple eigenvalues and
  // addresses for blocks of intervals
  unsigned int ptr_w = 0;
  unsigned int ptr_w_2 = 0;
  unsigned int ptr_blocking_w = 0;
  unsigned int ptr_blocking_w_2 = 0;

  ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
                               : s_cl_mult[tid] + offset_mult_lambda;

  if (0 != c_block_iend)
  {
    ptr_blocking_w = s_cl_blocking[tid];
  }

  if (tid_2 < num_threads_active)
  {
    ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
                                     : s_cl_mult[tid_2] + offset_mult_lambda;

    if (0 != c_block_iend_2)
    {
      ptr_blocking_w_2 = s_cl_blocking[tid_2];
    }
  }

  __syncthreads();
  if (tid < num_threads_active)
  {
    // store compactly in shared mem
    s_left[ptr_w] = left;
    s_right[ptr_w] = right;
    s_left_count[ptr_w] = left_count;
    s_right_count[ptr_w] = right_count;
  }

  __syncthreads();
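  // note: the following block re-stores the values cached by thread 1 after
  // the barrier; it duplicates the write above for tid == 1 and is left as-is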
  if (tid == 1)
  {
    s_left[ptr_w] = left;
    s_right[ptr_w] = right;
    s_left_count[ptr_w] = left_count;
    s_right_count[ptr_w] = right_count;
  }
  if (0 != c_block_iend)
  {
    s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
    s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
  }

  if (tid_2 < num_threads_active)
  {
    // store compactly in shared mem
    s_left[ptr_w_2] = left_2;
    s_right[ptr_w_2] = right_2;
    s_left_count[ptr_w_2] = left_count_2;
    s_right_count[ptr_w_2] = right_count_2;

    if (0 != c_block_iend_2)
    {
      s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
      s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
    }
  }
}

/** @brief Compute addresses to obtain compact list of block start addresses.
*/
inline __device__
void
scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
                              const unsigned int num_threads_compaction,
                              unsigned short *s_cl_blocking,
                              unsigned short *s_cl_helper
                              )
{
  // prepare for second step of block generation: compaction of the block
  // list itself so that it can be written out efficiently
  s_cl_blocking[tid] = s_cl_helper[tid];

  if (tid_2 < num_threads_compaction)
  {
    s_cl_blocking[tid_2] = s_cl_helper[tid_2];
  }

  __syncthreads();

  // additional scan to compact s_cl_blocking, which permits generating a
  // compact list of eigenvalue blocks, each containing about
  // VIENNACL_BISECT_MAX_THREADS_BLOCK eigenvalues (so that each of these
  // blocks may be processed by one thread block in a subsequent step)

  unsigned int offset = 1;

  // build scan tree
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int ai = offset*(2*tid+1)-1;
      unsigned int bi = offset*(2*tid+2)-1;
      s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
    }

    offset <<= 1;
  }

  // traverse down tree: first down to level 2 across
  for (int d = 2; d < num_threads_compaction; d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);
      s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
    }
  }
}
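
// Worked example of the up-sweep / down-sweep scan used above and in
// scanSumBlocks(), for eight inputs a0..a7 (in-place inclusive prefix sum):
//   after up-sweep:   [a0, a0+a1, a2, a0+..+a3, a4, a4+a5, a6, a0+..+a7]
//   after down-sweep: [a0, a0+a1, a0+..+a2, a0+..+a3, a0+..+a4, a0+..+a5, a0+..+a6, a0+..+a7]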

/** @brief Perform scan to obtain number of eigenvalues before a specific block.
*/
inline __device__
void
scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
              const unsigned int num_threads_active,
              const unsigned int num_threads_compaction,
              unsigned short *s_cl_blocking,
              unsigned short *s_cl_helper)
{
  unsigned int offset = 1;

  // first step of scan to build the sum of elements within each block
  // build up tree
  for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int ai = offset*(2*tid+1)-1;
      unsigned int bi = offset*(2*tid+2)-1;

      s_cl_blocking[bi] += s_cl_blocking[ai];
    }

    offset *= 2;
  }

  // second step of scan to build the sum of elements within each block
  // traverse down tree
  for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);

      s_cl_blocking[bi] += s_cl_blocking[ai];
    }
  }

  __syncthreads();

  if (0 == tid)
  {
    // move last element of scan to last element that is valid;
    // necessary because the number of threads employed for the scan is a
    // power of two and not necessarily the number of active threads
    s_cl_helper[num_threads_active - 1] =
        s_cl_helper[num_threads_compaction - 1];
    s_cl_blocking[num_threads_active - 1] =
        s_cl_blocking[num_threads_compaction - 1];
  }
}

/** @brief Perform initial scan for compaction of intervals containing one and
*          multiple eigenvalues; also do initial scan to build blocks.
*/
inline __device__
void
scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int mat_size,
            const unsigned int num_threads_active,
            const unsigned int num_threads_compaction,
            unsigned short *s_cl_one, unsigned short *s_cl_mult,
            unsigned short *s_cl_blocking, unsigned short *s_cl_helper
            )
{
  // perform scan to compactly write out the intervals containing one and
  // multiple eigenvalues
  // also generate tree for blocking of intervals containing multiple
  // eigenvalues

  unsigned int offset = 1;

  // build scan tree
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int ai = offset*(2*tid+1);
      unsigned int bi = offset*(2*tid+2)-1;

      s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];

      // s_cl_helper is binary: zero for an internal node and 1 for a
      // root node of a tree corresponding to a block
      // s_cl_blocking contains the number of nodes in each sub-tree at each
      // iteration; the data has to be kept to compute the total number of
      // eigenvalues per block which, in turn, is needed to efficiently
      // write out data in the second step
      if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
      {
        // check how many children are non-terminated
        if (s_cl_helper[ai - 1] == 1)
        {
          // mark as terminated
          s_cl_helper[bi] = 1;
        }
        else if (s_cl_helper[bi] == 1)
        {
          // mark as terminated
          s_cl_helper[ai - 1] = 1;
        }
        else // both children are non-terminated
        {
          unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];
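          // example, assuming VIENNACL_BISECT_MAX_THREADS_BLOCK == 256
          // (see config.hpp) and mat_size > 512: joining sub-trees holding
          // 150 and 120 eigenvalues would give 270 > 256, so both roots are
          // terminated and become separate blocks; sub-trees holding 100 and
          // 120 eigenvalues (220 <= 256) are joined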
          if (temp > (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2))
          {
            // the two child trees have to form separate blocks; terminate trees
            s_cl_helper[ai - 1] = 1;
            s_cl_helper[bi] = 1;
          }
          else
          {
            // build up tree by joining subtrees
            s_cl_blocking[bi] = temp;
            s_cl_blocking[ai - 1] = 0;
          }
        }
      } // end s_cl_helper update
    }

    offset <<= 1;
  }

  // traverse down tree; this is only for stream compaction, not for block
  // construction
  for (int d = 2; d < num_threads_compaction; d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);

      s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
    }
  }
}

/** @brief Store all non-empty child intervals obtained from subdividing the
*          interval currently processed by the thread.
*/
template<typename NumericT>
__device__
void
storeNonEmptyIntervalsLarge(unsigned int addr,
                            const unsigned int num_threads_active,
                            NumericT *s_left, NumericT *s_right,
                            unsigned short *s_left_count,
                            unsigned short *s_right_count,
                            NumericT left, NumericT mid, NumericT right,
                            const unsigned short left_count,
                            const unsigned short mid_count,
                            const unsigned short right_count,
                            NumericT epsilon,
                            unsigned int &compact_second_chunk,
                            unsigned short *s_compaction_list,
                            unsigned int &is_active_second)
{
  // check if both child intervals are valid
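  // (e.g. left_count == 2, mid_count == 4, right_count == 5: the left child
  // interval (left, mid] contains two eigenvalues and the right child
  // (mid, right] contains one, so both child intervals are kept)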
  if ((left_count != mid_count) && (mid_count != right_count))
  {
    storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                  left, mid, left_count, mid_count, epsilon);

    is_active_second = 1;
    s_compaction_list[threadIdx.x] = 1;
    compact_second_chunk = 1;
  }
  else
  {
    // only one non-empty child interval

    // mark that no second child exists
    is_active_second = 0;
    s_compaction_list[threadIdx.x] = 0;

    // store the one valid child interval
    if (left_count != mid_count)
    {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                    left, mid, left_count, mid_count, epsilon);
    }
    else
    {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                    mid, right, mid_count, right_count, epsilon);
    }
  }
}

/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix.
*
* @param g_d           diagonal elements of the matrix in global memory
* @param g_s           superdiagonal elements of the matrix in global memory
* @param n             size of the matrix
* @param lg            lower bound of the input interval (e.g. Gerschgorin interval)
* @param ug            upper bound of the input interval (e.g. Gerschgorin interval)
* @param lg_eig_count  number of eigenvalues that are smaller than lg
* @param ug_eig_count  number of eigenvalues that are smaller than ug
* @param epsilon       desired accuracy of eigenvalues to compute
*/
template<typename NumericT>
__global__
void
bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                  const NumericT lg, const NumericT ug,
                  const unsigned int lg_eig_count,
                  const unsigned int ug_eig_count,
                  NumericT epsilon,
                  unsigned int *g_num_one,
                  unsigned int *g_num_blocks_mult,
                  NumericT *g_left_one, NumericT *g_right_one,
                  unsigned int *g_pos_one,
                  NumericT *g_left_mult, NumericT *g_right_mult,
                  unsigned int *g_left_count_mult,
                  unsigned int *g_right_count_mult,
                  unsigned int *g_blocks_mult,
                  unsigned int *g_blocks_mult_sum
                  )
{
  const unsigned int tid = threadIdx.x;

  // intervals (store left and right because the subdivision tree is in
  // general not dense)
  __shared__ NumericT s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  __shared__ NumericT s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // number of eigenvalues that are smaller than s_left / s_right
  // (correspondence is realized via indices)
  __shared__ unsigned short s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  __shared__ unsigned short s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // helper for stream compaction
  __shared__ unsigned short s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // state variables for whole block
  // if 0 then compaction of second chunk of child intervals is not necessary
  // (because all intervals had exactly one non-dead child)
  __shared__ unsigned int compact_second_chunk;
  // if 1 then all threads are converged
  __shared__ unsigned int all_threads_converged;

  // number of currently active threads
  __shared__ unsigned int num_threads_active;

  // number of threads to use for stream compaction
  __shared__ unsigned int num_threads_compaction;

  // helper for exclusive scan
  unsigned short *s_compaction_list_exc = s_compaction_list + 1;
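  // (threads flag their second child at s_compaction_list_exc[threadIdx.x],
  // i.e. shifted up by one element; scanning the shifted list in place then
  // yields the exclusive prefix sum of the flags, with s_compaction_list[0]
  // remaining 0)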

  // variables for currently processed interval
  // left and right limit of active interval
  NumericT left = 0.0f;
  NumericT right = 0.0f;
  unsigned int left_count = 0;
  unsigned int right_count = 0;
  // midpoint of active interval
  NumericT mid = 0.0f;
  // number of eigenvalues smaller than mid
  unsigned int mid_count = 0;
  // helper for stream compaction (tracking of threads generating second child)
  unsigned int is_active_second = 0;

  // initialize lists
  s_compaction_list[tid] = 0;
  s_left[tid] = 0;
  s_right[tid] = 0;
  s_left_count[tid] = 0;
  s_right_count[tid] = 0;

  __syncthreads();

  // set up initial configuration
  if (0 == tid)
  {
    s_left[0] = lg;
    s_right[0] = ug;
    s_left_count[0] = lg_eig_count;
    s_right_count[0] = ug_eig_count;

    compact_second_chunk = 0;
    num_threads_active = 1;

    num_threads_compaction = 1;

    all_threads_converged = 1;
  }

  __syncthreads();

  // for all active threads read intervals from the last level;
  // the worst-case number of active threads per level l is 2^l
  // determine coarse intervals; on these intervals the kernel for one or for
  // multiple eigenvalues will be executed in the second step
  while (true)
  {
    s_compaction_list[tid] = 0;
    s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
    s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
    subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
                            num_threads_active,
                            left, right, left_count, right_count,
                            mid, all_threads_converged);

    __syncthreads();

    // check if done
    if (1 == all_threads_converged)
    {
      break;
    }

    // compute number of eigenvalues smaller than mid
    // use all threads for reading the necessary matrix data from global
    // memory
    // use s_left and s_right as scratch space for diagonal and
    // superdiagonal of matrix
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
                                                mid, threadIdx.x,
                                                num_threads_active,
                                                s_left, s_right,
                                                (left == right));

    __syncthreads();

    // store intervals
    // for all threads store the first child interval in a continuous chunk of
    // memory, and the second child interval -- if it exists -- in a second
    // chunk; it is likely that all threads reach convergence up to
    // \a epsilon at the same level; furthermore, for higher levels most / all
    // threads will have only one child, so storing the first child compactly
    // (first) avoids a compaction step on the first chunk and (second) makes
    // a compaction of the second chunk unnecessary for higher levels (when
    // all threads / intervals have exactly one child)
    if (tid < num_threads_active)
    {
      if (left != right)
      {
        // store intervals
        storeNonEmptyIntervalsLarge(tid, num_threads_active,
                                    s_left, s_right,
                                    s_left_count, s_right_count,
                                    left, mid, right,
                                    left_count, mid_count, right_count,
                                    epsilon, compact_second_chunk,
                                    s_compaction_list_exc,
                                    is_active_second);
      }
      else
      {
        // re-write converged interval (has to be stored again because s_left
        // and s_right are used as scratch space for
        // computeNumSmallerEigenvalsLarge())
        s_left[tid] = left;
        s_right[tid] = left;
        s_left_count[tid] = left_count;
        s_right_count[tid] = right_count;

        is_active_second = 0;
      }
    }

    // necessary so that compact_second_chunk is up-to-date
    __syncthreads();

    // perform compaction of chunk where second children are stored
    // scan of (num_threads_active / 2) elements, thus at most
    // (num_threads_active / 4) threads are needed
    if (compact_second_chunk > 0)
    {
      // create indices for compaction
      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
    }
    __syncthreads();

    if (compact_second_chunk > 0)
    {
      compactIntervals(s_left, s_right, s_left_count, s_right_count,
                       mid, right, mid_count, right_count,
                       s_compaction_list, num_threads_active,
                       is_active_second);
    }

    __syncthreads();

    // update state variables
    if (0 == tid)
    {
      // update number of active threads with result of reduction
      num_threads_active += s_compaction_list[num_threads_active];
      num_threads_compaction = ceilPow2(num_threads_active);
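      // (e.g. 6 active intervals of which 3 produced a second child: the scan
      // total s_compaction_list[6] == 3 gives num_threads_active = 9, and
      // num_threads_compaction = ceilPow2(9) = 16)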

      compact_second_chunk = 0;
      all_threads_converged = 1;
    }

    __syncthreads();

    if (num_threads_compaction > blockDim.x)
    {
      break;
    }

  }

  __syncthreads();

  // generate two lists of intervals: one with intervals that contain one
  // eigenvalue (or are converged), and one with intervals that need further
  // subdivision

  // perform two scans in parallel

  unsigned int left_count_2;
  unsigned int right_count_2;

  unsigned int tid_2 = tid + blockDim.x;

  // cache in per-thread registers so that s_left_count and s_right_count
  // can be used for scans
  left_count = s_left_count[tid];
  right_count = s_right_count[tid];

  // some threads have to cache data for two intervals
  if (tid_2 < num_threads_active)
  {
    left_count_2 = s_left_count[tid_2];
    right_count_2 = s_right_count[tid_2];
  }

  // compaction lists for intervals containing one and multiple eigenvalues;
  // do not affect first element for exclusive scan
  unsigned short *s_cl_one = s_left_count + 1;
  unsigned short *s_cl_mult = s_right_count + 1;

  // compaction list for generating blocks of intervals containing multiple
  // eigenvalues
  unsigned short *s_cl_blocking = s_compaction_list_exc;
  // helper compaction list for generating blocks of intervals
  __shared__ unsigned short s_cl_helper[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  if (0 == tid)
  {
    // set to 0 for exclusive scan
    s_left_count[0] = 0;
    s_right_count[0] = 0;
  }

  __syncthreads();

  // flag if interval contains one or multiple eigenvalues
  unsigned int is_one_lambda = 0;
  unsigned int is_one_lambda_2 = 0;

  // number of eigenvalues in the interval
  unsigned int multiplicity = right_count - left_count;
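  // (e.g. left_count == 3, right_count == 4: exactly one eigenvalue lies in
  // the interval, so it goes to the "one eigenvalue" list)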
  is_one_lambda = (1 == multiplicity);

  s_cl_one[tid] = is_one_lambda;
  s_cl_mult[tid] = (! is_one_lambda);

  // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
  s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
  s_cl_helper[tid] = 0;

  if (tid_2 < num_threads_active)
  {
    unsigned int multiplicity = right_count_2 - left_count_2;
    is_one_lambda_2 = (1 == multiplicity);

    s_cl_one[tid_2] = is_one_lambda_2;
    s_cl_mult[tid_2] = (! is_one_lambda_2);

    // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
    s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
    s_cl_helper[tid_2] = 0;
  }
  else if (tid_2 < (2 * (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2) + 1))
  {
    // clear
    s_cl_blocking[tid_2] = 0;
    s_cl_helper[tid_2] = 0;
  }

  scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction,
              s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);

  __syncthreads();

  scanSumBlocks(tid, tid_2, num_threads_active,
                num_threads_compaction, s_cl_blocking, s_cl_helper);

  // end down sweep of scan
  __syncthreads();

  unsigned int c_block_iend = 0;
  unsigned int c_block_iend_2 = 0;
  unsigned int c_sum_block = 0;
  unsigned int c_sum_block_2 = 0;

  // for each thread / interval that corresponds to the root node of an
  // interval block, store the start address of the block and the total number
  // of eigenvalues in all blocks before this block (the particular thread is
  // irrelevant; the constraint is to have a subset of threads so that exactly
  // one of them is in each interval)
  if (1 == s_cl_helper[tid])
  {
    c_block_iend = s_cl_mult[tid] + 1;
    c_sum_block = s_cl_blocking[tid];
  }

  if (1 == s_cl_helper[tid_2])
  {
    c_block_iend_2 = s_cl_mult[tid_2] + 1;
    c_sum_block_2 = s_cl_blocking[tid_2];
  }

  scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
                                s_cl_blocking, s_cl_helper);

  // finished second scan for s_cl_blocking
  __syncthreads();

  // determine the global results
  __shared__ unsigned int num_blocks_mult;
  __shared__ unsigned int num_mult;
  __shared__ unsigned int offset_mult_lambda;

  if (0 == tid)
  {
    num_blocks_mult = s_cl_blocking[num_threads_active - 1];
    offset_mult_lambda = s_cl_one[num_threads_active - 1];
    num_mult = s_cl_mult[num_threads_active - 1];

    *g_num_one = offset_mult_lambda;
    *g_num_blocks_mult = num_blocks_mult;
  }

  __syncthreads();

  NumericT left_2, right_2;
  --s_cl_one;
  --s_cl_mult;
  --s_cl_blocking;
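  // (the pointers were shifted up by one element for the scans; stepping them
  // back makes s_cl_one[tid] etc. the exclusive-scan values, i.e. the number
  // of one-eigenvalue / multi-eigenvalue intervals before thread tid, which
  // are the write addresses used in compactStreamsFinal())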

  __syncthreads();
  compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
                      s_left, s_right, s_left_count, s_right_count,
                      s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
                      is_one_lambda, is_one_lambda_2,
                      left, right, left_2, right_2,
                      left_count, right_count, left_count_2, right_count_2,
                      c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2
                      );

  __syncthreads();

  // final adjustment before writing out data to global memory
  if (0 == tid)
  {
    s_cl_blocking[num_blocks_mult] = num_mult;
    s_cl_helper[0] = 0;
  }

  __syncthreads();

  // write to global memory
  writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
              g_left_one, g_right_one, g_pos_one,
              g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
              s_left, s_right, s_left_count, s_right_count,
              g_blocks_mult, g_blocks_mult_sum,
              s_compaction_list, s_cl_helper, offset_mult_lambda);
}

} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif // VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
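
A minimal host-side sketch of how this first bisection step might be driven (an illustration, not part of the header: the helper name launch_bisect_step_one is hypothetical, the single-block launch configuration reflects the kernel's use of one cooperative thread block, and all device buffers are assumed to be allocated beforehand, e.g. via cudaMalloc). Passing 0 and n for lg_eig_count / ug_eig_count encodes that no eigenvalue is smaller than lg and all n eigenvalues are smaller than ug:

#include <cuda_runtime.h>
#include "viennacl/linalg/cuda/bisect_kernel_large.hpp"

// Hypothetical helper: launch the first bisection step on the interval
// [lg, ug] (e.g. a Gerschgorin interval) for a tridiagonal matrix of size n,
// with the diagonal in g_d and the superdiagonal in g_s (device memory).
void launch_bisect_step_one(const float *g_d, const float *g_s, unsigned int n,
                            float lg, float ug, float epsilon,
                            unsigned int *g_num_one, unsigned int *g_num_blocks_mult,
                            float *g_left_one, float *g_right_one, unsigned int *g_pos_one,
                            float *g_left_mult, float *g_right_mult,
                            unsigned int *g_left_count_mult, unsigned int *g_right_count_mult,
                            unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum)
{
  dim3 grid(1, 1, 1);                                   // one cooperative thread block
  dim3 block(VIENNACL_BISECT_MAX_THREADS_BLOCK, 1, 1);  // block size from config.hpp

  viennacl::linalg::cuda::bisectKernelLarge<<<grid, block>>>(
      g_d, g_s, n, lg, ug,
      0u, n,          // no eigenvalue < lg, all n eigenvalues < ug
      epsilon,
      g_num_one, g_num_blocks_mult,
      g_left_one, g_right_one, g_pos_one,
      g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
      g_blocks_mult, g_blocks_mult_sum);
  cudaDeviceSynchronize();
}

The g_*_one and g_*_mult buffers filled here feed the second step, in which intervals containing exactly one eigenvalue and blocks of intervals containing multiple eigenvalues are processed by separate kernels.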