1 #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
2 #define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
51 template<
typename NumericT>
53 void writeToGmem(
const unsigned int tid,
const unsigned int tid_2,
54 const unsigned int num_threads_active,
55 const unsigned int num_blocks_mult,
57 unsigned int *g_pos_one,
59 unsigned int *g_left_count_mult,
60 unsigned int *g_right_count_mult,
62 unsigned short *s_left_count,
unsigned short *s_right_count,
63 unsigned int *g_blocks_mult,
64 unsigned int *g_blocks_mult_sum,
65 unsigned short *s_compaction_list,
66 unsigned short *s_cl_helper,
67 unsigned int offset_mult_lambda
71 if (tid < offset_mult_lambda)
74 g_left_one[tid] = s_left[tid];
75 g_right_one[tid] = s_right[tid];
77 g_pos_one[tid] = s_right_count[tid];
83 g_left_mult[tid - offset_mult_lambda] = s_left[tid];
84 g_right_mult[tid - offset_mult_lambda] = s_right[tid];
85 g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
86 g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
89 if (tid_2 < num_threads_active)
92 if (tid_2 < offset_mult_lambda)
95 g_left_one[tid_2] = s_left[tid_2];
96 g_right_one[tid_2] = s_right[tid_2];
98 g_pos_one[tid_2] = s_right_count[tid_2];
103 g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
104 g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
105 g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2];
106 g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
115 if (tid <= num_blocks_mult)
117 g_blocks_mult[tid] = s_compaction_list[tid];
118 g_blocks_mult_sum[tid] = s_cl_helper[tid];
121 if (tid_2 <= num_blocks_mult)
123 g_blocks_mult[tid_2] = s_compaction_list[tid_2];
124 g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
131 template<
typename NumericT>
135 const unsigned int num_threads_active,
136 unsigned int &offset_mult_lambda,
138 unsigned short *s_left_count,
unsigned short *s_right_count,
139 unsigned short *s_cl_one,
unsigned short *s_cl_mult,
140 unsigned short *s_cl_blocking,
unsigned short *s_cl_helper,
141 unsigned int is_one_lambda,
unsigned int is_one_lambda_2,
143 unsigned int &left_count,
unsigned int &right_count,
144 unsigned int &left_count_2,
unsigned int &right_count_2,
145 unsigned int c_block_iend,
unsigned int c_sum_block,
146 unsigned int c_block_iend_2,
unsigned int c_sum_block_2
151 right = s_right[tid];
153 if (tid_2 < num_threads_active)
156 left_2 = s_left[tid_2];
157 right_2 = s_right[tid_2];
164 unsigned int ptr_w = 0;
165 unsigned int ptr_w_2 = 0;
166 unsigned int ptr_blocking_w = 0;
167 unsigned int ptr_blocking_w_2 = 0;
171 ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
172 : s_cl_mult[tid] + offset_mult_lambda;
174 if (0 != c_block_iend)
176 ptr_blocking_w = s_cl_blocking[tid];
179 if (tid_2 < num_threads_active)
181 ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
182 : s_cl_mult[tid_2] + offset_mult_lambda;
184 if (0 != c_block_iend_2)
186 ptr_blocking_w_2 = s_cl_blocking[tid_2];
192 if(tid < num_threads_active)
195 s_left[ptr_w] = left;
196 s_right[ptr_w] = right;
197 s_left_count[ptr_w] = left_count;
198 s_right_count[ptr_w] = right_count;
205 s_left[ptr_w] = left;
206 s_right[ptr_w] = right;
207 s_left_count[ptr_w] = left_count;
208 s_right_count[ptr_w] = right_count;
210 if (0 != c_block_iend)
212 s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
213 s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
216 if (tid_2 < num_threads_active)
220 s_left[ptr_w_2] = left_2;
221 s_right[ptr_w_2] = right_2;
222 s_left_count[ptr_w_2] = left_count_2;
223 s_right_count[ptr_w_2] = right_count_2;
225 if (0 != c_block_iend_2)
227 s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
228 s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
240 const unsigned int num_threads_compaction,
241 unsigned short *s_cl_blocking,
242 unsigned short *s_cl_helper
247 s_cl_blocking[tid] = s_cl_helper[tid];
249 if (tid_2 < num_threads_compaction)
251 s_cl_blocking[tid_2] = s_cl_helper[tid_2];
261 unsigned int offset = 1;
264 for (
int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
272 unsigned int ai = offset*(2*tid+1)-1;
273 unsigned int bi = offset*(2*tid+2)-1;
274 s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
281 for (
int d = 2; d < num_threads_compaction; d <<= 1)
291 unsigned int ai = offset*(tid+1) - 1;
292 unsigned int bi = ai + (offset >> 1);
293 s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
305 const unsigned int num_threads_active,
306 const unsigned int num_threads_compaction,
307 unsigned short *s_cl_blocking,
308 unsigned short *s_cl_helper)
310 unsigned int offset = 1;
314 for (
int d = num_threads_compaction >> 1; d > 0; d >>= 1)
322 unsigned int ai = offset*(2*tid+1)-1;
323 unsigned int bi = offset*(2*tid+2)-1;
325 s_cl_blocking[bi] += s_cl_blocking[ai];
333 for (
int d = 2; d < (num_threads_compaction - 1); d <<= 1)
342 unsigned int ai = offset*(tid+1) - 1;
343 unsigned int bi = ai + (offset >> 1);
345 s_cl_blocking[bi] += s_cl_blocking[ai];
357 s_cl_helper[num_threads_active - 1] =
358 s_cl_helper[num_threads_compaction - 1];
359 s_cl_blocking[num_threads_active - 1] =
360 s_cl_blocking[num_threads_compaction - 1];
370 scanInitial(
const unsigned int tid,
const unsigned int tid_2,
const unsigned int mat_size,
371 const unsigned int num_threads_active,
372 const unsigned int num_threads_compaction,
373 unsigned short *s_cl_one,
unsigned short *s_cl_mult,
374 unsigned short *s_cl_blocking,
unsigned short *s_cl_helper
383 unsigned int offset = 1;
386 for (
int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
394 unsigned int ai = offset*(2*tid+1);
395 unsigned int bi = offset*(2*tid+2)-1;
397 s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
398 s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];
406 if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
410 if (s_cl_helper[ai - 1] == 1)
415 else if (s_cl_helper[bi] == 1)
418 s_cl_helper[ai - 1] = 1;
423 unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];
429 s_cl_helper[ai - 1] = 1;
435 s_cl_blocking[bi] = temp;
436 s_cl_blocking[ai - 1] = 0;
449 for (
int d = 2; d < num_threads_compaction; d <<= 1)
459 unsigned int ai = offset*(tid+1) - 1;
460 unsigned int bi = ai + (offset >> 1);
462 s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
463 s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
473 template<
typename NumericT>
477 const unsigned int num_threads_active,
479 unsigned short *s_left_count,
480 unsigned short *s_right_count,
482 const unsigned short left_count,
483 const unsigned short mid_count,
484 const unsigned short right_count,
486 unsigned int &compact_second_chunk,
487 unsigned short *s_compaction_list,
488 unsigned int &is_active_second)
491 if ((left_count != mid_count) && (mid_count != right_count))
494 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
495 left, mid, left_count, mid_count, epsilon);
497 is_active_second = 1;
498 s_compaction_list[threadIdx.x] = 1;
499 compact_second_chunk = 1;
507 is_active_second = 0;
508 s_compaction_list[threadIdx.x] = 0;
511 if (left_count != mid_count)
513 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
514 left, mid, left_count, mid_count, epsilon);
518 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
519 mid, right, mid_count, right_count, epsilon);
534 template<
typename NumericT>
539 const unsigned int lg_eig_count,
540 const unsigned int ug_eig_count,
542 unsigned int *g_num_one,
543 unsigned int *g_num_blocks_mult,
545 unsigned int *g_pos_one,
547 unsigned int *g_left_count_mult,
548 unsigned int *g_right_count_mult,
549 unsigned int *g_blocks_mult,
550 unsigned int *g_blocks_mult_sum
553 const unsigned int tid = threadIdx.x;
571 __shared__
unsigned int compact_second_chunk;
573 __shared__
unsigned int all_threads_converged;
576 __shared__
unsigned int num_threads_active;
579 __shared__
unsigned int num_threads_compaction;
582 unsigned short *s_compaction_list_exc = s_compaction_list + 1;
589 unsigned int left_count = 0;
590 unsigned int right_count = 0;
594 unsigned int mid_count = 0;
596 unsigned int is_active_second = 0;
599 s_compaction_list[tid] = 0;
602 s_left_count[tid] = 0;
603 s_right_count[tid] = 0;
613 s_left_count[0] = lg_eig_count;
614 s_right_count[0] = ug_eig_count;
616 compact_second_chunk = 0;
617 num_threads_active = 1;
619 num_threads_compaction = 1;
621 all_threads_converged = 1;
632 s_compaction_list[tid] = 0;
637 left, right, left_count, right_count,
638 mid, all_threads_converged);
643 if (1 == all_threads_converged)
671 if (tid < num_threads_active)
680 s_left_count, s_right_count,
682 left_count, mid_count, right_count,
683 epsilon, compact_second_chunk,
684 s_compaction_list_exc,
695 s_left_count[tid] = left_count;
696 s_right_count[tid] = right_count;
698 is_active_second = 0;
708 if (compact_second_chunk > 0)
716 if (compact_second_chunk > 0)
719 mid, right, mid_count, right_count,
720 s_compaction_list, num_threads_active,
731 num_threads_active += s_compaction_list[num_threads_active];
732 num_threads_compaction =
ceilPow2(num_threads_active);
734 compact_second_chunk = 0;
735 all_threads_converged = 1;
740 if (num_threads_compaction > blockDim.x)
755 unsigned int left_count_2;
756 unsigned int right_count_2;
758 unsigned int tid_2 = tid + blockDim.x;
762 left_count = s_left_count[tid];
763 right_count = s_right_count[tid];
766 if (tid_2 < num_threads_active)
768 left_count_2 = s_left_count[tid_2];
769 right_count_2 = s_right_count[tid_2];
774 unsigned short *s_cl_one = s_left_count + 1;
775 unsigned short *s_cl_mult = s_right_count + 1;
779 unsigned short *s_cl_blocking = s_compaction_list_exc;
787 s_right_count[0] = 0;
794 unsigned int is_one_lambda = 0;
795 unsigned int is_one_lambda_2 = 0;
798 unsigned int multiplicity = right_count - left_count;
799 is_one_lambda = (1 == multiplicity);
801 s_cl_one[tid] = is_one_lambda;
802 s_cl_mult[tid] = (! is_one_lambda);
805 s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
806 s_cl_helper[tid] = 0;
808 if (tid_2 < num_threads_active)
811 unsigned int multiplicity = right_count_2 - left_count_2;
812 is_one_lambda_2 = (1 == multiplicity);
814 s_cl_one[tid_2] = is_one_lambda_2;
815 s_cl_mult[tid_2] = (! is_one_lambda_2);
818 s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
819 s_cl_helper[tid_2] = 0;
825 s_cl_blocking[tid_2] = 0;
826 s_cl_helper[tid_2] = 0;
830 scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction,
831 s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);
836 num_threads_compaction, s_cl_blocking, s_cl_helper);
841 unsigned int c_block_iend = 0;
842 unsigned int c_block_iend_2 = 0;
843 unsigned int c_sum_block = 0;
844 unsigned int c_sum_block_2 = 0;
851 if (1 == s_cl_helper[tid])
854 c_block_iend = s_cl_mult[tid] + 1;
855 c_sum_block = s_cl_blocking[tid];
858 if (1 == s_cl_helper[tid_2])
861 c_block_iend_2 = s_cl_mult[tid_2] + 1;
862 c_sum_block_2 = s_cl_blocking[tid_2];
866 s_cl_blocking, s_cl_helper);
873 __shared__
unsigned int num_blocks_mult;
874 __shared__
unsigned int num_mult;
875 __shared__
unsigned int offset_mult_lambda;
880 num_blocks_mult = s_cl_blocking[num_threads_active - 1];
881 offset_mult_lambda = s_cl_one[num_threads_active - 1];
882 num_mult = s_cl_mult[num_threads_active - 1];
884 *g_num_one = offset_mult_lambda;
885 *g_num_blocks_mult = num_blocks_mult;
897 s_left, s_right, s_left_count, s_right_count,
898 s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
899 is_one_lambda, is_one_lambda_2,
900 left, right, left_2, right_2,
901 left_count, right_count, left_count_2, right_count_2,
902 c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2
910 s_cl_blocking[num_blocks_mult] = num_mult;
917 writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
918 g_left_one, g_right_one, g_pos_one,
919 g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
920 s_left, s_right, s_left_count, s_right_count,
921 g_blocks_mult, g_blocks_mult_sum,
922 s_compaction_list, s_cl_helper, offset_mult_lambda);
928 #endif // #ifndef _BISECT_KERNEL_LARGE_H_
__device__ void writeToGmem(const unsigned int tid, const unsigned int tid_2, const unsigned int num_threads_active, const unsigned int num_blocks_mult, NumericT *g_left_one, NumericT *g_right_one, unsigned int *g_pos_one, NumericT *g_left_mult, NumericT *g_right_mult, unsigned int *g_left_count_mult, unsigned int *g_right_count_mult, NumericT *s_left, NumericT *s_right, unsigned short *s_left_count, unsigned short *s_right_count, unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum, unsigned short *s_compaction_list, unsigned short *s_cl_helper, unsigned int offset_mult_lambda)
Write data to global memory.
__device__ void createIndicesCompaction(T *s_compaction_list_exc, unsigned int num_threads_compaction)
#define VIENNACL_BISECT_MAX_THREADS_BLOCK
__device__ void scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int mat_size, const unsigned int num_threads_active, const unsigned int num_threads_compaction, unsigned short *s_cl_one, unsigned short *s_cl_mult, unsigned short *s_cl_blocking, unsigned short *s_cl_helper)
__device__ int ceilPow2(int n)
__global__ void bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT lg, const NumericT ug, const unsigned int lg_eig_count, const unsigned int ug_eig_count, NumericT epsilon, unsigned int *g_num_one, unsigned int *g_num_blocks_mult, NumericT *g_left_one, NumericT *g_right_one, unsigned int *g_pos_one, NumericT *g_left_mult, NumericT *g_right_mult, unsigned int *g_left_count_mult, unsigned int *g_right_count_mult, unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum)
Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix g_d diagonal elements in g...
__device__ void scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2, const unsigned int num_threads_compaction, unsigned short *s_cl_blocking, unsigned short *s_cl_helper)
Compute addresses to obtain compact list of block start addresses.
Global configuration parameters.
__device__ void storeNonEmptyIntervalsLarge(unsigned int addr, const unsigned int num_threads_active, NumericT *s_left, NumericT *s_right, unsigned short *s_left_count, unsigned short *s_right_count, NumericT left, NumericT mid, NumericT right, const unsigned short left_count, const unsigned short mid_count, const unsigned short right_count, NumericT epsilon, unsigned int &compact_second_chunk, unsigned short *s_compaction_list, unsigned int &is_active_second)
__device__ void compactStreamsFinal(const unsigned int tid, const unsigned int tid_2, const unsigned int num_threads_active, unsigned int &offset_mult_lambda, NumericT *s_left, NumericT *s_right, unsigned short *s_left_count, unsigned short *s_right_count, unsigned short *s_cl_one, unsigned short *s_cl_mult, unsigned short *s_cl_blocking, unsigned short *s_cl_helper, unsigned int is_one_lambda, unsigned int is_one_lambda_2, NumericT &left, NumericT &right, NumericT &left_2, NumericT &right_2, unsigned int &left_count, unsigned int &right_count, unsigned int &left_count_2, unsigned int &right_count_2, unsigned int c_block_iend, unsigned int c_sum_block, unsigned int c_block_iend_2, unsigned int c_sum_block_2)
Perform final stream compaction before writing data to global memory.
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
__device__ void compactIntervals(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT mid, NumericT right, unsigned int mid_count, unsigned int right_count, T *s_compaction_list, unsigned int num_threads_active, unsigned int is_active_second)
Perform stream compaction for second child intervals.
__device__ void storeInterval(unsigned int addr, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT left, NumericT right, S left_count, S right_count, NumericT precision)
__device__ void scanSumBlocks(const unsigned int tid, const unsigned int tid_2, const unsigned int num_threads_active, const unsigned int num_threads_compaction, unsigned short *s_cl_blocking, unsigned short *s_cl_helper)
Perform scan to obtain number of eigenvalues before a specific block.
__device__ void subdivideActiveInterval(const unsigned int tid, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, const unsigned int num_threads_active, NumericT &left, NumericT &right, unsigned int &left_count, unsigned int &right_count, NumericT &mid, unsigned int &all_threads_converged)
Subdivide interval if active and not already converged.
__device__ unsigned int computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT x, const unsigned int tid, const unsigned int num_intervals_active, NumericT *s_d, NumericT *s_s, unsigned int converged)