ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
bisect_util.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
2 #define VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2015, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 
30 // includes, project
33 
34 namespace viennacl
35 {
36 namespace linalg
37 {
38 namespace cuda
39 {
44 __device__
45 inline int
46 floorPow2(int n)
47 {
48 
49  // early out if already power of two
50  if (0 == (n & (n-1)))
51  {
52  return n;
53  }
54 
55  int exp;
56  frexp((float)n, &exp);
57  return (1 << (exp - 1));
58 }
59 
64 __device__
65 inline int
66 ceilPow2(int n)
67 {
68 
69  // early out if already power of two
70  if (0 == (n & (n-1)))
71  {
72  return n;
73  }
74 
75  int exp;
76  frexp((float)n, &exp);
77  return (1 << exp);
78 }
79 
86 template<typename NumericT>
87 __device__
88 inline NumericT
89 computeMidpoint(const NumericT left, const NumericT right)
90 {
91 
92  NumericT mid;
93 
95  {
96  mid = left + (right - left) * 0.5f;
97  }
98  else
99  {
100  mid = (left + right) * 0.5f;
101  }
102 
103  return mid;
104 }
105 
121 template<class S, class T, class NumericT>
122 __device__
123 void
124 storeInterval(unsigned int addr,
125  NumericT *s_left, NumericT *s_right,
126  T *s_left_count, T *s_right_count,
127  NumericT left, NumericT right,
128  S left_count, S right_count,
129  NumericT precision)
130 {
131  s_left_count[addr] = left_count;
132  s_right_count[addr] = right_count;
133 
134  // check if interval converged
135  NumericT t0 = abs(right - left);
136  NumericT t1 = max(abs(left), abs(right)) * precision;
137 
138  if (t0 <= max(static_cast<NumericT>(VIENNACL_BISECT_MIN_ABS_INTERVAL), t1))
139  {
140  // compute mid point
141  NumericT lambda = computeMidpoint(left, right);
142 
143  // mark as converged
144  s_left[addr] = lambda;
145  s_right[addr] = lambda;
146  }
147  else
148  {
149 
150  // store current limits
151  s_left[addr] = left;
152  s_right[addr] = right;
153  }
154 }
155 
174 template<typename NumericT>
175 __device__
176 inline unsigned int
177 computeNumSmallerEigenvals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
178  const NumericT x,
179  const unsigned int tid,
180  const unsigned int num_intervals_active,
181  NumericT *s_d, NumericT *s_s,
182  unsigned int converged
183  )
184 {
185 
186  NumericT delta = 1.0f;
187  unsigned int count = 0;
188 
189  __syncthreads();
190 
191  // read data into shared memory
192  if (threadIdx.x < n)
193  {
194  s_d[threadIdx.x] = *(g_d + threadIdx.x);
195  s_s[threadIdx.x] = *(g_s + threadIdx.x - 1);
196  }
197 
198  __syncthreads();
199 
200  // perform loop only for active threads
201  if ((tid < num_intervals_active) && (0 == converged))
202  {
203 
204  // perform (optimized) Gaussian elimination to determine the number
205  // of eigenvalues that are smaller than n
206  for (unsigned int k = 0; k < n; ++k)
207  {
208  delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
209  count += (delta < 0) ? 1 : 0;
210  }
211 
212  } // end if thread currently processing an interval
213 
214  return count;
215 }
234 template<typename NumericT>
235 __device__
236 inline unsigned int
237 computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
238  const NumericT x,
239  const unsigned int tid,
240  const unsigned int num_intervals_active,
241  NumericT *s_d, NumericT *s_s,
242  unsigned int converged
243  )
244 {
245  NumericT delta = 1.0f;
246  unsigned int count = 0;
247 
248  unsigned int rem = n;
249 
250  // do until whole diagonal and superdiagonal has been loaded and processed
251  for (unsigned int i = 0; i < n; i += blockDim.x)
252  {
253 
254  __syncthreads();
255 
256  // read new chunk of data into shared memory
257  if ((i + threadIdx.x) < n)
258  {
259 
260  s_d[threadIdx.x] = *(g_d + i + threadIdx.x);
261  s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1);
262  }
263 
264  __syncthreads();
265 
266 
267  if (tid < num_intervals_active)
268  {
269 
270  // perform (optimized) Gaussian elimination to determine the number
271  // of eigenvalues that are smaller than n
272  for (unsigned int k = 0; k < min(rem,blockDim.x); ++k)
273  {
274  delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
275  // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
276  count += (delta < 0) ? 1 : 0;
277  }
278 
279  } // end if thread currently processing an interval
280 
281  rem -= blockDim.x;
282  }
283 
284  return count;
285 }
286 
306 template<class S, class T, class NumericT>
307 __device__
308 void
309 storeNonEmptyIntervals(unsigned int addr,
310  const unsigned int num_threads_active,
311  NumericT *s_left, NumericT *s_right,
312  T *s_left_count, T *s_right_count,
313  NumericT left, NumericT mid, NumericT right,
314  const S left_count,
315  const S mid_count,
316  const S right_count,
317  NumericT precision,
318  unsigned int &compact_second_chunk,
319  T *s_compaction_list_exc,
320  unsigned int &is_active_second)
321 {
322  // check if both child intervals are valid
323 
324  if ((left_count != mid_count) && (mid_count != right_count))
325  {
326 
327  // store the left interval
328  storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
329  left, mid, left_count, mid_count, precision);
330 
331  // mark that a second interval has been generated, only stored after
332  // stream compaction of second chunk
333  is_active_second = 1;
334  s_compaction_list_exc[threadIdx.x] = 1;
335  compact_second_chunk = 1;
336  }
337  else
338  {
339 
340  // only one non-empty child interval
341 
342  // mark that no second child
343  is_active_second = 0;
344  s_compaction_list_exc[threadIdx.x] = 0;
345 
346  // store the one valid child interval
347  if (left_count != mid_count)
348  {
349  storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
350  left, mid, left_count, mid_count, precision);
351  }
352  else
353  {
354  storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
355  mid, right, mid_count, right_count, precision);
356  }
357 
358  }
359 }
370 template<class T>
371 __device__
372 void
373 createIndicesCompaction(T *s_compaction_list_exc,
374  unsigned int num_threads_compaction)
375 {
376 
377  unsigned int offset = 1;
378  const unsigned int tid = threadIdx.x;
379  // if(tid == 0)
380  // printf("num_threads_compaction = %u\n", num_threads_compaction);
381 
382  // higher levels of scan tree
383  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
384  {
385 
386  __syncthreads();
387 
388  if (tid < d)
389  {
390 
391  unsigned int ai = offset*(2*tid+1)-1;
392  unsigned int bi = offset*(2*tid+2)-1;
393 
394  s_compaction_list_exc[bi] = s_compaction_list_exc[bi]
395  + s_compaction_list_exc[ai];
396  }
397 
398  offset <<= 1;
399  }
400 
401  // traverse down tree: first down to level 2 across
402  for (int d = 2; d < num_threads_compaction; d <<= 1)
403  {
404 
405  offset >>= 1;
406  __syncthreads();
407 
408  if (tid < (d-1))
409  {
410 
411  unsigned int ai = offset*(tid+1) - 1;
412  unsigned int bi = ai + (offset >> 1);
413 
414  s_compaction_list_exc[bi] = s_compaction_list_exc[bi]
415  + s_compaction_list_exc[ai];
416  }
417  }
418 
419  __syncthreads();
420 
421 }
422 
437 template<class T, class NumericT>
438 __device__
439 void
441  T *s_left_count, T *s_right_count,
442  NumericT mid, NumericT right,
443  unsigned int mid_count, unsigned int right_count,
444  T *s_compaction_list,
445  unsigned int num_threads_active,
446  unsigned int is_active_second)
447 {
448  const unsigned int tid = threadIdx.x;
449 
450  // perform compaction / copy data for all threads where the second
451  // child is not dead
452  if ((tid < num_threads_active) && (1 == is_active_second))
453  {
454  unsigned int addr_w = num_threads_active + s_compaction_list[tid];
455  s_left[addr_w] = mid;
456  s_right[addr_w] = right;
457  s_left_count[addr_w] = mid_count;
458  s_right_count[addr_w] = right_count;
459  }
460 }
461 
462 template<class T, class S, class NumericT>
463 __device__
464 void
466  T *s_left_count, T *s_right_count,
467  NumericT &left, NumericT &mid, NumericT &right,
468  S &left_count, S &mid_count, S &right_count,
469  T *s_compaction_list_exc,
470  unsigned int &compact_second_chunk,
471  const unsigned int num_threads_active,
472  unsigned int &is_active_second)
473 {
474  const unsigned int tid = threadIdx.x;
475  const unsigned int multiplicity = right_count - left_count;
476  // check multiplicity of eigenvalue
477  if (1 == multiplicity)
478  {
479 
480  // just re-store intervals, simple eigenvalue
481  s_left[tid] = left;
482  s_right[tid] = right;
483  s_left_count[tid] = left_count;
484  s_right_count[tid] = right_count;
485 
486 
487  // mark that no second child / clear
488  is_active_second = 0;
489  s_compaction_list_exc[tid] = 0;
490  }
491  else
492  {
493 
494  // number of eigenvalues after the split less than mid
495  mid_count = left_count + (multiplicity >> 1);
496 
497  // store left interval
498  s_left[tid] = left;
499  s_right[tid] = right;
500  s_left_count[tid] = left_count;
501  s_right_count[tid] = mid_count;
502  mid = left;
503 
504  // mark that second child interval exists
505  is_active_second = 1;
506  s_compaction_list_exc[tid] = 1;
507  compact_second_chunk = 1;
508  }
509 }
510 
526 template<class T, class NumericT>
527 __device__
528 void
529 subdivideActiveIntervalMulti(const unsigned int tid,
530  NumericT *s_left, NumericT *s_right,
531  T *s_left_count, T *s_right_count,
532  const unsigned int num_threads_active,
533  NumericT &left, NumericT &right,
534  unsigned int &left_count, unsigned int &right_count,
535  NumericT &mid, unsigned int &all_threads_converged)
536 {
537  // for all active threads
538  if (tid < num_threads_active)
539  {
540 
541  left = s_left[tid];
542  right = s_right[tid];
543  left_count = s_left_count[tid];
544  right_count = s_right_count[tid];
545 
546  // check if thread already converged
547  if (left != right)
548  {
549 
550  mid = computeMidpoint(left, right);
551  all_threads_converged = 0;
552  }
553  else if ((right_count - left_count) > 1)
554  {
555  // mark as not converged if multiple eigenvalues enclosed
556  // duplicate interval in storeIntervalsConverged()
557  all_threads_converged = 0;
558  }
559 
560  } // end for all active threads
561 }
562 
563 
579 template<class T, class NumericT>
580 __device__
581 void
582 subdivideActiveInterval(const unsigned int tid,
583  NumericT *s_left, NumericT *s_right,
584  T *s_left_count, T *s_right_count,
585  const unsigned int num_threads_active,
586  NumericT &left, NumericT &right,
587  unsigned int &left_count, unsigned int &right_count,
588  NumericT &mid, unsigned int &all_threads_converged)
589 {
590  // for all active threads
591  if (tid < num_threads_active)
592  {
593 
594  left = s_left[tid];
595  right = s_right[tid];
596  left_count = s_left_count[tid];
597  right_count = s_right_count[tid];
598 
599  // check if thread already converged
600  if (left != right)
601  {
602 
603  mid = computeMidpoint(left, right);
604  all_threads_converged = 0;
605  }
606  } // end for all active threads
607 }
608 }
609 }
610 }
611 
612 #endif // #ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
613 
__device__ void createIndicesCompaction(T *s_compaction_list_exc, unsigned int num_threads_compaction)
__device__ unsigned int computeNumSmallerEigenvals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT x, const unsigned int tid, const unsigned int num_intervals_active, NumericT *s_d, NumericT *s_s, unsigned int converged)
__device__ void storeNonEmptyIntervals(unsigned int addr, const unsigned int num_threads_active, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT left, NumericT mid, NumericT right, const S left_count, const S mid_count, const S right_count, NumericT precision, unsigned int &compact_second_chunk, T *s_compaction_list_exc, unsigned int &is_active_second)
Store all non-empty intervals resulting from the subdivision of the interval currently processed by t...
#define VIENNACL_BISECT_MIN_ABS_INTERVAL
Definition: config.hpp:42
__device__ int floorPow2(int n)
Definition: bisect_util.hpp:46
Utility functions.
__device__ int ceilPow2(int n)
Definition: bisect_util.hpp:66
Global configuration parameters.
float NumericT
Definition: bisect.cpp:40
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:34
__device__ void compactIntervals(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT mid, NumericT right, unsigned int mid_count, unsigned int right_count, T *s_compaction_list, unsigned int num_threads_active, unsigned int is_active_second)
Perform stream compaction for second child intervals.
__device__ void storeInterval(unsigned int addr, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT left, NumericT right, S left_count, S right_count, NumericT precision)
__device__ void subdivideActiveInterval(const unsigned int tid, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, const unsigned int num_threads_active, NumericT &left, NumericT &right, unsigned int &left_count, unsigned int &right_count, NumericT &mid, unsigned int &all_threads_converged)
Subdivide interval if active and not already converged.
__device__ void subdivideActiveIntervalMulti(const unsigned int tid, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, const unsigned int num_threads_active, NumericT &left, NumericT &right, unsigned int &left_count, unsigned int &right_count, NumericT &mid, unsigned int &all_threads_converged)
Subdivide interval if active and not already converged.
__device__ NumericT computeMidpoint(const NumericT left, const NumericT right)
Definition: bisect_util.hpp:89
__device__ void storeIntervalConverged(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT &left, NumericT &mid, NumericT &right, S &left_count, S &mid_count, S &right_count, T *s_compaction_list_exc, unsigned int &compact_second_chunk, const unsigned int num_threads_active, unsigned int &is_active_second)
float sign_f(const float &val)
Sign of number (float)
Definition: util.hpp:72
NumericT max(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:47
__device__ unsigned int computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT x, const unsigned int tid, const unsigned int num_intervals_active, NumericT *s_d, NumericT *s_s, unsigned int converged)
NumericT min(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:91