#ifndef VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
#define VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/amg_operations.hpp
    @brief Implementations of routines for AMG using CUDA on the GPU. Experimental.
*/

#include <cstdlib>
#include <cmath>
#include "viennacl/linalg/cuda/common.hpp"

#include <map>
#include <set>
namespace viennacl
{
namespace linalg
{
namespace cuda
{
namespace amg
{


__global__ void amg_influence_trivial_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          unsigned int size1,
          unsigned int nnz,
          unsigned int *influences_row,
          unsigned int *influences_id,
          unsigned int *influences_values
          )
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size1; i += global_size)
  {
    unsigned int tmp = row_indices[i];
    influences_row[i] = tmp;
    influences_values[i] = row_indices[i+1] - tmp;
  }

  for (unsigned int i = global_id; i < nnz; i += global_size)
    influences_id[i] = column_indices[i];

  if (global_id == 0)
    influences_row[size1] = row_indices[size1];
}
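
/* Editorial example (not part of ViennaCL): since every connection is taken as
   strong, the kernel merely copies the CSR pattern of A. For the hypothetical
   3x3 tridiagonal matrix

       [ 4 -1  0 ]
   A = [-1  4 -1 ],   row_indices    = {0, 2, 5, 7},
       [ 0 -1  4 ]    column_indices = {0,1, 0,1,2, 1,2},

   the kernel produces

       influences_row    = {0, 2, 5, 7}       // copy of row_indices, incl. finalizer
       influences_id     = {0,1, 0,1,2, 1,2}  // copy of column_indices
       influences_values = {2, 3, 2}          // number of influences per point
*/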


/** @brief Routine for taking all connections in the matrix as strong. */
template<typename NumericT>
void amg_influence_trivial(compressed_matrix<NumericT> const & A,
                           viennacl::linalg::detail::amg::amg_level_context & amg_context,
                           viennacl::linalg::amg_tag & tag)
{
  (void)tag;

  amg_influence_trivial_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                             viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                             static_cast<unsigned int>(A.size1()),
                                             static_cast<unsigned int>(A.nnz()),
                                             viennacl::cuda_arg(amg_context.influence_jumper_),
                                             viennacl::cuda_arg(amg_context.influence_ids_),
                                             viennacl::cuda_arg(amg_context.influence_values_)
                                            );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_influence_trivial_kernel");
}


/** @brief Routine for extracting strongly connected points considering a user-provided threshold value. */
template<typename NumericT>
void amg_influence_advanced(compressed_matrix<NumericT> const & A,
                            viennacl::linalg::detail::amg::amg_level_context & amg_context,
                            viennacl::linalg::amg_tag & tag)
{
  throw std::runtime_error("not implemented yet");
}

/** @brief Dispatcher for influence processing. */
template<typename NumericT>
void amg_influence(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{
  // TODO: dispatch based on influence tolerance provided
  amg_influence_trivial(A, amg_context, tag);
}

/** @brief Assigns IDs to the coarse points. */
inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
{
  viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size());
  viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(), amg_context.coarse_id_.size());
  viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get());
  viennacl::backend::memory_read(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());

  unsigned int coarse_id = 0;
  for (std::size_t i=0; i<amg_context.point_types_.size(); ++i)
  {
    coarse_ids.set(i, coarse_id);
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      ++coarse_id;
  }

  amg_context.num_coarse_ = coarse_id;

  viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
}
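
/* Editorial example (not part of ViennaCL): the loop above is an exclusive scan
   over the coarse flags. For hypothetical point types

       point_types = {COARSE, FINE, COARSE, COARSE, FINE}

   it assigns

       coarse_ids = {0, 1, 1, 2, 3}   and   num_coarse_ = 3,

   so each coarse point receives a contiguous ID; the values written for fine
   points are placeholders until aggregation overwrites them.
*/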


/** @brief CUDA kernel initializing the work vectors at each PMIS iteration. */
template<typename IndexT>
__global__ void amg_pmis2_init_workdata(IndexT *work_state,
                                        IndexT *work_random,
                                        IndexT *work_index,
                                        IndexT const *point_types,
                                        IndexT const *random_weights,
                                        unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    switch (point_types[i])
    {
    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: work_state[i] = 1; break;
    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE:      work_state[i] = 0; break;
    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE:    work_state[i] = 2; break;
    default:
      break; // do nothing
    }

    work_random[i] = random_weights[i];
    work_index[i]  = i;
  }
}
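
/* Editorial note (not part of ViennaCL): the work_state encoding (fine = 0,
   undecided = 1, coarse = 2) makes the subsequent maximum propagation prefer
   coarse points over undecided points and undecided points over fine points;
   amg_pmis2_mark_mis_nodes() relies on exactly this mapping when it tests
   max_state == 2.
*/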


/** @brief CUDA kernel propagating the state triple (status, weight, nodeID) to neighbors using a max()-operation. */
template<typename IndexT>
__global__ void amg_pmis2_max_neighborhood(IndexT const *work_state,
                                           IndexT const *work_random,
                                           IndexT const *work_index,
                                           IndexT *work_state2,
                                           IndexT *work_random2,
                                           IndexT *work_index2,
                                           IndexT const *influences_row,
                                           IndexT const *influences_id,
                                           unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    // load
    unsigned int state  = work_state[i];
    unsigned int random = work_random[i];
    unsigned int index  = work_index[i];

    // max
    unsigned int j_stop = influences_row[i + 1];
    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
    {
      unsigned int influenced_point_id = influences_id[j];

      // lexicographical triple-max (not particularly pretty, but does the job):
      if (state < work_state[influenced_point_id])
      {
        state  = work_state[influenced_point_id];
        random = work_random[influenced_point_id];
        index  = work_index[influenced_point_id];
      }
      else if (state == work_state[influenced_point_id])
      {
        if (random < work_random[influenced_point_id])
        {
          state  = work_state[influenced_point_id];
          random = work_random[influenced_point_id];
          index  = work_index[influenced_point_id];
        }
        else if (random == work_random[influenced_point_id])
        {
          if (index < work_index[influenced_point_id])
          {
            state  = work_state[influenced_point_id];
            random = work_random[influenced_point_id];
            index  = work_index[influenced_point_id];
          }
        } // max(random)
      } // max(state)
    } // for

    // store
    work_state2[i]  = state;
    work_random2[i] = random;
    work_index2[i]  = index;
  }
}
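
/* Editorial note (not part of ViennaCL): the nested comparisons above compute a
   plain lexicographic maximum over the triples (state, random, index) of a point
   and its neighbors. An equivalent, more compact device helper would be:

     __device__ bool triple_less(unsigned int s1, unsigned int r1, unsigned int i1,
                                 unsigned int s2, unsigned int r2, unsigned int i2)
     {
       if (s1 != s2) return s1 < s2;
       if (r1 != r2) return r1 < r2;
       return i1 < i2;
     }

   The random weights break ties between points of equal state, and the node
   index breaks ties between equal random weights, so the maximum is unique.
*/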

/** @brief CUDA kernel for marking MIS and non-MIS nodes. */
template<typename IndexT>
__global__ void amg_pmis2_mark_mis_nodes(IndexT const *work_state,
                                         IndexT const *work_index,
                                         IndexT *point_types,
                                         IndexT *undecided_buffer,
                                         unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  unsigned int num_undecided = 0;
  for (unsigned int i = global_id; i < size; i += global_size)
  {
    unsigned int max_state = work_state[i];
    unsigned int max_index = work_index[i];

    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
    {
      if (i == max_index) // make this a MIS node
        point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
      else if (max_state == 2) // mind the mapping of viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE above!
        point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
      else
        num_undecided += 1;
    }
  }

  // reduction of the number of undecided nodes:
  __shared__ unsigned int shared_buffer[256];
  shared_buffer[threadIdx.x] = num_undecided;
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_buffer[threadIdx.x] += shared_buffer[threadIdx.x+stride];
  }

  if (threadIdx.x == 0)
    undecided_buffer[blockIdx.x] = shared_buffer[0];

}
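
/* Editorial note (not part of ViennaCL): the shared-memory reduction above
   assumes blockDim.x is a power of two no larger than 256 (the launches below
   use 128 threads per block). Each block emits one partial sum, so
   undecided_buffer must provide at least gridDim.x entries; the final sum over
   these partial results is formed on the host in amg_coarse_ag_stage1_mis2().
*/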

/** @brief CUDA kernel resetting all non-coarse points to undecided, so that subsequent kernels work correctly. */
__global__ void amg_pmis2_reset_state(unsigned int *point_types, unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
  }
}

/** @brief AG (aggregation based) coarsening, CUDA version of stage 1: computes a maximal independent set of distance 2 (MIS-2).
*
* @param A             Operator matrix on the current level
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
*/
template<typename NumericT>
void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
                               viennacl::linalg::detail::amg::amg_level_context & amg_context,
                               viennacl::linalg::amg_tag & tag)
{
  viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
  for (std::size_t i=0; i<random_weights.size(); ++i)
    random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
  random_weights.switch_memory_context(viennacl::traits::context(A));

  // work vectors:
  viennacl::vector<unsigned int> work_state(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index(A.size1(), viennacl::traits::context(A));

  viennacl::vector<unsigned int> work_state2(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index2(A.size1(), viennacl::traits::context(A));

  unsigned int num_undecided = static_cast<unsigned int>(A.size1());
  viennacl::vector<unsigned int> undecided_buffer(128, viennacl::traits::context(A));
  viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());

  unsigned int pmis_iters = 0;
  while (num_undecided > 0)
  {
    ++pmis_iters;

    //
    // init temporary work data:
    //
    amg_pmis2_init_workdata<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                          viennacl::cuda_arg(work_random),
                                          viennacl::cuda_arg(work_index),
                                          viennacl::cuda_arg(amg_context.point_types_),
                                          viennacl::cuda_arg(random_weights),
                                          static_cast<unsigned int>(A.size1())
                                         );
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_init_workdata");


    //
    // Propagate maximum tuple twice:
    //
    for (unsigned int r = 0; r < 2; ++r)
    {
      // max operation over neighborhood
      amg_pmis2_max_neighborhood<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                               viennacl::cuda_arg(work_random),
                                               viennacl::cuda_arg(work_index),
                                               viennacl::cuda_arg(work_state2),
                                               viennacl::cuda_arg(work_random2),
                                               viennacl::cuda_arg(work_index2),
                                               viennacl::cuda_arg(amg_context.influence_jumper_),
                                               viennacl::cuda_arg(amg_context.influence_ids_),
                                               static_cast<unsigned int>(A.size1())
                                              );
      VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_max_neighborhood");

      // copy work arrays (could be fused into a single kernel if needed; the previous kernel is in most cases sufficiently heavy)
      work_state  = work_state2;
      work_random = work_random2;
      work_index  = work_index2;
    }

    //
    // mark MIS and non-MIS nodes:
    //
    amg_pmis2_mark_mis_nodes<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                           viennacl::cuda_arg(work_index),
                                           viennacl::cuda_arg(amg_context.point_types_),
                                           viennacl::cuda_arg(undecided_buffer),
                                           static_cast<unsigned int>(A.size1())
                                          );
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_mark_mis_nodes");

    // get number of undecided points on host:
    viennacl::backend::memory_read(undecided_buffer.handle(), 0, undecided_buffer_host.raw_size(), undecided_buffer_host.get());
    num_undecided = 0;
    for (std::size_t i=0; i<undecided_buffer.size(); ++i)
      num_undecided += undecided_buffer_host[i];

  } //while

  // consistency with sequential MIS: reset state for non-coarse points, so that coarse indices are correctly picked up later
  amg_pmis2_reset_state<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                      static_cast<unsigned int>(amg_context.point_types_.size())
                                     );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state");
}
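
/* Editorial note (not part of ViennaCL): propagating the maximum tuple twice per
   iteration yields independence with respect to distance-2 neighborhoods (MIS-2):
   a point only becomes a MIS node if it holds the largest tuple among all points
   reachable within two hops, which keeps the resulting aggregate roots well
   separated.
*/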




/** @brief CUDA kernel propagating the coarse aggregate ID of each coarse point to its neighbors. */
template<typename IndexT>
__global__ void amg_agg_propagate_coarse_indices(IndexT *point_types,
                                                 IndexT *coarse_ids,
                                                 IndexT const *influences_row,
                                                 IndexT const *influences_id,
                                                 unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
    {
      unsigned int coarse_index = coarse_ids[i];

      unsigned int j_stop = influences_row[i + 1];
      for (unsigned int j = influences_row[i]; j < j_stop; ++j)
      {
        unsigned int influenced_point_id = influences_id[j];
        coarse_ids[influenced_point_id] = coarse_index; // Set aggregate index for fine point

        if (influenced_point_id != i) // Note: Any write races between threads are harmless here
          point_types[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
      }
    }
  }
}
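
/* Editorial example (not part of ViennaCL): if point 4 is coarse with
   coarse_ids[4] = 1 and influences points 3 and 5, the loop above sets
   coarse_ids[3] = coarse_ids[5] = 1 and marks points 3 and 5 as fine, so the
   aggregate rooted at point 4 absorbs its direct neighbors.
*/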


/** @brief CUDA kernel merging undecided points into a neighboring aggregate. */
template<typename IndexT>
__global__ void amg_agg_merge_undecided(IndexT *point_types,
                                        IndexT *coarse_ids,
                                        IndexT const *influences_row,
                                        IndexT const *influences_id,
                                        unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
    {
      unsigned int j_stop = influences_row[i + 1];
      for (unsigned int j = influences_row[i]; j < j_stop; ++j)
      {
        unsigned int influenced_point_id = influences_id[j];
        if (point_types[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point
        {
          //std::cout << "Setting fine node " << i << " to be aggregated with node " << *influence_iter << "/" << pointvector.get_coarse_index(*influence_iter) << std::endl;
          coarse_ids[i] = coarse_ids[influenced_point_id];
          break;
        }
      }
    }
  }
}


/** @brief CUDA kernel setting the remaining undecided points to fine points. */
__global__ void amg_agg_merge_undecided_2(unsigned int *point_types,
                                          unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
  }
}


/** @brief AG (aggregation based) coarsening. CUDA version.
*
* @param A             Operator matrix on the current level
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
*/
template<typename NumericT>
void amg_coarse_ag(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{

  amg_influence_trivial(A, amg_context, tag);

  //
  // Stage 1: Build aggregates:
  //
  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
    amg_coarse_ag_stage1_mis2(A, amg_context, tag);
  else
    throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with CUDA backend!");

  enumerate_coarse_points(amg_context);

  //
  // Stage 2: Propagate coarse aggregate indices to neighbors:
  //
  amg_agg_propagate_coarse_indices<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                                 viennacl::cuda_arg(amg_context.coarse_id_),
                                                 viennacl::cuda_arg(amg_context.influence_jumper_),
                                                 viennacl::cuda_arg(amg_context.influence_ids_),
                                                 static_cast<unsigned int>(A.size1())
                                                );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_propagate_coarse_indices");


  //
  // Stage 3: Merge remaining undecided points (merging to the first aggregate found when cycling over the neighborhood):
  //
  amg_agg_merge_undecided<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                        viennacl::cuda_arg(amg_context.coarse_id_),
                                        viennacl::cuda_arg(amg_context.influence_jumper_),
                                        viennacl::cuda_arg(amg_context.influence_ids_),
                                        static_cast<unsigned int>(A.size1())
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided");

  //
  // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3).
  // Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
  //
  amg_agg_merge_undecided_2<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                          static_cast<unsigned int>(A.size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided_2");
}



/** @brief Calls the right coarsening procedure.
*
* @param A             Operator matrix on all levels
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
*/
template<typename InternalT1>
void amg_coarse(InternalT1 & A,
                viennacl::linalg::detail::amg::amg_level_context & amg_context,
                viennacl::linalg::amg_tag & tag)
{
  switch (tag.get_coarsening_method())
  {
  case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
  default: throw std::runtime_error("not implemented yet");
  }
}
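
/* Editorial usage sketch (not part of ViennaCL; assumes an amg_tag 'tag' and an
   amg_level_context 'ctx' whose vectors were already sized for A by the AMG
   setup, cf. viennacl/linalg/detail/amg/amg_base.hpp):

     viennacl::linalg::cuda::amg::amg_coarse(A, ctx, tag);
     // afterwards: ctx.point_types_ flags each point as coarse or fine,
     //             ctx.coarse_id_  holds the aggregate ID of each point,
     //             ctx.num_coarse_ is the number of aggregates (coarse points)
*/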




////////////////////////////////////// Interpolation /////////////////////////////

/** @brief CUDA kernel assembling the tentative prolongation matrix (one unit entry per row). */
template<typename NumericT>
__global__ void amg_interpol_ag_kernel(unsigned int *P_row_buffer,
                                       unsigned int *P_col_buffer,
                                       NumericT *P_elements,
                                       unsigned int *coarse_ids,
                                       unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    P_row_buffer[i] = i;
    P_col_buffer[i] = coarse_ids[i];
    P_elements[i]   = NumericT(1);
  }

  // set last entry as well:
  if (global_id == 0)
    P_row_buffer[size] = size;
}
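
/* Editorial example (not part of ViennaCL): the tentative prolongation P has
   exactly one entry per row, P(i, coarse_ids[i]) = 1. For the hypothetical
   assignment coarse_ids = {0, 0, 1, 1} the kernel fills

       P_row_buffer = {0, 1, 2, 3, 4}   // one nonzero per row, plus finalizer
       P_col_buffer = {0, 0, 1, 1}
       P_elements   = {1, 1, 1, 1}

   which is the piecewise-constant interpolation

       [1 0]
   P = [1 0]
       [0 1]
       [0 1].
*/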

/** @brief AG (aggregation based) interpolation.
*
* @param A             Operator matrix
* @param P             Prolongation matrix
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
*/
template<typename NumericT>
void amg_interpol_ag(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  (void)tag;
  P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  amg_interpol_ag_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(P.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(P.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(P.handle().cuda_handle()),
                                       viennacl::cuda_arg(amg_context.coarse_id_),
                                       static_cast<unsigned int>(A.size1())
                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_ag_kernel");

  P.generate_row_block_information();
}



/** @brief CUDA kernel assembling the damped Jacobi smoother matrix for smoothed aggregation. */
template<typename NumericT>
__global__ void amg_interpol_sa_kernel(
          const unsigned int *A_row_indices,
          const unsigned int *A_col_indices,
          const NumericT *A_elements,
          unsigned int A_size1,
          unsigned int A_nnz,
          unsigned int *Jacobi_row_indices,
          unsigned int *Jacobi_col_indices,
          NumericT *Jacobi_elements,
          NumericT omega
          )
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int row = global_id; row < A_size1; row += global_size)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    Jacobi_row_indices[row] = row_begin;

    // Step 1: Extract diagonal:
    NumericT diag = 0;
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      if (A_col_indices[j] == row)
      {
        diag = A_elements[j];
        break;
      }
    }

    // Step 2: Write entries:
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col_index = A_col_indices[j];
      Jacobi_col_indices[j] = col_index;

      if (col_index == row)
        Jacobi_elements[j] = NumericT(1) - omega;
      else
        Jacobi_elements[j] = - omega * A_elements[j] / diag;
    }
  }

  if (global_id == 0)
    Jacobi_row_indices[A_size1] = A_nnz; // don't forget the finalizer entry
}
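
/* Editorial note (not part of ViennaCL): the kernel assembles the damped Jacobi
   smoothing operator on the sparsity pattern of A,

       J = I - omega * D^{-1} * A,   D = diag(A),

   i.e. J(row,row) = 1 - omega and J(row,col) = -omega * A(row,col) / A(row,row)
   for the off-diagonal entries. Smoothed aggregation interpolation then applies
   J to the tentative prolongation, cf. amg_interpol_sa() below.
*/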



/** @brief Smoothed aggregation interpolation.
*
* @param A             Operator matrix
* @param P             Prolongation matrix
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
*/
template<typename NumericT>
void amg_interpol_sa(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  (void)tag;
  viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  // form tentative operator:
  amg_interpol_ag(A, P_tentative, amg_context, tag);

  viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));

  amg_interpol_sa_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                       static_cast<unsigned int>(A.size1()),
                                       static_cast<unsigned int>(A.nnz()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(Jacobi.handle().cuda_handle()),
                                       static_cast<NumericT>(tag.get_jacobi_weight())
                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_sa_kernel");

  P = viennacl::linalg::prod(Jacobi, P_tentative);

  P.generate_row_block_information();
}


/** @brief Dispatcher for building the interpolation matrix.
*
* @param A             Operator matrix
* @param P             Prolongation matrix
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
*/
template<typename MatrixT>
void amg_interpol(MatrixT const & A,
                  MatrixT & P,
                  viennacl::linalg::detail::amg::amg_level_context & amg_context,
                  viennacl::linalg::amg_tag & tag)
{
  switch (tag.get_interpolation_method())
  {
  case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION:          amg_interpol_ag(A, P, amg_context, tag); break;
  case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa(A, P, amg_context, tag); break;
  default: throw std::runtime_error("Not implemented yet!");
  }
}


/** @brief CUDA kernel writing the nonzeros of a CSR matrix to the corresponding entries of a dense matrix. */
template<typename NumericT>
__global__ void compressed_matrix_assign_to_dense(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT *B,
          unsigned int B_row_start, unsigned int B_col_start,
          unsigned int B_row_inc, unsigned int B_col_inc,
          unsigned int B_row_size, unsigned int B_col_size,
          unsigned int B_internal_rows, unsigned int B_internal_cols)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < B_row_size;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_end = row_indices[row+1];
    for (unsigned int j = row_indices[row]; j < row_end; ++j)
      B[(B_row_start + row * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j];
  }
}
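
/* Editorial note (not part of ViennaCL): the index expression above addresses a
   row-major dense matrix whose padded leading dimension is B_internal_cols; the
   start/inc/size parameters allow writing into a sub-matrix (range or slice) of
   B, matching the viennacl::traits accessors used in assign_to_dense() below.
*/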

template<typename NumericT, unsigned int AlignmentV>
void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                     viennacl::matrix_base<NumericT> & B)
{
  compressed_matrix_assign_to_dense<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                  viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(B),
                                                  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B))
                                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_assign_to_dense");
}



/** @brief CUDA kernel performing one damped Jacobi relaxation sweep for a CSR matrix. */
template<typename NumericT>
__global__ void compressed_matrix_smooth_jacobi_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT weight,
          const NumericT * x_old,
          NumericT * x_new,
          const NumericT * rhs,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT sum = NumericT(0);
    NumericT diag = NumericT(1);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int j = row_indices[row]; j < row_end; ++j)
    {
      unsigned int col = column_indices[j];
      if (col == row)
        diag = elements[j];
      else
        sum += elements[j] * x_old[col];
    }
    x_new[row] = weight * (rhs[row] - sum) / diag + (NumericT(1) - weight) * x_old[row];
  }
}
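
/* Editorial note (not part of ViennaCL): one sweep of the kernel computes the
   damped Jacobi update

       x_new = (1 - weight) * x_old + weight * D^{-1} * (rhs - (A - D) * x_old),

   where diag defaults to 1 for the (degenerate) case of a row without a
   diagonal entry.
*/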




/** @brief Damped Jacobi smoother (CUDA version).
*
* @param iterations  Number of smoother iterations (sweeps)
* @param A           Operator matrix for the smoothing
* @param x           The vector smoothing is applied to
* @param x_backup    (Different) temporary vector of the same size as x
* @param rhs_smooth  The right hand side of the linear system
* @param weight      Damping factor (omega) for the Jacobi iteration
*/
template<typename NumericT>
void smooth_jacobi(unsigned int iterations,
                   compressed_matrix<NumericT> const & A,
                   vector<NumericT> & x,
                   vector<NumericT> & x_backup,
                   vector<NumericT> const & rhs_smooth,
                   NumericT weight)
{
  for (unsigned int i=0; i<iterations; ++i)
  {
    x_backup = x;

    compressed_matrix_smooth_jacobi_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                         viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                         viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                         static_cast<NumericT>(weight),
                                                         viennacl::cuda_arg(x_backup),
                                                         viennacl::cuda_arg(x),
                                                         viennacl::cuda_arg(rhs_smooth),
                                                         static_cast<unsigned int>(rhs_smooth.size())
                                                        );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_smooth_jacobi_kernel");
  }
}
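
/* Editorial usage sketch (not part of ViennaCL): three damped Jacobi sweeps on
   A*x = b with damping 0.67, using a scratch vector of matching size and context:

     viennacl::vector<double> x_tmp(x.size(), viennacl::traits::context(x));
     viennacl::linalg::cuda::amg::smooth_jacobi(3, A, x, x_tmp, b, 0.67);
*/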


} //namespace amg
} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif