sparse_matrix_operations_solve.hpp
#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
    @brief Implementations of direct triangular solvers for sparse matrices using CUDA
*/

#include "viennacl/forwards.h"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
//
// Compressed matrix
//

//
// non-transposed
//

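/** @brief In-place forward substitution for a unit lower triangular matrix in CSR format (the diagonal is not referenced).
 *
 * All threads of the (single) block cooperatively stage a window of nonzeros, their column indices, and the
 * corresponding vector entries in shared memory; thread 0 then runs the actual substitution sequentially over
 * that window. The shared buffers hold 128 entries, so blockDim.x must not exceed 128, and the sequential
 * dependency between rows implies a launch with a single block.
 *
 * Illustrative launch only (raw device pointers and sizes are assumptions, not part of this file):
 *   csr_unit_lu_forward_kernel<<<1, 128>>>(row_ptr, col_idx, values, x, n);
 */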
template<typename NumericT>
__global__ void csr_unit_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = 0;
  unsigned int row_at_window_start = 0;
  NumericT current_vector_entry = vector[0];
  unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
  unsigned int next_row = row_indices[1];

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data:
      for (unsigned int k=0; k<blockDim.x; ++k)
      {
        if (current_row < size && i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry;
          ++current_row;
          if (current_row < size) //load next row's data
          {
            next_row = row_indices[current_row+1];
            current_vector_entry = vector[current_row];
          }
        }

        if (current_row < size && col_index_buffer[k] < current_row) //substitute
        {
          if (col_index_buffer[k] < row_at_window_start) //use recently computed results
            current_vector_entry -= element_buffer[k] * vector_buffer[k];
          else if (col_index_buffer[k] < current_row) //use buffered data
            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        }

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();
  } //for i
}

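/** @brief In-place forward substitution for a (non-unit) lower triangular matrix in CSR format.
 *
 * Uses the same windowed shared-memory scheme as csr_unit_lu_forward_kernel, but thread 0 additionally
 * picks up the diagonal entry of the current row and divides the accumulated result by it once the row is
 * completed. Launch with a single block and blockDim.x <= 128.
 */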
template<typename NumericT>
__global__ void csr_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = 0;
  unsigned int row_at_window_start = 0;
  NumericT current_vector_entry = vector[0];
  NumericT diagonal_entry = 0;
  unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
  unsigned int next_row = row_indices[1];

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data:
      for (unsigned int k=0; k<blockDim.x; ++k)
      {
        if (current_row < size && i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry / diagonal_entry;
          ++current_row;
          if (current_row < size) //load next row's data
          {
            next_row = row_indices[current_row+1];
            current_vector_entry = vector[current_row];
          }
        }

        if (current_row < size && col_index_buffer[k] < current_row) //substitute
        {
          if (col_index_buffer[k] < row_at_window_start) //use recently computed results
            current_vector_entry -= element_buffer[k] * vector_buffer[k];
          else if (col_index_buffer[k] < current_row) //use buffered data
            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        }
        else if (col_index_buffer[k] == current_row)
          diagonal_entry = element_buffer[k];

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();
  } //for i
}

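/** @brief In-place backward substitution for a unit upper triangular matrix in CSR format (the diagonal is not referenced).
 *
 * The nonzeros are traversed from the last entry towards the first in windows of blockDim.x entries staged in
 * shared memory by all threads; thread 0 performs the elimination sequentially within each window.
 * Launch with a single block and blockDim.x <= 128.
 */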
template<typename NumericT>
__global__ void csr_unit_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = size-1;
  unsigned int row_at_window_start = size-1;
  NumericT current_vector_entry = vector[size-1];
  unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
  unsigned int next_row = row_indices[size-1];

  unsigned int i = loop_end + threadIdx.x;
  while (1)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data from back to front:
      for (unsigned int k2=0; k2<blockDim.x; ++k2)
      {
        unsigned int k = (blockDim.x - k2) - 1;

        if (i+k >= nnz)
          continue;

        if (col_index_buffer[k] > row_at_window_start) //use recently computed results
          current_vector_entry -= element_buffer[k] * vector_buffer[k];
        else if (col_index_buffer[k] > current_row) //use buffered data
          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];

        if (i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry;
          if (current_row > 0) //load next row's data
          {
            --current_row;
            next_row = row_indices[current_row];
            current_vector_entry = vector[current_row];
          }
        }

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();

    if (i < blockDim.x)
      break;

    i -= blockDim.x;
  } //for i
}

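/** @brief In-place backward substitution for a (non-unit) upper triangular matrix in CSR format.
 *
 * Same back-to-front windowed scheme as csr_unit_lu_backward_kernel, but thread 0 also records the diagonal
 * entry of the current row and divides the accumulated result by it when the row is written back.
 * Launch with a single block and blockDim.x <= 128.
 */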
template<typename NumericT>
__global__ void csr_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = size-1;
  unsigned int row_at_window_start = size-1;
  NumericT current_vector_entry = vector[size-1];
  NumericT diagonal_entry;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
  unsigned int next_row = row_indices[size-1];

  unsigned int i = loop_end + threadIdx.x;
  while (1)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data from back to front:
      for (unsigned int k2=0; k2<blockDim.x; ++k2)
      {
        unsigned int k = (blockDim.x - k2) - 1;

        if (i+k >= nnz)
          continue;

        if (col_index_buffer[k] > row_at_window_start) //use recently computed results
          current_vector_entry -= element_buffer[k] * vector_buffer[k];
        else if (col_index_buffer[k] > current_row) //use buffered data
          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        else if (col_index_buffer[k] == current_row)
          diagonal_entry = element_buffer[k];

        if (i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry / diagonal_entry;
          if (current_row > 0) //load next row's data
          {
            --current_row;
            next_row = row_indices[current_row];
            current_vector_entry = vector[current_row];
          }
        }

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();

    if (i < blockDim.x)
      break;

    i -= blockDim.x;
  } //for i
}

//
// transposed
//

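/** @brief Forward substitution for a lower triangular matrix that is stored transposed in memory (each CSR row acts as a column of the factor).
 *
 * Rows are processed strictly in order; for each row, all threads of the (single) block scatter the update
 * -x[row] * A(row, col) to the entries with column index larger than the row index. The diagonal is not
 * referenced, i.e. a unit diagonal is assumed.
 */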
template<typename NumericT>
__global__ void csr_trans_lu_forward_kernel2(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  for (unsigned int row = 0; row < size; ++row)
  {
    NumericT result_entry = vector[row];

    unsigned int row_start = row_indices[row];
    unsigned int row_stop  = row_indices[row + 1];
    for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
    {
      unsigned int col_index = column_indices[entry_index];
      if (col_index > row)
        vector[col_index] -= result_entry * elements[entry_index];
    }

    __syncthreads();
  }
}

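/** @brief Forward substitution for a transposed unit lower triangular CSR matrix using a shared-memory row lookahead.
 *
 * Each thread is assigned one nonzero of the current window, determines the row it belongs to via a lookahead
 * buffer of row start offsets, and applies its update when the sequential elimination sweep reaches that row.
 * The lookahead buffers hold 256 entries, so blockDim.x must not exceed 256; a single block is assumed.
 */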
template<typename NumericT>
__global__ void csr_trans_unit_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = 0;
  unsigned int row_at_window_end = 0;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    col_index    = (i < nnz) ? column_indices[i] : 0;
    matrix_entry = (i < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;

    __syncthreads();

    if (i < nnz)
    {
      unsigned int row_index_inc = 0;
      while (i >= row_index_lookahead[row_index_inc + 1])
        ++row_index_inc;
      row_index = row_at_window_start + row_index_inc;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = size - 1;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //forward elimination
    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
    {
      NumericT result_entry = vector[row];

      if ( (row_index == row) && (col_index > row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

}

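/** @brief Forward substitution for a transposed (non-unit) lower triangular CSR matrix with a separately stored diagonal.
 *
 * Same row-lookahead scheme as csr_trans_unit_lu_forward_kernel, but each row's entry is scaled by its
 * diagonal before being scattered, and the whole vector is divided by the diagonal entries at the end.
 * blockDim.x must not exceed 256; a single block is assumed.
 */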
template<typename NumericT>
__global__ void csr_trans_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * diagonal_entries,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = 0;
  unsigned int row_at_window_end = 0;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    col_index    = (i < nnz) ? column_indices[i] : 0;
    matrix_entry = (i < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;

    __syncthreads();

    if (i < nnz)
    {
      unsigned int row_index_inc = 0;
      while (i >= row_index_lookahead[row_index_inc + 1])
        ++row_index_inc;
      row_index = row_at_window_start + row_index_inc;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = size - 1;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //forward elimination
    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
    {
      NumericT result_entry = vector[row] / diagonal_entries[row];

      if ( (row_index == row) && (col_index > row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

  // final step: Divide vector by diagonal entries:
  for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
    vector[i] /= diagonal_entries[i];

}

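/** @brief Backward substitution for a transposed unit upper triangular CSR matrix using the shared-memory row lookahead scheme.
 *
 * The nonzeros are processed from the last entry towards the first; each thread scatters its update to entries
 * with smaller column index once the sweep reaches the row its nonzero belongs to. The diagonal is not
 * referenced. blockDim.x must not exceed 256; a single block is assumed.
 */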
template<typename NumericT>
__global__ void csr_trans_unit_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = size;
  unsigned int row_at_window_end;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
  {
    unsigned int i = (nnz - i2) - 1;
    col_index    = (i2 < nnz) ? column_indices[i] : 0;
    matrix_entry = (i2 < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;

    __syncthreads();

    if (i2 < nnz)
    {
      unsigned int row_index_dec = 0;
      while (row_index_lookahead[row_index_dec] > i)
        ++row_index_dec;
      row_index = row_at_window_start - row_index_dec;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = 0;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //backward elimination
    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
    {
      unsigned int row = row_at_window_start - row2;
      NumericT result_entry = vector[row];

      if ( (row_index == row) && (col_index < row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

}

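/** @brief Simple row-by-row backward substitution for a transposed upper triangular CSR matrix with a separately stored diagonal.
 *
 * Rows are processed from last to first by a single block; all threads scatter the diagonally scaled update
 * to entries with smaller column index, and thread 0 writes the scaled result back to the vector.
 */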
template<typename NumericT>
__global__ void csr_trans_lu_backward_kernel2(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * diagonal_entries,
                NumericT * vector,
          unsigned int size)
{
  NumericT result_entry = 0;

  //backward elimination, using U and D:
  for (unsigned int row2 = 0; row2 < size; ++row2)
  {
    unsigned int row = (size - row2) - 1;
    result_entry = vector[row] / diagonal_entries[row];

    unsigned int row_start = row_indices[row];
    unsigned int row_stop  = row_indices[row + 1];
    for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
    {
      unsigned int col_index = column_indices[entry_index];
      if (col_index < row)
        vector[col_index] -= result_entry * elements[entry_index];
    }

    __syncthreads();

    if (threadIdx.x == 0)
      vector[row] = result_entry;
  }
}

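/** @brief Backward substitution for a transposed (non-unit) upper triangular CSR matrix with a separately stored diagonal, using the shared-memory row lookahead scheme.
 *
 * The nonzeros are processed from the last entry towards the first; each row's entry is scaled by its diagonal
 * before being scattered, and the vector is finally divided by the diagonal entries.
 * blockDim.x must not exceed 256; a single block is assumed.
 */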
template<typename NumericT>
__global__ void csr_trans_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * diagonal_entries,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = size;
  unsigned int row_at_window_end;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
  {
    unsigned int i = (nnz - i2) - 1;
    col_index    = (i2 < nnz) ? column_indices[i] : 0;
    matrix_entry = (i2 < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;

    __syncthreads();

    if (i2 < nnz)
    {
      unsigned int row_index_dec = 0;
      while (row_index_lookahead[row_index_dec] > i)
        ++row_index_dec;
      row_index = row_at_window_start - row_index_dec;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = 0;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //backward elimination
    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
    {
      unsigned int row = row_at_window_start - row2;
      NumericT result_entry = vector[row] / diagonal_entries[row];

      if ( (row_index == row) && (col_index < row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

  // final step: Divide vector by diagonal entries:
  for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
    vector[i] /= diagonal_entries[i];

}

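/** @brief Block-parallel forward substitution for a set of independent unit lower triangular sub-systems, with the factor stored transposed in memory (e.g. the blocks of a block-ILU preconditioner).
 *
 * Each CUDA block processes the column range [block_offsets[2*blockIdx.x], block_offsets[2*blockIdx.x+1]).
 * Columns within a block are handled sequentially, while the threads of the block scatter the updates in parallel.
 */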
template<typename NumericT>
__global__ void csr_block_trans_unit_lu_forward(
          const unsigned int * row_jumper_L,      //L part (note that L is transposed in memory)
          const unsigned int * column_indices_L,
          const NumericT * elements_L,
          const unsigned int * block_offsets,
          NumericT * result,
          unsigned int size)
{
  unsigned int col_start = block_offsets[2*blockIdx.x];
  unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
  unsigned int row_start = row_jumper_L[col_start];
  unsigned int row_stop;
  NumericT result_entry = 0;

  if (col_start >= col_stop)
    return;

  //forward elimination, using L:
  for (unsigned int col = col_start; col < col_stop; ++col)
  {
    result_entry = result[col];
    row_stop = row_jumper_L[col + 1];
    for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
      result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index];
    row_start = row_stop; //for next iteration (avoid unnecessary loads from GPU RAM)
    __syncthreads();
  }

}

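/** @brief Block-parallel backward substitution for the corresponding transposed upper triangular factor with a separately stored diagonal.
 *
 * Each CUDA block processes its column range (given by block_offsets) from back to front, scaling by the
 * diagonal during elimination, and finally divides its part of the result vector by the diagonal entries.
 */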
template<typename NumericT>
__global__ void csr_block_trans_lu_backward(
          const unsigned int * row_jumper_U,      //U part (note that U is transposed in memory)
          const unsigned int * column_indices_U,
          const NumericT * elements_U,
          const NumericT * diagonal_U,
          const unsigned int * block_offsets,
          NumericT * result,
          unsigned int size)
{
  unsigned int col_start = block_offsets[2*blockIdx.x];
  unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
  unsigned int row_start;
  unsigned int row_stop;
  NumericT result_entry = 0;

  if (col_start >= col_stop)
    return;

  //backward elimination, using U and diagonal_U
  for (unsigned int iter = 0; iter < col_stop - col_start; ++iter)
  {
    unsigned int col = (col_stop - iter) - 1;
    result_entry = result[col] / diagonal_U[col];
    row_start = row_jumper_U[col];
    row_stop  = row_jumper_U[col + 1];
    for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
      result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];
    __syncthreads();
  }

  //divide result vector by diagonal:
  for (unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x)
    result[col] /= diagonal_U[col];
}

//
// Coordinate Matrix
//


//
// ELL Matrix
//


//
// Hybrid Matrix
//


} // namespace cuda
} //namespace linalg
} //namespace viennacl


#endif