1 #ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
2 #define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
// Kernel (per the signature index at the end of this excerpt):
//   csr_unit_lu_forward_kernel(row_indices, column_indices, elements, vector, size)
// In-place forward substitution x := L^{-1} x for a unit lower-triangular
// CSR matrix, intended for a single thread block.
// NOTE(review): this excerpt is incomplete — the embedded content-line
// numbers jump (41, 43, 44, 49, ...), so the __global__ signature, braces,
// the `if (i < nnz)` load guard, the single-thread guard around the serial
// sweep, and the __syncthreads() barriers between the shared-memory fill
// and its consumption are not visible here. Verify against the upstream
// ViennaCL file before relying on the control flow below.
41 template<
typename NumericT>
// Visible parameters: CSR row pointer and column index arrays.
43 const unsigned int * row_indices,
44 const unsigned int * column_indices,
// Shared-memory staging for one window of up to 128 nonzeros:
// column indices, matrix values, and the vector entries those columns hit.
49 __shared__
unsigned int col_index_buffer[128];
50 __shared__
NumericT element_buffer[128];
51 __shared__
NumericT vector_buffer[128];
// CSR row pointer has size+1 entries; the last one is the nonzero count.
53 unsigned int nnz = row_indices[
size];
54 unsigned int current_row = 0;
55 unsigned int row_at_window_start = 0;
// Running value of the solution entry for current_row (row 0 to start).
56 NumericT current_vector_entry = vector[0];
// Loop bound rounded up to a multiple of blockDim.x so all threads make the
// same number of iterations (presumably so barriers, not shown, line up).
57 unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
58 unsigned int next_row = row_indices[1];
// Each outer iteration stages one window of blockDim.x nonzeros.
60 for (
unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
// Coalesced loads into the shared buffers (the i < nnz guard is among the
// missing lines — TODO confirm).
65 element_buffer[threadIdx.x] = elements[i];
66 unsigned int tmp = column_indices[i];
67 col_index_buffer[threadIdx.x] = tmp;
68 vector_buffer[threadIdx.x] = vector[tmp];
// Serial sweep over the staged window; upstream guards this with
// threadIdx.x == 0 (guard not visible in this excerpt — TODO confirm).
77 for (
unsigned int k=0; k<blockDim.x; ++k)
// Window position i+k reaching next_row means current_row is finished:
// commit its value and load the next row's state (the increment of
// current_row is among the missing lines).
79 if (current_row < size && i+k == next_row)
81 vector[current_row] = current_vector_entry;
83 if (current_row < size)
85 next_row = row_indices[current_row+1];
86 current_vector_entry = vector[current_row];
// Strictly-lower entries are substituted into the running value.
90 if (current_row < size && col_index_buffer[k] < current_row)
// Columns before the window start were final when the window was staged,
// so the cached vector_buffer copy is valid; later columns may have been
// updated within this window and must be re-read from global memory.
92 if (col_index_buffer[k] < row_at_window_start)
93 current_vector_entry -= element_buffer[k] * vector_buffer[k];
94 else if (col_index_buffer[k] < current_row)
95 current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
// Remember where the next window begins, for the staleness test above.
100 row_at_window_start = current_row;
// Kernel (per the signature index at the end of this excerpt):
//   csr_lu_forward_kernel(row_indices, column_indices, elements, vector, size)
// In-place forward substitution x := L^{-1} x for a general (non-unit)
// lower-triangular CSR matrix: same windowed scheme as the unit-diagonal
// variant, plus a division by the captured diagonal entry per row.
// NOTE(review): excerpt is incomplete (content-line numbers jump); the
// __global__ signature, braces, the declaration of diagonal_entry, the
// i < nnz load guard and __syncthreads() barriers are not visible here.
109 template<
typename NumericT>
111 const unsigned int * row_indices,
112 const unsigned int * column_indices,
// Shared-memory staging for one window of up to 128 nonzeros.
117 __shared__
unsigned int col_index_buffer[128];
118 __shared__
NumericT element_buffer[128];
119 __shared__
NumericT vector_buffer[128];
// Last CSR row-pointer entry = total number of nonzeros.
121 unsigned int nnz = row_indices[
size];
122 unsigned int current_row = 0;
123 unsigned int row_at_window_start = 0;
124 NumericT current_vector_entry = vector[0];
// Rounded up so every thread runs the same iteration count.
126 unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
127 unsigned int next_row = row_indices[1];
129 for (
unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
// Coalesced staging of values, columns, and referenced vector entries.
134 element_buffer[threadIdx.x] = elements[i];
135 unsigned int tmp = column_indices[i];
136 col_index_buffer[threadIdx.x] = tmp;
137 vector_buffer[threadIdx.x] = vector[tmp];
// Single thread performs the inherently serial substitution sweep.
143 if (threadIdx.x == 0)
146 for (
unsigned int k=0; k<blockDim.x; ++k)
// Row boundary reached: finalize current_row, dividing by its diagonal
// (diagonal_entry is set by the == current_row branch below).
148 if (current_row < size && i+k == next_row)
150 vector[current_row] = current_vector_entry / diagonal_entry;
152 if (current_row < size)
154 next_row = row_indices[current_row+1];
155 current_vector_entry = vector[current_row];
// Strictly-lower entries: substitute into the running value.
159 if (current_row < size && col_index_buffer[k] < current_row)
// Pre-window columns are final in the staged copy; in-window columns
// must be re-read from global memory.
161 if (col_index_buffer[k] < row_at_window_start)
162 current_vector_entry -= element_buffer[k] * vector_buffer[k];
163 else if (col_index_buffer[k] < current_row)
164 current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
// Diagonal entry of the current row: capture for the division above.
166 else if (col_index_buffer[k] == current_row)
167 diagonal_entry = element_buffer[k];
171 row_at_window_start = current_row;
// Kernel (per the signature index at the end of this excerpt):
//   csr_unit_lu_backward_kernel(row_indices, column_indices, elements, vector, size)
// In-place backward substitution x := U^{-1} x for a unit upper-triangular
// CSR matrix. Mirrors the forward variant but walks nonzeros and rows from
// the end toward the beginning.
// NOTE(review): excerpt is incomplete (content-line numbers jump); the
// __global__ signature, braces, the outer loop that decrements i by
// blockDim.x per window, load guards, and __syncthreads() barriers are
// not visible here — verify against the upstream file.
179 template<
typename NumericT>
181 const unsigned int * row_indices,
182 const unsigned int * column_indices,
// Shared-memory staging for one window of up to 128 nonzeros.
187 __shared__
unsigned int col_index_buffer[128];
188 __shared__
NumericT element_buffer[128];
189 __shared__
NumericT vector_buffer[128];
191 unsigned int nnz = row_indices[
size];
// Backward sweep starts at the last row.
192 unsigned int current_row = size-1;
193 unsigned int row_at_window_start = size-1;
194 NumericT current_vector_entry = vector[size-1];
// Start of the last full window of blockDim.x nonzeros.
195 unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
// For the backward sweep, next_row holds the START of the current row,
// i.e. the boundary at which the row is complete when walking downward.
196 unsigned int next_row = row_indices[size-1];
// Window cursor; presumably decremented by blockDim.x in the (missing)
// outer loop — TODO confirm.
198 unsigned int i = loop_end + threadIdx.x;
// Coalesced staging (i < nnz guard among the missing lines).
204 element_buffer[threadIdx.x] = elements[i];
205 unsigned int tmp = column_indices[i];
206 col_index_buffer[threadIdx.x] = tmp;
207 vector_buffer[threadIdx.x] = vector[tmp];
// Single thread does the serial sweep, traversing the window in reverse.
213 if (threadIdx.x == 0)
216 for (
unsigned int k2=0; k2<blockDim.x; ++k2)
218 unsigned int k = (blockDim.x - k2) - 1;
// Strictly-upper entries: columns beyond the window start were final when
// staged (vector_buffer valid); nearer ones are re-read from global memory.
223 if (col_index_buffer[k] > row_at_window_start)
224 current_vector_entry -= element_buffer[k] * vector_buffer[k];
225 else if (col_index_buffer[k] > current_row)
226 current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
// Row finished (boundary test is among the missing lines): commit and
// step to the previous row.
230 vector[current_row] = current_vector_entry;
234 next_row = row_indices[current_row];
235 current_vector_entry = vector[current_row];
242 row_at_window_start = current_row;
// Kernel (per the signature index at the end of this excerpt):
//   csr_lu_backward_kernel(row_indices, column_indices, elements, vector, size)
// In-place backward substitution x := U^{-1} x for a general (non-unit)
// upper-triangular CSR matrix: reverse windowed sweep with a per-row
// division by the captured diagonal entry.
// NOTE(review): excerpt is incomplete (content-line numbers jump); the
// __global__ signature, braces, diagonal_entry declaration, outer window
// loop, load guards and __syncthreads() barriers are not visible here.
256 template<
typename NumericT>
258 const unsigned int * row_indices,
259 const unsigned int * column_indices,
// Shared-memory staging for one window of up to 128 nonzeros.
264 __shared__
unsigned int col_index_buffer[128];
265 __shared__
NumericT element_buffer[128];
266 __shared__
NumericT vector_buffer[128];
268 unsigned int nnz = row_indices[
size];
// Backward sweep: begin at the last row.
269 unsigned int current_row = size-1;
270 unsigned int row_at_window_start = size-1;
271 NumericT current_vector_entry = vector[size-1];
// Start of the last full window of blockDim.x nonzeros.
273 unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
// Start index of the current (last) row: completion boundary when
// traversing nonzeros in reverse.
274 unsigned int next_row = row_indices[size-1];
// Window cursor; presumably decremented per window in the missing outer
// loop — TODO confirm.
276 unsigned int i = loop_end + threadIdx.x;
// Coalesced staging (i < nnz guard among the missing lines).
282 element_buffer[threadIdx.x] = elements[i];
283 unsigned int tmp = column_indices[i];
284 col_index_buffer[threadIdx.x] = tmp;
285 vector_buffer[threadIdx.x] = vector[tmp];
// Serial reverse sweep over the staged window, by thread 0 only.
291 if (threadIdx.x == 0)
294 for (
unsigned int k2=0; k2<blockDim.x; ++k2)
296 unsigned int k = (blockDim.x - k2) - 1;
// Strictly-upper entries: staged copy is valid past the window start;
// otherwise re-read the (possibly updated) global value.
301 if (col_index_buffer[k] > row_at_window_start)
302 current_vector_entry -= element_buffer[k] * vector_buffer[k];
303 else if (col_index_buffer[k] > current_row)
304 current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
// Diagonal entry of the current row: captured for the division below.
305 else if (col_index_buffer[k] == current_row)
306 diagonal_entry = element_buffer[k];
// Row finished: write back the solved entry and move to the previous row.
310 vector[current_row] = current_vector_entry / diagonal_entry;
314 next_row = row_indices[current_row];
315 current_vector_entry = vector[current_row];
322 row_at_window_start = current_row;
// Kernel (per the signature index this appears to be, from its content
// line 341): csr_trans_lu_forward_kernel2-style transposed forward
// substitution — each CSR row, once solved, scatters updates into the
// entries of later rows. The nonzeros of one row are distributed across
// the thread block (stride blockDim.x), giving coalesced-ish access.
// NOTE(review): excerpt is incomplete — the __global__ signature, the
// outer per-row loop, the computation of result_entry, the col_index
// filter and the per-row __syncthreads() are not visible here; verify
// against the upstream file.
341 template<
typename NumericT>
343 const unsigned int * row_indices,
344 const unsigned int * column_indices,
// Nonzero range [row_start, row_stop) of the current row.
353 unsigned int row_start = row_indices[
row];
354 unsigned int row_stop = row_indices[
row + 1];
// Threads of the block share the row's nonzeros, strided by blockDim.x.
355 for (
unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
357 unsigned int col_index = column_indices[entry_index];
// Scatter the solved entry into the dependent vector components.
359 vector[col_index] -= result_entry * elements[entry_index];
// Kernel (per the signature index at the end of this excerpt):
//   csr_trans_unit_lu_forward_kernel(row_indices, column_indices, elements, vector, size)
// Transposed unit forward substitution: nonzeros are streamed in windows;
// each thread determines which row its nonzero belongs to via a shared
// row-pointer lookahead, then rows in the window are processed in order,
// scattering updates to later entries.
// NOTE(review): excerpt is incomplete — the __global__ signature,
// declarations of matrix_entry and result_entry, the row_index_inc
// increment in the while-loop, the else-branch structure around the
// size-1 fallback, result_entry = vector[row], and the __syncthreads()
// barriers are not visible here; verify against the upstream file.
366 template<
typename NumericT>
368 const unsigned int * row_indices,
369 const unsigned int * column_indices,
// Shared lookahead of row pointers and per-thread row assignment.
374 __shared__
unsigned int row_index_lookahead[256];
375 __shared__
unsigned int row_index_buffer[256];
// Per-thread: row owning this thread's nonzero, and its column.
377 unsigned int row_index;
378 unsigned int col_index;
380 unsigned int nnz = row_indices[
size];
381 unsigned int row_at_window_start = 0;
382 unsigned int row_at_window_end = 0;
// Rounded up so all threads iterate the same number of times.
383 unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
385 for (
unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
// Out-of-range threads load neutral values (0) instead of branching out.
387 col_index = (i < nnz) ? column_indices[i] : 0;
388 matrix_entry = (i < nnz) ? elements[i] : 0;
// Stage the next blockDim.x row pointers; nnz as sentinel past the end.
389 row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x <
size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;
// Find the row containing nonzero i by scanning the lookahead
// (row_index_inc increment is among the missing lines).
395 unsigned int row_index_inc = 0;
396 while (i >= row_index_lookahead[row_index_inc + 1])
398 row_index = row_at_window_start + row_index_inc;
399 row_index_buffer[threadIdx.x] = row_index;
// Fallback when the row lies beyond the staged lookahead window.
404 row_index_buffer[threadIdx.x] = size - 1;
// Row range covered by this window (first and last thread's rows).
409 row_at_window_start = row_index_buffer[0];
410 row_at_window_end = row_index_buffer[blockDim.x - 1];
// Process the window's rows in order; every thread holding a nonzero of
// the current row scatters its update (strictly-upper columns only).
413 for (
unsigned int row = row_at_window_start;
row <= row_at_window_end; ++
row)
417 if ( (row_index ==
row) && (col_index >
row) )
418 vector[col_index] -= result_entry * matrix_entry;
423 row_at_window_start = row_at_window_end;
// Kernel (per the signature index at the end of this excerpt):
//   csr_trans_lu_forward_kernel(row_indices, column_indices, elements,
//                               diagonal_entries, vector, size)
// Transposed forward substitution for a non-unit matrix: identical
// windowed row-detection scheme as the unit variant, followed by a final
// element-wise division of the whole vector by the diagonal entries.
// NOTE(review): excerpt is incomplete — the __global__ signature,
// matrix_entry/result_entry declarations, the row_index_inc increment,
// and the __syncthreads() barriers are not visible here; verify against
// the upstream file.
428 template<
typename NumericT>
430 const unsigned int * row_indices,
431 const unsigned int * column_indices,
// Shared lookahead of row pointers and per-thread row assignment.
437 __shared__
unsigned int row_index_lookahead[256];
438 __shared__
unsigned int row_index_buffer[256];
440 unsigned int row_index;
441 unsigned int col_index;
443 unsigned int nnz = row_indices[
size];
444 unsigned int row_at_window_start = 0;
445 unsigned int row_at_window_end = 0;
446 unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
448 for (
unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
// Neutral loads for threads past the last nonzero.
450 col_index = (i < nnz) ? column_indices[i] : 0;
451 matrix_entry = (i < nnz) ? elements[i] : 0;
// Stage the next row pointers; nnz acts as an end sentinel.
452 row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x <
size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;
// Locate the row owning nonzero i (increment among missing lines).
458 unsigned int row_index_inc = 0;
459 while (i >= row_index_lookahead[row_index_inc + 1])
461 row_index = row_at_window_start + row_index_inc;
462 row_index_buffer[threadIdx.x] = row_index;
// Fallback when the row lies beyond the staged lookahead.
467 row_index_buffer[threadIdx.x] = size - 1;
472 row_at_window_start = row_index_buffer[0];
473 row_at_window_end = row_index_buffer[blockDim.x - 1];
// Process rows in order; scatter strictly-upper-column updates.
476 for (
unsigned int row = row_at_window_start;
row <= row_at_window_end; ++
row)
480 if ( (row_index ==
row) && (col_index >
row) )
481 vector[col_index] -= result_entry * matrix_entry;
486 row_at_window_start = row_at_window_end;
// Final step for the non-unit case: divide by the diagonal, spread
// across the thread block.
490 for (
unsigned int i = threadIdx.x; i < size; i += blockDim.x)
491 vector[i] /= diagonal_entries[i];
// Kernel (per the signature index at the end of this excerpt):
//   csr_trans_unit_lu_backward_kernel(row_indices, column_indices, elements, vector, size)
// Transposed unit backward substitution: the nonzero stream is walked in
// reverse (i = nnz - i2 - 1); a reversed row-pointer lookahead assigns
// each nonzero to its row, then rows are processed from high to low,
// scattering updates to strictly-lower columns.
// NOTE(review): excerpt is incomplete — the __global__ signature,
// matrix_entry/result_entry declarations, the row_index_dec increment in
// the while-loop, and the __syncthreads() barriers are not visible here;
// verify against the upstream file.
496 template<
typename NumericT>
498 const unsigned int * row_indices,
499 const unsigned int * column_indices,
// Shared reversed lookahead of row pointers and per-thread row labels.
504 __shared__
unsigned int row_index_lookahead[256];
505 __shared__
unsigned int row_index_buffer[256];
507 unsigned int row_index;
508 unsigned int col_index;
510 unsigned int nnz = row_indices[
size];
// Window starts at the last row and moves toward row 0.
511 unsigned int row_at_window_start =
size;
512 unsigned int row_at_window_end;
513 unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
515 for (
unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
// Reverse index into the nonzero arrays; neutral loads past the end.
517 unsigned int i = (nnz - i2) - 1;
518 col_index = (i2 < nnz) ? column_indices[i] : 0;
519 matrix_entry = (i2 < nnz) ? elements[i] : 0;
// Stage row pointers downward from the window start; 0 as sentinel.
520 unsigned int row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;
526 unsigned int row_index_dec = 0;
527 while (row_index_lookahead[row_index_dec] > i)
529 row_index = row_at_window_start - row_index_dec;
530 row_index_buffer[threadIdx.x] = row_index;
// Fallback when the row lies below the staged lookahead window.
535 row_index_buffer[threadIdx.x] = 0;
// Row range covered by this window (walking downward).
540 row_at_window_start = row_index_buffer[0];
541 row_at_window_end = row_index_buffer[blockDim.x - 1];
// Process rows from row_at_window_start down to row_at_window_end,
// scattering each solved row into strictly-lower columns.
544 for (
unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
546 unsigned int row = row_at_window_start - row2;
549 if ( (row_index == row) && (col_index <
row) )
550 vector[col_index] -= result_entry * matrix_entry;
555 row_at_window_start = row_at_window_end;
// Kernel (per the signature index at the end of this excerpt):
//   csr_trans_lu_backward_kernel2(row_indices, column_indices, elements,
//                                 diagonal_entries, vector, size)
// Transposed backward substitution, simple per-row variant: rows are
// visited from last to first; each row's entry is divided by its diagonal
// and then scattered into the dependent (lower-index) components.
// NOTE(review): excerpt is incomplete — the __global__ signature,
// result_entry declaration, the col_index < row filter, and the per-row
// __syncthreads() are not visible here.
562 template<
typename NumericT>
564 const unsigned int * row_indices,
565 const unsigned int * column_indices,
// Reverse row sweep: row runs size-1 down to 0.
574 for (
unsigned int row2 = 0; row2 <
size; ++row2)
576 unsigned int row = (size - row2) - 1;
// Solve this row's entry (non-unit diagonal).
577 result_entry = vector[
row] / diagonal_entries[
row];
// Nonzero range of the row.
579 unsigned int row_start = row_indices[
row];
580 unsigned int row_stop = row_indices[row + 1];
// NOTE(review): this inner loop starts at row_start + threadIdx.x but
// advances with ++entry_index, unlike the analogous kernel earlier in
// this file (content lines 353-359) which strides by blockDim.x. As
// written, different threads would process overlapping entries and race
// on vector[col_index]; upstream almost certainly uses
// `entry_index += blockDim.x` — verify before use.
581 for (
unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; ++entry_index)
583 unsigned int col_index = column_indices[entry_index];
585 vector[col_index] -= result_entry * elements[entry_index];
// One thread commits the solved entry.
590 if (threadIdx.x == 0)
591 vector[
row] = result_entry;
// Kernel (per the signature index at the end of this excerpt):
//   csr_trans_lu_backward_kernel(row_indices, column_indices, elements,
//                                diagonal_entries, vector, size)
// Transposed backward substitution for a non-unit matrix: reverse
// windowed row-detection scheme (as in the unit backward variant),
// followed by an element-wise division by the diagonal entries.
// NOTE(review): excerpt is incomplete — the __global__ signature,
// matrix_entry/result_entry declarations, the row_index_dec increment,
// and the __syncthreads() barriers are not visible here; verify against
// the upstream file.
596 template<
typename NumericT>
598 const unsigned int * row_indices,
599 const unsigned int * column_indices,
// Shared reversed lookahead of row pointers and per-thread row labels.
605 __shared__
unsigned int row_index_lookahead[256];
606 __shared__
unsigned int row_index_buffer[256];
608 unsigned int row_index;
609 unsigned int col_index;
611 unsigned int nnz = row_indices[
size];
// Window starts at the last row and moves toward row 0.
612 unsigned int row_at_window_start =
size;
613 unsigned int row_at_window_end;
614 unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
616 for (
unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
// Reverse index into the nonzero arrays; neutral loads past the end.
618 unsigned int i = (nnz - i2) - 1;
619 col_index = (i2 < nnz) ? column_indices[i] : 0;
620 matrix_entry = (i2 < nnz) ? elements[i] : 0;
// Stage row pointers downward from the window start; 0 as sentinel.
621 row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;
627 unsigned int row_index_dec = 0;
628 while (row_index_lookahead[row_index_dec] > i)
630 row_index = row_at_window_start - row_index_dec;
631 row_index_buffer[threadIdx.x] = row_index;
// Fallback when the row lies below the staged lookahead window.
636 row_index_buffer[threadIdx.x] = 0;
641 row_at_window_start = row_index_buffer[0];
642 row_at_window_end = row_index_buffer[blockDim.x - 1];
// Process rows from high to low, scattering strictly-lower updates.
645 for (
unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
647 unsigned int row = row_at_window_start - row2;
650 if ( (row_index == row) && (col_index <
row) )
651 vector[col_index] -= result_entry * matrix_entry;
656 row_at_window_start = row_at_window_end;
// Final non-unit step: divide the whole vector by the diagonal.
661 for (
unsigned int i = threadIdx.x; i < size; i += blockDim.x)
662 vector[i] /= diagonal_entries[i];
// Kernel (per the signature index at the end of this excerpt):
//   csr_block_trans_unit_lu_forward(row_jumper_L, column_indices_L,
//                                   elements_L, block_offsets, result, size)
// Block-parallel transposed unit forward substitution: block_offsets
// partitions the columns into independent diagonal blocks; each CUDA
// block solves its own column range [col_start, col_stop) of L.
// NOTE(review): excerpt is incomplete — the __global__ signature,
// result_entry declaration, the early `return` body for empty ranges,
// braces, and the per-column __syncthreads() are not visible here.
667 template<
typename NumericT>
669 const unsigned int * row_jumper_L,
670 const unsigned int * column_indices_L,
// block_offsets[2b], block_offsets[2b+1]: column range owned by block b.
672 const unsigned int * block_offsets,
676 unsigned int col_start = block_offsets[2*blockIdx.x];
677 unsigned int col_stop = block_offsets[2*blockIdx.x+1];
678 unsigned int row_start = row_jumper_L[col_start];
679 unsigned int row_stop;
// Nothing to do for an empty column range (body/return not visible).
682 if (col_start >= col_stop)
// Forward sweep over the block's columns: each solved entry is scattered
// into its dependents, the column's nonzeros split across the block's
// threads with stride blockDim.x.
686 for (
unsigned int col = col_start; col < col_stop; ++col)
688 result_entry = result[col];
689 row_stop = row_jumper_L[col + 1];
690 for (
unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
691 result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index];
// Next column's nonzeros begin where this column's ended.
692 row_start = row_stop;
// Kernel (per the signature index at the end of this excerpt):
//   csr_block_trans_lu_backward(row_jumper_U, column_indices_U, elements_U,
//                               diagonal_U, block_offsets, result, size)
// Block-parallel transposed backward substitution with U and an explicit
// diagonal: each CUDA block handles its column range [col_start, col_stop)
// from last column to first, then normalizes its range by diagonal_U.
// NOTE(review): excerpt is incomplete — the __global__ signature,
// result_entry declaration, the early-exit body for empty ranges, braces,
// and the per-column __syncthreads() are not visible here.
699 template<
typename NumericT>
701 const unsigned int * row_jumper_U,
702 const unsigned int * column_indices_U,
// block_offsets[2b], block_offsets[2b+1]: column range owned by block b.
705 const unsigned int * block_offsets,
709 unsigned int col_start = block_offsets[2*blockIdx.x];
710 unsigned int col_stop = block_offsets[2*blockIdx.x+1];
711 unsigned int row_start;
712 unsigned int row_stop;
// Nothing to do for an empty column range (body/return not visible).
715 if (col_start >= col_stop)
// Backward sweep: iterate columns from col_stop-1 down to col_start.
719 for (
unsigned int iter = 0; iter < col_stop - col_start; ++iter)
721 unsigned int col = (col_stop - iter) - 1;
// Solve this column's entry against its diagonal, then scatter into the
// dependents, the column's nonzeros split across threads.
722 result_entry = result[col] / diagonal_U[col];
723 row_start = row_jumper_U[col];
724 row_stop = row_jumper_U[col + 1];
725 for (
unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
726 result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];
// Final normalization of the block's column range by the diagonal.
731 for (
unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x)
732 result[col] /= diagonal_U[col];
__global__ void csr_unit_lu_backward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_trans_lu_forward_kernel2(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_trans_lu_backward_kernel2(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *diagonal_entries, NumericT *vector, unsigned int size)
This file provides the forward declarations for the main types used within ViennaCL.
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
__global__ void csr_trans_unit_lu_forward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_block_trans_lu_backward(const unsigned int *row_jumper_U, const unsigned int *column_indices_U, const NumericT *elements_U, const NumericT *diagonal_U, const unsigned int *block_offsets, NumericT *result, unsigned int size)
__global__ void csr_lu_forward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_unit_lu_forward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
__global__ void csr_trans_unit_lu_backward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_trans_lu_backward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *diagonal_entries, NumericT *vector, unsigned int size)
__global__ void csr_lu_backward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *vector, unsigned int size)
__global__ void csr_trans_lu_forward_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *diagonal_entries, NumericT *vector, unsigned int size)
__global__ void csr_block_trans_unit_lu_forward(const unsigned int *row_jumper_L, const unsigned int *column_indices_L, const NumericT *elements_L, const unsigned int *block_offsets, NumericT *result, unsigned int size)