Tpetra parallel linear algebra  Version of the Day
Tpetra_Experimental_BlockView.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_EXPERIMENTAL_BLOCKVIEW_HPP
43 #define TPETRA_EXPERIMENTAL_BLOCKVIEW_HPP
44 
53 
54 #include "Tpetra_ConfigDefs.hpp"
55 #include "Kokkos_ArithTraits.hpp"
56 #include "Kokkos_Complex.hpp"
57 
58 namespace Tpetra {
59 
71 namespace Experimental {
72 
73 namespace Impl {
74 
81 template<class ViewType1,
82  class ViewType2,
83  const int rank1 = ViewType1::rank>
84 struct AbsMax {
85  static void run (const ViewType2& Y, const ViewType1& X);
86 };
87 
92 template<class ViewType1,
93  class ViewType2>
94 struct AbsMax<ViewType1, ViewType2, 2> {
97  static void run (const ViewType2& Y, const ViewType1& X)
98  {
99  static_assert (ViewType1::rank == ViewType2::rank,
100  "AbsMax: ViewType1 and ViewType2 must have the same rank.");
101  typedef typename std::remove_reference<decltype (Y(0,0)) >::type STY;
102  static_assert(! std::is_const<STY>::value,
103  "AbsMax: The type of each entry of Y must be nonconst.");
104  typedef typename std::decay<decltype (X(0,0)) >::type STX;
105  static_assert( std::is_same<STX, STY>::value,
106  "AbsMax: The type of each entry of X and Y must be the same.");
107  typedef Kokkos::Details::ArithTraits<STY> KAT;
108 
109  for (int j = 0; j < Y.dimension_1 (); ++j) {
110  for (int i = 0; i < Y.dimension_0 (); ++i) {
111  STY& Y_ij = Y(i,j); // use ref here to avoid 2nd op() call on Y
112  const STX X_ij = X(i,j);
113  // NOTE: no std::max (not a CUDA __device__ function); must
114  // cast back up to complex.
115  const auto Y_ij_abs = KAT::abs (Y_ij);
116  const auto X_ij_abs = KAT::abs (X_ij);
117  Y_ij = (Y_ij_abs >= X_ij_abs) ?
118  static_cast<STY> (Y_ij_abs) :
119  static_cast<STY> (X_ij_abs);
120  }
121  }
122  }
123 };
124 
129 template<class ViewType1,
130  class ViewType2>
131 struct AbsMax<ViewType1, ViewType2, 1> {
134  static void run (const ViewType2& Y, const ViewType1& X)
135  {
136  static_assert (ViewType1::rank == ViewType2::rank,
137  "AbsMax: ViewType1 and ViewType2 must have the same rank.");
138 
139  typedef typename std::remove_reference<decltype (Y(0)) >::type STY;
140  static_assert(! std::is_const<STY>::value,
141  "AbsMax: The type of each entry of Y must be nonconst.");
142  typedef typename std::remove_const<typename std::remove_reference<decltype (X(0)) >::type>::type STX;
143  static_assert( std::is_same<STX, STY>::value,
144  "AbsMax: The type of each entry of X and Y must be the same.");
145  typedef Kokkos::Details::ArithTraits<STY> KAT;
146 
147  for (int i = 0; i < Y.dimension_0 (); ++i) {
148  STY& Y_i = Y(i); // use ref here to avoid 2nd op() call on Y
149  const STX X_i = X(i);
150  // NOTE: no std::max (not a CUDA __device__ function); must
151  // cast back up to complex.
152  const auto Y_i_abs = KAT::abs (Y_i);
153  const auto X_i_abs = KAT::abs (X_i);
154  Y_i = (Y_i_abs >= X_i_abs) ?
155  static_cast<STY> (Y_i_abs) :
156  static_cast<STY> (X_i_abs);
157  }
158  }
159 };
160 
167 template<class ViewType1, class ViewType2, const int rank = ViewType1::rank>
168 void absMax (const ViewType2& Y, const ViewType1& X) {
169  static_assert (ViewType1::rank == ViewType2::rank,
170  "absMax: ViewType1 and ViewType2 must have the same rank.");
172 }
173 
178 template<class ViewType,
179  class CoefficientType,
180  class LayoutType = typename ViewType::array_layout,
181  class IndexType = int,
182  const int rank = ViewType::rank>
183 struct SCAL {
184  static void run (const CoefficientType& alpha, const ViewType& x);
185 };
186 
189 template<class ViewType,
190  class CoefficientType,
191  class LayoutType,
192  class IndexType>
193 struct SCAL<ViewType, CoefficientType, LayoutType, IndexType, 1> {
195  static void run (const CoefficientType& alpha, const ViewType& x)
196  {
197  const IndexType numRows = static_cast<IndexType> (x.dimension_0 ());
198  // BLAS _SCAL doesn't check whether alpha is 0.
199  for (IndexType i = 0; i < numRows; ++i) {
200  x(i) = alpha * x(i);
201  }
202  }
203 };
204 
207 template<class ViewType,
208  class CoefficientType,
209  class LayoutType,
210  class IndexType>
211 struct SCAL<ViewType, CoefficientType, LayoutType, IndexType, 2> {
213  static void run (const CoefficientType& alpha, const ViewType& A)
214  {
215  const IndexType numRows = static_cast<IndexType> (A.dimension_0 ());
216  const IndexType numCols = static_cast<IndexType> (A.dimension_1 ());
217 
218  // BLAS _SCAL doesn't check whether alpha is 0.
219  for (IndexType i = 0; i < numRows; ++i) {
220  for (IndexType j = 0; j < numCols; ++j) {
221  A(i,j) = alpha * A(i,j);
222  }
223  }
224  }
225 };
226 
232 template<class ViewType,
233  class CoefficientType,
234  class IndexType>
235 struct SCAL<ViewType, CoefficientType, Kokkos::LayoutRight, IndexType, 2> {
237  static void run (const CoefficientType& alpha, const ViewType& A)
238  {
239  const IndexType N = A.size ();
240  typedef typename std::decay<decltype (A(0,0)) >::type scalar_type;
241  scalar_type* const A_raw = A.ptr_on_device ();
242 
243  for (IndexType i = 0; i < N; ++i) {
244  A_raw[i] = alpha * A_raw[i];
245  }
246  }
247 };
248 
253 template<class CoefficientType,
254  class ViewType1,
255  class ViewType2,
256  class LayoutType1 = typename ViewType1::array_layout,
257  class LayoutType2 = typename ViewType2::array_layout,
258  class IndexType = int,
259  const int rank = ViewType1::rank>
260 struct AXPY {
261  static void
262  run (const CoefficientType& alpha,
263  const ViewType1& x,
264  const ViewType2& y);
265 };
266 
269 template<class CoefficientType,
270  class ViewType1,
271  class ViewType2,
272  class LayoutType1,
273  class LayoutType2,
274  class IndexType>
275 struct AXPY<CoefficientType, ViewType1, ViewType2, LayoutType1, LayoutType2, IndexType, 1> {
277  static void
278  run (const CoefficientType& alpha,
279  const ViewType1& x,
280  const ViewType2& y)
281  {
282  static_assert (ViewType1::rank == ViewType2::rank,
283  "AXPY: x and y must have the same rank.");
284  const IndexType numRows = static_cast<IndexType> (y.dimension_0 ());
285  if (alpha != 0.0) {
286  for (IndexType i = 0; i < numRows; ++i) {
287  y(i) += alpha * x(i);
288  }
289  }
290  }
291 };
292 
295 template<class CoefficientType,
296  class ViewType1,
297  class ViewType2,
298  class LayoutType1,
299  class LayoutType2,
300  class IndexType>
301 struct AXPY<CoefficientType, ViewType1, ViewType2, LayoutType1, LayoutType2, IndexType, 2> {
303  static void
304  run (const CoefficientType& alpha,
305  const ViewType1& X,
306  const ViewType2& Y)
307  {
308  static_assert (ViewType1::rank == ViewType2::rank,
309  "AXPY: X and Y must have the same rank.");
310  const IndexType numRows = static_cast<IndexType> (Y.dimension_0 ());
311  const IndexType numCols = static_cast<IndexType> (Y.dimension_1 ());
312 
313  throw std::runtime_error ("OH HAI");
314 
315  if (alpha != 0.0) {
316  for (IndexType i = 0; i < numRows; ++i) {
317  for (IndexType j = 0; j < numCols; ++j) {
318  Y(i,j) += alpha * X(i,j);
319  }
320  }
321  }
322  }
323 };
324 
328 template<class CoefficientType,
329  class ViewType1,
330  class ViewType2,
331  class IndexType>
332 struct AXPY<CoefficientType, ViewType1, ViewType2, Kokkos::LayoutRight, Kokkos::LayoutRight, IndexType, 2> {
334  static void
335  run (const CoefficientType& alpha,
336  const ViewType1& X,
337  const ViewType2& Y)
338  {
339  static_assert (ViewType1::rank == ViewType2::rank,
340  "AXPY: X and Y must have the same rank.");
341  typedef typename std::decay<decltype (X(0,0)) >::type SX;
342  typedef typename std::decay<decltype (Y(0,0)) >::type SY;
343 
344  const IndexType N = static_cast<IndexType> (Y.size ());
345  const SX* const X_raw = X.ptr_on_device ();
346  SY* const Y_raw = Y.ptr_on_device ();
347 
348  if (alpha != 0.0) {
349  for (IndexType i = 0; i < N; ++i) {
350  Y_raw[i] += alpha * X_raw[i];
351  }
352  }
353  }
354 };
355 
360 template<class ViewType1,
361  class ViewType2,
362  class LayoutType1 = typename ViewType1::array_layout,
363  class LayoutType2 = typename ViewType2::array_layout,
364  class IndexType = int,
365  const int rank = ViewType1::rank>
366 struct COPY {
367  static void run (const ViewType1& x, const ViewType2& y);
368 };
369 
372 template<class ViewType1,
373  class ViewType2,
374  class LayoutType1,
375  class LayoutType2,
376  class IndexType>
377 struct COPY<ViewType1, ViewType2, LayoutType1, LayoutType2, IndexType, 1> {
379  static void run (const ViewType1& x, const ViewType2& y)
380  {
381  const IndexType numRows = static_cast<IndexType> (x.dimension_0 ());
382  for (IndexType i = 0; i < numRows; ++i) {
383  y(i) = x(i);
384  }
385  }
386 };
387 
390 template<class ViewType1,
391  class ViewType2,
392  class LayoutType1,
393  class LayoutType2,
394  class IndexType>
395 struct COPY<ViewType1, ViewType2, LayoutType1, LayoutType2, IndexType, 2> {
397  static void run (const ViewType1& X, const ViewType2& Y)
398  {
399  const IndexType numRows = static_cast<IndexType> (Y.dimension_0 ());
400  const IndexType numCols = static_cast<IndexType> (Y.dimension_1 ());
401 
402  throw std::runtime_error ("OH HAI");
403 
404  // BLAS _SCAL doesn't check whether alpha is 0.
405  for (IndexType i = 0; i < numRows; ++i) {
406  for (IndexType j = 0; j < numCols; ++j) {
407  Y(i,j) = X(i,j);
408  }
409  }
410  }
411 };
412 
413 
417 template<class ViewType1,
418  class ViewType2,
419  class IndexType>
420 struct COPY<ViewType1, ViewType2, Kokkos::LayoutRight, Kokkos::LayoutRight, IndexType, 2> {
422  static void run (const ViewType1& X, const ViewType2& Y)
423  {
424  typedef typename std::decay<decltype (X(0,0)) >::type SX;
425  typedef typename std::decay<decltype (Y(0,0)) >::type SY;
426 
427  const IndexType N = static_cast<IndexType> (Y.size ());
428  const SX* const X_raw = X.ptr_on_device ();
429  SY* const Y_raw = Y.ptr_on_device ();
430 
431  // BLAS _SCAL doesn't check whether alpha is 0.
432  for (IndexType i = 0; i < N; ++i) {
433  Y_raw[i] = X_raw[i];
434  }
435  }
436 };
437 
438 } // namespace Impl
439 
442 template<class ViewType,
443  class CoefficientType,
444  class LayoutType = typename ViewType::array_layout,
445  class IndexType = int,
446  const int rank = ViewType::rank>
447 void SCAL (const CoefficientType& alpha, const ViewType& x) {
449 }
450 
456 template<class CoefficientType,
457  class ViewType1,
458  class ViewType2,
459  class LayoutType1 = typename ViewType1::array_layout,
460  class LayoutType2 = typename ViewType2::array_layout,
461  class IndexType = int,
462  const int rank = ViewType1::rank>
463 void
464 AXPY (const CoefficientType& alpha,
465  const ViewType1& x,
466  const ViewType2& y)
467 {
468  static_assert (ViewType1::rank == ViewType2::rank,
469  "AXPY: x and y must have the same rank.");
471 }
472 
481 template<class ViewType1,
482  class ViewType2,
483  class LayoutType1 = typename ViewType1::array_layout,
484  class LayoutType2 = typename ViewType2::array_layout,
485  class IndexType = int,
486  const int rank = ViewType1::rank>
487 void COPY (const ViewType1& x, const ViewType2& y) {
488  static_assert (ViewType1::rank == ViewType2::rank,
489  "COPY: x and y must have the same rank.");
491 }
492 
505 template<class LittleVectorType1,
506  class LittleBlockType,
507  class LittleVectorType2,
508  class CoefficientType,
509  class IndexType = int>
510 void
511 GEMV (const CoefficientType& alpha,
512  const LittleBlockType& A,
513  const LittleVectorType1& x,
514  const LittleVectorType2& y)
515 {
516  const IndexType numRows = static_cast<IndexType> (A.dimension_0 ());
517  const IndexType numCols = static_cast<IndexType> (A.dimension_1 ());
518 
519  if (alpha != 0.0) {
520  for (IndexType i = 0; i < numRows; ++i) {
521  typename std::remove_reference<decltype (y(i)) >::type y_i = y(i);
522  for (IndexType j = 0; j < numCols; ++j) {
523  y_i += alpha * A(i,j) * x(j);
524  }
525  y(i) = y_i;
526  }
527  }
528 }
529 
530 
538 template<class ViewType1,
539  class ViewType2,
540  class ViewType3,
541  class CoefficientType,
542  class IndexType = int>
543 void
544 GEMM (const char transA[],
545  const char transB[],
546  const CoefficientType& alpha,
547  const ViewType1& A,
548  const ViewType2& B,
549  const CoefficientType& beta,
550  const ViewType3& C)
551 {
552  // Assert that A, B, and C are in fact matrices
553  static_assert (ViewType1::rank == 2, "GEMM: A must have rank 2 (be a matrix).");
554  static_assert (ViewType2::rank == 2, "GEMM: B must have rank 2 (be a matrix).");
555  static_assert (ViewType3::rank == 2, "GEMM: C must have rank 2 (be a matrix).");
556 
557  typedef typename std::remove_reference<decltype (A(0,0))>::type Scalar;
558  typedef Kokkos::Details::ArithTraits<Scalar> STS;
559  const Scalar ZERO = STS::zero();
560  const Scalar ONE = STS::one();
561 
562  // Get the dimensions
563  IndexType m, n, k;
564  if(transA[0] == 'N' || transA[0] == 'n') {
565  m = static_cast<IndexType> (A.dimension_0 ());
566  n = static_cast<IndexType> (A.dimension_1 ());
567  }
568  else {
569  m = static_cast<IndexType> (A.dimension_1 ());
570  n = static_cast<IndexType> (A.dimension_0 ());
571  }
572  k = static_cast<IndexType> (C.dimension_1 ());
573 
574  // quick return if possible
575  if(alpha == ZERO && beta == ONE)
576  return;
577 
578  // And if alpha equals zero...
579  if(alpha == ZERO) {
580  if(beta == ZERO) {
581  for(IndexType i=0; i<m; i++) {
582  for(IndexType j=0; j<k; j++) {
583  C(i,j) = ZERO;
584  }
585  }
586  }
587  else {
588  for(IndexType i=0; i<m; i++) {
589  for(IndexType j=0; j<k; j++) {
590  C(i,j) = beta*C(i,j);
591  }
592  }
593  }
594  }
595 
596  // Start the operations
597  if(transB[0] == 'n' || transB[0] == 'N') {
598  if(transA[0] == 'n' || transA[0] == 'N') {
599  // Form C = alpha*A*B + beta*C
600  for(IndexType j=0; j<n; j++) {
601  if(beta == ZERO) {
602  for(IndexType i=0; i<m; i++) {
603  C(i,j) = ZERO;
604  }
605  }
606  else if(beta != ONE) {
607  for(IndexType i=0; i<m; i++) {
608  C(i,j) = beta*C(i,j);
609  }
610  }
611  for(IndexType l=0; l<k; l++) {
612  Scalar temp = alpha*B(l,j);
613  for(IndexType i=0; i<m; i++) {
614  C(i,j) = C(i,j) + temp*A(i,l);
615  }
616  }
617  }
618  }
619  else {
620  // Form C = alpha*A**T*B + beta*C
621  for(IndexType j=0; j<n; j++) {
622  for(IndexType i=0; i<m; i++) {
623  Scalar temp = ZERO;
624  for(IndexType l=0; l<k; l++) {
625  temp = temp + A(l,i)*B(l,j);
626  }
627  if(beta == ZERO) {
628  C(i,j) = alpha*temp;
629  }
630  else {
631  C(i,j) = alpha*temp + beta*C(i,j);
632  }
633  }
634  }
635  }
636  }
637  else {
638  if(transA[0] == 'n' || transA[0] == 'N') {
639  // Form C = alpha*A*B**T + beta*C
640  for(IndexType j=0; j<n; j++) {
641  if(beta == ZERO) {
642  for(IndexType i=0; i<m; i++) {
643  C(i,j) = ZERO;
644  }
645  }
646  else if(beta != ONE) {
647  for(IndexType i=0; i<m; i++) {
648  C(i,j) = beta*C(i,j);
649  }
650  }
651  for(IndexType l=0; l<k; l++) {
652  Scalar temp = alpha*B(j,l);
653  for(IndexType i=0; i<m; i++) {
654  C(i,j) = C(i,j) + temp*A(i,l);
655  }
656  }
657  }
658  }
659  else {
660  // Form C = alpha*A**T*B**T + beta*C
661  for(IndexType j=0; j<n; j++) {
662  for(IndexType i=0; i<m; i++) {
663  Scalar temp = ZERO;
664  for(IndexType l=0; l<k; l++) {
665  temp = temp + A(l,i)*B(j,l);
666  }
667  if(beta == ZERO) {
668  C(i,j) = alpha*temp;
669  }
670  else {
671  C(i,j) = alpha*temp + beta*C(i,j);
672  }
673  }
674  }
675  }
676  }
677 }
678 
680 template<class LittleBlockType,
681  class LittleVectorType>
682 void
683 GETF2 (const LittleBlockType& A, const LittleVectorType& ipiv, int& info)
684 {
685  // The type of an entry of ipiv is the index type.
686  typedef typename std::decay<decltype (ipiv(0)) >::type IndexType;
687  static_assert (std::is_integral<IndexType>::value,
688  "GETF2: The type of each entry of ipiv must be an integer type.");
689  typedef typename std::remove_reference<decltype (A(0,0))>::type Scalar;
690  static_assert (! std::is_const<Scalar>::value,
691  "GETF2: A must not be a const View (or LittleBlock).");
692  static_assert (! std::is_const<std::remove_reference<decltype (ipiv(0))>>::value,
693  "GETF2: ipiv must not be a const View (or LittleBlock).");
694  static_assert (LittleBlockType::rank == 2, "GETF2: A must have rank 2 (be a matrix).");
695  typedef Kokkos::Details::ArithTraits<Scalar> STS;
696  const Scalar ZERO = STS::zero();
697 
698  const IndexType numRows = static_cast<IndexType> (A.dimension_0 ());
699  const IndexType numCols = static_cast<IndexType> (A.dimension_1 ());
700  const IndexType pivDim = static_cast<IndexType> (ipiv.dimension_0 ());
701 
702  // std::min is not a CUDA device function
703  const IndexType minPivDim = (numRows < numCols) ? numRows : numCols;
704  if (pivDim < minPivDim) {
705  info = -2;
706  return;
707  }
708 
709  // Initialize info
710  info = 0;
711 
712  for(IndexType j=0; j < pivDim; j++)
713  {
714  // Find pivot and test for singularity
715  IndexType jp = j;
716  for(IndexType i=j+1; i<numRows; i++)
717  {
718  if(STS::abs(A(i,j)) > STS::abs(A(jp,j))) {
719  jp = i;
720  }
721  }
722  ipiv(j) = jp+1;
723 
724  if(A(jp,j) != ZERO)
725  {
726  // Apply the interchange to columns 1:N
727  if(jp != j)
728  {
729  for(IndexType i=0; i < numCols; i++)
730  {
731  Scalar temp = A(jp,i);
732  A(jp,i) = A(j,i);
733  A(j,i) = temp;
734  }
735  }
736 
737  // Compute elements J+1:M of J-th column
738  for(IndexType i=j+1; i<numRows; i++) {
739  A(i,j) = A(i,j) / A(j,j);
740  }
741  }
742  else if(info == 0) {
743  info = j;
744  }
745 
746  // Update trailing submatrix
747  for(IndexType r=j+1; r < numRows; r++)
748  {
749  for(IndexType c=j+1; c < numCols; c++) {
750  A(r,c) = A(r,c) - A(r,j) * A(j,c);
751  }
752  }
753  }
754 }
755 
756 namespace Impl {
757 
761 template<class LittleBlockType,
762  class LittleIntVectorType,
763  class LittleScalarVectorType,
764  const int rank = LittleScalarVectorType::rank>
765 struct GETRS {
766  static void
767  run (const char mode[],
768  const LittleBlockType& A,
769  const LittleIntVectorType& ipiv,
770  const LittleScalarVectorType& B,
771  int& info);
772 };
773 
775 template<class LittleBlockType,
776  class LittleIntVectorType,
777  class LittleScalarVectorType>
778 struct GETRS<LittleBlockType, LittleIntVectorType, LittleScalarVectorType, 1> {
779  static void
780  run (const char mode[],
781  const LittleBlockType& A,
782  const LittleIntVectorType& ipiv,
783  const LittleScalarVectorType& B,
784  int& info)
785  {
786  // The type of an entry of ipiv is the index type.
787  typedef typename std::remove_const<typename std::remove_reference<decltype (ipiv(0))>::type>::type IndexType;
788  // IndexType must be signed, because this code does a countdown loop
789  // to zero. Unsigned integers are always >= 0, even on underflow.
790  static_assert (std::is_integral<IndexType>::value &&
791  std::is_signed<IndexType>::value,
792  "GETRS: The type of each entry of ipiv must be a signed integer.");
793  typedef typename std::decay<decltype (A(0,0))>::type Scalar;
794  static_assert (! std::is_const<std::remove_reference<decltype (B(0))>>::value,
795  "GETRS: B must not be a const View (or LittleBlock).");
796  static_assert (LittleBlockType::rank == 2, "GETRS: A must have rank 2 (be a matrix).");
797  static_assert (LittleIntVectorType::rank == 1, "GETRS: ipiv must have rank 1.");
798  static_assert (LittleScalarVectorType::rank == 1, "GETRS: For this specialization, B must have rank 1.");
799 
800  typedef Kokkos::Details::ArithTraits<Scalar> STS;
801  const Scalar ZERO = STS::zero();
802  const IndexType numRows = static_cast<IndexType> (A.dimension_0 ());
803  const IndexType numCols = static_cast<IndexType> (A.dimension_1 ());
804  const IndexType pivDim = static_cast<IndexType> (ipiv.dimension_0 ());
805 
806  info = 0;
807 
808  // Ensure that the matrix is square
809  if (numRows != numCols) {
810  info = -2;
811  return;
812  }
813 
814  // Ensure that the pivot array is sufficiently large
815  if (pivDim < numRows) {
816  info = -3;
817  return;
818  }
819 
820  // No transpose case
821  if(mode[0] == 'n' || mode[0] == 'N') {
822  // Apply row interchanges to the RHS
823  for(IndexType i=0; i<numRows; i++) {
824  if(ipiv(i) != i+1) {
825  Scalar temp = B(i);
826  B(i) = B(ipiv(i)-1);
827  B(ipiv(i)-1) = temp;
828  }
829  }
830 
831  // Solve Lx=b, overwriting b with x
832  for(IndexType r=1; r < numRows; r++) {
833  for(IndexType c=0; c < r; c++) {
834  B(r) = B(r) - A(r,c)*B(c);
835  }
836  }
837 
838  // Solve Ux=b, overwriting b with x
839  for(IndexType r=numRows-1; r >= 0; r--) {
840  // Check whether U is singular
841  if(A(r,r) == ZERO) {
842  info = r+1;
843  return;
844  }
845 
846  for(IndexType c=r+1; c < numCols; c++) {
847  B(r) = B(r) - A(r,c)*B(c);
848  }
849  B(r) = B(r) / A(r,r);
850  }
851  }
852  // Transpose case
853  else if(mode[0] == 't' || mode[0] == 'T') {
854  info = -1; // NOT YET IMPLEMENTED
855  return;
856  }
857  // Conjugate transpose case
858  else if (mode[0] == 'c' || mode[0] == 'C') {
859  info = -1; // NOT YET IMPLEMENTED
860  return;
861  }
862  else { // invalid mode
863  info = -1;
864  return;
865  }
866  }
867 };
868 
869 
871 template<class LittleBlockType,
872  class LittleIntVectorType,
873  class LittleScalarVectorType>
874 struct GETRS<LittleBlockType, LittleIntVectorType, LittleScalarVectorType, 2> {
875  static void
876  run (const char mode[],
877  const LittleBlockType& A,
878  const LittleIntVectorType& ipiv,
879  const LittleScalarVectorType& B,
880  int& info)
881  {
882  // The type of an entry of ipiv is the index type.
883  typedef typename std::remove_const<typename std::remove_reference<decltype (ipiv(0)) >::type>::type IndexType;
884  static_assert (std::is_integral<IndexType>::value,
885  "GETRS: The type of each entry of ipiv must be an integer type.");
886  static_assert (! std::is_const<std::remove_reference<decltype (B(0)) > >::value,
887  "GETRS: B must not be a const View (or LittleBlock).");
888  static_assert (LittleBlockType::rank == 2, "GETRS: A must have rank 2 (be a matrix).");
889  static_assert (LittleIntVectorType::rank == 1, "GETRS: ipiv must have rank 1.");
890  static_assert (LittleScalarVectorType::rank == 2, "GETRS: For this specialization, B must have rank 2.");
891 
892  // The current implementation iterates over one right-hand side at
893  // a time. It might be faster to do this differently, but this
894  // should work for now.
895  const IndexType numRhs = B.dimension_1 ();
896  info = 0;
897 
898  for (IndexType rhs = 0; rhs < numRhs; ++rhs) {
899  auto B_cur = Kokkos::subview (B, Kokkos::ALL (), rhs);
901  if (info != 0) {
902  return;
903  }
904  }
905  }
906 };
907 
908 } // namespace Impl
909 
913 template<class LittleBlockType,
914  class LittleIntVectorType,
915  class LittleScalarVectorType>
916 void
917 GETRS (const char mode[], const LittleBlockType& A, const LittleIntVectorType& ipiv, const LittleScalarVectorType& B, int& info)
918 {
920 }
921 
922 
937 template<class LittleBlockType,
938  class LittleIntVectorType,
939  class LittleScalarVectorType>
940 void
941 GETRI (const LittleBlockType& A,
942  const LittleIntVectorType& ipiv,
943  const LittleScalarVectorType& work,
944  int& info)
945 {
946  // The type of an entry of ipiv is the index type.
947  typedef typename std::remove_const<typename std::remove_reference<decltype (ipiv(0))>::type>::type IndexType;
948  // IndexType must be signed, because this code does a countdown loop
949  // to zero. Unsigned integers are always >= 0, even on underflow.
950  static_assert (std::is_integral<IndexType>::value &&
951  std::is_signed<IndexType>::value,
952  "GETRI: The type of each entry of ipiv must be a signed integer.");
953  typedef typename std::remove_reference<decltype (A(0,0))>::type Scalar;
954  static_assert (! std::is_const<std::remove_reference<decltype (A(0,0))>>::value,
955  "GETRI: A must not be a const View (or LittleBlock).");
956  static_assert (! std::is_const<std::remove_reference<decltype (work(0))>>::value,
957  "GETRI: work must not be a const View (or LittleBlock).");
958  static_assert (LittleBlockType::rank == 2, "GETRI: A must have rank 2 (be a matrix).");
959  typedef Kokkos::Details::ArithTraits<Scalar> STS;
960  const Scalar ZERO = STS::zero();
961  const Scalar ONE = STS::one();
962 
963  const IndexType numRows = static_cast<IndexType> (A.dimension_0 ());
964  const IndexType numCols = static_cast<IndexType> (A.dimension_1 ());
965  const IndexType pivDim = static_cast<IndexType> (ipiv.dimension_0 ());
966  const IndexType workDim = static_cast<IndexType> (work.dimension_0 ());
967 
968  info = 0;
969 
970  // Ensure that the matrix is square
971  if (numRows != numCols) {
972  info = -1;
973  return;
974  }
975 
976  // Ensure that the pivot array is sufficiently large
977  if (pivDim < numRows) {
978  info = -2;
979  return;
980  }
981 
982  // Ensure that the work array is sufficiently large
983  if (workDim < numRows) {
984  info = -3;
985  return;
986  }
987 
988  // Form Uinv in place
989  for(IndexType j=0; j < numRows; j++) {
990  if(A(j,j) == ZERO) {
991  info = j+1;
992  return;
993  }
994 
995  A(j,j) = ONE / A(j,j);
996 
997  // Compute elements 1:j-1 of j-th column
998  for(IndexType r=0; r < j; r++) {
999  A(r,j) = A(r,r)*A(r,j);
1000  for(IndexType c=r+1; c < j; c++) {
1001  A(r,j) = A(r,j) + A(r,c)*A(c,j);
1002  }
1003  }
1004  for(IndexType r=0; r < j; r++) {
1005  A(r,j) = -A(j,j)*A(r,j);
1006  }
1007  }
1008 
1009  // Compute Ainv by solving A\L = Uinv
1010  for(IndexType j = numCols-2; j >= 0; j--) {
1011  // Copy lower triangular data to work array and replace with 0
1012  for(IndexType r=j+1; r < numRows; r++) {
1013  work(r) = A(r,j);
1014  A(r,j) = 0;
1015  }
1016 
1017  for(IndexType r=0; r < numRows; r++) {
1018  for(IndexType i=j+1; i < numRows; i++) {
1019  A(r,j) = A(r,j) - work(i)*A(r,i);
1020  }
1021  }
1022  }
1023 
1024  // Apply column interchanges
1025  for(IndexType j=numRows-1; j >= 0; j--) {
1026  IndexType jp = ipiv(j)-1;
1027  if(j != jp) {
1028  for(IndexType r=0; r < numRows; r++) {
1029  Scalar temp = A(r,j);
1030  A(r,j) = A(r,jp);
1031  A(r,jp) = temp;
1032  }
1033  }
1034  }
1035 }
1036 
1037 
1038 // mfh 08 Nov 2015: I haven't tested this overload yet. It also needs
1039 // an implementation for trans != 'N' (the transpose and conjugate
1040 // transpose cases).
1041 #if 0
1042 template<class LittleBlockType,
1043  class LittleVectorTypeX,
1044  class LittleVectorTypeY,
1045  class CoefficientType,
1046  class IndexType = int>
1047 void
1048 GEMV (const char trans,
1049  const CoefficientType& alpha,
1050  const LittleBlockType& A,
1051  const LittleVectorTypeX& x,
1052  const CoefficientType& beta,
1053  const LittleVectorTypeY& y)
1054 {
1055  // y(0) returns a reference to the 0-th entry of y. Remove that
1056  // reference to get the type of each entry of y. It's OK if y has
1057  // zero entries -- this doesn't actually do y(i), it just returns
1058  // the type of that expression.
1059  typedef typename std::remove_reference<decltype (y(0)) >::type y_value_type;
1060  const IndexType numRows = static_cast<IndexType> (A.dimension_0 ());
1061  const IndexType numCols = static_cast<IndexType> (A.dimension_1 ());
1062 
1063  if (beta == 0.0) {
1064  if (alpha == 0.0) {
1065  for (IndexType i = 0; i < numRows; ++i) {
1066  y(i) = 0.0;
1067  }
1068  }
1069  else {
1070  for (IndexType i = 0; i < numRows; ++i) {
1071  y_value_type y_i = 0.0;
1072  for (IndexType j = 0; j < numCols; ++j) {
1073  y_i += A(i,j) * x(j);
1074  }
1075  y(i) = y_i;
1076  }
1077  }
1078  }
1079  else { // beta != 0
1080  if (alpha == 0.0) {
1081  if (beta == 0.0) {
1082  for (IndexType i = 0; i < numRows; ++i) {
1083  y(i) = 0.0;
1084  }
1085  }
1086  else {
1087  for (IndexType i = 0; i < numRows; ++i) {
1088  y(i) *= beta;
1089  }
1090  }
1091  }
1092  else {
1093  for (IndexType i = 0; i < numRows; ++i) {
1094  y_value_type y_i = beta * y(i);
1095  for (IndexType j = 0; j < numCols; ++j) {
1096  y_i += alpha * A(i,j) * x(j);
1097  }
1098  y(i) = y_i;
1099  }
1100  }
1101  }
1102 }
1103 
1104 #endif // 0
1105 
1120 template<class Scalar, class LO = int>
1122 public:
1123  typedef Scalar scalar_type;
1124  typedef typename Kokkos::Details::ArithTraits<Scalar>::val_type impl_scalar_type;
1125 
1126 private:
1127  typedef Kokkos::Details::ArithTraits<impl_scalar_type> STS;
1128 
1129 public:
1131  static const int rank = 2;
1132 
1134  typedef Kokkos::LayoutRight array_layout;
1135 
1141  LittleBlock (Scalar* const A,
1142  const LO blockSize,
1143  const LO strideX,
1144  const LO strideY) :
1145  A_ (reinterpret_cast<impl_scalar_type*> (A)),
1146  blockSize_ (blockSize),
1147  strideX_ (strideX),
1148  strideY_ (strideY)
1149  {}
1150 
1168  template<class T>
1169  LittleBlock (T* const A,
1170  const LO blockSize,
1171  const LO strideX,
1172  const LO strideY,
1173  typename std::enable_if<
1174  ! std::is_same<Scalar, T>::value &&
1175  std::is_convertible<Scalar, T>::value &&
1176  sizeof (Scalar) == sizeof (T),
1177  int*>::type ignoreMe = NULL) :
1178  A_ (reinterpret_cast<impl_scalar_type*> (A)),
1179  blockSize_ (blockSize),
1180  strideX_ (strideX),
1181  strideY_ (strideY)
1182  {}
1183 
1185  LO getBlockSize () const {
1186  return blockSize_;
1187  }
1188 
1190  LO dimension_0 () const {
1191  return blockSize_;
1192  }
1193 
1195  LO dimension_1 () const {
1196  return blockSize_;
1197  }
1198 
1200  LO size () const {
1201  return blockSize_ * blockSize_;
1202  }
1203 
1204  template<class IntegerType>
1205  void stride (IntegerType* const s) const {
1206  s[0] = strideX_;
1207  s[1] = strideY_;
1208  }
1209 
1211  Scalar* ptr_on_device () const {
1212  return reinterpret_cast<Scalar*> (A_);
1213  }
1214 
1216  Scalar* getRawPtr () const {
1217  return reinterpret_cast<Scalar*> (A_);
1218  }
1219 
1230  impl_scalar_type& operator() (const LO i, const LO j) const {
1231  return A_[i * strideX_ + j * strideY_];
1232  }
1233 
1234 private:
1235  impl_scalar_type* const A_;
1236  const LO blockSize_;
1237  const LO strideX_;
1238  const LO strideY_;
1239 };
1240 
1241 
1258 template<class Scalar, class LO = int>
1260 public:
1261  typedef Scalar scalar_type;
1262  typedef typename Kokkos::Details::ArithTraits<Scalar>::val_type impl_scalar_type;
1263 
1264 private:
1265  typedef Kokkos::Details::ArithTraits<impl_scalar_type> STS;
1266 
1267 public:
1269  static const int rank = 1;
1270 
1272  typedef Kokkos::LayoutRight array_layout;
1273 
1278  LittleVector (Scalar* const A, const LO blockSize, const LO strideX) :
1279  A_ (reinterpret_cast<impl_scalar_type*> (A)),
1280  blockSize_ (blockSize),
1281  strideX_ (strideX)
1282  {}
1283 
1300  template<class T>
1301  LittleVector (T* const A,
1302  const LO blockSize,
1303  const LO strideX,
1304  typename std::enable_if<
1305  ! std::is_same<Scalar, T>::value &&
1306  std::is_convertible<Scalar, T>::value &&
1307  sizeof (Scalar) == sizeof (T),
1308  int*>::type ignoreMe = NULL) :
1309  A_ (reinterpret_cast<impl_scalar_type*> (A)),
1310  blockSize_ (blockSize),
1311  strideX_ (strideX)
1312  {}
1313 
1315  Scalar* getRawPtr () const {
1316  return reinterpret_cast<Scalar*> (A_);
1317  }
1318 
1320  Scalar* ptr_on_device () const {
1321  return reinterpret_cast<Scalar*> (A_);
1322  }
1323 
1325  LO getBlockSize () const {
1326  return blockSize_;
1327  }
1328 
1330  LO dimension_0 () const {
1331  return blockSize_;
1332  }
1333 
1335  LO size () const {
1336  return blockSize_;
1337  }
1338 
1340  LO getStride () const {
1341  return strideX_;
1342  }
1343 
1347  template<class IntegerType>
1348  void stride (IntegerType* const s) const {
1349  s[0] = strideX_;
1350  }
1351 
1362  impl_scalar_type& operator() (const LO i) const {
1363  return A_[i * strideX_];
1364  }
1365 
1366 private:
1367  impl_scalar_type* const A_;
1368  const LO blockSize_;
1369  const LO strideX_;
1370 };
1371 
1385 template<class ST1, class ST2, class LO>
1386 inline void
1388  const LittleBlock<ST1, LO>& src,
1389  typename std::enable_if<std::is_convertible<ST1, ST2>::value && ! std::is_const<ST2>::value, int>::type* = NULL)
1390 {
1391  COPY (src, dst);
1392 }
1393 
1407 template<class ST1, class ST2, class LO>
1408 inline void
1410  const LittleVector<ST1, LO>& src,
1411  typename std::enable_if<std::is_convertible<ST1, ST2>::value && ! std::is_const<ST2>::value, int>::type* = NULL)
1412 {
1413  COPY (src, dst);
1414 }
1415 
1429 template<class ST1, class ST2, class LO>
1430 inline void
1432  const ST1& val,
1433  typename std::enable_if<std::is_convertible<ST1, ST2>::value && ! std::is_const<ST2>::value, int>::type* = NULL)
1434 {
1435  const LO numRows = dst.dimension_0 ();
1436  const LO numCols = dst.dimension_1 ();
1437 
1438  const ST2 theVal = static_cast<ST2> (val);
1439  for (LO j = 0; j < numCols; ++j) {
1440  for (LO i = 0; i < numRows; ++i) {
1441  dst(i,j) = theVal;
1442  }
1443  }
1444 }
1445 
1459 template<class ST1, class ST2, class LO>
1460 inline void
1462  const ST1& val,
1463  typename std::enable_if<std::is_convertible<ST1, ST2>::value && ! std::is_const<ST2>::value, int>::type* = NULL)
1464 {
1465  const LO N = dst.dimension_0 ();
1466  const ST2 theVal = static_cast<ST2> (val);
1467  for (LO i = 0; i < N; ++i) {
1468  dst(i) = theVal;
1469  }
1470 }
1471 
1472 } // namespace Experimental
1473 } // namespace Tpetra
1474 
1475 #endif // TPETRA_EXPERIMENTAL_BLOCKVIEW_HPP
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, or the small dense vectors in BlockMultiVector and BlockVector.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
LO size() const
Number of rows times number of columns.
LO dimension_0() const
Number of rows in the block.
void GETRI(const LittleBlockType &A, const LittleIntVectorType &ipiv, const LittleScalarVectorType &work, int &info)
Compute inverse of A, using result of GETRF or GETF2.
Scalar * getRawPtr() const
Pointer to the block&#39;s entries, as Scalar*.
LO dimension_1() const
Number of columns in the block.
LO size() const
Number of entries in the vector.
void COPY(const ViewType1 &x, const ViewType2 &y)
y := x, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dimension(s)...
Kokkos::LayoutRight array_layout
Data layout (in the same sense as Kokkos::View).
static void run(const ViewType1 &X, const ViewType2 &Y)
Y := X (rank-2 X and Y, i.e., matrices)
void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
Nonowning view of a square dense block in a block matrix.
static void run(const ViewType2 &Y, const ViewType1 &X)
(*this)(i) := max(abs((*this)(i)), abs(X(i))) for all i.
static void run(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha*x (rank-1 x and y, i.e., vectors)
static void run(const CoefficientType &alpha, const ViewType1 &X, const ViewType2 &Y)
Y := Y + alpha*X (rank-2 X and Y, i.e., matrices)
void deep_copy(const LittleBlock< ST2, LO > &dst, const LittleBlock< ST1, LO > &src, typename std::enable_if< std::is_convertible< ST1, ST2 >::value &&!std::is_const< ST2 >::value, int >::type *=NULL)
Copy the LittleBlock src into the LittleBlock dst.
Kokkos::LayoutRight array_layout
Data layout (in the same sense as Kokkos::View).
static void run(const ViewType1 &x, const ViewType2 &y)
y := x (rank-1 x and y, i.e., vectors)
LO getStride() const
Stride between consecutive entries.
void GETF2(const LittleBlockType &A, const LittleVectorType &ipiv, int &info)
Computes A = P*L*U.
Implementation of Tpetra::Experimental::SCAL function.
void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, and the small dense vectors in BlockMultiVector and BlockVector.
LittleBlock(Scalar *const A, const LO blockSize, const LO strideX, const LO strideY)
Constructor.
void GEMV(const CoefficientType &alpha, const LittleBlockType &A, const LittleVectorType1 &x, const LittleVectorType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
LittleVector(T *const A, const LO blockSize, const LO strideX, typename std::enable_if< !std::is_same< Scalar, T >::value &&std::is_convertible< Scalar, T >::value &&sizeof(Scalar)==sizeof(T), int * >::type ignoreMe=NULL)
Constructor that takes an impl_scalar_type pointer.
void GEMM(const char transA[], const char transB[], const CoefficientType &alpha, const ViewType1 &A, const ViewType2 &B, const CoefficientType &beta, const ViewType3 &C)
Small dense matrix-matrix multiply: C := alpha*A*B + beta*C
static void run(const ViewType2 &Y, const ViewType1 &X)
(*this)(i,j) := max(abs((*this)(i,j)), abs(X(i,j))) for all (i,j).
static void run(const ViewType1 &X, const ViewType2 &Y)
Y := X (rank-2 X and Y, i.e., matrices)
Scalar * getRawPtr() const
Pointer to the vector&#39;s entries.
static void run(const CoefficientType &alpha, const ViewType &x)
x := alpha*x (rank-1 x, i.e., a vector)
void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
Nonowning view of a set of degrees of freedom corresponding to a mesh point in a block vector or mult...
static void run(const CoefficientType &alpha, const ViewType1 &X, const ViewType2 &Y)
Y := Y + alpha*X (rank-2 X and Y, i.e., matrices)
static void run(const CoefficientType &alpha, const ViewType &A)
A := alpha*A (rank-2 A, i.e., a matrix)
Scalar * ptr_on_device() const
Pointer to the vector&#39;s entries.
Replace old values with zero.
LO getBlockSize() const
The block size (number of rows, and number of columns).
Scalar * ptr_on_device() const
Pointer to the block&#39;s entries, as Scalar*.
LO getBlockSize() const
The block size (number of degrees of freedom per mesh point).
static void run(const CoefficientType &alpha, const ViewType &A)
A := alpha*A (rank-2 A, i.e., a matrix)
LittleBlock(T *const A, const LO blockSize, const LO strideX, const LO strideY, typename std::enable_if< !std::is_same< Scalar, T >::value &&std::is_convertible< Scalar, T >::value &&sizeof(Scalar)==sizeof(T), int * >::type ignoreMe=NULL)
Constructor that takes an impl_scalar_type pointer.
void stride(IntegerType *const s) const
Stride between consecutive entries.
Implementation of Tpetra::Experimental::COPY function.
void GETRS(const char mode[], const LittleBlockType &A, const LittleIntVectorType &ipiv, const LittleScalarVectorType &B, int &info)
Solve the linear system(s) AX=B, using the result of GETRF or GETF2.
Implementation of Tpetra::Experimental::AXPY function.
LittleVector(Scalar *const A, const LO blockSize, const LO strideX)
Constructor.
LO dimension_0() const
Number of entries in the vector.