Tpetra parallel linear algebra  Version of the Day
Kokkos_MV_GEMM.hpp
1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos: Node API and Parallel Node Kernels
6 // Copyright (2008) Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
39 //
40 // ************************************************************************
41 //@HEADER
42 */
43 
44 #ifndef KOKKOS_MV_GEMM_HPP
45 #define KOKKOS_MV_GEMM_HPP
46 
47 // Note this code lives only temporarily in TpetraCore. As soon as
48 // GEMM kernels exist in the TpetraKernels subpackage, and thus a
49 // depnedency on Teuchos can be eliminated, the code will move to
50 // TpetraKernels.
51 
52 #include<Teuchos_BLAS.hpp>
53 
54 #ifdef KOKKOS_HAVE_CUDA
55 #include<cublas.h>
56 #endif
57 
58 namespace Teuchos {
59 
60  // mfh 11 Nov 2014: The DeviceGEMM specializations below need to be
61  // able to use Teuchos::BLAS::{GEMM, GEMV}. We provide just enough
62  // of a specialization for Kokkos::complex<{float, double}> to make
63  // DeviceGEMM work. They just defer to BLAS<int,
64  // std::complex<{float, double}> > via reinterpret_cast. Please
65  // feel free to expand these specializations if you need to.
66 
67  template<>
68  class BLAS<int, ::Kokkos::complex<float> > {
69  public:
70  typedef float mag_type;
71  typedef ::Kokkos::complex<float> val_type;
72  typedef std::complex<float> impl_type;
73 
74  BLAS () {}
75  BLAS (const BLAS<int, val_type>&) {}
76  virtual ~BLAS () {}
77 
78  // void ROTG (val_type* da, val_type* db, mag_type* c, val_type* s) const;
79  // void ROT (const int n, val_type* dx, const int incx, val_type* dy, const int incy, RealType* c, val_type* s) const;
80  // RealType ASUM (const int n, const val_type* x, const int incx) const;
81  //void AXPY (const int n, const val_type alpha, const val_type* x, const int incx, val_type* y, const int incy) const;
82  //void COPY (const int n, const val_type* x, const int incx, val_type* y, const int incy) const;
83  //val_type DOT(const int n, const val_type* x, const int incx, const val_type* y, const int incy) const;
84  //RealType NRM2(const int n, const val_type* x, const int incx) const;
85  //void SCAL(const int n, const val_type alpha, val_type* x, const int incx) const;
86  //int IAMAX(const int n, const val_type* x, const int incx) const;
87 
88  void
89  GEMV (ETransp trans, const int m, const int n, const val_type alpha,
90  const val_type* A, const int lda, const val_type* x, const int incx,
91  const val_type beta, val_type* y, const int incy) const
92  {
93  BLAS<int, impl_type> blas;
94  blas.GEMV (trans, m, n, static_cast<impl_type> (alpha),
95  reinterpret_cast<const impl_type*> (A), lda,
96  reinterpret_cast<const impl_type*> (x), incx,
97  static_cast<impl_type> (beta),
98  reinterpret_cast<impl_type*> (y), incy);
99  }
100 
101  //void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int n, const val_type* A, const int lda, val_type* x, const int incx) const;
102  //void GER(const int m, const int n, const val_type alpha, const val_type* x, const int incx, const val_type* y, const int incy, val_type* A, const int lda) const;
103 
104  void
105  GEMM (ETransp transa, ETransp transb, const int m, const int n, const int k,
106  const val_type alpha, const val_type* A, const int lda,
107  const val_type* B, const int ldb, const val_type beta, val_type* C,
108  const int ldc) const
109  {
110  BLAS<int, impl_type> blas;
111  blas.GEMM (transa, transb, m, n, k,
112  static_cast<impl_type> (alpha),
113  reinterpret_cast<const impl_type*> (A), lda,
114  reinterpret_cast<const impl_type*> (B), ldb,
115  static_cast<impl_type> (beta),
116  reinterpret_cast<impl_type*> (C), ldc);
117  }
118 
119  //void SYMM(ESide side, EUplo uplo, const int m, const int n, const val_type alpha, const val_type* A, const int lda, const val_type *B, const int ldb, const val_type beta, val_type *C, const int ldc) const;
120  //void SYRK(EUplo uplo, ETransp trans, const int n, const int k, const val_type alpha, const val_type* A, const int lda, const val_type beta, val_type* C, const int ldc) const;
121  //void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
122  //void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
123  };
124 
125  template<>
126  class BLAS<int, ::Kokkos::complex<double> > {
127  public:
128  typedef double mag_type;
129  typedef ::Kokkos::complex<double> val_type;
130  typedef std::complex<double> impl_type;
131 
132  BLAS () {}
133  BLAS (const BLAS<int, val_type>&) {}
134  virtual ~BLAS () {}
135 
136  // void ROTG (val_type* da, val_type* db, mag_type* c, val_type* s) const;
137  // void ROT (const int n, val_type* dx, const int incx, val_type* dy, const int incy, RealType* c, val_type* s) const;
138  // RealType ASUM (const int n, const val_type* x, const int incx) const;
139  //void AXPY (const int n, const val_type alpha, const val_type* x, const int incx, val_type* y, const int incy) const;
140  //void COPY (const int n, const val_type* x, const int incx, val_type* y, const int incy) const;
141  //val_type DOT(const int n, const val_type* x, const int incx, const val_type* y, const int incy) const;
142  //RealType NRM2(const int n, const val_type* x, const int incx) const;
143  //void SCAL(const int n, const val_type alpha, val_type* x, const int incx) const;
144  //int IAMAX(const int n, const val_type* x, const int incx) const;
145 
146  void
147  GEMV (ETransp trans, const int m, const int n, const val_type alpha,
148  const val_type* A, const int lda, const val_type* x, const int incx,
149  const val_type beta, val_type* y, const int incy) const
150  {
151  BLAS<int, impl_type> blas;
152  blas.GEMV (trans, m, n, static_cast<impl_type> (alpha),
153  reinterpret_cast<const impl_type*> (A), lda,
154  reinterpret_cast<const impl_type*> (x), incx,
155  static_cast<impl_type> (beta),
156  reinterpret_cast<impl_type*> (y), incy);
157  }
158 
159  //void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int n, const val_type* A, const int lda, val_type* x, const int incx) const;
160  //void GER(const int m, const int n, const val_type alpha, const val_type* x, const int incx, const val_type* y, const int incy, val_type* A, const int lda) const;
161 
162  void
163  GEMM (ETransp transa, ETransp transb, const int m, const int n, const int k,
164  const val_type alpha, const val_type* A, const int lda,
165  const val_type* B, const int ldb, const val_type beta, val_type* C,
166  const int ldc) const
167  {
168  BLAS<int, impl_type> blas;
169  blas.GEMM (transa, transb, m, n, k,
170  static_cast<impl_type> (alpha),
171  reinterpret_cast<const impl_type*> (A), lda,
172  reinterpret_cast<const impl_type*> (B), ldb,
173  static_cast<impl_type> (beta),
174  reinterpret_cast<impl_type*> (C), ldc);
175  }
176 
177  //void SYMM(ESide side, EUplo uplo, const int m, const int n, const val_type alpha, const val_type* A, const int lda, const val_type *B, const int ldb, const val_type beta, val_type *C, const int ldc) const;
178  //void SYRK(EUplo uplo, ETransp trans, const int n, const int k, const val_type alpha, const val_type* A, const int lda, const val_type beta, val_type* C, const int ldc) const;
179  //void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
180  //void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
181  };
182 
183 } // namespace Teuchos
184 
185 
186 namespace Kokkos {
187  namespace Impl {
188 
189  template<class ViewType>
190  size_t getStride2DView (ViewType A) {
191  size_t stride[8];
192  A.stride (stride);
193  return A.dimension_1 () > 1 ? stride[1] : A.dimension_0 ();
194  }
195  }
196 
203  template <typename Scalar, typename DeviceType>
204  struct DeviceGEMM {
205  public:
206  static void
207  GEMM (const Teuchos::ETransp transA,
208  const Teuchos::ETransp transB,
209  const Scalar alpha,
210  View<const Scalar**, LayoutLeft, DeviceType> A,
211  View<const Scalar**, LayoutLeft, DeviceType> B,
212  const Scalar beta,
213  View<Scalar**, LayoutLeft, DeviceType> C)
214  {
215  Teuchos::BLAS<int,Scalar> blas;
216  const int m = static_cast<int> (C.dimension_0 ()),
217  n = static_cast<int> (C.dimension_1 ()),
218  k = (transA == Teuchos::NO_TRANS ? A.dimension_1 () : A.dimension_0 ()),
219  lda = static_cast<int> (Impl::getStride2DView (A)),
220  ldb = static_cast<int> (Impl::getStride2DView (B)),
221  ldc = static_cast<int> (Impl::getStride2DView (C));
222  // For some BLAS implementations (e.g., MKL), GEMM when B has
223  // one column may be signficantly less efficient than GEMV.
224  if (n == 1 && transB == Teuchos::NO_TRANS) {
225  blas.GEMV (transA, A.dimension_0 (), A.dimension_1 (), alpha,
226  A.ptr_on_device(), lda,
227  B.ptr_on_device(), static_cast<int> (1),
228  beta, C.ptr_on_device(), static_cast<int> (1));
229  }
230  else {
231  blas.GEMM (transA, transB, m, n, k, alpha,
232  A.ptr_on_device(), lda,
233  B.ptr_on_device(), ldb,
234  beta, C.ptr_on_device(), ldc);
235  }
236  }
237  };
238 
239 // template <typename Scalar>
240 // struct DeviceGEMM<Scalar,Serial> {
241 // public:
242 // static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha,
243 // View<const Scalar**,LayoutLeft,Serial> A, View<const Scalar**,LayoutLeft,Serial> B,
244 // Scalar beta, View<Scalar**,Serial> C) {
245 // Teuchos::BLAS<int,Scalar> blas;
246 // const int m = static_cast<int>(C.dimension_0()),
247 // n = static_cast<int>(C.dimension_1()),
248 // k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()),
249 // lda = static_cast<int>(Impl::getStride2DView(A)),
250 // ldb = static_cast<int>(Impl::getStride2DView(B)),
251 // ldc = static_cast<int>(Impl::getStride2DView(C));
252 // // For some BLAS implementations (i.e. MKL), GEMM when B has one column
253 // // is signficantly less efficient
254 // if (n == 1 && transB == Teuchos::NO_TRANS)
255 // blas.GEMV(transA, A.dimension_0(), A.dimension_1(), alpha, A.ptr_on_device(), lda, B.ptr_on_device(), static_cast<int>(1), beta, C.ptr_on_device(), static_cast<int>(1));
256 // else
257 // blas.GEMM(transA, transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc);
258 // }
259 // };
260 
261 // #ifdef KOKKOS_HAVE_PTHREAD
262 // template <typename Scalar>
263 // struct DeviceGEMM<Scalar,Threads> {
264 // public:
265 // static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha,
266 // View<const Scalar**,LayoutLeft,Threads> A, View<const Scalar**,LayoutLeft,Threads> B,
267 // Scalar beta, View<Scalar**,LayoutLeft,Threads> C) {
268 // Teuchos::BLAS<int,Scalar> blas;
269 // const int m = static_cast<int>(C.dimension_0()),
270 // n = static_cast<int>(C.dimension_1()),
271 // k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()),
272 // lda = static_cast<int>(Impl::getStride2DView(A)),
273 // ldb = static_cast<int>(Impl::getStride2DView(B)),
274 // ldc = static_cast<int>(Impl::getStride2DView(C));
275 // blas.GEMM(transA, transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc);
276 // }
277 // };
278 // #endif
279 
280 // #ifdef KOKKOS_HAVE_OPENMP
281 // template <typename Scalar>
282 // struct DeviceGEMM<Scalar,OpenMP> {
283 // public:
284 // static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha,
285 // View<const Scalar**,LayoutLeft,OpenMP> A, View<const Scalar**,LayoutLeft,OpenMP> B,
286 // Scalar beta, View<Scalar**,LayoutLeft,OpenMP> C) {
287 // Teuchos::BLAS<int,Scalar> blas;
288 // const int m = static_cast<int>(C.dimension_0()),
289 // n = static_cast<int>(C.dimension_1()),
290 // k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()),
291 // lda = static_cast<int>(Impl::getStride2DView(A)),
292 // ldb = static_cast<int>(Impl::getStride2DView(B)),
293 // ldc = static_cast<int>(Impl::getStride2DView(C));
294 // blas.GEMM(transA, transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc);
295 // }
296 // };
297 // #endif
298 
299 #ifdef KOKKOS_HAVE_CUDA
300  template <typename Scalar>
301  struct DeviceGEMM<Scalar,Cuda> {
302  public:
303  static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha,
304  View<const Scalar**,LayoutLeft,Cuda> A, View<const Scalar**,LayoutLeft,Cuda> B,
305  Scalar beta, View<Scalar**,LayoutLeft,Cuda> C) {
306  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "DeviceGEMM: Kokkos::Cuda has no support for GEMM operations over Scalar=" << Teuchos::typeName(alpha) << ".");
307  }
308  };
309 
310 
311  template <>
312  struct DeviceGEMM<float,Cuda> {
313  public:
314  static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, float alpha,
315  View<const float**,LayoutLeft,Cuda> A, View<const float**,LayoutLeft,Cuda> B,
316  float beta, View<float**,LayoutLeft,Cuda> C) {
317  const int m = static_cast<int>(C.dimension_0()),
318  n = static_cast<int>(C.dimension_1()),
319  k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()),
320  lda = static_cast<int>(Impl::getStride2DView(A)),
321  ldb = static_cast<int>(Impl::getStride2DView(B)),
322  ldc = static_cast<int>(Impl::getStride2DView(C));
323  const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
324  char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
325  cublasSgemm(char_transA, char_transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc);
326 #ifdef HAVE_KOKKOS_DEBUG
327  cublasStatus info = cublasGetError();
328  TEUCHOS_TEST_FOR_EXCEPTION( info != CUBLAS_STATUS_SUCCESS, std::runtime_error, "cublasSgemm failed with status " << info << "." );
329 #endif
330  }
331  };
332 
333  template <>
334  struct DeviceGEMM<double,Cuda> {
335  public:
336  static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, double alpha,
337  View<const double**,LayoutLeft,Cuda> A, View<const double**,LayoutLeft,Cuda> B,
338  double beta, View<double**,LayoutLeft,Cuda> C) {
339  const int m = static_cast<int>(C.dimension_0()),
340  n = static_cast<int>(C.dimension_1()),
341  k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()),
342  lda = static_cast<int>(Impl::getStride2DView(A)),
343  ldb = static_cast<int>(Impl::getStride2DView(B)),
344  ldc = static_cast<int>(Impl::getStride2DView(C));
345  const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
346  char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
347  cublasDgemm(char_transA, char_transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc);
348 #ifdef HAVE_KOKKOS_DEBUG
349  cublasStatus info = cublasGetError();
350  TEUCHOS_TEST_FOR_EXCEPTION( info != CUBLAS_STATUS_SUCCESS, std::runtime_error, "cublasDgemm failed with status " << info << "." );
351 #endif
352  }
353  };
354 
355 
356 #endif
357 }
358 #endif // KOKKOS_MV_GEMM_HPP
359 
void GEMV(const CoefficientType &alpha, const LittleBlockType &A, const LittleVectorType1 &x, const LittleVectorType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
void GEMM(const char transA[], const char transB[], const CoefficientType &alpha, const ViewType1 &A, const ViewType2 &B, const CoefficientType &beta, const ViewType3 &C)
Small dense matrix-matrix multiply: C := alpha*A*B + beta*C
Class that provides GEMM for a particular Kokkos Device.