Tpetra parallel linear algebra  Version of the Day
Tpetra_CrsMatrixMultiplyOp.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_CRSMATRIXMULTIPLYOP_HPP
43 #define TPETRA_CRSMATRIXMULTIPLYOP_HPP
44 
49 
50 #include <Tpetra_CrsMatrix.hpp>
51 #include <Tpetra_Util.hpp>
52 #include <Teuchos_TimeMonitor.hpp>
53 #ifdef TPETRA_CRSMATRIX_MULTIPLY_DUMP
54 # include "Teuchos_VerboseObject.hpp"
55 #endif
56 
57 namespace Tpetra {
58 
123  template <class Scalar,
124  class MatScalar = Scalar,
125  class LocalOrdinal = Details::DefaultTypes::local_ordinal_type,
126  class GlobalOrdinal = Details::DefaultTypes::global_ordinal_type,
129  public Operator<Scalar, LocalOrdinal, GlobalOrdinal, Node>
130  {
131  public:
136 
138 
139 
144  CrsMatrixMultiplyOp (const Teuchos::RCP<const crs_matrix_type>& A) :
145  matrix_ (A)
146  {
147  // we don't require that A is fill complete; we will query for the
148  // importer/exporter at apply()-time
149 #ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_PROFILING
150  importTimer_ = Teuchos::TimeMonitor::getNewCounter ("CrsMatrixMultiplyOp::import");
151  exportTimer_ = Teuchos::TimeMonitor::getNewCounter ("CrsMatrixMultiplyOp::export");
152 #endif
153  }
154 
156  virtual ~CrsMatrixMultiplyOp () {}
157 
159 
161 
167  void
170  Teuchos::ETransp mode = Teuchos::NO_TRANS,
171  Scalar alpha = Teuchos::ScalarTraits<Scalar>::one (),
172  Scalar beta = Teuchos::ScalarTraits<Scalar>::zero ()) const
173  {
174  TEUCHOS_TEST_FOR_EXCEPTION
175  (! matrix_->isFillComplete (), std::runtime_error,
176  Teuchos::typeName (*this) << "::apply(): underlying matrix is not fill-complete.");
177  TEUCHOS_TEST_FOR_EXCEPTION
178  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
179  Teuchos::typeName (*this) << "::apply(X,Y): X and Y must have the same number of vectors.");
180  TEUCHOS_TEST_FOR_EXCEPTION
181  (Teuchos::ScalarTraits<Scalar>::isComplex && mode == Teuchos::TRANS, std::logic_error,
182  Teuchos::typeName (*this) << "::apply() does not currently support transposed multiplications for complex scalar types.");
183  if (mode == Teuchos::NO_TRANS) {
184  applyNonTranspose (X, Y, alpha, beta);
185  }
186  else {
187  applyTranspose (X, Y, mode, alpha, beta);
188  }
189  }
190 
230  void
234  const Scalar& dampingFactor,
235  const ESweepDirection direction,
236  const int numSweeps) const
237  {
238  using Teuchos::null;
239  using Teuchos::RCP;
240  using Teuchos::rcp;
241  using Teuchos::rcpFromRef;
242  using Teuchos::rcp_const_cast;
243  typedef Scalar OS;
245  typedef Export<LocalOrdinal, GlobalOrdinal, Node> export_type;
246  typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
248 
249  TEUCHOS_TEST_FOR_EXCEPTION
250  (numSweeps < 0, std::invalid_argument,
251  "gaussSeidel: The number of sweeps must be nonnegative, "
252  "but you provided numSweeps = " << numSweeps << " < 0.");
253 
254  // Translate from global to local sweep direction.
255  // While doing this, validate the input.
256  KokkosClassic::ESweepDirection localDirection;
257  if (direction == Forward) {
258  localDirection = KokkosClassic::Forward;
259  }
260  else if (direction == Backward) {
261  localDirection = KokkosClassic::Backward;
262  }
263  else if (direction == Symmetric) {
264  // We'll control local sweep direction manually.
265  localDirection = KokkosClassic::Forward;
266  }
267  else {
268  TEUCHOS_TEST_FOR_EXCEPTION
269  (true, std::invalid_argument,
270  "gaussSeidel: The 'direction' enum does not have any of its valid "
271  "values: Forward, Backward, or Symmetric.");
272  }
273 
274  if (numSweeps == 0) {
275  return; // Nothing to do.
276  }
277 
278  // We don't need the Export object because this method assumes
279  // that the row, domain, and range Maps are the same. We do need
280  // the Import object, if there is one, though.
281  RCP<const import_type> importer = matrix_->getGraph()->getImporter();
282  RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
283  TEUCHOS_TEST_FOR_EXCEPTION
284  (! exporter.is_null (), std::runtime_error,
285  "Tpetra's gaussSeidel implementation requires that the row, domain, "
286  "and range Maps be the same. This cannot be the case, because the "
287  "matrix has a nontrivial Export object.");
288 
289  RCP<const map_type> domainMap = matrix_->getDomainMap ();
290  RCP<const map_type> rangeMap = matrix_->getRangeMap ();
291  RCP<const map_type> rowMap = matrix_->getGraph ()->getRowMap ();
292  RCP<const map_type> colMap = matrix_->getGraph ()->getColMap ();
293 
294 #ifdef HAVE_TEUCHOS_DEBUG
295  {
296  // The relation 'isSameAs' is transitive. It's also a
297  // collective, so we don't have to do a "shared" test for
298  // exception (i.e., a global reduction on the test value).
299  TEUCHOS_TEST_FOR_EXCEPTION
300  (! X.getMap ()->isSameAs (*domainMap), std::runtime_error,
301  "Tpetra::CrsMatrix::gaussSeidel requires that the input "
302  "multivector X be in the domain Map of the matrix.");
303  TEUCHOS_TEST_FOR_EXCEPTION
304  (! B.getMap ()->isSameAs (*rangeMap), std::runtime_error,
305  "Tpetra::CrsMatrix::gaussSeidel requires that the input "
306  "B be in the range Map of the matrix.");
307  TEUCHOS_TEST_FOR_EXCEPTION
308  (! D.getMap ()->isSameAs (*rowMap), std::runtime_error,
309  "Tpetra::CrsMatrix::gaussSeidel requires that the input "
310  "D be in the row Map of the matrix.");
311  TEUCHOS_TEST_FOR_EXCEPTION
312  (! rowMap->isSameAs (*rangeMap), std::runtime_error,
313  "Tpetra::CrsMatrix::gaussSeidel requires that the row Map and the "
314  "range Map be the same (in the sense of Tpetra::Map::isSameAs).");
315  TEUCHOS_TEST_FOR_EXCEPTION
316  (! domainMap->isSameAs (*rangeMap), std::runtime_error,
317  "Tpetra::CrsMatrix::gaussSeidel requires that the domain Map and "
318  "the range Map of the matrix be the same.");
319  }
320 #else
321  // Forestall any compiler warnings for unused variables.
322  (void) rangeMap;
323  (void) rowMap;
324 #endif // HAVE_TEUCHOS_DEBUG
325 
326  // If B is not constant stride, copy it into a constant stride
327  // multivector. We'l handle the right-hand side B first and deal
328  // with X right before the sweeps, to improve locality of the
329  // first sweep. (If the problem is small enough, then that will
330  // hopefully keep more of the entries of X in cache. This
331  // optimizes for the typical case of a small number of sweeps.)
332  RCP<const OSMV> B_in;
333  if (B.isConstantStride()) {
334  B_in = rcpFromRef (B);
335  }
336  else {
337  // The range Map and row Map are the same in this case, so we
338  // can use the (possibly cached) row Map multivector to store a
339  // constant stride copy of B. We don't have to copy back, since
340  // Gauss-Seidel won't modify B.
341  RCP<OSMV> B_in_nonconst = getRowMapMultiVector (B, true);
342  deep_copy (*B_in_nonconst, B);
343  B_in = rcp_const_cast<const OSMV> (B_in_nonconst);
344 
346  (! B.isConstantStride (), std::runtime_error,
347  "gaussSeidel: The current implementation of the Gauss-Seidel kernel "
348  "requires that X and B both have constant stride. Since B does not "
349  "have constant stride, we had to make a copy. This is a limitation of "
350  "the current implementation and not your fault, but we still report it "
351  "as an efficiency warning for your information.");
352  }
353 
354  // If X is not constant stride, copy it into a constant stride
355  // multivector. Also, make the column Map multivector X_colMap,
356  // and its domain Map view X_domainMap. (X actually must be a
357  // domain Map view of a column Map multivector; exploit this, if X
358  // has constant stride.)
359 
360  RCP<OSMV> X_domainMap;
361  RCP<OSMV> X_colMap;
362  bool copiedInput = false;
363 
364  if (importer.is_null ()) { // Domain and column Maps are the same.
365  if (X.isConstantStride ()) {
366  X_domainMap = rcpFromRef (X);
367  X_colMap = X_domainMap;
368  copiedInput = false;
369  }
370  else {
371  // Get a temporary column Map multivector, make a domain Map
372  // view of it, and copy X into the domain Map view. We have
373  // to copy here because we won't be doing Import operations.
374  X_colMap = getColumnMapMultiVector (X, true);
375  X_domainMap = X_colMap; // Domain and column Maps are the same.
376  deep_copy (*X_domainMap, X); // Copy X into the domain Map view.
377  copiedInput = true;
379  (! X.isConstantStride (), std::runtime_error,
380  "gaussSeidel: The current implementation of the Gauss-Seidel kernel "
381  "requires that X and B both have constant stride. Since X does not "
382  "have constant stride, we had to make a copy. This is a limitation of "
383  "the current implementation and not your fault, but we still report it "
384  "as an efficiency warning for your information.");
385  }
386  }
387  else { // We will be doing Import operations in the sweeps.
388  if (X.isConstantStride ()) {
389  X_domainMap = rcpFromRef (X);
390  // This kernel assumes that X is a domain Map view of a column
391  // Map multivector. We will only check if this is valid if
392  // the CMake configure Teuchos_ENABLE_DEBUG is ON.
393  X_colMap = X_domainMap->offsetViewNonConst (colMap, 0);
394 
395  // Do the first Import for the first sweep. This simplifies
396  // the logic in the sweeps.
397  X_colMap->doImport (X, *importer, INSERT);
398  copiedInput = false;
399  }
400  else {
401  // Get a temporary column Map multivector X_colMap, and make a
402  // domain Map view X_domainMap of it. Instead of copying, we
403  // do an Import from X into X_domainMap. This saves us a
404  // copy, since the Import has to copy the data anyway.
405  X_colMap = getColumnMapMultiVector (X, true);
406  X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0);
407  X_colMap->doImport (X, *importer, INSERT);
408  copiedInput = true;
410  (! X.isConstantStride (), std::runtime_error,
411  "gaussSeidel: The current implementation of the Gauss-Seidel kernel "
412  "requires that X and B both have constant stride. Since X does not "
413  "have constant stride, we had to make a copy. This is a limitation of "
414  "the current implementation and not your fault, but we still report it "
415  "as an efficiency warning for your information.");
416  }
417  }
418 
419  for (int sweep = 0; sweep < numSweeps; ++sweep) {
420  if (! importer.is_null () && sweep > 0) {
421  // We already did the first Import for the zeroth sweep.
422  X_colMap->doImport (*X_domainMap, *importer, INSERT);
423  }
424 
425  // Do local Gauss-Seidel.
426  if (direction != Symmetric) {
427  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
428  dampingFactor,
429  localDirection);
430  }
431  else { // direction == Symmetric
432  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
433  dampingFactor,
434  KokkosClassic::Forward);
435  // Communicate again before the Backward sweep.
436  if (! importer.is_null ()) {
437  X_colMap->doImport (*X_domainMap, *importer, INSERT);
438  }
439  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
440  dampingFactor,
441  KokkosClassic::Backward);
442  }
443  }
444 
445  if (copiedInput) {
446  deep_copy (X, *X_domainMap); // Copy back: X_domainMap -> X.
447  }
448  }
449 
474  void
478  const Scalar& dampingFactor,
479  const ESweepDirection direction,
480  const int numSweeps) const
481  {
482  using Teuchos::null;
483  using Teuchos::RCP;
484  using Teuchos::rcp;
485  using Teuchos::rcpFromRef;
486  using Teuchos::rcp_const_cast;
487  typedef Scalar OS;
489  typedef Export<LocalOrdinal, GlobalOrdinal, Node> export_type;
490  typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
492 
493  TEUCHOS_TEST_FOR_EXCEPTION
494  (numSweeps < 0, std::invalid_argument,
495  "gaussSeidelCopy: The number of sweeps must be nonnegative, "
496  "but you provided numSweeps = " << numSweeps << " < 0.");
497 
498  // Translate from global to local sweep direction.
499  // While doing this, validate the input.
500  KokkosClassic::ESweepDirection localDirection;
501  if (direction == Forward) {
502  localDirection = KokkosClassic::Forward;
503  }
504  else if (direction == Backward) {
505  localDirection = KokkosClassic::Backward;
506  }
507  else if (direction == Symmetric) {
508  // We'll control local sweep direction manually.
509  localDirection = KokkosClassic::Forward;
510  }
511  else {
512  TEUCHOS_TEST_FOR_EXCEPTION
513  (true, std::invalid_argument,
514  "gaussSeidelCopy: The 'direction' enum does not have any of its "
515  "valid values: Forward, Backward, or Symmetric.");
516  }
517 
518  if (numSweeps == 0) {
519  return;
520  }
521 
522  RCP<const import_type> importer = matrix_->getGraph()->getImporter();
523  RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
524  TEUCHOS_TEST_FOR_EXCEPTION
525  (! exporter.is_null (),
526  std::runtime_error,
527  "Tpetra's gaussSeidelCopy implementation requires that the row, domain, "
528  "and range Maps be the same. This cannot be the case, because the "
529  "matrix has a nontrivial Export object.");
530 
531  RCP<const map_type> domainMap = matrix_->getDomainMap ();
532  RCP<const map_type> rangeMap = matrix_->getRangeMap ();
533  RCP<const map_type> rowMap = matrix_->getGraph ()->getRowMap ();
534  RCP<const map_type> colMap = matrix_->getGraph ()->getColMap ();
535 
536 #ifdef HAVE_TEUCHOS_DEBUG
537  {
538  // The relation 'isSameAs' is transitive. It's also a
539  // collective, so we don't have to do a "shared" test for
540  // exception (i.e., a global reduction on the test value).
541  TEUCHOS_TEST_FOR_EXCEPTION
542  (! X.getMap ()->isSameAs (*domainMap), std::runtime_error,
543  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
544  "multivector X be in the domain Map of the matrix.");
545  TEUCHOS_TEST_FOR_EXCEPTION
546  (! B.getMap ()->isSameAs (*rangeMap), std::runtime_error,
547  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
548  "B be in the range Map of the matrix.");
549  TEUCHOS_TEST_FOR_EXCEPTION
550  (! D.getMap ()->isSameAs (*rowMap), std::runtime_error,
551  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
552  "D be in the row Map of the matrix.");
553  TEUCHOS_TEST_FOR_EXCEPTION
554  (! rowMap->isSameAs (*rangeMap), std::runtime_error,
555  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the row Map and the "
556  "range Map be the same (in the sense of Tpetra::Map::isSameAs).");
557  TEUCHOS_TEST_FOR_EXCEPTION
558  (! domainMap->isSameAs (*rangeMap), std::runtime_error,
559  "Tpetra::CrsMatrix::gaussSeidelCopy requires that the domain Map and "
560  "the range Map of the matrix be the same.");
561  }
562 #else
563  // Forestall any compiler warnings for unused variables.
564  (void) rangeMap;
565  (void) rowMap;
566 #endif // HAVE_TEUCHOS_DEBUG
567 
568  // Fetch a (possibly cached) temporary column Map multivector
569  // X_colMap, and a domain Map view X_domainMap of it. Both have
570  // constant stride by construction. We know that the domain Map
571  // must include the column Map, because our Gauss-Seidel kernel
572  // requires that the row Map, domain Map, and range Map are all
573  // the same, and that each process owns all of its own diagonal
574  // entries of the matrix.
575 
576  RCP<OSMV> X_colMap;
577  RCP<OSMV> X_domainMap;
578  bool copyBackOutput = false;
579  if (importer.is_null ()) {
580  if (X.isConstantStride ()) {
581  X_colMap = rcpFromRef (X);
582  X_domainMap = rcpFromRef (X);
583  // No need to copy back to X at end.
584  }
585  else { // We must copy X into a constant stride multivector.
586  // Just use the cached column Map multivector for that.
587  X_colMap = getColumnMapMultiVector (X, true);
588  // X_domainMap is always a domain Map view of the column Map
589  // multivector. In this case, the domain and column Maps are
590  // the same, so X_domainMap _is_ X_colMap.
591  X_domainMap = X_colMap;
592  deep_copy (*X_domainMap, X); // Copy X into constant stride multivector
593  copyBackOutput = true; // Don't forget to copy back at end.
595  (! X.isConstantStride (), std::runtime_error,
596  "gaussSeidelCopy: The current implementation of the Gauss-Seidel "
597  "kernel requires that X and B both have constant stride. Since X "
598  "does not have constant stride, we had to make a copy. This is a "
599  "limitation of the current implementation and not your fault, but we "
600  "still report it as an efficiency warning for your information.");
601  }
602  }
603  else { // Column Map and domain Map are _not_ the same.
604  X_colMap = getColumnMapMultiVector (X);
605  X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0);
606 
607  // We could just copy X into X_domainMap. However, that wastes
608  // a copy, because the Import also does a copy (plus
609  // communication). Since the typical use case for Gauss-Seidel
610  // is a small number of sweeps (2 is typical), we don't want to
611  // waste that copy. Thus, we do the Import here, and skip the
612  // first Import in the first sweep. Importing directly from X
613  // effects the copy into X_domainMap (which is a view of
614  // X_colMap).
615  X_colMap->doImport (X, *importer, INSERT);
616 
617  copyBackOutput = true; // Don't forget to copy back at end.
618  }
619 
620  // The Gauss-Seidel / SOR kernel expects multivectors of constant
621  // stride. X_colMap is by construction, but B might not be. If
622  // it's not, we have to make a copy.
623  RCP<const OSMV> B_in;
624  if (B.isConstantStride ()) {
625  B_in = rcpFromRef (B);
626  }
627  else {
628  // Range Map and row Map are the same in this case, so we can
629  // use the cached row Map multivector to store a constant stride
630  // copy of B.
631  RCP<OSMV> B_in_nonconst = getRowMapMultiVector (B, true);
632  *B_in_nonconst = B;
633  B_in = rcp_const_cast<const OSMV> (B_in_nonconst);
634 
636  (! B.isConstantStride (), std::runtime_error,
637  "gaussSeidelCopy: The current implementation requires that B have "
638  "constant stride. Since B does not have constant stride, we had to "
639  "copy it into a separate constant-stride multivector. This is a "
640  "limitation of the current implementation and not your fault, but we "
641  "still report it as an efficiency warning for your information.");
642  }
643 
644  for (int sweep = 0; sweep < numSweeps; ++sweep) {
645  if (! importer.is_null () && sweep > 0) {
646  // We already did the first Import for the zeroth sweep above.
647  X_colMap->doImport (*X_domainMap, *importer, INSERT);
648  }
649 
650  // Do local Gauss-Seidel.
651  if (direction != Symmetric) {
652  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
653  dampingFactor,
654  localDirection);
655  }
656  else { // direction == Symmetric
657  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
658  dampingFactor,
659  KokkosClassic::Forward);
660  // Communicate again before the Backward sweep, if necessary.
661  if (! importer.is_null ()) {
662  X_colMap->doImport (*X_domainMap, *importer, INSERT);
663  }
664  matrix_->template localGaussSeidel<OS,OS> (*B_in, *X_colMap, D,
665  dampingFactor,
666  KokkosClassic::Backward);
667  }
668  }
669 
670  if (copyBackOutput) {
671  deep_copy (X, *X_domainMap); // Copy result back into X.
672  }
673  }
674 
680  bool hasTransposeApply() const {
681  return true;
682  }
683 
685  Teuchos::RCP<const map_type> getDomainMap () const {
686  return matrix_->getDomainMap ();
687  }
688 
690  Teuchos::RCP<const map_type> getRangeMap () const {
691  return matrix_->getRangeMap ();
692  }
693 
695 
696  protected:
698 
700  const Teuchos::RCP<const crs_matrix_type> matrix_;
701 
714  mutable Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > importMV_;
715 
728  mutable Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > exportMV_;
729 
730 #ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_PROFILING
731  Teuchos::RCP<Teuchos::Time> importTimer_, exportTimer_;
732 #endif
733 
736  void
739  Teuchos::ETransp mode,
740  Scalar alpha,
741  Scalar beta) const
742  {
743  typedef Teuchos::ScalarTraits<Scalar> ST;
744  using Teuchos::null;
745 
746  int myImageID = Teuchos::rank(*matrix_->getComm());
747 #ifdef TPETRA_CRSMATRIX_MULTIPLY_DUMP
748  Teuchos::RCP<Teuchos::FancyOStream> out = Teuchos::VerboseObjectBase::getDefaultOStream();
749  if (myImageID == 0) {
750  *out << "Entering CrsMatrixMultiplyOp::applyTranspose()" << std::endl
751  << "Column Map: " << std::endl;
752  }
753  *out << matrix_->getColMap() << std::endl;
754  if (myImageID == 0) {
755  *out << "Initial input: " << std::endl;
756  }
757  X_in.describe(*out,Teuchos::VERB_EXTREME);
758 #endif
759 
760  const size_t numVectors = X_in.getNumVectors();
761  // because of Views, it is difficult to determine if X and Y point to the same data.
762  // however, if they reference the exact same object, we will do the user the favor of copying X into new storage (with a warning)
763  // we ony need to do this if we have trivial importers; otherwise, we don't actually apply the operator from X into Y
764  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > importer = matrix_->getGraph()->getImporter();
765  Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Node> > exporter = matrix_->getGraph()->getExporter();
766  // access X indirectly, in case we need to create temporary storage
767  Teuchos::RCP<const MV> X;
768 
769  // some parameters for below
770  const bool Y_is_replicated = !Y_in.isDistributed(),
771  Y_is_overwritten = (beta == ST::zero());
772  if (Y_is_replicated && myImageID > 0) {
773  beta = ST::zero();
774  }
775 
776  // currently, cannot multiply from multivector of non-constant stride
777  if (X_in.isConstantStride() == false && importer==null) {
778  // generate a strided copy of X_in
779  X = Teuchos::rcp(new MV(X_in));
780 #ifdef TPETRA_CRSMATRIX_MULTIPLY_DUMP
781  if (myImageID == 0) *out << "X is not constant stride, duplicating X results in a strided copy" << std::endl;
782  X->describe(*out,Teuchos::VERB_EXTREME);
783 #endif
784  }
785  else {
786  // just temporary, so this non-owning RCP is okay
787  X = Teuchos::rcp(&X_in, false);
788  }
789 
790  // set up import/export temporary multivectors
791  if (importer != null) {
792  if (importMV_ != null && importMV_->getNumVectors() != numVectors) importMV_ = null;
793  if (importMV_ == null) {
794  importMV_ = Teuchos::rcp( new MV(matrix_->getColMap(),numVectors) );
795  }
796  }
797  if (exporter != null) {
798  if (exportMV_ != null && exportMV_->getNumVectors() != numVectors) exportMV_ = null;
799  if (exportMV_ == null) {
800  exportMV_ = Teuchos::rcp( new MV(matrix_->getRowMap(),numVectors) );
801  }
802  }
803 
804  // If we have a non-trivial exporter, we must import elements that are permuted or are on other processors
805  if (exporter != null) {
806  {
807 #ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_PROFILING
808  Teuchos::TimeMonitor lcltimer(*importTimer_);
809 #endif
810  exportMV_->doImport(X_in,*exporter,INSERT);
811  }
812  // multiply out of exportMV_
813  X = exportMV_;
814 #ifdef TPETRA_CRSMATRIX_MULTIPLY_DUMP
815  if (myImageID == 0) {
816  *out << "Performed import of X using exporter..." << std::endl;
817  }
818  X->describe(*out,Teuchos::VERB_EXTREME);
819 #endif
820  }
821 
822 
823  // If we have a non-trivial importer, we must export elements that are permuted or belong to other processors
824  // We will compute solution into the to-be-exported MV; get a view
825  if (importer != null) {
826  // Do actual computation
827  matrix_->template localMultiply<Scalar, Scalar>(*X, *importMV_, mode, alpha, ST::zero());
828 #ifdef TPETRA_CRSMATRIX_MULTIPLY_DUMP
829  if (myImageID == 0) *out << "Import vector after localMultiply()..." << std::endl;
830  importMV_->describe(*out,Teuchos::VERB_EXTREME);
831 #endif
832  if (Y_is_overwritten) Y_in.putScalar(ST::zero());
833  else Y_in.scale(beta);
834  //
835  {
836 #ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_PROFILING
837  Teuchos::TimeMonitor lcltimer(*importTimer_);
838 #endif
839  Y_in.doExport(*importMV_,*importer,ADD);
840  }
841  }
842  // otherwise, multiply into Y
843  else {
844  // can't multiply in-situ; can't multiply into non-strided multivector
845  if (Y_in.isConstantStride() == false || X.getRawPtr() == &Y_in) {
846  // generate a strided copy of Y
847  MV Y(Y_in);
848  matrix_->template localMultiply<Scalar, Scalar>(*X, Y, mode, alpha, beta);
849  deep_copy (Y_in, Y);
850  }
851  else {
852  matrix_->template localMultiply<Scalar, Scalar>(*X, Y_in, mode, alpha, beta);
853  }
854  }
855 #ifdef TPETRA_CRSMATRIX_MULTIPLY_DUMP
856  if (myImageID == 0) *out << "Y_in vector after local multiply/export..." << std::endl;
857  Y_in.describe(*out,Teuchos::VERB_EXTREME);
858 #endif
859  // Handle case of rangemap being a local replicated map: in this case, sum contributions from each processor
860  if (Y_is_replicated) {
861  Y_in.reduce();
862 #ifdef TPETRA_CRSMATRIX_MULTIPLY_DUMP
863  if (myImageID == 0) *out << "Output vector is local; result after reduce()..." << std::endl;
864  Y_in.describe(*out,Teuchos::VERB_EXTREME);
865 #endif
866  }
867  }
868 
870  void
873  Scalar alpha,
874  Scalar beta) const
875  {
876  using Teuchos::null;
877  using Teuchos::RCP;
878  using Teuchos::rcp;
879  using Teuchos::rcp_const_cast;
880  using Teuchos::rcpFromRef;
881  typedef Export<LocalOrdinal,GlobalOrdinal,Node> export_type;
882  typedef Import<LocalOrdinal,GlobalOrdinal,Node> import_type;
883  typedef Teuchos::ScalarTraits<Scalar> STS;
884 
885  const int myImageID = matrix_->getComm ()->getRank ();
886 
887 #ifdef TPETRA_CRSMATRIX_MULTIPLY_DUMP
888  RCP<Teuchos::FancyOStream> out = Teuchos::VerboseObjectBase::getDefaultOStream();
889  if (myImageID == 0) {
890  *out << "Entering CrsMatrixMultiplyOp::applyNonTranspose()" << std::endl
891  << "Column Map: " << std::endl;
892  }
893  *out << matrix_->getColMap() << std::endl;
894  if (myImageID == 0) {
895  *out << "Initial input: " << std::endl;
896  }
897  X_in.describe (*out, Teuchos::VERB_EXTREME);
898 #endif // TPETRA_CRSMATRIX_MULTIPLY_DUMP
899 
900  // because of Views, it is difficult to determine if X and Y point to the same data.
901  // however, if they reference the exact same object, we will do the user the favor of copying X into new storage (with a warning)
902  // we ony need to do this if we have trivial importers; otherwise, we don't actually apply the operator from X into Y
903  RCP<const import_type> importer = matrix_->getGraph()->getImporter();
904  RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
905 
906  // If beta == 0, then the output MV will be overwritten; none of
907  // its entries should be read. (Sparse BLAS semantics say that we
908  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
909  // This matters if we need to do an Export operation; see below.
910  const bool Y_is_overwritten = (beta == STS::zero());
911 
912  // We treat the case of a replicated MV output specially.
913  const bool Y_is_replicated = ! Y_in.isDistributed ();
914 
915  // This is part of the "hack" for replicated MV output. We'll let
916  // each process do its thing, but do an all-reduce at the end to
917  // sum up the results. Setting beta=0 on all processes but Proc 0
918  // makes the math work out for the all-reduce. (This assumes that
919  // the replicated data is correctly replicated, so that the data
920  // are the same on all processes.)
921  if (Y_is_replicated && myImageID > 0) {
922  beta = STS::zero();
923  }
924 
925  // Temporary MV for Import operation. After the block of code
926  // below, this will be an (Imported if necessary) column Map MV
927  // ready to give to localMultiply().
928  RCP<const MV> X_colMap;
929  if (importer.is_null ()) {
930  if (! X_in.isConstantStride ()) {
931  // Not all sparse mat-vec kernels can handle an input MV with
932  // nonconstant stride correctly, so we have to copy it in that
933  // case into a constant stride MV. To make a constant stride
934  // copy of X_in, we force creation of the column (== domain)
935  // Map MV (if it hasn't already been created, else fetch the
936  // cached copy). This avoids creating a new MV each time.
937 
938  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
939  *X_colMapNonConst = X_in; // MV assignment just copies the data.
940  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
941  }
942  else {
943  // The domain and column Maps are the same, so do the local
944  // multiply using the domain Map input MV X_in.
945  X_colMap = rcpFromRef (X_in);
946  }
947  }
948  else {
949  // We're doing an Import anyway, which will copy the relevant
950  // elements of the domain Map MV X_in into a separate column Map
951  // MV. Thus, we don't have to worry whether X_in is constant
952  // stride.
953  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
954 
955  // Import from the domain Map MV to the column Map MV.
956  {
957 #ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_PROFILING
958  Teuchos::TimeMonitor lcltimer (*importTimer_);
959 #endif
960  X_colMapNonConst->doImport (X_in, *importer, INSERT);
961  }
962  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
963  }
964 
965  // Temporary MV for Export operation, or for copying a nonconstant
966  // stride output MV into a constant stride MV.
967  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
968 
969  // If we have a nontrivial Export object, we must perform an
970  // Export. In that case, the local multiply result will go into
971  // the row Map multivector. We don't have to make a
972  // constant-stride version of Y_in in this case, because we had to
973  // make a constant stride Y_rowMap MV and do an Export anyway.
974  if (! exporter.is_null ()) {
975  matrix_->template localMultiply<Scalar, Scalar> (*X_colMap, *Y_rowMap,
976  Teuchos::NO_TRANS,
977  alpha, STS::zero());
978  // If we're overwriting the output MV Y_in completely (beta ==
979  // 0), then make sure that it is filled with zeros before we do
980  // the Export. Otherwise, the ADD combine mode will use data in
981  // Y_in, which is supposed to be zero.
982  if (Y_is_overwritten) {
983  Y_in.putScalar (STS::zero());
984  }
985  else {
986  // Scale the output MV by beta, so that the Export sums in the
987  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
988  Y_in.scale (beta);
989  }
990  // Do the Export operation.
991  {
992 #ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_PROFILING
993  Teuchos::TimeMonitor lcltimer (*exportTimer_);
994 #endif
995  Y_in.doExport (*Y_rowMap, *exporter, ADD);
996  }
997  }
998  else { // Don't do an Export: row Map and range Map are the same.
999  //
1000  // If Y_in does not have constant stride, or if the column Map
1001  // MV aliases Y_in, then we can't let the kernel write directly
1002  // to Y_in. Instead, we have to use the cached row (== range)
1003  // Map MV as temporary storage.
1004  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
1005  // Force creating the MV if it hasn't been created already.
1006  // This will reuse a previously created cached MV.
1007  Y_rowMap = getRowMapMultiVector (Y_in, true);
1008 
1009  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
1010  // since we're overwriting it anyway.
1011  if (beta != STS::zero ()) {
1012  deep_copy (*Y_rowMap, Y_in);
1013  }
1014  matrix_->template localMultiply<Scalar, Scalar> (*X_colMap,
1015  *Y_rowMap,
1016  Teuchos::NO_TRANS,
1017  alpha, beta);
1018  deep_copy (Y_in, *Y_rowMap);
1019  }
1020  else {
1021  matrix_->template localMultiply<Scalar, Scalar> (*X_colMap, Y_in,
1022  Teuchos::NO_TRANS,
1023  alpha, beta);
1024  }
1025  }
1026 #ifdef TPETRA_CRSMATRIX_MULTIPLY_DUMP
1027  if (myImageID == 0) {
1028  *out << "Result Y_in after localMultiply and Export:" << std::endl;
1029  }
1030  Y_in.describe (*out, Teuchos::VERB_EXTREME);
1031 #endif // TPETRA_CRSMATRIX_MULTIPLY_DUMP
1032 
1033  // If the range Map is a locally replicated Map, sum up
1034  // contributions from each process. We set beta = 0 on all
1035  // processes but Proc 0 initially, so this will handle the scaling
1036  // factor beta correctly.
1037  if (Y_is_replicated) {
1038  Y_in.reduce ();
1039 #ifdef TPETRA_CRSMATRIX_MULTIPLY_DUMP
1040  if (myImageID == 0) {
1041  *out << "Result Y_in after reduce:" << std::endl;
1042  }
1043  Y_in.describe (*out, Teuchos::VERB_EXTREME);
1044 #endif // TPETRA_CRSMATRIX_MULTIPLY_DUMP
1045  }
1046 
1047  }
1048 
1049  private:
1069  Teuchos::RCP<MV>
1070  getColumnMapMultiVector (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& X_domainMap,
1071  const bool force = false) const
1072  {
1073  using Teuchos::null;
1074  using Teuchos::RCP;
1075  using Teuchos::rcp;
1076  typedef Import<LocalOrdinal,GlobalOrdinal,Node> import_type;
1078 
1079  const size_t numVecs = X_domainMap.getNumVectors ();
1080  RCP<const import_type> importer = matrix_->getGraph ()->getImporter ();
1081  RCP<const map_type> colMap = matrix_->getColMap ();
1082 
1083  RCP<MV> X_colMap; // null by default
1084 
1085  // If the Import object is trivial (null), then we don't need a
1086  // separate column Map multivector. Just return null in that
1087  // case. The caller is responsible for knowing not to use the
1088  // returned null pointer.
1089  //
1090  // If the Import is nontrivial, then we do need a separate
1091  // column Map multivector for the Import operation. Check in
1092  // that case if we have to (re)create the column Map
1093  // multivector.
1094  if (! importer.is_null () || force) {
1095  if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
1096  X_colMap = rcp (new MV (colMap, numVecs));
1097 
1098  // Cache the newly created multivector for later reuse.
1099  importMV_ = X_colMap;
1100  }
1101  else { // Yay, we can reuse the cached multivector!
1102  X_colMap = importMV_;
1103  // mfh 09 Jan 2013: We don't have to fill with zeros first,
1104  // because the Import uses INSERT combine mode, which overwrites
1105  // existing entries.
1106  //
1107  //X_colMap->putScalar (STS::zero ());
1108  }
1109  }
1110  return X_colMap;
1111  }
1112 
1134  Teuchos::RCP<MV>
1135  getRowMapMultiVector (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Y_rangeMap,
1136  const bool force = false) const
1137  {
1138  using Teuchos::null;
1139  using Teuchos::RCP;
1140  using Teuchos::rcp;
1141  typedef Export<LocalOrdinal,GlobalOrdinal,Node> export_type;
1143 
1144  const size_t numVecs = Y_rangeMap.getNumVectors ();
1145  RCP<const export_type> exporter = matrix_->getGraph ()->getExporter ();
1146  RCP<const map_type> rowMap = matrix_->getRowMap ();
1147 
1148  RCP<MV> Y_rowMap; // null by default
1149 
1150  // If the Export object is trivial (null), then we don't need a
1151  // separate row Map multivector. Just return null in that case.
1152  // The caller is responsible for knowing not to use the returned
1153  // null pointer.
1154  //
1155  // If the Export is nontrivial, then we do need a separate row
1156  // Map multivector for the Export operation. Check in that case
1157  // if we have to (re)create the row Map multivector.
1158  if (! exporter.is_null () || force) {
1159  if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
1160  Y_rowMap = rcp (new MV (rowMap, numVecs));
1161 
1162  // Cache the newly created multivector for later reuse.
1163  exportMV_ = Y_rowMap;
1164  }
1165  else { // Yay, we can reuse the cached multivector!
1166  Y_rowMap = exportMV_;
1167  }
1168  }
1169  return Y_rowMap;
1170  }
1171  };
1172 
1180  template <class OpScalar,
1181  class MatScalar,
1182  class LocalOrdinal,
1183  class GlobalOrdinal,
1184  class Node>
1185  Teuchos::RCP<
1187  createCrsMatrixMultiplyOp (const Teuchos::RCP<
1189  {
1190  typedef CrsMatrixMultiplyOp<OpScalar, MatScalar, LocalOrdinal,
1191  GlobalOrdinal, Node> op_type;
1192  return Teuchos::rcp (new op_type (A));
1193  }
1194 
1195 } // end of namespace Tpetra
1196 
1197 #endif // TPETRA_CRSMATRIXMULTIPLYOP_HPP
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute Y = beta*Y + alpha*Op(A)*X, where Op(A) is either A, , or .
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
CrsMatrix< MatScalar, LocalOrdinal, GlobalOrdinal, Node > crs_matrix_type
The specialization of CrsMatrix which this class wraps.
Teuchos::RCP< const map_type > getDomainMap() const
The domain Map of this Operator.
KokkosClassic::DefaultNode::DefaultNodeType node_type
Default value of Node template parameter.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > exportMV_
Row Map MultiVector used in apply().
One or more distributed dense vectors.
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
int local_ordinal_type
Default value of LocalOrdinal template parameter.
void reduce()
Sum values of a locally replicated multivector across all processes.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > importMV_
Column Map MultiVector used in apply().
bool hasTransposeApply() const
Whether this Operator&#39;s apply() method can apply the transpose or conjugate transpose.
CrsMatrixMultiplyOp(const Teuchos::RCP< const crs_matrix_type > &A)
Constructor.
Insert new values that don&#39;t currently exist.
Abstract interface for operators (e.g., matrices and preconditioners).
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object with the given verbosity level to a FancyOStream.
const Teuchos::RCP< const crs_matrix_type > matrix_
The underlying CrsMatrix object.
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
void applyTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Apply the transpose or conjugate transpose of the matrix to X_in, producing Y_in. ...
Sum new values into existing values.
Map< LocalOrdinal, GlobalOrdinal, Node > map_type
The specialization of Map which this class uses.
bool isDistributed() const
Whether this is a globally distributed object.
#define TPETRA_EFFICIENCY_WARNING(throw_exception_test, Exception, msg)
Print or throw an efficency warning.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
Teuchos::RCP< CrsMatrixMultiplyOp< OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node > > createCrsMatrixMultiplyOp(const Teuchos::RCP< const CrsMatrix< MatScalar, LocalOrdinal, GlobalOrdinal, Node > > &A)
Non-member function to create a CrsMatrixMultiplyOp.
void applyNonTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Scalar alpha, Scalar beta) const
Apply the matrix (not its transpose) to X_in, producing Y_in.
Describes a parallel distribution of objects over processes.
A class for wrapping a CrsMatrix multiply in a Operator.
Stand-alone utility functions and macros.
size_t getNumVectors() const
Number of columns in the multivector.
Teuchos::RCP< const map_type > getRangeMap() const
The range Map of this Operator.
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, CombineMode CM)
Export data into this object using an Export object ("forward mode").
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
void gaussSeidel(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps) const
"Hybrid" Jacobi + (Gauss-Seidel or SOR) on .
virtual ~CrsMatrixMultiplyOp()
Destructor (virtual for memory safety of derived classes).
void gaussSeidelCopy(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps) const
Version of gaussSeidel(), with fewer requirements on X.