Ifpack2 Templated Preconditioning Package  Version 1.0
Ifpack2_Relaxation_def.hpp
1 /*@HEADER
2 // ***********************************************************************
3 //
4 // Ifpack2: Tempated Object-Oriented Algebraic Preconditioner Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 //@HEADER
41 */
42 
43 #ifndef IFPACK2_RELAXATION_DEF_HPP
44 #define IFPACK2_RELAXATION_DEF_HPP
45 
46 #include <Teuchos_StandardParameterEntryValidators.hpp>
47 #include <Teuchos_TimeMonitor.hpp>
48 #include <Tpetra_ConfigDefs.hpp>
49 #include <Tpetra_CrsMatrix.hpp>
50 #include <Tpetra_Experimental_BlockCrsMatrix.hpp>
51 #include <Ifpack2_Utilities.hpp>
52 #include <Ifpack2_Relaxation_decl.hpp>
53 
54 // mfh 28 Mar 2013: Uncomment out these three lines to compute
55 // statistics on diagonal entries in compute().
56 // #ifndef IFPACK2_RELAXATION_COMPUTE_DIAGONAL_STATS
57 // # define IFPACK2_RELAXATION_COMPUTE_DIAGONAL_STATS 1
58 // #endif // IFPACK2_RELAXATION_COMPUTE_DIAGONAL_STATS
59 
60 namespace {
61  // Validate that a given int is nonnegative.
62  class NonnegativeIntValidator : public Teuchos::ParameterEntryValidator {
63  public:
64  // Constructor (does nothing).
65  NonnegativeIntValidator () {}
66 
67  // ParameterEntryValidator wants this method.
68  Teuchos::ParameterEntryValidator::ValidStringsList validStringValues () const {
69  return Teuchos::null;
70  }
71 
72  // Actually validate the parameter's value.
73  void
74  validate (const Teuchos::ParameterEntry& entry,
75  const std::string& paramName,
76  const std::string& sublistName) const
77  {
78  using std::endl;
79  Teuchos::any anyVal = entry.getAny (true);
80  const std::string entryName = entry.getAny (false).typeName ();
81 
82  TEUCHOS_TEST_FOR_EXCEPTION(
83  anyVal.type () != typeid (int),
84  Teuchos::Exceptions::InvalidParameterType,
85  "Parameter \"" << paramName << "\" in sublist \"" << sublistName
86  << "\" has the wrong type." << endl << "Parameter: " << paramName
87  << endl << "Type specified: " << entryName << endl
88  << "Type required: int" << endl);
89 
90  const int val = Teuchos::any_cast<int> (anyVal);
91  TEUCHOS_TEST_FOR_EXCEPTION(
92  val < 0, Teuchos::Exceptions::InvalidParameterValue,
93  "Parameter \"" << paramName << "\" in sublist \"" << sublistName
94  << "\" is negative." << endl << "Parameter: " << paramName
95  << endl << "Value specified: " << val << endl
96  << "Required range: [0, INT_MAX]" << endl);
97  }
98 
99  // ParameterEntryValidator wants this method.
100  const std::string getXMLTypeName () const {
101  return "NonnegativeIntValidator";
102  }
103 
104  // ParameterEntryValidator wants this method.
105  void
106  printDoc (const std::string& docString,
107  std::ostream &out) const
108  {
109  Teuchos::StrUtils::printLines (out, "# ", docString);
110  out << "#\tValidator Used: " << std::endl;
111  out << "#\t\tNonnegativeIntValidator" << std::endl;
112  }
113  };
114 
115  // A way to get a small positive number (eps() for floating-point
116  // types, or 1 for integer types) when Teuchos::ScalarTraits doesn't
117  // define it (for example, for integer values).
118  template<class Scalar, const bool isOrdinal=Teuchos::ScalarTraits<Scalar>::isOrdinal>
119  class SmallTraits {
120  public:
121  // Return eps if Scalar is a floating-point type, else return 1.
122  static const Scalar eps ();
123  };
124 
125  // Partial specialization for when Scalar is not a floating-point type.
126  template<class Scalar>
127  class SmallTraits<Scalar, true> {
128  public:
129  static const Scalar eps () {
130  return Teuchos::ScalarTraits<Scalar>::one ();
131  }
132  };
133 
134  // Partial specialization for when Scalar is a floating-point type.
135  template<class Scalar>
136  class SmallTraits<Scalar, false> {
137  public:
138  static const Scalar eps () {
139  return Teuchos::ScalarTraits<Scalar>::eps ();
140  }
141  };
142 } // namespace (anonymous)
143 
144 namespace Ifpack2 {
145 
146 template<class MatrixType>
147 void Relaxation<MatrixType>::
148 setMatrix (const Teuchos::RCP<const row_matrix_type>& A)
149 {
150  if (A.getRawPtr () != A_.getRawPtr ()) { // it's a different matrix
151  Importer_ = Teuchos::null;
152  Diagonal_ = Teuchos::null; // ??? what if this comes from the user???
153  isInitialized_ = false;
154  IsComputed_ = false;
155  diagOffsets_ = Teuchos::null;
156  savedDiagOffsets_ = false;
157  hasBlockCrsMatrix_ = false;
158  if (! A.is_null ()) {
159  IsParallel_ = (A->getRowMap ()->getComm ()->getSize () > 1);
160  }
161  A_ = A;
162  }
163 }
164 
165 
166 template<class MatrixType>
168 Relaxation (const Teuchos::RCP<const row_matrix_type>& A)
169 : A_ (A),
170  Time_ (Teuchos::rcp (new Teuchos::Time ("Ifpack2::Relaxation"))),
171  NumSweeps_ (1),
172  PrecType_ (Ifpack2::Details::JACOBI),
173  DampingFactor_ (STS::one ()),
174  IsParallel_ ((A.is_null () || A->getRowMap ().is_null () || A->getRowMap ()->getComm ().is_null ()) ?
175  false : // a reasonable default if there's no communicator
176  A->getRowMap ()->getComm ()->getSize () > 1),
177  ZeroStartingSolution_ (true),
178  DoBackwardGS_ (false),
179  DoL1Method_ (false),
180  L1Eta_ (Teuchos::as<magnitude_type> (1.5)),
181  MinDiagonalValue_ (STS::zero ()),
182  fixTinyDiagEntries_ (false),
183  checkDiagEntries_ (false),
184  isInitialized_ (false),
185  IsComputed_ (false),
186  NumInitialize_ (0),
187  NumCompute_ (0),
188  NumApply_ (0),
189  InitializeTime_ (0.0), // Times are double anyway, so no need for ScalarTraits.
190  ComputeTime_ (0.0),
191  ApplyTime_ (0.0),
192  ComputeFlops_ (0.0),
193  ApplyFlops_ (0.0),
194  globalMinMagDiagEntryMag_ (STM::zero ()),
195  globalMaxMagDiagEntryMag_ (STM::zero ()),
196  globalNumSmallDiagEntries_ (0),
197  globalNumZeroDiagEntries_ (0),
198  globalNumNegDiagEntries_ (0),
199  globalDiagNormDiff_(Teuchos::ScalarTraits<magnitude_type>::zero()),
200  savedDiagOffsets_ (false),
201  hasBlockCrsMatrix_ (false)
202 {
203  this->setObjectLabel ("Ifpack2::Relaxation");
204 }
205 
206 //==========================================================================
207 template<class MatrixType>
209 }
210 
211 template<class MatrixType>
212 Teuchos::RCP<const Teuchos::ParameterList>
214 {
215  using Teuchos::Array;
216  using Teuchos::ParameterList;
217  using Teuchos::parameterList;
218  using Teuchos::RCP;
219  using Teuchos::rcp;
220  using Teuchos::rcp_const_cast;
221  using Teuchos::rcp_implicit_cast;
222  using Teuchos::setStringToIntegralParameter;
223  typedef Teuchos::ParameterEntryValidator PEV;
224 
225  if (validParams_.is_null ()) {
226  RCP<ParameterList> pl = parameterList ("Ifpack2::Relaxation");
227 
228  // Set a validator that automatically converts from the valid
229  // string options to their enum values.
230  Array<std::string> precTypes (3);
231  precTypes[0] = "Jacobi";
232  precTypes[1] = "Gauss-Seidel";
233  precTypes[2] = "Symmetric Gauss-Seidel";
234  Array<Details::RelaxationType> precTypeEnums (3);
235  precTypeEnums[0] = Details::JACOBI;
236  precTypeEnums[1] = Details::GS;
237  precTypeEnums[2] = Details::SGS;
238  const std::string defaultPrecType ("Jacobi");
239  setStringToIntegralParameter<Details::RelaxationType> ("relaxation: type",
240  defaultPrecType, "Relaxation method", precTypes (), precTypeEnums (),
241  pl.getRawPtr ());
242 
243  const int numSweeps = 1;
244  RCP<PEV> numSweepsValidator =
245  rcp_implicit_cast<PEV> (rcp (new NonnegativeIntValidator));
246  pl->set ("relaxation: sweeps", numSweeps, "Number of relaxation sweeps",
247  rcp_const_cast<const PEV> (numSweepsValidator));
248 
249  const scalar_type dampingFactor = STS::one ();
250  pl->set ("relaxation: damping factor", dampingFactor);
251 
252  const bool zeroStartingSolution = true;
253  pl->set ("relaxation: zero starting solution", zeroStartingSolution);
254 
255  const bool doBackwardGS = false;
256  pl->set ("relaxation: backward mode", doBackwardGS);
257 
258  const bool doL1Method = false;
259  pl->set ("relaxation: use l1", doL1Method);
260 
261  const magnitude_type l1eta = (STM::one() + STM::one() + STM::one()) /
262  (STM::one() + STM::one()); // 1.5
263  pl->set ("relaxation: l1 eta", l1eta);
264 
265  const scalar_type minDiagonalValue = STS::zero ();
266  pl->set ("relaxation: min diagonal value", minDiagonalValue);
267 
268  const bool fixTinyDiagEntries = false;
269  pl->set ("relaxation: fix tiny diagonal entries", fixTinyDiagEntries);
270 
271  const bool checkDiagEntries = false;
272  pl->set ("relaxation: check diagonal entries", checkDiagEntries);
273 
274  Teuchos::ArrayRCP<local_ordinal_type> localSmoothingIndices = Teuchos::null;
275  pl->set("relaxation: local smoothing indices", localSmoothingIndices);
276 
277  validParams_ = rcp_const_cast<const ParameterList> (pl);
278  }
279  return validParams_;
280 }
281 
282 
283 template<class MatrixType>
284 void Relaxation<MatrixType>::setParametersImpl (Teuchos::ParameterList& pl)
285 {
286  using Teuchos::getIntegralValue;
287  using Teuchos::ParameterList;
288  using Teuchos::RCP;
289  typedef scalar_type ST; // just to make code below shorter
290 
291  pl.validateParametersAndSetDefaults (* getValidParameters ());
292 
293  const Details::RelaxationType precType =
294  getIntegralValue<Details::RelaxationType> (pl, "relaxation: type");
295  const int numSweeps = pl.get<int> ("relaxation: sweeps");
296  const ST dampingFactor = pl.get<ST> ("relaxation: damping factor");
297  const bool zeroStartSol = pl.get<bool> ("relaxation: zero starting solution");
298  const bool doBackwardGS = pl.get<bool> ("relaxation: backward mode");
299  const bool doL1Method = pl.get<bool> ("relaxation: use l1");
300  const magnitude_type l1Eta = pl.get<magnitude_type> ("relaxation: l1 eta");
301  const ST minDiagonalValue = pl.get<ST> ("relaxation: min diagonal value");
302  const bool fixTinyDiagEntries = pl.get<bool> ("relaxation: fix tiny diagonal entries");
303  const bool checkDiagEntries = pl.get<bool> ("relaxation: check diagonal entries");
304  Teuchos::ArrayRCP<local_ordinal_type> localSmoothingIndices = pl.get<Teuchos::ArrayRCP<local_ordinal_type> >("relaxation: local smoothing indices");
305 
306 
307  // "Commit" the changes, now that we've validated everything.
308  PrecType_ = precType;
309  NumSweeps_ = numSweeps;
310  DampingFactor_ = dampingFactor;
311  ZeroStartingSolution_ = zeroStartSol;
312  DoBackwardGS_ = doBackwardGS;
313  DoL1Method_ = doL1Method;
314  L1Eta_ = l1Eta;
315  MinDiagonalValue_ = minDiagonalValue;
316  fixTinyDiagEntries_ = fixTinyDiagEntries;
317  checkDiagEntries_ = checkDiagEntries;
318  localSmoothingIndices_ = localSmoothingIndices;
319 }
320 
321 
322 template<class MatrixType>
323 void Relaxation<MatrixType>::setParameters (const Teuchos::ParameterList& pl)
324 {
325  // FIXME (aprokop 18 Oct 2013) Casting away const is bad here.
326  // but otherwise, we will get [unused] in pl
327  this->setParametersImpl(const_cast<Teuchos::ParameterList&>(pl));
328 }
329 
330 
331 template<class MatrixType>
332 Teuchos::RCP<const Teuchos::Comm<int> >
334  TEUCHOS_TEST_FOR_EXCEPTION(
335  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::getComm: "
336  "The input matrix A is null. Please call setMatrix() with a nonnull "
337  "input matrix before calling this method.");
338  return A_->getRowMap ()->getComm ();
339 }
340 
341 
342 template<class MatrixType>
343 Teuchos::RCP<const typename Relaxation<MatrixType>::row_matrix_type>
345  return A_;
346 }
347 
348 
349 template<class MatrixType>
350 Teuchos::RCP<const Tpetra::Map<typename MatrixType::local_ordinal_type,
351  typename MatrixType::global_ordinal_type,
352  typename MatrixType::node_type> >
354  TEUCHOS_TEST_FOR_EXCEPTION(
355  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::getDomainMap: "
356  "The input matrix A is null. Please call setMatrix() with a nonnull "
357  "input matrix before calling this method.");
358  return A_->getDomainMap ();
359 }
360 
361 
362 template<class MatrixType>
363 Teuchos::RCP<const Tpetra::Map<typename MatrixType::local_ordinal_type,
364  typename MatrixType::global_ordinal_type,
365  typename MatrixType::node_type> >
367  TEUCHOS_TEST_FOR_EXCEPTION(
368  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::getRangeMap: "
369  "The input matrix A is null. Please call setMatrix() with a nonnull "
370  "input matrix before calling this method.");
371  return A_->getRangeMap ();
372 }
373 
374 
375 template<class MatrixType>
377  return true;
378 }
379 
380 
381 template<class MatrixType>
383  return(NumInitialize_);
384 }
385 
386 
387 template<class MatrixType>
389  return(NumCompute_);
390 }
391 
392 
393 template<class MatrixType>
395  return(NumApply_);
396 }
397 
398 
399 template<class MatrixType>
401  return(InitializeTime_);
402 }
403 
404 
405 template<class MatrixType>
407  return(ComputeTime_);
408 }
409 
410 
411 template<class MatrixType>
413  return(ApplyTime_);
414 }
415 
416 
417 template<class MatrixType>
419  return(ComputeFlops_);
420 }
421 
422 
423 template<class MatrixType>
425  return(ApplyFlops_);
426 }
427 
428 
429 template<class MatrixType>
430 void
432 apply (const Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& X,
433  Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& Y,
434  Teuchos::ETransp mode,
435  scalar_type alpha,
436  scalar_type beta) const
437 {
438  using Teuchos::as;
439  using Teuchos::RCP;
440  using Teuchos::rcp;
441  using Teuchos::rcpFromRef;
442  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type,
444  TEUCHOS_TEST_FOR_EXCEPTION(
445  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::apply: "
446  "The input matrix A is null. Please call setMatrix() with a nonnull "
447  "input matrix, then call compute(), before calling this method.");
448  TEUCHOS_TEST_FOR_EXCEPTION(
449  ! isComputed (),
450  std::runtime_error,
451  "Ifpack2::Relaxation::apply: You must call compute() on this Ifpack2 "
452  "preconditioner instance before you may call apply(). You may call "
453  "isComputed() to find out if compute() has been called already.");
454  TEUCHOS_TEST_FOR_EXCEPTION(
455  X.getNumVectors() != Y.getNumVectors(),
456  std::runtime_error,
457  "Ifpack2::Relaxation::apply: X and Y have different numbers of columns. "
458  "X has " << X.getNumVectors() << " columns, but Y has "
459  << Y.getNumVectors() << " columns.");
460  TEUCHOS_TEST_FOR_EXCEPTION(
461  beta != STS::zero (), std::logic_error,
462  "Ifpack2::Relaxation::apply: beta = " << beta << " != 0 case not "
463  "implemented.");
464  {
465  // Reset the timer each time, since Relaxation uses the same Time
466  // object to track times for different methods.
467  Teuchos::TimeMonitor timeMon (*Time_, true);
468 
469  // Special case: alpha == 0.
470  if (alpha == STS::zero ()) {
471  // No floating-point operations, so no need to update a count.
472  Y.putScalar (STS::zero ());
473  }
474  else {
475  // If X and Y alias one another, then we need to create an
476  // auxiliary vector, Xcopy (a deep copy of X).
477  RCP<const MV> Xcopy;
478  // FIXME (mfh 12 Sep 2014) This test for aliasing is incomplete.
479  {
480  auto X_lcl_host = X.template getLocalView<Kokkos::HostSpace> ();
481  auto Y_lcl_host = Y.template getLocalView<Kokkos::HostSpace> ();
482  if (X_lcl_host.ptr_on_device () == Y_lcl_host.ptr_on_device ()) {
483  Xcopy = rcp (new MV (X, Teuchos::Copy));
484  } else {
485  Xcopy = rcpFromRef (X);
486  }
487  }
488 
489  // Each of the following methods updates the flop count itself.
490  // All implementations handle zeroing out the starting solution
491  // (if necessary) themselves.
492  switch (PrecType_) {
493  case Ifpack2::Details::JACOBI:
494  ApplyInverseJacobi(*Xcopy,Y);
495  break;
496  case Ifpack2::Details::GS:
497  ApplyInverseGS(*Xcopy,Y);
498  break;
499  case Ifpack2::Details::SGS:
500  ApplyInverseSGS(*Xcopy,Y);
501  break;
502  default:
503  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
504  "Ifpack2::Relaxation::apply: Invalid preconditioner type enum value "
505  << PrecType_ << ". Please report this bug to the Ifpack2 developers.");
506  }
507  if (alpha != STS::one ()) {
508  Y.scale (alpha);
509  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
510  const double numVectors = as<double> (Y.getNumVectors ());
511  ApplyFlops_ += numGlobalRows * numVectors;
512  }
513  }
514  }
515  ApplyTime_ += Time_->totalElapsedTime ();
516  ++NumApply_;
517 }
518 
519 
520 template<class MatrixType>
521 void
523 applyMat (const Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& X,
524  Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& Y,
525  Teuchos::ETransp mode) const
526 {
527  TEUCHOS_TEST_FOR_EXCEPTION(
528  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::applyMat: "
529  "The input matrix A is null. Please call setMatrix() with a nonnull "
530  "input matrix, then call compute(), before calling this method.");
531  TEUCHOS_TEST_FOR_EXCEPTION(
532  ! isComputed (), std::runtime_error, "Ifpack2::Relaxation::applyMat: "
533  "isComputed() must be true before you may call applyMat(). "
534  "Please call compute() before calling this method.");
535  TEUCHOS_TEST_FOR_EXCEPTION(
536  X.getNumVectors () != Y.getNumVectors (), std::invalid_argument,
537  "Ifpack2::Relaxation::applyMat: X.getNumVectors() = " << X.getNumVectors ()
538  << " != Y.getNumVectors() = " << Y.getNumVectors () << ".");
539  A_->apply (X, Y, mode);
540 }
541 
542 
543 template<class MatrixType>
545 {
546  TEUCHOS_TEST_FOR_EXCEPTION(
547  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::initialize: "
548  "The input matrix A is null. Please call setMatrix() with a nonnull "
549  "input matrix before calling this method.");
550 
551  if (A_.is_null ()) {
552  hasBlockCrsMatrix_ = false;
553  }
554  else { // A_ is not null
555  Teuchos::RCP<const block_crs_matrix_type> A_bcrs =
556  Teuchos::rcp_dynamic_cast<const block_crs_matrix_type> (A_);
557  if (A_bcrs.is_null ()) {
558  hasBlockCrsMatrix_ = false;
559  }
560  else { // A_ is a block_crs_matrix_type
561  hasBlockCrsMatrix_ = true;
562  }
563  }
564 
565  // Initialization for Relaxation is trivial, so we say it takes zero time.
566  //InitializeTime_ += Time_->totalElapsedTime ();
567  ++NumInitialize_;
568  isInitialized_ = true;
569 }
570 
571 template<class MatrixType>
573 {
574  using Kokkos::ALL;
575  using Teuchos::Array;
576  using Teuchos::ArrayRCP;
577  using Teuchos::ArrayView;
578  using Teuchos::as;
579  using Teuchos::Comm;
580  using Teuchos::RCP;
581  using Teuchos::rcp;
582  using Teuchos::REDUCE_MAX;
583  using Teuchos::REDUCE_MIN;
584  using Teuchos::REDUCE_SUM;
585  using Teuchos::rcp_dynamic_cast;
586  using Teuchos::reduceAll;
587  typedef local_ordinal_type LO;
588 
589  {
590  // Reset the timer each time, since Relaxation uses the same Time
591  // object to track times for different methods.
592  Teuchos::TimeMonitor timeMon (*Time_, true);
593 
594  TEUCHOS_TEST_FOR_EXCEPTION(
595  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::"
596  "computeBlockCrs: The input matrix A is null. Please call setMatrix() "
597  "with a nonnull input matrix, then call initialize(), before calling "
598  "this method.");
599  const block_crs_matrix_type* blockCrsA =
600  dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr ());
601  TEUCHOS_TEST_FOR_EXCEPTION(
602  blockCrsA == NULL, std::logic_error, "Ifpack2::Relaxation::"
603  "computeBlockCrs: A_ is not a BlockCrsMatrix, but it should be if we "
604  "got this far. Please report this bug to the Ifpack2 developers.");
605 
606  const scalar_type one = STS::one ();
607 
608  // Reset state.
609  IsComputed_ = false;
610 
611  const LO lclNumMeshRows =
612  blockCrsA->getCrsGraph ().getNodeNumRows ();
613  const LO blockSize = blockCrsA->getBlockSize ();
614 
615  if (! savedDiagOffsets_) {
616  blockDiag_ = block_diag_type (); // clear it before reallocating
617  blockDiag_ = block_diag_type ("Ifpack2::Relaxation::blockDiag_",
618  lclNumMeshRows, blockSize, blockSize);
619  blockCrsA->getLocalDiagOffsets (diagOffsets_);
620  TEUCHOS_TEST_FOR_EXCEPTION
621  (static_cast<size_t> (diagOffsets_.size ()) !=
622  static_cast<size_t> (blockDiag_.dimension_0 ()),
623  std::logic_error, "diagOffsets_.size() = " << diagOffsets_.size () <<
624  " != blockDiag_.dimension_0() = " << blockDiag_.dimension_0 () <<
625  ". Please report this bug to the Ifpack2 developers.");
626  savedDiagOffsets_ = true;
627  }
628  blockCrsA->getLocalDiagCopy (blockDiag_, diagOffsets_ ());
629 
630  // Use an unmanaged View in this method, so that when we take
631  // subviews of it (to get each diagonal block), we don't have to
632  // touch the reference count. Reference count updates are a
633  // thread scalability bottleneck and have a performance cost even
634  // without using threads.
635  unmanaged_block_diag_type blockDiag = blockDiag_;
636 
637  if (DoL1Method_ && IsParallel_) {
638  const scalar_type two = one + one;
639  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
640  Array<LO> indices (maxLength);
641  Array<scalar_type> values (maxLength * blockSize * blockSize);
642  size_t numEntries = 0;
643 
644  for (LO i = 0; i < lclNumMeshRows; ++i) {
645  // FIXME (mfh 16 Dec 2015) Get views instead of copies.
646  blockCrsA->getLocalRowCopy (i, indices (), values (), numEntries);
647 
648  auto diagBlock = Kokkos::subview (blockDiag, i, ALL (), ALL ());
649  for (LO subRow = 0; subRow < blockSize; ++subRow) {
650  magnitude_type diagonal_boost = STM::zero ();
651  for (size_t k = 0 ; k < numEntries ; ++k) {
652  if (indices[k] > lclNumMeshRows) {
653  const size_t offset = blockSize*blockSize*k + subRow*blockSize;
654  for (LO subCol = 0; subCol < blockSize; ++subCol) {
655  diagonal_boost += STS::magnitude (values[offset+subCol] / two);
656  }
657  }
658  }
659  if (STS::magnitude (diagBlock(subRow, subRow)) < L1Eta_ * diagonal_boost) {
660  diagBlock(subRow, subRow) += diagonal_boost;
661  }
662  }
663  }
664  }
665 
666  blockDiagFactPivots_ =
667  pivots_type ("Ifpack2::Relaxation::pivots", lclNumMeshRows, blockSize);
668  // Use an unmanaged View to avoid ref count overhead in loop below.
669  unmanaged_pivots_type pivots = blockDiagFactPivots_;
670 
671  int info = 0;
672  for (LO i = 0 ; i < lclNumMeshRows; ++i) {
673  auto diagBlock = Kokkos::subview (blockDiag, i, ALL (), ALL ());
674  auto ipiv = Kokkos::subview (pivots, i, ALL ());
675  Tpetra::Experimental::GETF2 (diagBlock, ipiv, info);
676  }
677 
678  // In a debug build, do an extra test to make sure that all the
679  // factorizations were computed correctly.
680 #ifdef HAVE_IFPACK2_DEBUG
681  const int numResults = 2;
682  // Use "max = -min" trick to get min and max in a single all-reduce.
683  int lclResults[2], gblResults[2];
684  lclResults[0] = info;
685  lclResults[1] = -info;
686  gblResults[0] = 0;
687  gblResults[1] = 0;
688  reduceAll<int, int> (* (A_->getGraph ()->getComm ()), REDUCE_MIN,
689  numResults, lclResults, gblResults);
690  TEUCHOS_TEST_FOR_EXCEPTION
691  (gblResults[0] != 0 || gblResults[1] != 0, std::runtime_error,
692  "Ifpack2::Relaxation::compute: When processing the input "
693  "Tpetra::BlockCrsMatrix, one or more diagonal block LU factorizations "
694  "failed on one or more (MPI) processes.");
695 #endif // HAVE_IFPACK2_DEBUG
696 
697  Importer_ = A_->getGraph ()->getImporter ();
698  } // end TimeMonitor scope
699 
700  ComputeTime_ += Time_->totalElapsedTime ();
701  ++NumCompute_;
702  IsComputed_ = true;
703 }
704 
705 template<class MatrixType>
707 {
708  using Teuchos::Array;
709  using Teuchos::ArrayRCP;
710  using Teuchos::ArrayView;
711  using Teuchos::as;
712  using Teuchos::Comm;
713  using Teuchos::RCP;
714  using Teuchos::rcp;
715  using Teuchos::REDUCE_MAX;
716  using Teuchos::REDUCE_MIN;
717  using Teuchos::REDUCE_SUM;
718  using Teuchos::rcp_dynamic_cast;
719  using Teuchos::reduceAll;
720  typedef Tpetra::Vector<scalar_type, local_ordinal_type,
721  global_ordinal_type, node_type> vector_type;
722  typedef typename vector_type::device_type device_type;
723  const scalar_type zero = STS::zero ();
724  const scalar_type one = STS::one ();
725 
726  // We don't count initialization in compute() time.
727  if (! isInitialized ()) {
728  initialize ();
729  }
730 
731  if (hasBlockCrsMatrix_) {
732  computeBlockCrs ();
733  return;
734  }
735 
736  {
737  // Reset the timer each time, since Relaxation uses the same Time
738  // object to track times for different methods.
739  Teuchos::TimeMonitor timeMon (*Time_, true);
740 
741  TEUCHOS_TEST_FOR_EXCEPTION(
742  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::compute: "
743  "The input matrix A is null. Please call setMatrix() with a nonnull "
744  "input matrix, then call initialize(), before calling this method.");
745 
746  // Reset state.
747  IsComputed_ = false;
748 
749  TEUCHOS_TEST_FOR_EXCEPTION(
750  NumSweeps_ < 0, std::logic_error,
751  "Ifpack2::Relaxation::compute: NumSweeps_ = " << NumSweeps_ << " < 0. "
752  "Please report this bug to the Ifpack2 developers.");
753 
754  Diagonal_ = rcp (new vector_type (A_->getRowMap ()));
755 
756  // Extract the diagonal entries. The CrsMatrix static graph
757  // version is faster for subsequent calls to compute(), since it
758  // caches the diagonal offsets.
759  //
760  // isStaticGraph() == true means that the matrix was created with
761  // a const graph. The only requirement is that the structure of
762  // the matrix never changes, so isStaticGraph() == true is a bit
763  // more conservative than we need. However, Tpetra doesn't (as of
764  // 05 Apr 2013) have a way to tell if the graph hasn't changed
765  // since the last time we used it.
766  {
767  // NOTE (mfh 07 Jul 2013): We must cast here to CrsMatrix
768  // instead of MatrixType, because isStaticGraph is a CrsMatrix
769  // method (not inherited from RowMatrix's interface). It's
770  // perfectly valid to do relaxation on a RowMatrix which is not
771  // a CrsMatrix.
772  const crs_matrix_type* crsMat =
773  dynamic_cast<const crs_matrix_type*> (A_.getRawPtr ());
774  if (crsMat == NULL || ! crsMat->isStaticGraph ()) {
775  A_->getLocalDiagCopy (*Diagonal_); // slow path
776  } else {
777  if (! savedDiagOffsets_) { // we haven't precomputed offsets
778  crsMat->getLocalDiagOffsets (diagOffsets_);
779  savedDiagOffsets_ = true;
780  }
781  crsMat->getLocalDiagCopy (*Diagonal_, diagOffsets_ ());
782 #ifdef HAVE_TPETRA_DEBUG
783  // Validate the fast-path diagonal against the slow-path diagonal.
784  vector_type D_copy (A_->getRowMap ());
785  A_->getLocalDiagCopy (D_copy);
786  D_copy.update (STS::one (), *Diagonal_, -STS::one ());
787  const magnitude_type err = D_copy.normInf ();
788  // The two diagonals should be exactly the same, so their
789  // difference should be exactly zero.
790  TEUCHOS_TEST_FOR_EXCEPTION(
791  err != STM::zero(), std::logic_error, "Ifpack2::Relaxation::compute: "
792  "\"fast-path\" diagonal computation failed. \\|D1 - D2\\|_inf = "
793  << err << ".");
794 #endif // HAVE_TPETRA_DEBUG
795  }
796  }
797 
798  // If we're checking the computed inverse diagonal, then keep a
799  // copy of the original diagonal entries for later comparison.
800  RCP<vector_type> origDiag;
801  if (checkDiagEntries_) {
802  origDiag = rcp (new vector_type (A_->getRowMap ()));
803  Tpetra::deep_copy (*origDiag, *Diagonal_);
804  }
805 
806  const size_t numMyRows = A_->getNodeNumRows ();
807 
808  // We're about to read and write diagonal entries on the host.
809  Diagonal_->template sync<Kokkos::HostSpace> ();
810  Diagonal_->template modify<Kokkos::HostSpace> ();
811  auto diag_2d = Diagonal_->template getLocalView<Kokkos::HostSpace> ();
812  auto diag_1d = Kokkos::subview (diag_2d, Kokkos::ALL (), 0);
813  // FIXME (mfh 12 Jan 2016) temp fix for Kokkos::complex vs. std::complex.
814  scalar_type* const diag = reinterpret_cast<scalar_type*> (diag_1d.ptr_on_device ());
815 
816  // Setup for L1 Methods.
817  // Here we add half the value of the off-processor entries in the row,
818  // but only if diagonal isn't sufficiently large.
819  //
820  // This follows from Equation (6.5) in: Baker, Falgout, Kolev and
821  // Yang. "Multigrid Smoothers for Ultraparallel Computing." SIAM
822  // J. Sci. Comput., Vol. 33, No. 5. (2011), pp. 2864-2887.
823  if (DoL1Method_ && IsParallel_) {
824  const scalar_type two = one + one;
825  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
826  Array<local_ordinal_type> indices (maxLength);
827  Array<scalar_type> values (maxLength);
828  size_t numEntries;
829 
830  for (size_t i = 0; i < numMyRows; ++i) {
831  A_->getLocalRowCopy (i, indices (), values (), numEntries);
832  magnitude_type diagonal_boost = STM::zero ();
833  for (size_t k = 0 ; k < numEntries ; ++k) {
834  if (static_cast<size_t> (indices[k]) > numMyRows) {
835  diagonal_boost += STS::magnitude (values[k] / two);
836  }
837  }
838  if (STS::magnitude (diag[i]) < L1Eta_ * diagonal_boost) {
839  diag[i] += diagonal_boost;
840  }
841  }
842  }
843 
844  //
845  // Invert the diagonal entries of the matrix (not in place).
846  //
847 
848  // Precompute some quantities for "fixing" zero or tiny diagonal
849  // entries. We'll only use them if this "fixing" is enabled.
850  //
851  // SmallTraits covers for the lack of eps() in
852  // Teuchos::ScalarTraits when its template parameter is not a
853  // floating-point type. (Ifpack2 sometimes gets instantiated for
854  // integer Scalar types.)
855  const scalar_type oneOverMinDiagVal = (MinDiagonalValue_ == zero) ?
856  one / SmallTraits<scalar_type>::eps () :
857  one / MinDiagonalValue_;
858  // It's helpful not to have to recompute this magnitude each time.
859  const magnitude_type minDiagValMag = STS::magnitude (MinDiagonalValue_);
860 
861  if (checkDiagEntries_) {
862  //
863  // Check diagonal elements, replace zero elements with the minimum
864  // diagonal value, and store their inverses. Count the number of
865  // "small" and zero diagonal entries while we're at it.
866  //
867  size_t numSmallDiagEntries = 0; // "small" includes zero
868  size_t numZeroDiagEntries = 0; // # zero diagonal entries
869  size_t numNegDiagEntries = 0; // # negative (real parts of) diagonal entries
870 
871  // As we go, keep track of the diagonal entries with the least and
872  // greatest magnitude. We could use the trick of starting the min
873  // with +Inf and the max with -Inf, but that doesn't work if
874  // scalar_type is a built-in integer type. Thus, we have to start
875  // by reading the first diagonal entry redundantly.
876  // scalar_type minMagDiagEntry = zero;
877  // scalar_type maxMagDiagEntry = zero;
878  magnitude_type minMagDiagEntryMag = STM::zero ();
879  magnitude_type maxMagDiagEntryMag = STM::zero ();
880  if (numMyRows > 0) {
881  const scalar_type d_0 = diag[0];
882  const magnitude_type d_0_mag = STS::magnitude (d_0);
883  // minMagDiagEntry = d_0;
884  // maxMagDiagEntry = d_0;
885  minMagDiagEntryMag = d_0_mag;
886  maxMagDiagEntryMag = d_0_mag;
887  }
888 
889  // Go through all the diagonal entries. Compute counts of
890  // small-magnitude, zero, and negative-real-part entries. Invert
891  // the diagonal entries that aren't too small. For those that are
892  // too small in magnitude, replace them with 1/MinDiagonalValue_
893  // (or 1/eps if MinDiagonalValue_ happens to be zero).
894  for (size_t i = 0 ; i < numMyRows; ++i) {
895  const scalar_type d_i = diag[i];
896  const magnitude_type d_i_mag = STS::magnitude (d_i);
897  const magnitude_type d_i_real = STS::real (d_i);
898 
899  // We can't compare complex numbers, but we can compare their
900  // real parts.
901  if (d_i_real < STM::zero ()) {
902  ++numNegDiagEntries;
903  }
904  if (d_i_mag < minMagDiagEntryMag) {
905  // minMagDiagEntry = d_i;
906  minMagDiagEntryMag = d_i_mag;
907  }
908  if (d_i_mag > maxMagDiagEntryMag) {
909  // maxMagDiagEntry = d_i;
910  maxMagDiagEntryMag = d_i_mag;
911  }
912 
913  if (fixTinyDiagEntries_) {
914  // <= not <, in case minDiagValMag is zero.
915  if (d_i_mag <= minDiagValMag) {
916  ++numSmallDiagEntries;
917  if (d_i_mag == STM::zero ()) {
918  ++numZeroDiagEntries;
919  }
920  diag[i] = oneOverMinDiagVal;
921  } else {
922  diag[i] = one / d_i;
923  }
924  }
925  else { // Don't fix zero or tiny diagonal entries.
926  // <= not <, in case minDiagValMag is zero.
927  if (d_i_mag <= minDiagValMag) {
928  ++numSmallDiagEntries;
929  if (d_i_mag == STM::zero ()) {
930  ++numZeroDiagEntries;
931  }
932  }
933  diag[i] = one / d_i;
934  }
935  }
936 
937  // Count floating-point operations of computing the inverse diagonal.
938  //
939  // FIXME (mfh 30 Mar 2013) Shouldn't counts be global, not local?
940  if (STS::isComplex) { // magnitude: at least 3 flops per diagonal entry
941  ComputeFlops_ += 4.0 * numMyRows;
942  } else {
943  ComputeFlops_ += numMyRows;
944  }
945 
946  // Collect global data about the diagonal entries. Only do this
947  // (which involves O(1) all-reduces) if the user asked us to do
948  // the extra work.
949  //
950  // FIXME (mfh 28 Mar 2013) This is wrong if some processes have
951  // zero rows. Fixing this requires one of two options:
952  //
953  // 1. Use a custom reduction operation that ignores processes
954  // with zero diagonal entries.
955  // 2. Split the communicator, compute all-reduces using the
956  // subcommunicator over processes that have a nonzero number
957  // of diagonal entries, and then broadcast from one of those
958  // processes (if there is one) to the processes in the other
959  // subcommunicator.
960  const Comm<int>& comm = * (A_->getRowMap ()->getComm ());
961 
962  // Compute global min and max magnitude of entries.
963  Array<magnitude_type> localVals (2);
964  localVals[0] = minMagDiagEntryMag;
965  // (- (min (- x))) is the same as (max x).
966  localVals[1] = -maxMagDiagEntryMag;
967  Array<magnitude_type> globalVals (2);
968  reduceAll<int, magnitude_type> (comm, REDUCE_MIN, 2,
969  localVals.getRawPtr (),
970  globalVals.getRawPtr ());
971  globalMinMagDiagEntryMag_ = globalVals[0];
972  globalMaxMagDiagEntryMag_ = -globalVals[1];
973 
974  Array<size_t> localCounts (3);
975  localCounts[0] = numSmallDiagEntries;
976  localCounts[1] = numZeroDiagEntries;
977  localCounts[2] = numNegDiagEntries;
978  Array<size_t> globalCounts (3);
979  reduceAll<int, size_t> (comm, REDUCE_SUM, 3,
980  localCounts.getRawPtr (),
981  globalCounts.getRawPtr ());
982  globalNumSmallDiagEntries_ = globalCounts[0];
983  globalNumZeroDiagEntries_ = globalCounts[1];
984  globalNumNegDiagEntries_ = globalCounts[2];
985 
986  // Forestall "set but not used" compiler warnings.
987  // (void) minMagDiagEntry;
988  // (void) maxMagDiagEntry;
989 
990  // Compute and save the difference between the computed inverse
991  // diagonal, and the original diagonal's inverse.
992  //
993  // NOTE (mfh 11 Jan 2016) We need to sync Diagonal_ back from
994  // host to device for the update kernel below, and we don't need
995  // to modify it or sync it back again here.
996  vector_type diff (A_->getRowMap ());
997  diff.reciprocal (*origDiag);
998  Diagonal_->template sync<device_type> ();
999  diff.update (-one, *Diagonal_, one);
1000  globalDiagNormDiff_ = diff.norm2 ();
1001  }
1002  else { // don't check diagonal elements
1003  if (fixTinyDiagEntries_) {
1004  // Go through all the diagonal entries. Invert those that
1005  // aren't too small in magnitude. For those that are too
1006  // small in magnitude, replace them with oneOverMinDiagVal.
1007  for (size_t i = 0 ; i < numMyRows; ++i) {
1008  const scalar_type d_i = diag[i];
1009  const magnitude_type d_i_mag = STS::magnitude (d_i);
1010 
1011  // <= not <, in case minDiagValMag is zero.
1012  if (d_i_mag <= minDiagValMag) {
1013  diag[i] = oneOverMinDiagVal;
1014  } else {
1015  diag[i] = one / d_i;
1016  }
1017  }
1018  }
1019  else { // don't fix tiny or zero diagonal entries
1020  for (size_t i = 0 ; i < numMyRows; ++i) {
1021  diag[i] = one / diag[i];
1022  }
1023  }
1024  if (STS::isComplex) { // magnitude: at least 3 flops per diagonal entry
1025  ComputeFlops_ += 4.0 * numMyRows;
1026  } else {
1027  ComputeFlops_ += numMyRows;
1028  }
1029  }
1030 
1031  if (IsParallel_ && (PrecType_ == Ifpack2::Details::GS ||
1032  PrecType_ == Ifpack2::Details::SGS)) {
1033  // mfh 21 Mar 2013: The Import object may be null, but in that
1034  // case, the domain and column Maps are the same and we don't
1035  // need to Import anyway.
1036  Importer_ = A_->getGraph ()->getImporter ();
1037  Diagonal_->template sync<device_type> ();
1038  }
1039  } // end TimeMonitor scope
1040 
1041  ComputeTime_ += Time_->totalElapsedTime ();
1042  ++NumCompute_;
1043  IsComputed_ = true;
1044 }
1045 
1046 
1047 template<class MatrixType>
1048 void
1050 ApplyInverseJacobi (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1051  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1052 {
1053  using Teuchos::as;
1054  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type,
1056 
1057  if (hasBlockCrsMatrix_) {
1058  ApplyInverseJacobi_BlockCrsMatrix (X, Y);
1059  return;
1060  }
1061 
1062  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1063  const double numVectors = as<double> (X.getNumVectors ());
1064  if (ZeroStartingSolution_) {
1065  // For the first Jacobi sweep, if we are allowed to assume that
1066  // the initial guess is zero, then Jacobi is just diagonal
1067  // scaling. (A_ij * x_j = 0 for i != j, since x_j = 0.)
1068  // Compute it as Y(i,j) = DampingFactor_ * X(i,j) * D(i).
1069  Y.elementWiseMultiply (DampingFactor_, *Diagonal_, X, STS::zero ());
1070 
1071  // Count (global) floating-point operations. Ifpack2 represents
1072  // this as a floating-point number rather than an integer, so that
1073  // overflow (for a very large number of calls, or a very large
1074  // problem) is approximate instead of catastrophic.
1075  double flopUpdate = 0.0;
1076  if (DampingFactor_ == STS::one ()) {
1077  // Y(i,j) = X(i,j) * D(i): one multiply for each entry of Y.
1078  flopUpdate = numGlobalRows * numVectors;
1079  } else {
1080  // Y(i,j) = DampingFactor_ * X(i,j) * D(i):
1081  // Two multiplies per entry of Y.
1082  flopUpdate = 2.0 * numGlobalRows * numVectors;
1083  }
1084  ApplyFlops_ += flopUpdate;
1085  if (NumSweeps_ == 1) {
1086  return;
1087  }
1088  }
1089  // If we were allowed to assume that the starting guess was zero,
1090  // then we have already done the first sweep above.
1091  const int startSweep = ZeroStartingSolution_ ? 1 : 0;
1092  // We don't need to initialize the result MV, since the sparse
1093  // mat-vec will clobber its contents anyway.
1094  MV A_times_Y (Y.getMap (), as<size_t>(numVectors), false);
1095  for (int j = startSweep; j < NumSweeps_; ++j) {
1096  // Each iteration: Y = Y + \omega D^{-1} (X - A*Y)
1097  applyMat (Y, A_times_Y);
1098  A_times_Y.update (STS::one (), X, -STS::one ());
1099  Y.elementWiseMultiply (DampingFactor_, *Diagonal_, A_times_Y, STS::one ());
1100  }
1101 
1102  // For each column of output, for each pass over the matrix:
1103  //
1104  // - One + and one * for each matrix entry
1105  // - One / and one + for each row of the matrix
1106  // - If the damping factor is not one: one * for each row of the
1107  // matrix. (It's not fair to count this if the damping factor is
1108  // one, since the implementation could skip it. Whether it does
1109  // or not is the implementation's choice.)
1110 
1111  // Floating-point operations due to the damping factor, per matrix
1112  // row, per direction, per columm of output.
1113  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1114  const double dampingFlops = (DampingFactor_ == STS::one ()) ? 0.0 : 1.0;
1115  ApplyFlops_ += as<double> (NumSweeps_ - startSweep) * numVectors *
1116  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1117 }
1118 
1119 template<class MatrixType>
1120 void
1124  Tpetra::MultiVector<scalar_type, local_ordinal_type,
1125  global_ordinal_type,node_type>& Y) const
1126 {
1127  using Kokkos::ALL;
1128  typedef Tpetra::Experimental::BlockMultiVector<scalar_type,
1130  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type,
1132  typedef typename block_crs_matrix_type::little_vec_type little_vec_type;
1133 
1134  // Get unmanaged Views, so that taking subviews repeatedly won't
1135  // update the reference count.
1136  unmanaged_block_diag_type blockDiag = blockDiag_;
1137  unmanaged_pivots_type pivots = blockDiagFactPivots_;
1138 
1139  if (ZeroStartingSolution_) {
1140  Y.putScalar (STS::zero ());
1141  }
1142 
1143  const size_t numVectors = X.getNumVectors ();
1144 
1145  const block_crs_matrix_type* blockMat =
1146  dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr ());
1147 
1148  const local_ordinal_type blockSize = blockMat->getBlockSize ();
1149  BMV A_times_Y_Block (* (blockMat->getRowMap ()), * (Y.getMap ()),
1150  blockSize, numVectors);
1151  MV A_times_Y = A_times_Y_Block.getMultiVectorView();
1152  BMV yBlock (Y, * (blockMat->getRowMap ()), blockSize);
1153  for (int j = 0; j < NumSweeps_; ++j) {
1154  blockMat->apply (Y, A_times_Y);
1155  A_times_Y.update (STS::one (), X, -STS::one ());
1156  const size_t numRows = blockMat->getNodeNumRows ();
1157  for (size_t i = 0; i < numVectors; ++i) {
1158  for (size_t k = 0; k < numRows; ++k) {
1159  // Each iteration: Y = Y + \omega D^{-1} (X - A*Y)
1160  auto factorizedBlockDiag = Kokkos::subview (blockDiag, k, ALL (), ALL ());
1161  auto ipiv = Kokkos::subview (pivots, k, ALL ());
1162  little_vec_type xloc = A_times_Y_Block.getLocalBlock (k, i);
1163  little_vec_type yloc = yBlock.getLocalBlock (k, i);
1164  int info = 0;
1165  Tpetra::Experimental::GETRS ("N", factorizedBlockDiag, ipiv, xloc, info);
1166  Tpetra::Experimental::AXPY (DampingFactor_, xloc, yloc);
1167  }
1168  }
1169  }
1170 }
1171 
1172 template<class MatrixType>
1173 void
1175 ApplyInverseGS (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1176  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1177 {
1178  typedef Relaxation<MatrixType> this_type;
1179  // The CrsMatrix version is faster, because it can access the sparse
1180  // matrix data directly, rather than by copying out each row's data
1181  // in turn. Thus, we check whether the RowMatrix is really a
1182  // CrsMatrix.
1183  //
1184  // FIXME (mfh 07 Jul 2013) See note on crs_matrix_type typedef
1185  // declaration in Ifpack2_Relaxation_decl.hpp header file. The code
1186  // will still be correct if the cast fails, but it will use an
1187  // unoptimized kernel.
1188  const block_crs_matrix_type* blockCrsMat =
1189  dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr ());
1190  const crs_matrix_type* crsMat =
1191  dynamic_cast<const crs_matrix_type*> (A_.getRawPtr ());
1192  if (blockCrsMat != NULL) {
1193  const_cast<this_type*> (this)->ApplyInverseGS_BlockCrsMatrix (*blockCrsMat, X, Y);
1194  } else if (crsMat != NULL) {
1195  ApplyInverseGS_CrsMatrix (*crsMat, X, Y);
1196  } else {
1197  ApplyInverseGS_RowMatrix (X, Y);
1198  }
1199 }
1200 
1201 
1202 template<class MatrixType>
1203 void
1205 ApplyInverseGS_RowMatrix (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1206  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1207 {
1208  using Teuchos::Array;
1209  using Teuchos::ArrayRCP;
1210  using Teuchos::ArrayView;
1211  using Teuchos::as;
1212  using Teuchos::RCP;
1213  using Teuchos::rcp;
1214  using Teuchos::rcpFromRef;
1215  typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> MV;
1216 
1217  // Tpetra's GS implementation for CrsMatrix handles zeroing out the
1218  // starting multivector itself. The generic RowMatrix version here
1219  // does not, so we have to zero out Y here.
1220  if (ZeroStartingSolution_) {
1221  Y.putScalar (STS::zero ());
1222  }
1223 
1224  const size_t NumVectors = X.getNumVectors ();
1225  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
1226  Array<local_ordinal_type> Indices (maxLength);
1227  Array<scalar_type> Values (maxLength);
1228 
1229  // Local smoothing stuff
1230  const size_t numMyRows = A_->getNodeNumRows();
1231  const local_ordinal_type* rowInd = 0;
1232  size_t numActive = numMyRows;
1233  bool do_local = ! localSmoothingIndices_.is_null ();
1234  if (do_local) {
1235  rowInd = localSmoothingIndices_.getRawPtr ();
1236  numActive = localSmoothingIndices_.size ();
1237  }
1238 
1239  RCP<MV> Y2;
1240  if (IsParallel_) {
1241  if (Importer_.is_null ()) { // domain and column Maps are the same.
1242  // We will copy Y into Y2 below, so no need to fill with zeros here.
1243  Y2 = rcp (new MV (Y.getMap (), NumVectors, false));
1244  } else {
1245  // FIXME (mfh 21 Mar 2013) We probably don't need to fill with
1246  // zeros here, since we are doing an Import into Y2 below
1247  // anyway. However, it doesn't hurt correctness.
1248  Y2 = rcp (new MV (Importer_->getTargetMap (), NumVectors));
1249  }
1250  }
1251  else {
1252  Y2 = rcpFromRef (Y);
1253  }
1254 
1255  // Diagonal
1256  ArrayRCP<const scalar_type> d_rcp = Diagonal_->get1dView ();
1257  ArrayView<const scalar_type> d_ptr = d_rcp();
1258 
1259  // Constant stride check
1260  bool constant_stride = X.isConstantStride() && Y2->isConstantStride();
1261 
1262  if (constant_stride) {
1263  // extract 1D RCPs
1264  size_t x_stride = X.getStride();
1265  size_t y2_stride = Y2->getStride();
1266  ArrayRCP<scalar_type> y2_rcp = Y2->get1dViewNonConst();
1267  ArrayRCP<const scalar_type> x_rcp = X.get1dView();
1268  ArrayView<scalar_type> y2_ptr = y2_rcp();
1269  ArrayView<const scalar_type> x_ptr = x_rcp();
1270  Array<scalar_type> dtemp(NumVectors,STS::zero());
1271 
1272  for (int j = 0; j < NumSweeps_; ++j) {
1273  // data exchange is here, once per sweep
1274  if (IsParallel_) {
1275  if (Importer_.is_null ()) {
1276  *Y2 = Y; // just copy, since domain and column Maps are the same
1277  } else {
1278  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1279  }
1280  }
1281 
1282  if (! DoBackwardGS_) { // Forward sweep
1283  for (size_t ii = 0; ii < numActive; ++ii) {
1284  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1285  size_t NumEntries;
1286  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1287  dtemp.assign(NumVectors,STS::zero());
1288 
1289  for (size_t k = 0; k < NumEntries; ++k) {
1290  const local_ordinal_type col = Indices[k];
1291  for (size_t m = 0; m < NumVectors; ++m) {
1292  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
1293  }
1294  }
1295 
1296  for (size_t m = 0; m < NumVectors; ++m) {
1297  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
1298  }
1299  }
1300  }
1301  else { // Backward sweep
1302  // ptrdiff_t is the same size as size_t, but is signed. Being
1303  // signed is important so that i >= 0 is not trivially true.
1304  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
1305  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1306  size_t NumEntries;
1307  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1308  dtemp.assign (NumVectors, STS::zero ());
1309 
1310  for (size_t k = 0; k < NumEntries; ++k) {
1311  const local_ordinal_type col = Indices[k];
1312  for (size_t m = 0; m < NumVectors; ++m) {
1313  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
1314  }
1315  }
1316 
1317  for (size_t m = 0; m < NumVectors; ++m) {
1318  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
1319  }
1320  }
1321  }
1322  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
1323  if (IsParallel_) {
1324  Tpetra::deep_copy (Y, *Y2);
1325  }
1326  }
1327  }
1328  else {
1329  // extract 2D RCPS
1330  ArrayRCP<ArrayRCP<scalar_type> > y2_ptr = Y2->get2dViewNonConst ();
1331  ArrayRCP<ArrayRCP<const scalar_type> > x_ptr = X.get2dView ();
1332 
1333  for (int j = 0; j < NumSweeps_; ++j) {
1334  // data exchange is here, once per sweep
1335  if (IsParallel_) {
1336  if (Importer_.is_null ()) {
1337  *Y2 = Y; // just copy, since domain and column Maps are the same
1338  } else {
1339  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1340  }
1341  }
1342 
1343  if (! DoBackwardGS_) { // Forward sweep
1344  for (size_t ii = 0; ii < numActive; ++ii) {
1345  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1346  size_t NumEntries;
1347  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1348 
1349  for (size_t m = 0; m < NumVectors; ++m) {
1350  scalar_type dtemp = STS::zero ();
1351  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
1352  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
1353 
1354  for (size_t k = 0; k < NumEntries; ++k) {
1355  const local_ordinal_type col = Indices[k];
1356  dtemp += Values[k] * y2_local[col];
1357  }
1358  y2_local[i] += DampingFactor_ * d_ptr[i] * (x_local[i] - dtemp);
1359  }
1360  }
1361  }
1362  else { // Backward sweep
1363  // ptrdiff_t is the same size as size_t, but is signed. Being
1364  // signed is important so that i >= 0 is not trivially true.
1365  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
1366  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1367 
1368  size_t NumEntries;
1369  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1370 
1371  for (size_t m = 0; m < NumVectors; ++m) {
1372  scalar_type dtemp = STS::zero ();
1373  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
1374  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
1375 
1376  for (size_t k = 0; k < NumEntries; ++k) {
1377  const local_ordinal_type col = Indices[k];
1378  dtemp += Values[k] * y2_local[col];
1379  }
1380  y2_local[i] += DampingFactor_ * d_ptr[i] * (x_local[i] - dtemp);
1381  }
1382  }
1383  }
1384 
1385  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
1386  if (IsParallel_) {
1387  Tpetra::deep_copy (Y, *Y2);
1388  }
1389  }
1390  }
1391 
1392 
1393  // See flop count discussion in implementation of ApplyInverseGS_CrsMatrix().
1394  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
1395  const double numVectors = as<double> (X.getNumVectors ());
1396  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1397  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1398  ApplyFlops_ += 2.0 * NumSweeps_ * numVectors *
1399  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1400 }
1401 
1402 
1403 template<class MatrixType>
1404 void
1406 ApplyInverseGS_CrsMatrix (const crs_matrix_type& A,
1407  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1408  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1409 {
1410  using Teuchos::as;
1411  const Tpetra::ESweepDirection direction =
1412  DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward;
1413  if (localSmoothingIndices_.is_null ()) {
1414  A.gaussSeidelCopy (Y, X, *Diagonal_, DampingFactor_, direction,
1415  NumSweeps_, ZeroStartingSolution_);
1416  }
1417  else {
1418  A.reorderedGaussSeidelCopy (Y, X, *Diagonal_, localSmoothingIndices_ (),
1419  DampingFactor_, direction,
1420  NumSweeps_, ZeroStartingSolution_);
1421  }
1422 
1423  // For each column of output, for each sweep over the matrix:
1424  //
1425  // - One + and one * for each matrix entry
1426  // - One / and one + for each row of the matrix
1427  // - If the damping factor is not one: one * for each row of the
1428  // matrix. (It's not fair to count this if the damping factor is
1429  // one, since the implementation could skip it. Whether it does
1430  // or not is the implementation's choice.)
1431 
1432  // Floating-point operations due to the damping factor, per matrix
1433  // row, per direction, per columm of output.
1434  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
1435  const double numVectors = as<double> (X.getNumVectors ());
1436  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1437  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1438  ApplyFlops_ += NumSweeps_ * numVectors *
1439  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1440 }
1441 
1442 template<class MatrixType>
1443 void
1445 ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A,
1446  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1447  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y)
1448 {
1449  using Teuchos::RCP;
1450  using Teuchos::rcp;
1451  using Teuchos::rcpFromRef;
1452  typedef Tpetra::Experimental::BlockMultiVector<scalar_type,
1454  typedef Tpetra::MultiVector<scalar_type,
1456 
1457  //FIXME: (tcf) 8/21/2014 -- may be problematic for multiple right hand sides
1458  //
1459  // NOTE (mfh 12 Sep 2014) I don't think it should be a problem for
1460  // multiple right-hand sides, unless the input or output MultiVector
1461  // does not have constant stride. We should check for that case
1462  // here, in case it doesn't work in localGaussSeidel (which is
1463  // entirely possible).
1464  BMV yBlock (Y, * (A.getGraph ()->getDomainMap ()), A.getBlockSize ());
1465  const BMV xBlock (X, * (A.getColMap ()), A.getBlockSize ());
1466 
1467  bool performImport = false;
1468  RCP<BMV> yBlockCol;
1469  if (Importer_.is_null ()) {
1470  yBlockCol = rcpFromRef (yBlock);
1471  }
1472  else {
1473  if (yBlockColumnPointMap_.is_null () ||
1474  yBlockColumnPointMap_->getNumVectors () != yBlock.getNumVectors () ||
1475  yBlockColumnPointMap_->getBlockSize () != yBlock.getBlockSize ()) {
1476  yBlockColumnPointMap_ =
1477  rcp (new BMV (* (A.getColMap ()), A.getBlockSize (),
1478  static_cast<local_ordinal_type> (yBlock.getNumVectors ())));
1479  }
1480  yBlockCol = yBlockColumnPointMap_;
1481  performImport = true;
1482  }
1483 
1484  if (ZeroStartingSolution_) {
1485  yBlockCol->putScalar (STS::zero ());
1486  }
1487  else if (performImport) {
1488  yBlockCol->doImport (yBlock, *Importer_, Tpetra::INSERT);
1489  }
1490 
1491  const Tpetra::ESweepDirection direction =
1492  DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward;
1493 
1494  for (int sweep = 0; sweep < NumSweeps_; ++sweep) {
1495  if (performImport && sweep > 0) {
1496  yBlockCol->doImport(yBlock, *Importer_, Tpetra::INSERT);
1497  }
1498  A.localGaussSeidel (xBlock, *yBlockCol, blockDiag_,
1499  blockDiagFactPivots_,
1500  DampingFactor_, direction);
1501  if (performImport) {
1502  RCP<const MV> yBlockColPointDomain =
1503  yBlockCol->getMultiVectorView ().offsetView (A.getDomainMap (), 0);
1504  Tpetra::deep_copy (Y, *yBlockColPointDomain);
1505  }
1506  }
1507 }
1508 
1509 
1510 template<class MatrixType>
1511 void
1513 ApplyInverseSGS (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1514  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1515 {
1516  typedef Relaxation<MatrixType> this_type;
1517  // The CrsMatrix version is faster, because it can access the sparse
1518  // matrix data directly, rather than by copying out each row's data
1519  // in turn. Thus, we check whether the RowMatrix is really a
1520  // CrsMatrix.
1521  //
1522  // FIXME (mfh 07 Jul 2013) See note on crs_matrix_type typedef
1523  // declaration in Ifpack2_Relaxation_decl.hpp header file. The code
1524  // will still be correct if the cast fails, but it will use an
1525  // unoptimized kernel.
1526  const block_crs_matrix_type* blockCrsMat = dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr());
1527  const crs_matrix_type* crsMat = dynamic_cast<const crs_matrix_type*> (&(*A_));
1528  if (blockCrsMat != NULL) {
1529  const_cast<this_type*> (this)->ApplyInverseSGS_BlockCrsMatrix(*blockCrsMat, X, Y);
1530  }
1531  else if (crsMat != NULL) {
1532  ApplyInverseSGS_CrsMatrix (*crsMat, X, Y);
1533  } else {
1534  ApplyInverseSGS_RowMatrix (X, Y);
1535  }
1536 }
1537 
1538 
1539 template<class MatrixType>
1540 void
1542 ApplyInverseSGS_RowMatrix (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1543  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1544 {
1545  using Teuchos::Array;
1546  using Teuchos::ArrayRCP;
1547  using Teuchos::ArrayView;
1548  using Teuchos::as;
1549  using Teuchos::RCP;
1550  using Teuchos::rcp;
1551  using Teuchos::rcpFromRef;
1552  typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> MV;
1553 
1554  // Tpetra's GS implementation for CrsMatrix handles zeroing out the
1555  // starting multivector itself. The generic RowMatrix version here
1556  // does not, so we have to zero out Y here.
1557  if (ZeroStartingSolution_) {
1558  Y.putScalar (STS::zero ());
1559  }
1560 
1561  const size_t NumVectors = X.getNumVectors ();
1562  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
1563  Array<local_ordinal_type> Indices (maxLength);
1564  Array<scalar_type> Values (maxLength);
1565 
1566  // Local smoothing stuff
1567  const size_t numMyRows = A_->getNodeNumRows();
1568  const local_ordinal_type * rowInd = 0;
1569  size_t numActive = numMyRows;
1570  bool do_local = localSmoothingIndices_.is_null();
1571  if(do_local) {
1572  rowInd = localSmoothingIndices_.getRawPtr();
1573  numActive = localSmoothingIndices_.size();
1574  }
1575 
1576 
1577  RCP<MV> Y2;
1578  if (IsParallel_) {
1579  if (Importer_.is_null ()) { // domain and column Maps are the same.
1580  // We will copy Y into Y2 below, so no need to fill with zeros here.
1581  Y2 = rcp (new MV (Y.getMap (), NumVectors, false));
1582  } else {
1583  // FIXME (mfh 21 Mar 2013) We probably don't need to fill with
1584  // zeros here, since we are doing an Import into Y2 below
1585  // anyway. However, it doesn't hurt correctness.
1586  Y2 = rcp (new MV (Importer_->getTargetMap (), NumVectors));
1587  }
1588  }
1589  else {
1590  Y2 = rcpFromRef (Y);
1591  }
1592 
1593  // Diagonal
1594  ArrayRCP<const scalar_type> d_rcp = Diagonal_->get1dView();
1595  ArrayView<const scalar_type> d_ptr = d_rcp();
1596 
1597  // Constant stride check
1598  bool constant_stride = X.isConstantStride() && Y2->isConstantStride();
1599 
1600  if(constant_stride) {
1601  // extract 1D RCPs
1602  size_t x_stride = X.getStride();
1603  size_t y2_stride = Y2->getStride();
1604  ArrayRCP<scalar_type> y2_rcp = Y2->get1dViewNonConst();
1605  ArrayRCP<const scalar_type> x_rcp = X.get1dView();
1606  ArrayView<scalar_type> y2_ptr = y2_rcp();
1607  ArrayView<const scalar_type> x_ptr = x_rcp();
1608  Array<scalar_type> dtemp(NumVectors,STS::zero());
1609  for (int iter = 0; iter < NumSweeps_; ++iter) {
1610  // only one data exchange per sweep
1611  if (IsParallel_) {
1612  if (Importer_.is_null ()) {
1613  // just copy, since domain and column Maps are the same
1614  Tpetra::deep_copy (*Y2, Y);
1615  } else {
1616  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1617  }
1618  }
1619 
1620  for (int j = 0; j < NumSweeps_; j++) {
1621  // data exchange is here, once per sweep
1622  if (IsParallel_) {
1623  if (Importer_.is_null ()) {
1624  // just copy, since domain and column Maps are the same
1625  Tpetra::deep_copy (*Y2, Y);
1626  } else {
1627  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1628  }
1629  }
1630 
1631  for (size_t ii = 0; ii < numActive; ++ii) {
1632  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
1633  size_t NumEntries;
1634  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1635  dtemp.assign(NumVectors,STS::zero());
1636 
1637  for (size_t k = 0; k < NumEntries; ++k) {
1638  const local_ordinal_type col = Indices[k];
1639  for (size_t m = 0; m < NumVectors; ++m) {
1640  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
1641  }
1642  }
1643 
1644  for (size_t m = 0; m < NumVectors; ++m) {
1645  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
1646  }
1647  }
1648 
1649  // ptrdiff_t is the same size as size_t, but is signed. Being
1650  // signed is important so that i >= 0 is not trivially true.
1651  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
1652  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
1653  size_t NumEntries;
1654  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1655  dtemp.assign(NumVectors,STS::zero());
1656 
1657  for (size_t k = 0; k < NumEntries; ++k) {
1658  const local_ordinal_type col = Indices[k];
1659  for (size_t m = 0; m < NumVectors; ++m) {
1660  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
1661  }
1662  }
1663 
1664  for (size_t m = 0; m < NumVectors; ++m) {
1665  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
1666  }
1667  }
1668  }
1669 
1670  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
1671  if (IsParallel_) {
1672  Tpetra::deep_copy (Y, *Y2);
1673  }
1674  }
1675  }
1676  else {
1677  // extract 2D RCPs
1678  ArrayRCP<ArrayRCP<scalar_type> > y2_ptr = Y2->get2dViewNonConst ();
1679  ArrayRCP<ArrayRCP<const scalar_type> > x_ptr = X.get2dView ();
1680 
1681  for (int iter = 0; iter < NumSweeps_; ++iter) {
1682  // only one data exchange per sweep
1683  if (IsParallel_) {
1684  if (Importer_.is_null ()) {
1685  // just copy, since domain and column Maps are the same
1686  Tpetra::deep_copy (*Y2, Y);
1687  } else {
1688  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1689  }
1690  }
1691 
1692  for (size_t ii = 0; ii < numActive; ++ii) {
1693  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
1694  const scalar_type diag = d_ptr[i];
1695  size_t NumEntries;
1696  A_->getLocalRowCopy (as<local_ordinal_type> (i), Indices (), Values (), NumEntries);
1697 
1698  for (size_t m = 0; m < NumVectors; ++m) {
1699  scalar_type dtemp = STS::zero ();
1700  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
1701  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
1702 
1703  for (size_t k = 0; k < NumEntries; ++k) {
1704  const local_ordinal_type col = Indices[k];
1705  dtemp += Values[k] * y2_local[col];
1706  }
1707  y2_local[i] += DampingFactor_ * (x_local[i] - dtemp) * diag;
1708  }
1709  }
1710 
1711  // ptrdiff_t is the same size as size_t, but is signed. Being
1712  // signed is important so that i >= 0 is not trivially true.
1713  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
1714  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
1715  const scalar_type diag = d_ptr[i];
1716  size_t NumEntries;
1717  A_->getLocalRowCopy (as<local_ordinal_type> (i), Indices (), Values (), NumEntries);
1718 
1719  for (size_t m = 0; m < NumVectors; ++m) {
1720  scalar_type dtemp = STS::zero ();
1721  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
1722  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
1723 
1724  for (size_t k = 0; k < NumEntries; ++k) {
1725  const local_ordinal_type col = Indices[k];
1726  dtemp += Values[k] * y2_local[col];
1727  }
1728  y2_local[i] += DampingFactor_ * (x_local[i] - dtemp) * diag;
1729  }
1730  }
1731 
1732  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
1733  if (IsParallel_) {
1734  Tpetra::deep_copy (Y, *Y2);
1735  }
1736  }
1737  }
1738 
1739  // See flop count discussion in implementation of ApplyInverseSGS_CrsMatrix().
1740  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
1741  const double numVectors = as<double> (X.getNumVectors ());
1742  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1743  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1744  ApplyFlops_ += 2.0 * NumSweeps_ * numVectors *
1745  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1746 }
1747 
1748 
1749 template<class MatrixType>
1750 void
1752 ApplyInverseSGS_CrsMatrix (const crs_matrix_type& A,
1753  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1754  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1755 {
1756  using Teuchos::as;
1757  const Tpetra::ESweepDirection direction = Tpetra::Symmetric;
1758  if (localSmoothingIndices_.is_null ()) {
1759  A.gaussSeidelCopy (Y, X, *Diagonal_, DampingFactor_, direction,
1760  NumSweeps_, ZeroStartingSolution_);
1761  }
1762  else {
1763  A.reorderedGaussSeidelCopy (Y, X, *Diagonal_, localSmoothingIndices_ (),
1764  DampingFactor_, direction,
1765  NumSweeps_, ZeroStartingSolution_);
1766  }
1767 
1768  // For each column of output, for each sweep over the matrix:
1769  //
1770  // - One + and one * for each matrix entry
1771  // - One / and one + for each row of the matrix
1772  // - If the damping factor is not one: one * for each row of the
1773  // matrix. (It's not fair to count this if the damping factor is
1774  // one, since the implementation could skip it. Whether it does
1775  // or not is the implementation's choice.)
1776  //
1777  // Each sweep of symmetric Gauss-Seidel / SOR counts as two sweeps,
1778  // one forward and one backward.
1779 
1780  // Floating-point operations due to the damping factor, per matrix
1781  // row, per direction, per columm of output.
1782  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
1783  const double numVectors = as<double> (X.getNumVectors ());
1784  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1785  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1786  ApplyFlops_ += 2.0 * NumSweeps_ * numVectors *
1787  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1788 }
1789 
1790 template<class MatrixType>
1791 void
1793 ApplyInverseSGS_BlockCrsMatrix (const block_crs_matrix_type& A,
1794  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1795  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y)
1796 {
1797  using Teuchos::RCP;
1798  using Teuchos::rcp;
1799  using Teuchos::rcpFromRef;
1800  typedef Tpetra::Experimental::BlockMultiVector<scalar_type,
1802  typedef Tpetra::MultiVector<scalar_type,
1804 
1805  //FIXME: (tcf) 8/21/2014 -- may be problematic for multiple right hand sides
1806  //
1807  // NOTE (mfh 12 Sep 2014) I don't think it should be a problem for
1808  // multiple right-hand sides, unless the input or output MultiVector
1809  // does not have constant stride. We should check for that case
1810  // here, in case it doesn't work in localGaussSeidel (which is
1811  // entirely possible).
1812  BMV yBlock (Y, * (A.getGraph ()->getDomainMap ()), A.getBlockSize ());
1813  const BMV xBlock (X, * (A.getColMap ()), A.getBlockSize ());
1814 
1815  bool performImport = false;
1816  RCP<BMV> yBlockCol;
1817  if (Importer_.is_null ()) {
1818  yBlockCol = Teuchos::rcpFromRef (yBlock);
1819  }
1820  else {
1821  if (yBlockColumnPointMap_.is_null () ||
1822  yBlockColumnPointMap_->getNumVectors () != yBlock.getNumVectors () ||
1823  yBlockColumnPointMap_->getBlockSize () != yBlock.getBlockSize ()) {
1824  yBlockColumnPointMap_ =
1825  rcp (new BMV (* (A.getColMap ()), A.getBlockSize (),
1826  static_cast<local_ordinal_type> (yBlock.getNumVectors ())));
1827  }
1828  yBlockCol = yBlockColumnPointMap_;
1829  performImport = true;
1830  }
1831 
1832  if (ZeroStartingSolution_) {
1833  yBlockCol->putScalar (STS::zero ());
1834  }
1835  else if (performImport) {
1836  yBlockCol->doImport (yBlock, *Importer_, Tpetra::INSERT);
1837  }
1838 
1839  // FIXME (mfh 12 Sep 2014) Shouldn't this come from the user's parameter?
1840  const Tpetra::ESweepDirection direction = Tpetra::Symmetric;
1841 
1842  for (int sweep = 0; sweep < NumSweeps_; ++sweep) {
1843  if (performImport && sweep > 0) {
1844  yBlockCol->doImport (yBlock, *Importer_, Tpetra::INSERT);
1845  }
1846  A.localGaussSeidel (xBlock, *yBlockCol, blockDiag_, blockDiagFactPivots_,
1847  DampingFactor_, direction);
1848  if (performImport) {
1849  RCP<const MV> yBlockColPointDomain =
1850  yBlockCol->getMultiVectorView ().offsetView (A.getDomainMap (), 0);
1851  MV yBlockView = yBlock.getMultiVectorView ();
1852  Tpetra::deep_copy (yBlockView, *yBlockColPointDomain);
1853  }
1854  }
1855 }
1856 
1857 
1858 template<class MatrixType>
1860 {
1861  std::ostringstream os;
1862 
1863  // Output is a valid YAML dictionary in flow style. If you don't
1864  // like everything on a single line, you should call describe()
1865  // instead.
1866  os << "\"Ifpack2::Relaxation\": {";
1867 
1868  os << "Initialized: " << (isInitialized () ? "true" : "false") << ", "
1869  << "Computed: " << (isComputed () ? "true" : "false") << ", ";
1870 
1871  // It's useful to print this instance's relaxation method (Jacobi,
1872  // Gauss-Seidel, or symmetric Gauss-Seidel). If you want more info
1873  // than that, call describe() instead.
1874  os << "Type: ";
1875  if (PrecType_ == Ifpack2::Details::JACOBI) {
1876  os << "Jacobi";
1877  } else if (PrecType_ == Ifpack2::Details::GS) {
1878  os << "Gauss-Seidel";
1879  } else if (PrecType_ == Ifpack2::Details::SGS) {
1880  os << "Symmetric Gauss-Seidel";
1881  } else {
1882  os << "INVALID";
1883  }
1884 
1885  os << ", " << "sweeps: " << NumSweeps_ << ", "
1886  << "damping factor: " << DampingFactor_ << ", ";
1887  if (DoL1Method_) {
1888  os << "use l1: " << DoL1Method_ << ", "
1889  << "l1 eta: " << L1Eta_ << ", ";
1890  }
1891 
1892  if (A_.is_null ()) {
1893  os << "Matrix: null";
1894  }
1895  else {
1896  os << "Global matrix dimensions: ["
1897  << A_->getGlobalNumRows () << ", " << A_->getGlobalNumCols () << "]"
1898  << ", Global nnz: " << A_->getGlobalNumEntries();
1899  }
1900 
1901  os << "}";
1902  return os.str ();
1903 }
1904 
1905 
1906 template<class MatrixType>
1907 void
1909 describe (Teuchos::FancyOStream &out,
1910  const Teuchos::EVerbosityLevel verbLevel) const
1911 {
1912  using Teuchos::OSTab;
1913  using Teuchos::TypeNameTraits;
1914  using Teuchos::VERB_DEFAULT;
1915  using Teuchos::VERB_NONE;
1916  using Teuchos::VERB_LOW;
1917  using Teuchos::VERB_MEDIUM;
1918  using Teuchos::VERB_HIGH;
1919  using Teuchos::VERB_EXTREME;
1920  using std::endl;
1921 
1922  const Teuchos::EVerbosityLevel vl =
1923  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
1924 
1925  const int myRank = this->getComm ()->getRank ();
1926 
1927  // none: print nothing
1928  // low: print O(1) info from Proc 0
1929  // medium:
1930  // high:
1931  // extreme:
1932 
1933  if (vl != VERB_NONE && myRank == 0) {
1934  // Describable interface asks each implementation to start with a tab.
1935  OSTab tab1 (out);
1936  // Output is valid YAML; hence the quotes, to protect the colons.
1937  out << "\"Ifpack2::Relaxation\":" << endl;
1938  OSTab tab2 (out);
1939  out << "MatrixType: \"" << TypeNameTraits<MatrixType>::name () << "\""
1940  << endl;
1941  if (this->getObjectLabel () != "") {
1942  out << "Label: " << this->getObjectLabel () << endl;
1943  }
1944  out << "Initialized: " << (isInitialized () ? "true" : "false") << endl
1945  << "Computed: " << (isComputed () ? "true" : "false") << endl
1946  << "Parameters: " << endl;
1947  {
1948  OSTab tab3 (out);
1949  out << "\"relaxation: type\": ";
1950  if (PrecType_ == Ifpack2::Details::JACOBI) {
1951  out << "Jacobi";
1952  } else if (PrecType_ == Ifpack2::Details::GS) {
1953  out << "Gauss-Seidel";
1954  } else if (PrecType_ == Ifpack2::Details::SGS) {
1955  out << "Symmetric Gauss-Seidel";
1956  } else {
1957  out << "INVALID";
1958  }
1959  // We quote these parameter names because they contain colons.
1960  // YAML uses the colon to distinguish key from value.
1961  out << endl
1962  << "\"relaxation: sweeps\": " << NumSweeps_ << endl
1963  << "\"relaxation: damping factor\": " << DampingFactor_ << endl
1964  << "\"relaxation: min diagonal value\": " << MinDiagonalValue_ << endl
1965  << "\"relaxation: zero starting solution\": " << ZeroStartingSolution_ << endl
1966  << "\"relaxation: backward mode\": " << DoBackwardGS_ << endl
1967  << "\"relaxation: use l1\": " << DoL1Method_ << endl
1968  << "\"relaxation: l1 eta\": " << L1Eta_ << endl;
1969  }
1970  out << "Computed quantities:" << endl;
1971  {
1972  OSTab tab3 (out);
1973  out << "Global number of rows: " << A_->getGlobalNumRows () << endl
1974  << "Global number of columns: " << A_->getGlobalNumCols () << endl;
1975  }
1976  if (checkDiagEntries_ && isComputed ()) {
1977  out << "Properties of input diagonal entries:" << endl;
1978  {
1979  OSTab tab3 (out);
1980  out << "Magnitude of minimum-magnitude diagonal entry: "
1981  << globalMinMagDiagEntryMag_ << endl
1982  << "Magnitude of maximum-magnitude diagonal entry: "
1983  << globalMaxMagDiagEntryMag_ << endl
1984  << "Number of diagonal entries with small magnitude: "
1985  << globalNumSmallDiagEntries_ << endl
1986  << "Number of zero diagonal entries: "
1987  << globalNumZeroDiagEntries_ << endl
1988  << "Number of diagonal entries with negative real part: "
1989  << globalNumNegDiagEntries_ << endl
1990  << "Abs 2-norm diff between computed and actual inverse "
1991  << "diagonal: " << globalDiagNormDiff_ << endl;
1992  }
1993  }
1994  if (isComputed ()) {
1995  out << "Saved diagonal offsets: "
1996  << (savedDiagOffsets_ ? "true" : "false") << endl;
1997  }
1998  out << "Call counts and total times (in seconds): " << endl;
1999  {
2000  OSTab tab3 (out);
2001  out << "initialize: " << endl;
2002  {
2003  OSTab tab4 (out);
2004  out << "Call count: " << NumInitialize_ << endl;
2005  out << "Total time: " << InitializeTime_ << endl;
2006  }
2007  out << "compute: " << endl;
2008  {
2009  OSTab tab4 (out);
2010  out << "Call count: " << NumCompute_ << endl;
2011  out << "Total time: " << ComputeTime_ << endl;
2012  }
2013  out << "apply: " << endl;
2014  {
2015  OSTab tab4 (out);
2016  out << "Call count: " << NumApply_ << endl;
2017  out << "Total time: " << ApplyTime_ << endl;
2018  }
2019  }
2020  }
2021 }
2022 
2023 } // namespace Ifpack2
2024 
2025 #define IFPACK2_RELAXATION_INSTANT(S,LO,GO,N) \
2026  template class Ifpack2::Relaxation< Tpetra::RowMatrix<S, LO, GO, N> >;
2027 
2028 #endif // IFPACK2_RELAXATION_DEF_HPP
bool hasTransposeApply() const
Whether apply() and applyMat() let you apply the transpose or conjugate transpose.
Definition: Ifpack2_Relaxation_def.hpp:376
double getComputeFlops() const
Total number of floating-point operations over all calls to compute().
Definition: Ifpack2_Relaxation_def.hpp:418
double getApplyFlops() const
Total number of floating-point operations over all calls to apply().
Definition: Ifpack2_Relaxation_def.hpp:424
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object&#39;s attributes to the given output stream.
Definition: Ifpack2_Relaxation_def.hpp:1909
void compute()
Compute the preconditioner.
Definition: Ifpack2_Relaxation_def.hpp:706
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which the matrix and vectors are distributed.
Definition: Ifpack2_Relaxation_def.hpp:333
int getNumApply() const
Total number of calls to apply().
Definition: Ifpack2_Relaxation_def.hpp:394
Ifpack2 implementation details.
double getComputeTime() const
Total time in seconds spent in all calls to compute().
Definition: Ifpack2_Relaxation_def.hpp:406
MatrixType::node_type node_type
The Node type used by the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:246
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getDomainMap() const
Returns the Tpetra::Map object associated with the domain of this operator.
Definition: Ifpack2_Relaxation_def.hpp:353
void setParameters(const Teuchos::ParameterList &params)
Set the relaxation / preconditioner parameters.
Definition: Ifpack2_Relaxation_def.hpp:323
File for utility functions.
std::string description() const
A simple one-line description of this object.
Definition: Ifpack2_Relaxation_def.hpp:1859
int getNumInitialize() const
Total number of calls to initialize().
Definition: Ifpack2_Relaxation_def.hpp:382
bool isComputed() const
Return true if compute() has been called.
Definition: Ifpack2_Relaxation_decl.hpp:412
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getRangeMap() const
Returns the Tpetra::Map object associated with the range of this operator.
Definition: Ifpack2_Relaxation_def.hpp:366
MatrixType::global_ordinal_type global_ordinal_type
The type of global indices in the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:243
double getInitializeTime() const
Total time in seconds spent in all calls to initialize().
Definition: Ifpack2_Relaxation_def.hpp:400
void applyMat(const Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &X, Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS) const
Apply the preconditioner to X, returning the result in Y.
Definition: Ifpack2_Relaxation_def.hpp:523
Definition: Ifpack2_Details_Amesos2Wrapper_decl.hpp:63
Teuchos::RCP< const row_matrix_type > getMatrix() const
The matrix to be preconditioned.
Definition: Ifpack2_Relaxation_def.hpp:344
MatrixType::local_ordinal_type local_ordinal_type
The type of local indices in the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:240
MatrixType::scalar_type scalar_type
The type of the entries of the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:237
virtual ~Relaxation()
Destructor.
Definition: Ifpack2_Relaxation_def.hpp:208
int getNumCompute() const
Total number of calls to compute().
Definition: Ifpack2_Relaxation_def.hpp:388
bool isInitialized() const
Returns true if the preconditioner has been successfully initialized.
Definition: Ifpack2_Relaxation_decl.hpp:397
Relaxation preconditioners for Tpetra::RowMatrix and Tpetra::CrsMatrix sparse matrices.
Definition: Ifpack2_Relaxation_decl.hpp:222
Preconditioners and smoothers for Tpetra sparse matrices.
Definition: Ifpack2_AdditiveSchwarz_decl.hpp:72
double getApplyTime() const
Total time in seconds spent in all calls to apply().
Definition: Ifpack2_Relaxation_def.hpp:412
void initialize()
Initialize the preconditioner.
Definition: Ifpack2_Relaxation_def.hpp:544
void apply(const Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &X, Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, scalar_type alpha=Teuchos::ScalarTraits< scalar_type >::one(), scalar_type beta=Teuchos::ScalarTraits< scalar_type >::zero()) const
Apply the preconditioner to X, returning the result in Y.
Definition: Ifpack2_Relaxation_def.hpp:432
Teuchos::RCP< const Teuchos::ParameterList > getValidParameters() const
Return a list of all the parameters that this class accepts.
Definition: Ifpack2_Relaxation_def.hpp:213
Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type
The type of the magnitude (absolute value) of a matrix entry.
Definition: Ifpack2_Relaxation_decl.hpp:249