Tpetra parallel linear algebra  Version of the Day
Tpetra_Experimental_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
43 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
44 
47 
50 #include "Teuchos_TimeMonitor.hpp"
51 
52 namespace Tpetra {
53 namespace Experimental {
54 
55  template<class Scalar, class LO, class GO, class Node>
56  std::ostream&
57  BlockCrsMatrix<Scalar, LO, GO, Node>::
58  markLocalErrorAndGetStream ()
59  {
60  * (this->localError_) = true;
61  if ((*errs_).is_null ()) {
62  *errs_ = Teuchos::rcp (new std::ostringstream ());
63  }
64  return **errs_;
65  }
66 
67  template<class Scalar, class LO, class GO, class Node>
70  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
71  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
72  blockSize_ (static_cast<LO> (0)),
73  ind_ (NULL),
74  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
75  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
76  offsetPerBlock_ (0),
77  localError_ (new bool (false)),
78  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
79  {
80  }
81 
82  template<class Scalar, class LO, class GO, class Node>
85  const LO blockSize) :
86  dist_object_type (graph.getMap ()),
87  graph_ (graph),
88  rowMeshMap_ (* (graph.getRowMap ())),
89  blockSize_ (blockSize),
90  ind_ (NULL), // to be initialized below
91  val_ (NULL), // to be initialized below
92  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
93  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
94  offsetPerBlock_ (blockSize * blockSize),
95  localError_ (new bool (false)),
96  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
97  {
98  TEUCHOS_TEST_FOR_EXCEPTION(
99  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
100  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
101  "rows (isSorted() is false). This class assumes sorted rows.");
102 
103  graphRCP_ = Teuchos::rcpFromRef(graph_);
104 
105  // Trick to test whether LO is nonpositive, without a compiler
106  // warning in case LO is unsigned (which is generally a bad idea
107  // anyway). I don't promise that the trick works, but it
108  // generally does with gcc at least, in my experience.
109  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
110  TEUCHOS_TEST_FOR_EXCEPTION(
111  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
112  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
113  " <= 0. The block size must be positive.");
114 
115  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
116  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
117 
118  {
119  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
120  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
121 
122  row_map_type ptr_d = graph.getLocalGraph ().row_map;
123  // FIXME (mfh 23 Mar 2015) Once we write a Kokkos kernel for the
124  // mat-vec, we won't need a host version of this.
125  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
126  Kokkos::deep_copy (ptr_h_nc, ptr_d);
127  ptr_ = ptr_h_nc;
128  }
129  ind_ = graph.getNodePackedIndices ().getRawPtr ();
130 
131  Kokkos::resize (valView_,
132  static_cast<size_t> (graph.getNodeNumEntries () *
133  offsetPerBlock ()));
134  val_ = valView_.ptr_on_device ();
135  }
136 
137  template<class Scalar, class LO, class GO, class Node>
140  const map_type& domainPointMap,
141  const map_type& rangePointMap,
142  const LO blockSize) :
143  dist_object_type (graph.getMap ()),
144  graph_ (graph),
145  rowMeshMap_ (* (graph.getRowMap ())),
146  domainPointMap_ (domainPointMap),
147  rangePointMap_ (rangePointMap),
148  blockSize_ (blockSize),
149  ind_ (NULL), // to be initialized below
150  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
151  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
152  offsetPerBlock_ (blockSize * blockSize),
153  localError_ (new bool (false)),
154  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
155  {
156  TEUCHOS_TEST_FOR_EXCEPTION(
157  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
158  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
159  "rows (isSorted() is false). This class assumes sorted rows.");
160 
161  graphRCP_ = Teuchos::rcpFromRef(graph_);
162 
163  // Trick to test whether LO is nonpositive, without a compiler
164  // warning in case LO is unsigned (which is generally a bad idea
165  // anyway). I don't promise that the trick works, but it
166  // generally does with gcc at least, in my experience.
167  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
168  TEUCHOS_TEST_FOR_EXCEPTION(
169  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
170  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
171  " <= 0. The block size must be positive.");
172 
173  {
174  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
175  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
176 
177  row_map_type ptr_d = graph.getLocalGraph ().row_map;
178  // FIXME (mfh 23 Mar 2015) Once we write a Kokkos kernel for the
179  // mat-vec, we won't need a host version of this.
180  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
181  Kokkos::deep_copy (ptr_h_nc, ptr_d);
182  ptr_ = ptr_h_nc;
183  }
184  ind_ = graph.getNodePackedIndices ().getRawPtr ();
185 
186  Kokkos::resize (valView_,
187  static_cast<size_t> (graph.getNodeNumEntries () *
188  offsetPerBlock ()));
189  val_ = valView_.ptr_on_device ();
190  }
191 
192  template<class Scalar, class LO, class GO, class Node>
193  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
196  { // Copy constructor of map_type does a shallow copy.
197  // We're only returning by RCP for backwards compatibility.
198  return Teuchos::rcp (new map_type (domainPointMap_));
199  }
200 
201  template<class Scalar, class LO, class GO, class Node>
202  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
204  getRangeMap () const
205  { // Copy constructor of map_type does a shallow copy.
206  // We're only returning by RCP for backwards compatibility.
207  return Teuchos::rcp (new map_type (rangePointMap_));
208  }
209 
210  template<class Scalar, class LO, class GO, class Node>
211  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
213  getRowMap () const
214  {
215  return graph_.getRowMap();
216  }
217 
218  template<class Scalar, class LO, class GO, class Node>
219  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
221  getColMap () const
222  {
223  return graph_.getColMap();
224  }
225 
226  template<class Scalar, class LO, class GO, class Node>
230  {
231  return graph_.getGlobalNumRows();
232  }
233 
234  template<class Scalar, class LO, class GO, class Node>
235  size_t
238  {
239  return graph_.getNodeNumRows();
240  }
241 
242  template<class Scalar, class LO, class GO, class Node>
243  size_t
246  {
247  return graph_.getNodeMaxNumRowEntries();
248  }
249 
250  template<class Scalar, class LO, class GO, class Node>
251  void
253  apply (const mv_type& X,
254  mv_type& Y,
255  Teuchos::ETransp mode,
256  Scalar alpha,
257  Scalar beta) const
258  {
260  TEUCHOS_TEST_FOR_EXCEPTION(
261  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
262  std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::apply: "
263  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
264  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
265 
266  BMV X_view;
267  BMV Y_view;
268  const LO blockSize = getBlockSize ();
269  try {
270  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
271  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
272  }
273  catch (std::invalid_argument& e) {
274  TEUCHOS_TEST_FOR_EXCEPTION(
275  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
276  "apply: Either the input MultiVector X or the output MultiVector Y "
277  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
278  "graph. BlockMultiVector's constructor threw the following exception: "
279  << e.what ());
280  }
281 
282  try {
283  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
284  // either that or mark fields of this class as 'mutable'. The
285  // problem is that applyBlock wants to do lazy initialization of
286  // temporary block multivectors.
287  const_cast<this_type*> (this)->applyBlock (X_view, Y_view, mode, alpha, beta);
288  } catch (std::invalid_argument& e) {
289  TEUCHOS_TEST_FOR_EXCEPTION(
290  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
291  "apply: The implementation method applyBlock complained about having "
292  "an invalid argument. It reported the following: " << e.what ());
293  } catch (std::logic_error& e) {
294  TEUCHOS_TEST_FOR_EXCEPTION(
295  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
296  "apply: The implementation method applyBlock complained about a "
297  "possible bug in its implementation. It reported the following: "
298  << e.what ());
299  } catch (std::exception& e) {
300  TEUCHOS_TEST_FOR_EXCEPTION(
301  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
302  "apply: The implementation method applyBlock threw an exception which "
303  "is neither std::invalid_argument nor std::logic_error, but is a "
304  "subclass of std::exception. It reported the following: "
305  << e.what ());
306  } catch (...) {
307  TEUCHOS_TEST_FOR_EXCEPTION(
308  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
309  "apply: The implementation method applyBlock threw an exception which "
310  "is not an instance of a subclass of std::exception. This probably "
311  "indicates a bug. Please report this to the Tpetra developers.");
312  }
313  }
314 
315  template<class Scalar, class LO, class GO, class Node>
316  void
320  Teuchos::ETransp mode,
321  const Scalar alpha,
322  const Scalar beta)
323  {
324  TEUCHOS_TEST_FOR_EXCEPTION(
325  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
326  "Tpetra::Experimental::BlockCrsMatrix::applyBlock: "
327  "X and Y have different block sizes. X.getBlockSize() = "
328  << X.getBlockSize () << " != Y.getBlockSize() = "
329  << Y.getBlockSize () << ".");
330 
331  if (mode == Teuchos::NO_TRANS) {
332  applyBlockNoTrans (X, Y, alpha, beta);
333  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
334  applyBlockTrans (X, Y, mode, alpha, beta);
335  } else {
336  TEUCHOS_TEST_FOR_EXCEPTION(
337  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
338  "applyBlock: Invalid 'mode' argument. Valid values are "
339  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
340  }
341  }
342 
343  template<class Scalar, class LO, class GO, class Node>
344  void
346  setAllToScalar (const Scalar& alpha)
347  {
348  const LO numLocalMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
349  for (LO lclRow = 0; lclRow < numLocalMeshRows; ++lclRow) {
350  const size_t meshBeg = ptr_[lclRow];
351  const size_t meshEnd = ptr_[lclRow+1];
352  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
353  little_block_type A_cur = getNonConstLocalBlockFromAbsOffset (absBlkOff);
354  deep_copy (A_cur, static_cast<impl_scalar_type> (alpha));
355  }
356  }
357  }
358 
359  template<class Scalar, class LO, class GO, class Node>
360  LO
362  replaceLocalValues (const LO localRowInd,
363  const LO colInds[],
364  const Scalar vals[],
365  const LO numColInds) const
366  {
367  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
368  // We modified no values, because the input local row index is
369  // invalid on the calling process. That may not be an error, if
370  // numColInds is zero anyway; it doesn't matter. This is the
371  // advantage of returning the number of valid indices.
372  return static_cast<LO> (0);
373  }
374  const impl_scalar_type* const vIn =
375  reinterpret_cast<const impl_scalar_type*> (vals);
376  const size_t absRowBlockOffset = this->ptr_[localRowInd];
377  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
378  const LO perBlockSize = this->offsetPerBlock ();
379  LO hint = 0; // Guess for the relative offset into the current row
380  LO pointOffset = 0; // Current offset into input values
381  LO validCount = 0; // number of valid column indices in colInds
382 
383  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
384  const LO relBlockOffset =
385  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
386  if (relBlockOffset != LINV) {
387  // mfh 21 Dec 2015: Here we encode the assumption that blocks
388  // are stored contiguously, with no padding. "Contiguously"
389  // means that all memory between the first and last entries
390  // belongs to the block (no striding). "No padding" means
391  // that getBlockSize() * getBlockSize() is exactly the number
392  // of entries that the block uses. For another place where
393  // this assumption is encoded, see sumIntoLocalValues.
394 
395  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
396  // little_block_type A_old =
397  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
398  impl_scalar_type* const A_old = val_ + absBlockOffset * perBlockSize;
399  // const_little_block_type A_new =
400  // getConstLocalBlockFromInput (vIn, pointOffset);
401  const impl_scalar_type* const A_new = vIn + pointOffset;
402  // COPY (A_new, A_old);
403  for (LO i = 0; i < perBlockSize; ++i) {
404  A_old[i] = A_new[i];
405  }
406  hint = relBlockOffset + 1;
407  ++validCount;
408  }
409  }
410  return validCount;
411  }
412 
413  template <class Scalar, class LO, class GO, class Node>
414  void
416  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
417  {
418  const char tfecfFuncName[] =
419  "Tpetra::BlockCrsMatrix::getLocalDiagOffsets: ";
420  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
421  (! graph_.hasColMap (), std::runtime_error,
422  "The matrix's graph does not yet have a column Map.");
423 
424  // FIXME (mfh 13 Dec 2015) This method currently only works if the
425  // graph is locally indexed. We check "globally indexed" rather
426  // than "not locally indexed," because currently, "neither locally
427  // nor globally indexed" means that the graph is empty on the
428  // calling process.
429  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
430  (graph_.isGloballyIndexed (), std::runtime_error,
431  "The matrix's graph is globally indexed.");
432 
433  const map_type& rowMap = * (graph_.getRowMap ());
434  const map_type& colMap = * (graph_.getColMap ());
435 
436  const LO lclNumRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
437  if (static_cast<LO> (offsets.size ()) != lclNumRows) {
438  offsets.resize (lclNumRows);
439  }
440 
441 #ifdef HAVE_TPETRA_DEBUG
442  bool allRowMapDiagEntriesInColMap = true;
443  bool allDiagEntriesFound = true;
444  bool allOffsetsCorrect = true;
445  bool noOtherWeirdness = true;
446  std::vector<std::pair<LO, size_t> > wrongOffsets;
447  auto localGraph = this->graph_.getLocalGraph ();
448 #endif // HAVE_TPETRA_DEBUG
449 
450  for (LO lclRowInd = 0; lclRowInd < lclNumRows; ++lclRowInd) {
451  const GO gblRowInd = rowMap.getGlobalElement (lclRowInd);
452  const GO gblColInd = gblRowInd;
453  const LO lclColInd = colMap.getLocalElement (gblColInd);
454 
455  if (lclColInd == Teuchos::OrdinalTraits<LO>::invalid ()) {
456 #ifdef HAVE_TPETRA_DEBUG
457  allRowMapDiagEntriesInColMap = false;
458 #endif // HAVE_TPETRA_DEBUG
459  offsets[lclRowInd] = Teuchos::OrdinalTraits<size_t>::invalid ();
460  }
461  else {
462  const RowInfo rowInfo = graph_.getRowInfo (lclRowInd);
463  if (static_cast<LO> (rowInfo.localRow) == lclRowInd &&
464  rowInfo.numEntries > 0) {
465  const size_t offset = graph_.findLocalIndex (rowInfo, lclColInd);
466  offsets[lclRowInd] = offset;
467 
468 #ifdef HAVE_TPETRA_DEBUG
469  // Now that we have what we think is an offset, make sure
470  // that it really does point to the diagonal entry. Offsets
471  // are _relative_ to each row, not absolute (for the whole
472  // (local) matrix).
473  const LO* lclColInds = NULL;
474  Scalar* lclVals = NULL;
475  LO numEnt = 0;
476  LO err = this->getLocalRowView (lclRowInd, lclColInds, lclVals, numEnt);
477  if (err != 0) {
478  noOtherWeirdness = false;
479  }
480  else { // err = 0; view is correct
481  if (offset >= static_cast<size_t> (numEnt)) {
482  // Offsets are relative to each row, so this means that
483  // the offset is out of bounds.
484  allOffsetsCorrect = false;
485  wrongOffsets.push_back (std::make_pair (lclRowInd, offset));
486  } else {
487  const LO actualLclColInd = lclColInds[offset];
488  const GO actualGblColInd = colMap.getGlobalElement (actualLclColInd);
489  if (actualGblColInd != gblColInd) {
490  allOffsetsCorrect = false;
491  wrongOffsets.push_back (std::make_pair (lclRowInd, offset));
492  }
493  }
494  }
495 #endif // HAVE_TPETRA_DEBUG
496  }
497  else {
498  offsets[lclRowInd] = Teuchos::OrdinalTraits<size_t>::invalid ();
499 #ifdef HAVE_TPETRA_DEBUG
500  allDiagEntriesFound = false;
501 #endif // HAVE_TPETRA_DEBUG
502  }
503  }
504  }
505 
506 #ifdef HAVE_TPETRA_DEBUG
507  if (wrongOffsets.size () != 0) {
508  std::ostringstream os;
509  os << "Proc " << this->getComm ()->getRank () << ": Wrong offsets: [";
510  for (size_t k = 0; k < wrongOffsets.size (); ++k) {
511  os << "(" << wrongOffsets[k].first << ","
512  << wrongOffsets[k].second << ")";
513  if (k + 1 < wrongOffsets.size ()) {
514  os << ", ";
515  }
516  }
517  os << "]" << std::endl;
518  std::cerr << os.str ();
519  }
520 #endif // HAVE_TPETRA_DEBUG
521 
522 #ifdef HAVE_TPETRA_DEBUG
523  using Teuchos::reduceAll;
524  using std::endl;
525  const bool localSuccess =
526  allRowMapDiagEntriesInColMap && allDiagEntriesFound && allOffsetsCorrect;
527  const int numResults = 5;
528  int lclResults[5];
529  lclResults[0] = allRowMapDiagEntriesInColMap ? 1 : 0;
530  lclResults[1] = allDiagEntriesFound ? 1 : 0;
531  lclResults[2] = allOffsetsCorrect ? 1 : 0;
532  lclResults[3] = noOtherWeirdness ? 1 : 0;
533  // min-all-reduce will compute least rank of all the processes
534  // that didn't succeed.
535  lclResults[4] =
536  ! localSuccess ? getComm ()->getRank () : getComm ()->getSize ();
537 
538  int gblResults[5];
539  gblResults[0] = 0;
540  gblResults[1] = 0;
541  gblResults[2] = 0;
542  gblResults[3] = 0;
543  gblResults[4] = 0;
544  reduceAll<int, int> (* (this->getComm ()), Teuchos::REDUCE_MIN,
545  numResults, lclResults, gblResults);
546 
547  if (gblResults[0] != 1 || gblResults[1] != 1 || gblResults[2] != 1
548  || gblResults[3] != 1) {
549  std::ostringstream os; // build error message
550  os << "Issue(s) that we noticed (on Process " << gblResults[4] << ", "
551  "possibly among others): " << endl;
552  if (gblResults[0] == 0) {
553  os << " - The column Map does not contain at least one diagonal entry "
554  "of the matrix." << endl;
555  }
556  if (gblResults[1] == 0) {
557  os << " - On one or more processes, some row does not contain a "
558  "diagonal entry." << endl;
559  }
560  if (gblResults[2] == 0) {
561  os << " - On one or more processes, some offsets are incorrect."
562  << endl;
563  }
564  if (gblResults[3] == 0) {
565  os << " - One or more processes had some other error."
566  << endl;
567  }
568  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error, os.str());
569  }
570 #endif // HAVE_TPETRA_DEBUG
571  }
572 
573  template <class Scalar, class LO, class GO, class Node>
574  void
578  const Kokkos::View<impl_scalar_type***, device_type,
579  Kokkos::MemoryUnmanaged>& factoredDiagonal,
580  const Kokkos::View<int**, device_type,
581  Kokkos::MemoryUnmanaged>& factorizationPivots,
582  const Scalar& omega,
583  const ESweepDirection direction) const
584  {
585  using Kokkos::ALL;
586 
587  const LO numLocalMeshRows =
588  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
589  const LO numVecs = static_cast<LO> (X.getNumVectors ());
590 
591  // If using (new) Kokkos, replace localMem with thread-local
592  // memory. Note that for larger block sizes, this will affect the
593  // two-level parallelization. Look to Stokhos for best practice
594  // on making this fast for GPUs.
595  const LO blockSize = getBlockSize ();
596  Teuchos::Array<impl_scalar_type> localMem (blockSize);
597  Teuchos::Array<impl_scalar_type> localMat (blockSize*blockSize);
598  little_vec_type X_lcl (localMem.getRawPtr (), blockSize, 1);
599 
600  // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
601  LO rowBegin = 0, rowEnd = 0, rowStride = 0;
602  if (direction == Forward) {
603  rowBegin = 1;
604  rowEnd = numLocalMeshRows+1;
605  rowStride = 1;
606  }
607  else if (direction == Backward) {
608  rowBegin = numLocalMeshRows;
609  rowEnd = 0;
610  rowStride = -1;
611  }
612  else if (direction == Symmetric) {
613  this->localGaussSeidel (B, X, factoredDiagonal, factorizationPivots, omega, Forward);
614  this->localGaussSeidel (B, X, factoredDiagonal, factorizationPivots, omega, Backward);
615  return;
616  }
617 
618  const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
619  const Scalar minus_omega = -omega;
620 
621  if (numVecs == 1) {
622  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
623  const LO actlRow = lclRow - 1;
624 
625  little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
626  COPY (B_cur, X_lcl);
627  SCAL (omega, X_lcl);
628 
629  const size_t meshBeg = ptr_[actlRow];
630  const size_t meshEnd = ptr_[actlRow+1];
631  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
632  const LO meshCol = ind_[absBlkOff];
634  getConstLocalBlockFromAbsOffset (absBlkOff);
635 
636  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
637 
638  // X_lcl += alpha*A_cur*X_cur
639  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
640  //X_lcl.matvecUpdate (alpha, A_cur, X_cur);
641  GEMV (alpha, A_cur, X_cur, X_lcl);
642  } // for each entry in the current local row of the matrix
643 
644  // FIXME (mfh 16 Dec 2015) Get an unmanaged subview of
645  // factoredDiagonal BEFORE getting its subview! This will
646  // avoid reference counting overhead, which introduces a
647  // scalability bottleneck.
648  auto D_lcl = Kokkos::subview (factoredDiagonal, actlRow, ALL (), ALL ());
649  auto ipiv = Kokkos::subview (factorizationPivots, actlRow, ALL ());
650  int info = 0;
651  GETRS ("N", D_lcl, ipiv, X_lcl, info);
652 
653  little_vec_type X_update = X.getLocalBlock (actlRow, 0);
654  COPY (X_lcl, X_update);
655  } // for each local row of the matrix
656  }
657  else {
658  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
659  for (LO j = 0; j < numVecs; ++j) {
660  LO actlRow = lclRow-1;
661 
662  little_vec_type B_cur = B.getLocalBlock (actlRow, j);
663  COPY (B_cur, X_lcl);
664  SCAL (omega, X_lcl);
665 
666  const size_t meshBeg = ptr_[actlRow];
667  const size_t meshEnd = ptr_[actlRow+1];
668  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
669  const LO meshCol = ind_[absBlkOff];
671  getConstLocalBlockFromAbsOffset (absBlkOff);
672 
673  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
674 
675  // X_lcl += alpha*A_cur*X_cur
676  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
677  //X_lcl.matvecUpdate (alpha, A_cur, X_cur);
678  GEMV (alpha, A_cur, X_cur, X_lcl);
679  } // for each entry in the current local row of the matrx
680 
681  // FIXME (mfh 16 Dec 2015) Get an unmanaged subview of
682  // factoredDiagonal BEFORE getting its subview! This will
683  // avoid reference counting overhead, which introduces a
684  // scalability bottleneck.
685  auto D_lcl = Kokkos::subview (factoredDiagonal, actlRow, ALL (), ALL ());
686  auto ipiv = Kokkos::subview (factorizationPivots, actlRow, ALL ());
687  int info = 0;
688  GETRS ("N", D_lcl, ipiv, X_lcl, info);
689 
690  little_vec_type X_update = X.getLocalBlock (actlRow, j);
691  COPY (X_lcl, X_update);
692  } // for each entry in the current local row of the matrix
693  } // for each local row of the matrix
694  }
695  }
696 
697  template <class Scalar, class LO, class GO, class Node>
698  void
703  const Scalar& dampingFactor,
704  const ESweepDirection direction,
705  const int numSweeps,
706  const bool zeroInitialGuess) const
707  {
708  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
709  // interface for block Gauss-Seidel.
710  TEUCHOS_TEST_FOR_EXCEPTION(
711  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
712  "gaussSeidelCopy: Not implemented.");
713  }
714 
715  template <class Scalar, class LO, class GO, class Node>
716  void
721  const Teuchos::ArrayView<LO>& rowIndices,
722  const Scalar& dampingFactor,
723  const ESweepDirection direction,
724  const int numSweeps,
725  const bool zeroInitialGuess) const
726  {
727  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
728  // interface for block Gauss-Seidel.
729  TEUCHOS_TEST_FOR_EXCEPTION(
730  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
731  "reorderedGaussSeidelCopy: Not implemented.");
732  }
733 
734  template <class Scalar, class LO, class GO, class Node>
735  void
738  const Teuchos::ArrayView<const size_t>& offsets) const
739  {
740  using Teuchos::ArrayRCP;
741  using Teuchos::ArrayView;
742  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
743 
744  const size_t myNumRows = rowMeshMap_.getNodeNumElements();
745  const LO* columnIndices;
746  Scalar* vals;
747  LO numColumns;
748  Teuchos::Array<LO> cols(1);
749 
750  // FIXME (mfh 12 Aug 2014) Should use a "little block" for this instead.
751  Teuchos::Array<Scalar> zeroMat (blockSize_*blockSize_, ZERO);
752  for (size_t i = 0; i < myNumRows; ++i) {
753  cols[0] = i;
754  if (offsets[i] == Teuchos::OrdinalTraits<size_t>::invalid ()) {
755  diag.replaceLocalValues (i, cols.getRawPtr (), zeroMat.getRawPtr (), 1);
756  }
757  else {
758  getLocalRowView (i, columnIndices, vals, numColumns);
759  diag.replaceLocalValues (i, cols.getRawPtr(), &vals[offsets[i]*blockSize_*blockSize_], 1);
760  }
761  }
762  }
763 
764 
765  template <class Scalar, class LO, class GO, class Node>
766  void
769  Kokkos::MemoryUnmanaged>& diag,
770  const Teuchos::ArrayView<const size_t>& offsets) const
771  {
772  using Kokkos::ALL;
773  using Kokkos::parallel_for;
774  typedef typename Kokkos::View<impl_scalar_type***, device_type,
775  Kokkos::MemoryUnmanaged>::HostMirror::execution_space host_exec_space;
776 
777  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
778  const LO blockSize = this->getBlockSize ();
779  TEUCHOS_TEST_FOR_EXCEPTION
780  (static_cast<LO> (diag.dimension_0 ()) < lclNumMeshRows ||
781  static_cast<LO> (diag.dimension_1 ()) < blockSize ||
782  static_cast<LO> (diag.dimension_2 ()) < blockSize,
783  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
784  "The input Kokkos::View is not big enough to hold all the data.");
785  TEUCHOS_TEST_FOR_EXCEPTION
786  (static_cast<LO> (offsets.size ()) < lclNumMeshRows,
787  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
788  "offsets.size() = " << offsets.size () << " < local number of diagonal "
789  "blocks " << lclNumMeshRows << ".");
790 
791  // mfh 12 Dec 2015: Use the host execution space, since we haven't
792  // quite made everything work with CUDA yet.
793  typedef Kokkos::RangePolicy<host_exec_space, LO> policy_type;
794  parallel_for (policy_type (0, lclNumMeshRows), [=] (const LO& lclMeshRow) {
795  auto D_in = this->getConstLocalBlockFromRelOffset (lclMeshRow, offsets[lclMeshRow]);
796  auto D_out = Kokkos::subview (diag, lclMeshRow, ALL (), ALL ());
797  COPY (D_in, D_out);
798  });
799  }
800 
801 
802  template<class Scalar, class LO, class GO, class Node>
803  LO
805  absMaxLocalValues (const LO localRowInd,
806  const LO colInds[],
807  const Scalar vals[],
808  const LO numColInds) const
809  {
810  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
811  // We modified no values, because the input local row index is
812  // invalid on the calling process. That may not be an error, if
813  // numColInds is zero anyway; it doesn't matter. This is the
814  // advantage of returning the number of valid indices.
815  return static_cast<LO> (0);
816  }
817  const impl_scalar_type* const vIn =
818  reinterpret_cast<const impl_scalar_type*> (vals);
819  const size_t absRowBlockOffset = ptr_[localRowInd];
820  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
821  const LO perBlockSize = this->offsetPerBlock ();
822  LO hint = 0; // Guess for the relative offset into the current row
823  LO pointOffset = 0; // Current offset into input values
824  LO validCount = 0; // number of valid column indices in colInds
825 
826  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
827  const LO relBlockOffset =
828  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
829  if (relBlockOffset != LINV) {
830  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
831  little_block_type A_old =
832  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
834  getConstLocalBlockFromInput (vIn, pointOffset);
835 
836  Impl::absMax (A_old, A_new);
837  hint = relBlockOffset + 1;
838  ++validCount;
839  }
840  }
841  return validCount;
842  }
843 
844 
845  template<class Scalar, class LO, class GO, class Node>
846  LO
848  sumIntoLocalValues (const LO localRowInd,
849  const LO colInds[],
850  const Scalar vals[],
851  const LO numColInds) const
852  {
853  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
854  // We modified no values, because the input local row index is
855  // invalid on the calling process. That may not be an error, if
856  // numColInds is zero anyway; it doesn't matter. This is the
857  // advantage of returning the number of valid indices.
858  return static_cast<LO> (0);
859  }
860  //const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
861  const impl_scalar_type* const vIn =
862  reinterpret_cast<const impl_scalar_type*> (vals);
863  const size_t absRowBlockOffset = this->ptr_[localRowInd];
864  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
865  const LO perBlockSize = this->offsetPerBlock ();
866  LO hint = 0; // Guess for the relative offset into the current row
867  LO pointOffset = 0; // Current offset into input values
868  LO validCount = 0; // number of valid column indices in colInds
869 
870  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
871  const LO relBlockOffset =
872  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
873  if (relBlockOffset != LINV) {
874  // mfh 21 Dec 2015: Here we encode the assumption that blocks
875  // are stored contiguously, with no padding. "Contiguously"
876  // means that all memory between the first and last entries
877  // belongs to the block (no striding). "No padding" means
878  // that getBlockSize() * getBlockSize() is exactly the number
879  // of entries that the block uses. For another place where
880  // this assumption is encoded, see replaceLocalValues.
881 
882  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
883  // little_block_type A_old =
884  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
885  impl_scalar_type* const A_old = val_ + absBlockOffset * perBlockSize;
886  // const_little_block_type A_new =
887  // getConstLocalBlockFromInput (vIn, pointOffset);
888  const impl_scalar_type* const A_new = vIn + pointOffset;
889  // AXPY (ONE, A_new, A_old);
890  for (LO i = 0; i < perBlockSize; ++i) {
891  A_old[i] += A_new[i];
892  }
893  hint = relBlockOffset + 1;
894  ++validCount;
895  }
896  }
897  return validCount;
898  }
899 
900  template<class Scalar, class LO, class GO, class Node>
901  LO
903  getLocalRowView (const LO localRowInd,
904  const LO*& colInds,
905  Scalar*& vals,
906  LO& numInds) const
907  {
908  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
909  colInds = NULL;
910  vals = NULL;
911  numInds = 0;
912  return Teuchos::OrdinalTraits<LO>::invalid ();
913  }
914  else {
915  const size_t absBlockOffsetStart = ptr_[localRowInd];
916  colInds = ind_ + absBlockOffsetStart;
917 
918  impl_scalar_type* const vOut = val_ + absBlockOffsetStart * offsetPerBlock ();
919  vals = reinterpret_cast<Scalar*> (vOut);
920 
921  numInds = ptr_[localRowInd + 1] - absBlockOffsetStart;
922  return 0; // indicates no error
923  }
924  }
925 
926  template<class Scalar, class LO, class GO, class Node>
927  void
929  getLocalRowCopy (LO LocalRow,
930  const Teuchos::ArrayView<LO>& Indices,
931  const Teuchos::ArrayView<Scalar>& Values,
932  size_t &NumEntries) const
933  {
934  const LO *colInds;
935  Scalar *vals;
936  LO numInds;
937  getLocalRowView(LocalRow,colInds,vals,numInds);
938  if (numInds > Indices.size() || numInds*blockSize_*blockSize_ > Values.size()) {
939  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
940  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
941  << numInds << " row entries");
942  }
943  for (LO i=0; i<numInds; ++i) {
944  Indices[i] = colInds[i];
945  }
946  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
947  Values[i] = vals[i];
948  }
949  NumEntries = numInds;
950  }
951 
952  template<class Scalar, class LO, class GO, class Node>
953  LO
955  getLocalRowOffsets (const LO localRowInd,
956  ptrdiff_t offsets[],
957  const LO colInds[],
958  const LO numColInds) const
959  {
960  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
961  // We got no offsets, because the input local row index is
962  // invalid on the calling process. That may not be an error, if
963  // numColInds is zero anyway; it doesn't matter. This is the
964  // advantage of returning the number of valid indices.
965  return static_cast<LO> (0);
966  }
967 
968  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
969  LO hint = 0; // Guess for the relative offset into the current row
970  LO validCount = 0; // number of valid column indices in colInds
971 
972  for (LO k = 0; k < numColInds; ++k) {
973  const LO relBlockOffset =
974  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
975  if (relBlockOffset != LINV) {
976  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
977  hint = relBlockOffset + 1;
978  ++validCount;
979  }
980  }
981  return validCount;
982  }
983 
984 
985  template<class Scalar, class LO, class GO, class Node>
986  LO
988  replaceLocalValuesByOffsets (const LO localRowInd,
989  const ptrdiff_t offsets[],
990  const Scalar vals[],
991  const LO numOffsets) const
992  {
993  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
994  // We modified no values, because the input local row index is
995  // invalid on the calling process. That may not be an error, if
996  // numColInds is zero anyway; it doesn't matter. This is the
997  // advantage of returning the number of valid indices.
998  return static_cast<LO> (0);
999  }
1000  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1001 
1002  const size_t absRowBlockOffset = ptr_[localRowInd];
1003  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1004  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1005  size_t pointOffset = 0; // Current offset into input values
1006  LO validCount = 0; // number of valid offsets
1007 
1008  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1009  const size_t relBlockOffset = offsets[k];
1010  if (relBlockOffset != STINV) {
1011  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1012  little_block_type A_old =
1013  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1014  const_little_block_type A_new =
1015  getConstLocalBlockFromInput (vIn, pointOffset);
1016  COPY (A_new, A_old);
1017  ++validCount;
1018  }
1019  }
1020  return validCount;
1021  }
1022 
1023 
1024  template<class Scalar, class LO, class GO, class Node>
1025  LO
1027  absMaxLocalValuesByOffsets (const LO localRowInd,
1028  const ptrdiff_t offsets[],
1029  const Scalar vals[],
1030  const LO numOffsets) const
1031  {
1032  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1033  // We modified no values, because the input local row index is
1034  // invalid on the calling process. That may not be an error, if
1035  // numColInds is zero anyway; it doesn't matter. This is the
1036  // advantage of returning the number of valid indices.
1037  return static_cast<LO> (0);
1038  }
1039  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1040 
1041  const size_t absRowBlockOffset = ptr_[localRowInd];
1042  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1043  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1044  size_t pointOffset = 0; // Current offset into input values
1045  LO validCount = 0; // number of valid offsets
1046 
1047  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1048  const size_t relBlockOffset = offsets[k];
1049  if (relBlockOffset != STINV) {
1050  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1051  little_block_type A_old =
1052  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1053  const_little_block_type A_new =
1054  getConstLocalBlockFromInput (vIn, pointOffset);
1055  Impl::absMax (A_old, A_new);
1056  ++validCount;
1057  }
1058  }
1059  return validCount;
1060  }
1061 
1062 
1063  template<class Scalar, class LO, class GO, class Node>
1064  LO
1066  sumIntoLocalValuesByOffsets (const LO localRowInd,
1067  const ptrdiff_t offsets[],
1068  const Scalar vals[],
1069  const LO numOffsets) const
1070  {
1071  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1072  // We modified no values, because the input local row index is
1073  // invalid on the calling process. That may not be an error, if
1074  // numColInds is zero anyway; it doesn't matter. This is the
1075  // advantage of returning the number of valid indices.
1076  return static_cast<LO> (0);
1077  }
1078  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1079  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1080 
1081  const size_t absRowBlockOffset = ptr_[localRowInd];
1082  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1083  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1084  size_t pointOffset = 0; // Current offset into input values
1085  LO validCount = 0; // number of valid offsets
1086 
1087  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1088  const size_t relBlockOffset = offsets[k];
1089  if (relBlockOffset != STINV) {
1090  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1091  little_block_type A_old =
1092  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1093  const_little_block_type A_new =
1094  getConstLocalBlockFromInput (vIn, pointOffset);
1095  //A_old.update (ONE, A_new);
1096  AXPY (ONE, A_new, A_old);
1097  ++validCount;
1098  }
1099  }
1100  return validCount;
1101  }
1102 
1103 
1104  template<class Scalar, class LO, class GO, class Node>
1105  size_t
1107  getNumEntriesInLocalRow (const LO localRowInd) const
1108  {
1109  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1110  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1111  return static_cast<LO> (0); // the calling process doesn't have that row
1112  } else {
1113  return static_cast<LO> (numEntInGraph);
1114  }
1115  }
1116 
1117  template<class Scalar, class LO, class GO, class Node>
1118  void
1122  const Teuchos::ETransp mode,
1123  const Scalar alpha,
1124  const Scalar beta)
1125  {
1126  (void) X;
1127  (void) Y;
1128  (void) mode;
1129  (void) alpha;
1130  (void) beta;
1131 
1132  TEUCHOS_TEST_FOR_EXCEPTION(
1133  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::apply: "
1134  "transpose and conjugate transpose modes are not yet implemented.");
1135  }
1136 
1137  template<class Scalar, class LO, class GO, class Node>
1138  void
1142  const Scalar alpha,
1143  const Scalar beta)
1144  {
1145  using Teuchos::RCP;
1146  using Teuchos::rcp;
1147  typedef Tpetra::Import<LO, GO, Node> import_type;
1148  typedef Tpetra::Export<LO, GO, Node> export_type;
1149  const Scalar zero = STS::zero ();
1150  const Scalar one = STS::one ();
1151  RCP<const import_type> import = graph_.getImporter ();
1152  // "export" is a reserved C++ keyword, so we can't use it.
1153  RCP<const export_type> theExport = graph_.getExporter ();
1154 
1155  // FIXME (mfh 20 May 2014) X.mv_ and Y.mv_ requires a friend
1156  // declaration, which is useful only for debugging.
1157  TEUCHOS_TEST_FOR_EXCEPTION(
1158  X.mv_.getCopyOrView () != Teuchos::View, std::invalid_argument,
1159  "Tpetra::Experimental::BlockCrsMatrix::applyBlockNoTrans: "
1160  "The input BlockMultiVector X has deep copy semantics, "
1161  "not view semantics (as it should).");
1162  TEUCHOS_TEST_FOR_EXCEPTION(
1163  Y.mv_.getCopyOrView () != Teuchos::View, std::invalid_argument,
1164  "Tpetra::Experimental::BlockCrsMatrix::applyBlockNoTrans: "
1165  "The output BlockMultiVector Y has deep copy semantics, "
1166  "not view semantics (as it should).");
1167 
1168  if (alpha == zero) {
1169  if (beta == zero) {
1170  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1171  } else if (beta != one) {
1172  Y.scale (beta);
1173  }
1174  } else { // alpha != 0
1175  const BMV* X_colMap = NULL;
1176  if (import.is_null ()) {
1177  try {
1178  X_colMap = &X;
1179  } catch (std::exception& e) {
1180  TEUCHOS_TEST_FOR_EXCEPTION(
1181  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1182  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1183  "operator= threw an exception: " << std::endl << e.what ());
1184  }
1185  } else {
1186  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1187  // Y_rowMap_ below. This lets us do lazy initialization
1188  // correctly with view semantics of BlockCrsMatrix. All views
1189  // of this BlockCrsMatrix have the same outer pointer. That
1190  // way, we can set the inner pointer in one view, and all
1191  // other views will see it.
1192  if ((*X_colMap_).is_null () ||
1193  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1194  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1195  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1196  static_cast<LO> (X.getNumVectors ())));
1197  }
1198  (**X_colMap_).doImport (X, *import, Tpetra::REPLACE);
1199  try {
1200  X_colMap = &(**X_colMap_);
1201  } catch (std::exception& e) {
1202  TEUCHOS_TEST_FOR_EXCEPTION(
1203  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1204  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1205  "operator= threw an exception: " << std::endl << e.what ());
1206  }
1207  }
1208 
1209  BMV* Y_rowMap = NULL;
1210  if (theExport.is_null ()) {
1211  Y_rowMap = &Y;
1212  } else if ((*Y_rowMap_).is_null () ||
1213  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1214  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1215  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1216  static_cast<LO> (X.getNumVectors ())));
1217  try {
1218  Y_rowMap = &(**Y_rowMap_);
1219  } catch (std::exception& e) {
1220  TEUCHOS_TEST_FOR_EXCEPTION(
1221  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1222  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1223  "operator= threw an exception: " << std::endl << e.what ());
1224  }
1225  }
1226 
1227  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1228 
1229  if (! theExport.is_null ()) {
1230  Y.doExport (*Y_rowMap, *theExport, Tpetra::REPLACE);
1231  }
1232  }
1233  }
1234 
1235  template<class Scalar, class LO, class GO, class Node>
1236  void
1240  const Scalar alpha,
1241  const Scalar beta)
1242  {
1243  // If using (new) Kokkos, prefer Kokkos::ArithTraits to
1244  // Teuchos::ScalarTraits.
1245  const Scalar zero = STS::zero ();
1246  const Scalar one = STS::one ();
1247  const LO numLocalMeshRows =
1248  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1249  const LO numVecs = static_cast<LO> (X.getNumVectors ());
1250 
1251  // If using (new) Kokkos, replace localMem with thread-local
1252  // memory. Note that for larger block sizes, this will affect the
1253  // two-level parallelization. Look to Stokhos for best practice
1254  // on making this fast for GPUs.
1255  const LO blockSize = getBlockSize ();
1256  Teuchos::Array<impl_scalar_type> localMem (blockSize);
1257  little_vec_type Y_lcl (localMem.getRawPtr (), blockSize, 1);
1258 
1259  if (numVecs == 1) {
1260  for (LO lclRow = 0; lclRow < numLocalMeshRows; ++lclRow) {
1261  little_vec_type Y_cur = Y.getLocalBlock (lclRow, 0);
1262 
1263  if (beta == zero) {
1264  deep_copy (Y_lcl, zero);
1265  } else if (beta == one) {
1266  COPY (Y_cur, Y_lcl);
1267  } else {
1268  COPY (Y_cur, Y_lcl);
1269  SCAL (beta, Y_lcl);
1270  }
1271 
1272  const size_t meshBeg = ptr_[lclRow];
1273  const size_t meshEnd = ptr_[lclRow+1];
1274  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1275  const LO meshCol = ind_[absBlkOff];
1276  const_little_block_type A_cur =
1277  getConstLocalBlockFromAbsOffset (absBlkOff);
1278  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
1279  // Y_lcl += alpha*A_cur*X_cur
1280  //Y_lcl.matvecUpdate (alpha, A_cur, X_cur);
1281  GEMV (alpha, A_cur, X_cur, Y_lcl);
1282  } // for each entry in the current local row of the matrx
1283 
1284  COPY (Y_lcl, Y_cur);
1285  } // for each local row of the matrix
1286  }
1287  else {
1288  for (LO lclRow = 0; lclRow < numLocalMeshRows; ++lclRow) {
1289  for (LO j = 0; j < numVecs; ++j) {
1290  little_vec_type Y_cur = Y.getLocalBlock (lclRow, j);
1291 
1292  if (beta == zero) {
1293  deep_copy (Y_lcl, zero);
1294  } else if (beta == one) {
1295  COPY (Y_cur, Y_lcl);
1296  } else {
1297  COPY (Y_cur, Y_lcl);
1298  SCAL (beta, Y_lcl);
1299  }
1300 
1301  const size_t meshBeg = ptr_[lclRow];
1302  const size_t meshEnd = ptr_[lclRow+1];
1303  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1304  const LO meshCol = ind_[absBlkOff];
1305  const_little_block_type A_cur =
1306  getConstLocalBlockFromAbsOffset (absBlkOff);
1307  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
1308  // Y_lcl += alpha*A_cur*X_cur
1309  //Y_lcl.matvecUpdate (alpha, A_cur, X_cur);
1310  GEMV (alpha, A_cur, X_cur, Y_lcl);
1311  } // for each entry in the current local row of the matrix
1312 
1313  COPY (Y_lcl, Y_cur);
1314  } // for each entry in the current row of Y
1315  } // for each local row of the matrix
1316  }
1317  }
1318 
1319  template<class Scalar, class LO, class GO, class Node>
1320  LO
1322  findRelOffsetOfColumnIndex (const LO localRowIndex,
1323  const LO colIndexToFind,
1324  const LO hint) const
1325  {
1326  const size_t absStartOffset = ptr_[localRowIndex];
1327  const size_t absEndOffset = ptr_[localRowIndex+1];
1328  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1329  // Amortize pointer arithmetic over the search loop.
1330  const LO* const curInd = ind_ + absStartOffset;
1331 
1332  // If the hint was correct, then the hint is the offset to return.
1333  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1334  // Always return the offset relative to the current row.
1335  return hint;
1336  }
1337 
1338  // The hint was wrong, so we must search for the given column
1339  // index in the column indices for the given row.
1340  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1341 
1342  // We require that the graph have sorted rows. However, binary
1343  // search only pays if the current row is longer than a certain
1344  // amount. We set this to 32, but you might want to tune this.
1345  const LO maxNumEntriesForLinearSearch = 32;
1346  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1347  // Use binary search. It would probably be better for us to
1348  // roll this loop by hand. If we wrote it right, a smart
1349  // compiler could perhaps use conditional loads and avoid
1350  // branches (according to Jed Brown on May 2014).
1351  const LO* beg = curInd;
1352  const LO* end = curInd + numEntriesInRow;
1353  std::pair<const LO*, const LO*> p =
1354  std::equal_range (beg, end, colIndexToFind);
1355  if (p.first != p.second) {
1356  // offset is relative to the current row
1357  relOffset = static_cast<LO> (p.first - beg);
1358  }
1359  }
1360  else { // use linear search
1361  for (LO k = 0; k < numEntriesInRow; ++k) {
1362  if (colIndexToFind == curInd[k]) {
1363  relOffset = k; // offset is relative to the current row
1364  break;
1365  }
1366  }
1367  }
1368 
1369  return relOffset;
1370  }
1371 
1372  template<class Scalar, class LO, class GO, class Node>
1373  LO
1375  offsetPerBlock () const
1376  {
1377  return offsetPerBlock_;
1378  }
1379 
1380  template<class Scalar, class LO, class GO, class Node>
1384  const size_t pointOffset) const
1385  {
1386  // Row major blocks
1387  const LO rowStride = blockSize_;
1388  return const_little_block_type (val + pointOffset, blockSize_, rowStride, 1);
1389  }
1390 
1391  template<class Scalar, class LO, class GO, class Node>
1395  const size_t pointOffset) const
1396  {
1397  // Row major blocks
1398  const LO rowStride = blockSize_;
1399  return little_block_type (val + pointOffset, blockSize_, rowStride, 1);
1400  }
1401 
1402  template<class Scalar, class LO, class GO, class Node>
1405  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1406  {
1407  if (absBlockOffset >= ptr_[rowMeshMap_.getNodeNumElements ()]) {
1408  // An empty block signifies an error. We don't expect to see
1409  // this error in correct code, but it's helpful for avoiding
1410  // memory corruption in case there is a bug.
1411  return const_little_block_type (NULL, 0, 0, 0);
1412  } else {
1413  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
1414  return getConstLocalBlockFromInput (val_, absPointOffset);
1415  }
1416  }
1417 
1418  template<class Scalar, class LO, class GO, class Node>
1421  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
1422  const size_t relMeshOffset) const
1423  {
1424  typedef impl_scalar_type IST;
1425 
1426  const LO* lclColInds = NULL;
1427  Scalar* lclVals = NULL;
1428  LO numEnt = 0;
1429 
1430  LO err = this->getLocalRowView (lclMeshRow, lclColInds, lclVals, numEnt);
1431  if (err != 0) {
1432  // An empty block signifies an error. We don't expect to see
1433  // this error in correct code, but it's helpful for avoiding
1434  // memory corruption in case there is a bug.
1435  return const_little_block_type (NULL, 0, 0, 0);
1436  }
1437  else {
1438  const size_t relPointOffset = relMeshOffset * this->offsetPerBlock ();
1439  IST* lclValsImpl = reinterpret_cast<IST*> (lclVals);
1440  return this->getConstLocalBlockFromInput (const_cast<const IST*> (lclValsImpl),
1441  relPointOffset);
1442  }
1443  }
1444 
1445  template<class Scalar, class LO, class GO, class Node>
1448  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1449  {
1450  if (absBlockOffset >= ptr_[rowMeshMap_.getNodeNumElements ()]) {
1451  // An empty block signifies an error. We don't expect to see
1452  // this error in correct code, but it's helpful for avoiding
1453  // memory corruption in case there is a bug.
1454  return little_block_type (NULL, 0, 0, 0);
1455  } else {
1456  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
1457  return getNonConstLocalBlockFromInput (const_cast<impl_scalar_type*> (val_),
1458  absPointOffset);
1459  }
1460  }
1461 
1462  template<class Scalar, class LO, class GO, class Node>
1465  getLocalBlock (const LO localRowInd, const LO localColInd) const
1466  {
1467  const size_t absRowBlockOffset = ptr_[localRowInd];
1468  const LO relBlockOffset =
1469  this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1470 
1471  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1472  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1473  return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1474  }
1475  else {
1476  return little_block_type (NULL, 0, 0, 0);
1477  }
1478  }
1479 
1480  // template<class Scalar, class LO, class GO, class Node>
1481  // void
1482  // BlockCrsMatrix<Scalar, LO, GO, Node>::
1483  // clearLocalErrorStateAndStream ()
1484  // {
1485  // typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
1486  // * (const_cast<this_type*> (this)->localError_) = false;
1487  // *errs_ = Teuchos::null;
1488  // }
1489 
1490  template<class Scalar, class LO, class GO, class Node>
1491  bool
1494  {
1495  using std::endl;
1497  const this_type* src = dynamic_cast<const this_type* > (&source);
1498 
1499  if (src == NULL) {
1500  std::ostream& err = markLocalErrorAndGetStream ();
1501  err << "checkSizes: The source object of the Import or Export "
1502  "must be a BlockCrsMatrix with the same template parameters as the "
1503  "target object." << endl;
1504  }
1505  else {
1506  // Use a string of ifs, not if-elseifs, because we want to know
1507  // all the errors.
1508  if (src->getBlockSize () != this->getBlockSize ()) {
1509  std::ostream& err = markLocalErrorAndGetStream ();
1510  err << "checkSizes: The source and target objects of the Import or "
1511  << "Export must have the same block sizes. The source's block "
1512  << "size = " << src->getBlockSize () << " != the target's block "
1513  << "size = " << this->getBlockSize () << "." << endl;
1514  }
1515  if (! src->graph_.isFillComplete ()) {
1516  std::ostream& err = markLocalErrorAndGetStream ();
1517  err << "checkSizes: The source object of the Import or Export is "
1518  "not fill complete. Both source and target objects must be fill "
1519  "complete." << endl;
1520  }
1521  if (! this->graph_.isFillComplete ()) {
1522  std::ostream& err = markLocalErrorAndGetStream ();
1523  err << "checkSizes: The target object of the Import or Export is "
1524  "not fill complete. Both source and target objects must be fill "
1525  "complete." << endl;
1526  }
1527  if (src->graph_.getColMap ().is_null ()) {
1528  std::ostream& err = markLocalErrorAndGetStream ();
1529  err << "checkSizes: The source object of the Import or Export does "
1530  "not have a column Map. Both source and target objects must have "
1531  "column Maps." << endl;
1532  }
1533  if (this->graph_.getColMap ().is_null ()) {
1534  std::ostream& err = markLocalErrorAndGetStream ();
1535  err << "checkSizes: The target object of the Import or Export does "
1536  "not have a column Map. Both source and target objects must have "
1537  "column Maps." << endl;
1538  }
1539  }
1540 
1541  return ! (* (this->localError_));
1542  }
1543 
1544  template<class Scalar, class LO, class GO, class Node>
1545  void
1548  size_t numSameIDs,
1549  const Teuchos::ArrayView<const LO>& permuteToLIDs,
1550  const Teuchos::ArrayView<const LO>& permuteFromLIDs)
1551  {
1552  using std::endl;
1554  const bool debug = false;
1555 
1556  if (debug) {
1557  std::ostringstream os;
1558  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1559  os << "Proc " << myRank << ": copyAndPermute: "
1560  << "numSameIDs: " << numSameIDs
1561  << ", permuteToLIDs.size(): " << permuteToLIDs.size ()
1562  << ", permuteFromLIDs.size(): " << permuteFromLIDs.size ()
1563  << endl;
1564  std::cerr << os.str ();
1565  }
1566 
1567  // There's no communication in this method, so it's safe just to
1568  // return on error.
1569 
1570  if (* (this->localError_)) {
1571  std::ostream& err = this->markLocalErrorAndGetStream ();
1572  err << "copyAndPermute: The target object of the Import or Export is "
1573  "already in an error state." << endl;
1574  return;
1575  }
1576  if (permuteToLIDs.size () != permuteFromLIDs.size ()) {
1577  std::ostream& err = this->markLocalErrorAndGetStream ();
1578  err << "copyAndPermute: permuteToLIDs.size() = " << permuteToLIDs.size ()
1579  << " != permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."
1580  << endl;
1581  return;
1582  }
1583 
1584  const this_type* src = dynamic_cast<const this_type* > (&source);
1585  if (src == NULL) {
1586  std::ostream& err = this->markLocalErrorAndGetStream ();
1587  err << "copyAndPermute: The source object of the Import or Export is "
1588  "either not a BlockCrsMatrix, or does not have the right template "
1589  "parameters. checkSizes() should have caught this. "
1590  "Please report this bug to the Tpetra developers." << endl;
1591  return;
1592  }
1593  if (* (src->localError_)) {
1594  std::ostream& err = this->markLocalErrorAndGetStream ();
1595  err << "copyAndPermute: The source object of the Import or Export is "
1596  "already in an error state." << endl;
1597  return;
1598  }
1599 
1600  bool lclErr = false;
1601 #ifdef HAVE_TPETRA_DEBUG
1602  std::set<LO> invalidSrcCopyRows;
1603  std::set<LO> invalidDstCopyRows;
1604  std::set<LO> invalidDstCopyCols;
1605  std::set<LO> invalidDstPermuteCols;
1606  std::set<LO> invalidPermuteFromRows;
1607 #endif // HAVE_TPETRA_DEBUG
1608 
1609  // Copy the initial sequence of rows that are the same.
1610  //
1611  // The two graphs might have different column Maps, so we need to
1612  // do this using global column indices. This is purely local, so
1613  // we only need to check for local sameness of the two column
1614  // Maps.
1615 
1616 #ifdef HAVE_TPETRA_DEBUG
1617  const map_type& srcRowMap = * (src->graph_.getRowMap ());
1618 #endif // HAVE_TPETRA_DEBUG
1619  const map_type& dstRowMap = * (this->graph_.getRowMap ());
1620  const map_type& srcColMap = * (src->graph_.getColMap ());
1621  const map_type& dstColMap = * (this->graph_.getColMap ());
1622  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1623  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.size ());
1624 
1625  if (debug) {
1626  std::ostringstream os;
1627  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1628  os << "Proc " << myRank << ": copyAndPermute: "
1629  << "canUseLocalColumnIndices: "
1630  << (canUseLocalColumnIndices ? "true" : "false")
1631  << ", numPermute: " << numPermute
1632  << endl;
1633  std::cerr << os.str ();
1634  }
1635 
1636  if (canUseLocalColumnIndices) {
1637  // Copy local rows that are the "same" in both source and target.
1638  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1639 #ifdef HAVE_TPETRA_DEBUG
1640  if (! srcRowMap.isNodeLocalElement (localRow)) {
1641  lclErr = true;
1642  invalidSrcCopyRows.insert (localRow);
1643  continue; // skip invalid rows
1644  }
1645 #endif // HAVE_TPETRA_DEBUG
1646 
1647  const LO* lclSrcCols;
1648  Scalar* vals;
1649  LO numEntries;
1650  // If this call fails, that means the mesh row local index is
1651  // invalid. That means the Import or Export is invalid somehow.
1652  LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
1653  if (err != 0) {
1654  lclErr = true;
1655 #ifdef HAVE_TPETRA_DEBUG
1656  (void) invalidSrcCopyRows.insert (localRow);
1657 #endif // HAVE_TPETRA_DEBUG
1658  }
1659  else {
1660  err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
1661  if (err != numEntries) {
1662  lclErr = true;
1663  if (! dstRowMap.isNodeLocalElement (localRow)) {
1664 #ifdef HAVE_TPETRA_DEBUG
1665  invalidDstCopyRows.insert (localRow);
1666 #endif // HAVE_TPETRA_DEBUG
1667  }
1668  else {
1669  // Once there's an error, there's no sense in saving
1670  // time, so we check whether the column indices were
1671  // invalid. However, only remember which ones were
1672  // invalid in a debug build, because that might take a
1673  // lot of space.
1674  for (LO k = 0; k < numEntries; ++k) {
1675  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1676  lclErr = true;
1677 #ifdef HAVE_TPETRA_DEBUG
1678  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1679 #endif // HAVE_TPETRA_DEBUG
1680  }
1681  }
1682  }
1683  }
1684  }
1685  } // for each "same" local row
1686 
1687  // Copy the "permute" local rows.
1688  for (size_t k = 0; k < numPermute; ++k) {
1689  const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
1690  const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);
1691 
1692  const LO* lclSrcCols;
1693  Scalar* vals;
1694  LO numEntries;
1695  LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
1696  if (err != 0) {
1697  lclErr = true;
1698 #ifdef HAVE_TPETRA_DEBUG
1699  invalidPermuteFromRows.insert (srcLclRow);
1700 #endif // HAVE_TPETRA_DEBUG
1701  }
1702  else {
1703  err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
1704  if (err != numEntries) {
1705  lclErr = true;
1706 #ifdef HAVE_TPETRA_DEBUG
1707  for (LO c = 0; c < numEntries; ++c) {
1708  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1709  invalidDstPermuteCols.insert (lclSrcCols[c]);
1710  }
1711  }
1712 #endif // HAVE_TPETRA_DEBUG
1713  }
1714  }
1715  }
1716  }
1717  else { // must convert column indices to global
1718  // Reserve space to store the destination matrix's local column indices.
1719  const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
1720  Teuchos::Array<LO> lclDstCols (maxNumEnt);
1721 
1722  // Copy local rows that are the "same" in both source and target.
1723  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1724  const LO* lclSrcCols;
1725  Scalar* vals;
1726  LO numEntries;
1727  // If this call fails, that means the mesh row local index is
1728  // invalid. That means the Import or Export is invalid somehow.
1729  LO err = 0;
1730  try {
1731  err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
1732  } catch (std::exception& e) {
1733  if (debug) {
1734  std::ostringstream os;
1735  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1736  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1737  << localRow << ", src->getLocalRowView() threw an exception: "
1738  << e.what ();
1739  std::cerr << os.str ();
1740  }
1741  throw e;
1742  }
1743 
1744  if (err != 0) {
1745  lclErr = true;
1746 #ifdef HAVE_TPETRA_DEBUG
1747  invalidSrcCopyRows.insert (localRow);
1748 #endif // HAVE_TPETRA_DEBUG
1749  }
1750  else {
1751  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1752  lclErr = true;
1753  if (debug) {
1754  std::ostringstream os;
1755  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1756  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1757  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1758  << maxNumEnt << endl;
1759  std::cerr << os.str ();
1760  }
1761  }
1762  else {
1763  // Convert the source matrix's local column indices to the
1764  // destination matrix's local column indices.
1765  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1766  for (LO j = 0; j < numEntries; ++j) {
1767  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1768  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1769  lclErr = true;
1770 #ifdef HAVE_TPETRA_DEBUG
1771  invalidDstCopyCols.insert (lclDstColsView[j]);
1772 #endif // HAVE_TPETRA_DEBUG
1773  }
1774  }
1775  try {
1776  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
1777  } catch (std::exception& e) {
1778  if (debug) {
1779  std::ostringstream os;
1780  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1781  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1782  << localRow << ", this->replaceLocalValues() threw an exception: "
1783  << e.what ();
1784  std::cerr << os.str ();
1785  }
1786  throw e;
1787  }
1788  if (err != numEntries) {
1789  lclErr = true;
1790  if (debug) {
1791  std::ostringstream os;
1792  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1793  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
1794  "localRow " << localRow << ", this->replaceLocalValues "
1795  "returned " << err << " instead of numEntries = "
1796  << numEntries << endl;
1797  std::cerr << os.str ();
1798  }
1799  }
1800  }
1801  }
1802  }
1803 
1804  // Copy the "permute" local rows.
1805  for (size_t k = 0; k < numPermute; ++k) {
1806  const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
1807  const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);
1808 
1809  const LO* lclSrcCols;
1810  Scalar* vals;
1811  LO numEntries;
1812  LO err = 0;
1813  try {
1814  err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
1815  } catch (std::exception& e) {
1816  if (debug) {
1817  std::ostringstream os;
1818  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1819  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
1820  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
1821  << ", src->getLocalRowView() threw an exception: " << e.what ();
1822  std::cerr << os.str ();
1823  }
1824  throw e;
1825  }
1826 
1827  if (err != 0) {
1828  lclErr = true;
1829 #ifdef HAVE_TPETRA_DEBUG
1830  invalidPermuteFromRows.insert (srcLclRow);
1831 #endif // HAVE_TPETRA_DEBUG
1832  }
1833  else {
1834  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1835  lclErr = true;
1836  }
1837  else {
1838  // Convert the source matrix's local column indices to the
1839  // destination matrix's local column indices.
1840  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1841  for (LO j = 0; j < numEntries; ++j) {
1842  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1843  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1844  lclErr = true;
1845 #ifdef HAVE_TPETRA_DEBUG
1846  invalidDstPermuteCols.insert (lclDstColsView[j]);
1847 #endif // HAVE_TPETRA_DEBUG
1848  }
1849  }
1850  err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
1851  if (err != numEntries) {
1852  lclErr = true;
1853  }
1854  }
1855  }
1856  }
1857  }
1858 
1859  if (lclErr) {
1860  std::ostream& err = this->markLocalErrorAndGetStream ();
1861 #ifdef HAVE_TPETRA_DEBUG
1862  err << "copyAndPermute: The graph structure of the source of the "
1863  "Import or Export must be a subset of the graph structure of the "
1864  "target. ";
1865  err << "invalidSrcCopyRows = [";
1866  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
1867  it != invalidSrcCopyRows.end (); ++it) {
1868  err << *it;
1869  typename std::set<LO>::const_iterator itp1 = it;
1870  itp1++;
1871  if (itp1 != invalidSrcCopyRows.end ()) {
1872  err << ",";
1873  }
1874  }
1875  err << "], invalidDstCopyRows = [";
1876  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
1877  it != invalidDstCopyRows.end (); ++it) {
1878  err << *it;
1879  typename std::set<LO>::const_iterator itp1 = it;
1880  itp1++;
1881  if (itp1 != invalidDstCopyRows.end ()) {
1882  err << ",";
1883  }
1884  }
1885  err << "], invalidDstCopyCols = [";
1886  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
1887  it != invalidDstCopyCols.end (); ++it) {
1888  err << *it;
1889  typename std::set<LO>::const_iterator itp1 = it;
1890  itp1++;
1891  if (itp1 != invalidDstCopyCols.end ()) {
1892  err << ",";
1893  }
1894  }
1895  err << "], invalidDstPermuteCols = [";
1896  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
1897  it != invalidDstPermuteCols.end (); ++it) {
1898  err << *it;
1899  typename std::set<LO>::const_iterator itp1 = it;
1900  itp1++;
1901  if (itp1 != invalidDstPermuteCols.end ()) {
1902  err << ",";
1903  }
1904  }
1905  err << "], invalidPermuteFromRows = [";
1906  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
1907  it != invalidPermuteFromRows.end (); ++it) {
1908  err << *it;
1909  typename std::set<LO>::const_iterator itp1 = it;
1910  itp1++;
1911  if (itp1 != invalidPermuteFromRows.end ()) {
1912  err << ",";
1913  }
1914  }
1915  err << "]" << std::endl;
1916 #else
1917  err << "copyAndPermute: The graph structure of the source of the "
1918  "Import or Export must be a subset of the graph structure of the "
1919  "target." << std::endl;
1920 #endif // HAVE_TPETRA_DEBUG
1921  }
1922 
1923  if (debug) {
1924  std::ostringstream os;
1925  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1926  const bool lclSuccess = ! (* (this->localError_));
1927  os << "*** Proc " << myRank << ": copyAndPermute "
1928  << (lclSuccess ? "succeeded" : "FAILED");
1929  if (lclSuccess) {
1930  os << endl;
1931  } else {
1932  os << ": error messages: " << this->errorMessages (); // comes w/ endl
1933  }
1934  std::cerr << os.str ();
1935  }
1936  }
1937 
1938  namespace { // (anonymous)
1939 
1958  template<class LO, class GO, class D>
1959  size_t
1960  packRowCount (const size_t numEnt,
1961  const size_t numBytesPerValue,
1962  const size_t blkSize)
1963  {
1965 
1966  if (numEnt == 0) {
1967  // Empty rows always take zero bytes, to ensure sparsity.
1968  return 0;
1969  }
1970  else {
1971  // We store the number of entries as a local index (LO).
1972  LO numEntLO = 0; // packValueCount wants this.
1973  GO gid;
1974  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
1975  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
1976  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
1977  return numEntLen + gidsLen + valsLen;
1978  }
1979  }
1980 
1991  template<class ST, class LO, class GO, class D>
1992  size_t
1993  unpackRowCount (const typename Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
1994  const size_t offset,
1995  const size_t numBytes,
1996  const size_t numBytesPerValue)
1997  {
1998  using Kokkos::subview;
2000  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
2001  typedef typename input_buffer_type::size_type size_type;
2002 
2003  if (numBytes == 0) {
2004  // Empty rows always take zero bytes, to ensure sparsity.
2005  return static_cast<size_t> (0);
2006  }
2007  else {
2008  LO numEntLO = 0;
2009  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
2010 #ifdef HAVE_TPETRA_DEBUG
2011  TEUCHOS_TEST_FOR_EXCEPTION(
2012  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2013  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2014  << ".");
2015 #endif // HAVE_TPETRA_DEBUG
2016  const std::pair<size_type, size_type> rng (offset, offset + theNumBytes);
2017  input_buffer_type inBuf = subview (imports, rng); // imports (offset, theNumBytes);
2018  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
2019 #ifdef HAVE_TPETRA_DEBUG
2020  TEUCHOS_TEST_FOR_EXCEPTION(
2021  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2022  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2023  << ".");
2024 #else
2025  (void) actualNumBytes;
2026 #endif // HAVE_TPETRA_DEBUG
2027  return static_cast<size_t> (numEntLO);
2028  }
2029  }
2030 
2038  template<class ST, class LO, class GO, class D>
2039  size_t
2040  packRowForBlockCrs (const typename Tpetra::Details::PackTraits<LO, D>::output_buffer_type& exports,
2041  const size_t offset,
2042  const size_t numEnt,
2045  const size_t numBytesPerValue,
2046  const size_t blockSize)
2047  {
2048  using Kokkos::subview;
2050  // NOTE (mfh 02 Feb 2015) This assumes that output_buffer_type is
2051  // the same, no matter what type we're packing. It's a reasonable
2052  // assumption, given that we go through the trouble of PackTraits
2053  // just so that we can pack data of different types in the same
2054  // buffer.
2055  typedef typename PackTraits<LO, D>::output_buffer_type output_buffer_type;
2056  typedef typename output_buffer_type::size_type size_type;
2057  typedef typename std::pair<size_type, size_type> pair_type;
2058 
2059  if (numEnt == 0) {
2060  // Empty rows always take zero bytes, to ensure sparsity.
2061  return 0;
2062  }
2063  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2064 
2065  const GO gid = 0; // packValueCount wants this
2066  const LO numEntLO = static_cast<size_t> (numEnt);
2067 
2068  const size_t numEntBeg = offset;
2069  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2070  const size_t gidsBeg = numEntBeg + numEntLen;
2071  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2072  const size_t valsBeg = gidsBeg + gidsLen;
2073  const size_t valsLen = numScalarEnt * numBytesPerValue;
2074 
2075  output_buffer_type numEntOut =
2076  subview (exports, pair_type (numEntBeg, numEntBeg + numEntLen));
2077  output_buffer_type gidsOut =
2078  subview (exports, pair_type (gidsBeg, gidsBeg + gidsLen));
2079  output_buffer_type valsOut =
2080  subview (exports, pair_type (valsBeg, valsBeg + valsLen));
2081 
2082  size_t numBytesOut = 0;
2083  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
2084  numBytesOut += PackTraits<GO, D>::packArray (gidsOut, gidsIn, numEnt);
2085  numBytesOut += PackTraits<ST, D>::packArray (valsOut, valsIn, numScalarEnt);
2086 
2087  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2088  TEUCHOS_TEST_FOR_EXCEPTION(
2089  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
2090  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2091  << expectedNumBytes << ".");
2092  return numBytesOut;
2093  }
2094 
2095  // Return the number of bytes actually read / used.
2096  template<class ST, class LO, class GO, class D>
2097  size_t
2098  unpackRowForBlockCrs (const typename Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
2101  const size_t offset,
2102  const size_t numBytes,
2103  const size_t numEnt,
2104  const size_t numBytesPerValue,
2105  const size_t blockSize)
2106  {
2107  using Kokkos::subview;
2109  // NOTE (mfh 02 Feb 2015) This assumes that input_buffer_type is
2110  // the same, no matter what type we're unpacking. It's a
2111  // reasonable assumption, given that we go through the trouble of
2112  // PackTraits just so that we can pack data of different types in
2113  // the same buffer.
2114  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
2115  typedef typename input_buffer_type::size_type size_type;
2116  typedef typename std::pair<size_type, size_type> pair_type;
2117 
2118  if (numBytes == 0) {
2119  // Rows with zero bytes always have zero entries.
2120  return 0;
2121  }
2122  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2123  TEUCHOS_TEST_FOR_EXCEPTION(
2124  static_cast<size_t> (imports.dimension_0 ()) <= offset,
2125  std::logic_error, "unpackRow: imports.dimension_0() = "
2126  << imports.dimension_0 () << " <= offset = " << offset << ".");
2127  TEUCHOS_TEST_FOR_EXCEPTION(
2128  static_cast<size_t> (imports.dimension_0 ()) < offset + numBytes,
2129  std::logic_error, "unpackRow: imports.dimension_0() = "
2130  << imports.dimension_0 () << " < offset + numBytes = "
2131  << (offset + numBytes) << ".");
2132 
2133  const GO gid = 0; // packValueCount wants this
2134  const LO lid = 0; // packValueCount wants this
2135 
2136  const size_t numEntBeg = offset;
2137  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
2138  const size_t gidsBeg = numEntBeg + numEntLen;
2139  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2140  const size_t valsBeg = gidsBeg + gidsLen;
2141  const size_t valsLen = numScalarEnt * numBytesPerValue;
2142 
2143  input_buffer_type numEntIn =
2144  subview (imports, pair_type (numEntBeg, numEntBeg + numEntLen));
2145  input_buffer_type gidsIn =
2146  subview (imports, pair_type (gidsBeg, gidsBeg + gidsLen));
2147  input_buffer_type valsIn =
2148  subview (imports, pair_type (valsBeg, valsBeg + valsLen));
2149 
2150  size_t numBytesOut = 0;
2151  LO numEntOut;
2152  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
2153  TEUCHOS_TEST_FOR_EXCEPTION(
2154  static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2155  "unpackRow: Expected number of entries " << numEnt
2156  << " != actual number of entries " << numEntOut << ".");
2157 
2158  numBytesOut += PackTraits<GO, D>::unpackArray (gidsOut, gidsIn, numEnt);
2159  numBytesOut += PackTraits<ST, D>::unpackArray (valsOut, valsIn, numScalarEnt);
2160 
2161  TEUCHOS_TEST_FOR_EXCEPTION(
2162  numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
2163  << numBytesOut << " != numBytes = " << numBytes << ".");
2164  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2165  TEUCHOS_TEST_FOR_EXCEPTION(
2166  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
2167  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2168  << expectedNumBytes << ".");
2169  return numBytesOut;
2170  }
2171  } // namespace (anonymous)
2172 
2173  template<class Scalar, class LO, class GO, class Node>
2174  void
2176  packAndPrepare (const Tpetra::SrcDistObject& source,
2177  const Teuchos::ArrayView<const LO>& exportLIDs,
2178  Teuchos::Array<packet_type>& exports,
2179  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
2180  size_t& constantNumPackets,
2181  Tpetra::Distributor& /* distor */)
2182  {
2183  using std::endl;
2185  using Kokkos::MemoryUnmanaged;
2186  using Kokkos::subview;
2187  using Kokkos::View;
2189  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
2191  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
2192  const bool debug = false;
2193 
2194  if (debug) {
2195  std::ostringstream os;
2196  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2197  os << "Proc " << myRank << ": packAndPrepare: exportLIDs.size() = "
2198  << exportLIDs.size () << ", numPacketsPerLID.size() = "
2199  << numPacketsPerLID.size () << endl;
2200  std::cerr << os.str ();
2201  }
2202 
2203  if (* (this->localError_)) {
2204  std::ostream& err = this->markLocalErrorAndGetStream ();
2205  err << "packAndPrepare: The target object of the Import or Export is "
2206  "already in an error state." << endl;
2207  return;
2208  }
2209 
2210  const this_type* src = dynamic_cast<const this_type* > (&source);
2211  // Should have checked for these cases in checkSizes().
2212  if (src == NULL) {
2213  std::ostream& err = this->markLocalErrorAndGetStream ();
2214  err << "packAndPrepare: The source (input) object of the Import or "
2215  "Export is either not a BlockCrsMatrix, or does not have the right "
2216  "template parameters. checkSizes() should have caught this. "
2217  "Please report this bug to the Tpetra developers." << endl;
2218  return;
2219  }
2220 
2221  const crs_graph_type& srcGraph = src->graph_;
2222  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2223  const size_type numExportLIDs = exportLIDs.size ();
2224 
2225  if (numExportLIDs != numPacketsPerLID.size ()) {
2226  std::ostream& err = this->markLocalErrorAndGetStream ();
2227  err << "packAndPrepare: exportLIDs.size() = " << numExportLIDs
2228  << " != numPacketsPerLID.size() = " << numPacketsPerLID.size ()
2229  << "." << endl;
2230  return;
2231  }
2232 
2233  // Graphs and matrices are allowed to have a variable number of
2234  // entries per row. We could test whether all rows have the same
2235  // number of entries, but DistObject can only use this
2236  // optimization if all rows on _all_ processes have the same
2237  // number of entries. Rather than do the all-reduce necessary to
2238  // test for this unlikely case, we tell DistObject (by setting
2239  // constantNumPackets to zero) to assume that different rows may
2240  // have different numbers of entries.
2241  constantNumPackets = 0;
2242 
2243  // Compute the number of bytes ("packets") per row to pack. While
2244  // we're at it, compute the total # of block entries to send, and
2245  // the max # of block entries in any of the rows we're sending.
2246  size_t totalNumBytes = 0;
2247  size_t totalNumEntries = 0;
2248  size_t maxRowLength = 0;
2249  for (size_type i = 0; i < numExportLIDs; ++i) {
2250  const LO lclRow = exportLIDs[i];
2251  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2252  // If any given LIDs are invalid, the above might return either
2253  // zero or the invalid size_t value. If the former, we have no
2254  // way to tell, but that's OK; it just means the calling process
2255  // won't pack anything (it has nothing to pack anyway). If the
2256  // latter, we replace it with zero (that row is not owned by the
2257  // calling process, so it has no entries to pack).
2258  if (numEnt == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2259  numEnt = 0;
2260  }
2261  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2262 
2263  // The 'if' branch implicitly assumes that packRowCount() returns
2264  // zero if numEnt == 0.
2265  size_t numBytesPerValue = 0;
2266  if (numEnt > 0) {
2267  // Get a locally indexed view of the current row's data. If
2268  // the current row has > 0 entries, we need an entry in order
2269  // to figure out the byte count of the packed row. (We really
2270  // only need it if ST's size is determined at run time.)
2271  Scalar* valsRaw = NULL;
2272  const LO* lidsRaw = NULL;
2273  LO actualNumEnt = 0;
2274  const LO errCode =
2275  src->getLocalRowView (lclRow, lidsRaw, valsRaw, actualNumEnt);
2276 
2277  if (numEnt != static_cast<size_t> (actualNumEnt)) {
2278  std::ostream& err = this->markLocalErrorAndGetStream ();
2279  err << "packAndPrepare: Local row " << i << " claims to have " <<
2280  numEnt << "entry/ies, but the View returned by getLocalRowView() "
2281  "has " << actualNumEnt << " entry/ies. This should never happen. "
2282  "Please report this bug to the Tpetra developers." << endl;
2283  return;
2284  }
2285  if (errCode == Teuchos::OrdinalTraits<LO>::invalid ()) {
2286  std::ostream& err = this->markLocalErrorAndGetStream ();
2287  err << "packAndPrepare: Local row " << i << " is not in the row Map "
2288  "of the source object on the calling process." << endl;
2289  return;
2290  }
2291 
2292  const ST* valsRawST = const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2293  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2294 
2295  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2296  // space here for now, this doesn't assume UVM. That may change
2297  // in the future, if we ever start packing on the device.
2298  numBytesPerValue = PackTraits<ST, HES>::packValueCount (vals(0));
2299  }
2300 
2301  const size_t numBytes =
2302  packRowCount<LO, GO, HES> (numEnt, numBytesPerValue, blockSize);
2303  numPacketsPerLID[i] = numBytes;
2304  totalNumBytes += numBytes;
2305  totalNumEntries += numEnt;
2306  maxRowLength = std::max (maxRowLength, numEnt);
2307  }
2308 
2309  if (debug) {
2310  const int myRank = graph_.getComm ()->getRank ();
2311  std::ostringstream os;
2312  os << "Proc " << myRank << ": packAndPrepare: totalNumBytes = "
2313  << totalNumBytes << endl;
2314  std::cerr << os.str ();
2315  }
2316 
2317  // We use a "struct of arrays" approach to packing each row's
2318  // entries. All the column indices (as global indices) go first,
2319  // then all their owning process ranks, and then the values.
2320  exports.resize (totalNumBytes);
2321  if (totalNumEntries > 0) {
2322  View<char*, HES, MemoryUnmanaged> exportsK (exports.getRawPtr (),
2323  totalNumBytes);
2324 
2325  // Current position (in bytes) in the 'exports' output array.
2326  size_t offset = 0;
2327 
2328  // For each block row of the matrix owned by the calling
2329  // process, pack that block row's column indices and values into
2330  // the exports array.
2331 
2332  // Source matrix's column Map. We verified in checkSizes() that
2333  // the column Map exists (is not null).
2334  const map_type& srcColMap = * (srcGraph.getColMap ());
2335 
2336  // Temporary buffer for global column indices.
2337  View<GO*, HES> gblColInds;
2338  {
2339  GO gid = 0;
2340  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowLength, "gids");
2341  }
2342 
2343  // Pack the data for each row to send, into the 'exports' buffer.
2344  for (size_type i = 0; i < numExportLIDs; ++i) {
2345  const LO lclRowInd = exportLIDs[i];
2346  const LO* lclColIndsRaw;
2347  Scalar* valsRaw;
2348  LO numEntLO;
2349  // It's OK to ignore the return value, since if the calling
2350  // process doesn't own that local row, then the number of
2351  // entries in that row on the calling process is zero.
2352  (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);
2353  const size_t numEnt = static_cast<size_t> (numEntLO);
2354  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2355  View<const LO*, HES, MemoryUnmanaged> lclColInds (lclColIndsRaw, numEnt);
2356  const ST* valsRawST = const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2357  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2358 
2359  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2360  // space here for now, this doesn't assume UVM. That may
2361  // change in the future, if we ever start packing on device.
2362  const size_t numBytesPerValue = numEnt == 0 ?
2363  static_cast<size_t> (0) :
2364  PackTraits<ST, HES>::packValueCount (vals(0));
2365 
2366  // Convert column indices from local to global.
2367  for (size_t j = 0; j < numEnt; ++j) {
2368  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2369  }
2370 
2371  // Copy the row's data into the current spot in the exports array.
2372  const size_t numBytes =
2373  packRowForBlockCrs<ST, LO, GO, HES> (exportsK, offset, numEnt, gblColInds,
2374  vals, numBytesPerValue, blockSize);
2375  // Keep track of how many bytes we packed.
2376  offset += numBytes;
2377  } // for each LID (of a row) to send
2378 
2379  if (offset != totalNumBytes) {
2380  std::ostream& err = this->markLocalErrorAndGetStream ();
2381  err << "packAndPreapre: At end of method, the final offset (in bytes) "
2382  << offset << " does not equal the total number of bytes packed "
2383  << totalNumBytes << ". "
2384  << "Please report this bug to the Tpetra developers." << endl;
2385  return;
2386  }
2387  } // if totalNumEntries > 0
2388 
2389  if (debug) {
2390  std::ostringstream os;
2391  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2392  const bool lclSuccess = ! (* (this->localError_));
2393  os << "*** Proc " << myRank << ": packAndPrepare "
2394  << (lclSuccess ? "succeeded" : "FAILED")
2395  << " (totalNumEntries = " << totalNumEntries << ") ***" << endl;
2396  std::cerr << os.str ();
2397  }
2398  }
2399 
2400 
2401  template<class Scalar, class LO, class GO, class Node>
2402  void
2404  unpackAndCombine (const Teuchos::ArrayView<const LO>& importLIDs,
2405  const Teuchos::ArrayView<const packet_type>& imports,
2406  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
2407  size_t /* constantNumPackets */, // not worthwhile to use this
2408  Tpetra::Distributor& /* distor */,
2410  {
2411  using std::endl;
2413  using Kokkos::MemoryUnmanaged;
2414  using Kokkos::subview;
2415  using Kokkos::View;
2417  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
2418  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
2419  typedef std::pair<typename View<int*, HES>::size_type,
2420  typename View<int*, HES>::size_type> pair_type;
2421  typedef View<GO*, HES, MemoryUnmanaged> gids_out_type;
2422  typedef View<LO*, HES, MemoryUnmanaged> lids_out_type;
2423  typedef View<ST*, HES, MemoryUnmanaged> vals_out_type;
2424  typedef typename PackTraits<GO, HES>::input_buffer_type input_buffer_type;
2425  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::unpackAndCombine: ";
2426  const bool debug = false;
2427 
2428  if (debug) {
2429  std::ostringstream os;
2430  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2431  os << "Proc " << myRank << ": unpackAndCombine" << endl;
2432  std::cerr << os.str ();
2433  }
2434 
2435  // It should not cause deadlock to return on error in this method,
2436  // since this method does not communicate.
2437 
2438  if (* (this->localError_)) {
2439  std::ostream& err = this->markLocalErrorAndGetStream ();
2440  err << prefix << "The target object of the Import or Export is "
2441  "already in an error state." << endl;
2442  return;
2443  }
2444  if (importLIDs.size () != numPacketsPerLID.size ()) {
2445  std::ostream& err = this->markLocalErrorAndGetStream ();
2446  err << prefix << "importLIDs.size() = " << importLIDs.size () << " != "
2447  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << "." << endl;
2448  return;
2449  }
2450  if (CM != ADD && CM != INSERT && CM != REPLACE && CM != ABSMAX && CM != ZERO) {
2451  std::ostream& err = this->markLocalErrorAndGetStream ();
2452  err << prefix << "Invalid CombineMode value " << CM << ". Valid "
2453  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2454  << endl;
2455  return;
2456  }
2457 
2458  // Target matrix's column Map. Use to convert the global column
2459  // indices in the receive buffer to local indices. We verified in
2460  // checkSizes() that the column Map exists (is not null).
2461  const map_type& tgtColMap = * (this->graph_.getColMap ());
2462 
2463  const size_type numImportLIDs = importLIDs.size ();
2464  if (CM == ZERO || numImportLIDs == 0) {
2465  if (debug) {
2466  std::ostringstream os;
2467  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2468  os << "Proc " << myRank << ": unpackAndCombine: Nothing to do" << endl;
2469  std::cerr << os.str ();
2470  }
2471  return; // nothing to do; no need to combine entries
2472  }
2473 
2474  if (debug) {
2475  std::ostringstream os;
2476  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2477  os << "Proc " << myRank << ": unpackAndCombine: Getting ready" << endl;
2478  std::cerr << os.str ();
2479  }
2480 
2481  input_buffer_type importsK (imports.getRawPtr (), imports.size ());
2482  const size_t blockSize = this->getBlockSize ();
2483  const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
2484  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2485 
2486  size_t numBytesPerValue;
2487  {
2488  // FIXME (mfh 17 Feb 2015) What do I do about Scalar types with
2489  // run-time size? We already assume that all entries in both
2490  // the source and target matrices have the same size. If the
2491  // calling process owns at least one entry in either matrix, we
2492  // can use that entry to set the size. However, it is possible
2493  // that the calling process owns no entries. In that case,
2494  // we're in trouble. One way to fix this would be for each
2495  // row's data to contain the run-time size. This is only
2496  // necessary if the size is not a compile-time constant.
2497  Scalar val;
2498  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
2499  }
2500 
2501  // Temporary space to cache incoming global column indices and
2502  // values. Column indices come in as global indices, in case the
2503  // source object's column Map differs from the target object's
2504  // (this's) column Map.
2505  View<GO*, HES> gblColInds;
2506  View<LO*, HES> lclColInds;
2507  View<ST*, HES> vals;
2508  {
2509  GO gid = 0;
2510  LO lid = 0;
2511  // FIXME (mfh 17 Feb 2015) What do I do about Scalar types with
2512  // run-time size? We already assume that all entries in both
2513  // the source and target matrices have the same size. If the
2514  // calling process owns at least one entry in either matrix, we
2515  // can use that entry to set the size. However, it is possible
2516  // that the calling process owns no entries. In that case,
2517  // we're in trouble. One way to fix this would be for each
2518  // row's data to contain the run-time size. This is only
2519  // necessary if the size is not a compile-time constant.
2520  Scalar val;
2521  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowNumEnt, "gids");
2522  lclColInds = PackTraits<LO, HES>::allocateArray (lid, maxRowNumEnt, "lids");
2523  vals = PackTraits<ST, HES>::allocateArray (val, maxRowNumScalarEnt, "vals");
2524  }
2525 
2526  size_t offset = 0;
2527  for (size_type i = 0; i < numImportLIDs; ++i) {
2528  const size_t numBytes = numPacketsPerLID[i];
2529  if (numBytes == 0) {
2530  // Empty buffer for that row means that the row is empty.
2531  continue;
2532  }
2533  const size_t numEnt =
2534  unpackRowCount<ST, LO, GO, HES> (importsK, offset, numBytes, numBytesPerValue);
2535  if (numEnt > maxRowNumEnt) {
2536  std::ostream& err = this->markLocalErrorAndGetStream ();
2537  err << prefix << "At i = " << i << ", numEnt = " << numEnt
2538  << " > maxRowNumEnt = " << maxRowNumEnt << endl;
2539  continue;
2540  }
2541 
2542  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2543  const LO lclRow = importLIDs[i];
2544 
2545  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
2546  vals_out_type valsOut = subview (vals, pair_type (0, numScalarEnt));
2547 
2548  const size_t numBytesOut =
2549  unpackRowForBlockCrs<ST, LO, GO, HES> (gidsOut, valsOut, importsK, offset, numBytes,
2550  numEnt, numBytesPerValue, blockSize);
2551  if (numBytes != numBytesOut) {
2552  std::ostream& err = this->markLocalErrorAndGetStream ();
2553  err << prefix << "At i = " << i << ", numBytes = " << numBytes
2554  << " != numBytesOut = " << numBytesOut << ".";
2555  continue;
2556  }
2557 
2558  // Convert incoming global indices to local indices.
2559  lids_out_type lidsOut = subview (lclColInds, pair_type (0, numEnt));
2560  for (size_t k = 0; k < numEnt; ++k) {
2561  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2562 #ifdef HAVE_TPETRA_DEBUG
2563  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2564  std::ostream& err = this->markLocalErrorAndGetStream ();
2565  err << prefix << "At i = " << i << ", GID " << gidsOut(k)
2566  << " is not owned by the calling process.";
2567  continue;
2568  }
2569 #endif // HAVE_TPETRA_DEBUG
2570  }
2571 
2572  // Combine the incoming data with the matrix's current data.
2573  LO numCombd = 0;
2574  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.ptr_on_device ());
2575  const Scalar* const valsRaw =
2576  reinterpret_cast<const Scalar*> (const_cast<const ST*> (valsOut.ptr_on_device ()));
2577  if (CM == ADD) {
2578  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2579  } else if (CM == INSERT || CM == REPLACE) {
2580  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2581  } else if (CM == ABSMAX) {
2582  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2583  }
2584 #ifdef HAVE_TPETRA_DEBUG
2585  if (static_cast<LO> (numEnt) != numCombd) {
2586  std::ostream& err = this->markLocalErrorAndGetStream ();
2587  err << prefix << "At i = " << i << ", numEnt = " << numEnt
2588  << " != numCombd = " << numCombd << ".";
2589  continue;
2590  }
2591 #else
2592  (void) numCombd; // ignore, just for now
2593 #endif // HAVE_TPETRA_DEBUG
2594 
2595  // Don't update offset until current LID has succeeded.
2596  offset += numBytes;
2597  } // for each import LID i
2598 
2599  if (debug) {
2600  std::ostringstream os;
2601  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2602  const bool lclSuccess = ! (* (this->localError_));
2603  os << "*** Proc " << myRank << ": unpackAndCombine "
2604  << (lclSuccess ? "succeeded" : "FAILED")
2605  << " ***" << endl;
2606  std::cerr << os.str ();
2607  }
2608  }
2609 
2610 
2611  template<class Scalar, class LO, class GO, class Node>
2612  std::string
2614  {
2615  using Teuchos::TypeNameTraits;
2616  std::ostringstream os;
2617  os << "\"Tpetra::BlockCrsMatrix\": { "
2618  << "Template parameters: { "
2619  << "Scalar: " << TypeNameTraits<Scalar>::name ()
2620  << "LO: " << TypeNameTraits<LO>::name ()
2621  << "GO: " << TypeNameTraits<GO>::name ()
2622  << "Node: " << TypeNameTraits<Node>::name ()
2623  << " }"
2624  << ", Label: \"" << this->getObjectLabel () << "\""
2625  << ", Global dimensions: ["
2626  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2627  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
2628  << ", Block size: " << getBlockSize ()
2629  << " }";
2630  return os.str ();
2631  }
2632 
2633 
2634  template<class Scalar, class LO, class GO, class Node>
2635  void
2637  describe (Teuchos::FancyOStream& out,
2638  const Teuchos::EVerbosityLevel verbLevel) const
2639  {
2640  using Teuchos::ArrayRCP;
2641  using Teuchos::CommRequest;
2642  using Teuchos::FancyOStream;
2643  using Teuchos::getFancyOStream;
2644  using Teuchos::ireceive;
2645  using Teuchos::isend;
2646  using Teuchos::outArg;
2647  using Teuchos::TypeNameTraits;
2648  using Teuchos::VERB_DEFAULT;
2649  using Teuchos::VERB_NONE;
2650  using Teuchos::VERB_LOW;
2651  using Teuchos::VERB_MEDIUM;
2652  // using Teuchos::VERB_HIGH;
2653  using Teuchos::VERB_EXTREME;
2654  using Teuchos::RCP;
2655  using Teuchos::wait;
2656  using std::endl;
2657 
2658  // Set default verbosity if applicable.
2659  const Teuchos::EVerbosityLevel vl =
2660  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
2661 
2662  if (vl == VERB_NONE) {
2663  return; // print nothing
2664  }
2665 
2666  // describe() always starts with a tab before it prints anything.
2667  Teuchos::OSTab tab0 (out);
2668 
2669  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
2670  Teuchos::OSTab tab1 (out);
2671 
2672  out << "Template parameters:" << endl;
2673  {
2674  Teuchos::OSTab tab2 (out);
2675  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
2676  << "LO: " << TypeNameTraits<LO>::name () << endl
2677  << "GO: " << TypeNameTraits<GO>::name () << endl
2678  << "Node: " << TypeNameTraits<Node>::name () << endl;
2679  }
2680  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
2681  << "Global dimensions: ["
2682  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2683  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
2684 
2685  const LO blockSize = getBlockSize ();
2686  out << "Block size: " << blockSize << endl;
2687 
2688  // constituent objects
2689  if (vl >= VERB_MEDIUM) {
2690  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2691  const int myRank = comm.getRank ();
2692  if (myRank == 0) {
2693  out << "Row Map:" << endl;
2694  }
2695  getRowMap()->describe (out, vl);
2696 
2697  if (! getColMap ().is_null ()) {
2698  if (getColMap() == getRowMap()) {
2699  if (myRank == 0) {
2700  out << "Column Map: same as row Map" << endl;
2701  }
2702  }
2703  else {
2704  if (myRank == 0) {
2705  out << "Column Map:" << endl;
2706  }
2707  getColMap ()->describe (out, vl);
2708  }
2709  }
2710  if (! getDomainMap ().is_null ()) {
2711  if (getDomainMap () == getRowMap ()) {
2712  if (myRank == 0) {
2713  out << "Domain Map: same as row Map" << endl;
2714  }
2715  }
2716  else if (getDomainMap () == getColMap ()) {
2717  if (myRank == 0) {
2718  out << "Domain Map: same as column Map" << endl;
2719  }
2720  }
2721  else {
2722  if (myRank == 0) {
2723  out << "Domain Map:" << endl;
2724  }
2725  getDomainMap ()->describe (out, vl);
2726  }
2727  }
2728  if (! getRangeMap ().is_null ()) {
2729  if (getRangeMap () == getDomainMap ()) {
2730  if (myRank == 0) {
2731  out << "Range Map: same as domain Map" << endl;
2732  }
2733  }
2734  else if (getRangeMap () == getRowMap ()) {
2735  if (myRank == 0) {
2736  out << "Range Map: same as row Map" << endl;
2737  }
2738  }
2739  else {
2740  if (myRank == 0) {
2741  out << "Range Map: " << endl;
2742  }
2743  getRangeMap ()->describe (out, vl);
2744  }
2745  }
2746  }
2747 
2748  if (vl >= VERB_EXTREME) {
2749  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2750  const int myRank = comm.getRank ();
2751  const int numProcs = comm.getSize ();
2752 
2753  // Print the calling process' data to the given output stream.
2754  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
2755  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
2756  FancyOStream& os = *osPtr;
2757  os << "Process " << myRank << ":" << endl;
2758  Teuchos::OSTab tab2 (os);
2759 
2760  const map_type& meshRowMap = * (graph_.getRowMap ());
2761  const map_type& meshColMap = * (graph_.getColMap ());
2762  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
2763  meshLclRow <= meshRowMap.getMaxLocalIndex ();
2764  ++meshLclRow) {
2765  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
2766  os << "Row " << meshGblRow << ": {";
2767 
2768  const LO* lclColInds = NULL;
2769  Scalar* vals = NULL;
2770  LO numInds = 0;
2771  this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
2772 
2773  for (LO k = 0; k < numInds; ++k) {
2774  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
2775 
2776  os << "Col " << gblCol << ": [";
2777  for (LO i = 0; i < blockSize; ++i) {
2778  for (LO j = 0; j < blockSize; ++j) {
2779  os << vals[blockSize*blockSize*k + i*blockSize + j];
2780  if (j + 1 < blockSize) {
2781  os << ", ";
2782  }
2783  }
2784  if (i + 1 < blockSize) {
2785  os << "; ";
2786  }
2787  }
2788  os << "]";
2789  if (k + 1 < numInds) {
2790  os << ", ";
2791  }
2792  }
2793  os << "}" << endl;
2794  }
2795 
2796  // Print data on Process 0. This will automatically respect the
2797  // current indentation level.
2798  if (myRank == 0) {
2799  out << lclOutStrPtr->str ();
2800  lclOutStrPtr = Teuchos::null; // clear it to save space
2801  }
2802 
2803  const int sizeTag = 1337;
2804  const int dataTag = 1338;
2805 
2806  ArrayRCP<char> recvDataBuf; // only used on Process 0
2807 
2808  // Send string sizes and data from each process in turn to
2809  // Process 0, and print on that process.
2810  for (int p = 1; p < numProcs; ++p) {
2811  if (myRank == 0) {
2812  // Receive the incoming string's length.
2813  ArrayRCP<size_t> recvSize (1);
2814  recvSize[0] = 0;
2815  RCP<CommRequest<int> > recvSizeReq =
2816  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
2817  wait<int> (comm, outArg (recvSizeReq));
2818  const size_t numCharsToRecv = recvSize[0];
2819 
2820  // Allocate space for the string to receive. Reuse receive
2821  // buffer space if possible. We can do this because in the
2822  // current implementation, we only have one receive in
2823  // flight at a time. Leave space for the '\0' at the end,
2824  // in case the sender doesn't send it.
2825  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
2826  recvDataBuf.resize (numCharsToRecv + 1);
2827  }
2828  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
2829  // Post the receive of the actual string data.
2830  RCP<CommRequest<int> > recvDataReq =
2831  ireceive<int, char> (recvData, p, dataTag, comm);
2832  wait<int> (comm, outArg (recvDataReq));
2833 
2834  // Print the received data. This will respect the current
2835  // indentation level. Make sure that the string is
2836  // null-terminated.
2837  recvDataBuf[numCharsToRecv] = '\0';
2838  out << recvDataBuf.getRawPtr ();
2839  }
2840  else if (myRank == p) { // if I am not Process 0, and my rank is p
2841  // This deep-copies the string at most twice, depending on
2842  // whether std::string reference counts internally (it
2843  // generally does, so this won't deep-copy at all).
2844  const std::string stringToSend = lclOutStrPtr->str ();
2845  lclOutStrPtr = Teuchos::null; // clear original to save space
2846 
2847  // Send the string's length to Process 0.
2848  const size_t numCharsToSend = stringToSend.size ();
2849  ArrayRCP<size_t> sendSize (1);
2850  sendSize[0] = numCharsToSend;
2851  RCP<CommRequest<int> > sendSizeReq =
2852  isend<int, size_t> (sendSize, 0, sizeTag, comm);
2853  wait<int> (comm, outArg (sendSizeReq));
2854 
2855  // Send the actual string to Process 0. We know that the
2856  // string has length > 0, so it's save to take the address
2857  // of the first entry. Make a nonowning ArrayRCP to hold
2858  // the string. Process 0 will add a null termination
2859  // character at the end of the string, after it receives the
2860  // message.
2861  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
2862  RCP<CommRequest<int> > sendDataReq =
2863  isend<int, char> (sendData, 0, dataTag, comm);
2864  wait<int> (comm, outArg (sendDataReq));
2865  }
2866  } // for each process rank p other than 0
2867  } // extreme verbosity level (print the whole matrix)
2868  }
2869 
2870  template<class Scalar, class LO, class GO, class Node>
2871  Teuchos::RCP<const Teuchos::Comm<int> >
2873  getComm() const
2874  {
2875  return graph_.getComm();
2876  }
2877 
2878  template<class Scalar, class LO, class GO, class Node>
2879  Teuchos::RCP<Node>
2881  getNode() const
2882  {
2883  return graph_.getNode();
2884 
2885  }
2886 
2887  template<class Scalar, class LO, class GO, class Node>
2891  {
2892  return graph_.getGlobalNumCols();
2893  }
2894 
2895  template<class Scalar, class LO, class GO, class Node>
2896  size_t
2899  {
2900  return graph_.getNodeNumCols();
2901  }
2902 
2903  template<class Scalar, class LO, class GO, class Node>
2904  GO
2907  {
2908  return graph_.getIndexBase();
2909  }
2910 
2911  template<class Scalar, class LO, class GO, class Node>
2915  {
2916  return graph_.getGlobalNumEntries();
2917  }
2918 
2919  template<class Scalar, class LO, class GO, class Node>
2920  size_t
2923  {
2924  return graph_.getNodeNumEntries();
2925  }
2926 
2927  template<class Scalar, class LO, class GO, class Node>
2928  size_t
2930  getNumEntriesInGlobalRow (GO globalRow) const
2931  {
2932  return graph_.getNumEntriesInGlobalRow(globalRow);
2933  }
2934 
2935  template<class Scalar, class LO, class GO, class Node>
2939  {
2940  return graph_.getGlobalNumDiags();
2941  }
2942 
2943  template<class Scalar, class LO, class GO, class Node>
2944  size_t
2947  {
2948  return graph_.getNodeNumDiags();
2949  }
2950 
2951  template<class Scalar, class LO, class GO, class Node>
2952  size_t
2955  {
2956  return graph_.getGlobalMaxNumRowEntries();
2957  }
2958 
2959  template<class Scalar, class LO, class GO, class Node>
2960  bool
2962  hasColMap() const
2963  {
2964  return graph_.hasColMap();
2965  }
2966 
2967  template<class Scalar, class LO, class GO, class Node>
2968  bool
2971  {
2972  return graph_.isLowerTriangular();
2973  }
2974 
2975  template<class Scalar, class LO, class GO, class Node>
2976  bool
2979  {
2980  return graph_.isUpperTriangular();
2981  }
2982 
2983  template<class Scalar, class LO, class GO, class Node>
2984  bool
2987  {
2988  return graph_.isLocallyIndexed();
2989  }
2990 
2991  template<class Scalar, class LO, class GO, class Node>
2992  bool
2995  {
2996  return graph_.isGloballyIndexed();
2997  }
2998 
2999  template<class Scalar, class LO, class GO, class Node>
3000  bool
3003  {
3004  return graph_.isFillComplete ();
3005  }
3006 
3007  template<class Scalar, class LO, class GO, class Node>
3008  bool
3011  {
3012  return true;
3013  }
3014 
3015 
3016  template<class Scalar, class LO, class GO, class Node>
3017  void
3019  getGlobalRowCopy (GO GlobalRow,
3020  const Teuchos::ArrayView<GO> &Indices,
3021  const Teuchos::ArrayView<Scalar> &Values,
3022  size_t &NumEntries) const
3023  {
3024  TEUCHOS_TEST_FOR_EXCEPTION(
3025  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowCopy: "
3026  "This class doesn't support global matrix indexing.");
3027 
3028  }
3029 
3030  template<class Scalar, class LO, class GO, class Node>
3031  void
3033  getGlobalRowView (GO GlobalRow,
3034  Teuchos::ArrayView<const GO> &indices,
3035  Teuchos::ArrayView<const Scalar> &values) const
3036  {
3037  TEUCHOS_TEST_FOR_EXCEPTION(
3038  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
3039  "This class doesn't support global matrix indexing.");
3040 
3041  }
3042 
3043  template<class Scalar, class LO, class GO, class Node>
3044  void
3046  getLocalRowView (LO LocalRow,
3047  Teuchos::ArrayView<const LO> &indices,
3048  Teuchos::ArrayView<const Scalar> &values) const
3049  {
3050  TEUCHOS_TEST_FOR_EXCEPTION(
3051  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
3052  "This class doesn't support global matrix indexing.");
3053 
3054  }
3055 
3056 
3057  template<class Scalar, class LO, class GO, class Node>
3058  void
3061  {
3062  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3063  int rowOffset = 0;
3064  int offset = 0;
3065  LO bs = getBlockSize();
3066  Teuchos::ArrayRCP<size_t> colOffsets;
3067  getLocalDiagOffsets (colOffsets);
3068  for(size_t r=0; r<getNodeNumRows(); r++)
3069  {
3070  // move pointer to start of diagonal block
3071  offset = rowOffset + colOffsets[r]*bs*bs;
3072  for(int b=0; b<bs; b++)
3073  {
3074  std::cout << "offset: " << offset+b*(bs+1) << std::endl;
3075  diag.replaceLocalValue(r*bs+b, val_[offset+b*(bs+1)]);
3076  }
3077  // move pointer to start of next block row
3078  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3079  }
3080 
3081 #if 0
3082  Teuchos::RCP<Teuchos::FancyOStream> wrappedStream = Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout));
3083  diag.describe (*wrappedStream, Teuchos::VERB_EXTREME);
3084 
3085  std::cout << "Raw data:\n";
3086  int nnz = getNodeNumEntries()*bs*bs;
3087  for(int i=0; i<nnz; i++)
3088  std::cout << "val[" << i << "] = " << val_[i] << std::endl;
3089 #endif // 0
3090  }
3091 
3092  template<class Scalar, class LO, class GO, class Node>
3093  void
3096  {
3097  TEUCHOS_TEST_FOR_EXCEPTION(
3098  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::leftScale: "
3099  "not implemented.");
3100 
3101  }
3102 
3103  template<class Scalar, class LO, class GO, class Node>
3104  void
3107  {
3108  TEUCHOS_TEST_FOR_EXCEPTION(
3109  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::rightScale: "
3110  "not implemented.");
3111 
3112  }
3113 
3114  template<class Scalar, class LO, class GO, class Node>
3115  Teuchos::RCP<const Tpetra::RowGraph<LO, GO, Node> >
3117  getGraph() const
3118  {
3119  return graphRCP_;
3120  }
3121 
3122  template<class Scalar, class LO, class GO, class Node>
3126  {
3127  TEUCHOS_TEST_FOR_EXCEPTION(
3128  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getFrobeniusNorm: "
3129  "not implemented.");
3130  }
3131 
3132 } // namespace Experimental
3133 } // namespace Tpetra
3134 
3135 //
3136 // Explicit instantiation macro
3137 //
3138 // Must be expanded from within the Tpetra::Experimental namespace!
3139 //
3140 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3141  template class Experimental::BlockCrsMatrix< S, LO, GO, NODE >;
3142 
3143 #endif // TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
Teuchos::ArrayRCP< const LocalOrdinal > getNodePackedIndices() const
Get an Teuchos::ArrayRCP of the packed column-indices.
LO absMaxLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValuesByOffsets, but for the ABSMAX combine mode.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
bool isGloballyIndexed() const
If graph indices are in the global range, this function returns true. Otherwise, this function return...
Teuchos::RCP< const Comm< int > > getComm() const
Returns the communicator.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
virtual bool supportsRowViews() const
Whether this object implements getLocalRowView() and getGlobalRowView().
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value) const
Replace current value at the specified location with specified values.
size_t findLocalIndex(const RowInfo &rowinfo, const LocalOrdinal ind, const size_t hint=0) const
Find the column offset corresponding to the given (local) column index.
virtual void copyAndPermute(const Tpetra::SrcDistObject &source, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Perform copies and permutations that are local to this process.
virtual bool isLowerTriangular() const
Whether this matrix is lower triangular.
Teuchos::RCP< const map_type > getRowMap() const
Returns the Map that describes the row distribution in this graph.
Teuchos::RCP< const map_type > getColMap() const
get the (mesh) map for the columns of this block matrix.
LO getBlockSize() const
The number of degrees of freedom per mesh point.
Teuchos::RCP< const map_type > getRowMap() const
get the (mesh) map for the rows of this block matrix.
Teuchos::RCP< const map_type > getRangeMap() const
Returns the Map associated with the domain of this graph.
mv_type mv_
The Tpetra::MultiVector used to represent the data.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
size_t getNodeNumCols() const
Returns the number of columns connected to the locally owned rows of this graph.
local_graph_type getLocalGraph() const
Get the local graph.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
void COPY(const ViewType1 &x, const ViewType2 &y)
y := x, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dimension(s)...
bool hasColMap() const
Whether the graph has a column Map.
void putScalar(const Scalar &val)
Fill all entries with the given value val.
size_t getGlobalMaxNumRowEntries() const
Maximum number of entries in all rows over all processes.
global_size_t getGlobalNumRows() const
Returns the number of global rows in the graph.
size_t getNodeNumRows() const
get the local number of block rows
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
Traits class for packing / unpacking data of type T, using Kokkos data structures that live in the gi...
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object with some verbosity level to a FancyOStream.
One or more distributed dense vectors.
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const
Print a description of this object to the given output stream.
MultiVector for multiple degrees of freedom per mesh point.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
global_size_t getGlobalNumRows() const
get the global number of block rows
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
Teuchos::RCP< const import_type > getImporter() const
Returns the importer associated with this graph.
Nonowning view of a square dense block in a block matrix.
LocalOrdinal getMinLocalIndex() const
The minimum local index.
void deep_copy(const LittleBlock< ST2, LO > &dst, const LittleBlock< ST1, LO > &src, typename std::enable_if< std::is_convertible< ST1, ST2 >::value &&!std::is_const< ST2 >::value, int >::type *=NULL)
Copy the LittleBlock src into the LittleBlock dst.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Like sumIntoLocalValues, but for the ABSMAX combine mode.
Teuchos::RCP< const map_type > getRangeMap() const
Get the (point) range Map of this matrix.
virtual bool isUpperTriangular() const
Whether this matrix is upper triangular.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.
bool locallySameAs(const Map< LocalOrdinal, GlobalOrdinal, node_type > &map) const
Is the given Map locally the same as the input Map?
size_t getNodeMaxNumRowEntries() const
Maximum number of entries in all rows owned by the calling process.
void scale(const Scalar &val)
Multiply all entries in place by the given value val.
virtual bool checkSizes(const Tpetra::SrcDistObject &source)
Compare the source and target (this) objects for compatibility.
little_vec_type getLocalBlock(const LO localRowIndex, const LO colIndex) const
Get a view of the degrees of freedom at the given mesh point.
Kokkos::View< char *, D, Kokkos::MemoryUnmanaged > output_buffer_type
The type of an output buffer of bytes.
bool isNodeLocalElement(LocalOrdinal localIndex) const
Whether the given local index is valid for this Map on the calling process.
virtual bool hasColMap() const
Whether this matrix has a well-defined column map.
bool isFillComplete() const
Returns true if fillComplete() has been called and the graph is in compute mode.
Teuchos::RCP< const map_type > getDomainMap() const
Get the (point) domain Map of this matrix.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
virtual size_t getNodeNumDiags() const
The number of local diagonal entries, based on global row/column index comparisons.
global_size_t getGlobalNumEntries() const
Returns the global number of entries in the graph.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
virtual GO getIndexBase() const
The index base for global indices in this matrix.
virtual bool isGloballyIndexed() const
Whether matrix indices are globally indexed.
size_t global_size_t
Global size_t object.
Insert new values that don&#39;t currently exist.
bool isLocallyIndexed() const
If graph indices are in the local range, this function returns true. Otherwise, this function returns...
std::string description() const
One-line description of this object.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
Teuchos::RCP< const map_type > getDomainMap() const
Returns the Map associated with the domain of this graph.
Teuchos::RCP< node_type > getNode() const
Returns the underlying node.
void GEMV(const CoefficientType &alpha, const LittleBlockType &A, const LittleVectorType1 &x, const LittleVectorType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
GlobalOrdinal getIndexBase() const
Returns the index base for global indices for this graph.
virtual global_size_t getGlobalNumCols() const
The global number of columns of this matrix.
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
virtual bool isFillComplete() const
Whether fillComplete() has been called.
void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
Nonowning view of a set of degrees of freedom corresponding to a mesh point in a block vector or mult...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
LO getNumVectors() const
Get the number of columns (vectors) in the BlockMultiVector.
bool isUpperTriangular() const
Whether the graph is locally upper triangular.
Sets up and executes a communication plan for a Tpetra DistObject.
GlobalOrdinal getGlobalElement(LocalOrdinal localIndex) const
The global index corresponding to the given local index.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
void reorderedGaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const MultiVector< Scalar, LO, GO, Node > &B, const MultiVector< Scalar, LO, GO, Node > &D, const ArrayView< LO > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of reorderedGaussSeidel(), with fewer requirements on X.
Tpetra::MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node >::mag_type mag_type
Type of a norm result.
Kokkos::View< const value_type *, D, Kokkos::MemoryUnmanaged > input_array_type
The type of an input array of value_type.
virtual size_t getNodeNumCols() const
The number of columns needed to apply the forward operator on this node.
virtual size_t getNodeNumEntries() const
The local number of stored (structurally nonzero) entries.
virtual void leftScale(const Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the left with the given Vector x.
void getLocalDiagCopy(BlockCrsMatrix< Scalar, LO, GO, Node > &diag, const Teuchos::ArrayView< const size_t > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets.
virtual global_size_t getGlobalNumEntries() const
The global number of stored (structurally nonzero) entries.
virtual void getGlobalRowCopy(GO GlobalRow, const Teuchos::ArrayView< GO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Get a copy of the given global row&#39;s entries.
void getLocalRowCopy(LO LocalRow, const Teuchos::ArrayView< LO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Not implemented.
Kokkos::View< const char *, D, Kokkos::MemoryUnmanaged > input_buffer_type
The type of an input buffer of bytes.
Replace old value with maximum of magnitudes of old and new values.
global_size_t getGlobalNumCols() const
Returns the number of global columns in the graph.
Abstract base class for objects that can be the source of an Import or Export operation.
LO getLocalRowView(const LO localRowInd, const LO *&colInds, Scalar *&vals, LO &numInds) const
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
void gaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const MultiVector< Scalar, LO, GO, Node > &B, const MultiVector< Scalar, LO, GO, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of gaussSeidel(), with fewer requirements on X.
Replace existing values with new values.
Teuchos::RCP< const map_type > getColMap() const
Returns the Map that describes the column distribution in this graph.
Replace old values with zero.
std::string errorMessages() const
The current stream of error messages.
Teuchos::DataAccess getCopyOrView() const
Get whether this has copy (copyOrView = Teuchos::Copy) or view (copyOrView = Teuchos::View) semantics...
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual size_t getGlobalMaxNumRowEntries() const
The maximum number of entries across all rows/columns on all nodes.
size_t getNodeNumEntries() const
Returns the local number of entries in the graph.
Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
global_size_t getGlobalNumDiags() const
Returns the number of global diagonal entries, based on global row/column index comparisons.
LocalOrdinal getLocalElement(GlobalOrdinal globalIndex) const
The local index corresponding to the given global index.
virtual void rightScale(const Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the right with the given Vector x.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
virtual Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const
The Frobenius norm of the matrix.
size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const
Returns the current number of entries on this node in the specified local row.
Kokkos::View< value_type *, D, Kokkos::MemoryUnmanaged > output_array_type
The type of an output array of value_type.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
A distributed dense vector.
size_t getNodeNumDiags() const
Returns the number of local diagonal entries, based on global row/column index comparisons.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which this matrix is distributed.
size_t getNodeNumRows() const
Returns the number of graph rows owned on the calling node.
void doExport(const SrcDistObject &source, const Export< LO, GO, Node > &exporter, CombineMode CM)
Export data into this object using an Export object ("forward mode").
virtual global_size_t getGlobalNumDiags() const
The number of global diagonal entries, based on global row/column index comparisons.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
BlockMultiVector< Scalar, LO, GO, Node >::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
virtual Teuchos::RCP< Node > getNode() const
The Kokkos Node instance.
void GETRS(const char mode[], const LittleBlockType &A, const LittleIntVectorType &ipiv, const LittleScalarVectorType &B, int &info)
Solve the linear system(s) AX=B, using the result of GETRF or GETF2.
Declaration of Tpetra::Experimental::BlockCrsMatrix.
size_t getNodeMaxNumRowEntries() const
The maximum number of entries across all rows/columns on this node.
bool isLowerTriangular() const
Whether the graph is locally lower triangular.
LocalOrdinal getMaxLocalIndex() const
The maximum local index on the calling process.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const
The current number of entries on the calling process in the specified global row. ...
virtual Teuchos::RCP< const Tpetra::RowGraph< LO, GO, Node > > getGraph() const
Get the (mesh) graph.
size_t getNumEntriesInLocalRow(const LO localRowInd) const
Return the number of entries in the given row on the calling process.
Teuchos::RCP< const export_type > getExporter() const
Returns the exporter associated with this graph.
void localGaussSeidel(const BlockMultiVector< Scalar, LO, GO, Node > &Residual, BlockMultiVector< Scalar, LO, GO, Node > &Solution, const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &factoredDiagonal, const Kokkos::View< int **, device_type, Kokkos::MemoryUnmanaged > &factorizationPivots, const Scalar &omega, const ESweepDirection direction) const
Local Gauss-Seidel solve, given a factorized diagonal.
virtual void getGlobalRowView(GO GlobalRow, Teuchos::ArrayView< const GO > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
virtual bool isLocallyIndexed() const
Whether matrix indices are locally indexed.
RowInfo getRowInfo(const LocalOrdinal myRow) const
Get information about the locally owned row with local index myRow.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const
Returns the current number of entries on this node in the specified global row.