Tpetra parallel linear algebra  Version of the Day
Tpetra_Import_Util2.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_IMPORT_UTIL2_HPP
43 #define TPETRA_IMPORT_UTIL2_HPP
44 
49 
50 #include "Tpetra_ConfigDefs.hpp"
52 #include "Tpetra_Import.hpp"
53 #include "Tpetra_HashTable.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Util.hpp"
56 #include "Tpetra_Distributor.hpp"
57 #include <Teuchos_Array.hpp>
58 #include <utility>
59 
60 // Tpetra::CrsMatrix uses the functions below in its implementation.
61 // To avoid a circular include issue, only include the declarations
62 // for CrsMatrix. We will include the definition after the functions
63 // here have been defined.
65 
66 
67 namespace { // (anonymous)
68 
69  template<class T, class D>
70  Kokkos::View<T*, D, Kokkos::MemoryUnmanaged>
71  getNonconstView (const Teuchos::ArrayView<T>& x)
72  {
73  typedef Kokkos::View<T*, D, Kokkos::MemoryUnmanaged> view_type;
74  typedef typename view_type::size_type size_type;
75  const size_type numEnt = static_cast<size_type> (x.size ());
76  return view_type (x.getRawPtr (), numEnt);
77  }
78 
79  template<class T, class D>
80  Kokkos::View<const T*, D, Kokkos::MemoryUnmanaged>
81  getConstView (const Teuchos::ArrayView<const T>& x)
82  {
83  typedef Kokkos::View<const T*, D, Kokkos::MemoryUnmanaged> view_type;
84  typedef typename view_type::size_type size_type;
85  const size_type numEnt = static_cast<size_type> (x.size ());
86  return view_type (x.getRawPtr (), numEnt);
87  }
88 
89  // For a given Kokkos (execution or memory) space, return both its
90  // execution space, and the corresponding host execution space.
91  template<class Space>
92  struct GetHostExecSpace {
93  typedef typename Space::execution_space execution_space;
94  typedef typename Kokkos::View<int*, execution_space>::HostMirror::execution_space host_execution_space;
95  };
96 
97 } // namespace (anonymous)
98 
99 namespace Tpetra {
100 namespace Import_Util {
101 
116 template<typename Scalar,
117  typename LocalOrdinal,
118  typename GlobalOrdinal,
119  typename Node>
120 void
121 packAndPrepareWithOwningPIDs (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
122  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
123  Teuchos::Array<char>& exports,
124  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
125  size_t& constantNumPackets,
126  Distributor &distor,
127  const Teuchos::ArrayView<const int>& SourcePids);
128 
144 template<typename Scalar,
145  typename LocalOrdinal,
146  typename GlobalOrdinal,
147  typename Node>
148 size_t
149 unpackAndCombineWithOwningPIDsCount (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
150  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
151  const Teuchos::ArrayView<const char> &imports,
152  const Teuchos::ArrayView<size_t> &numPacketsPerLID,
153  size_t constantNumPackets,
154  Distributor &distor,
155  CombineMode combineMode,
156  size_t numSameIDs,
157  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
158  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs);
159 
174 template<typename Scalar,
175  typename LocalOrdinal,
176  typename GlobalOrdinal,
177  typename Node>
178 void
179 unpackAndCombineIntoCrsArrays (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
180  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
181  const Teuchos::ArrayView<const char>& imports,
182  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
183  size_t constantNumPackets,
184  Distributor &distor,
185  CombineMode combineMode,
186  size_t numSameIDs,
187  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
188  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
189  size_t TargetNumRows,
190  size_t TargetNumNonzeros,
191  int MyTargetPID,
192  const Teuchos::ArrayView<size_t>& rowPointers,
193  const Teuchos::ArrayView<GlobalOrdinal>& columnIndices,
194  const Teuchos::ArrayView<Scalar>& values,
195  const Teuchos::ArrayView<const int>& SourcePids,
196  Teuchos::Array<int>& TargetPids);
197 
200 template<typename Scalar, typename Ordinal>
201 void
202 sortCrsEntries (const Teuchos::ArrayView<size_t>& CRS_rowptr,
203  const Teuchos::ArrayView<Ordinal>& CRS_colind,
204  const Teuchos::ArrayView<Scalar>&CRS_vals);
205 
210 template<typename Scalar, typename Ordinal>
211 void
212 sortAndMergeCrsEntries (const Teuchos::ArrayView<size_t>& CRS_rowptr,
213  const Teuchos::ArrayView<Ordinal>& CRS_colind,
214  const Teuchos::ArrayView<Scalar>& CRS_vals);
215 
231 template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
232 void
233 lowCommunicationMakeColMapAndReindex (const Teuchos::ArrayView<const size_t> &rowPointers,
234  const Teuchos::ArrayView<LocalOrdinal> &columnIndices_LID,
235  const Teuchos::ArrayView<GlobalOrdinal> &columnIndices_GID,
236  const Tpetra::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & domainMap,
237  const Teuchos::ArrayView<const int> &owningPids,
238  Teuchos::Array<int> &remotePids,
239  Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & colMap);
240 
241 
242 } // namespace Import_Util
243 } // namespace Tpetra
244 
245 
246 //
247 // Implementations
248 //
249 
250 namespace { // (anonymous)
251 
268  template<class LO, class GO, class D>
269  size_t
270  packRowCount (const size_t numEnt,
271  const size_t numBytesPerValue)
272  {
274 
275  if (numEnt == 0) {
276  // Empty rows always take zero bytes, to ensure sparsity.
277  return 0;
278  }
279  else {
280  // We store the number of entries as a local index (LO).
281  LO numEntLO = 0; // packValueCount wants this.
282  GO gid;
283  int lid;
284  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
285  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
286  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (lid);
287  const size_t valsLen = numEnt * numBytesPerValue;
288  return numEntLen + gidsLen + pidsLen + valsLen;
289  }
290  }
291 
292  template<class LO, class D>
293  size_t
294  unpackRowCount (const typename Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
295  const size_t offset,
296  const size_t numBytes,
297  const size_t numBytesPerValue)
298  {
299  using Kokkos::subview;
301  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
302  typedef typename input_buffer_type::size_type size_type;
303 
304  if (numBytes == 0) {
305  // Empty rows always take zero bytes, to ensure sparsity.
306  return static_cast<size_t> (0);
307  }
308  else {
309  LO numEntLO = 0;
310  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
311 #ifdef HAVE_TPETRA_DEBUG
312  TEUCHOS_TEST_FOR_EXCEPTION(
313  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
314  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
315  << ".");
316 #endif // HAVE_TPETRA_DEBUG
317  const std::pair<size_type, size_type> rng (offset, offset + theNumBytes);
318  input_buffer_type inBuf = subview (imports, rng); // imports (offset, theNumBytes);
319  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
320  (void)actualNumBytes;
321 #ifdef HAVE_TPETRA_DEBUG
322  TEUCHOS_TEST_FOR_EXCEPTION(
323  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
324  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
325  << ".");
326 #endif // HAVE_TPETRA_DEBUG
327  return static_cast<size_t> (numEntLO);
328  }
329  }
330 
331  // Return the number of bytes packed.
332  template<class ST, class LO, class GO, class D>
333  size_t
334  packRow (const typename Tpetra::Details::PackTraits<LO, D>::output_buffer_type& exports,
335  const size_t offset,
336  const size_t numEnt,
340  const size_t numBytesPerValue)
341  {
342  using Kokkos::subview;
344  // NOTE (mfh 02 Feb 2015) This assumes that output_buffer_type is
345  // the same, no matter what type we're packing. It's a reasonable
346  // assumption, given that we go through the trouble of PackTraits
347  // just so that we can pack data of different types in the same
348  // buffer.
349  typedef typename PackTraits<LO, D>::output_buffer_type output_buffer_type;
350  typedef typename output_buffer_type::size_type size_type;
351  typedef typename std::pair<size_type, size_type> pair_type;
352 
353  if (numEnt == 0) {
354  // Empty rows always take zero bytes, to ensure sparsity.
355  return 0;
356  }
357 
358  const GO gid = 0; // packValueCount wants this
359  const LO numEntLO = static_cast<size_t> (numEnt);
360  const int pid = 0; // packValueCount wants this
361 
362  const size_t numEntBeg = offset;
363  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
364  const size_t gidsBeg = numEntBeg + numEntLen;
365  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
366  const size_t pidsBeg = gidsBeg + gidsLen;
367  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (pid);
368  const size_t valsBeg = pidsBeg + pidsLen;
369  const size_t valsLen = numEnt * numBytesPerValue;
370 
371  output_buffer_type numEntOut =
372  subview (exports, pair_type (numEntBeg, numEntBeg + numEntLen));
373  output_buffer_type gidsOut =
374  subview (exports, pair_type (gidsBeg, gidsBeg + gidsLen));
375  output_buffer_type pidsOut =
376  subview (exports, pair_type (pidsBeg, pidsBeg + pidsLen));
377  output_buffer_type valsOut =
378  subview (exports, pair_type (valsBeg, valsBeg + valsLen));
379 
380  size_t numBytesOut = 0;
381  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
382  numBytesOut += PackTraits<GO, D>::packArray (gidsOut, gidsIn, numEnt);
383  numBytesOut += PackTraits<int, D>::packArray (pidsOut, pidsIn, numEnt);
384  numBytesOut += PackTraits<ST, D>::packArray (valsOut, valsIn, numEnt);
385 
386  const size_t expectedNumBytes = numEntLen + gidsLen + pidsLen + valsLen;
387  TEUCHOS_TEST_FOR_EXCEPTION(
388  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
389  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
390  << expectedNumBytes << ".");
391 
392  return numBytesOut;
393  }
394 
395  // Return the number of bytes actually read / used.
396  template<class ST, class LO, class GO, class D>
397  size_t
398  unpackRow (const typename Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
402  const size_t offset,
403  const size_t numBytes,
404  const size_t numEnt,
405  const size_t numBytesPerValue)
406  {
407  using Kokkos::subview;
409  // NOTE (mfh 02 Feb 2015) This assumes that input_buffer_type is
410  // the same, no matter what type we're unpacking. It's a
411  // reasonable assumption, given that we go through the trouble of
412  // PackTraits just so that we can pack data of different types in
413  // the same buffer.
414  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
415  typedef typename input_buffer_type::size_type size_type;
416  typedef typename std::pair<size_type, size_type> pair_type;
417 
418  if (numBytes == 0) {
419  // Rows with zero bytes always have zero entries.
420  return 0;
421  }
422  TEUCHOS_TEST_FOR_EXCEPTION(
423  static_cast<size_t> (imports.dimension_0 ()) <= offset, std::logic_error,
424  "unpackRow: imports.dimension_0() = " << imports.dimension_0 () <<
425  " <= offset = " << offset << ".");
426  TEUCHOS_TEST_FOR_EXCEPTION(
427  static_cast<size_t> (imports.dimension_0 ()) < offset + numBytes,
428  std::logic_error, "unpackRow: imports.dimension_0() = "
429  << imports.dimension_0 () << " < offset + numBytes = "
430  << (offset + numBytes) << ".");
431 
432  const GO gid = 0; // packValueCount wants this
433  const LO lid = 0; // packValueCount wants this
434  const int pid = 0; // packValueCount wants this
435 
436  const size_t numEntBeg = offset;
437  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
438  const size_t gidsBeg = numEntBeg + numEntLen;
439  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
440  const size_t pidsBeg = gidsBeg + gidsLen;
441  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (pid);
442  const size_t valsBeg = pidsBeg + pidsLen;
443  const size_t valsLen = numEnt * numBytesPerValue;
444 
445  input_buffer_type numEntIn = subview (imports, pair_type (numEntBeg, numEntBeg + numEntLen));
446  input_buffer_type gidsIn = subview (imports, pair_type (gidsBeg, gidsBeg + gidsLen));
447  input_buffer_type pidsIn = subview (imports, pair_type (pidsBeg, pidsBeg + pidsLen));
448  input_buffer_type valsIn = subview (imports, pair_type (valsBeg, valsBeg + valsLen));
449 
450  size_t numBytesOut = 0;
451  LO numEntOut;
452  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
453  TEUCHOS_TEST_FOR_EXCEPTION(
454  static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
455  "unpackRow: Expected number of entries " << numEnt
456  << " != actual number of entries " << numEntOut << ".");
457 
458  numBytesOut += PackTraits<GO, D>::unpackArray (gidsOut, gidsIn, numEnt);
459  numBytesOut += PackTraits<int, D>::unpackArray (pidsOut, pidsIn, numEnt);
460  numBytesOut += PackTraits<ST, D>::unpackArray (valsOut, valsIn, numEnt);
461 
462  TEUCHOS_TEST_FOR_EXCEPTION(
463  numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
464  << numBytesOut << " != numBytes = " << numBytes << ".");
465  const size_t expectedNumBytes = numEntLen + gidsLen + pidsLen + valsLen;
466  TEUCHOS_TEST_FOR_EXCEPTION(
467  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
468  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
469  << expectedNumBytes << ".");
470  return numBytesOut;
471  }
472 
473 } // namespace (anonymous)
474 
475 
476 namespace Tpetra {
477 namespace Import_Util {
478 
479 template<typename Scalar,
480  typename LocalOrdinal,
481  typename GlobalOrdinal,
482  typename Node>
483 void
485  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
486  Teuchos::Array<char>& exports,
487  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
488  size_t& constantNumPackets,
489  Distributor &distor,
490  const Teuchos::ArrayView<const int>& SourcePids)
491 {
493  using Kokkos::MemoryUnmanaged;
494  using Kokkos::subview;
495  using Kokkos::View;
496  using Teuchos::Array;
497  using Teuchos::ArrayView;
498  using Teuchos::as;
499  using Teuchos::RCP;
500  typedef LocalOrdinal LO;
501  typedef GlobalOrdinal GO;
503  typedef typename matrix_type::impl_scalar_type ST;
504  typedef typename Node::execution_space execution_space;
505  typedef typename GetHostExecSpace<execution_space>::host_execution_space HES;
506  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
507  typedef typename ArrayView<const LO>::size_type size_type;
508  typedef std::pair<typename View<int*, HES>::size_type,
509  typename View<int*, HES>::size_type> pair_type;
510  const char prefix[] = "Tpetra::Import_Util::packAndPrepareWithOwningPIDs: ";
511 
512  // FIXME (mfh 03 Jan 2015) Currently, it might be the case that if a
513  // graph or matrix owns no entries on a process, then it reports
514  // that it is neither locally nor globally indexed, even if the
515  // graph or matrix has a column Map. This should change once we get
516  // rid of lazy initialization in CrsGraph and CrsMatrix.
517  TEUCHOS_TEST_FOR_EXCEPTION(
518  ! SourceMatrix.isLocallyIndexed (), std::invalid_argument,
519  prefix << "SourceMatrix must be locally indexed.");
520  TEUCHOS_TEST_FOR_EXCEPTION(
521  SourceMatrix.getColMap ().is_null (), std::logic_error,
522  prefix << "The source matrix claims to be locally indexed, but its column "
523  "Map is null. This should never happen. Please report this bug to the "
524  "Tpetra developers.");
525  const size_type numExportLIDs = exportLIDs.size ();
526  TEUCHOS_TEST_FOR_EXCEPTION(
527  numExportLIDs != numPacketsPerLID.size (), std::invalid_argument, prefix
528  << "exportLIDs.size() = " << numExportLIDs << "!= numPacketsPerLID.size() "
529  << " = " << numPacketsPerLID.size () << ".");
530  TEUCHOS_TEST_FOR_EXCEPTION(
531  static_cast<size_t> (SourcePids.size ()) != SourceMatrix.getColMap ()->getNodeNumElements (),
532  std::invalid_argument, prefix << "SourcePids.size() = "
533  << SourcePids.size ()
534  << "!= SourceMatrix.getColMap()->getNodeNumElements() = "
535  << SourceMatrix.getColMap ()->getNodeNumElements () << ".");
536 
537  // This tells the caller that different rows may have different
538  // numbers of entries. That is, the number of packets per LID might
539  // not be a constant.
540  constantNumPackets = 0;
541 
542  // Compute the number of bytes ("packets") per row to pack. While
543  // we're at it, compute the total number of matrix entries to send,
544  // and the max number of entries in any of the rows we're sending.
545  size_t totalNumBytes = 0;
546  size_t totalNumEntries = 0;
547  size_t maxRowLength = 0;
548  for (size_type i = 0; i < numExportLIDs; ++i) {
549  const LO lclRow = exportLIDs[i];
550  const size_t numEnt = SourceMatrix.getNumEntriesInLocalRow (lclRow);
551 
552  // The 'if' branch implicitly assumes that packRowCount() returns
553  // zero if numEnt == 0.
554  size_t numBytesPerValue = 0;
555  if (numEnt > 0) {
556  // Get a locally indexed view of the current row's data. If the
557  // current row has > 0 entries, we need an entry in order to
558  // figure out the byte count of the packed row. (We really only
559  // need it if ST's size is determined at run time.)
560  ArrayView<const Scalar> valsView;
561  ArrayView<const LO> lidsView;
562  SourceMatrix.getLocalRowView (lclRow, lidsView, valsView);
563  const ST* valsViewRaw = reinterpret_cast<const ST*> (valsView.getRawPtr ());
564  View<const ST*, HES, MemoryUnmanaged> valsViewK (valsViewRaw, valsView.size ());
565  TEUCHOS_TEST_FOR_EXCEPTION(
566  static_cast<size_t> (valsViewK.dimension_0 ()) != numEnt,
567  std::logic_error, prefix << "Local row " << i << " claims to have "
568  << numEnt << "entry/ies, but the View returned by getLocalRowView() "
569  "has " << valsViewK.dimension_0 () << " entry/ies. This should never "
570  "happen. Please report this bug to the Tpetra developers.");
571 
572  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
573  // space here for now, this doesn't assume UVM. That may change
574  // in the future, if we ever start packing on the device.
575  numBytesPerValue = PackTraits<ST, HES>::packValueCount (valsViewK(0));
576  }
577 
578  const size_t numBytes =
579  packRowCount<LO, GO, HES> (numEnt, numBytesPerValue);
580  numPacketsPerLID[i] = numBytes;
581  totalNumBytes += numBytes;
582  totalNumEntries += numEnt;
583  maxRowLength = std::max (maxRowLength, numEnt);
584  }
585 
586  // We use a "struct of arrays" approach to packing each row's
587  // entries. All the column indices (as global indices) go first,
588  // then all their owning process ranks, and then the values.
589  if (totalNumEntries > 0) {
590  exports.resize (totalNumBytes);
591  View<char*, HES, MemoryUnmanaged> exportsK (exports.getRawPtr (),
592  totalNumBytes);
593  // Current position (in bytes) in the 'exports' output array.
594  size_t offset = 0;
595 
596  // For each row of the matrix owned by the calling process, pack
597  // that row's column indices and values into the exports array.
598 
599  // Locally indexed matrices always have a column Map.
600  const map_type& colMap = * (SourceMatrix.getColMap ());
601 
602  // Temporary buffers for a copy of the column gids/pids
603  View<GO*, HES> gids;
604  View<int*, HES> pids;
605  {
606  GO gid;
607  int pid;
608  gids = PackTraits<GO, HES>::allocateArray (gid, maxRowLength, "gids");
609  pids = PackTraits<int, HES>::allocateArray (pid, maxRowLength, "pids");
610  }
611 
612  for (size_type i = 0; i < numExportLIDs; i++) {
613  const LO lclRow = exportLIDs[i];
614 
615  // Get a locally indexed view of the current row's data.
616  ArrayView<const Scalar> valsView;
617  ArrayView<const LO> lidsView;
618  SourceMatrix.getLocalRowView (lclRow, lidsView, valsView);
619  const ST* valsViewRaw = reinterpret_cast<const ST*> (valsView.getRawPtr ());
620  View<const ST*, HES, MemoryUnmanaged> valsViewK (valsViewRaw, valsView.size ());
621  const size_t numEnt = static_cast<size_t> (valsViewK.dimension_0 ());
622 
623  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
624  // space here for now, this doesn't assume UVM. That may change
625  // in the future, if we ever start packing on the device.
626  const size_t numBytesPerValue = numEnt == 0 ?
627  static_cast<size_t> (0) :
628  PackTraits<ST, HES>::packValueCount (valsViewK(0));
629 
630  // Convert column indices as LIDs to column indices as GIDs.
631  View<GO*, HES> gidsView = subview (gids, pair_type (0, numEnt));
632  View<int*, HES> pidsView = subview (pids, pair_type (0, numEnt));
633  for (size_t k = 0; k < numEnt; ++k) {
634  gidsView(k) = colMap.getGlobalElement (lidsView[k]);
635  pidsView(k) = SourcePids[lidsView[k]];
636  }
637 
638  // Copy the row's data into the current spot in the exports array.
639  const size_t numBytes =
640  packRow<ST, LO, GO, HES> (exportsK, offset, numEnt,
641  gidsView, pidsView, valsViewK,
642  numBytesPerValue);
643  // Keep track of how many bytes we packed.
644  offset += numBytes;
645  }
646 
647 #ifdef HAVE_TPETRA_DEBUG
648  TEUCHOS_TEST_FOR_EXCEPTION(
649  offset != totalNumBytes, std::logic_error, prefix << "At end of method, "
650  "the final offset (in bytes) " << offset << " does not equal the total "
651  "number of bytes packed " << totalNumBytes << ". Please report this bug "
652  "to the Tpetra developers.");
653 #endif // HAVE_TPETRA_DEBUG
654  }
655 }
656 
657 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
658 size_t
660  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
661  const Teuchos::ArrayView<const char> &imports,
662  const Teuchos::ArrayView<size_t> &numPacketsPerLID,
663  size_t constantNumPackets,
664  Distributor &distor,
665  CombineMode combineMode,
666  size_t numSameIDs,
667  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
668  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
669 {
670  using Kokkos::MemoryUnmanaged;
671  using Kokkos::View;
672  typedef LocalOrdinal LO;
673  typedef GlobalOrdinal GO;
674  typedef CrsMatrix<Scalar, LO, GO, Node> matrix_type;
675  typedef typename matrix_type::impl_scalar_type ST;
676  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
677  typedef typename Node::execution_space execution_space;
678  typedef typename GetHostExecSpace<execution_space>::host_execution_space HES;
679  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
680 
681  TEUCHOS_TEST_FOR_EXCEPTION(
682  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
683  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
684  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
685  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
686  // process, then the matrix is neither locally nor globally indexed.
687  const bool locallyIndexed = SourceMatrix.isLocallyIndexed ();
688  TEUCHOS_TEST_FOR_EXCEPTION(
689  ! locallyIndexed, std::invalid_argument, prefix << "The input CrsMatrix "
690  "'SourceMatrix' must be locally indexed.");
691  TEUCHOS_TEST_FOR_EXCEPTION(
692  importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
693  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
694  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
695 
696  View<const char*, HES, MemoryUnmanaged> importsK (imports.getRawPtr (),
697  imports.size ());
698 
699  // Number of matrix entries to unpack (returned by this function).
700  size_t nnz = 0;
701 
702  // Count entries copied directly from the source matrix without permuting.
703  for (size_t sourceLID = 0; sourceLID < numSameIDs; ++sourceLID) {
704  const LO srcLID = static_cast<LO> (sourceLID);
705  nnz += SourceMatrix.getNumEntriesInLocalRow (srcLID);
706  }
707 
708  // Count entries copied directly from the source matrix with permuting.
709  const size_type numPermuteToLIDs = permuteToLIDs.size ();
710  for (size_type p = 0; p < numPermuteToLIDs; ++p) {
711  nnz += SourceMatrix.getNumEntriesInLocalRow (permuteFromLIDs[p]);
712  }
713 
714  // Count entries received from other MPI processes.
715  size_t offset = 0;
716  const size_type numImportLIDs = importLIDs.size ();
717  for (size_type i = 0; i < numImportLIDs; ++i) {
718  const size_t numBytes = numPacketsPerLID[i];
719  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
720  // values, if it has one) for the (possibly run-time-depenendent)
721  // number of bytes of one of its entries.
722  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset,
723  numBytes, sizeof (ST));
724  nnz += numEnt;
725  offset += numBytes;
726  }
727  return nnz;
728 }
729 
730 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
731 void
733  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
734  const Teuchos::ArrayView<const char>& imports,
735  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
736  const size_t constantNumPackets,
737  Distributor& distor,
738  const CombineMode combineMode,
739  const size_t numSameIDs,
740  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
741  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
742  size_t TargetNumRows,
743  size_t TargetNumNonzeros,
744  const int MyTargetPID,
745  const Teuchos::ArrayView<size_t>& CSR_rowptr,
746  const Teuchos::ArrayView<GlobalOrdinal>& CSR_colind,
747  const Teuchos::ArrayView<Scalar>& CSR_vals,
748  const Teuchos::ArrayView<const int>& SourcePids,
749  Teuchos::Array<int>& TargetPids)
750 {
752  using Kokkos::MemoryUnmanaged;
753  using Kokkos::subview;
754  using Kokkos::View;
755  using Teuchos::ArrayView;
756  using Teuchos::as;
757  using Teuchos::av_reinterpret_cast;
758  using Teuchos::outArg;
759  using Teuchos::REDUCE_MAX;
760  using Teuchos::reduceAll;
761  typedef LocalOrdinal LO;
762  typedef GlobalOrdinal GO;
763  typedef typename Node::execution_space execution_space;
764  typedef typename GetHostExecSpace<execution_space>::host_execution_space HES;
766  typedef typename matrix_type::impl_scalar_type ST;
767  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
768  typedef typename ArrayView<const LO>::size_type size_type;
769  //typedef std::pair<typename Kokkos::View<int*, HES>::size_type,
770  // typename Kokkos::View<int*, HES>::size_type> pair_type;
771  const char prefix[] = "Tpetra::Import_Util::unpackAndCombineIntoCrsArrays: ";
772 
773  const size_t N = TargetNumRows;
774  const size_t mynnz = TargetNumNonzeros;
775  // In the case of reduced communicators, the SourceMatrix won't have
776  // the right "MyPID", so thus we have to supply it.
777  const int MyPID = MyTargetPID;
778 
779  TEUCHOS_TEST_FOR_EXCEPTION(
780  TargetNumRows + 1 != static_cast<size_t> (CSR_rowptr.size ()),
781  std::invalid_argument, prefix << "CSR_rowptr.size() = " <<
782  CSR_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
783  TEUCHOS_TEST_FOR_EXCEPTION(
784  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
785  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
786  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
787  const size_type numImportLIDs = importLIDs.size ();
788  TEUCHOS_TEST_FOR_EXCEPTION(
789  numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
790  prefix << "importLIDs.size() = " << numImportLIDs << " != "
791  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
792 
793  // Kokkos::View of the input buffer of bytes to unpack.
794  View<const char*, HES, MemoryUnmanaged> importsK (imports.getRawPtr (),
795  imports.size ());
796  // Zero the rowptr
797  for (size_t i = 0; i< N+1; ++i) {
798  CSR_rowptr[i] = 0;
799  }
800 
801  // SameIDs: Always first, always in the same place
802  for (size_t i = 0; i < numSameIDs; ++i) {
803  CSR_rowptr[i] = SourceMatrix.getNumEntriesInLocalRow (static_cast<LO> (i));
804  }
805 
806  // PermuteIDs: Still local, but reordered
807  size_t numPermuteIDs = permuteToLIDs.size ();
808  for (size_t i = 0; i < numPermuteIDs; ++i) {
809  CSR_rowptr[permuteToLIDs[i]] =
810  SourceMatrix.getNumEntriesInLocalRow (permuteFromLIDs[i]);
811  }
812 
813  // Setup CSR_rowptr for remotes
814  {
815  size_t offset = 0;
816  for (size_type k = 0; k < numImportLIDs; ++k) {
817  const size_t numBytes = numPacketsPerLID[k];
818  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
819  // values, if it has one) for the (possibly run-time -
820  // depenendent) number of bytes of one of its entries.
821  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset,
822  numBytes, sizeof (ST));
823  CSR_rowptr[importLIDs[k]] += numEnt;
824  offset += numBytes;
825  }
826  }
827 
828  // If multiple processes contribute to the same row, we may need to
829  // update row offsets. This tracks that.
830  Teuchos::Array<size_t> NewStartRow (N + 1);
831 
832  // Turn row length into a real CSR_rowptr
833  size_t last_len = CSR_rowptr[0];
834  CSR_rowptr[0] = 0;
835  for (size_t i = 1; i < N+1; ++i) {
836  size_t new_len = CSR_rowptr[i];
837  CSR_rowptr[i] = last_len + CSR_rowptr[i-1];
838  NewStartRow[i] = CSR_rowptr[i];
839  last_len = new_len;
840  }
841 
842  TEUCHOS_TEST_FOR_EXCEPTION(
843  CSR_rowptr[N] != mynnz, std::invalid_argument, prefix << "CSR_rowptr[last]"
844  " = " << CSR_rowptr[N] << "!= mynnz = " << mynnz << ".");
845 
846  // Preseed TargetPids with -1 for local
847  if (static_cast<size_t> (TargetPids.size ()) != mynnz) {
848  TargetPids.resize (mynnz);
849  }
850  TargetPids.assign (mynnz, -1);
851 
852  // Grab pointers for SourceMatrix
853  ArrayRCP<const size_t> Source_rowptr_RCP;
854  ArrayRCP<const LO> Source_colind_RCP;
855  ArrayRCP<const Scalar> Source_vals_RCP;
856  SourceMatrix.getAllValues (Source_rowptr_RCP, Source_colind_RCP, Source_vals_RCP);
857  ArrayView<const size_t> Source_rowptr = Source_rowptr_RCP ();
858  ArrayView<const LO> Source_colind = Source_colind_RCP ();
859  ArrayView<const Scalar> Source_vals = Source_vals_RCP ();
860 
861  const map_type& sourceColMap = * (SourceMatrix.getColMap());
862  ArrayView<const GO> globalColElts = sourceColMap.getNodeElementList();
863 
864  // SameIDs: Copy the data over
865  for (size_t i = 0; i < numSameIDs; ++i) {
866  size_t FromRow = Source_rowptr[i];
867  size_t ToRow = CSR_rowptr[i];
868  NewStartRow[i] += Source_rowptr[i+1] - Source_rowptr[i];
869 
870  for (size_t j = Source_rowptr[i]; j < Source_rowptr[i+1]; ++j) {
871  CSR_vals[ToRow + j - FromRow] = Source_vals[j];
872  CSR_colind[ToRow + j - FromRow] = globalColElts[Source_colind[j]];
873  TargetPids[ToRow + j - FromRow] =
874  (SourcePids[Source_colind[j]] != MyPID) ?
875  SourcePids[Source_colind[j]] : -1;
876  }
877  }
878 
879  size_t numBytesPerValue = 0;
880  if (PackTraits<ST, HES>::compileTimeSize) {
881  ST val; // assume that ST is default constructible
882  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
883  }
884  else {
885  // Since the packed data come from the source matrix, we can use
886  // the source matrix to get the number of bytes per Scalar value
887  // stored in the matrix. This assumes that all Scalar values in
888  // the source matrix require the same number of bytes. If the
889  // source matrix has no entries on the calling process, then we
890  // have to ask the target matrix (via the output CSR arrays). If
891  // the target matrix has no entries on input on the calling
892  // process, then hope that some process does have some idea how
893  // big a Scalar value is. Of course, if no processes have any
894  // entries, then no values should be packed (though this does
895  // assume that in our packing scheme, rows with zero entries take
896  // zero bytes).
897  if (Source_rowptr.size () == 0 || Source_rowptr[Source_rowptr.size () - 1] == 0) {
898  numBytesPerValue = PackTraits<ST, HES>::packValueCount (CSR_vals[0]);
899  }
900  else {
901  numBytesPerValue = PackTraits<ST, HES>::packValueCount (Source_vals[0]);
902  }
903  size_t lclNumBytesPerValue = numBytesPerValue;
904  Teuchos::RCP<const Teuchos::Comm<int> > comm = SourceMatrix.getComm ();
905  reduceAll<int, size_t> (*comm, REDUCE_MAX, lclNumBytesPerValue,
906  outArg (numBytesPerValue));
907  }
908 
909  // PermuteIDs: Copy the data over
910  for (size_t i = 0; i < numPermuteIDs; ++i) {
911  LO FromLID = permuteFromLIDs[i];
912  size_t FromRow = Source_rowptr[FromLID];
913  size_t ToRow = CSR_rowptr[permuteToLIDs[i]];
914 
915  NewStartRow[permuteToLIDs[i]] += Source_rowptr[FromLID+1]-Source_rowptr[FromLID];
916 
917  for (size_t j = Source_rowptr[FromLID]; j < Source_rowptr[FromLID+1]; ++j) {
918  CSR_vals[ToRow + j - FromRow] = Source_vals[j];
919  CSR_colind[ToRow + j - FromRow] = globalColElts[Source_colind[j]];
920  TargetPids[ToRow + j - FromRow] =
921  (SourcePids[Source_colind[j]] != MyPID) ?
922  SourcePids[Source_colind[j]] : -1;
923  }
924  }
925 
926  // RemoteIDs: Loop structure following UnpackAndCombine
927  if (imports.size () > 0) {
928  size_t offset = 0;
929 #ifdef HAVE_TPETRA_DEBUG
930  int lclErr = 0;
931 #endif
932 
933  for (size_t i = 0; i < static_cast<size_t> (numImportLIDs); ++i) {
934  const size_t numBytes = numPacketsPerLID[i];
935  if (numBytes == 0) {
936  // Empty buffer for that row means that the row is empty.
937  continue;
938  }
939  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
940  // values, if it has one) for the (possibly run-time -
941  // depenendent) number of bytes of one of its entries.
942  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset, numBytes,
943  numBytesPerValue);
944  const LO lclRow = importLIDs[i];
945  const size_t StartRow = NewStartRow[lclRow];
946  NewStartRow[lclRow] += numEnt;
947 
948  View<GO*, HES, MemoryUnmanaged> gidsOut =
949  getNonconstView<GO, HES> (CSR_colind (StartRow, numEnt));
950  View<int*, HES, MemoryUnmanaged> pidsOut =
951  getNonconstView<int, HES> (TargetPids (StartRow, numEnt));
952  ArrayView<Scalar> valsOutS = CSR_vals (StartRow, numEnt);
953  View<ST*, HES, MemoryUnmanaged> valsOut =
954  getNonconstView<ST, HES> (av_reinterpret_cast<ST> (valsOutS));
955 
956  const size_t numBytesOut =
957  unpackRow<ST, LO, GO, HES> (gidsOut, pidsOut, valsOut, importsK,
958  offset, numBytes, numEnt, numBytesPerValue);
959  if (numBytesOut != numBytes) {
960 #ifdef HAVE_TPETRA_DEBUG
961  lclErr = 1;
962 #endif
963  break;
964  }
965 
966  // Correct target PIDs.
967  for (size_t j = 0; j < numEnt; ++j) {
968  const int pid = pidsOut[j];
969  pidsOut[j] = (pid != MyPID) ? pid : -1;
970  }
971  offset += numBytes;
972  }
973 #ifdef HAVE_TPETRA_DEBUG
974  TEUCHOS_TEST_FOR_EXCEPTION(
975  offset != static_cast<size_t> (imports.size ()), std::logic_error, prefix
976  << "After unpacking and counting all the import packets, the final offset"
977  " in bytes " << offset << " != imports.size() = " << imports.size () <<
978  ". Please report this bug to the Tpetra developers.");
979  TEUCHOS_TEST_FOR_EXCEPTION(
980  lclErr != 0, std::logic_error, prefix << "numBytes != numBytesOut "
981  "somewhere in unpack loop. This should never happen. "
982  "Please report this bug to the Tpetra developers.");
983 #endif // HAVE_TPETRA_DEBUG
984  }
985 }
986 
987 // Note: This should get merged with the other Tpetra sort routines eventually.
988 template<typename Scalar, typename Ordinal>
989 void
990 sortCrsEntries (const Teuchos::ArrayView<size_t> &CRS_rowptr,
991  const Teuchos::ArrayView<Ordinal> & CRS_colind,
992  const Teuchos::ArrayView<Scalar> &CRS_vals)
993 {
994  // For each row, sort column entries from smallest to largest.
995  // Use shell sort. Stable sort so it is fast if indices are already sorted.
996  // Code copied from Epetra_CrsMatrix::SortEntries()
997  size_t NumRows = CRS_rowptr.size()-1;
998  size_t nnz = CRS_colind.size();
999 
1000  for(size_t i = 0; i < NumRows; i++){
1001  size_t start=CRS_rowptr[i];
1002  if(start >= nnz) continue;
1003 
1004  Scalar* locValues = &CRS_vals[start];
1005  size_t NumEntries = CRS_rowptr[i+1] - start;
1006  Ordinal* locIndices = &CRS_colind[start];
1007 
1008  Ordinal n = NumEntries;
1009  Ordinal m = n/2;
1010 
1011  while(m > 0) {
1012  Ordinal max = n - m;
1013  for(Ordinal j = 0; j < max; j++) {
1014  for(Ordinal k = j; k >= 0; k-=m) {
1015  if(locIndices[k+m] >= locIndices[k])
1016  break;
1017  Scalar dtemp = locValues[k+m];
1018  locValues[k+m] = locValues[k];
1019  locValues[k] = dtemp;
1020  Ordinal itemp = locIndices[k+m];
1021  locIndices[k+m] = locIndices[k];
1022  locIndices[k] = itemp;
1023  }
1024  }
1025  m = m/2;
1026  }
1027  }
1028 }
1029 
1030 // Note: This should get merged with the other Tpetra sort routines eventually.
1031 template<typename Scalar, typename Ordinal>
1032 void
1033 sortAndMergeCrsEntries (const Teuchos::ArrayView<size_t> &CRS_rowptr,
1034  const Teuchos::ArrayView<Ordinal> & CRS_colind,
1035  const Teuchos::ArrayView<Scalar> &CRS_vals)
1036 {
1037  // For each row, sort column entries from smallest to largest,
1038  // merging column ids that are identify by adding values. Use shell
1039  // sort. Stable sort so it is fast if indices are already sorted.
1040  // Code copied from Epetra_CrsMatrix::SortEntries()
1041 
1042  size_t NumRows = CRS_rowptr.size()-1;
1043  size_t nnz = CRS_colind.size();
1044  size_t new_curr=CRS_rowptr[0], old_curr=CRS_rowptr[0];
1045 
1046  for(size_t i = 0; i < NumRows; i++){
1047  size_t start=CRS_rowptr[i];
1048  if(start >= nnz) continue;
1049 
1050  Scalar* locValues = &CRS_vals[start];
1051  size_t NumEntries = CRS_rowptr[i+1] - start;
1052  Ordinal* locIndices = &CRS_colind[start];
1053 
1054  // Sort phase
1055  Ordinal n = NumEntries;
1056  Ordinal m = n/2;
1057 
1058  while(m > 0) {
1059  Ordinal max = n - m;
1060  for(Ordinal j = 0; j < max; j++) {
1061  for(Ordinal k = j; k >= 0; k-=m) {
1062  if(locIndices[k+m] >= locIndices[k])
1063  break;
1064  Scalar dtemp = locValues[k+m];
1065  locValues[k+m] = locValues[k];
1066  locValues[k] = dtemp;
1067  Ordinal itemp = locIndices[k+m];
1068  locIndices[k+m] = locIndices[k];
1069  locIndices[k] = itemp;
1070  }
1071  }
1072  m = m/2;
1073  }
1074 
1075  // Merge & shrink
1076  for(size_t j=CRS_rowptr[i]; j < CRS_rowptr[i+1]; j++) {
1077  if(j > CRS_rowptr[i] && CRS_colind[j]==CRS_colind[new_curr-1]) {
1078  CRS_vals[new_curr-1] += CRS_vals[j];
1079  }
1080  else if(new_curr==j) {
1081  new_curr++;
1082  }
1083  else {
1084  CRS_colind[new_curr] = CRS_colind[j];
1085  CRS_vals[new_curr] = CRS_vals[j];
1086  new_curr++;
1087  }
1088  }
1089 
1090  CRS_rowptr[i] = old_curr;
1091  old_curr=new_curr;
1092  }
1093 
1094  CRS_rowptr[NumRows] = new_curr;
1095 }
1096 
1097 template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1098 void
1099 lowCommunicationMakeColMapAndReindex (const ArrayView<const size_t> &rowptr,
1100  const ArrayView<LocalOrdinal> &colind_LID,
1101  const ArrayView<GlobalOrdinal> &colind_GID,
1102  const Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMapRCP,
1103  const Teuchos::ArrayView<const int> &owningPIDs,
1104  Teuchos::Array<int> &remotePIDs,
1105  Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & colMap)
1106 {
1107  using Teuchos::rcp;
1108  typedef LocalOrdinal LO;
1109  typedef GlobalOrdinal GO;
1110  typedef Tpetra::global_size_t GST;
1111  typedef Tpetra::Map<LO, GO, Node> map_type;
1112  const char prefix[] = "lowCommunicationMakeColMapAndReindex: ";
1113 
1114  // The domainMap is an RCP because there is a shortcut for a
1115  // (common) special case to return the columnMap = domainMap.
1116  const map_type& domainMap = *domainMapRCP;
1117 
1118  // Scan all column indices and sort into two groups:
1119  // Local: those whose GID matches a GID of the domain map on this processor and
1120  // Remote: All others.
1121  const size_t numDomainElements = domainMap.getNodeNumElements ();
1122  Teuchos::Array<bool> LocalGIDs;
1123  if (numDomainElements > 0) {
1124  LocalGIDs.resize (numDomainElements, false); // Assume domain GIDs are not local
1125  }
1126 
1127  // In principle it is good to have RemoteGIDs and RemotGIDList be as
1128  // long as the number of remote GIDs on this processor, but this
1129  // would require two passes through the column IDs, so we make it
1130  // the max of 100 and the number of block rows.
1131  //
1132  // FIXME (mfh 11 Feb 2015) Tpetra::Details::HashTable can hold at
1133  // most INT_MAX entries, but it's possible to have more rows than
1134  // that (if size_t is 64 bits and int is 32 bits).
1135  const size_t numMyRows = rowptr.size () - 1;
1136  const int hashsize = std::max (static_cast<int> (numMyRows), 100);
1137 
1138  Tpetra::Details::HashTable<GO, LO> RemoteGIDs (hashsize);
1139  Teuchos::Array<GO> RemoteGIDList;
1140  RemoteGIDList.reserve (hashsize);
1141  Teuchos::Array<int> PIDList;
1142  PIDList.reserve (hashsize);
1143 
1144  // Here we start using the *LocalOrdinal* colind_LID array. This is
1145  // safe even if both columnIndices arrays are actually the same
1146  // (because LocalOrdinal==GO). For *local* GID's set
1147  // colind_LID with with their LID in the domainMap. For *remote*
1148  // GIDs, we set colind_LID with (numDomainElements+NumRemoteColGIDs)
1149  // before the increment of the remote count. These numberings will
1150  // be separate because no local LID is greater than
1151  // numDomainElements.
1152 
1153  size_t NumLocalColGIDs = 0;
1154  LO NumRemoteColGIDs = 0;
1155  for (size_t i = 0; i < numMyRows; ++i) {
1156  for(size_t j = rowptr[i]; j < rowptr[i+1]; ++j) {
1157  const GO GID = colind_GID[j];
1158  // Check if GID matches a row GID
1159  const LO LID = domainMap.getLocalElement (GID);
1160  if(LID != -1) {
1161  const bool alreadyFound = LocalGIDs[LID];
1162  if (! alreadyFound) {
1163  LocalGIDs[LID] = true; // There is a column in the graph associated with this domain map GID
1164  NumLocalColGIDs++;
1165  }
1166  colind_LID[j] = LID;
1167  }
1168  else {
1169  const LO hash_value = RemoteGIDs.get (GID);
1170  if (hash_value == -1) { // This means its a new remote GID
1171  const int PID = owningPIDs[j];
1172  TEUCHOS_TEST_FOR_EXCEPTION(
1173  PID == -1, std::invalid_argument, prefix << "Cannot figure out if "
1174  "PID is owned.");
1175  colind_LID[j] = static_cast<LO> (numDomainElements + NumRemoteColGIDs);
1176  RemoteGIDs.add (GID, NumRemoteColGIDs);
1177  RemoteGIDList.push_back (GID);
1178  PIDList.push_back (PID);
1179  NumRemoteColGIDs++;
1180  }
1181  else {
1182  colind_LID[j] = static_cast<LO> (numDomainElements + hash_value);
1183  }
1184  }
1185  }
1186  }
1187 
1188  // Possible short-circuit: If all domain map GIDs are present as
1189  // column indices, then set ColMap=domainMap and quit.
1190  if (domainMap.getComm ()->getSize () == 1) {
1191  // Sanity check: When there is only one process, there can be no
1192  // remoteGIDs.
1193  TEUCHOS_TEST_FOR_EXCEPTION(
1194  NumRemoteColGIDs != 0, std::runtime_error, prefix << "There is only one "
1195  "process in the domain Map's communicator, which means that there are no "
1196  "\"remote\" indices. Nevertheless, some column indices are not in the "
1197  "domain Map.");
1198  if (static_cast<size_t> (NumLocalColGIDs) == numDomainElements) {
1199  // In this case, we just use the domainMap's indices, which is,
1200  // not coincidently, what we clobbered colind with up above
1201  // anyway. No further reindexing is needed.
1202  colMap = domainMapRCP;
1203  return;
1204  }
1205  }
1206 
1207  // Now build the array containing column GIDs
1208  // Build back end, containing remote GIDs, first
1209  const LO numMyCols = NumLocalColGIDs + NumRemoteColGIDs;
1210  Teuchos::Array<GO> ColIndices;
1211  GO* RemoteColIndices = NULL;
1212  if (numMyCols > 0) {
1213  ColIndices.resize (numMyCols);
1214  if (NumLocalColGIDs != static_cast<size_t> (numMyCols)) {
1215  RemoteColIndices = &ColIndices[NumLocalColGIDs]; // Points to back half of ColIndices
1216  }
1217  }
1218 
1219  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1220  RemoteColIndices[i] = RemoteGIDList[i];
1221  }
1222 
1223  // Build permute array for *remote* reindexing.
1224  Teuchos::Array<LO> RemotePermuteIDs (NumRemoteColGIDs);
1225  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1226  RemotePermuteIDs[i]=i;
1227  }
1228 
1229  // Sort External column indices so that all columns coming from a
1230  // given remote processor are contiguous. This is a sort with two
1231  // auxillary arrays: RemoteColIndices and RemotePermuteIDs.
1232  Tpetra::sort3 (PIDList.begin (), PIDList.end (),
1233  ColIndices.begin () + NumLocalColGIDs,
1234  RemotePermuteIDs.begin ());
1235 
1236  // Stash the RemotePIDs.
1237  //
1238  // Note: If Teuchos::Array had a shrink_to_fit like std::vector,
1239  // we'd call it here.
1240  remotePIDs = PIDList;
1241 
1242  // Sort external column indices so that columns from a given remote
1243  // processor are not only contiguous but also in ascending
1244  // order. NOTE: I don't know if the number of externals associated
1245  // with a given remote processor is known at this point ... so I
1246  // count them here.
1247 
1248  // NTS: Only sort the RemoteColIndices this time...
1249  LO StartCurrent = 0, StartNext = 1;
1250  while (StartNext < NumRemoteColGIDs) {
1251  if (PIDList[StartNext]==PIDList[StartNext-1]) {
1252  StartNext++;
1253  }
1254  else {
1255  Tpetra::sort2 (ColIndices.begin () + NumLocalColGIDs + StartCurrent,
1256  ColIndices.begin () + NumLocalColGIDs + StartNext,
1257  RemotePermuteIDs.begin () + StartCurrent);
1258  StartCurrent = StartNext;
1259  StartNext++;
1260  }
1261  }
1262  Tpetra::sort2 (ColIndices.begin () + NumLocalColGIDs + StartCurrent,
1263  ColIndices.begin () + NumLocalColGIDs + StartNext,
1264  RemotePermuteIDs.begin () + StartCurrent);
1265 
1266  // Reverse the permutation to get the information we actually care about
1267  Teuchos::Array<LO> ReverseRemotePermuteIDs (NumRemoteColGIDs);
1268  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1269  ReverseRemotePermuteIDs[RemotePermuteIDs[i]] = i;
1270  }
1271 
1272  // Build permute array for *local* reindexing.
1273  bool use_local_permute = false;
1274  Teuchos::Array<LO> LocalPermuteIDs (numDomainElements);
1275 
1276  // Now fill front end. Two cases:
1277  //
1278  // (1) If the number of Local column GIDs is the same as the number
1279  // of Local domain GIDs, we can simply read the domain GIDs into
1280  // the front part of ColIndices, otherwise
1281  //
1282  // (2) We step through the GIDs of the domainMap, checking to see if
1283  // each domain GID is a column GID. we want to do this to
1284  // maintain a consistent ordering of GIDs between the columns
1285  // and the domain.
1286  Teuchos::ArrayView<const GO> domainGlobalElements = domainMap.getNodeElementList();
1287  if (static_cast<size_t> (NumLocalColGIDs) == numDomainElements) {
1288  if (NumLocalColGIDs > 0) {
1289  // Load Global Indices into first numMyCols elements column GID list
1290  std::copy (domainGlobalElements.begin (), domainGlobalElements.end (),
1291  ColIndices.begin ());
1292  }
1293  }
1294  else {
1295  LO NumLocalAgain = 0;
1296  use_local_permute = true;
1297  for (size_t i = 0; i < numDomainElements; ++i) {
1298  if (LocalGIDs[i]) {
1299  LocalPermuteIDs[i] = NumLocalAgain;
1300  ColIndices[NumLocalAgain++] = domainGlobalElements[i];
1301  }
1302  }
1303  TEUCHOS_TEST_FOR_EXCEPTION(
1304  static_cast<size_t> (NumLocalAgain) != NumLocalColGIDs,
1305  std::runtime_error, prefix << "Local ID count test failed.");
1306  }
1307 
1308  // Make column Map
1309  const GST minus_one = Teuchos::OrdinalTraits<GST>::invalid ();
1310  colMap = rcp (new map_type (minus_one, ColIndices, domainMap.getIndexBase (),
1311  domainMap.getComm (), domainMap.getNode ()));
1312 
1313  // Low-cost reindex of the matrix
1314  for (size_t i = 0; i < numMyRows; ++i) {
1315  for (size_t j = rowptr[i]; j < rowptr[i+1]; ++j) {
1316  const LO ID = colind_LID[j];
1317  if (static_cast<size_t> (ID) < numDomainElements) {
1318  if (use_local_permute) {
1319  colind_LID[j] = LocalPermuteIDs[colind_LID[j]];
1320  }
1321  // In the case where use_local_permute==false, we just copy
1322  // the DomainMap's ordering, which it so happens is what we
1323  // put in colind_LID to begin with.
1324  }
1325  else {
1326  colind_LID[j] = NumLocalColGIDs + ReverseRemotePermuteIDs[colind_LID[j]-numDomainElements];
1327  }
1328  }
1329  }
1330 }
1331 
1332 } // namespace Import_Util
1333 } // namespace Tpetra
1334 
1335 // We can include the definitions for Tpetra::CrsMatrix now that the above
1336 // functions have been defined. For ETI, this isn't necessary, so we just
1337 // including the generated hpp
1338 #include "Tpetra_CrsMatrix.hpp"
1339 
1340 #endif // TPETRA_IMPORT_UTIL_HPP
void sortAndMergeCrsEntries(const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< Ordinal > &CRS_colind, const Teuchos::ArrayView< Scalar > &CRS_vals)
Sort and merge the entries of the (raw CSR) matrix by column index within each row.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void sort3(const IT1 &first1, const IT1 &last1, const IT2 &first2, const IT3 &first3)
Sort the first array, and apply the same permutation to the second and third arrays.
Traits class for packing / unpacking data of type T, using Kokkos data structures that live in the gi...
Teuchos::RCP< const map_type > getColMap() const
The Map that describes the column distribution in this matrix.
Declaration of the Tpetra::CrsMatrix class.
bool isLocallyIndexed() const
Whether the matrix is locally indexed on the calling process.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Kokkos::View< char *, D, Kokkos::MemoryUnmanaged > output_buffer_type
The type of an output buffer of bytes.
void getLocalRowView(LocalOrdinal LocalRow, Teuchos::ArrayView< const LocalOrdinal > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
void sortCrsEntries(const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< Ordinal > &CRS_colind, const Teuchos::ArrayView< Scalar > &CRS_vals)
Sort the entries of the (raw CSR) matrix by column index within each row.
size_t global_size_t
Global size_t object.
size_t unpackAndCombineWithOwningPIDsCount(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs)
Special version of Tpetra::CrsMatrix::unpackAndCombine that also unpacks owning process ranks...
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which the matrix is distributed.
void lowCommunicationMakeColMapAndReindex(const Teuchos::ArrayView< const size_t > &rowPointers, const Teuchos::ArrayView< LocalOrdinal > &columnIndices_LID, const Teuchos::ArrayView< GlobalOrdinal > &columnIndices_GID, const Tpetra::RCP< const Tpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::ArrayView< const int > &owningPids, Teuchos::Array< int > &remotePids, Teuchos::RCP< const Tpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap)
lowCommunicationMakeColMapAndReindex
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Kokkos::View< const value_type *, D, Kokkos::MemoryUnmanaged > input_array_type
The type of an input array of value_type.
Kokkos::View< const char *, D, Kokkos::MemoryUnmanaged > input_buffer_type
The type of an input buffer of bytes.
void unpackAndCombineIntoCrsArrays(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, int MyTargetPID, const Teuchos::ArrayView< size_t > &rowPointers, const Teuchos::ArrayView< GlobalOrdinal > &columnIndices, const Teuchos::ArrayView< Scalar > &values, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2)
Sort the first array, and apply the resulting permutation to the second array.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
ValueType get(const KeyType key)
Get the value corresponding to the given key.
Describes a parallel distribution of objects over processes.
void add(const KeyType key, const ValueType value)
Add a key and its value to the hash table.
Kokkos::View< value_type *, D, Kokkos::MemoryUnmanaged > output_array_type
The type of an output array of value_type.
Stand-alone utility functions and macros.
size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const
Returns the current number of entries on this node in the specified local row.
void packAndPrepareWithOwningPIDs(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &exportLIDs, Teuchos::Array< char > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, size_t &constantNumPackets, Distributor &distor, const Teuchos::ArrayView< const int > &SourcePids)
Special version of Tpetra::CrsMatrix::packAndPrepare that also packs owning process ranks...