Tpetra parallel linear algebra  Version of the Day
Tpetra_Distributor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DISTRIBUTOR_HPP
43 #define TPETRA_DISTRIBUTOR_HPP
44 
45 #include "Tpetra_Util.hpp"
46 #include <Teuchos_as.hpp>
47 #include <Teuchos_Describable.hpp>
48 #include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
49 #include <Teuchos_VerboseObject.hpp>
50 
51 // If TPETRA_DISTRIBUTOR_TIMERS is defined, Distributor will time
52 // doPosts (both versions) and doWaits, and register those timers with
53 // Teuchos::TimeMonitor so that summarize() or report() will show
54 // results.
55 
56 // #ifndef TPETRA_DISTRIBUTOR_TIMERS
57 // # define TPETRA_DISTRIBUTOR_TIMERS 1
58 // #endif // TPETRA_DISTRIBUTOR_TIMERS
59 
60 #ifdef TPETRA_DISTRIBUTOR_TIMERS
61 # undef TPETRA_DISTRIBUTOR_TIMERS
62 #endif // TPETRA_DISTRIBUTOR_TIMERS
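// NOTE (editorial sketch, not part of the original header): if the #undef
// above were removed and TPETRA_DISTRIBUTOR_TIMERS were defined, the timers
// declared further down in this class could be printed with
// Teuchos::TimeMonitor.  The summarize() call is the standard Teuchos API;
// everything else here is illustrative only.
//
//   #include <Teuchos_TimeMonitor.hpp>
//   // ... run Distributor::doPosts() / doWaits() a few times ...
//   Teuchos::TimeMonitor::summarize (std::cout); // print all registered timers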
63 
64 #include "KokkosCompat_View.hpp"
65 #include "Kokkos_Core.hpp"
66 #include "Kokkos_TeuchosCommAdapters.hpp"
67 
68 
69 namespace Tpetra {
70 
71  namespace Details {
76  enum EDistributorSendType {
77  DISTRIBUTOR_ISEND, // Use MPI_Isend (Teuchos::isend)
78  DISTRIBUTOR_RSEND, // Use MPI_Rsend (Teuchos::readySend)
79  DISTRIBUTOR_SEND, // Use MPI_Send (Teuchos::send)
80  DISTRIBUTOR_SSEND // Use MPI_Ssend (Teuchos::ssend)
81  };
82 
87  std::string
88  DistributorSendTypeEnumToString (EDistributorSendType sendType);
89
94  enum EDistributorHowInitialized {
95  DISTRIBUTOR_NOT_INITIALIZED, // Not initialized yet
96  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS, // By createFromSends
97  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS, // By createFromRecvs
98  DISTRIBUTOR_INITIALIZED_BY_REVERSE, // By createReverseDistributor
99  DISTRIBUTOR_INITIALIZED_BY_COPY // By copy constructor
100  };
101 
106  std::string
107  DistributorHowInitializedEnumToString (EDistributorHowInitialized how);
108
109  } // namespace Details
110 
117  Array<std::string> distributorSendTypes ();
118 
186  class Distributor :
187  public Teuchos::Describable,
188  public Teuchos::ParameterListAcceptorDefaultBase {
189  public:
191 
192 
201  explicit Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm);
202 
214  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
215  const Teuchos::RCP<Teuchos::FancyOStream>& out);
216 
230  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
231  const Teuchos::RCP<Teuchos::ParameterList>& plist);
232 
249  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
250  const Teuchos::RCP<Teuchos::FancyOStream>& out,
251  const Teuchos::RCP<Teuchos::ParameterList>& plist);
252 
254  Distributor (const Distributor& distributor);
255 
257  virtual ~Distributor ();
258 
264  void swap (Distributor& rhs);
265 
267 
269 
274  void setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist);
275 
280  Teuchos::RCP<const Teuchos::ParameterList> getValidParameters () const;
281 
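// A minimal configuration sketch (not part of the original header), given a
// Tpetra::Distributor object named "distributor".  The parameter name
// "Send type" and value "Isend" are assumptions consistent with
// distributorSendTypes(); consult getValidParameters() for the authoritative
// list of parameters and values.
//
//   Teuchos::RCP<Teuchos::ParameterList> plist =
//     Teuchos::parameterList ("Tpetra::Distributor");
//   plist->set ("Send type", std::string ("Isend")); // use nonblocking sends
//   distributor.setParameterList (plist);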
283 
285 
305  size_t createFromSends (const ArrayView<const int>& exportNodeIDs);
306 
340  template <class Ordinal>
341  void
342  createFromRecvs (const ArrayView<const Ordinal>& remoteIDs,
343  const ArrayView<const int>& remoteNodeIDs,
344  Array<Ordinal>& exportIDs,
345  Array<int>& exportNodeIDs);
346 
348 
350 
354  size_t getNumReceives() const;
355 
359  size_t getNumSends() const;
360 
362  bool hasSelfMessage() const;
363 
365  size_t getMaxSendLength() const;
366 
368  size_t getTotalReceiveLength() const;
369 
374  ArrayView<const int> getImagesFrom() const;
375 
380  ArrayView<const int> getImagesTo() const;
381 
389  ArrayView<const size_t> getLengthsFrom() const;
390 
398  ArrayView<const size_t> getLengthsTo() const;
399 
404  Details::EDistributorHowInitialized howInitialized () const {
405  return howInitialized_;
406  }
407 
409 
411 
422  RCP<Distributor> getReverse() const;
423 
425 
427 
448  template <class Packet>
449  void
450  doPostsAndWaits (const ArrayView<const Packet> &exports,
451  size_t numPackets,
452  const ArrayView<Packet> &imports);
453 
475  template <class Packet>
476  void
477  doPostsAndWaits (const ArrayView<const Packet> &exports,
478  const ArrayView<size_t> &numExportPacketsPerLID,
479  const ArrayView<Packet> &imports,
480  const ArrayView<size_t> &numImportPacketsPerLID);
481 
506  template <class Packet>
507  void
508  doPosts (const ArrayRCP<const Packet> &exports,
509  size_t numPackets,
510  const ArrayRCP<Packet> &imports);
511 
530  template <class Packet>
531  void
532  doPosts (const ArrayRCP<const Packet> &exports,
533  const ArrayView<size_t> &numExportPacketsPerLID,
534  const ArrayRCP<Packet> &imports,
535  const ArrayView<size_t> &numImportPacketsPerLID);
536 
543  void doWaits ();
544 
549  template <class Packet>
550  void
551  doReversePostsAndWaits (const ArrayView<const Packet> &exports,
552  size_t numPackets,
553  const ArrayView<Packet> &imports);
554 
559  template <class Packet>
560  void
561  doReversePostsAndWaits (const ArrayView<const Packet> &exports,
562  const ArrayView<size_t> &numExportPacketsPerLID,
563  const ArrayView<Packet> &imports,
564  const ArrayView<size_t> &numImportPacketsPerLID);
565 
570  template <class Packet>
571  void
572  doReversePosts (const ArrayRCP<const Packet> &exports,
573  size_t numPackets,
574  const ArrayRCP<Packet> &imports);
575 
580  template <class Packet>
581  void
582  doReversePosts (const ArrayRCP<const Packet> &exports,
583  const ArrayView<size_t> &numExportPacketsPerLID,
584  const ArrayRCP<Packet> &imports,
585  const ArrayView<size_t> &numImportPacketsPerLID);
586 
593  void doReverseWaits ();
594 
615  template <class ExpView, class ImpView>
616  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
617  doPostsAndWaits (
618  const ExpView &exports,
619  size_t numPackets,
620  const ImpView &imports);
621 
643  template <class ExpView, class ImpView>
644  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
645  doPostsAndWaits (const ExpView &exports,
646  const ArrayView<size_t> &numExportPacketsPerLID,
647  const ImpView &imports,
648  const ArrayView<size_t> &numImportPacketsPerLID);
649 
674  template <class ExpView, class ImpView>
675  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
676  doPosts (const ExpView &exports,
677  size_t numPackets,
678  const ImpView &imports);
679 
698  template <class ExpView, class ImpView>
699  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
700  doPosts (const ExpView &exports,
701  const ArrayView<size_t> &numExportPacketsPerLID,
702  const ImpView &imports,
703  const ArrayView<size_t> &numImportPacketsPerLID);
704 
709  template <class ExpView, class ImpView>
710  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
711  doReversePostsAndWaits (const ExpView &exports,
712  size_t numPackets,
713  const ImpView &imports);
714 
719  template <class ExpView, class ImpView>
720  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
721  doReversePostsAndWaits (const ExpView &exports,
722  const ArrayView<size_t> &numExportPacketsPerLID,
723  const ImpView &imports,
724  const ArrayView<size_t> &numImportPacketsPerLID);
725 
730  template <class ExpView, class ImpView>
731  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
732  doReversePosts (const ExpView &exports,
733  size_t numPackets,
734  const ImpView &imports);
735 
740  template <class ExpView, class ImpView>
741  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
742  doReversePosts (const ExpView &exports,
743  const ArrayView<size_t> &numExportPacketsPerLID,
744  const ImpView &imports,
745  const ArrayView<size_t> &numImportPacketsPerLID);
746 
750  void getLastDoStatistics(size_t & bytes_sent, size_t & bytes_recvd) const {
751  bytes_sent = lastRoundBytesSend_;
752  bytes_recvd = lastRoundBytesRecv_;
753  }
754 
755 
757 
759 
761  std::string description() const;
762 
764  void describe (Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const;
765 
767 
768  private:
770  RCP<const Comm<int> > comm_;
771 
773  Teuchos::RCP<Teuchos::FancyOStream> out_;
774 
776  Details::EDistributorHowInitialized howInitialized_;
777 
779 
780 
783 
785  bool barrierBetween_;
786 
788  bool debug_;
789 
791 
800  bool enable_cuda_rdma_;
801 
803 
812  size_t numExports_;
813 
817  bool selfMessage_;
818 
828  size_t numSends_;
829 
834  Array<int> imagesTo_;
835 
844  Array<size_t> startsTo_;
845 
851  Array<size_t> lengthsTo_;
852 
856  size_t maxSendLength_;
857 
873  Array<size_t> indicesTo_;
874 
884  size_t numReceives_;
885 
892  size_t totalReceiveLength_;
893 
899  Array<size_t> lengthsFrom_;
900 
906  Array<int> imagesFrom_;
907 
913  Array<size_t> startsFrom_;
914 
921  Array<size_t> indicesFrom_;
922 
929  Array<RCP<Teuchos::CommRequest<int> > > requests_;
930 
935  mutable RCP<Distributor> reverseDistributor_;
936 
937 
939  size_t lastRoundBytesSend_;
940 
942  size_t lastRoundBytesRecv_;
943 
944 #ifdef TPETRA_DISTRIBUTOR_TIMERS
945  Teuchos::RCP<Teuchos::Time> timer_doPosts3_;
946  Teuchos::RCP<Teuchos::Time> timer_doPosts4_;
947  Teuchos::RCP<Teuchos::Time> timer_doWaits_;
948  Teuchos::RCP<Teuchos::Time> timer_doPosts3_recvs_;
949  Teuchos::RCP<Teuchos::Time> timer_doPosts4_recvs_;
950  Teuchos::RCP<Teuchos::Time> timer_doPosts3_barrier_;
951  Teuchos::RCP<Teuchos::Time> timer_doPosts4_barrier_;
952  Teuchos::RCP<Teuchos::Time> timer_doPosts3_sends_;
953  Teuchos::RCP<Teuchos::Time> timer_doPosts4_sends_;
954 
956  void makeTimers ();
957 #endif // TPETRA_DISTRIBUTOR_TIMERS
958 
970  bool useDistinctTags_;
971 
976  int getTag (const int pathTag) const;
977 
995  void
996  init (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
997  const Teuchos::RCP<Teuchos::FancyOStream>& out,
998  const Teuchos::RCP<Teuchos::ParameterList>& plist);
999 
1010  void computeReceives ();
1011 
1024  template <class Ordinal>
1025  void computeSends (const ArrayView<const Ordinal> &importIDs,
1026  const ArrayView<const int> &importNodeIDs,
1027  Array<Ordinal> &exportIDs,
1028  Array<int> &exportNodeIDs);
1029 
1031  void createReverseDistributor() const;
1032 
1033  }; // class Distributor
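// A minimal usage sketch (not part of the original header), assuming an
// existing communicator "comm" and one double sent per export ID.  The
// variable names and sizes are illustrative; createFromSends() and
// doPostsAndWaits() are the methods declared above.
//
//   Tpetra::Distributor distributor (comm);
//   // exportProcIDs[i] is the rank that should receive exports[i].
//   Teuchos::Array<int> exportProcIDs (numExports);
//   // ... fill exportProcIDs ...
//   const size_t numImports = distributor.createFromSends (exportProcIDs ());
//   Teuchos::Array<double> exports (numExports), imports (numImports);
//   // ... fill exports ...
//   distributor.doPostsAndWaits<double> (exports ().getConst (), 1, imports ());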
1034 
1035 
1036  template <class Packet>
1037  void Distributor::
1038  doPostsAndWaits (const ArrayView<const Packet>& exports,
1039  size_t numPackets,
1040  const ArrayView<Packet>& imports)
1041  {
1042  using Teuchos::arcp;
1043  using Teuchos::ArrayRCP;
1044  typedef typename ArrayRCP<const Packet>::size_type size_type;
1045 
1046  TEUCHOS_TEST_FOR_EXCEPTION(
1047  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1048  "doPostsAndWaits(3 args): There are " << requests_.size () <<
1049  " outstanding nonblocking messages pending. It is incorrect to call "
1050  "this method with posts outstanding.");
1051 
1052  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1053  // requiring that the memory location is persisting (as is
1054  // necessary for nonblocking receives). However, it need only
1055  // persist until doWaits() completes, so it is safe for us to use
1056  // a nonpersisting reference in this case. The use of a
1057  // nonpersisting reference is purely a performance optimization.
1058 
1059  //const Packet* exportsPtr = exports.getRawPtr();
1060  //ArrayRCP<const Packet> exportsArcp (exportsPtr, static_cast<size_type> (0),
1061  // exports.size(), false);
1062  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1063  static_cast<size_type> (0),
1064  exports.size(), false);
1065 
1066  // For some reason, neither of the options below (that use arcp)
1067  // compile for Packet=std::complex<double> with GCC 4.5.1. The
1068  // issue only arises with the exports array. This is why we
1069  // construct a separate nonowning ArrayRCP.
1070 
1071  // doPosts (arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1072  // numPackets,
1073  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1074  // doPosts (arcp<const Packet> (exportsPtr, 0, exports.size(), false),
1075  // numPackets,
1076  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1077  doPosts (exportsArcp,
1078  numPackets,
1079  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1080  doWaits ();
1081 
1082  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1083  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1084  }
1085 
1086  template <class Packet>
1087  void Distributor::
1088  doPostsAndWaits (const ArrayView<const Packet>& exports,
1089  const ArrayView<size_t> &numExportPacketsPerLID,
1090  const ArrayView<Packet> &imports,
1091  const ArrayView<size_t> &numImportPacketsPerLID)
1092  {
1093  using Teuchos::arcp;
1094  using Teuchos::ArrayRCP;
1095 
1096  TEUCHOS_TEST_FOR_EXCEPTION(
1097  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1098  "doPostsAndWaits: There are " << requests_.size () << " outstanding "
1099  "nonblocking messages pending. It is incorrect to call doPostsAndWaits "
1100  "with posts outstanding.");
1101 
1102  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1103  // requiring that the memory location is persisting (as is
1104  // necessary for nonblocking receives). However, it need only
1105  // persist until doWaits() completes, so it is safe for us to use
1106  // a nonpersisting reference in this case.
1107 
1108  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1109  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1110  // with some versions of GCC. The issue only arises with the
1111  // exports array. This is why we construct a separate nonowning
1112  // ArrayRCP.
1113  typedef typename ArrayRCP<const Packet>::size_type size_type;
1114  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1115  static_cast<size_type> (0),
1116  exports.size (), false);
1117  // mfh 04 Apr 2012: This is the offending code. This statement
1118  // would normally be in place of "exportsArcp" in the
1119  // doPosts() call below.
1120  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1121  doPosts (exportsArcp,
1122  numExportPacketsPerLID,
1123  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1124  numImportPacketsPerLID);
1125  doWaits ();
1126 
1127  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1128  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1129  }
1130 
1131 
1132  template <class Packet>
1133  void Distributor::
1134  doPosts (const ArrayRCP<const Packet>& exports,
1135  size_t numPackets,
1136  const ArrayRCP<Packet>& imports)
1137  {
1138  using Teuchos::Array;
1139  using Teuchos::as;
1140  using Teuchos::FancyOStream;
1141  using Teuchos::includesVerbLevel;
1142  using Teuchos::ireceive;
1143  using Teuchos::isend;
1144  using Teuchos::OSTab;
1145  using Teuchos::readySend;
1146  using Teuchos::send;
1147  using Teuchos::ssend;
1148  using Teuchos::TypeNameTraits;
1149  using Teuchos::typeName;
1150  using std::endl;
1151  typedef Array<size_t>::size_type size_type;
1152 
1153  Teuchos::OSTab tab (out_);
1154 
1155 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1156  Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
1157 #endif // TPETRA_DISTRIBUTOR_TIMERS
1158 
1159  // Run-time configurable parameters that come from the input
1160  // ParameterList set by setParameterList().
1161  const Details::EDistributorSendType sendType = sendType_;
1162  const bool doBarrier = barrierBetween_;
1163 
1164 // #ifdef HAVE_TEUCHOS_DEBUG
1165 // // Prepare for verbose output, if applicable.
1166 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
1167 // (void) verbLevel; // Silence "unused variable" compiler warning.
1168 // RCP<FancyOStream> out = this->getOStream ();
1169 // // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
1170 // // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
1171 // const bool doPrint = out.get () && (comm_->getRank () == 0);
1172 
1173 // if (doPrint) {
1174 // // Only need one process to print out parameters.
1175 // *out << "Distributor::doPosts (3 args)" << endl;
1176 // }
1177 // // Add one tab level. We declare this outside the doPrint scopes
1178 // // so that the tab persists until the end of this method.
1179 // OSTab tab = this->getOSTab ();
1180 // if (doPrint) {
1181 // *out << "Parameters:" << endl;
1182 // {
1183 // OSTab tab2 (out);
1184 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
1185 // << endl << "barrierBetween: " << doBarrier << endl;
1186 // }
1187 // }
1188 // #endif // HAVE_TEUCHOS_DEBUG
1189 
1190  TEUCHOS_TEST_FOR_EXCEPTION(
1191  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
1192  "Tpetra::Distributor::doPosts(3 args): Ready-send version requires a "
1193  "barrier between posting receives and posting ready sends. This should "
1194  "have been checked before. "
1195  "Please report this bug to the Tpetra developers.");
1196 
1197  const int myImageID = comm_->getRank ();
1198  size_t selfReceiveOffset = 0;
1199 
1200  // Each message has the same number of packets.
1201  //
1202  // FIXME (mfh 18 Jul 2014): Relaxing this test from strict
1203  // inequality to a less-than seems to have fixed Bug 6170. It's
1204  // OK for the 'imports' array to be longer than it needs to be;
1205  // I'm just curious why it would be.
1206  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
1207  TEUCHOS_TEST_FOR_EXCEPTION(
1208  static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1209  std::invalid_argument, "Tpetra::Distributor::doPosts(3 args): The "
1210  "'imports' array must have enough entries to hold the expected number "
1211  "of import packets. imports.size() = " << imports.size () << " < "
1212  "totalNumImportPackets = " << totalNumImportPackets << ".");
1213 
1214  // MPI tag for nonblocking receives and blocking sends in this
1215  // method. Some processes might take the "fast" path
1216  // (indicesTo_.empty()) and others might take the "slow" path for
1217  // the same doPosts() call, so the path tag must be the same for
1218  // both.
1219  const int pathTag = 0;
1220  const int tag = this->getTag (pathTag);
1221 
1222  if (debug_) {
1223  TEUCHOS_TEST_FOR_EXCEPTION(
1224  requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
1225  "doPosts(3 args): Process " << myImageID << ": requests_.size() = "
1226  << requests_.size () << " != 0.");
1227  std::ostringstream os;
1228  os << myImageID << ": doPosts(3,"
1229  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
1230  *out_ << os.str ();
1231  }
1232 
1233  // Distributor uses requests_.size() as the number of outstanding
1234  // nonblocking message requests, so we resize to zero to maintain
1235  // this invariant.
1236  //
1237  // numReceives_ does _not_ include the self message, if there is
1238  // one. Here, we do actually send a message to ourselves, so we
1239  // include any self message in the "actual" number of receives to
1240  // post.
1241  //
1242  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1243  // doesn't (re)allocate its array of requests. That happens in
1244  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1245  // demand), or Resize_().
1246  const size_type actualNumReceives = as<size_type> (numReceives_) +
1247  as<size_type> (selfMessage_ ? 1 : 0);
1248  requests_.resize (0);
1249 
1250  // Post the nonblocking receives. It's common MPI wisdom to post
1251  // receives before sends. In MPI terms, this means favoring
1252  // adding to the "posted queue" (of receive requests) over adding
1253  // to the "unexpected queue" (of arrived messages not yet matched
1254  // with a receive).
1255  {
1256 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1257  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
1258 #endif // TPETRA_DISTRIBUTOR_TIMERS
1259 
1260  size_t curBufOffset = 0;
1261  for (size_type i = 0; i < actualNumReceives; ++i) {
1262  const size_t curBufLen = lengthsFrom_[i] * numPackets;
1263  if (imagesFrom_[i] != myImageID) {
1264  // If my process is receiving these packet(s) from another
1265  // process (not a self-receive):
1266  //
1267  // 1. Set up the persisting view (recvBuf) of the imports
1268  // array, given the offset and size (total number of
1269  // packets from process imagesFrom_[i]).
1270  // 2. Start the Irecv and save the resulting request.
1271  TEUCHOS_TEST_FOR_EXCEPTION(
1272  curBufOffset + curBufLen > static_cast<size_t> (imports.size ()),
1273  std::logic_error, "Tpetra::Distributor::doPosts(3 args): Exceeded "
1274  "size of 'imports' array in packing loop on Process " << myImageID
1275  << ". imports.size() = " << imports.size () << " < offset + length"
1276  " = " << (curBufOffset + curBufLen) << ".");
1277 
1278  ArrayRCP<Packet> recvBuf =
1279  imports.persistingView (curBufOffset, curBufLen);
1280  requests_.push_back (ireceive<int, Packet> (recvBuf, imagesFrom_[i],
1281  tag, *comm_));
1282  if (debug_) {
1283  std::ostringstream os;
1284  os << myImageID << ": doPosts(3,"
1285  << (indicesTo_.empty () ? "fast" : "slow") << "): "
1286  << "Posted irecv from Proc " << imagesFrom_[i] << " with "
1287  "specified tag " << tag << endl;
1288  *out_ << os.str ();
1289  }
1290  }
1291  else { // Receiving from myself
1292  selfReceiveOffset = curBufOffset; // Remember the self-recv offset
1293  }
1294  curBufOffset += curBufLen;
1295  }
1296  }
1297 
1298  if (doBarrier) {
1299 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1300  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
1301 #endif // TPETRA_DISTRIBUTOR_TIMERS
1302  // If we are using ready sends (MPI_Rsend) below, we need to do
1303  // a barrier before we post the ready sends. This is because a
1304  // ready send requires that its matching receive has already
1305  // been posted before the send has been posted. The only way to
1306  // guarantee that in this case is to use a barrier.
1307  comm_->barrier ();
1308  }
1309 
1310 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1311  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
1312 #endif // TPETRA_DISTRIBUTOR_TIMERS
1313 
1314  // setup scan through imagesTo_ list starting with higher numbered images
1315  // (should help balance message traffic)
1316  //
1317  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
1318  // It doesn't depend on the input at all.
1319  size_t numBlocks = numSends_ + selfMessage_;
1320  size_t imageIndex = 0;
1321  while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
1322  ++imageIndex;
1323  }
1324  if (imageIndex == numBlocks) {
1325  imageIndex = 0;
1326  }
1327 
1328  size_t selfNum = 0;
1329  size_t selfIndex = 0;
1330 
1331  if (indicesTo_.empty()) {
1332  if (debug_) {
1333  std::ostringstream os;
1334  os << myImageID << ": doPosts(3,fast): posting sends" << endl;
1335  *out_ << os.str ();
1336  }
1337 
1338  // Data are already blocked (laid out) by process, so we don't
1339  // need a separate send buffer (besides the exports array).
1340  for (size_t i = 0; i < numBlocks; ++i) {
1341  size_t p = i + imageIndex;
1342  if (p > (numBlocks - 1)) {
1343  p -= numBlocks;
1344  }
1345 
1346  if (imagesTo_[p] != myImageID) {
1347  ArrayView<const Packet> tmpSend =
1348  exports.view (startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
1349 
1350  if (sendType == Details::DISTRIBUTOR_SEND) {
1351  send<int, Packet> (tmpSend.getRawPtr (),
1352  as<int> (tmpSend.size ()),
1353  imagesTo_[p], tag, *comm_);
1354  }
1355  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1356  ArrayRCP<const Packet> tmpSendBuf =
1357  exports.persistingView (startsTo_[p] * numPackets,
1358  lengthsTo_[p] * numPackets);
1359  requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
1360  tag, *comm_));
1361  }
1362  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1363  readySend<int, Packet> (tmpSend.getRawPtr (),
1364  as<int> (tmpSend.size ()),
1365  imagesTo_[p], tag, *comm_);
1366  }
1367  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1368  ssend<int, Packet> (tmpSend.getRawPtr (),
1369  as<int> (tmpSend.size ()),
1370  imagesTo_[p], tag, *comm_);
1371  } else {
1372  TEUCHOS_TEST_FOR_EXCEPTION(
1373  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
1374  "Invalid send type. We should never get here. "
1375  "Please report this bug to the Tpetra developers.");
1376  }
1377 
1378  if (debug_) {
1379  std::ostringstream os;
1380  os << myImageID << ": doPosts(3,fast): "
1381  << "Posted send to Proc " << imagesTo_[p]
1382  << " w/ specified tag " << tag << endl;
1383  *out_ << os.str ();
1384  }
1385  }
1386  else { // "Sending" the message to myself
1387  selfNum = p;
1388  }
1389  }
1390 
1391  if (selfMessage_) {
1392  // This is how we "send a message to ourself": we copy from
1393  // the export buffer to the import buffer. That saves
1394  // Teuchos::Comm implementations other than MpiComm (in
1395  // particular, SerialComm) the trouble of implementing self
1396  // messages correctly. (To do this right, SerialComm would
1397  // need internal buffer space for messages, keyed on the
1398  // message's tag.)
1399  std::copy (exports.begin()+startsTo_[selfNum]*numPackets,
1400  exports.begin()+startsTo_[selfNum]*numPackets+lengthsTo_[selfNum]*numPackets,
1401  imports.begin()+selfReceiveOffset);
1402  }
1403  if (debug_) {
1404  std::ostringstream os;
1405  os << myImageID << ": doPosts(3,fast) done" << endl;
1406  *out_ << os.str ();
1407  }
1408  }
1409  else { // data are not blocked by image, use send buffer
1410  if (debug_) {
1411  std::ostringstream os;
1412  os << myImageID << ": doPosts(3,slow): posting sends" << endl;
1413  *out_ << os.str ();
1414  }
1415 
1416  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
1417  // sends), because the buffer is only long enough for one send.
1418  ArrayRCP<Packet> sendArray (maxSendLength_ * numPackets); // send buffer
1419 
1420  TEUCHOS_TEST_FOR_EXCEPTION(
1421  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
1422  "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
1423  << "doesn't currently work with nonblocking sends.");
1424 
1425  for (size_t i = 0; i < numBlocks; ++i) {
1426  size_t p = i + imageIndex;
1427  if (p > (numBlocks - 1)) {
1428  p -= numBlocks;
1429  }
1430 
1431  if (imagesTo_[p] != myImageID) {
1432  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1433  size_t sendArrayOffset = 0;
1434  size_t j = startsTo_[p];
1435  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1436  srcBegin = exports.begin() + indicesTo_[j]*numPackets;
1437  srcEnd = srcBegin + numPackets;
1438  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1439  sendArrayOffset += numPackets;
1440  }
1441  ArrayView<const Packet> tmpSend =
1442  sendArray.view (0, lengthsTo_[p]*numPackets);
1443 
1444  if (sendType == Details::DISTRIBUTOR_SEND) {
1445  send<int, Packet> (tmpSend.getRawPtr (),
1446  as<int> (tmpSend.size ()),
1447  imagesTo_[p], tag, *comm_);
1448  }
1449  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1450  ArrayRCP<const Packet> tmpSendBuf =
1451  sendArray.persistingView (0, lengthsTo_[p] * numPackets);
1452  requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
1453  tag, *comm_));
1454  }
1455  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1456  readySend<int, Packet> (tmpSend.getRawPtr (),
1457  as<int> (tmpSend.size ()),
1458  imagesTo_[p], tag, *comm_);
1459  }
1460  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1461  ssend<int, Packet> (tmpSend.getRawPtr (),
1462  as<int> (tmpSend.size ()),
1463  imagesTo_[p], tag, *comm_);
1464  }
1465  else {
1466  TEUCHOS_TEST_FOR_EXCEPTION(
1467  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
1468  "Invalid send type. We should never get here. "
1469  "Please report this bug to the Tpetra developers.");
1470  }
1471 
1472  if (debug_) {
1473  std::ostringstream os;
1474  os << myImageID << ": doPosts(3,slow): "
1475  << "Posted send to Proc " << imagesTo_[p]
1476  << " w/ specified tag " << tag << endl;
1477  *out_ << os.str ();
1478  }
1479  }
1480  else { // "Sending" the message to myself
1481  selfNum = p;
1482  selfIndex = startsTo_[p];
1483  }
1484  }
1485 
1486  if (selfMessage_) {
1487  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1488  std::copy (exports.begin()+indicesTo_[selfIndex]*numPackets,
1489  exports.begin()+indicesTo_[selfIndex]*numPackets + numPackets,
1490  imports.begin() + selfReceiveOffset);
1491  ++selfIndex;
1492  selfReceiveOffset += numPackets;
1493  }
1494  }
1495  if (debug_) {
1496  std::ostringstream os;
1497  os << myImageID << ": doPosts(3,slow) done" << endl;
1498  *out_ << os.str ();
1499  }
1500  }
1501  }
1502 
1503  template <class Packet>
1504  void Distributor::
1505  doPosts (const ArrayRCP<const Packet>& exports,
1506  const ArrayView<size_t>& numExportPacketsPerLID,
1507  const ArrayRCP<Packet>& imports,
1508  const ArrayView<size_t>& numImportPacketsPerLID)
1509  {
1510  using Teuchos::Array;
1511  using Teuchos::as;
1512  using Teuchos::ireceive;
1513  using Teuchos::isend;
1514  using Teuchos::readySend;
1515  using Teuchos::send;
1516  using Teuchos::ssend;
1517  using Teuchos::TypeNameTraits;
1518 #ifdef HAVE_TEUCHOS_DEBUG
1519  using Teuchos::OSTab;
1520 #endif // HAVE_TEUCHOS_DEBUG
1521  using std::endl;
1522  typedef Array<size_t>::size_type size_type;
1523 
1524  Teuchos::OSTab tab (out_);
1525 
1526 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1527  Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
1528 #endif // TPETRA_DISTRIBUTOR_TIMERS
1529 
1530  // Run-time configurable parameters that come from the input
1531  // ParameterList set by setParameterList().
1532  const Details::EDistributorSendType sendType = sendType_;
1533  const bool doBarrier = barrierBetween_;
1534 
1535 // #ifdef HAVE_TEUCHOS_DEBUG
1536 // // Prepare for verbose output, if applicable.
1537 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
1538 // RCP<Teuchos::FancyOStream> out = this->getOStream ();
1539 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
1540 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
1541 
1542 // if (doPrint) {
1543 // // Only need one process to print out parameters.
1544 // *out << "Distributor::doPosts (4 args)" << endl;
1545 // }
1546 // // Add one tab level. We declare this outside the doPrint scopes
1547 // // so that the tab persists until the end of this method.
1548 // Teuchos::OSTab tab = this->getOSTab ();
1549 // if (doPrint) {
1550 // *out << "Parameters:" << endl;
1551 // {
1552 // OSTab tab2 (out);
1553 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
1554 // << endl << "barrierBetween: " << doBarrier << endl;
1555 // }
1556 // }
1557 // #endif // HAVE_TEUCHOS_DEBUG
1558 
1559  TEUCHOS_TEST_FOR_EXCEPTION(
1560  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
1561  "Tpetra::Distributor::doPosts(4 args): Ready-send version requires a "
1562  "barrier between posting receives and posting ready sends. This should "
1563  "have been checked before. "
1564  "Please report this bug to the Tpetra developers.");
1565 
1566  const int myImageID = comm_->getRank ();
1567  size_t selfReceiveOffset = 0;
1568 
1569 #ifdef HAVE_TEUCHOS_DEBUG
1570  // Different messages may have different numbers of packets.
1571  size_t totalNumImportPackets = 0;
1572  for (int ii = 0; ii < numImportPacketsPerLID.size(); ++ii) {
1573  totalNumImportPackets += numImportPacketsPerLID[ii];
1574  }
1575  TEUCHOS_TEST_FOR_EXCEPTION(
1576  static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1577  std::runtime_error, "Tpetra::Distributor::doPosts(4 args): The 'imports' "
1578  "array must have enough entries to hold the expected number of import "
1579  "packets. imports.size() = " << imports.size() << " < "
1580  "totalNumImportPackets = " << totalNumImportPackets << ".");
1581 #endif // HAVE_TEUCHOS_DEBUG
1582 
1583  // MPI tag for nonblocking receives and blocking sends in this
1584  // method. Some processes might take the "fast" path
1585  // (indicesTo_.empty()) and others might take the "slow" path for
1586  // the same doPosts() call, so the path tag must be the same for
1587  // both.
1588  const int pathTag = 1;
1589  const int tag = this->getTag (pathTag);
1590 
1591  if (debug_) {
1592  TEUCHOS_TEST_FOR_EXCEPTION(
1593  requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
1594  "doPosts(4 args): Process " << myImageID << ": requests_.size() = "
1595  << requests_.size () << " != 0.");
1596  std::ostringstream os;
1597  os << myImageID << ": doPosts(4,"
1598  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
1599  *out_ << os.str ();
1600  }
1601 
1602  // Distributor uses requests_.size() as the number of outstanding
1603  // nonblocking message requests, so we resize to zero to maintain
1604  // this invariant.
1605  //
1606  // numReceives_ does _not_ include the self message, if there is
1607  // one. Here, we do actually send a message to ourselves, so we
1608  // include any self message in the "actual" number of receives to
1609  // post.
1610  //
1611  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1612  // doesn't (re)allocate its array of requests. That happens in
1613  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1614  // demand), or Resize_().
1615  const size_type actualNumReceives = as<size_type> (numReceives_) +
1616  as<size_type> (selfMessage_ ? 1 : 0);
1617  requests_.resize (0);
1618 
1619  // Post the nonblocking receives. It's common MPI wisdom to post
1620  // receives before sends. In MPI terms, this means favoring
1621  // adding to the "posted queue" (of receive requests) over adding
1622  // to the "unexpected queue" (of arrived messages not yet matched
1623  // with a receive).
1624  {
1625 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1626  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
1627 #endif // TPETRA_DISTRIBUTOR_TIMERS
1628 
1629  size_t curBufferOffset = 0;
1630  size_t curLIDoffset = 0;
1631  for (size_type i = 0; i < actualNumReceives; ++i) {
1632  size_t totalPacketsFrom_i = 0;
1633  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
1634  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
1635  }
1636  curLIDoffset += lengthsFrom_[i];
1637  if (imagesFrom_[i] != myImageID && totalPacketsFrom_i) {
1638  // If my process is receiving these packet(s) from another
1639  // process (not a self-receive), and if there is at least
1640  // one packet to receive:
1641  //
1642  // 1. Set up the persisting view (recvBuf) into the imports
1643  // array, given the offset and size (total number of
1644  // packets from process imagesFrom_[i]).
1645  // 2. Start the Irecv and save the resulting request.
1646  ArrayRCP<Packet> recvBuf =
1647  imports.persistingView (curBufferOffset, totalPacketsFrom_i);
1648  requests_.push_back (ireceive<int, Packet> (recvBuf, imagesFrom_[i],
1649  tag, *comm_));
1650  }
1651  else { // Receiving these packet(s) from myself
1652  selfReceiveOffset = curBufferOffset; // Remember the offset
1653  }
1654  curBufferOffset += totalPacketsFrom_i;
1655  }
1656  }
1657 
1658  if (doBarrier) {
1659 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1660  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
1661 #endif // TPETRA_DISTRIBUTOR_TIMERS
1662  // If we are using ready sends (MPI_Rsend) below, we need to do
1663  // a barrier before we post the ready sends. This is because a
1664  // ready send requires that its matching receive has already
1665  // been posted before the send has been posted. The only way to
1666  // guarantee that in this case is to use a barrier.
1667  comm_->barrier ();
1668  }
1669 
1670 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1671  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
1672 #endif // TPETRA_DISTRIBUTOR_TIMERS
1673 
1674  // setup arrays containing starting-offsets into exports for each send,
1675  // and num-packets-to-send for each send.
1676  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
1677  size_t maxNumPackets = 0;
1678  size_t curPKToffset = 0;
1679  for (size_t pp=0; pp<numSends_; ++pp) {
1680  sendPacketOffsets[pp] = curPKToffset;
1681  size_t numPackets = 0;
1682  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
1683  numPackets += numExportPacketsPerLID[j];
1684  }
1685  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
1686  packetsPerSend[pp] = numPackets;
1687  curPKToffset += numPackets;
1688  }
1689 
1690  // setup scan through imagesTo_ list starting with higher numbered images
1691  // (should help balance message traffic)
1692  size_t numBlocks = numSends_+ selfMessage_;
1693  size_t imageIndex = 0;
1694  while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
1695  ++imageIndex;
1696  }
1697  if (imageIndex == numBlocks) {
1698  imageIndex = 0;
1699  }
1700 
1701  size_t selfNum = 0;
1702  size_t selfIndex = 0;
1703 
1704  if (indicesTo_.empty()) {
1705  if (debug_) {
1706  std::ostringstream os;
1707  os << myImageID << ": doPosts(4,fast): posting sends" << endl;
1708  *out_ << os.str ();
1709  }
1710 
1711  // Data are already blocked (laid out) by process, so we don't
1712  // need a separate send buffer (besides the exports array).
1713  for (size_t i = 0; i < numBlocks; ++i) {
1714  size_t p = i + imageIndex;
1715  if (p > (numBlocks - 1)) {
1716  p -= numBlocks;
1717  }
1718 
1719  if (imagesTo_[p] != myImageID && packetsPerSend[p] > 0) {
1720  ArrayView<const Packet> tmpSend =
1721  exports.view (sendPacketOffsets[p], packetsPerSend[p]);
1722 
1723  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
1724  send<int, Packet> (tmpSend.getRawPtr (),
1725  as<int> (tmpSend.size ()),
1726  imagesTo_[p], tag, *comm_);
1727  }
1728  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1729  readySend<int, Packet> (tmpSend.getRawPtr (),
1730  as<int> (tmpSend.size ()),
1731  imagesTo_[p], tag, *comm_);
1732  }
1733  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1734  ArrayRCP<const Packet> tmpSendBuf =
1735  exports.persistingView (sendPacketOffsets[p], packetsPerSend[p]);
1736  requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
1737  tag, *comm_));
1738  }
1739  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1740  ssend<int, Packet> (tmpSend.getRawPtr (),
1741  as<int> (tmpSend.size ()),
1742  imagesTo_[p], tag, *comm_);
1743  }
1744  else {
1745  TEUCHOS_TEST_FOR_EXCEPTION(
1746  true, std::logic_error, "Tpetra::Distributor::doPosts(4 args): "
1747  "Invalid send type. We should never get here. Please report "
1748  "this bug to the Tpetra developers.");
1749  }
1750  }
1751  else { // "Sending" the message to myself
1752  selfNum = p;
1753  }
1754  }
1755 
1756  if (selfMessage_) {
1757  std::copy (exports.begin()+sendPacketOffsets[selfNum],
1758  exports.begin()+sendPacketOffsets[selfNum]+packetsPerSend[selfNum],
1759  imports.begin()+selfReceiveOffset);
1760  }
1761  if (debug_) {
1762  std::ostringstream os;
1763  os << myImageID << ": doPosts(4,fast) done" << endl;
1764  *out_ << os.str ();
1765  }
1766  }
1767  else { // data are not blocked by image, use send buffer
1768  if (debug_) {
1769  std::ostringstream os;
1770  os << myImageID << ": doPosts(4,slow): posting sends" << endl;
1771  *out_ << os.str ();
1772  }
1773 
1774  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
1775  ArrayRCP<Packet> sendArray (maxNumPackets); // send buffer
1776 
1777  TEUCHOS_TEST_FOR_EXCEPTION(
1778  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
1779  "Tpetra::Distributor::doPosts(4 args): The \"send buffer\" "
1780  "code path may not necessarily work with nonblocking sends.");
1781 
1782  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
1783  size_t ioffset = 0;
1784  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
1785  indicesOffsets[j] = ioffset;
1786  ioffset += numExportPacketsPerLID[j];
1787  }
1788 
1789  for (size_t i = 0; i < numBlocks; ++i) {
1790  size_t p = i + imageIndex;
1791  if (p > (numBlocks - 1)) {
1792  p -= numBlocks;
1793  }
1794 
1795  if (imagesTo_[p] != myImageID) {
1796  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1797  size_t sendArrayOffset = 0;
1798  size_t j = startsTo_[p];
1799  size_t numPacketsTo_p = 0;
1800  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1801  srcBegin = exports.begin() + indicesOffsets[j];
1802  srcEnd = srcBegin + numExportPacketsPerLID[j];
1803  numPacketsTo_p += numExportPacketsPerLID[j];
1804  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1805  sendArrayOffset += numExportPacketsPerLID[j];
1806  }
1807  if (numPacketsTo_p > 0) {
1808  ArrayView<const Packet> tmpSend =
1809  sendArray.view (0, numPacketsTo_p);
1810 
1811  if (sendType == Details::DISTRIBUTOR_RSEND) {
1812  readySend<int, Packet> (tmpSend.getRawPtr (),
1813  as<int> (tmpSend.size ()),
1814  imagesTo_[p], tag, *comm_);
1815  }
1816  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1817  ArrayRCP<const Packet> tmpSendBuf =
1818  sendArray.persistingView (0, numPacketsTo_p);
1819  requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
1820  tag, *comm_));
1821  }
1822  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1823  ssend<int, Packet> (tmpSend.getRawPtr (),
1824  as<int> (tmpSend.size ()),
1825  imagesTo_[p], tag, *comm_);
1826  }
1827  else { // if (sendType == Details::DISTRIBUTOR_SEND)
1828  send<int, Packet> (tmpSend.getRawPtr (),
1829  as<int> (tmpSend.size ()),
1830  imagesTo_[p], tag, *comm_);
1831  }
1832  }
1833  }
1834  else { // "Sending" the message to myself
1835  selfNum = p;
1836  selfIndex = startsTo_[p];
1837  }
1838  }
1839 
1840  if (selfMessage_) {
1841  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1842  std::copy (exports.begin()+indicesOffsets[selfIndex],
1843  exports.begin()+indicesOffsets[selfIndex]+numExportPacketsPerLID[selfIndex],
1844  imports.begin() + selfReceiveOffset);
1845  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
1846  ++selfIndex;
1847  }
1848  }
1849  if (debug_) {
1850  std::ostringstream os;
1851  os << myImageID << ": doPosts(4,slow) done" << endl;
1852  *out_ << os.str ();
1853  }
1854  }
1855  }
1856 
1857  template <class Packet>
1858  void Distributor::
1859  doReversePostsAndWaits (const ArrayView<const Packet>& exports,
1860  size_t numPackets,
1861  const ArrayView<Packet>& imports)
1862  {
1863  using Teuchos::as;
1864 
1865  // doReversePosts() takes exports and imports as ArrayRCPs,
1866  // requiring that the memory locations are persisting. However,
1867  // they need only persist within the scope of that routine, so it
1868  // is safe for us to use nonpersisting references in this case.
1869 
1870  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1871  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1872  // with some versions of GCC. The issue only arises with the
1873  // exports array. This is why we construct a separate nonowning
1874  // ArrayRCP.
1875  typedef typename ArrayRCP<const Packet>::size_type size_type;
1876  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
1877  exports.size(), false);
1878  // mfh 04 Apr 2012: This is the offending code. This statement
1879  // would normally be in place of "exportsArcp" in the
1880  // doReversePosts() call below.
1881  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false)
1882  doReversePosts (exportsArcp,
1883  numPackets,
1884  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1885  doReverseWaits ();
1886 
1887  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1888  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1889  }
1890 
1891  template <class Packet>
1892  void Distributor::
1893  doReversePostsAndWaits (const ArrayView<const Packet>& exports,
1894  const ArrayView<size_t> &numExportPacketsPerLID,
1895  const ArrayView<Packet> &imports,
1896  const ArrayView<size_t> &numImportPacketsPerLID)
1897  {
1898  using Teuchos::as;
1899  using Teuchos::arcp;
1900  using Teuchos::ArrayRCP;
1901 
1902  TEUCHOS_TEST_FOR_EXCEPTION(
1903  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1904  "doReversePostsAndWaits(4 args): There are " << requests_.size ()
1905  << " outstanding nonblocking messages pending. It is incorrect to call "
1906  "this method with posts outstanding.");
1907 
1908  // doReversePosts() accepts the exports and imports arrays as
1909  // ArrayRCPs, requiring that the memory location is persisting (as
1910  // is necessary for nonblocking receives). However, it need only
1911  // persist until doReverseWaits() completes, so it is safe for us
1912  // to use a nonpersisting reference in this case. The use of a
1913  // nonpersisting reference is purely a performance optimization.
1914 
1915  // mfh 02 Apr 2012: For some reason, calling arcp<const Packet>
1916  // for Packet=std::complex<double> fails to compile with some
1917  // versions of GCC. The issue only arises with the exports array.
1918  // This is why we construct a separate nonowning ArrayRCP.
1919  typedef typename ArrayRCP<const Packet>::size_type size_type;
1920  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), as<size_type> (0),
1921  exports.size (), false);
1922  doReversePosts (exportsArcp,
1923  numExportPacketsPerLID,
1924  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1925  numImportPacketsPerLID);
1926  doReverseWaits ();
1927 
1928  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1929  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1930  }
1931 
1932  template <class Packet>
1933  void Distributor::
1934  doReversePosts (const ArrayRCP<const Packet>& exports,
1935  size_t numPackets,
1936  const ArrayRCP<Packet>& imports)
1937  {
1938  // FIXME (mfh 29 Mar 2012) WHY?
1939  TEUCHOS_TEST_FOR_EXCEPTION(
1940  ! indicesTo_.empty (), std::runtime_error,
1941  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
1942  "communication when original data are blocked by process.");
1943  if (reverseDistributor_.is_null ()) {
1944  createReverseDistributor ();
1945  }
1946  reverseDistributor_->doPosts (exports, numPackets, imports);
1947  }
1948 
1949  template <class Packet>
1950  void Distributor::
1951  doReversePosts (const ArrayRCP<const Packet>& exports,
1952  const ArrayView<size_t>& numExportPacketsPerLID,
1953  const ArrayRCP<Packet>& imports,
1954  const ArrayView<size_t>& numImportPacketsPerLID)
1955  {
1956  // FIXME (mfh 29 Mar 2012) WHY?
1957  TEUCHOS_TEST_FOR_EXCEPTION(
1958  ! indicesTo_.empty (), std::runtime_error,
1959  "Tpetra::Distributor::doReversePosts(4 args): Can only do reverse "
1960  "communication when original data are blocked by process.");
1961  if (reverseDistributor_.is_null ()) {
1962  createReverseDistributor ();
1963  }
1964  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
1965  imports, numImportPacketsPerLID);
1966  }
1967 
1968  template <class ExpView, class ImpView>
1969  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
1970  Distributor::
1971  doPostsAndWaits (const ExpView &exports,
1972  size_t numPackets,
1973  const ImpView &imports)
1974  {
1975  // If the MPI library doesn't support RDMA for communication
1976  // directly to or from the GPU's memory, we must transfer exports
1977  // to the host, do the communication, then transfer imports back
1978  // to the GPU.
1979  //
1980  // We need to do this here instead of doPosts() because the copy
1981  // back to the GPU must occur after the waits.
1982  typedef ExpView exports_view;
1983  typedef ImpView imports_view;
1984  const bool can_access_from_host =
1985  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace,
1986  typename exports_view::memory_space >::value;
1987  if (! enable_cuda_rdma_ && ! can_access_from_host) {
1988  typename exports_view::HostMirror host_exports =
1989  Kokkos::create_mirror_view (exports);
1990  typename imports_view::HostMirror host_imports =
1991  Kokkos::create_mirror_view (imports);
1992  Kokkos::deep_copy (host_exports, exports);
1993  doPostsAndWaits (Kokkos::Compat::create_const_view (host_exports),
1994  numPackets,
1995  host_imports);
1996  Kokkos::deep_copy (imports, host_imports);
1997  return;
1998  }
1999 
2000  TEUCHOS_TEST_FOR_EXCEPTION(
2001  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
2002  "doPostsAndWaits(3 args): There are " << requests_.size () <<
2003  " outstanding nonblocking messages pending. It is incorrect to call "
2004  "this method with posts outstanding.");
2005 
2006  doPosts (exports, numPackets, imports);
2007  doWaits ();
2008  }
2009 
2010  template <class ExpView, class ImpView>
2011  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2012  Distributor::
2013  doPostsAndWaits (const ExpView &exports,
2014  const ArrayView<size_t> &numExportPacketsPerLID,
2015  const ImpView &imports,
2016  const ArrayView<size_t> &numImportPacketsPerLID)
2017  {
2018  // If MPI library doesn't support RDMA for communication directly
2019  // to/from GPU, transfer exports to the host, do the communication, then
2020  // transfer imports back to the GPU
2021  //
2022  // We need to do this here instead of doPosts() because the copy back
2023  // to the GPU must occur after the waits.
2024  typedef ExpView exports_view;
2025  typedef ImpView imports_view;
2026  if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
2027  typename exports_view::HostMirror host_exports =
2028  Kokkos::create_mirror_view(exports);
2029  typename imports_view::HostMirror host_imports =
2030  Kokkos::create_mirror_view(imports);
2031  Kokkos::deep_copy (host_exports, exports);
2032  doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
2033  numExportPacketsPerLID,
2034  host_imports,
2035  numImportPacketsPerLID);
2036  Kokkos::deep_copy (imports, host_imports);
2037  return;
2038  }
2039 
2040  TEUCHOS_TEST_FOR_EXCEPTION(
2041  requests_.size () != 0, std::runtime_error,
2042  "Tpetra::Distributor::doPostsAndWaits(4 args): There are "
2043  << requests_.size () << " outstanding nonblocking messages pending. "
2044  "It is incorrect to call this method with posts outstanding.");
2045 
2046  doPosts (exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
2047  doWaits ();
2048  }
2049 
2050 
2051  template <class ExpView, class ImpView>
2052  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2053  Distributor::
2054  doPosts (const ExpView &exports,
2055  size_t numPackets,
2056  const ImpView &imports)
2057  {
2058  using Teuchos::Array;
2059  using Teuchos::as;
2060  using Teuchos::FancyOStream;
2061  using Teuchos::includesVerbLevel;
2062  using Teuchos::ireceive;
2063  using Teuchos::isend;
2064  using Teuchos::OSTab;
2065  using Teuchos::readySend;
2066  using Teuchos::send;
2067  using Teuchos::ssend;
2068  using Teuchos::TypeNameTraits;
2069  using Teuchos::typeName;
2070  using std::endl;
2071  using Kokkos::Compat::create_const_view;
2072  using Kokkos::Compat::create_view;
2073  using Kokkos::Compat::subview_offset;
2074  using Kokkos::Compat::deep_copy_offset;
2075  typedef Array<size_t>::size_type size_type;
2076  typedef ExpView exports_view_type;
2077  typedef ImpView imports_view_type;
2078 
2079  Teuchos::OSTab tab (out_);
2080 
2081 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2082  Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
2083 #endif // TPETRA_DISTRIBUTOR_TIMERS
2084 
2085  // Run-time configurable parameters that come from the input
2086  // ParameterList set by setParameterList().
2087  const Details::EDistributorSendType sendType = sendType_;
2088  const bool doBarrier = barrierBetween_;
2089 
2090 // #ifdef HAVE_TEUCHOS_DEBUG
2091 // // Prepare for verbose output, if applicable.
2092 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
2093 // (void) verbLevel; // Silence "unused variable" compiler warning.
2094 // RCP<FancyOStream> out = this->getOStream ();
2095 // // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
2096 // // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
2097 // const bool doPrint = out.get () && (comm_->getRank () == 0);
2098 
2099 // if (doPrint) {
2100 // // Only need one process to print out parameters.
2101 // *out << "Distributor::doPosts (3 args)" << endl;
2102 // }
2103 // // Add one tab level. We declare this outside the doPrint scopes
2104 // // so that the tab persists until the end of this method.
2105 // OSTab tab = this->getOSTab ();
2106 // if (doPrint) {
2107 // *out << "Parameters:" << endl;
2108 // {
2109 // OSTab tab2 (out);
2110 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
2111 // << endl << "barrierBetween: " << doBarrier << endl;
2112 // }
2113 // }
2114 // #endif // HAVE_TEUCHOS_DEBUG
2115 
2116  TEUCHOS_TEST_FOR_EXCEPTION(
2117  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
2118  "Tpetra::Distributor::doPosts(3 args): Ready-send version requires a "
2119  "barrier between posting receives and posting ready sends. This should "
2120  "have been checked before. "
2121  "Please report this bug to the Tpetra developers.");
2122 
2123  const int myImageID = comm_->getRank ();
2124  size_t selfReceiveOffset = 0;
2125 
2126  // Each message has the same number of packets.
2127  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
2128  TEUCHOS_TEST_FOR_EXCEPTION(
2129  static_cast<size_t> (imports.dimension_0 ()) < totalNumImportPackets,
2130  std::runtime_error, "Tpetra::Distributor::doPosts(3 args): The 'imports' "
2131  "array must have enough entries to hold the expected number of import "
2132  "packets. imports.dimension_0() = " << imports.dimension_0 () << " < "
2133  "totalNumImportPackets = " << totalNumImportPackets << ".");
2134 
2135  // MPI tag for nonblocking receives and blocking sends in this
2136  // method. Some processes might take the "fast" path
2137  // (indicesTo_.empty()) and others might take the "slow" path for
2138  // the same doPosts() call, so the path tag must be the same for
2139  // both.
2140  const int pathTag = 0;
2141  const int tag = this->getTag (pathTag);
2142 
2143  if (debug_) {
2144  TEUCHOS_TEST_FOR_EXCEPTION(
2145  requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
2146  "doPosts(3 args): Process " << myImageID << ": requests_.size() = "
2147  << requests_.size () << " != 0.");
2148  std::ostringstream os;
2149  os << myImageID << ": doPosts(3,"
2150  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
2151  *out_ << os.str ();
2152  }
2153 
2154  // Distributor uses requests_.size() as the number of outstanding
2155  // nonblocking message requests, so we resize to zero to maintain
2156  // this invariant.
2157  //
2158  // numReceives_ does _not_ include the self message, if there is
2159  // one. Here, we do actually send a message to ourselves, so we
2160  // include any self message in the "actual" number of receives to
2161  // post.
2162  //
2163  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2164  // doesn't (re)allocate its array of requests. That happens in
2165  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2166  // demand), or Resize_().
2167  const size_type actualNumReceives = as<size_type> (numReceives_) +
2168  as<size_type> (selfMessage_ ? 1 : 0);
2169  requests_.resize (0);
2170 
2171  // Post the nonblocking receives. It's common MPI wisdom to post
2172  // receives before sends. In MPI terms, this means favoring
2173  // adding to the "posted queue" (of receive requests) over adding
2174  // to the "unexpected queue" (of arrived messages not yet matched
2175  // with a receive).
2176  {
2177 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2178  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
2179 #endif // TPETRA_DISTRIBUTOR_TIMERS
2180 
2181  size_t curBufferOffset = 0;
2182  for (size_type i = 0; i < actualNumReceives; ++i) {
2183  if (imagesFrom_[i] != myImageID) {
2184  // If my process is receiving these packet(s) from another
2185  // process (not a self-receive):
2186  //
2187  // 1. Set up the persisting view (recvBuf) of the imports
2188  // array, given the offset and size (total number of
2189  // packets from process imagesFrom_[i]).
2190  // 2. Start the Irecv and save the resulting request.
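  // (Editorial note: as used here and below, subview_offset (view, offset,
  // size) returns a view of entries [offset, offset + size) of 'view'.)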
2191  imports_view_type recvBuf =
2192  subview_offset (imports, curBufferOffset, lengthsFrom_[i]*numPackets);
2193  requests_.push_back (ireceive<int> (recvBuf, imagesFrom_[i],
2194  tag, *comm_));
2195  if (debug_) {
2196  std::ostringstream os;
2197  os << myImageID << ": doPosts(3,"
2198  << (indicesTo_.empty () ? "fast" : "slow") << "): "
2199  << "Posted irecv from Proc " << imagesFrom_[i] << " with "
2200  "specified tag " << tag << endl;
2201  *out_ << os.str ();
2202  }
2203  }
2204  else { // Receiving from myself
2205  selfReceiveOffset = curBufferOffset; // Remember the self-recv offset
2206  }
2207  curBufferOffset += lengthsFrom_[i]*numPackets;
2208  }
2209  }
2210 
2211  if (doBarrier) {
2212 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2213  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
2214 #endif // TPETRA_DISTRIBUTOR_TIMERS
2215  // If we are using ready sends (MPI_Rsend) below, we need to do
2216  // a barrier before we post the ready sends. This is because a
2217  // ready send requires that its matching receive has already
2218  // been posted before the send has been posted. The only way to
2219  // guarantee that in this case is to use a barrier.
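  // Editorial sketch of the required ordering in plain MPI terms (the calls
  // below go through Teuchos wrappers, but the constraint is the same):
  //
  //   MPI_Irecv (recvBuf, ..., &req);  // every receiver posts its receives
  //   MPI_Barrier (comm);              // now all matching receives exist
  //   MPI_Rsend (sendBuf, ...);        // only now is a ready send legal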
2220  comm_->barrier ();
2221  }
2222 
2223 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2224  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
2225 #endif // TPETRA_DISTRIBUTOR_TIMERS
2226 
2227  // Set up the scan through the imagesTo_ list, starting with images
2228  // numbered higher than my rank (this should help balance message traffic).
2229  //
2230  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
2231  // It doesn't depend on the input at all.
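  // Hypothetical example (editorial illustration, not from the original
  // source): with myImageID == 2 and imagesTo_ == {0, 1, 3}, the scan below
  // leaves imageIndex == 2, so sends are posted to process 3 first, then
  // (wrapping around) to processes 0 and 1.  Each rank thus starts its sends
  // at a different neighbor instead of every rank sending to process 0 first.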
2232  size_t numBlocks = numSends_ + selfMessage_;
2233  size_t imageIndex = 0;
2234  while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
2235  ++imageIndex;
2236  }
2237  if (imageIndex == numBlocks) {
2238  imageIndex = 0;
2239  }
2240 
2241  size_t selfNum = 0;
2242  size_t selfIndex = 0;
2243 
2244  if (indicesTo_.empty()) {
2245  if (debug_) {
2246  std::ostringstream os;
2247  os << myImageID << ": doPosts(3,fast): posting sends" << endl;
2248  *out_ << os.str ();
2249  }
2250 
2251  // Data are already blocked (laid out) by process, so we don't
2252  // need a separate send buffer (besides the exports array).
2253  for (size_t i = 0; i < numBlocks; ++i) {
2254  size_t p = i + imageIndex;
2255  if (p > (numBlocks - 1)) {
2256  p -= numBlocks;
2257  }
2258 
2259  if (imagesTo_[p] != myImageID) {
2260  exports_view_type tmpSend = subview_offset(
2261  exports, startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
2262 
2263  if (sendType == Details::DISTRIBUTOR_SEND) {
2264  send<int> (tmpSend,
2265  as<int> (tmpSend.size ()),
2266  imagesTo_[p], tag, *comm_);
2267  }
2268  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2269  exports_view_type tmpSendBuf =
2270  subview_offset (exports, startsTo_[p] * numPackets,
2271  lengthsTo_[p] * numPackets);
2272  requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
2273  tag, *comm_));
2274  }
2275  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2276  readySend<int> (tmpSend,
2277  as<int> (tmpSend.size ()),
2278  imagesTo_[p], tag, *comm_);
2279  }
2280  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2281  ssend<int> (tmpSend,
2282  as<int> (tmpSend.size ()),
2283  imagesTo_[p], tag, *comm_);
2284  } else {
2285  TEUCHOS_TEST_FOR_EXCEPTION(
2286  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
2287  "Invalid send type. We should never get here. "
2288  "Please report this bug to the Tpetra developers.");
2289  }
2290 
2291  if (debug_) {
2292  std::ostringstream os;
2293  os << myImageID << ": doPosts(3,fast): "
2294  << "Posted send to Proc " << imagesTo_[i]
2295  << " w/ specified tag " << tag << endl;
2296  *out_ << os.str ();
2297  }
2298  }
2299  else { // "Sending" the message to myself
2300  selfNum = p;
2301  }
2302  }
2303 
2304  if (selfMessage_) {
2305  // This is how we "send a message to ourself": we copy from
2306  // the export buffer to the import buffer. That saves
2307  // Teuchos::Comm implementations other than MpiComm (in
2308  // particular, SerialComm) the trouble of implementing self
2309  // messages correctly. (To do this right, SerialComm would
2310  // need internal buffer space for messages, keyed on the
2311  // message's tag.)
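  // (Editorial note: as used throughout this file, deep_copy_offset (dst,
  // src, dstOffset, srcOffset, count) copies 'count' entries from
  // src[srcOffset ...] into dst[dstOffset ...].)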
2312  deep_copy_offset(imports, exports, selfReceiveOffset,
2313  startsTo_[selfNum]*numPackets,
2314  lengthsTo_[selfNum]*numPackets);
2315  }
2316  if (debug_) {
2317  std::ostringstream os;
2318  os << myImageID << ": doPosts(3,fast) done" << endl;
2319  *out_ << os.str ();
2320  }
2321  }
2322  else { // data are not blocked by image, use send buffer
2323  if (debug_) {
2324  std::ostringstream os;
2325  os << myImageID << ": doPosts(3,slow): posting sends" << endl;
2326  *out_ << os.str ();
2327  }
2328 
2329  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
2330  // sends), because the buffer is only long enough for one send.
2331  typedef typename ExpView::non_const_value_type Packet;
2332  typedef typename ExpView::array_layout Layout;
2333  typedef typename ExpView::device_type Device;
2334  typedef typename ExpView::memory_traits Mem;
2335  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray",
2336  maxSendLength_ * numPackets);
2337 
2338  TEUCHOS_TEST_FOR_EXCEPTION(
2339  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
2340  "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
2341  "doesn't currently work with nonblocking sends.");
2342 
2343  for (size_t i = 0; i < numBlocks; ++i) {
2344  size_t p = i + imageIndex;
2345  if (p > (numBlocks - 1)) {
2346  p -= numBlocks;
2347  }
2348 
2349  if (imagesTo_[p] != myImageID) {
2350  size_t sendArrayOffset = 0;
2351  size_t j = startsTo_[p];
2352  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2353  deep_copy_offset(sendArray, exports, sendArrayOffset,
2354  indicesTo_[j]*numPackets, numPackets);
2355  sendArrayOffset += numPackets;
2356  }
2357  ImpView tmpSend =
2358  subview_offset(sendArray, size_t(0), lengthsTo_[p]*numPackets);
2359 
2360  if (sendType == Details::DISTRIBUTOR_SEND) {
2361  send<int> (tmpSend,
2362  as<int> (tmpSend.size ()),
2363  imagesTo_[p], tag, *comm_);
2364  }
2365  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2366  exports_view_type tmpSendBuf =
2367  subview_offset (sendArray, size_t(0), lengthsTo_[p] * numPackets);
2368  requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
2369  tag, *comm_));
2370  }
2371  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2372  readySend<int> (tmpSend,
2373  as<int> (tmpSend.size ()),
2374  imagesTo_[p], tag, *comm_);
2375  }
2376  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2377  ssend<int> (tmpSend,
2378  as<int> (tmpSend.size ()),
2379  imagesTo_[p], tag, *comm_);
2380  }
2381  else {
2382  TEUCHOS_TEST_FOR_EXCEPTION(
2383  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
2384  "Invalid send type. We should never get here. "
2385  "Please report this bug to the Tpetra developers.");
2386  }
2387 
2388  if (debug_) {
2389  std::ostringstream os;
2390  os << myImageID << ": doPosts(3,slow): "
2391  << "Posted send to Proc " << imagesTo_[i]
2392  << " w/ specified tag " << tag << endl;
2393  *out_ << os.str ();
2394  }
2395  }
2396  else { // "Sending" the message to myself
2397  selfNum = p;
2398  selfIndex = startsTo_[p];
2399  }
2400  }
2401 
2402  if (selfMessage_) {
2403  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2404  deep_copy_offset(imports, exports, selfReceiveOffset,
2405  indicesTo_[selfIndex]*numPackets, numPackets);
2406  ++selfIndex;
2407  selfReceiveOffset += numPackets;
2408  }
2409  }
2410  if (debug_) {
2411  std::ostringstream os;
2412  os << myImageID << ": doPosts(3,slow) done" << endl;
2413  *out_ << os.str ();
2414  }
2415  }
2416  }
2417 
2418  template <class ExpView, class ImpView>
2419  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2420  Distributor::
2421  doPosts (const ExpView &exports,
2422  const ArrayView<size_t> &numExportPacketsPerLID,
2423  const ImpView &imports,
2424  const ArrayView<size_t> &numImportPacketsPerLID)
2425  {
2426  using Teuchos::Array;
2427  using Teuchos::as;
2428  using Teuchos::ireceive;
2429  using Teuchos::isend;
2430  using Teuchos::readySend;
2431  using Teuchos::send;
2432  using Teuchos::ssend;
2433  using Teuchos::TypeNameTraits;
2434 #ifdef HAVE_TEUCHOS_DEBUG
2435  using Teuchos::OSTab;
2436 #endif // HAVE_TEUCHOS_DEBUG
2437  using std::endl;
2438  using Kokkos::Compat::create_const_view;
2439  using Kokkos::Compat::create_view;
2440  using Kokkos::Compat::subview_offset;
2441  using Kokkos::Compat::deep_copy_offset;
2442  typedef Array<size_t>::size_type size_type;
2443  typedef ExpView exports_view_type;
2444  typedef ImpView imports_view_type;
2445 
2446  Teuchos::OSTab tab (out_);
2447 
2448 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2449  Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
2450 #endif // TPETRA_DISTRIBUTOR_TIMERS
2451 
2452  // Run-time configurable parameters that come from the input
2453  // ParameterList set by setParameterList().
2454  const Details::EDistributorSendType sendType = sendType_;
2455  const bool doBarrier = barrierBetween_;
2456 
2457 // #ifdef HAVE_TEUCHOS_DEBUG
2458 // // Prepare for verbose output, if applicable.
2459 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
2460 // RCP<Teuchos::FancyOStream> out = this->getOStream ();
2461 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
2462 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
2463 
2464 // if (doPrint) {
2465 // // Only need one process to print out parameters.
2466 // *out << "Distributor::doPosts (4 args)" << endl;
2467 // }
2468 // // Add one tab level. We declare this outside the doPrint scopes
2469 // // so that the tab persists until the end of this method.
2470 // Teuchos::OSTab tab = this->getOSTab ();
2471 // if (doPrint) {
2472 // *out << "Parameters:" << endl;
2473 // {
2474 // OSTab tab2 (out);
2475 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
2476 // << endl << "barrierBetween: " << doBarrier << endl;
2477 // }
2478 // }
2479 // #endif // HAVE_TEUCHOS_DEBUG
2480 
2481  TEUCHOS_TEST_FOR_EXCEPTION(
2482  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2483  std::logic_error, "Tpetra::Distributor::doPosts(4 args): Ready-send "
2484  "version requires a barrier between posting receives and posting ready "
2485  "sends. This should have been checked before. "
2486  "Please report this bug to the Tpetra developers.");
2487 
2488  const int myImageID = comm_->getRank ();
2489  size_t selfReceiveOffset = 0;
2490 
2491 #ifdef HAVE_TEUCHOS_DEBUG
2492  // Different messages may have different numbers of packets.
2493  size_t totalNumImportPackets = 0;
2494  for (size_type ii = 0; ii < numImportPacketsPerLID.size (); ++ii) {
2495  totalNumImportPackets += numImportPacketsPerLID[ii];
2496  }
2497  TEUCHOS_TEST_FOR_EXCEPTION(
2498  imports.dimension_0 () < totalNumImportPackets, std::runtime_error,
2499  "Tpetra::Distributor::doPosts(4 args): The 'imports' array must have "
2500  "enough entries to hold the expected number of import packets. "
2501  "imports.dimension_0() = " << imports.dimension_0 () << " < "
2502  "totalNumImportPackets = " << totalNumImportPackets << ".");
2503 #endif // HAVE_TEUCHOS_DEBUG
2504 
2505  // MPI tag for nonblocking receives and blocking sends in this
2506  // method. Some processes might take the "fast" path
2507  // (indicesTo_.empty()) and others might take the "slow" path for
2508  // the same doPosts() call, so the path tag must be the same for
2509  // both.
2510  const int pathTag = 1;
2511  const int tag = this->getTag (pathTag);
2512 
2513  if (debug_) {
2514  TEUCHOS_TEST_FOR_EXCEPTION(
2515  requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
2516  "doPosts(4 args): Process " << myImageID << ": requests_.size () = "
2517  << requests_.size () << " != 0.");
2518  std::ostringstream os;
2519  os << myImageID << ": doPosts(4,"
2520  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
2521  *out_ << os.str ();
2522  }
2523 
2524  // Distributor uses requests_.size() as the number of outstanding
2525  // nonblocking message requests, so we resize to zero to maintain
2526  // this invariant.
2527  //
2528  // numReceives_ does _not_ include the self message, if there is
2529  // one. Here, we do actually send a message to ourselves, so we
2530  // include any self message in the "actual" number of receives to
2531  // post.
2532  //
2533  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2534  // doesn't (re)allocate its array of requests. That happens in
2535  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2536  // demand), or Resize_().
2537  const size_type actualNumReceives = as<size_type> (numReceives_) +
2538  as<size_type> (selfMessage_ ? 1 : 0);
2539  requests_.resize (0);
2540 
2541  // Post the nonblocking receives. It's common MPI wisdom to post
2542  // receives before sends. In MPI terms, this means favoring
2543  // adding to the "posted queue" (of receive requests) over adding
2544  // to the "unexpected queue" (of arrived messages not yet matched
2545  // with a receive).
2546  {
2547 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2548  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
2549 #endif // TPETRA_DISTRIBUTOR_TIMERS
2550 
2551  size_t curBufferOffset = 0;
2552  size_t curLIDoffset = 0;
2553  for (size_type i = 0; i < actualNumReceives; ++i) {
2554  size_t totalPacketsFrom_i = 0;
2555  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
2556  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
2557  }
2558  curLIDoffset += lengthsFrom_[i];
2559  if (imagesFrom_[i] != myImageID && totalPacketsFrom_i) {
2560  // If my process is receiving these packet(s) from another
2561  // process (not a self-receive), and if there is at least
2562  // one packet to receive:
2563  //
2564  // 1. Set up the persisting view (recvBuf) into the imports
2565  // array, given the offset and size (total number of
2566  // packets from process imagesFrom_[i]).
2567  // 2. Start the Irecv and save the resulting request.
2568  imports_view_type recvBuf =
2569  subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
2570  requests_.push_back (ireceive<int> (recvBuf, imagesFrom_[i],
2571  tag, *comm_));
2572  }
2573  else { // Receiving these packet(s) from myself
2574  selfReceiveOffset = curBufferOffset; // Remember the offset
2575  }
2576  curBufferOffset += totalPacketsFrom_i;
2577  }
2578  }
2579 
2580  if (doBarrier) {
2581 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2582  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
2583 #endif // TPETRA_DISTRIBUTOR_TIMERS
2584  // If we are using ready sends (MPI_Rsend) below, we need to do
2585  // a barrier before we post the ready sends. This is because a
2586  // ready send requires that its matching receive has already
2587  // been posted before the send has been posted. The only way to
2588  // guarantee that in this case is to use a barrier.
2589  comm_->barrier ();
2590  }
2591 
2592 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2593  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
2594 #endif // TPETRA_DISTRIBUTOR_TIMERS
2595 
2596  // Set up arrays containing the starting offset into exports for each
2597  // send, and the number of packets to send for each send.
2598  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
2599  size_t maxNumPackets = 0;
2600  size_t curPKToffset = 0;
2601  for (size_t pp=0; pp<numSends_; ++pp) {
2602  sendPacketOffsets[pp] = curPKToffset;
2603  size_t numPackets = 0;
2604  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
2605  numPackets += numExportPacketsPerLID[j];
2606  }
2607  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
2608  packetsPerSend[pp] = numPackets;
2609  curPKToffset += numPackets;
2610  }
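  // Hypothetical example (editorial illustration): with numSends_ == 2,
  // startsTo_ == {0, 2}, lengthsTo_ == {2, 1}, and
  // numExportPacketsPerLID == {3, 1, 2}, the loop above produces
  // sendPacketOffsets == {0, 4}, packetsPerSend == {4, 2}, and
  // maxNumPackets == 4.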
2611 
2612  // Set up the scan through the imagesTo_ list, starting with images
2613  // numbered higher than my rank (this should help balance message traffic).
2614  size_t numBlocks = numSends_ + selfMessage_;
2615  size_t imageIndex = 0;
2616  while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
2617  ++imageIndex;
2618  }
2619  if (imageIndex == numBlocks) {
2620  imageIndex = 0;
2621  }
2622 
2623  size_t selfNum = 0;
2624  size_t selfIndex = 0;
2625 
2626  if (indicesTo_.empty()) {
2627  if (debug_) {
2628  std::ostringstream os;
2629  os << myImageID << ": doPosts(4,fast): posting sends" << endl;
2630  *out_ << os.str ();
2631  }
2632 
2633  // Data are already blocked (laid out) by process, so we don't
2634  // need a separate send buffer (besides the exports array).
2635  for (size_t i = 0; i < numBlocks; ++i) {
2636  size_t p = i + imageIndex;
2637  if (p > (numBlocks - 1)) {
2638  p -= numBlocks;
2639  }
2640 
2641  if (imagesTo_[p] != myImageID && packetsPerSend[p] > 0) {
2642  exports_view_type tmpSend =
2643  subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);
2644 
2645  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
2646  send<int> (tmpSend,
2647  as<int> (tmpSend.size ()),
2648  imagesTo_[p], tag, *comm_);
2649  }
2650  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2651  readySend<int> (tmpSend,
2652  as<int> (tmpSend.size ()),
2653  imagesTo_[p], tag, *comm_);
2654  }
2655  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2656  exports_view_type tmpSendBuf =
2657  subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
2658  requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
2659  tag, *comm_));
2660  }
2661  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2662  ssend<int> (tmpSend,
2663  as<int> (tmpSend.size ()),
2664  imagesTo_[p], tag, *comm_);
2665  }
2666  else {
2667  TEUCHOS_TEST_FOR_EXCEPTION(
2668  true, std::logic_error, "Tpetra::Distributor::doPosts(4 args): "
2669  "Invalid send type. We should never get here. "
2670  "Please report this bug to the Tpetra developers.");
2671  }
2672  }
2673  else { // "Sending" the message to myself
2674  selfNum = p;
2675  }
2676  }
2677 
2678  if (selfMessage_) {
2679  deep_copy_offset(imports, exports, selfReceiveOffset,
2680  sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
2681  }
2682  if (debug_) {
2683  std::ostringstream os;
2684  os << myImageID << ": doPosts(4,fast) done" << endl;
2685  *out_ << os.str ();
2686  }
2687  }
2688  else { // data are not blocked by image, use send buffer
2689  if (debug_) {
2690  std::ostringstream os;
2691  os << myImageID << ": doPosts(4,slow): posting sends" << endl;
2692  *out_ << os.str ();
2693  }
2694 
2695  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
2696  typedef typename ExpView::non_const_value_type Packet;
2697  typedef typename ExpView::array_layout Layout;
2698  typedef typename ExpView::device_type Device;
2699  typedef typename ExpView::memory_traits Mem;
2700  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray", maxNumPackets); // send buffer
2701 
2702  TEUCHOS_TEST_FOR_EXCEPTION(
2703  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
2704  "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
2705  "may not necessarily work with nonblocking sends.");
2706 
2707  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
2708  size_t ioffset = 0;
2709  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
2710  indicesOffsets[j] = ioffset;
2711  ioffset += numExportPacketsPerLID[j];
2712  }
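  // Hypothetical example (editorial illustration): numExportPacketsPerLID ==
  // {3, 1, 2} gives indicesOffsets == {0, 3, 4}, i.e., the offset of each
  // LID's packets within the exports buffer.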
2713 
2714  for (size_t i = 0; i < numBlocks; ++i) {
2715  size_t p = i + imageIndex;
2716  if (p > (numBlocks - 1)) {
2717  p -= numBlocks;
2718  }
2719 
2720  if (imagesTo_[p] != myImageID) {
2721  size_t sendArrayOffset = 0;
2722  size_t j = startsTo_[p];
2723  size_t numPacketsTo_p = 0;
2724  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2725  deep_copy_offset(sendArray, exports, sendArrayOffset,
2726  indicesOffsets[j], numExportPacketsPerLID[j]);
2727  sendArrayOffset += numExportPacketsPerLID[j];
2728  }
  numPacketsTo_p = sendArrayOffset; // total number of packets copied into sendArray for this send
2729  if (numPacketsTo_p > 0) {
2730  ImpView tmpSend =
2731  subview_offset(sendArray, size_t(0), numPacketsTo_p);
2732 
2733  if (sendType == Details::DISTRIBUTOR_RSEND) {
2734  readySend<int> (tmpSend,
2735  as<int> (tmpSend.size ()),
2736  imagesTo_[p], tag, *comm_);
2737  }
2738  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2739  exports_view_type tmpSendBuf =
2740  subview_offset (sendArray, size_t(0), numPacketsTo_p);
2741  requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
2742  tag, *comm_));
2743  }
2744  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2745  ssend<int> (tmpSend,
2746  as<int> (tmpSend.size ()),
2747  imagesTo_[p], tag, *comm_);
2748  }
2749  else { // if (sendType == Details::DISTRIBUTOR_SEND)
2750  send<int> (tmpSend,
2751  as<int> (tmpSend.size ()),
2752  imagesTo_[p], tag, *comm_);
2753  }
2754  }
2755  }
2756  else { // "Sending" the message to myself
2757  selfNum = p;
2758  selfIndex = startsTo_[p];
2759  }
2760  }
2761 
2762  if (selfMessage_) {
2763  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2764  deep_copy_offset(imports, exports, selfReceiveOffset,
2765  indicesOffsets[selfIndex],
2766  numExportPacketsPerLID[selfIndex]);
2767  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
2768  ++selfIndex;
2769  }
2770  }
2771  if (debug_) {
2772  std::ostringstream os;
2773  os << myImageID << ": doPosts(4,slow) done" << endl;
2774  *out_ << os.str ();
2775  }
2776  }
2777  }
2778 
2779  template <class ExpView, class ImpView>
2780  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2781  Distributor::
2782  doReversePostsAndWaits (const ExpView &exports,
2783  size_t numPackets,
2784  const ImpView &imports)
2785  {
2786  // If the MPI library doesn't support RDMA for communication directly
2787  // to/from GPU memory, transfer exports to the host, do the communication,
2788  // then transfer imports back to the GPU.
2789  //
2790  // We need to do this here instead of doPosts() because the copy back
2791  // to the GPU must occur after the waits.
2792  typedef ExpView exports_view;
2793  typedef ImpView imports_view;
2794  if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
2795  typename exports_view::HostMirror host_exports =
2796  Kokkos::create_mirror_view(exports);
2797  typename imports_view::HostMirror host_imports =
2798  Kokkos::create_mirror_view(imports);
2799  Kokkos::deep_copy (host_exports, exports);
2800  doReversePostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
2801  numPackets,
2802  host_imports);
2803  Kokkos::deep_copy (imports, host_imports);
2804  return;
2805  }
2806 
2807  doReversePosts (exports, numPackets, imports);
2808  doReverseWaits ();
2809  }
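  // Editorial usage sketch (hypothetical caller, not part of Tpetra): given a
  // Distributor 'dist' whose forward plan was built with createFromSends(),
  // a reverse exchange with one packet per ID might look like
  //
  //   Kokkos::View<double*> exports ("exports", dist.getTotalReceiveLength ());
  //   Kokkos::View<double*> imports ("imports", numOriginalExportIDs);
  //   // ... fill exports ...
  //   dist.doReversePostsAndWaits (exports, 1, imports);
  //
  // where 'numOriginalExportIDs' is an assumed name for the number of IDs this
  // process exported under the forward plan.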
2810 
2811  template <class ExpView, class ImpView>
2812  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2813  Distributor::
2814  doReversePostsAndWaits (const ExpView &exports,
2815  const ArrayView<size_t> &numExportPacketsPerLID,
2816  const ImpView &imports,
2817  const ArrayView<size_t> &numImportPacketsPerLID)
2818  {
2819  // If the MPI library doesn't support RDMA for communication directly
2820  // to/from GPU memory, transfer exports to the host, do the communication,
2821  // then transfer imports back to the GPU.
2822  //
2823  // We need to do this here instead of doPosts() because the copy back
2824  // to the GPU must occur after the waits.
2825  typedef ExpView exports_view;
2826  typedef ImpView imports_view;
2827  if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
2828  typename exports_view::HostMirror host_exports =
2829  Kokkos::create_mirror_view(exports);
2830  typename imports_view::HostMirror host_imports =
2831  Kokkos::create_mirror_view(imports);
2832  Kokkos::deep_copy (host_exports, exports);
2833  doReversePostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
2834  numExportPacketsPerLID,
2835  host_imports,
2836  numImportPacketsPerLID);
2837  Kokkos::deep_copy (imports, host_imports);
2838  return;
2839  }
2840 
2841  TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
2842  "Tpetra::Distributor::doReversePostsAndWaits(4 args): There are "
2843  << requests_.size() << " outstanding nonblocking messages pending. It "
2844  "is incorrect to call this method with posts outstanding.");
2845 
2846  doReversePosts (exports, numExportPacketsPerLID, imports,
2847  numImportPacketsPerLID);
2848  doReverseWaits ();
2849  }
2850 
2851  template <class ExpView, class ImpView>
2852  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2853  Distributor::
2854  doReversePosts (const ExpView &exports,
2855  size_t numPackets,
2856  const ImpView &imports)
2857  {
2858  // FIXME (mfh 29 Mar 2012) WHY?
2859  TEUCHOS_TEST_FOR_EXCEPTION(
2860  ! indicesTo_.empty (), std::runtime_error,
2861  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2862  "reverse communication when original data are blocked by process.");
2863  if (reverseDistributor_.is_null ()) {
2864  createReverseDistributor ();
2865  }
2866  reverseDistributor_->doPosts (exports, numPackets, imports);
2867  }
2868 
2869  template <class ExpView, class ImpView>
2870  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2871  Distributor::
2872  doReversePosts (const ExpView &exports,
2873  const ArrayView<size_t> &numExportPacketsPerLID,
2874  const ImpView &imports,
2875  const ArrayView<size_t> &numImportPacketsPerLID)
2876  {
2877  // FIXME (mfh 29 Mar 2012) WHY?
2878  TEUCHOS_TEST_FOR_EXCEPTION(
2879  ! indicesTo_.empty (), std::runtime_error,
2880  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2881  "reverse communication when original data are blocked by process.");
2882  if (reverseDistributor_.is_null ()) {
2883  createReverseDistributor ();
2884  }
2885  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
2886  imports, numImportPacketsPerLID);
2887  }
2888 
2889  template <class OrdinalType>
2890  void Distributor::
2891  computeSends (const ArrayView<const OrdinalType> & importIDs,
2892  const ArrayView<const int> & importNodeIDs,
2893  Array<OrdinalType> & exportIDs,
2894  Array<int> & exportNodeIDs)
2895  {
2896  // NOTE (mfh 19 Apr 2012): There was a note on this code saying:
2897  // "assumes that size_t >= Ordinal". The code certainly does
2898  // assume that sizeof(size_t) >= sizeof(OrdinalType) as well as
2899  // sizeof(size_t) >= sizeof(int). This is because it casts the
2900  // OrdinalType elements of importIDs (along with their
2901  // corresponding process IDs, as int) to size_t, and does a
2902  // doPostsAndWaits<size_t>() to send the packed data.
2903  using std::endl;
2904  typedef typename ArrayView<const OrdinalType>::size_type size_type;
2905 
2906  Teuchos::OSTab tab (out_);
2907  const int myRank = comm_->getRank ();
2908  if (debug_) {
2909  std::ostringstream os;
2910  os << myRank << ": computeSends" << endl;
2911  *out_ << os.str ();
2912  }
2913 
2914  TEUCHOS_TEST_FOR_EXCEPTION(
2915  importIDs.size () != importNodeIDs.size (), std::invalid_argument,
2916  "Tpetra::Distributor::computeSends: On Process " << myRank << ": "
2917  "importNodeIDs.size() = " << importNodeIDs.size ()
2918  << " != importIDs.size() = " << importIDs.size () << ".");
2919 
2920  const size_type numImports = importNodeIDs.size ();
2921  Array<size_t> importObjs (2*numImports);
2922  // Pack pairs (importIDs[i], my process ID) to send into importObjs.
2923  for (size_type i = 0; i < numImports; ++i) {
2924  importObjs[2*i] = static_cast<size_t> (importIDs[i]);
2925  importObjs[2*i+1] = static_cast<size_t> (myRank);
2926  }
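  // Hypothetical example (editorial illustration): on rank 3, importIDs ==
  // {42, 7} packs into importObjs == {42, 3, 7, 3}.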
2927  //
2928  // Use a temporary Distributor to send the (importIDs[i], myRank)
2929  // pairs to importNodeIDs[i].
2930  //
2931  Distributor tempPlan (comm_, out_);
2932  if (debug_) {
2933  std::ostringstream os;
2934  os << myRank << ": computeSends: tempPlan.createFromSends" << endl;
2935  *out_ << os.str ();
2936  }
2937 
2938  // mfh 20 Mar 2014: An extra-cautious cast from unsigned to
2939  // signed, in order to forestall any possible causes for Bug 6069.
2940  const size_t numExportsAsSizeT = tempPlan.createFromSends (importNodeIDs);
2941  const size_type numExports = static_cast<size_type> (numExportsAsSizeT);
2942  TEUCHOS_TEST_FOR_EXCEPTION(
2943  numExports < 0, std::logic_error, "Tpetra::Distributor::computeSends: "
2944  "tempPlan.createFromSends() returned numExports = " << numExportsAsSizeT
2945  << " as a size_t, which overflows to " << numExports << " when cast to "
2946  << Teuchos::TypeNameTraits<size_type>::name () << ". "
2947  "Please report this bug to the Tpetra developers.");
2948  TEUCHOS_TEST_FOR_EXCEPTION(
2949  static_cast<size_type> (tempPlan.getTotalReceiveLength ()) != numExports,
2950  std::logic_error, "Tpetra::Distributor::computeSends: tempPlan.getTotal"
2951  "ReceiveLength() = " << tempPlan.getTotalReceiveLength () << " != num"
2952  "Exports = " << numExports << ". Please report this bug to the "
2953  "Tpetra developers.");
2954 
2955  if (numExports > 0) {
2956  exportIDs.resize (numExports);
2957  exportNodeIDs.resize (numExports);
2958  }
2959 
2960  // exportObjs: Packed receive buffer. (exportObjs[2*i],
2961  // exportObjs[2*i+1]) will give the (GID, PID) pair for export i,
2962  // after tempPlan.doPostsAndWaits(...) finishes below.
2963  //
2964  // FIXME (mfh 19 Mar 2014) This only works if OrdinalType fits in
2965  // size_t. This issue might come up, for example, on a 32-bit
2966  // machine using 64-bit global indices. I will add a check here
2967  // for that case.
2968  TEUCHOS_TEST_FOR_EXCEPTION(
2969  sizeof (size_t) < sizeof (OrdinalType), std::logic_error,
2970  "Tpetra::Distributor::computeSends: sizeof(size_t) = " << sizeof(size_t)
2971  << " < sizeof(" << Teuchos::TypeNameTraits<OrdinalType>::name () << ") = "
2972  << sizeof (OrdinalType) << ". This violates an assumption of the "
2973  "method. It's not hard to work around (just use Array<OrdinalType> as "
2974  "the export buffer, not Array<size_t>), but we haven't done that yet. "
2975  "Please report this bug to the Tpetra developers.");
2976 
2977  TEUCHOS_TEST_FOR_EXCEPTION(
2978  tempPlan.getTotalReceiveLength () < static_cast<size_t> (numExports),
2979  std::logic_error,
2980  "Tpetra::Distributor::computeSends: tempPlan.getTotalReceiveLength() = "
2981  << tempPlan.getTotalReceiveLength() << " < numExports = " << numExports
2982  << ". Please report this bug to the Tpetra developers.");
2983 
2984  Array<size_t> exportObjs (tempPlan.getTotalReceiveLength () * 2);
2985  if (debug_) {
2986  std::ostringstream os;
2987  os << myRank << ": computeSends: tempPlan.doPostsAndWaits" << endl;
2988  *out_ << os.str ();
2989  }
2990  tempPlan.doPostsAndWaits<size_t> (importObjs (), 2, exportObjs ());
2991 
2992  // Unpack received (GID, PID) pairs into exportIDs resp. exportNodeIDs.
2993  for (size_type i = 0; i < numExports; ++i) {
2994  exportIDs[i] = static_cast<OrdinalType> (exportObjs[2*i]);
2995  exportNodeIDs[i] = static_cast<int> (exportObjs[2*i+1]);
2996  }
2997 
2998  if (debug_) {
2999  std::ostringstream os;
3000  os << myRank << ": computeSends done" << endl;
3001  *out_ << os.str ();
3002  }
3003  }
3004 
3005  template <class OrdinalType>
3006  void Distributor::
3007  createFromRecvs (const ArrayView<const OrdinalType> &remoteIDs,
3008  const ArrayView<const int> &remoteImageIDs,
3009  Array<OrdinalType> &exportGIDs,
3010  Array<int> &exportNodeIDs)
3011  {
3012  using std::endl;
3013 
3014  Teuchos::OSTab tab (out_);
3015  const int myRank = comm_->getRank();
3016 
3017  if (debug_) {
3018  *out_ << myRank << ": createFromRecvs" << endl;
3019  }
3020 
3021 #ifdef HAVE_TPETRA_DEBUG
3022  using Teuchos::outArg;
3023  using Teuchos::reduceAll;
3024 
3025  // In debug mode, first test locally, then do an all-reduce to
3026  // make sure that all processes passed.
3027  const int errProc =
3028  (remoteIDs.size () != remoteImageIDs.size ()) ? myRank : -1;
3029  int maxErrProc = -1;
3030  reduceAll<int, int> (*comm_, Teuchos::REDUCE_MAX, errProc, outArg (maxErrProc));
3031  TEUCHOS_TEST_FOR_EXCEPTION(maxErrProc != -1, std::runtime_error,
3032  Teuchos::typeName (*this) << "::createFromRecvs(): lists of remote IDs "
3033  "and remote process IDs must have the same size on all participating "
3034  "processes. Maximum process ID with error: " << maxErrProc << ".");
3035 #else // NOT HAVE_TPETRA_DEBUG
3036 
3037  // In non-debug mode, just test locally.
3038  TEUCHOS_TEST_FOR_EXCEPTION(
3039  remoteIDs.size () != remoteImageIDs.size (), std::invalid_argument,
3040  Teuchos::typeName (*this) << "::createFromRecvs<" <<
3041  Teuchos::TypeNameTraits<OrdinalType>::name () << ">(): On Process " <<
3042  myRank << ": remoteIDs.size() = " << remoteIDs.size () << " != "
3043  "remoteImageIDs.size() = " << remoteImageIDs.size () << ".");
3044 #endif // HAVE_TPETRA_DEBUG
3045 
3046  computeSends (remoteIDs, remoteImageIDs, exportGIDs, exportNodeIDs);
3047 
3048  const size_t numProcsSendingToMe = createFromSends (exportNodeIDs ());
3049 
3050  if (debug_) {
3051  // NOTE (mfh 20 Mar 2014) If remoteImageIDs could contain
3052  // duplicates, then its length might not be the right check here,
3053  // even if we account for selfMessage_. selfMessage_ is set in
3054  // createFromSends.
3055  std::ostringstream os;
3056  os << "Proc " << myRank << ": {numProcsSendingToMe: "
3057  << numProcsSendingToMe << ", remoteImageIDs.size(): "
3058  << remoteImageIDs.size () << ", selfMessage_: "
3059  << (selfMessage_ ? "true" : "false") << "}" << std::endl;
3060  std::cerr << os.str ();
3061  }
3062 
3063  if (debug_) {
3064  *out_ << myRank << ": createFromRecvs done" << endl;
3065  }
3066 
3067  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
3068  }
3069 
3070 } // namespace Tpetra
3071 
3072 #endif // TPETRA_DISTRIBUTOR_HPP