Tpetra parallel linear algebra  Version of the Day
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
43 #define TPETRA_MULTIVECTOR_DEF_HPP
44 
52 
53 #include "Tpetra_Util.hpp"
54 #include "Tpetra_Vector.hpp"
55 #include "Tpetra_Details_MultiVectorDistObjectKernels.hpp"
56 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
57 
58 #include "KokkosCompat_View.hpp"
59 #include "Kokkos_MV_GEMM.hpp"
60 #include "Kokkos_Blas1_MV.hpp"
61 #include "Kokkos_Random.hpp"
62 
63 #ifdef HAVE_TPETRA_INST_FLOAT128
64 namespace Kokkos {
65  // FIXME (mfh 04 Sep 2015) Just a stub for now!
66  template<class Generator>
67  struct rand<Generator, __float128> {
68  static KOKKOS_INLINE_FUNCTION __float128 max ()
69  {
70  return static_cast<__float128> (1.0);
71  }
72  static KOKKOS_INLINE_FUNCTION __float128
73  draw (Generator& gen)
74  {
75  // Half the smallest normalized double, is the scaling factor of
76  // the lower-order term in the double-double representation.
77  const __float128 scalingFactor =
78  static_cast<__float128> (std::numeric_limits<double>::min ()) /
79  static_cast<__float128> (2.0);
80  const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
81  const __float128 lowerOrderTerm =
82  static_cast<__float128> (gen.drand ()) * scalingFactor;
83  return higherOrderTerm + lowerOrderTerm;
84  }
85  static KOKKOS_INLINE_FUNCTION __float128
86  draw (Generator& gen, const __float128& range)
87  {
88  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
89  const __float128 scalingFactor =
90  static_cast<__float128> (std::numeric_limits<double>::min ()) /
91  static_cast<__float128> (2.0);
92  const __float128 higherOrderTerm =
93  static_cast<__float128> (gen.drand (range));
94  const __float128 lowerOrderTerm =
95  static_cast<__float128> (gen.drand (range)) * scalingFactor;
96  return higherOrderTerm + lowerOrderTerm;
97  }
98  static KOKKOS_INLINE_FUNCTION __float128
99  draw (Generator& gen, const __float128& start, const __float128& end)
100  {
101  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
102  const __float128 scalingFactor =
103  static_cast<__float128> (std::numeric_limits<double>::min ()) /
104  static_cast<__float128> (2.0);
105  const __float128 higherOrderTerm =
106  static_cast<__float128> (gen.drand (start, end));
107  const __float128 lowerOrderTerm =
108  static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
109  return higherOrderTerm + lowerOrderTerm;
110  }
111  };
112 } // namespace Kokkos
113 #endif // HAVE_TPETRA_INST_FLOAT128
114 
115 namespace { // (anonymous)
116 
129  template<class ST, class LO, class GO, class NT>
131  allocDualView (const size_t lclNumRows, const size_t numCols, const bool zeroOut = true)
132  {
133  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
134  const char* label = "MV::DualView";
135 
136 #if 0
137  (void) zeroOut;
138  return dual_view_type (label, lclNumRows, numCols);
139 #else
140  if (zeroOut) {
141  return dual_view_type (label, lclNumRows, numCols);
142  }
143  else {
144  // FIXME (mfh 18 Feb 2015, 12 Apr 2015) This is just a hack,
145  // until Kokkos::DualView accepts an AllocationProperties
146  // initial argument, just like Kokkos::View. However, the hack
147  // is harmless, since it does what the (currently nonexistent)
148  // equivalent DualView constructor would have done anyway.
149  typename dual_view_type::t_dev d_view (Kokkos::ViewAllocateWithoutInitializing (label), lclNumRows, numCols);
150 #ifdef HAVE_TPETRA_DEBUG
151  // Filling with NaN is a cheap and effective way to tell if
152  // downstream code is trying to use a MultiVector's data without
153  // them having been initialized. ArithTraits lets us call nan()
154  // even if the scalar type doesn't define it; it just returns some
155  // undefined value in the latter case. This won't hurt anything
156  // because by setting zeroOut=false, users already agreed that
157  // they don't care about the contents of the MultiVector.
158  const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
159  KokkosBlas::fill (d_view, nan);
160 #endif // HAVE_TPETRA_DEBUG
161  typename dual_view_type::t_host h_view = Kokkos::create_mirror_view (d_view);
162  // Even though the user doesn't care about the contents of the
163  // MultiVector, the device and host views are still out of sync.
164  // We prefer to work in device memory. The way to ensure this
165  // happens is to mark the device view as modified.
166  dual_view_type dv (d_view, h_view);
167  dv.template modify<typename dual_view_type::t_dev::memory_space> ();
168 
169  return dual_view_type (d_view, h_view);
170  }
171 #endif // 0
172  }
173 
174  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
175  //
176  // T: The type of the entries of the View.
177  // ExecSpace: The Kokkos execution space.
178  template<class T, class ExecSpace>
179  struct MakeUnmanagedView {
180  // The 'false' part of the branch carefully ensures that this
181  // won't attempt to use a host execution space that hasn't been
182  // initialized. For example, if Kokkos::OpenMP is disabled and
183  // Kokkos::Threads is enabled, the latter is always the default
184  // execution space of Kokkos::HostSpace, even when ExecSpace is
185  // Kokkos::Serial. That's why we go through the trouble of asking
186  // Kokkos::DualView what _its_ space is. That seems to work
187  // around this default execution space issue.
188  typedef typename Kokkos::Impl::if_c<
189  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<ExecSpace, Kokkos::HostSpace>::value,
190  typename ExecSpace::device_type,
191  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
192  typedef Kokkos::LayoutLeft array_layout;
193  typedef Kokkos::View<T*, array_layout, host_exec_space,
194  Kokkos::MemoryUnmanaged> view_type;
195 
196  static view_type getView (const Teuchos::ArrayView<T>& x_in)
197  {
198  const size_t numEnt = static_cast<size_t> (x_in.size ());
199  if (numEnt == 0) {
200  return view_type ();
201  } else {
202  return view_type (x_in.getRawPtr (), numEnt);
203  }
204  }
205  };
206 
207  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
208  // taking a subview of a 0 x N DualView incorrectly always results
209  // in a 0 x 0 DualView.
210  template<class DualViewType>
211  DualViewType
212  takeSubview (const DualViewType& X,
213 //We will move the ALL_t to the Kokkos namespace eventually, this is a workaround for testing the new View implementation
214 #ifdef KOKKOS_USING_EXPERIMENTAL_VIEW
215  const Kokkos::Experimental::Impl::ALL_t&,
216 #else
217  const Kokkos::ALL&,
218 #endif
219  const std::pair<size_t, size_t>& colRng)
220  {
221  if (X.dimension_0 () == 0 && X.dimension_1 () != 0) {
222  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
223  }
224  else {
225  return subview (X, Kokkos::ALL (), colRng);
226  }
227  }
228 
229  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
230  // taking a subview of a 0 x N DualView incorrectly always results
231  // in a 0 x 0 DualView.
232  template<class DualViewType>
233  DualViewType
234  takeSubview (const DualViewType& X,
235  const std::pair<size_t, size_t>& rowRng,
236  const std::pair<size_t, size_t>& colRng)
237  {
238  if (X.dimension_0 () == 0 && X.dimension_1 () != 0) {
239  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
240  }
241  else {
242  return subview (X, rowRng, colRng);
243  }
244  }
245 } // namespace (anonymous)
246 
247 
248 namespace Tpetra {
249 
250  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
251  bool
252  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic>::
253  vectorIndexOutOfRange (const size_t VectorIndex) const {
254  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
255  }
256 
257  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
258  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic>::
259  MultiVector () :
260  base_type (Teuchos::rcp (new map_type ()))
261  {}
262 
263  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
265  MultiVector (const Teuchos::RCP<const map_type>& map,
266  const size_t numVecs,
267  const bool zeroOut) : /* default is true */
268  base_type (map)
269  {
270  TEUCHOS_TEST_FOR_EXCEPTION(
271  numVecs < 1, std::invalid_argument, "Tpetra::MultiVector::MultiVector"
272  "(map,numVecs,zeroOut): numVecs = " << numVecs << " < 1.");
273  const size_t lclNumRows = this->getLocalLength ();
274  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
275  origView_ = view_;
276  }
277 
278  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
281  base_type (source),
282  view_ (source.view_),
283  origView_ (source.origView_),
285  {}
286 
287  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
290  const Teuchos::DataAccess copyOrView) :
291  base_type (source),
292  view_ (source.view_),
293  origView_ (source.origView_),
295  {
297  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
298  "const Teuchos::DataAccess): ";
299 
300  if (copyOrView == Teuchos::Copy) {
301  // Reuse the conveniently already existing function that creates
302  // a deep copy.
303  MV cpy = createCopy (source);
304  this->view_ = cpy.view_;
305  this->origView_ = cpy.origView_;
306  this->whichVectors_ = cpy.whichVectors_;
307  }
308  else if (copyOrView == Teuchos::View) {
309  }
310  else {
311  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
312  true, std::invalid_argument, "Second argument 'copyOrView' has an "
313  "invalid value " << copyOrView << ". Valid values include "
314  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
315  << Teuchos::View << ".");
316  }
317  }
318 
319  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
321  MultiVector (const Teuchos::RCP<const map_type>& map,
322  const dual_view_type& view) :
323  base_type (map),
324  view_ (view),
325  origView_ (view)
326  {
327  const char tfecfFuncName[] = "MultiVector(map,view): ";
328 
329  // Get stride of view: if second dimension is 0, the
330  // stride might be 0, so take view_dimension instead.
331  size_t stride[8];
332  origView_.stride (stride);
333  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
334  origView_.dimension_0 ();
335  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
336  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
337  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
338  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
339  << ". This may also mean that the input view's first dimension (number "
340  "of rows = " << view.dimension_0 () << ") does not not match the number "
341  "of entries in the Map on the calling process.");
342  }
343 
344 
345  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
347  MultiVector (const Teuchos::RCP<const map_type>& map,
348  const typename dual_view_type::t_dev& d_view) :
349  base_type (map)
350  {
351  using Teuchos::ArrayRCP;
352  using Teuchos::RCP;
353  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
354 
355  // Get stride of view: if second dimension is 0, the stride might
356  // be 0, so take view_dimension instead.
357  size_t stride[8];
358  d_view.stride (stride);
359  const size_t LDA = (d_view.dimension_1 () > 1) ? stride[1] :
360  d_view.dimension_0 ();
361  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
362  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
363  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::View's "
364  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
365  << ". This may also mean that the input view's first dimension (number "
366  "of rows = " << d_view.dimension_0 () << ") does not not match the "
367  "number of entries in the Map on the calling process.");
368 
369  // The difference between create_mirror and create_mirror_view, is
370  // that the latter copies to host. We don't necessarily want to
371  // do that; we just want to allocate the memory.
372  view_ = dual_view_type (d_view, Kokkos::create_mirror (d_view));
373  // The user gave us a device view. We take it as canonical, which
374  // means we mark it as "modified," so that the next sync will
375  // synchronize it with the host view.
376  view_.template modify<execution_space> ();
377  origView_ = view_;
378  }
379 
380  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
382  MultiVector (const Teuchos::RCP<const map_type>& map,
383  const dual_view_type& view,
384  const dual_view_type& origView) :
385  base_type (map),
386  view_ (view),
387  origView_ (origView)
388  {
389  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
390 
391  // Get stride of view: if second dimension is 0, the
392  // stride might be 0, so take view_dimension instead.
393  size_t stride[8];
394  origView_.stride (stride);
395  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
396  origView_.dimension_0 ();
397  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
398  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
399  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
400  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
401  << ". This may also mean that the input origView's first dimension (number "
402  "of rows = " << origView.dimension_0 () << ") does not not match the number "
403  "of entries in the Map on the calling process.");
404  }
405 
406 
407  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
409  MultiVector (const Teuchos::RCP<const map_type>& map,
410  const dual_view_type& view,
411  const Teuchos::ArrayView<const size_t>& whichVectors) :
412  base_type (map),
413  view_ (view),
414  origView_ (view),
415  whichVectors_ (whichVectors.begin (), whichVectors.end ())
416  {
417  using Kokkos::ALL;
418  using Kokkos::subview;
419  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
420 
421  const size_t lclNumRows = map.is_null () ? size_t (0) :
422  map->getNodeNumElements ();
423  // Check dimensions of the input DualView. We accept that Kokkos
424  // might not allow construction of a 0 x m (Dual)View with m > 0,
425  // so we only require the number of rows to match if the
426  // (Dual)View has more than zero columns. Likewise, we only
427  // require the number of columns to match if the (Dual)View has
428  // more than zero rows.
429  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
430  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_0 ()) < lclNumRows,
431  std::invalid_argument, "view.dimension_0() = " << view.dimension_0 ()
432  << " < map->getNodeNumElements() = " << lclNumRows << ".");
433  if (whichVectors.size () != 0) {
434  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
435  view.dimension_1 () != 0 && view.dimension_1 () == 0,
436  std::invalid_argument, "view.dimension_1() = 0, but whichVectors.size()"
437  " = " << whichVectors.size () << " > 0.");
438  size_t maxColInd = 0;
439  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
440  for (size_type k = 0; k < whichVectors.size (); ++k) {
441  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
442  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
443  std::invalid_argument, "whichVectors[" << k << "] = "
444  "Teuchos::OrdinalTraits<size_t>::invalid().");
445  maxColInd = std::max (maxColInd, whichVectors[k]);
446  }
447  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
448  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_1 ()) <= maxColInd,
449  std::invalid_argument, "view.dimension_1() = " << view.dimension_1 ()
450  << " <= max(whichVectors) = " << maxColInd << ".");
451  }
452 
453  // Get stride of view: if second dimension is 0, the
454  // stride might be 0, so take view_dimension instead.
455  size_t stride[8];
456  origView_.stride (stride);
457  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
458  origView_.dimension_0 ();
459  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
460  LDA < lclNumRows, std::invalid_argument,
461  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
462 
463  if (whichVectors.size () == 1) {
464  // If whichVectors has only one entry, we don't need to bother
465  // with nonconstant stride. Just shift the view over so it
466  // points to the desired column.
467  //
468  // NOTE (mfh 10 May 2014) This is a special case where we set
469  // origView_ just to view that one column, not all of the
470  // original columns. This ensures that the use of origView_ in
471  // offsetView works correctly.
472  const std::pair<size_t, size_t> colRng (whichVectors[0],
473  whichVectors[0] + 1);
474  view_ = takeSubview (view_, ALL (), colRng);
475  origView_ = takeSubview (origView_, ALL (), colRng);
476  // whichVectors_.size() == 0 means "constant stride."
477  whichVectors_.clear ();
478  }
479  }
480 
481  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
483  MultiVector (const Teuchos::RCP<const map_type>& map,
484  const dual_view_type& view,
485  const dual_view_type& origView,
486  const Teuchos::ArrayView<const size_t>& whichVectors) :
487  base_type (map),
488  view_ (view),
489  origView_ (origView),
490  whichVectors_ (whichVectors.begin (), whichVectors.end ())
491  {
492  using Kokkos::ALL;
493  using Kokkos::subview;
494  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
495 
496  const size_t lclNumRows = this->getLocalLength ();
497  // Check dimensions of the input DualView. We accept that Kokkos
498  // might not allow construction of a 0 x m (Dual)View with m > 0,
499  // so we only require the number of rows to match if the
500  // (Dual)View has more than zero columns. Likewise, we only
501  // require the number of columns to match if the (Dual)View has
502  // more than zero rows.
503  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
504  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_0 ()) < lclNumRows,
505  std::invalid_argument, "view.dimension_0() = " << view.dimension_0 ()
506  << " < map->getNodeNumElements() = " << lclNumRows << ".");
507  if (whichVectors.size () != 0) {
508  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
509  view.dimension_1 () != 0 && view.dimension_1 () == 0,
510  std::invalid_argument, "view.dimension_1() = 0, but whichVectors.size()"
511  " = " << whichVectors.size () << " > 0.");
512  size_t maxColInd = 0;
513  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
514  for (size_type k = 0; k < whichVectors.size (); ++k) {
515  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
516  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
517  std::invalid_argument, "whichVectors[" << k << "] = "
518  "Teuchos::OrdinalTraits<size_t>::invalid().");
519  maxColInd = std::max (maxColInd, whichVectors[k]);
520  }
521  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
522  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_1 ()) <= maxColInd,
523  std::invalid_argument, "view.dimension_1() = " << view.dimension_1 ()
524  << " <= max(whichVectors) = " << maxColInd << ".");
525  }
526  // Get stride of view: if second dimension is 0, the
527  // stride might be 0, so take view_dimension instead.
528  size_t stride[8];
529  origView_.stride (stride);
530  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
531  origView_.dimension_0 ();
532  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
533  LDA < lclNumRows, std::invalid_argument, "Input DualView's column stride"
534  " = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
535 
536  if (whichVectors.size () == 1) {
537  // If whichVectors has only one entry, we don't need to bother
538  // with nonconstant stride. Just shift the view over so it
539  // points to the desired column.
540  //
541  // NOTE (mfh 10 May 2014) This is a special case where we set
542  // origView_ just to view that one column, not all of the
543  // original columns. This ensures that the use of origView_ in
544  // offsetView works correctly.
545  const std::pair<size_t, size_t> colRng (whichVectors[0],
546  whichVectors[0] + 1);
547  view_ = takeSubview (view_, ALL (), colRng);
548  origView_ = takeSubview (origView_, ALL (), colRng);
549  // whichVectors_.size() == 0 means "constant stride."
550  whichVectors_.clear ();
551  }
552  }
553 
554  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
556  MultiVector (const Teuchos::RCP<const map_type>& map,
557  const Teuchos::ArrayView<const Scalar>& data,
558  const size_t LDA,
559  const size_t numVecs) :
560  base_type (map)
561  {
562  using Kokkos::subview;
563  using Teuchos::ArrayView;
564  using Teuchos::av_reinterpret_cast;
565  typedef impl_scalar_type IST;
566  typedef LocalOrdinal LO;
567  typedef GlobalOrdinal GO;
568  typedef typename dual_view_type::host_mirror_space HMS;
569  typedef MakeUnmanagedView<const IST, device_type> view_getter_type;
570  typedef typename view_getter_type::view_type in_view_type;
571  typedef Kokkos::View<IST*, Kokkos::LayoutLeft, HMS> out_view_type;
572  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
573 
574  // Deep copy constructor, constant stride (NO whichVectors_).
575  // There is no need for a deep copy constructor with nonconstant stride.
576 
577  const size_t lclNumRows =
578  map.is_null () ? size_t (0) : map->getNodeNumElements ();
579  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
580  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
581  "map->getNodeNumElements() = " << lclNumRows << ".");
582  if (numVecs != 0) {
583  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
584  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
585  (static_cast<size_t> (data.size ()) < minNumEntries,
586  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
587  "entries, given the input Map and number of vectors in the MultiVector."
588  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
589  "map->getNodeNumElements () = " << minNumEntries << ".");
590  }
591  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
592  view_.template modify<HMS> ();
593 
594  ArrayView<const IST> X_in_av = av_reinterpret_cast<const IST> (data);
595  in_view_type X_in = view_getter_type::getView (X_in_av);
596  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
597  for (size_t j = 0; j < numVecs; ++j) {
598  const std::pair<size_t, size_t> rng (j*LDA, j*LDA + lclNumRows);
599  in_view_type X_j_in = subview (X_in, rng);
600  out_view_type X_j_out = subview (view_.h_view, rowRng, j);
601  Kokkos::deep_copy (X_j_out, X_j_in);
602  }
603  origView_ = view_;
604  }
605 
606  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
608  MultiVector (const Teuchos::RCP<const map_type>& map,
609  const Teuchos::ArrayView<const ArrayView<const Scalar> >& ArrayOfPtrs,
610  const size_t numVecs) :
611  base_type (map)
612  {
613  using Kokkos::subview;
614  using Teuchos::ArrayView;
615  using Teuchos::av_reinterpret_cast;
616  typedef impl_scalar_type IST;
617  typedef LocalOrdinal LO;
618  typedef GlobalOrdinal GO;
619  typedef typename dual_view_type::host_mirror_space HMS;
620  typedef MakeUnmanagedView<const IST, device_type> view_getter_type;
621  typedef typename view_getter_type::view_type in_view_type;
622  typedef Kokkos::View<IST*, Kokkos::LayoutLeft, HMS> out_view_type;
623  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
624 
625  const size_t lclNumRows =
626  map.is_null () ? size_t (0) : map->getNodeNumElements ();
627  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
628  numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
629  std::runtime_error,
630  "ArrayOfPtrs.size() must be strictly positive and as large as ArrayOfPtrs.");
631  for (size_t j = 0; j < numVecs; ++j) {
632  ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
633  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
634  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
635  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
636  << X_j_av.size () << " < map->getNodeNumElements() = " << lclNumRows
637  << ".");
638  }
639 
640  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
641  view_.template modify<HMS> ();
642 
643  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
644  for (size_t j = 0; j < numVecs; ++j) {
645  ArrayView<const IST> X_j_av =
646  av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
647  in_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
648  out_view_type X_j_out = subview (view_.h_view, rowRng, j);
649  Kokkos::deep_copy (X_j_out, X_j_in);
650  }
651  origView_ = view_;
652  }
653 
654  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
657 
658  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
661  return whichVectors_.empty ();
662  }
663 
664  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
665  size_t
668  {
669  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
670  return static_cast<size_t> (0);
671  } else {
672  return this->getMap ()->getNodeNumElements ();
673  }
674  }
675 
676  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
680  {
681  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
682  return static_cast<size_t> (0);
683  } else {
684  return this->getMap ()->getGlobalNumElements ();
685  }
686  }
687 
688  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
689  size_t
691  getStride () const
692  {
693  if (isConstantStride ()) {
694  // Get stride of view: if second dimension is 0, the
695  // stride might be 0, so take view_dimension instead.
696  size_t stride[8];
697  origView_.stride (stride);
698  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] : origView_.dimension_0 ();
699  return LDA;
700  }
701  else {
702  return static_cast<size_t> (0);
703  }
704  }
705 
706  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
707  bool
709  checkSizes (const SrcDistObject& sourceObj)
710  {
711  // Check whether the source object is a MultiVector. If not, then
712  // we can't even compare sizes, so it's definitely not OK to
713  // Import or Export from it.
715  const this_type* src = dynamic_cast<const this_type*> (&sourceObj);
716  if (src == NULL) {
717  return false;
718  } else {
719  // The target of the Import or Export calls checkSizes() in
720  // DistObject::doTransfer(). By that point, we've already
721  // constructed an Import or Export object using the two
722  // multivectors' Maps, which means that (hopefully) we've
723  // already checked other attributes of the multivectors. Thus,
724  // all we need to do here is check the number of columns.
725  return src->getNumVectors () == this->getNumVectors ();
726  }
727  }
728 
729  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
730  size_t
733  return this->getNumVectors ();
734  }
735 
736  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
737  void
739  copyAndPermuteNew (const SrcDistObject& sourceObj,
740  size_t numSameIDs,
741  const Kokkos::View<const LocalOrdinal*, execution_space>& permuteToLIDs,
742  const Kokkos::View<const LocalOrdinal*, execution_space>& permuteFromLIDs)
743  {
744  using Teuchos::ArrayRCP;
745  using Teuchos::ArrayView;
746  using Teuchos::RCP;
747  using Kokkos::Compat::getKokkosViewDeepCopy;
748  using Kokkos::subview;
749  typedef Kokkos::DualView<impl_scalar_type*,
750  typename dual_view_type::array_layout,
751  execution_space> col_dual_view_type;
753  //typedef typename ArrayView<const LocalOrdinal>::size_type size_type; // unused
754  const char tfecfFuncName[] = "copyAndPermute";
755 
756  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
757  permuteToLIDs.size() != permuteFromLIDs.size(), std::runtime_error,
758  ": permuteToLIDs and permuteFromLIDs must have the same size."
759  << std::endl << "permuteToLIDs.size() = " << permuteToLIDs.size ()
760  << " != permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
761 
762  // We've already called checkSizes(), so this cast must succeed.
763  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
764  const size_t numCols = this->getNumVectors ();
765 
766  // TODO (mfh 15 Sep 2013) When we replace
767  // KokkosClassic::MultiVector with a Kokkos::View, there are two
768  // ways to copy the data:
769  //
770  // 1. Get a (sub)view of each column and call deep_copy on that.
771  // 2. Write a custom kernel to copy the data.
772  //
773  // The first is easier, but the second might be more performant in
774  // case we decide to use layouts other than LayoutLeft. It might
775  // even make sense to hide whichVectors_ in an entirely new layout
776  // for Kokkos Views.
777 
778  // Copy rows [0, numSameIDs-1] of the local multivectors.
779  //
780  // For GPU Nodes: All of this happens using device pointers; this
781  // does not require host views of either source or destination.
782  //
783  // Note (ETP 2 Jul 2014) We need to always copy one column at a
784  // time, even when both multivectors are constant-stride, since
785  // deep_copy between strided subviews with more than one column
786  // doesn't currently work.
787  if (numSameIDs > 0) {
788  const std::pair<size_t, size_t> rows (0, numSameIDs);
789  for (size_t j = 0; j < numCols; ++j) {
790  const size_t dstCol = isConstantStride () ? j : whichVectors_[j];
791  const size_t srcCol =
792  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
793  col_dual_view_type dst_j =
794  subview (view_, rows, dstCol);
795  col_dual_view_type src_j =
796  subview (sourceMV.view_, rows, srcCol);
797  Kokkos::deep_copy (dst_j, src_j); // Copy src_j into dst_j
798  }
799  }
800 
801  // For the remaining GIDs, execute the permutations. This may
802  // involve noncontiguous access of both source and destination
803  // vectors, depending on the LID lists.
804  //
805  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
806  // the same process, this merges their values by replacement of
807  // the last encountered GID, not by the specified merge rule
808  // (such as ADD).
809 
810  // If there are no permutations, we are done
811  if (permuteFromLIDs.size() == 0 || permuteToLIDs.size() == 0)
812  return;
813 
814  if (this->isConstantStride ()) {
815  KokkosRefactor::Details::permute_array_multi_column(
816  getKokkosView(),
817  sourceMV.getKokkosView(),
818  permuteToLIDs,
819  permuteFromLIDs,
820  numCols);
821  }
822  else {
823  KokkosRefactor::Details::permute_array_multi_column_variable_stride(
824  getKokkosView(),
825  sourceMV.getKokkosView(),
826  permuteToLIDs,
827  permuteFromLIDs,
828  getKokkosViewDeepCopy<execution_space> (whichVectors_ ()),
829  getKokkosViewDeepCopy<execution_space> (sourceMV.whichVectors_ ()),
830  numCols);
831  }
832  }
833 
834  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
835  void
837  packAndPrepareNew (const SrcDistObject& sourceObj,
838  const Kokkos::View<const local_ordinal_type*, execution_space> &exportLIDs,
839  Kokkos::View<impl_scalar_type*, execution_space> &exports,
840  const Kokkos::View<size_t*, execution_space> &numExportPacketsPerLID,
841  size_t& constantNumPackets,
842  Distributor & /* distor */ )
843  {
844  using Teuchos::Array;
845  using Teuchos::ArrayView;
846  using Kokkos::Compat::getKokkosViewDeepCopy;
848  //typedef Array<size_t>::size_type size_type; // unused
849 
850  // If we have no exports, there is nothing to do
851  if (exportLIDs.size () == 0) {
852  return;
853  }
854 
855  // We've already called checkSizes(), so this cast must succeed.
856  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
857 
858  // We don't need numExportPacketsPerLID; forestall "unused
859  // variable" compile warnings.
860  (void) numExportPacketsPerLID;
861 
862  /* The layout in the export for MultiVectors is as follows:
863  exports = { all of the data from row exportLIDs.front() ;
864  ....
865  all of the data from row exportLIDs.back() }
866  This doesn't have the best locality, but is necessary because
867  the data for a Packet (all data associated with an LID) is
868  required to be contiguous. */
869 
870  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
871  // packing scheme in the above comment? The data going to a
872  // particular process must be contiguous, of course, but those
873  // data could include entries from multiple LIDs. DistObject just
874  // needs to know how to index into that data. Kokkos is good at
875  // decoupling storage intent from data layout choice.
876 
877  const size_t numCols = sourceMV.getNumVectors ();
878 
879  // This spares us from needing to fill numExportPacketsPerLID.
880  // Setting constantNumPackets to a nonzero value signals that
881  // all packets have the same number of entries.
882  constantNumPackets = numCols;
883 
884  const size_t numExportLIDs = exportLIDs.size ();
885  const size_t newExportsSize = numCols * numExportLIDs;
886  if (exports.size () != newExportsSize) {
887  Kokkos::Compat::realloc (exports, newExportsSize);
888  }
889 
890  if (numCols == 1) { // special case for one column only
891  // MultiVector always represents a single column with constant
892  // stride, but it doesn't hurt to implement both cases anyway.
893  //
894  // ETP: I'm not sure I agree with the above statement. Can't a single-
895  // column multivector be a subview of another multi-vector, in which case
896  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
897  // separately.
898  if (sourceMV.isConstantStride ()) {
899  KokkosRefactor::Details::pack_array_single_column(
900  exports,
901  sourceMV.getKokkosView (),
902  exportLIDs,
903  0);
904  }
905  else {
906  KokkosRefactor::Details::pack_array_single_column(
907  exports,
908  sourceMV.getKokkosView (),
909  exportLIDs,
910  sourceMV.whichVectors_[0]);
911  }
912  }
913  else { // the source MultiVector has multiple columns
914  if (sourceMV.isConstantStride ()) {
915  KokkosRefactor::Details::pack_array_multi_column(
916  exports,
917  sourceMV.getKokkosView (),
918  exportLIDs,
919  numCols);
920  }
921  else {
922  KokkosRefactor::Details::pack_array_multi_column_variable_stride(
923  exports,
924  sourceMV.getKokkosView (),
925  exportLIDs,
926  getKokkosViewDeepCopy<execution_space> (sourceMV.whichVectors_ ()),
927  numCols);
928  }
929  }
930  }
931 
932 
933  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
934  void
936  unpackAndCombineNew (const Kokkos::View<const local_ordinal_type*, execution_space> &importLIDs,
937  const Kokkos::View<const impl_scalar_type*, execution_space> &imports,
938  const Kokkos::View<size_t*, execution_space> &numPacketsPerLID,
939  size_t constantNumPackets,
940  Distributor & /* distor */,
941  CombineMode CM)
942  {
943  using Teuchos::ArrayView;
944  using Kokkos::Compat::getKokkosViewDeepCopy;
945  const char tfecfFuncName[] = "unpackAndCombine";
946 
947  // If we have no imports, there is nothing to do
948  if (importLIDs.size () == 0) {
949  return;
950  }
951 
952  // We don't need numPacketsPerLID; forestall "unused variable"
953  // compile warnings.
954  (void) numPacketsPerLID;
955 
956  /* The layout in the export for MultiVectors is as follows:
957  imports = { all of the data from row exportLIDs.front() ;
958  ....
959  all of the data from row exportLIDs.back() }
960  This doesn't have the best locality, but is necessary because
961  the data for a Packet (all data associated with an LID) is
962  required to be contiguous. */
963 
964  const size_t numVecs = getNumVectors ();
965 
966 #ifdef HAVE_TPETRA_DEBUG
967  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
968  static_cast<size_t> (imports.size()) != getNumVectors()*importLIDs.size(),
969  std::runtime_error,
970  ": 'imports' buffer size must be consistent with the amount of data to "
971  "be sent. " << std::endl << "imports.size() = " << imports.size()
972  << " != getNumVectors()*importLIDs.size() = " << getNumVectors() << "*"
973  << importLIDs.size() << " = " << getNumVectors() * importLIDs.size()
974  << ".");
975 
976  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
977  constantNumPackets == static_cast<size_t> (0), std::runtime_error,
978  ": constantNumPackets input argument must be nonzero.");
979 
980  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
981  static_cast<size_t> (numVecs) != static_cast<size_t> (constantNumPackets),
982  std::runtime_error, ": constantNumPackets must equal numVecs.");
983 #endif // HAVE_TPETRA_DEBUG
984 
985  if (numVecs > 0 && importLIDs.size () > 0) {
986 
987  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
988  // custom combine modes, start editing here. Also, if you trust
989  // inlining, it would be nice to condense this code by using a
990  // binary function object f in the pack functors.
991  if (CM == INSERT || CM == REPLACE) {
992  if (isConstantStride()) {
993  KokkosRefactor::Details::unpack_array_multi_column(
994  getKokkosView(),
995  imports,
996  importLIDs,
997  KokkosRefactor::Details::InsertOp(),
998  numVecs);
999  }
1000  else {
1001  KokkosRefactor::Details::unpack_array_multi_column_variable_stride(
1002  getKokkosView(),
1003  imports,
1004  importLIDs,
1005  getKokkosViewDeepCopy<execution_space>(whichVectors_ ()),
1006  KokkosRefactor::Details::InsertOp(),
1007  numVecs);
1008  }
1009  }
1010  else if (CM == ADD) {
1011  if (isConstantStride()) {
1012  KokkosRefactor::Details::unpack_array_multi_column(
1013  getKokkosView(),
1014  imports,
1015  importLIDs,
1016  KokkosRefactor::Details::AddOp(),
1017  numVecs);
1018  }
1019  else {
1020  KokkosRefactor::Details::unpack_array_multi_column_variable_stride(
1021  getKokkosView(),
1022  imports,
1023  importLIDs,
1024  getKokkosViewDeepCopy<execution_space>(whichVectors_ ()),
1025  KokkosRefactor::Details::AddOp(),
1026  numVecs);
1027  }
1028  }
1029  else if (CM == ABSMAX) {
1030  if (isConstantStride()) {
1031  KokkosRefactor::Details::unpack_array_multi_column(
1032  getKokkosView(),
1033  imports,
1034  importLIDs,
1035  KokkosRefactor::Details::AbsMaxOp(),
1036  numVecs);
1037  }
1038  else {
1039  KokkosRefactor::Details::unpack_array_multi_column_variable_stride(
1040  getKokkosView(),
1041  imports,
1042  importLIDs,
1043  getKokkosViewDeepCopy<execution_space>(whichVectors_ ()),
1044  KokkosRefactor::Details::AbsMaxOp(),
1045  numVecs);
1046  }
1047  }
1048  else {
1049  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1050  CM != ADD && CM != REPLACE && CM != INSERT && CM != ABSMAX,
1051  std::invalid_argument, ": Invalid CombineMode: " << CM << ". Valid "
1052  "CombineMode values are ADD, REPLACE, INSERT, and ABSMAX.");
1053  }
1054  }
1055  }
1056 
1057  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1058  size_t
1061  {
1062  if (isConstantStride ()) {
1063  return static_cast<size_t> (view_.dimension_1 ());
1064  } else {
1065  return static_cast<size_t> (whichVectors_.size ());
1066  }
1067  }
1068 
1069  namespace { // (anonymous)
1070 
1071  template<class RV, class XMV>
1072  void
1073  lclDotImpl (const RV& dotsOut,
1074  const XMV& X_lcl,
1075  const XMV& Y_lcl,
1076  const size_t lclNumRows,
1077  const size_t numVecs,
1078  const Teuchos::ArrayView<const size_t>& whichVecsX,
1079  const Teuchos::ArrayView<const size_t>& whichVecsY,
1080  const bool constantStrideX,
1081  const bool constantStrideY)
1082  {
1083  using Kokkos::ALL;
1084  using Kokkos::subview;
1085  typedef typename RV::non_const_value_type dot_type;
1086 #ifdef HAVE_TPETRA_DEBUG
1087  const char prefix[] = "Tpetra::MultiVector::lclDotImpl: ";
1088 #endif // HAVE_TPETRA_DEBUG
1089 
1090  static_assert (Kokkos::Impl::is_view<RV>::value,
1091  "Tpetra::MultiVector::lclDotImpl: "
1092  "The first argument dotsOut is not a Kokkos::View.");
1093  static_assert (RV::rank == 1, "Tpetra::MultiVector::lclDotImpl: "
1094  "The first argument dotsOut must have rank 1.");
1095  static_assert (Kokkos::Impl::is_view<XMV>::value,
1096  "Tpetra::MultiVector::lclDotImpl: The type of the 2nd and "
1097  "3rd arguments (X_lcl and Y_lcl) is not a Kokkos::View.");
1098  static_assert (XMV::rank == 2, "Tpetra::MultiVector::lclDotImpl: "
1099  "X_lcl and Y_lcl must have rank 2.");
1100 
1101  // In case the input dimensions don't match, make sure that we
1102  // don't overwrite memory that doesn't belong to us, by using
1103  // subset views with the minimum dimensions over all input.
1104  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
1105  const std::pair<size_t, size_t> colRng (0, numVecs);
1106  RV theDots = subview (dotsOut, colRng);
1107  XMV X = subview (X_lcl, rowRng, Kokkos::ALL());
1108  XMV Y = subview (Y_lcl, rowRng, Kokkos::ALL());
1109 
1110 #ifdef HAVE_TPETRA_DEBUG
1111  if (lclNumRows != 0) {
1112  TEUCHOS_TEST_FOR_EXCEPTION
1113  (X.dimension_0 () != lclNumRows, std::logic_error, prefix <<
1114  "X.dimension_0() = " << X.dimension_0 () << " != lclNumRows "
1115  "= " << lclNumRows << ". "
1116  "Please report this bug to the Tpetra developers.");
1117  TEUCHOS_TEST_FOR_EXCEPTION
1118  (Y.dimension_0 () != lclNumRows, std::logic_error, prefix <<
1119  "Y.dimension_0() = " << Y.dimension_0 () << " != lclNumRows "
1120  "= " << lclNumRows << ". "
1121  "Please report this bug to the Tpetra developers.");
1122  // If a MultiVector is constant stride, then numVecs should
1123  // equal its View's number of columns. Otherwise, numVecs
1124  // should be less than its View's number of columns.
1125  TEUCHOS_TEST_FOR_EXCEPTION
1126  (constantStrideX &&
1127  (X.dimension_0 () != lclNumRows || X.dimension_1 () != numVecs),
1128  std::logic_error, prefix << "X is " << X.dimension_0 () << " x " <<
1129  X.dimension_1 () << " (constant stride), which differs from the "
1130  "local dimensions " << lclNumRows << " x " << numVecs << ". "
1131  "Please report this bug to the Tpetra developers.");
1132  TEUCHOS_TEST_FOR_EXCEPTION
1133  (! constantStrideX &&
1134  (X.dimension_0 () != lclNumRows || X.dimension_1 () < numVecs),
1135  std::logic_error, prefix << "X is " << X.dimension_0 () << " x " <<
1136  X.dimension_1 () << " (NOT constant stride), but the local "
1137  "dimensions are " << lclNumRows << " x " << numVecs << ". "
1138  "Please report this bug to the Tpetra developers.");
1139  TEUCHOS_TEST_FOR_EXCEPTION
1140  (constantStrideY &&
1141  (Y.dimension_0 () != lclNumRows || Y.dimension_1 () != numVecs),
1142  std::logic_error, prefix << "Y is " << Y.dimension_0 () << " x " <<
1143  Y.dimension_1 () << " (constant stride), which differs from the "
1144  "local dimensions " << lclNumRows << " x " << numVecs << ". "
1145  "Please report this bug to the Tpetra developers.");
1146  TEUCHOS_TEST_FOR_EXCEPTION
1147  (! constantStrideY &&
1148  (Y.dimension_0 () != lclNumRows || Y.dimension_1 () < numVecs),
1149  std::logic_error, prefix << "Y is " << Y.dimension_0 () << " x " <<
1150  Y.dimension_1 () << " (NOT constant stride), but the local "
1151  "dimensions are " << lclNumRows << " x " << numVecs << ". "
1152  "Please report this bug to the Tpetra developers.");
1153  }
1154 #endif // HAVE_TPETRA_DEBUG
1155 
1156  if (lclNumRows == 0) {
1157  const dot_type zero = Kokkos::Details::ArithTraits<dot_type>::zero ();
1158  Kokkos::deep_copy(theDots, zero);
1159  }
1160  else { // lclNumRows != 0
1161  if (constantStrideX && constantStrideY) {
1162  if(X.dimension_1() == 1) {
1163  typename RV::non_const_value_type result =
1164  KokkosBlas::dot (Kokkos::subview(X,Kokkos::ALL(),0),
1165  Kokkos::subview(Y,Kokkos::ALL(),0));
1166  Kokkos::deep_copy(theDots,result);
1167  } else
1168  KokkosBlas::dot (theDots, X, Y);
1169  }
1170  else { // not constant stride
1171  // NOTE (mfh 15 Jul 2014) This does a kernel launch for
1172  // every column. It might be better to have a kernel that
1173  // does the work all at once. On the other hand, we don't
1174  // prioritize performance of MultiVector views of
1175  // noncontiguous columns.
1176  for (size_t k = 0; k < numVecs; ++k) {
1177  const size_t X_col = constantStrideX ? k : whichVecsX[k];
1178  const size_t Y_col = constantStrideY ? k : whichVecsY[k];
1179  KokkosBlas::dot (subview (theDots, k), subview (X, ALL (), X_col),
1180  subview (Y, ALL (), Y_col));
1181  } // for each column
1182  } // constantStride
1183  } // lclNumRows != 0
1184  }
1185 
1186  template<class RV>
1187  void
1188  gblDotImpl (const RV& dotsOut,
1189  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
1190  const bool distributed)
1191  {
1192  using Teuchos::REDUCE_MAX;
1193  using Teuchos::REDUCE_SUM;
1194  using Teuchos::reduceAll;
1195  typedef typename RV::non_const_value_type dot_type;
1196 
1197  const size_t numVecs = dotsOut.dimension_0 ();
1198 
1199  // If the MultiVector is distributed over multiple processes, do
1200  // the distributed (interprocess) part of the dot product. We
1201  // assume that the MPI implementation can read from and write to
1202  // device memory.
1203  //
1204  // replaceMap() may have removed some processes. Those
1205  // processes have a null Map. They must not participate in any
1206  // collective operations. We ask first whether the Map is null,
1207  // because isDistributed() defers that question to the Map. We
1208  // still compute and return local dots for processes not
1209  // participating in collective operations; those probably don't
1210  // make any sense, but it doesn't hurt to do them, since it's
1211  // illegal to call dot() on those processes anyway.
1212  if (distributed && ! comm.is_null ()) {
1213  // The calling process only participates in the collective if
1214  // both the Map and its Comm on that process are nonnull.
1215  //
1216  // MPI doesn't allow aliasing of arguments, so we have to make
1217  // a copy of the local sum.
1218  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
1219  Kokkos::deep_copy (lclDots, dotsOut);
1220  const dot_type* const lclSum = lclDots.ptr_on_device ();
1221  dot_type* const gblSum = dotsOut.ptr_on_device ();
1222  const int nv = static_cast<int> (numVecs);
1223  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
1224  }
1225  }
1226  } // namespace (anonymous)
1227 
1228  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1229  void
1232  const Kokkos::View<dot_type*, device_type>& dots) const
1233  {
1234  using Kokkos::create_mirror_view;
1235  using Kokkos::subview;
1236  using Teuchos::Comm;
1237  using Teuchos::null;
1238  using Teuchos::RCP;
1239  // View of all the dot product results.
1240  typedef Kokkos::View<dot_type*, device_type> RV;
1241  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
1242 
1243  const size_t numVecs = this->getNumVectors ();
1244  if (numVecs == 0) {
1245  return; // nothing to do
1246  }
1247  const size_t lclNumRows = this->getLocalLength ();
1248  const size_t numDots = static_cast<size_t> (dots.dimension_0 ());
1249 
1250 #ifdef HAVE_TPETRA_DEBUG
1251  {
1252  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
1253  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1254  ! compat, std::invalid_argument, "Tpetra::MultiVector::dot: *this is "
1255  "not compatible with the input MultiVector A. We only test for this "
1256  "in a debug build.");
1257  }
1258 #endif // HAVE_TPETRA_DEBUG
1259 
1260  // FIXME (mfh 11 Jul 2014) These exception tests may not
1261  // necessarily be thrown on all processes consistently. We should
1262  // instead pass along error state with the inner product. We
1263  // could do this by setting an extra slot to
1264  // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
1265  // final sum should be
1266  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
1267  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1268  lclNumRows != A.getLocalLength (), std::runtime_error,
1269  "MultiVectors do not have the same local length. "
1270  "this->getLocalLength() = " << lclNumRows << " != "
1271  "A.getLocalLength() = " << A.getLocalLength () << ".");
1272  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1273  numVecs != A.getNumVectors (), std::runtime_error,
1274  "MultiVectors must have the same number of columns (vectors). "
1275  "this->getNumVectors() = " << numVecs << " != "
1276  "A.getNumVectors() = " << A.getNumVectors () << ".");
1277  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1278  numDots != numVecs, std::runtime_error,
1279  "The output array 'dots' must have the same number of entries as the "
1280  "number of columns (vectors) in *this and A. dots.dimension_0() = " <<
1281  numDots << " != this->getNumVectors() = " << numVecs << ".");
1282 
1283  const std::pair<size_t, size_t> colRng (0, numVecs);
1284  RV dotsOut = subview (dots, colRng);
1285  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
1286  this->getMap ()->getComm ();
1287 
1288  // FIXME (mfh 05 Mar 2015) DualView flags are not indicative when
1289  // the two memory spaces are the same, so we check the latter.
1290  const bool oneMemorySpace =
1291  Kokkos::Impl::is_same<typename dual_view_type::t_dev::memory_space,
1292  typename dual_view_type::t_host::memory_space>::value;
1293  if (! oneMemorySpace && A.view_.modified_host() > A.view_.modified_device()) {
1294  // A was last modified on host, so run the local kernel there.
1295  // This means we need a host mirror of the array of norms too.
1296  typedef typename dual_view_type::t_host XMV;
1297  // I consider it more polite to sync *this, then to sync A.
1298  // A is a "guest" of this method, and is passed in const.
1299  this->view_.template sync<typename XMV::memory_space> ();
1300  lclDotImpl<RV, XMV> (dotsOut, view_.h_view, A.view_.h_view,
1301  lclNumRows, numVecs,
1302  this->whichVectors_, A.whichVectors_,
1303  this->isConstantStride (), A.isConstantStride ());
1304  typename RV::HostMirror dotsOutHost = create_mirror_view (dotsOut);
1305  Kokkos::deep_copy (dotsOutHost, dotsOut);
1306  gblDotImpl<typename RV::HostMirror> (dotsOutHost, comm,
1307  this->isDistributed ());
1308  Kokkos::deep_copy (dotsOut, dotsOutHost);
1309  }
1310  else {
1311  // A was last modified on device, so run the local kernel there.
1312  typedef typename dual_view_type::t_dev XMV;
1313  // I consider it more polite to sync *this, then to sync A.
1314  // A is a "guest" of this method, and is passed in const.
1315  this->view_.template sync<typename XMV::memory_space> ();
1316  lclDotImpl<RV, XMV> (dotsOut, view_.d_view, A.view_.d_view,
1317  lclNumRows, numVecs,
1318  this->whichVectors_, A.whichVectors_,
1319  this->isConstantStride (), A.isConstantStride ());
1320  gblDotImpl<RV> (dotsOut, comm, this->isDistributed ());
1321  }
1322  }
1323 
1324 
1325  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1326  void
1329  const Teuchos::ArrayView<dot_type>& dots) const
1330  {
1331  typedef Kokkos::View<dot_type*, device_type> dev_dots_view_type;
1332  typedef MakeUnmanagedView<dot_type, device_type> view_getter_type;
1333  typedef typename view_getter_type::view_type host_dots_view_type;
1334 
1335  const size_t numDots = static_cast<size_t> (dots.size ());
1336  host_dots_view_type dotsHostView (dots.getRawPtr (), numDots);
1337  dev_dots_view_type dotsDevView ("MV::dot tmp", numDots);
1338  this->dot (A, dotsDevView); // Do the computation on the device.
1339  Kokkos::deep_copy (dotsHostView, dotsDevView); // Bring back result to host
1340  }
1341 
1342 
1343  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1344  void
1346  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
1347  {
1348  typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
1349  typedef MakeUnmanagedView<mag_type, device_type> view_getter_type;
1350  typedef typename view_getter_type::view_type host_norms_view_type;
1351 
1352  const size_t numNorms = static_cast<size_t> (norms.size ());
1353  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
1354  dev_norms_view_type normsDevView ("MV::norm2 tmp", numNorms);
1355  this->norm2 (normsDevView); // Do the computation on the device.
1356  Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
1357  }
1358 
1359 
1360  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1361  void
1363  norm2 (const Kokkos::View<mag_type*, device_type>& norms) const
1364  {
1365  this->normImpl (norms, NORM_TWO);
1366  }
1367 
1368 
1369  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1370  void TPETRA_DEPRECATED
1373  const Teuchos::ArrayView<mag_type> &norms) const
1374  {
1375  using Kokkos::ALL;
1376  using Kokkos::subview;
1377  using Teuchos::Comm;
1378  using Teuchos::null;
1379  using Teuchos::RCP;
1380  using Teuchos::reduceAll;
1381  using Teuchos::REDUCE_SUM;
1382  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
1383  typedef Kokkos::Details::ArithTraits<mag_type> ATM;
1384  typedef Kokkos::View<mag_type*, device_type> norms_view_type;
1385  const char tfecfFuncName[] = "normWeighted: ";
1386 
1387  const size_t numVecs = this->getNumVectors ();
1388  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1389  static_cast<size_t> (norms.size ()) != numVecs, std::runtime_error,
1390  "norms.size() = " << norms.size () << " != this->getNumVectors() = "
1391  << numVecs << ".");
1392 
1393  const bool OneW = (weights.getNumVectors () == 1);
1394  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1395  ! OneW && weights.getNumVectors () != numVecs, std::runtime_error,
1396  "The input MultiVector of weights must contain either one column, "
1397  "or must have the same number of columns as *this. "
1398  "weights.getNumVectors() = " << weights.getNumVectors ()
1399  << " and this->getNumVectors() = " << numVecs << ".");
1400 
1401 #ifdef HAVE_TPETRA_DEBUG
1402  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1403  ! this->getMap ()->isCompatible (*weights.getMap ()), std::runtime_error,
1404  "MultiVectors do not have compatible Maps:" << std::endl
1405  << "this->getMap(): " << std::endl << *this->getMap()
1406  << "weights.getMap(): " << std::endl << *weights.getMap() << std::endl);
1407 #else
1408  const size_t lclNumRows = this->getLocalLength ();
1409  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1410  lclNumRows != weights.getLocalLength (), std::runtime_error,
1411  "MultiVectors do not have the same local length.");
1412 #endif // HAVE_TPETRA_DEBUG
1413 
1414  norms_view_type lclNrms ("lclNrms", numVecs);
1415 
1416  view_.template sync<device_type> ();
1417  weights.view_.template sync<device_type> ();
1418 
1419  typename dual_view_type::t_dev X_lcl = this->view_.d_view;
1420  typename dual_view_type::t_dev W_lcl = weights.view_.d_view;
1421 
1422  if (isConstantStride () && ! OneW) {
1423  KokkosBlas::nrm2w_squared (lclNrms, X_lcl, W_lcl);
1424  }
1425  else {
1426  for (size_t j = 0; j < numVecs; ++j) {
1427  const size_t X_col = this->isConstantStride () ? j :
1428  this->whichVectors_[j];
1429  const size_t W_col = OneW ? static_cast<size_t> (0) :
1430  (weights.isConstantStride () ? j : weights.whichVectors_[j]);
1431  KokkosBlas::nrm2w_squared (subview (lclNrms, j),
1432  subview (X_lcl, ALL (), X_col),
1433  subview (W_lcl, ALL (), W_col));
1434  }
1435  }
1436 
1437  const mag_type OneOverN =
1438  ATM::one () / static_cast<mag_type> (this->getGlobalLength ());
1439  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
1440  Teuchos::null : this->getMap ()->getComm ();
1441 
1442  if (! comm.is_null () && this->isDistributed ()) {
1443  // Assume that MPI can access device memory.
1444  reduceAll<int, mag_type> (*comm, REDUCE_SUM, static_cast<int> (numVecs),
1445  lclNrms.ptr_on_device (), norms.getRawPtr ());
1446  for (size_t k = 0; k < numVecs; ++k) {
1447  norms[k] = ATM::sqrt (norms[k] * OneOverN);
1448  }
1449  }
1450  else {
1451  typename norms_view_type::HostMirror lclNrms_h =
1452  Kokkos::create_mirror_view (lclNrms);
1453  Kokkos::deep_copy (lclNrms_h, lclNrms);
1454  for (size_t k = 0; k < numVecs; ++k) {
1455  norms[k] = ATM::sqrt (ATS::magnitude (lclNrms_h(k)) * OneOverN);
1456  }
1457  }
1458  }
1459 
1460 
1461  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1462  void
1464  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
1465  {
1466  typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
1467  typedef MakeUnmanagedView<mag_type, device_type> view_getter_type;
1468  typedef typename view_getter_type::view_type host_norms_view_type;
1469 
1470  const size_t numNorms = static_cast<size_t> (norms.size ());
1471  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
1472  dev_norms_view_type normsDevView ("MV::norm1 tmp", numNorms);
1473  this->norm1 (normsDevView); // Do the computation on the device.
1474  Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
1475  }
1476 
1477 
1478  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1479  void
1481  norm1 (const Kokkos::View<mag_type*, device_type>& norms) const
1482  {
1483  this->normImpl (norms, NORM_ONE);
1484  }
1485 
1486 
1487  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1488  void
1490  normInf (const Teuchos::ArrayView<mag_type>& norms) const
1491  {
1492  typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
1493  typedef MakeUnmanagedView<mag_type, device_type> view_getter_type;
1494  typedef typename view_getter_type::view_type host_norms_view_type;
1495 
1496  const size_t numNorms = static_cast<size_t> (norms.size ());
1497  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
1498  dev_norms_view_type normsDevView ("MV::normInf tmp", numNorms);
1499  this->normInf (normsDevView); // Do the computation on the device.
1500  Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
1501  }
1502 
1503 
1504  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1505  void
1507  normInf (const Kokkos::View<mag_type*, device_type>& norms) const
1508  {
1509  this->normImpl (norms, NORM_INF);
1510  }
1511 
1512  namespace { // (anonymous)
1513 
1516  IMPL_NORM_ONE, //<! Use the one-norm
1517  IMPL_NORM_TWO, //<! Use the two-norm
1518  IMPL_NORM_INF //<! Use the infinity-norm
1519  };
1520 
1521  template<class RV, class XMV>
1522  void
1523  lclNormImpl (const RV& normsOut,
1524  const XMV& X_lcl,
1525  const size_t lclNumRows,
1526  const size_t numVecs,
1527  const Teuchos::ArrayView<const size_t>& whichVecs,
1528  const bool constantStride,
1529  const EWhichNormImpl whichNorm)
1530  {
1531  using Kokkos::ALL;
1532  using Kokkos::subview;
1533  typedef typename RV::non_const_value_type mag_type;
1534 
1535  static_assert (Kokkos::Impl::is_view<RV>::value,
1536  "Tpetra::MultiVector::lclNormImpl: "
1537  "The first argument RV is not a Kokkos::View.");
1538  static_assert (RV::rank == 1, "Tpetra::MultiVector::lclNormImpl: "
1539  "The first argument normsOut must have rank 1.");
1540  static_assert (Kokkos::Impl::is_view<XMV>::value,
1541  "Tpetra::MultiVector::lclNormImpl: "
1542  "The second argument X_lcl is not a Kokkos::View.");
1543  static_assert (XMV::rank == 2, "Tpetra::MultiVector::lclNormImpl: "
1544  "The second argument X_lcl must have rank 2.");
1545 
1546  // In case the input dimensions don't match, make sure that we
1547  // don't overwrite memory that doesn't belong to us, by using
1548  // subset views with the minimum dimensions over all input.
1549  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
1550  const std::pair<size_t, size_t> colRng (0, numVecs);
1551  RV theNorms = subview (normsOut, colRng);
1552  XMV X = subview (X_lcl, rowRng, Kokkos::ALL());
1553 
1554  // mfh 10 Mar 2015: Kokkos::(Dual)View subviews don't quite
1555  // behave how you think when they have zero rows. In that case,
1556  // it returns a 0 x 0 (Dual)View.
1557  TEUCHOS_TEST_FOR_EXCEPTION(
1558  lclNumRows != 0 && constantStride && ( \
1559  ( X.dimension_0 () != lclNumRows ) ||
1560  ( X.dimension_1 () != numVecs ) ),
1561  std::logic_error, "Constant Stride X's dimensions are " << X.dimension_0 () << " x "
1562  << X.dimension_1 () << ", which differ from the local dimensions "
1563  << lclNumRows << " x " << numVecs << ". Please report this bug to "
1564  "the Tpetra developers.");
1565 
1566  TEUCHOS_TEST_FOR_EXCEPTION(
1567  lclNumRows != 0 && !constantStride && ( \
1568  ( X.dimension_0 () != lclNumRows ) ||
1569  ( X.dimension_1 () < numVecs ) ),
1570  std::logic_error, "Strided X's dimensions are " << X.dimension_0 () << " x "
1571  << X.dimension_1 () << ", which are incompatible with the local dimensions "
1572  << lclNumRows << " x " << numVecs << ". Please report this bug to "
1573  "the Tpetra developers.");
1574 
1575  if (lclNumRows == 0) {
1576  const mag_type zeroMag = Kokkos::Details::ArithTraits<mag_type>::zero ();
1577  Kokkos::deep_copy(theNorms, zeroMag);
1578  }
1579  else { // lclNumRows != 0
1580  if (constantStride) {
1581  if (whichNorm == IMPL_NORM_INF) {
1582  KokkosBlas::nrmInf (theNorms, X);
1583  }
1584  else if (whichNorm == IMPL_NORM_ONE) {
1585  KokkosBlas::nrm1 (theNorms, X);
1586  }
1587  else if (whichNorm == IMPL_NORM_TWO) {
1588  KokkosBlas::nrm2_squared (theNorms, X);
1589  }
1590  else {
1591  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
1592  }
1593  }
1594  else { // not constant stride
1595  // NOTE (mfh 15 Jul 2014) This does a kernel launch for
1596  // every column. It might be better to have a kernel that
1597  // does the work all at once. On the other hand, we don't
1598  // prioritize performance of MultiVector views of
1599  // noncontiguous columns.
1600  for (size_t k = 0; k < numVecs; ++k) {
1601  const size_t X_col = constantStride ? k : whichVecs[k];
1602  if (whichNorm == IMPL_NORM_INF) {
1603  KokkosBlas::nrmInf (subview (theNorms, k),
1604  subview (X, ALL (), X_col));
1605  }
1606  else if (whichNorm == IMPL_NORM_ONE) {
1607  KokkosBlas::nrm1 (subview (theNorms, k),
1608  subview (X, ALL (), X_col));
1609  }
1610  else if (whichNorm == IMPL_NORM_TWO) {
1611  KokkosBlas::nrm2_squared (subview (theNorms, k),
1612  subview (X, ALL (), X_col));
1613  }
1614  else {
1615  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
1616  }
1617  } // for each column
1618  } // constantStride
1619  } // lclNumRows != 0
1620  }
1621 
1622  template<class RV>
1623  void
1624  gblNormImpl (const RV& normsOut,
1625  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
1626  const bool distributed,
1627  const EWhichNormImpl whichNorm)
1628  {
1629  using Teuchos::REDUCE_MAX;
1630  using Teuchos::REDUCE_SUM;
1631  using Teuchos::reduceAll;
1632  typedef typename RV::non_const_value_type mag_type;
1633 
1634  const size_t numVecs = normsOut.dimension_0 ();
1635 
1636  // If the MultiVector is distributed over multiple processes, do
1637  // the distributed (interprocess) part of the norm. We assume
1638  // that the MPI implementation can read from and write to device
1639  // memory.
1640  //
1641  // replaceMap() may have removed some processes. Those processes
1642  // have a null Map. They must not participate in any collective
1643  // operations. We ask first whether the Map is null, because
1644  // isDistributed() defers that question to the Map. We still
1645  // compute and return local norms for processes not participating
1646  // in collective operations; those probably don't make any sense,
1647  // but it doesn't hurt to do them, since it's illegal to call
1648  // norm*() on those processes anyway.
1649  if (distributed && ! comm.is_null ()) {
1650  // The calling process only participates in the collective if
1651  // both the Map and its Comm on that process are nonnull.
1652  //
1653  // MPI doesn't allow aliasing of arguments, so we have to make
1654  // a copy of the local sum.
1655  RV lclNorms ("MV::normImpl lcl", numVecs);
1656  Kokkos::deep_copy (lclNorms, normsOut);
1657  const mag_type* const lclSum = lclNorms.ptr_on_device ();
1658  mag_type* const gblSum = normsOut.ptr_on_device ();
1659  const int nv = static_cast<int> (numVecs);
1660  if (whichNorm == IMPL_NORM_INF) {
1661  reduceAll<int, mag_type> (*comm, REDUCE_MAX, nv, lclSum, gblSum);
1662  } else {
1663  reduceAll<int, mag_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
1664  }
1665  }
1666 
1667  if (whichNorm == IMPL_NORM_TWO) {
1668  // Replace the norm-squared results with their square roots in
1669  // place, to get the final output. If the device memory and
1670  // the host memory are the same, it probably doesn't pay to
1671  // launch a parallel kernel for that, since there isn't enough
1672  // parallelism for the typical MultiVector case.
1673  const bool inHostMemory =
1674  Kokkos::Impl::is_same<typename RV::memory_space,
1675  typename RV::host_mirror_space::memory_space>::value;
1676  if (inHostMemory) {
1677  for (size_t j = 0; j < numVecs; ++j) {
1678  normsOut(j) = Kokkos::Details::ArithTraits<mag_type>::sqrt (normsOut(j));
1679  }
1680  }
1681  else {
1682  // There's not as much parallelism now, but that's OK. The
1683  // point of doing parallel dispatch here is to keep the norm
1684  // results on the device, thus avoiding a copy to the host and
1685  // back again.
1686  KokkosBlas::Impl::SquareRootFunctor<RV> f (normsOut);
1687  Kokkos::parallel_for (numVecs, f);
1688  }
1689  }
1690  }
1691 
1692  } // namespace (anonymous)
1693 
1694  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1695  void
1697  normImpl (const Kokkos::View<mag_type*, device_type>& norms,
1698  const EWhichNorm whichNorm) const
1699  {
1700  using Kokkos::create_mirror_view;
1701  using Kokkos::subview;
1702  using Teuchos::Comm;
1703  using Teuchos::null;
1704  using Teuchos::RCP;
1705  // View of all the norm results.
1706  typedef Kokkos::View<mag_type*, device_type> RV;
1707 
1708  const size_t numVecs = this->getNumVectors ();
1709  if (numVecs == 0) {
1710  return; // nothing to do
1711  }
1712  const size_t lclNumRows = this->getLocalLength ();
1713  const size_t numNorms = static_cast<size_t> (norms.dimension_0 ());
1714  TEUCHOS_TEST_FOR_EXCEPTION(
1715  numNorms < numVecs, std::runtime_error, "Tpetra::MultiVector::normImpl: "
1716  "'norms' must have at least as many entries as the number of vectors in "
1717  "*this. norms.dimension_0() = " << numNorms << " < this->getNumVectors()"
1718  " = " << numVecs << ".");
1719 
1720  const std::pair<size_t, size_t> colRng (0, numVecs);
1721  RV normsOut = subview (norms, colRng);
1722 
1723  EWhichNormImpl lclNormType;
1724  if (whichNorm == NORM_ONE) {
1725  lclNormType = IMPL_NORM_ONE;
1726  } else if (whichNorm == NORM_TWO) {
1727  lclNormType = IMPL_NORM_TWO;
1728  } else {
1729  lclNormType = IMPL_NORM_INF;
1730  }
1731 
1732  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
1733  this->getMap ()->getComm ();
1734 
1735  // FIXME (mfh 05 Mar 2015) DualView flags are not indicative when
1736  // the two memory spaces are the same, so we check the latter.
1737  const bool oneMemorySpace =
1738  Kokkos::Impl::is_same<typename dual_view_type::t_dev::memory_space,
1739  typename dual_view_type::t_host::memory_space>::value;
1740  if (! oneMemorySpace && view_.modified_host() > view_.modified_device()) {
1741  // DualView was last modified on host, so run the local kernel there.
1742  // This means we need a host mirror of the array of norms too.
1743  typedef typename dual_view_type::t_host XMV;
1744  lclNormImpl<RV, XMV> (normsOut, view_.h_view, lclNumRows, numVecs,
1745  this->whichVectors_, this->isConstantStride (),
1746  lclNormType);
1747  typename RV::HostMirror normsOutHost = create_mirror_view (normsOut);
1748  Kokkos::deep_copy (normsOutHost, normsOut);
1749  gblNormImpl<typename RV::HostMirror> (normsOutHost, comm,
1750  this->isDistributed (),
1751  lclNormType);
1752  Kokkos::deep_copy (normsOut, normsOutHost);
1753  }
1754  else {
1755  // DualView was last modified on device, so run the local kernel there.
1756  typedef typename dual_view_type::t_dev XMV;
1757  lclNormImpl<RV, XMV> (normsOut, view_.d_view, lclNumRows, numVecs,
1758  this->whichVectors_, this->isConstantStride (),
1759  lclNormType);
1760  gblNormImpl<RV> (normsOut, comm, this->isDistributed (), lclNormType);
1761  }
1762  }
1763 
1764  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1765  void
1767  meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
1768  {
1769  // KR FIXME Overload this method to take a View.
1770  using Kokkos::ALL;
1771  using Kokkos::subview;
1772  using Teuchos::Comm;
1773  using Teuchos::RCP;
1774  using Teuchos::reduceAll;
1775  using Teuchos::REDUCE_SUM;
1776  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
1777 
1778  const size_t lclNumRows = this->getLocalLength ();
1779  const size_t numVecs = this->getNumVectors ();
1780  const size_t numMeans = static_cast<size_t> (means.size ());
1781 
1782  TEUCHOS_TEST_FOR_EXCEPTION(
1783  numMeans != numVecs, std::runtime_error,
1784  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
1785  << " != this->getNumVectors() = " << numVecs << ".");
1786 
1787  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
1788  const std::pair<size_t, size_t> colRng (0, numVecs);
1789 
1790  // Make sure that the final output view has the same layout as the
1791  // temporary view's HostMirror. Left or Right doesn't matter for
1792  // a 1-D array anyway; this is just to placate the compiler.
1793  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
1794  typedef Kokkos::View<impl_scalar_type*,
1795  typename local_view_type::HostMirror::array_layout,
1796  Kokkos::HostSpace,
1797  Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
1798  host_local_view_type meansOut (means.getRawPtr (), numMeans);
1799 
1800  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
1801  this->getMap ()->getComm ();
1802 
1803  // FIXME (mfh 05 Mar 2015) DualView flags are not indicative when
1804  // the two memory spaces are the same, so we check the latter.
1805  const bool oneMemorySpace =
1806  Kokkos::Impl::is_same<typename dual_view_type::t_dev::memory_space,
1807  typename dual_view_type::t_host::memory_space>::value;
1808  if (! oneMemorySpace && view_.modified_host() > view_.modified_device()) {
1809  // DualView was last modified on host, so run the local kernel there.
1810  typename dual_view_type::t_host X_lcl =
1811  subview (this->view_.h_view, rowRng, Kokkos::ALL());
1812 
1813  // Compute the local sum of each column.
1814  typename local_view_type::HostMirror lclSums ("MV::meanValue tmp", numVecs);
1815  if (isConstantStride ()) {
1816  KokkosBlas::sum (lclSums, X_lcl);
1817  }
1818  else {
1819  for (size_t j = 0; j < numVecs; ++j) {
1820  const size_t col = whichVectors_[j];
1821  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
1822  }
1823  }
1824 
1825  // If there are multiple MPI processes, the all-reduce reads
1826  // from lclSums, and writes to meansOut. Otherwise, we just
1827  // copy lclSums into meansOut.
1828  if (! comm.is_null () && this->isDistributed ()) {
1829  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
1830  lclSums.ptr_on_device (), meansOut.ptr_on_device ());
1831  }
1832  else {
1833  Kokkos::deep_copy (meansOut, lclSums);
1834  }
1835  }
1836  else {
1837  // DualView was last modified on device, so run the local kernel there.
1838  typename dual_view_type::t_dev X_lcl =
1839  subview (this->view_.d_view, rowRng, Kokkos::ALL());
1840 
1841  // Compute the local sum of each column.
1842  local_view_type lclSums ("MV::meanValue tmp", numVecs);
1843  if (isConstantStride ()) {
1844  KokkosBlas::sum (lclSums, X_lcl);
1845  }
1846  else {
1847  for (size_t j = 0; j < numVecs; ++j) {
1848  const size_t col = whichVectors_[j];
1849  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
1850  }
1851  }
1852 
1853  // If there are multiple MPI processes, the all-reduce reads
1854  // from lclSums, and writes to meansOut. (We assume that MPI
1855  // can read device memory.) Otherwise, we just copy lclSums
1856  // into meansOut.
1857  if (! comm.is_null () && this->isDistributed ()) {
1858  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
1859  lclSums.ptr_on_device (), meansOut.ptr_on_device ());
1860  }
1861  else {
1862  Kokkos::deep_copy (meansOut, lclSums);
1863  }
1864  }
1865 
1866  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
1867  // to the magnitude type, since operator/ (std::complex<T>, int)
1868  // isn't necessarily defined.
1869  const impl_scalar_type OneOverN =
1870  ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
1871  for (size_t k = 0; k < numMeans; ++k) {
1872  meansOut(k) = meansOut(k) * OneOverN;
1873  }
1874  }
1875 
1876 
1877  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1878  void
1881  {
1882  using Kokkos::ALL;
1883  using Kokkos::subview;
1884  typedef impl_scalar_type IST;
1885  typedef Kokkos::Details::ArithTraits<IST> ATS;
1886  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
1887  typedef typename pool_type::generator_type generator_type;
1888 
1889  // Seed the pseudorandom number generator using the calling
1890  // process' rank. This helps decorrelate different process'
1891  // pseudorandom streams. It's not perfect but it's effective and
1892  // doesn't require MPI communication. The seed also includes bits
1893  // from the standard library's rand().
1894  //
1895  // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
1896  // The code below just makes a new seed each time.
1897 
1898  const uint64_t myRank =
1899  static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
1900  uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
1901  unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
1902 
1903  pool_type rand_pool (seed);
1904  const IST max = Kokkos::rand<generator_type, IST>::max ();
1905  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
1906 
1907  if (isConstantStride ()) {
1908  Kokkos::fill_random (view_.d_view, rand_pool, min, max);
1909  view_.template modify<device_type> ();
1910  }
1911  else {
1912  const size_t numVecs = getNumVectors ();
1913  view_.template sync<device_type> ();
1914  typedef Kokkos::View<IST*, device_type> view_type;
1915  for (size_t k = 0; k < numVecs; ++k) {
1916  const size_t col = whichVectors_[k];
1917  view_type X_k = subview (view_.d_view, ALL (), col);
1918  Kokkos::fill_random (X_k, rand_pool, min, max);
1919  }
1920  view_.template modify<device_type> ();
1921  }
1922  }
1923 
1924 
1925  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1926  void
1928  putScalar (const Scalar& alpha)
1929  {
1930  using Kokkos::ALL;
1931  using Kokkos::deep_copy;
1932  using Kokkos::subview;
1933  typedef typename dual_view_type::t_dev::device_type DMS;
1934  typedef typename dual_view_type::t_host::device_type HMS;
1935 
1936  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
1937  const size_t lclNumRows = getLocalLength ();
1938  const size_t numVecs = getNumVectors ();
1939  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
1940  const std::pair<size_t, size_t> colRng (0, numVecs);
1941 
1942  // Modify the most recently updated version of the data. This
1943  // avoids sync'ing, which could violate users' expectations.
1944  if (view_.modified_device() >= view_.modified_host()) {
1945  //
1946  // Last modified in device memory, so modify data there.
1947  //
1948  // Type of the device memory View of the MultiVector's data.
1949  typedef typename dual_view_type::t_dev mv_view_type;
1950  // Type of a View of a single column of the MultiVector's data.
1951  typedef Kokkos::View<impl_scalar_type*,
1952  typename mv_view_type::array_layout, DMS> vec_view_type;
1953 
1954  this->template modify<DMS> (); // we are about to modify on the device
1955  mv_view_type X =
1956  subview (this->getDualView ().template view<DMS> (),
1957  rowRng, Kokkos::ALL());
1958  if (numVecs == 1) {
1959  vec_view_type X_0 =
1960  subview (X, ALL (), static_cast<size_t> (0));
1961  deep_copy (X_0, theAlpha);
1962  }
1963  else if (isConstantStride ()) {
1964  deep_copy (X, theAlpha);
1965  }
1966  else {
1967  for (size_t k = 0; k < numVecs; ++k) {
1968  const size_t col = whichVectors_[k];
1969  vec_view_type X_k = subview (X, ALL (), col);
1970  deep_copy (X_k, theAlpha);
1971  }
1972  }
1973  }
1974  else { // last modified in host memory, so modify data there.
1975  typedef typename dual_view_type::t_host mv_view_type;
1976  typedef Kokkos::View<impl_scalar_type*,
1977  typename mv_view_type::array_layout, HMS> vec_view_type;
1978 
1979  this->template modify<HMS> (); // we are about to modify on the host
1980  mv_view_type X =
1981  subview (this->getDualView ().template view<HMS> (),
1982  rowRng, Kokkos::ALL());
1983  if (numVecs == 1) {
1984  vec_view_type X_0 =
1985  subview (X, ALL (), static_cast<size_t> (0));
1986  deep_copy (X_0, theAlpha);
1987  }
1988  else if (isConstantStride ()) {
1989  deep_copy (X, theAlpha);
1990  }
1991  else {
1992  for (size_t k = 0; k < numVecs; ++k) {
1993  const size_t col = whichVectors_[k];
1994  vec_view_type X_k = subview (X, ALL (), col);
1995  deep_copy (X_k, theAlpha);
1996  }
1997  }
1998  }
1999  }
2000 
2001 
2002  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2003  void
2005  replaceMap (const Teuchos::RCP<const map_type>& newMap)
2006  {
2007  using Teuchos::ArrayRCP;
2008  using Teuchos::Comm;
2009  using Teuchos::RCP;
2010 
2011  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2012  // it might work if the MV is a column view of another MV.
2013  // However, things might go wrong when restoring the original
2014  // Map, so we don't allow this case for now.
2015  TEUCHOS_TEST_FOR_EXCEPTION(
2016  ! this->isConstantStride (), std::logic_error,
2017  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2018  "if the MultiVector is a column view of another MultiVector (that is, if "
2019  "isConstantStride() == false).");
2020 
2021  // Case 1: current Map and new Map are both nonnull on this process.
2022  // Case 2: current Map is nonnull, new Map is null.
2023  // Case 3: current Map is null, new Map is nonnull.
2024  // Case 4: both Maps are null: forbidden.
2025  //
2026  // Case 1 means that we don't have to do anything on this process,
2027  // other than assign the new Map. (We always have to do that.)
2028  // It's an error for the user to supply a Map that requires
2029  // resizing in this case.
2030  //
2031  // Case 2 means that the calling process is in the current Map's
2032  // communicator, but will be excluded from the new Map's
2033  // communicator. We don't have to do anything on the calling
2034  // process; just leave whatever data it may have alone.
2035  //
2036  // Case 3 means that the calling process is excluded from the
2037  // current Map's communicator, but will be included in the new
2038  // Map's communicator. This means we need to (re)allocate the
2039  // local DualView if it does not have the right number of rows.
2040  // If the new number of rows is nonzero, we'll fill the newly
2041  // allocated local data with zeros, as befits a projection
2042  // operation.
2043  //
2044  // The typical use case for Case 3 is that the MultiVector was
2045  // first created with the Map with more processes, then that Map
2046  // was replaced with a Map with fewer processes, and finally the
2047  // original Map was restored on this call to replaceMap.
2048 
2049 #ifdef HAVE_TEUCHOS_DEBUG
2050  // mfh 28 Mar 2013: We can't check for compatibility across the
2051  // whole communicator, unless we know that the current and new
2052  // Maps are nonnull on _all_ participating processes.
2053  // TEUCHOS_TEST_FOR_EXCEPTION(
2054  // origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
2055  // std::invalid_argument, "Tpetra::MultiVector::project: "
2056  // "If the input Map's communicator is compatible (has the same number of "
2057  // "processes as) the current Map's communicator, then the two Maps must be "
2058  // "compatible. The replaceMap() method is not for data redistribution; "
2059  // "use Import or Export for that purpose.");
2060 
2061  // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
2062  // of the Map, in case the process counts don't match.
2063 #endif // HAVE_TEUCHOS_DEBUG
2064 
2065  if (this->getMap ().is_null ()) { // current Map is null
2066  // If this->getMap() is null, that means that this MultiVector
2067  // has already had replaceMap happen to it. In that case, just
2068  // reallocate the DualView with the right size.
2069 
2070  TEUCHOS_TEST_FOR_EXCEPTION(
2071  newMap.is_null (), std::invalid_argument,
2072  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2073  "This probably means that the input Map is incorrect.");
2074 
2075  // Case 3: current Map is null, new Map is nonnull.
2076  // Reallocate the DualView with the right dimensions.
2077  const size_t newNumRows = newMap->getNodeNumElements ();
2078  const size_t origNumRows = view_.dimension_0 ();
2079  const size_t numCols = this->getNumVectors ();
2080 
2081  if (origNumRows != newNumRows || view_.dimension_1 () != numCols) {
2082  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (newNumRows, numCols);
2083  }
2084  }
2085  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2086  // I am an excluded process. Reinitialize my data so that I
2087  // have 0 rows. Keep the number of columns as before.
2088  const size_t newNumRows = static_cast<size_t> (0);
2089  const size_t numCols = this->getNumVectors ();
2090  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (newNumRows, numCols);
2091  }
2092 
2093  this->map_ = newMap;
2094  }
2095 
2096  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2097  void
2099  scale (const Scalar& alpha)
2100  {
2101  using Kokkos::ALL;
2102  using Kokkos::subview;
2103 
2104  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2105  if (theAlpha == Kokkos::Details::ArithTraits<impl_scalar_type>::one ()) {
2106  return; // do nothing
2107  }
2108  const size_t lclNumRows = this->getLocalLength ();
2109  const size_t numVecs = this->getNumVectors ();
2110  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2111  const std::pair<size_t, size_t> colRng (0, numVecs);
2112 
2113  typedef typename dual_view_type::t_dev dev_view_type;
2114  typedef typename dual_view_type::t_host host_view_type;
2115 
2116  // We can't substitute putScalar(0.0) for scale(0.0), because the
2117  // former will overwrite NaNs present in the MultiVector. The
2118  // semantics of this call require multiplying them by 0, which
2119  // IEEE 754 requires to be NaN.
2120 
2121  // FIXME (mfh 05 Mar 2015) DualView flags are not indicative when
2122  // the two memory spaces are the same, so we check the latter.
2123  const bool oneMemorySpace =
2124  Kokkos::Impl::is_same<typename dev_view_type::memory_space,
2125  typename host_view_type::memory_space>::value;
2126  if (! oneMemorySpace && view_.modified_host() > view_.modified_device()) {
2127  auto Y_lcl = subview (this->view_.h_view, rowRng, Kokkos::ALL());
2128 
2129  if (isConstantStride ()) {
2130  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2131  }
2132  else {
2133  for (size_t k = 0; k < numVecs; ++k) {
2134  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2135  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2136  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2137  }
2138  }
2139  }
2140  else { // work on device
2141  auto Y_lcl = subview (this->view_.d_view, rowRng, Kokkos::ALL());
2142 
2143  if (isConstantStride ()) {
2144  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2145  }
2146  else {
2147  for (size_t k = 0; k < numVecs; ++k) {
2148  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2149  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2150  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2151  }
2152  }
2153  }
2154  }
2155 
2156 
2157  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2158  void
2160  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2161  {
2162  const size_t numVecs = this->getNumVectors ();
2163  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2164  TEUCHOS_TEST_FOR_EXCEPTION(
2165  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2166  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2167  << numVecs << ".");
2168 
2169  // Use a DualView to copy the scaling constants onto the device.
2170  typedef Kokkos::DualView<impl_scalar_type*, device_type> k_alphas_type ;
2171  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2172  k_alphas.template modify<typename k_alphas_type::host_mirror_space> ();
2173  for (size_t i = 0; i < numAlphas; ++i) {
2174  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2175  }
2176  k_alphas.template sync<typename k_alphas_type::memory_space> ();
2177  // Invoke the scale() overload that takes a device View of coefficients.
2178  this->scale (k_alphas.d_view);
2179  }
2180 
2181  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2182  void
2184  scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2185  {
2186  using Kokkos::ALL;
2187  using Kokkos::subview;
2188 
2189  const size_t lclNumRows = this->getLocalLength ();
2190  const size_t numVecs = this->getNumVectors ();
2191  TEUCHOS_TEST_FOR_EXCEPTION(
2192  static_cast<size_t> (alphas.dimension_0 ()) != numVecs,
2193  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2194  "alphas.dimension_0() = " << alphas.dimension_0 ()
2195  << " != this->getNumVectors () = " << numVecs << ".");
2196  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2197  const std::pair<size_t, size_t> colRng (0, numVecs);
2198 
2199  typedef typename dual_view_type::t_dev dev_view_type;
2200  typedef typename dual_view_type::t_host host_view_type;
2201  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2202  // type of the return value of subview. This is because if we
2203  // switch the array layout from LayoutLeft to LayoutRight
2204  // (preferred for performance of block operations), the types
2205  // below won't be valid. (A view of a column of a LayoutRight
2206  // multivector has LayoutStride, not LayoutLeft.)
2207 
2208  const bool oneMemorySpace =
2209  Kokkos::Impl::is_same<typename dev_view_type::memory_space,
2210  typename host_view_type::memory_space>::value;
2211  if (! oneMemorySpace &&
2212  this->view_.modified_host() > this->view_.modified_device()) {
2213  // Work in host memory. This means we need to create a host
2214  // mirror of the input View of coefficients.
2215  typedef Kokkos::View<const impl_scalar_type*,
2216  execution_space> input_view_type;
2217  typename input_view_type::HostMirror alphas_h =
2218  Kokkos::create_mirror_view (alphas);
2219  Kokkos::deep_copy (alphas_h, alphas);
2220 
2221  auto Y_lcl = subview (this->view_.h_view, rowRng, Kokkos::ALL());
2222 
2223  if (isConstantStride ()) {
2224  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2225  }
2226  else {
2227  for (size_t k = 0; k < numVecs; ++k) {
2228  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2229  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2230  // We don't have to use the entire 1-D View here; we can use
2231  // the version that takes a scalar coefficient.
2232  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2233  }
2234  }
2235  }
2236  else { // Work in device memory, using the input View 'alphas' directly.
2237  auto Y_lcl = subview (this->view_.d_view, rowRng, Kokkos::ALL());
2238 
2239  if (isConstantStride ()) {
2240  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2241  }
2242  else {
2243  for (size_t k = 0; k < numVecs; ++k) {
2244  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2245  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2246  //
2247  // FIXME (mfh 08 Apr 2015) This assumes UVM. It would be
2248  // better to fix scal() so that it takes a 0-D View as the
2249  // second argument.
2250  //
2251  KokkosBlas::scal (Y_k, alphas(k), Y_k);
2252  }
2253  }
2254  }
2255  }
2256 
2257  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2258  void
2260  scale (const Scalar& alpha,
2262  {
2263  using Kokkos::ALL;
2264  using Kokkos::subview;
2265  const char tfecfFuncName[] = "scale: ";
2266 
2267  const size_t lclNumRows = getLocalLength ();
2268  const size_t numVecs = getNumVectors ();
2269 
2270  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2271  lclNumRows != A.getLocalLength (), std::invalid_argument,
2272  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2273  << A.getLocalLength () << ".");
2274  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2275  numVecs != A.getNumVectors (), std::invalid_argument,
2276  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2277  << A.getNumVectors () << ".");
2278 
2279  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2280  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2281  const std::pair<size_t, size_t> colRng (0, numVecs);
2282 
2283  typedef typename dual_view_type::t_dev dev_view_type;
2284  typedef typename dual_view_type::t_host host_view_type;
2285 
2286  // FIXME (mfh 05 Mar 2015) DualView flags are not indicative when
2287  // the two memory spaces are the same, so we check the latter.
2288  const bool oneMemorySpace =
2289  Kokkos::Impl::is_same<typename dev_view_type::memory_space,
2290  typename host_view_type::memory_space>::value;
2291  if (! oneMemorySpace && A.view_.modified_host() > A.view_.modified_device()) {
2292  // Work on host, where A's data were most recently modified. A
2293  // is a "guest" of this method, so it's more polite to sync
2294  // *this, than to sync A.
2295  this->view_.template sync<typename host_view_type::memory_space> ();
2296  this->view_.template modify<typename host_view_type::memory_space> ();
2297  auto Y_lcl = subview (this->view_.h_view, rowRng, Kokkos::ALL());
2298  auto X_lcl = subview (A.view_.h_view, rowRng, Kokkos::ALL());
2299 
2300  if (isConstantStride () && A.isConstantStride ()) {
2301  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2302  }
2303  else {
2304  // Make sure that Kokkos only uses the local length for add.
2305  for (size_t k = 0; k < numVecs; ++k) {
2306  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2307  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2308  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2309  auto X_k = subview (X_lcl, ALL (), X_col);
2310 
2311  KokkosBlas::scal (Y_k, theAlpha, X_k);
2312  }
2313  }
2314  }
2315  else { // work on device
2316  // A is a "guest" of this method, so it's more polite to sync
2317  // *this, than to sync A.
2318  this->view_.template sync<typename dev_view_type::memory_space> ();
2319  this->view_.template modify<typename dev_view_type::memory_space> ();
2320  auto Y_lcl = subview (this->view_.d_view, rowRng, Kokkos::ALL());
2321  auto X_lcl = subview (A.view_.d_view, rowRng, Kokkos::ALL());
2322 
2323  if (isConstantStride () && A.isConstantStride ()) {
2324  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2325  }
2326  else {
2327  // Make sure that Kokkos only uses the local length for add.
2328  for (size_t k = 0; k < numVecs; ++k) {
2329  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2330  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2331  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2332  auto X_k = subview (X_lcl, ALL (), X_col);
2333 
2334  KokkosBlas::scal (Y_k, theAlpha, X_k);
2335  }
2336  }
2337  }
2338  }
2339 
2340 
2341  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2342  void
2345  {
2346  const char tfecfFuncName[] = "reciprocal";
2347 
2348  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2349  getLocalLength () != A.getLocalLength (), std::runtime_error,
2350  ": MultiVectors do not have the same local length. "
2351  "this->getLocalLength() = " << getLocalLength ()
2352  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2353  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2354  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2355  ": MultiVectors do not have the same number of columns (vectors). "
2356  "this->getNumVectors() = " << getNumVectors ()
2357  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2358 
2359  // FIXME (mfh 07 Jan 2015) See note on two-argument scale() above.
2360 
2361  const size_t numVecs = getNumVectors ();
2362  try {
2363  if (isConstantStride () && A.isConstantStride ()) {
2364  view_.template sync<device_type> ();
2365  view_.template modify<device_type> ();
2366  KokkosBlas::reciprocal (view_.d_view, A.view_.d_view);
2367  }
2368  else {
2369  using Kokkos::ALL;
2370  using Kokkos::subview;
2371  typedef Kokkos::View<impl_scalar_type*, device_type> view_type;
2372 
2373  view_.template sync<device_type> ();
2374  view_.template modify<device_type> ();
2375 
2376  // FIXME (mfh 23 Jul 2014) I'm not sure if it should be our
2377  // responsibility to sync A.
2378  A.view_.template sync<device_type> ();
2379  A.view_.template modify<device_type> ();
2380 
2381  for (size_t k = 0; k < numVecs; ++k) {
2382  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2383  view_type vector_k = subview (view_.d_view, ALL (), this_col);
2384  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2385  view_type vector_Ak = subview (A.view_.d_view, ALL (), A_col);
2386  KokkosBlas::reciprocal(vector_k, vector_Ak);
2387  }
2388  }
2389  }
2390  catch (std::runtime_error &e) {
2391  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
2392  ": Caught exception from Kokkos: " << e.what () << std::endl);
2393  }
2394  }
2395 
2396 
2397  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2398  void
2401  {
2402  const char tfecfFuncName[] = "abs";
2403  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2404  getLocalLength () != A.getLocalLength (), std::runtime_error,
2405  ": MultiVectors do not have the same local length. "
2406  "this->getLocalLength() = " << getLocalLength ()
2407  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2408  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2409  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2410  ": MultiVectors do not have the same number of columns (vectors). "
2411  "this->getNumVectors() = " << getNumVectors ()
2412  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2413 
2414  // FIXME (mfh 07 Jan 2015) See note on two-argument scale() above.
2415 
2416  const size_t numVecs = getNumVectors ();
2417  if (isConstantStride () && A.isConstantStride ()) {
2418  view_.template sync<device_type>();
2419  view_.template modify<device_type>();
2420  KokkosBlas::abs (view_.d_view, A.view_.d_view);
2421  }
2422  else {
2423  using Kokkos::ALL;
2424  using Kokkos::subview;
2425  typedef Kokkos::View<impl_scalar_type*, device_type> view_type;
2426 
2427  view_.template sync<device_type> ();
2428  view_.template modify<device_type> ();
2429  A.view_.template sync<device_type> ();
2430  A.view_.template modify<device_type> ();
2431 
2432  for (size_t k=0; k < numVecs; ++k) {
2433  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2434  view_type vector_k = subview (view_.d_view, ALL (), this_col);
2435  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2436  view_type vector_Ak = subview (A.view_.d_view, ALL (), A_col);
2437  KokkosBlas::abs (vector_k, vector_Ak);
2438  }
2439  }
2440  }
2441 
2442 
2443  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2444  void
2446  update (const Scalar& alpha,
2448  const Scalar& beta)
2449  {
2450  using Kokkos::ALL;
2451  using Kokkos::subview;
2452  const char tfecfFuncName[] = "update: ";
2453 
2454  const size_t lclNumRows = getLocalLength ();
2455  const size_t numVecs = getNumVectors ();
2456 
2457  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2458  lclNumRows != A.getLocalLength (), std::invalid_argument,
2459  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2460  << A.getLocalLength () << ".");
2461  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2462  numVecs != A.getNumVectors (), std::invalid_argument,
2463  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2464  << A.getNumVectors () << ".");
2465 
2466  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2467  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2468  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2469  const std::pair<size_t, size_t> colRng (0, numVecs);
2470 
2471  typedef typename dual_view_type::t_dev dev_view_type;
2472  typedef typename dual_view_type::t_host host_view_type;
2473 
2474  // FIXME (mfh 05 Mar 2015) DualView flags are not indicative when
2475  // the two memory spaces are the same, so we check the latter.
2476  const bool oneMemorySpace =
2477  Kokkos::Impl::is_same<typename dev_view_type::memory_space,
2478  typename host_view_type::memory_space>::value;
2479  if (! oneMemorySpace && A.view_.modified_host() > A.view_.modified_device()) {
2480  // Work on host, where A's data were most recently modified. A
2481  // is a "guest" of this method, so it's more polite to sync
2482  // *this, than to sync A.
2483  this->view_.template sync<typename host_view_type::memory_space> ();
2484  this->view_.template modify<typename host_view_type::memory_space> ();
2485  auto Y_lcl = subview (this->view_.h_view, rowRng, Kokkos::ALL());
2486  auto X_lcl = subview (A.view_.h_view, rowRng, Kokkos::ALL());
2487 
2488  if (isConstantStride () && A.isConstantStride ()) {
2489  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
2490  }
2491  else {
2492  // Make sure that Kokkos only uses the local length for add.
2493  for (size_t k = 0; k < numVecs; ++k) {
2494  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2495  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2496  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2497  auto X_k = subview (X_lcl, ALL (), X_col);
2498 
2499  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
2500  }
2501  }
2502  }
2503  else { // work on device
2504  // A is a "guest" of this method, so it's more polite to sync
2505  // *this, than to sync A.
2506  this->view_.template sync<typename dev_view_type::memory_space> ();
2507  this->view_.template modify<typename dev_view_type::memory_space> ();
2508  auto Y_lcl = subview (this->view_.d_view, rowRng, Kokkos::ALL());
2509  auto X_lcl = subview (A.view_.d_view, rowRng, Kokkos::ALL());
2510 
2511  if (isConstantStride () && A.isConstantStride ()) {
2512  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
2513  }
2514  else {
2515  // Make sure that Kokkos only uses the local length for add.
2516  for (size_t k = 0; k < numVecs; ++k) {
2517  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2518  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2519  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2520  auto X_k = subview (X_lcl, ALL (), X_col);
2521 
2522  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
2523  }
2524  }
2525  }
2526  }
2527 
2528  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2529  void
2531  update (const Scalar& alpha,
2533  const Scalar& beta,
2535  const Scalar& gamma)
2536  {
2537  using Kokkos::ALL;
2538  using Kokkos::subview;
2539  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
2540 
2541  const size_t lclNumRows = this->getLocalLength ();
2542  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2543  lclNumRows != A.getLocalLength (), std::invalid_argument,
2544  "The input MultiVector A has " << A.getLocalLength () << " local "
2545  "row(s), but this MultiVector has " << lclNumRows << " local "
2546  "row(s).");
2547  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2548  lclNumRows != B.getLocalLength (), std::invalid_argument,
2549  "The input MultiVector B has " << B.getLocalLength () << " local "
2550  "row(s), but this MultiVector has " << lclNumRows << " local "
2551  "row(s).");
2552  const size_t numVecs = getNumVectors ();
2553  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2554  A.getNumVectors () != numVecs, std::invalid_argument,
2555  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
2556  "but this MultiVector has " << numVecs << " column(s).");
2557  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2558  B.getNumVectors () != numVecs, std::invalid_argument,
2559  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
2560  "but this MultiVector has " << numVecs << " column(s).");
2561 
2562  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2563  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2564  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
2565 
2566  // We're lucky if *this, A, and B are all sync'd to the same
2567  // memory space. If not, we have to sync _something_. Unlike
2568  // three-argument update() or (say) dot(), we may have to sync one
2569  // of the inputs. For now, we just sync _everything_ to device.
2570  this->view_.template sync<typename dual_view_type::t_dev::memory_space> ();
2571  A.view_.template sync<typename dual_view_type::t_dev::memory_space> ();
2572  B.view_.template sync<typename dual_view_type::t_dev::memory_space> ();
2573 
2574  // This method modifies *this.
2575  this->template modify<typename dual_view_type::t_dev::memory_space> ();
2576 
2577  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2578  const std::pair<size_t, size_t> colRng (0, numVecs);
2579 
2580  // Prefer 'auto' over specifying the type explicitly. This avoids
2581  // issues with a subview possibly having a different type than the
2582  // original view.
2583  auto C_lcl = subview (this->view_.d_view, rowRng, Kokkos::ALL());
2584  auto A_lcl = subview (A.view_.d_view, rowRng, Kokkos::ALL());
2585  auto B_lcl = subview (B.view_.d_view, rowRng, Kokkos::ALL());
2586 
2587  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
2588  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
2589  }
2590  else {
2591  // Some input (or *this) is not constant stride,
2592  // so perform the update one column at a time.
2593  for (size_t k = 0; k < numVecs; ++k) {
2594  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2595  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
2596  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
2597  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
2598  theBeta, subview (B_lcl, rowRng, B_col),
2599  theGamma, subview (C_lcl, rowRng, this_col));
2600  }
2601  }
2602  }
2603 
2604  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2605  Teuchos::ArrayRCP<const Scalar>
2607  getData (size_t j) const
2608  {
2609  using Kokkos::ALL;
2610  using Kokkos::subview;
2611  typedef typename dual_view_type::host_mirror_space host_type;
2612  typedef typename dual_view_type::t_host host_view_type;
2613 
2614  // Any MultiVector method that called the (classic) Kokkos Node's
2615  // viewBuffer or viewBufferNonConst methods always implied a
2616  // device->host synchronization. Thus, we synchronize here as
2617  // well.
2618  view_.template sync<host_type> ();
2619 
2620  // Get a host view of the entire MultiVector's data.
2621  host_view_type hostView = view_.template view<host_type> ();
2622  // Get a subview of column j.
2623  host_view_type hostView_j;
2624 
2625  const size_t colStart = isConstantStride () ? j : whichVectors_[j];
2626  const std::pair<size_t, size_t> colRng (colStart, colStart+1);
2627  hostView_j = subview (hostView, ALL (), colRng);
2628 
2629  // Wrap up the subview of column j in an ArrayRCP<const impl_scalar_type>.
2630  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
2631  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
2632 
2633 #ifdef HAVE_TPETRA_DEBUG
2634  TEUCHOS_TEST_FOR_EXCEPTION(
2635  static_cast<size_t>(hostView_j.dimension_0 ()) < static_cast<size_t>(dataAsArcp.size ()), std::logic_error,
2636  "Tpetra::MultiVector::getData: hostView_j.dimension_0() = "
2637  << hostView_j.dimension_0 () << " < dataAsArcp.size() = "
2638  << dataAsArcp.size () << ". "
2639  "Please report this bug to the Tpetra developers.");
2640 #endif // HAVE_TPETRA_DEBUG
2641 
2642  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
2643  }
2644 
2645  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2646  Teuchos::ArrayRCP<Scalar>
2649  {
2650  using Kokkos::ALL;
2651  using Kokkos::subview;
2652  typedef typename dual_view_type::host_mirror_space host_type;
2653  typedef typename dual_view_type::t_host host_view_type;
2654 
2655  // Any MultiVector method that called the (classic) Kokkos Node's
2656  // viewBuffer or viewBufferNonConst methods always implied a
2657  // device->host synchronization. Thus, we synchronize here as
2658  // well.
2659  view_.template sync<host_type> ();
2660 
2661  // Get a host view of the entire MultiVector's data.
2662  host_view_type hostView = view_.template view<host_type> ();
2663  // Get a subview of column j.
2664  host_view_type hostView_j;
2665  if (isConstantStride ()) {
2666  hostView_j = subview (hostView, ALL (), Kokkos::pair<int,int>(j,j+1));
2667  } else {
2668  hostView_j = subview (hostView, ALL (), Kokkos::pair<int,int>(whichVectors_[j],whichVectors_[j]+1));
2669  }
2670 
2671  // Calling getDataNonConst() implies that the user plans to modify
2672  // the values in the MultiVector, so we call modify on the view
2673  // here.
2674  view_.template modify<host_type> ();
2675 
2676  // Wrap up the subview of column j in an ArrayRCP<const impl_scalar_type>.
2677  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
2678  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
2679 
2680 #ifdef HAVE_TPETRA_DEBUG
2681  TEUCHOS_TEST_FOR_EXCEPTION(
2682  static_cast<size_t>(hostView_j.dimension_0 ()) < static_cast<size_t>(dataAsArcp.size ()), std::logic_error,
2683  "Tpetra::MultiVector::getDataNonConst: hostView_j.dimension_0() = "
2684  << hostView_j.dimension_0 () << " < dataAsArcp.size() = "
2685  << dataAsArcp.size () << ". "
2686  "Please report this bug to the Tpetra developers.");
2687 #endif // HAVE_TPETRA_DEBUG
2688 
2689  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
2690  }
2691 
2692  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2696  {
2697  if (this != &source) {
2698  base_type::operator= (source);
2699  //
2700  // operator= implements view semantics (shallow copy).
2701  //
2702 
2703  // Kokkos::View operator= also implements view semantics.
2704  view_ = source.view_;
2705  origView_ = source.origView_;
2706 
2707  // NOTE (mfh 24 Mar 2014) Christian wrote here that assigning
2708  // whichVectors_ is "probably not ok" (probably constitutes deep
2709  // copy). I would say that it's OK, because whichVectors_ is
2710  // immutable (from the user's perspective); it's analogous to
2711  // the dimensions or stride. Once we make whichVectors_ a
2712  // Kokkos::View instead of a Teuchos::Array, all debate will go
2713  // away and we will unquestionably have view semantics.
2714  whichVectors_ = source.whichVectors_;
2715  }
2716  return *this;
2717  }
2718 
2719 
2720  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2721  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
2723  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
2724  {
2725  using Teuchos::RCP;
2727 
2728  // Check whether the index set in cols is contiguous. If it is,
2729  // use the more efficient Range1D version of subCopy.
2730  bool contiguous = true;
2731  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
2732  for (size_t j = 1; j < numCopyVecs; ++j) {
2733  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
2734  contiguous = false;
2735  break;
2736  }
2737  }
2738  if (contiguous && numCopyVecs > 0) {
2739  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
2740  }
2741  else {
2742  RCP<const MV> X_sub = this->subView (cols);
2743  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
2744  Y->assign (*X_sub);
2745  return Y;
2746  }
2747  }
2748 
2749  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2750  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
2752  subCopy (const Teuchos::Range1D &colRng) const
2753  {
2754  using Teuchos::RCP;
2756 
2757  RCP<const MV> X_sub = this->subView (colRng);
2758  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
2759  Y->assign (*X_sub);
2760  return Y;
2761  }
2762 
2763  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2764  size_t
2767  return origView_.dimension_0 ();
2768  }
2769 
2770  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2771  size_t
2774  return origView_.dimension_1 ();
2775  }
2776 
2777  template <class Scalar, class LO, class GO, class Node, const bool classic>
2780  const map_type& subMap,
2781  const size_t offset) :
2782  base_type (Teuchos::null) // to be replaced below
2783  {
2784  using Kokkos::ALL;
2785  using Kokkos::subview;
2786  using Teuchos::RCP;
2787  using Teuchos::rcp;
2789  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
2790 
2791  const size_t newNumRows = subMap.getNodeNumElements ();
2792  const bool tooManyElts = newNumRows + offset > X.getOrigNumLocalRows ();
2793  if (tooManyElts) {
2794  const int myRank = X.getMap ()->getComm ()->getRank ();
2795  TEUCHOS_TEST_FOR_EXCEPTION(
2796  newNumRows + offset > X.getLocalLength (), std::runtime_error,
2797  prefix << "Invalid input Map. The input Map owns " << newNumRows <<
2798  " entries on process " << myRank << ". offset = " << offset << ". "
2799  "Yet, the MultiVector contains only " << X.getOrigNumLocalRows () <<
2800  " rows on this process.");
2801  }
2802 
2803 #ifdef HAVE_TPETRA_DEBUG
2804  const size_t strideBefore =
2805  X.isConstantStride () ? X.getStride () : static_cast<size_t> (0);
2806  const size_t lclNumRowsBefore = X.getLocalLength ();
2807  const size_t numColsBefore = X.getNumVectors ();
2808  const impl_scalar_type* hostPtrBefore =
2809  X.getDualView ().h_view.ptr_on_device ();
2810 #endif // HAVE_TPETRA_DEBUG
2811 
2812  const std::pair<size_t, size_t> rowRng (offset, offset + newNumRows);
2813  // FIXME (mfh 10 May 2014) Use of origView_ instead of view_ for
2814  // the second argument may be wrong, if view_ resulted from a
2815  // previous call to offsetView with offset != 0.
2816  dual_view_type newView = subview (X.origView_, rowRng, ALL ());
2817  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
2818  // handling subviews of degenerate Views quite so well. For some
2819  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
2820  // 0. We work around by creating a new empty DualView of the
2821  // desired (degenerate) dimensions.
2822  if (newView.dimension_0 () == 0 &&
2823  newView.dimension_1 () != X.view_.dimension_1 ()) {
2824  newView = allocDualView<Scalar, LO, GO, Node> (size_t (0),
2825  X.getNumVectors ());
2826  }
2827 
2828  MV subViewMV = X.isConstantStride () ?
2829  MV (Teuchos::rcp (new map_type (subMap)), newView, X.origView_) :
2830  MV (Teuchos::rcp (new map_type (subMap)), newView, X.origView_, X.whichVectors_ ());
2831 
2832 #ifdef HAVE_TPETRA_DEBUG
2833  const size_t strideAfter = X.isConstantStride () ?
2834  X.getStride () :
2835  static_cast<size_t> (0);
2836  const size_t lclNumRowsAfter = X.getLocalLength ();
2837  const size_t numColsAfter = X.getNumVectors ();
2838  const impl_scalar_type* hostPtrAfter =
2839  X.getDualView ().h_view.ptr_on_device ();
2840 
2841  const size_t strideRet = subViewMV.isConstantStride () ?
2842  subViewMV.getStride () :
2843  static_cast<size_t> (0);
2844  const size_t lclNumRowsRet = subViewMV.getLocalLength ();
2845  const size_t numColsRet = subViewMV.getNumVectors ();
2846 
2847  const char suffix[] = ". This should never happen. Please report this "
2848  "bug to the Tpetra developers.";
2849 
2850  TEUCHOS_TEST_FOR_EXCEPTION(
2851  lclNumRowsRet != subMap.getNodeNumElements (),
2852  std::logic_error, prefix << "Returned MultiVector has a number of rows "
2853  "different than the number of local indices in the input Map. "
2854  "lclNumRowsRet: " << lclNumRowsRet << ", subMap.getNodeNumElements(): "
2855  << subMap.getNodeNumElements () << suffix);
2856  TEUCHOS_TEST_FOR_EXCEPTION(
2857  strideBefore != strideAfter || lclNumRowsBefore != lclNumRowsAfter ||
2858  numColsBefore != numColsAfter || hostPtrBefore != hostPtrAfter,
2859  std::logic_error, prefix << "Original MultiVector changed dimensions, "
2860  "stride, or host pointer after taking offset view. strideBefore: " <<
2861  strideBefore << ", strideAfter: " << strideAfter << ", lclNumRowsBefore: "
2862  << lclNumRowsBefore << ", lclNumRowsAfter: " << lclNumRowsAfter <<
2863  ", numColsBefore: " << numColsBefore << ", numColsAfter: " <<
2864  numColsAfter << ", hostPtrBefore: " << hostPtrBefore << ", hostPtrAfter: "
2865  << hostPtrAfter << suffix);
2866  TEUCHOS_TEST_FOR_EXCEPTION(
2867  strideBefore != strideRet, std::logic_error, prefix << "Returned "
2868  "MultiVector has different stride than original MultiVector. "
2869  "strideBefore: " << strideBefore << ", strideRet: " << strideRet <<
2870  ", numColsBefore: " << numColsBefore << ", numColsRet: " << numColsRet
2871  << suffix);
2872  TEUCHOS_TEST_FOR_EXCEPTION(
2873  numColsBefore != numColsRet, std::logic_error,
2874  prefix << "Returned MultiVector has a different number of columns than "
2875  "original MultiVector. numColsBefore: " << numColsBefore << ", "
2876  "numColsRet: " << numColsRet << suffix);
2877 #endif // HAVE_TPETRA_DEBUG
2878 
2879  *this = subViewMV; // shallow copy
2880  }
2881 
2882  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2883  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
2885  offsetView (const Teuchos::RCP<const map_type>& subMap,
2886  const size_t offset) const
2887  {
2889  return Teuchos::rcp (new MV (*this, *subMap, offset));
2890  }
2891 
2892  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2893  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
2895  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
2896  const size_t offset)
2897  {
2899  return Teuchos::rcp (new MV (*this, *subMap, offset));
2900  }
2901 
2902  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2903  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
2905  subView (const Teuchos::ArrayView<const size_t>& cols) const
2906  {
2907  using Teuchos::Array;
2908  using Teuchos::rcp;
2910 
2911  const size_t numViewCols = static_cast<size_t> (cols.size ());
2912  TEUCHOS_TEST_FOR_EXCEPTION(
2913  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
2914  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
2915  "contain at least one entry, but cols.size() = " << cols.size ()
2916  << " == 0.");
2917 
2918  // Check whether the index set in cols is contiguous. If it is,
2919  // use the more efficient Range1D version of subView.
2920  bool contiguous = true;
2921  for (size_t j = 1; j < numViewCols; ++j) {
2922  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
2923  contiguous = false;
2924  break;
2925  }
2926  }
2927  if (contiguous) {
2928  if (numViewCols == 0) {
2929  // The output MV has no columns, so there is nothing to view.
2930  return rcp (new MV (this->getMap (), numViewCols));
2931  } else {
2932  // Use the more efficient contiguous-index-range version.
2933  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
2934  }
2935  }
2936 
2937  if (isConstantStride ()) {
2938  return rcp (new MV (this->getMap (), view_, origView_, cols));
2939  }
2940  else {
2941  Array<size_t> newcols (cols.size ());
2942  for (size_t j = 0; j < numViewCols; ++j) {
2943  newcols[j] = whichVectors_[cols[j]];
2944  }
2945  return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
2946  }
2947  }
2948 
2949 
2950  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2951  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
2953  subView (const Teuchos::Range1D& colRng) const
2954  {
2955  using Kokkos::ALL;
2956  using Kokkos::subview;
2957  using Teuchos::Array;
2958  using Teuchos::RCP;
2959  using Teuchos::rcp;
2961  const char tfecfFuncName[] = "subView(Range1D): ";
2962 
2963  const size_t lclNumRows = this->getLocalLength ();
2964  const size_t numVecs = this->getNumVectors ();
2965  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2966  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
2967  // "at least one vector.");
2968  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2969  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
2970  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
2971  << numVecs << ".");
2972  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2973  numVecs != 0 && colRng.size () != 0 &&
2974  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
2975  static_cast<size_t> (colRng.ubound ()) >= numVecs),
2976  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
2977  "," << colRng.ubound () << "] exceeds the valid range of column indices "
2978  "[0, " << numVecs << "].");
2979 
2980  RCP<const MV> X_ret; // the MultiVector subview to return
2981 
2982  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
2983  // broken for the case of views with zero rows. I will brutally
2984  // enforce that the subview has the correct dimensions. In
2985  // particular, in the case of zero rows, I will, if necessary,
2986  // create a new dual_view_type with zero rows and the correct
2987  // number of columns. In a debug build, I will use an all-reduce
2988  // to ensure that it has the correct dimensions on all processes.
2989 
2990  const std::pair<size_t, size_t> rows (0, lclNumRows);
2991  if (colRng.size () == 0) {
2992  const std::pair<size_t, size_t> cols (0, 0); // empty range
2993  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
2994  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
2995  }
2996  else {
2997  // Returned MultiVector is constant stride only if *this is.
2998  if (isConstantStride ()) {
2999  const std::pair<size_t, size_t> cols (colRng.lbound (),
3000  colRng.ubound () + 1);
3001  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3002  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3003  }
3004  else {
3005  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3006  // We're only asking for one column, so the result does have
3007  // constant stride, even though this MultiVector does not.
3008  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3009  whichVectors_[0] + colRng.ubound () + 1);
3010  dual_view_type X_sub = takeSubview (view_, ALL (), col);
3011  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3012  }
3013  else {
3014  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3015  whichVectors_.begin () + colRng.ubound () + 1);
3016  X_ret = rcp (new MV (this->getMap (), view_, origView_, which));
3017  }
3018  }
3019  }
3020 
3021 #ifdef HAVE_TPETRA_DEBUG
3022  using Teuchos::Comm;
3023  using Teuchos::outArg;
3024  using Teuchos::REDUCE_MIN;
3025  using Teuchos::reduceAll;
3026 
3027  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
3028  this->getMap ()->getComm ();
3029  if (! comm.is_null ()) {
3030  int lclSuccess = 1;
3031  int gblSuccess = 1;
3032 
3033  if (X_ret.is_null ()) {
3034  lclSuccess = 0;
3035  }
3036  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3037  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3038  lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3039  "MultiVector; the return value of this method) is null on some MPI "
3040  "process in this MultiVector's communicator. This should never "
3041  "happen. Please report this bug to the Tpetra developers.");
3042 
3043  if (! X_ret.is_null () &&
3044  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3045  lclSuccess = 0;
3046  }
3047  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3048  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3049  lclSuccess != 1, std::logic_error,
3050  "X_ret->getNumVectors() != colRng.size(), on at least one MPI process "
3051  "in this MultiVector's communicator. This should never happen. "
3052  "Please report this bug to the Tpetra developers.");
3053  }
3054 #endif // HAVE_TPETRA_DEBUG
3055 
3056  return X_ret;
3057  }
3058 
3059 
3060  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3061  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3063  subViewNonConst (const ArrayView<const size_t> &cols)
3064  {
3066  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3067  }
3068 
3069 
3070  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3071  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3073  subViewNonConst (const Teuchos::Range1D &colRng)
3074  {
3076  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3077  }
3078 
3079 
3080  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3081  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3083  getVector (const size_t j) const
3084  {
3085  using Kokkos::ALL;
3086  using Kokkos::subview;
3087  using Teuchos::rcp;
3089 
3090 #ifdef HAVE_TPETRA_DEBUG
3091  const char tfecfFuncName[] = "getVector(NonConst): ";
3092  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3093  this->vectorIndexOutOfRange (j), std::runtime_error, "Input index j (== "
3094  << j << ") exceeds valid range [0, " << this->getNumVectors ()
3095  << " - 1].");
3096 #endif // HAVE_TPETRA_DEBUG
3097  const size_t jj = this->isConstantStride () ?
3098  static_cast<size_t> (j) :
3099  static_cast<size_t> (this->whichVectors_[j]);
3100  const std::pair<size_t, size_t> rng (jj, jj+1);
3101  return rcp (new V (this->getMap (),
3102  takeSubview (this->view_, ALL (), rng),
3103  origView_));
3104  }
3105 
3106 
3107  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3108  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3110  getVectorNonConst (const size_t j)
3111  {
3113  return Teuchos::rcp_const_cast<V> (this->getVector (j));
3114  }
3115 
3116 
3117  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3118  void
3120  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3121  {
3122  using Kokkos::subview;
3123  typedef impl_scalar_type IST;
3124  typedef MakeUnmanagedView<IST, device_type> view_getter_type;
3125  typedef typename view_getter_type::view_type input_col_type;
3126  // Types of views of this MultiVector's data.
3127  typedef typename dual_view_type::t_host host_view_type;
3128  typedef typename dual_view_type::t_dev dev_view_type;
3129  typedef Kokkos::View<IST*,
3130  typename host_view_type::array_layout,
3131  typename host_view_type::memory_space> host_col_type;
3132  typedef Kokkos::View<IST*,
3133  typename dev_view_type::array_layout,
3134  typename dev_view_type::memory_space> dev_col_type;
3135  const char tfecfFuncName[] = "get1dCopy: ";
3136 
3137  const size_t numRows = this->getLocalLength ();
3138  const size_t numCols = this->getNumVectors ();
3139  const std::pair<size_t, size_t> rowRange (0, numRows);
3140  const std::pair<size_t, size_t> colRange (0, numCols);
3141 
3142  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3143  LDA < numRows, std::runtime_error,
3144  "LDA = " << LDA << " < numRows = " << numRows << ".");
3145  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3146  numRows > static_cast<size_t> (0) &&
3147  numCols > static_cast<size_t> (0) &&
3148  static_cast<size_t> (A.size ()) < LDA * (numCols - 1) + numRows,
3149  std::runtime_error,
3150  "A.size() = " << A.size () << ", but its size must be at least "
3151  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3152 
3153  // FIXME (mfh 22 Jul 2014, 10 Dec 2014) Currently, it doesn't work
3154  // to do a 2-D copy, even if this MultiVector has constant stride.
3155  // This is because Kokkos can't currently tell the difference
3156  // between padding (which permits a single deep_copy for the whole
3157  // 2-D View) and stride > numRows (which does NOT permit a single
3158  // deep_copy for the whole 2-D View). Carter is working on this,
3159  // but for now, the temporary fix is to copy one column at a time.
3160 
3161  for (size_t j = 0; j < numCols; ++j) {
3162  const size_t srcCol =
3163  this->isConstantStride () ? j : this->whichVectors_[j];
3164  const size_t dstCol = j;
3165  IST* const dstColRaw =
3166  reinterpret_cast<IST*> (A.getRawPtr () + LDA * dstCol);
3167  input_col_type dstColView (dstColRaw, numRows);
3168  // Use the most recently updated version of this MultiVector's
3169  // data. This avoids sync'ing, which could violate users'
3170  // expectations.
3171  if (view_.modified_host() > view_.modified_device()) {
3172  host_col_type srcColView =
3173  subview (view_.h_view, rowRange, srcCol);
3174  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3175  dstColView.dimension_0 () != srcColView.dimension_0 (),
3176  std::logic_error, ": srcColView and dstColView have different "
3177  "dimensions. Please report this bug to the Tpetra developers.");
3178  Kokkos::deep_copy (dstColView, srcColView);
3179  }
3180  else {
3181  dev_col_type srcColView =
3182  subview (view_.d_view, rowRange, srcCol);
3183  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3184  dstColView.dimension_0 () != srcColView.dimension_0 (),
3185  std::logic_error, ": srcColView and dstColView have different "
3186  "dimensions. Please report this bug to the Tpetra developers.");
3187  Kokkos::deep_copy (dstColView, srcColView);
3188  }
3189  }
3190  }
3191 
3192 
3193  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3194  void
3196  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3197  {
3199  const char tfecfFuncName[] = "get2dCopy: ";
3200  const size_t numRows = this->getLocalLength ();
3201  const size_t numCols = this->getNumVectors ();
3202 
3203  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3204  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3205  std::runtime_error, "Input array of pointers must contain as many "
3206  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3207  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3208 
3209  if (numRows != 0 && numCols != 0) {
3210  // No side effects until we've validated the input.
3211  for (size_t j = 0; j < numCols; ++j) {
3212  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3213  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3214  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3215  "the input array of arrays is not long enough to fit all entries in "
3216  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3217  << " < getLocalLength() = " << numRows << ".");
3218  }
3219 
3220  // We've validated the input, so it's safe to start copying.
3221  for (size_t j = 0; j < numCols; ++j) {
3222  RCP<const V> X_j = this->getVector (j);
3223  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3224  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3225  }
3226  }
3227  }
3228 
3229  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3230  Teuchos::ArrayRCP<const Scalar>
3232  get1dView () const
3233  {
3234  if (getLocalLength () == 0 || getNumVectors () == 0) {
3235  return Teuchos::null;
3236  } else {
3237  TEUCHOS_TEST_FOR_EXCEPTION(
3238  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3239  "get1dView: This MultiVector does not have constant stride, so it is "
3240  "not possible to view its data as a single array. You may check "
3241  "whether a MultiVector has constant stride by calling "
3242  "isConstantStride().");
3243  // NOTE (mfh 09 2014) get1dView() and get1dViewNonConst() have
3244  // always been device->host synchronization points. We might
3245  // want to change this in the future.
3246  typedef typename dual_view_type::host_mirror_space host_type;
3247  view_.template sync<host_type> ();
3248  // Both get1dView() and get1dViewNonConst() return a host view
3249  // of the data.
3250  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3251  Kokkos::Compat::persistingView (view_.template view<host_type> ());
3252  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3253  }
3254  }
3255 
3256  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3257  Teuchos::ArrayRCP<Scalar>
3260  {
3261  if (getLocalLength () == 0 || getNumVectors () == 0) {
3262  return Teuchos::null;
3263  } else {
3264  TEUCHOS_TEST_FOR_EXCEPTION(
3265  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3266  "get1dViewNonConst: This MultiVector does not have constant stride, so "
3267  "it is not possible to view its data as a single array. You may check "
3268  "whether a MultiVector has constant stride by calling "
3269  "isConstantStride().");
3270  // NOTE (mfh 09 May 2014) get1dView() and get1dViewNonConst()
3271  // have always been device->host synchronization points. We
3272  // might want to change this in the future.
3273  typedef typename dual_view_type::host_mirror_space host_type;
3274  view_.template sync<host_type> ();
3275  // Both get1dView() and get1dViewNonConst() return a host view
3276  // of the data.
3277  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3278  Kokkos::Compat::persistingView (view_.template view<host_type> ());
3279  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3280  }
3281  }
3282 
3283  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3284  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3287  {
3288  using Teuchos::ArrayRCP;
3289  typedef Kokkos::DualView<impl_scalar_type*,
3290  typename dual_view_type::array_layout, device_type> col_dual_view_type;
3291 
3292  const size_t numCols = getNumVectors ();
3293  ArrayRCP<ArrayRCP<Scalar> > views (numCols);
3294  for (size_t j = 0; j < numCols; ++j) {
3295  const size_t col = isConstantStride () ? j : whichVectors_[j];
3296  col_dual_view_type X_col =
3297  Kokkos::subview (view_, Kokkos::ALL (), col);
3298  ArrayRCP<impl_scalar_type> X_col_arcp =
3299  Kokkos::Compat::persistingView (X_col.d_view);
3300  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_col_arcp);
3301  }
3302  return views;
3303  }
3304 
3305  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3306  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3308  get2dView () const
3309  {
3310  using Teuchos::ArrayRCP;
3311  typedef Kokkos::DualView<const impl_scalar_type*,
3312  typename dual_view_type::array_layout, device_type> col_dual_view_type;
3313 
3314  const size_t numCols = getNumVectors ();
3315  ArrayRCP<ArrayRCP<const Scalar> > views (numCols);
3316  for (size_t j = 0; j < numCols; ++j) {
3317  const size_t col = isConstantStride () ? j : whichVectors_[j];
3318  col_dual_view_type X_col =
3319  Kokkos::subview (view_, Kokkos::ALL (), col);
3320  ArrayRCP<const impl_scalar_type> X_col_arcp =
3321  Kokkos::Compat::persistingView (X_col.d_view);
3322  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_col_arcp);
3323  }
3324  return views;
3325  }
3326 
3327  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3328  void
3330  multiply (Teuchos::ETransp transA,
3331  Teuchos::ETransp transB,
3332  const Scalar& alpha,
3335  const Scalar& beta)
3336  {
3337  using Teuchos::CONJ_TRANS;
3338  using Teuchos::NO_TRANS;
3339  using Teuchos::TRANS;
3340  using Teuchos::RCP;
3341  using Teuchos::rcp;
3342  using Teuchos::rcpFromRef;
3343  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
3344  typedef Teuchos::ScalarTraits<Scalar> STS;
3346  const char errPrefix[] = "Tpetra::MultiVector::multiply: ";
3347 
3348  // This routine performs a variety of matrix-matrix multiply
3349  // operations, interpreting the MultiVector (this-aka C , A and B)
3350  // as 2D matrices. Variations are due to the fact that A, B and C
3351  // can be local replicated or global distributed MultiVectors and
3352  // that we may or may not operate with the transpose of A and B.
3353  // Possible cases are:
3354  //
3355  // Operations # Cases Notes
3356  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3357  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3358  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3359  //
3360  // The following operations are not meaningful for 1-D
3361  // distributions:
3362  //
3363  // u1) C(local) = A^T(distr) * B^T(distr) 1
3364  // u2) C(local) = A (distr) * B^X(distr) 2
3365  // u3) C(distr) = A^X(local) * B^X(local) 4
3366  // u4) C(distr) = A^X(local) * B^X(distr) 4
3367  // u5) C(distr) = A^T(distr) * B^X(local) 2
3368  // u6) C(local) = A^X(distr) * B^X(local) 4
3369  // u7) C(distr) = A^X(distr) * B^X(local) 4
3370  // u8) C(local) = A^X(local) * B^X(distr) 4
3371  //
3372  // Total number of cases: 32 (= 2^5).
3373 
3374  TEUCHOS_TEST_FOR_EXCEPTION(
3375  ATS::is_complex && (transA == TRANS || transB == TRANS),
3376  std::invalid_argument, errPrefix << "Transpose without conjugation "
3377  "(transA == TRANS || transB == TRANS) is not supported for complex Scalar "
3378  "types.");
3379 
3380  transA = (transA == NO_TRANS ? NO_TRANS : CONJ_TRANS);
3381  transB = (transB == NO_TRANS ? NO_TRANS : CONJ_TRANS);
3382 
3383  // Compute effective dimensions, w.r.t. transpose operations on
3384  size_t A_nrows = (transA==CONJ_TRANS) ? A.getNumVectors() : A.getLocalLength();
3385  size_t A_ncols = (transA==CONJ_TRANS) ? A.getLocalLength() : A.getNumVectors();
3386  size_t B_nrows = (transB==CONJ_TRANS) ? B.getNumVectors() : B.getLocalLength();
3387  size_t B_ncols = (transB==CONJ_TRANS) ? B.getLocalLength() : B.getNumVectors();
3388 
3389  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
3390 
3391  TEUCHOS_TEST_FOR_EXCEPTION(
3392  getLocalLength () != A_nrows || getNumVectors () != B_ncols ||
3393  A_ncols != B_nrows, std::runtime_error, errPrefix << "Dimensions of "
3394  "*this, op(A), and op(B) must be consistent. Local part of *this is "
3395  << getLocalLength() << " x " << getNumVectors()
3396  << ", A is " << A_nrows << " x " << A_ncols
3397  << ", and B is " << B_nrows << " x " << B_ncols << ".");
3398 
3399  const bool A_is_local = ! A.isDistributed ();
3400  const bool B_is_local = ! B.isDistributed ();
3401  const bool C_is_local = ! this->isDistributed ();
3402  // Case 1: C(local) = A^X(local) * B^X(local)
3403  const bool Case1 = C_is_local && A_is_local && B_is_local;
3404  // Case 2: C(local) = A^T(distr) * B (distr)
3405  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
3406  transA == CONJ_TRANS && transB == NO_TRANS;
3407  // Case 3: C(distr) = A (distr) * B^X(local)
3408  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
3409  transA == NO_TRANS;
3410 
3411  // Test that we are considering a meaningful case
3412  TEUCHOS_TEST_FOR_EXCEPTION(
3413  ! Case1 && ! Case2 && ! Case3, std::runtime_error, errPrefix
3414  << "Multiplication of op(A) and op(B) into *this is not a "
3415  "supported use case.");
3416 
3417  if (beta != STS::zero () && Case2) {
3418  // If Case2, then C is local and contributions must be summed
3419  // across all processes. However, if beta != 0, then accumulate
3420  // beta*C into the sum. When summing across all processes, we
3421  // only want to accumulate this once, so set beta == 0 on all
3422  // processes except Process 0.
3423  const int myRank = this->getMap ()->getComm ()->getRank ();
3424  if (myRank != 0) {
3425  beta_local = STS::zero ();
3426  }
3427  }
3428 
3429  // We only know how to do matrix-matrix multiplies if all the
3430  // MultiVectors have constant stride. If not, we have to make
3431  // temporary copies of those MultiVectors (including possibly
3432  // *this) that don't have constant stride.
3433  RCP<MV> C_tmp;
3434  if (! isConstantStride ()) {
3435  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
3436  } else {
3437  C_tmp = rcp (this, false);
3438  }
3439 
3440  RCP<const MV> A_tmp;
3441  if (! A.isConstantStride ()) {
3442  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
3443  } else {
3444  A_tmp = rcpFromRef (A);
3445  }
3446 
3447  RCP<const MV> B_tmp;
3448  if (! B.isConstantStride ()) {
3449  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
3450  } else {
3451  B_tmp = rcpFromRef (B);
3452  }
3453 
3454  TEUCHOS_TEST_FOR_EXCEPTION(
3455  ! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
3456  ! A_tmp->isConstantStride (), std::logic_error, errPrefix
3457  << "Failed to make temporary constant-stride copies of MultiVectors.");
3458 
3460 
3461  gemm_type::GEMM (transA, transB, alpha,
3462  A_tmp->getDualView ().d_view, B_tmp->getDualView ().d_view,
3463  beta_local, C_tmp->getDualView ().d_view);
3464  if (! isConstantStride ()) {
3465  deep_copy (*this, *C_tmp); // Copy the result back into *this.
3466  }
3467 
3468  // Dispose of (possibly) extra copies of A and B.
3469  A_tmp = Teuchos::null;
3470  B_tmp = Teuchos::null;
3471 
3472  // If Case 2 then sum up *this and distribute it to all processes.
3473  if (Case2) {
3474  this->reduce ();
3475  }
3476  }
3477 
3478  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3479  void
3481  elementWiseMultiply (Scalar scalarAB,
3484  Scalar scalarThis)
3485  {
3486  using Kokkos::ALL;
3487  using Kokkos::subview;
3488  const char tfecfFuncName[] = "elementWiseMultiply: ";
3489  const size_t numVecs = this->getNumVectors ();
3490 
3491 #ifdef HAVE_TPETRA_DEBUG
3492  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3493  getLocalLength() != A.getLocalLength() ||
3494  getLocalLength() != B.getLocalLength(), std::runtime_error,
3495  "MultiVectors do not have the same local length.");
3496 #endif // HAVE_TPETRA_DEBUG
3497  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3498  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
3499  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
3500  << ".");
3501 
3502  if (isConstantStride () && B.isConstantStride ()) {
3503  // A is just a Vector; it only has one column, so it always has
3504  // constant stride.
3505  //
3506  // If both *this and B have constant stride, we can do an
3507  // element-wise multiply on all columns at once.
3508  view_.template sync<device_type> ();
3509  view_.template modify<device_type> ();
3510  A.view_.template sync<device_type> ();
3511  B.view_.template sync<device_type> ();
3512  KokkosBlas::mult (scalarThis, view_.d_view, scalarAB,
3513  subview (A.view_.d_view, ALL (), 0),
3514  B.view_.d_view);
3515  }
3516  else {
3517  view_.template sync<device_type> ();
3518  view_.template modify<device_type> ();
3519  A.view_.template sync<device_type> ();
3520  B.view_.template sync<device_type> ();
3521 
3522  for (size_t j = 0; j < numVecs; ++j) {
3523  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
3524  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
3525 
3526  KokkosBlas::mult (scalarThis,
3527  subview (view_.d_view, ALL (), C_col),
3528  scalarAB,
3529  subview (A.view_.d_view, ALL (), 0),
3530  subview (B.view_.d_view, ALL (), B_col));
3531  }
3532  }
3533  }
3534 
3535  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3536  void
3539  {
3540  using Kokkos::ALL;
3541  using Kokkos::subview;
3542  using Teuchos::reduceAll;
3543  using Teuchos::REDUCE_SUM;
3544  typedef typename dual_view_type::t_dev device_view_type;
3545  typedef typename dual_view_type::host_mirror_space host_mirror_space;
3546 
3547  TEUCHOS_TEST_FOR_EXCEPTION(
3548  this->isDistributed (), std::runtime_error,
3549  "Tpetra::MultiVector::reduce() should only be called with locally "
3550  "replicated or otherwise not distributed MultiVector objects.");
3551  const Teuchos::Comm<int>& comm = * (this->getMap ()->getComm ());
3552  if (comm.getSize () == 1) {
3553  return;
3554  }
3555 
3556  const size_t numLclRows = getLocalLength ();
3557  const size_t numCols = getNumVectors ();
3558 
3559  // FIXME (mfh 16 June 2014) This exception will cause deadlock if
3560  // it triggers on only some processes. We don't have a good way
3561  // to pack this result into the all-reduce below, but this would
3562  // be a good reason to set a "local error flag" and find other
3563  // opportunities to let it propagate.
3564  TEUCHOS_TEST_FOR_EXCEPTION(
3565  numLclRows > static_cast<size_t> (std::numeric_limits<int>::max ()),
3566  std::runtime_error, "Tpetra::MultiVector::reduce: On Process " <<
3567  comm.getRank () << ", the number of local rows " << numLclRows <<
3568  " does not fit in int.");
3569 
3570  //
3571  // Use MPI to sum the entries across all local blocks.
3572  //
3573  // If this MultiVector's local data are stored contiguously, we
3574  // can use the local View as the source buffer in the
3575  // MPI_Allreduce. Otherwise, we have to allocate a temporary
3576  // source buffer and pack.
3577  const bool contig = isConstantStride () && getStride () == numLclRows;
3578  device_view_type srcBuf;
3579  if (contig) {
3580  srcBuf = view_.d_view;
3581  }
3582  else {
3583  srcBuf = device_view_type ("srcBuf", numLclRows, numCols);
3584  Kokkos::deep_copy (srcBuf, view_.d_view);
3585  }
3586 
3587  // MPI requires that the send and receive buffers don't alias one
3588  // another, so we have to copy temporary storage for the result.
3589  //
3590  // We expect that MPI implementations will know how to read device
3591  // pointers.
3592  device_view_type tgtBuf ("tgtBuf", numLclRows, numCols);
3593 
3594  const int reduceCount = static_cast<int> (numLclRows * numCols);
3595  reduceAll<int, impl_scalar_type> (comm, REDUCE_SUM, reduceCount,
3596  srcBuf.ptr_on_device (),
3597  tgtBuf.ptr_on_device ());
3598 
3599  // Tell the DualView that we plan to modify the device data.
3600  view_.template modify<execution_space> ();
3601 
3602  const std::pair<size_t, size_t> lclRowRange (0, numLclRows);
3603  device_view_type d_view =
3604  subview (view_.d_view, lclRowRange, ALL ());
3605 
3606  if (contig || isConstantStride ()) {
3607  Kokkos::deep_copy (d_view, tgtBuf);
3608  }
3609  else {
3610  for (size_t j = 0; j < numCols; ++j) {
3611  device_view_type d_view_j =
3612  subview (d_view, ALL (), std::pair<int,int>(j,j+1));
3613  device_view_type tgtBuf_j =
3614  subview (tgtBuf, ALL (), std::pair<int,int>(j,j+1));
3615  Kokkos::deep_copy (d_view_j, tgtBuf_j);
3616  }
3617  }
3618 
3619  // Synchronize the host with changes on the device.
3620  //
3621  // FIXME (mfh 16 June 2014) This raises the question of whether we
3622  // want to synchronize always. Users will find it reassuring if
3623  // MultiVector methods always leave the MultiVector in a
3624  // synchronized state, but it seems silly to synchronize to host
3625  // if they hardly ever need host data.
3626  view_.template sync<host_mirror_space> ();
3627  }
3628 
3629 
3630  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3631  void
3633  replaceLocalValue (LocalOrdinal MyRow,
3634  size_t VectorIndex,
3635  const impl_scalar_type& ScalarValue) const
3636  {
3637 #ifdef HAVE_TPETRA_DEBUG
3638  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
3639  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
3640  TEUCHOS_TEST_FOR_EXCEPTION(
3641  MyRow < minLocalIndex || MyRow > maxLocalIndex,
3642  std::runtime_error,
3643  "Tpetra::MultiVector::replaceLocalValue: row index " << MyRow
3644  << " is invalid. The range of valid row indices on this process "
3645  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
3646  << ", " << maxLocalIndex << "].");
3647  TEUCHOS_TEST_FOR_EXCEPTION(
3648  vectorIndexOutOfRange(VectorIndex),
3649  std::runtime_error,
3650  "Tpetra::MultiVector::replaceLocalValue: vector index " << VectorIndex
3651  << " of the multivector is invalid.");
3652 #endif
3653  const size_t colInd = isConstantStride () ?
3654  VectorIndex : whichVectors_[VectorIndex];
3655  view_.h_view (MyRow, colInd) = ScalarValue;
3656  }
3657 
3658 
3659  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3660  void
3662  sumIntoLocalValue (const LocalOrdinal localRow,
3663  const size_t col,
3664  const impl_scalar_type& value,
3665  const bool atomic) const
3666  {
3667 #ifdef HAVE_TPETRA_DEBUG
3668  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
3669  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
3670  TEUCHOS_TEST_FOR_EXCEPTION(
3671  localRow < minLocalIndex || localRow > maxLocalIndex,
3672  std::runtime_error,
3673  "Tpetra::MultiVector::sumIntoLocalValue: row index " << localRow
3674  << " is invalid. The range of valid row indices on this process "
3675  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
3676  << ", " << maxLocalIndex << "].");
3677  TEUCHOS_TEST_FOR_EXCEPTION(
3678  vectorIndexOutOfRange(col),
3679  std::runtime_error,
3680  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
3681  << " of the multivector is invalid.");
3682 #endif
3683  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
3684  if (atomic) {
3685  Kokkos::atomic_add (& (view_.h_view(localRow, colInd)), value);
3686  }
3687  else {
3688  view_.h_view (localRow, colInd) += value;
3689  }
3690  }
3691 
3692 
3693  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3694  void
3696  replaceGlobalValue (GlobalOrdinal GlobalRow,
3697  size_t VectorIndex,
3698  const impl_scalar_type& ScalarValue) const
3699  {
3700  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
3701  // touches the RCP's reference count, which isn't thread safe.
3702  const LocalOrdinal MyRow = this->map_->getLocalElement (GlobalRow);
3703 #ifdef HAVE_TPETRA_DEBUG
3704  TEUCHOS_TEST_FOR_EXCEPTION(
3705  MyRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
3706  std::runtime_error,
3707  "Tpetra::MultiVector::replaceGlobalValue: Global row index " << GlobalRow
3708  << "is not present on this process "
3709  << this->getMap ()->getComm ()->getRank () << ".");
3710  TEUCHOS_TEST_FOR_EXCEPTION(
3711  vectorIndexOutOfRange (VectorIndex), std::runtime_error,
3712  "Tpetra::MultiVector::replaceGlobalValue: Vector index " << VectorIndex
3713  << " of the multivector is invalid.");
3714 #endif // HAVE_TPETRA_DEBUG
3715  this->replaceLocalValue (MyRow, VectorIndex, ScalarValue);
3716  }
3717 
3718  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3719  void
3721  sumIntoGlobalValue (const GlobalOrdinal globalRow,
3722  const size_t col,
3723  const impl_scalar_type& value,
3724  const bool atomic) const
3725  {
3726  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
3727  // touches the RCP's reference count, which isn't thread safe.
3728  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
3729 #ifdef HAVE_TEUCHOS_DEBUG
3730  TEUCHOS_TEST_FOR_EXCEPTION(
3731  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
3732  std::runtime_error,
3733  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
3734  << "is not present on this process "
3735  << this->getMap ()->getComm ()->getRank () << ".");
3736  TEUCHOS_TEST_FOR_EXCEPTION(
3737  vectorIndexOutOfRange(col),
3738  std::runtime_error,
3739  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
3740  << " of the multivector is invalid.");
3741 #endif
3742  this->sumIntoLocalValue (lclRow, col, value, atomic);
3743  }
3744 
3745  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3746  template <class T>
3747  Teuchos::ArrayRCP<T>
3749  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
3750  size_t j) const
3751  {
3752  typedef Kokkos::DualView<impl_scalar_type*,
3753  typename dual_view_type::array_layout,
3754  execution_space> col_dual_view_type;
3755  const size_t col = isConstantStride () ? j : whichVectors_[j];
3756  col_dual_view_type X_col =
3757  Kokkos::subview (view_, Kokkos::ALL (), col);
3758  return Kokkos::Compat::persistingView (X_col.d_view);
3759  }
3760 
3761  template <class Scalar,
3762  class LocalOrdinal,
3763  class GlobalOrdinal,
3764  class Node,
3765  const bool classic>
3766  KokkosClassic::MultiVector<Scalar, Node>
3768  getLocalMV () const
3769  {
3770  using Teuchos::ArrayRCP;
3771  typedef KokkosClassic::MultiVector<Scalar, Node> KMV;
3772 
3773  // This method creates the KokkosClassic object on the fly. Thus,
3774  // it's OK to leave this in place for backwards compatibility.
3775  ArrayRCP<Scalar> data;
3776  if (getLocalLength () == 0 || getNumVectors () == 0) {
3777  data = Teuchos::null;
3778  }
3779  else {
3780  ArrayRCP<impl_scalar_type> dataTmp = (getLocalLength() > 0) ?
3781  Kokkos::Compat::persistingView (view_.d_view) :
3782  Teuchos::null;
3783  data = Teuchos::arcp_reinterpret_cast<Scalar> (dataTmp);
3784  }
3785  size_t stride[8];
3786  origView_.stride (stride);
3787  const size_t LDA =
3788  origView_.dimension_1 () > 1 ? stride[1] : origView_.dimension_0 ();
3789 
3790  KMV kmv (this->getMap ()->getNode ());
3791  kmv.initializeValues (getLocalLength (), getNumVectors (),
3792  data, LDA, getOrigNumLocalRows (),
3793  getOrigNumLocalCols ());
3794  return kmv;
3795  }
3796 
3797  template <class Scalar,
3798  class LocalOrdinal,
3799  class GlobalOrdinal,
3800  class Node,
3801  const bool classic>
3804  getDualView () const {
3805  return view_;
3806  }
3807 
3808  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3809  std::string
3812  {
3813  using std::endl;
3814  std::ostringstream oss;
3815  oss << Teuchos::typeName (*this) << " {"
3816  << "label: \"" << this->getObjectLabel () << "\""
3817  << ", numRows: " << getGlobalLength ()
3818  << ", numCols: " << getNumVectors ()
3819  << ", isConstantStride: " << isConstantStride ();
3820  if (isConstantStride ()) {
3821  oss << ", columnStride: " << getStride ();
3822  }
3823  oss << "}";
3824  return oss.str();
3825  }
3826 
3827  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3828  void
3830  describe (Teuchos::FancyOStream &out,
3831  const Teuchos::EVerbosityLevel verbLevel) const
3832  {
3833  using Teuchos::ArrayRCP;
3834  using Teuchos::RCP;
3835  using Teuchos::VERB_DEFAULT;
3836  using Teuchos::VERB_NONE;
3837  using Teuchos::VERB_LOW;
3838  using Teuchos::VERB_MEDIUM;
3839  using Teuchos::VERB_HIGH;
3840  using Teuchos::VERB_EXTREME;
3841  using std::endl;
3842  using std::setw;
3843 
3844  // Set default verbosity if applicable.
3845  const Teuchos::EVerbosityLevel vl =
3846  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3847 
3848  RCP<const Teuchos::Comm<int> > comm = this->getMap()->getComm();
3849  const int myImageID = comm->getRank();
3850  const int numImages = comm->getSize();
3851 
3852  if (vl != VERB_NONE) {
3853  // Don't set the tab level unless we're printing something.
3854  Teuchos::OSTab tab0 (out);
3855 
3856  if (myImageID == 0) { // >= VERB_LOW prints description()
3857  out << "Tpetra::MultiVector:" << endl;
3858  Teuchos::OSTab tab1 (out);
3859  out << "Template parameters:" << endl;
3860  {
3861  Teuchos::OSTab tab2 (out);
3862  out << "Scalar: " << Teuchos::TypeNameTraits<Scalar>::name () << endl
3863  << "LocalOrdinal: " << Teuchos::TypeNameTraits<LocalOrdinal>::name () << endl
3864  << "GlobalOrdinal: " << Teuchos::TypeNameTraits<GlobalOrdinal>::name () << endl
3865  << "Node: " << Teuchos::TypeNameTraits<Node>::name () << endl;
3866  }
3867  out << "label: \"" << this->getObjectLabel () << "\"" << endl
3868  << "numRows: " << getGlobalLength () << endl
3869  << "numCols: " << getNumVectors () << endl
3870  << "isConstantStride: " << isConstantStride () << endl;
3871  if (isConstantStride ()) {
3872  out << "columnStride: " << getStride () << endl;
3873  }
3874  }
3875  for (int imageCtr = 0; imageCtr < numImages; ++imageCtr) {
3876  if (myImageID == imageCtr) {
3877  if (vl != VERB_LOW) {
3878  // At verbosity > VERB_LOW, each process prints something.
3879  out << "Process " << myImageID << ":" << endl;
3880  Teuchos::OSTab tab2 (out);
3881 
3882  // >= VERB_MEDIUM: print the local vector length.
3883  out << "localNumRows: " << getLocalLength() << endl
3884  << "isConstantStride: " << isConstantStride () << endl;
3885  if (vl != VERB_MEDIUM) {
3886  // >= VERB_HIGH: print isConstantStride() and getStride()
3887  if (isConstantStride()) {
3888  out << "columnStride: " << getStride() << endl;
3889  }
3890  if (vl == VERB_EXTREME) {
3891  // VERB_EXTREME: print all the values in the multivector.
3892  out << "values: " << endl;
3893  typename dual_view_type::t_host X = this->getDualView ().h_view;
3894  out << "[";
3895  for (size_t i = 0; i < getLocalLength (); ++i) {
3896  for (size_t j = 0; j < getNumVectors (); ++j) {
3897  const size_t col = isConstantStride () ? j : whichVectors_[j];
3898  out << X(i,col);
3899  if (j + 1 < getNumVectors ()) {
3900  out << ", ";
3901  }
3902  } // for each column
3903  if (i + 1 < getLocalLength ()) {
3904  out << "; ";
3905  }
3906  } // for each row
3907  out << "]" << endl;
3908  } // if vl == VERB_EXTREME
3909  } // if (vl != VERB_MEDIUM)
3910  else { // vl == VERB_LOW
3911  out << endl;
3912  }
3913  } // if vl != VERB_LOW
3914  } // if it is my process' turn to print
3915  comm->barrier ();
3916  } // for each process in the communicator
3917  } // if vl != VERB_NONE
3918  }
3919 
3920 #if TPETRA_USE_KOKKOS_DISTOBJECT
3921  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3922  void
3924  createViews () const
3925  {
3926  // Do nothing in Kokkos::View implementation
3927  }
3928 
3929  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3930  void
3932  createViewsNonConst (KokkosClassic::ReadWriteOption rwo)
3933  {
3934  // Do nothing in Kokkos::View implementation
3935  }
3936 
3937  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3938  void
3940  releaseViews () const
3941  {
3942  // Do nothing in Kokkos::View implementation
3943  }
3944 
3945 #else // NOT TPETRA_USE_KOKKOS_DISTOBJECT
3946 
3947  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3948  void
3951  {}
3952 
3953  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3954  void
3956  createViewsNonConst (KokkosClassic::ReadWriteOption /* rwo */ )
3957  {}
3958 
3959  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3960  void
3963  {}
3964 
3965 #endif // TPETRA_USE_KOKKOS_DISTOBJECT
3966 
3967  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3968  void
3970  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
3971  {
3972  replaceMap (newMap);
3973  }
3974 
3975  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3976  void
3979  {
3980  using Kokkos::parallel_for;
3981  typedef LocalOrdinal LO;
3982  typedef device_type DT;
3983  typedef typename dual_view_type::host_mirror_space::device_type HMDT;
3984  const bool debug = false;
3985 
3986  TEUCHOS_TEST_FOR_EXCEPTION(
3987  this->getGlobalLength () != src.getGlobalLength () ||
3988  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
3989  "Tpetra::deep_copy: Global dimensions of the two Tpetra::MultiVector "
3990  "objects do not match. src has dimensions [" << src.getGlobalLength ()
3991  << "," << src.getNumVectors () << "], and *this has dimensions ["
3992  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
3993  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
3994  TEUCHOS_TEST_FOR_EXCEPTION(
3995  this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
3996  "Tpetra::deep_copy: The local row counts of the two Tpetra::MultiVector "
3997  "objects do not match. src has " << src.getLocalLength () << " row(s) "
3998  << " and *this has " << this->getLocalLength () << " row(s).");
3999 
4000  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4001  std::cout << "*** MultiVector::assign: ";
4002  }
4003 
4004  if (src.isConstantStride () && this->isConstantStride ()) {
4005  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4006  std::cout << "Both *this and src have constant stride" << std::endl;
4007  }
4008 
4009  if (src.getDualView ().modified_device() >= src.getDualView ().modified_host()) {
4010  // Device memory has the most recent version of src.
4011  this->template modify<DT> (); // We are about to modify dst on device.
4012  // Copy from src to dst on device.
4013  Details::localDeepCopyConstStride (this->getDualView ().template view<DT> (),
4014  src.getDualView ().template view<DT> ());
4015  this->template sync<HMDT> (); // Sync dst from device to host.
4016  }
4017  else { // Host memory has the most recent version of src.
4018  this->template modify<HMDT> (); // We are about to modify dst on host.
4019  // Copy from src to dst on host.
4020  Details::localDeepCopyConstStride (this->getDualView ().template view<HMDT> (),
4021  src.getDualView ().template view<HMDT> ());
4022  this->template sync<DT> (); // Sync dst from host to device.
4023  }
4024  }
4025  else {
4026  if (this->isConstantStride ()) {
4027  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4028  std::cout << "Only *this has constant stride";
4029  }
4030 
4031  const LO numWhichVecs = static_cast<LO> (src.whichVectors_.size ());
4032  const std::string whichVecsLabel ("MV::deep_copy::whichVecs");
4033 
4034  // We can't sync src, since it is only an input argument.
4035  // Thus, we have to use the most recently modified version of
4036  // src, device or host.
4037  if (src.getDualView ().modified_device() >= src.getDualView ().modified_host()) {
4038  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4039  std::cout << "; Copy from device version of src" << std::endl;
4040  }
4041  // Copy from the device version of src.
4042  //
4043  // whichVecs tells the kernel which vectors (columns) of src
4044  // to copy. Fill whichVecs on the host, and sync to device.
4045  typedef Kokkos::DualView<LO*, DT> whichvecs_type;
4046  whichvecs_type srcWhichVecs (whichVecsLabel, numWhichVecs);
4047  srcWhichVecs.template modify<HMDT> ();
4048  for (LO i = 0; i < numWhichVecs; ++i) {
4049  srcWhichVecs.h_view(i) = static_cast<LO> (src.whichVectors_[i]);
4050  }
4051  // Sync the host version of srcWhichVecs to the device.
4052  srcWhichVecs.template sync<DT> ();
4053 
4054  // Mark the device version of dst's DualView as modified.
4055  this->template modify<DT> ();
4056 
4057  // Copy from the selected vectors of src to dst, on the
4058  // device. The function ignores its dstWhichVecs argument
4059  // in this case.
4060  Details::localDeepCopy (this->getDualView ().template view<DT> (),
4061  src.getDualView ().template view<DT> (),
4062  true, false, srcWhichVecs.d_view, srcWhichVecs.d_view);
4063  // Sync *this' DualView to the host. This is cheaper than
4064  // repeating the above copy from src to *this on the host.
4065  this->template sync<HMDT> ();
4066  }
4067  else { // host version of src was the most recently modified
4068  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4069  std::cout << "; Copy from host version of src" << std::endl;
4070  }
4071  // Copy from the host version of src.
4072  //
4073  // whichVecs tells the kernel which vectors (columns) of src
4074  // to copy. Fill whichVecs on the host, and use it there.
4075  typedef Kokkos::View<LO*, HMDT> whichvecs_type;
4076  whichvecs_type srcWhichVecs (whichVecsLabel, numWhichVecs);
4077  for (LO i = 0; i < numWhichVecs; ++i) {
4078  srcWhichVecs(i) = static_cast<LO> (src.whichVectors_[i]);
4079  }
4080  // Copy from the selected vectors of src to dst, on the
4081  // host. The function ignores its dstWhichVecs argument in
4082  // this case.
4083  Details::localDeepCopy (this->getDualView ().template view<HMDT> (),
4084  src.getDualView ().template view<HMDT> (),
4085  true, false, srcWhichVecs, srcWhichVecs);
4086  // Sync dst back to the device, since we only copied on the host.
4087  this->template sync<DT> ();
4088  }
4089  }
4090  else { // dst is NOT constant stride
4091  if (src.isConstantStride ()) {
4092  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4093  std::cout << "Only src has constant stride" << std::endl;
4094  }
4095 
4096  if (src.getDualView ().modified_device() >= src.getDualView ().modified_host()) {
4097  // Copy from the device version of src.
4098  //
4099  // whichVecs tells the kernel which vectors (columns) of dst
4100  // to copy. Fill whichVecs on the host, and sync to device.
4101  typedef Kokkos::DualView<LO*, DT> whichvecs_type;
4102  const std::string whichVecsLabel ("MV::deep_copy::whichVecs");
4103  const LO numWhichVecs = static_cast<LO> (this->whichVectors_.size ());
4104  whichvecs_type whichVecs (whichVecsLabel, numWhichVecs);
4105  whichVecs.template modify<HMDT> ();
4106  for (LO i = 0; i < numWhichVecs; ++i) {
4107  whichVecs.h_view(i) = this->whichVectors_[i];
4108  }
4109  // Sync the host version of whichVecs to the device.
4110  whichVecs.template sync<DT> ();
4111 
4112  // Copy src to the selected vectors of dst, on the device.
4113  Details::localDeepCopy (this->getDualView ().template view<DT> (),
4114  src.getDualView ().template view<DT> (),
4115  this->isConstantStride (), src.isConstantStride (),
4116  whichVecs.d_view, whichVecs.d_view);
4117  // We can't sync src and repeat the above copy on the
4118  // host, so sync dst back to the host.
4119  //
4120  // FIXME (mfh 29 Jul 2014) This may overwrite columns that
4121  // don't actually belong to dst's view.
4122  this->template sync<HMDT> ();
4123  }
4124  else { // host version of src was the most recently modified
4125  // Copy from the host version of src.
4126  //
4127  // whichVecs tells the kernel which vectors (columns) of src
4128  // to copy. Fill whichVecs on the host, and use it there.
4129  typedef Kokkos::View<LO*, HMDT> whichvecs_type;
4130  const LO numWhichVecs = static_cast<LO> (this->whichVectors_.size ());
4131  whichvecs_type whichVecs ("MV::deep_copy::whichVecs", numWhichVecs);
4132  for (LO i = 0; i < numWhichVecs; ++i) {
4133  whichVecs(i) = static_cast<LO> (this->whichVectors_[i]);
4134  }
4135  // Copy from src to the selected vectors of dst, on the
4136  // host. The functor ignores its 4th arg in this case.
4137  Details::localDeepCopy (this->getDualView ().template view<HMDT> (),
4138  src.getDualView ().template view<HMDT> (),
4139  this->isConstantStride (), src.isConstantStride (),
4140  whichVecs, whichVecs);
4141  // Sync dst back to the device, since we only copied on the host.
4142  //
4143  // FIXME (mfh 29 Jul 2014) This may overwrite columns that
4144  // don't actually belong to dst's view.
4145  this->template sync<DT> ();
4146  }
4147  }
4148  else { // neither src nor dst have constant stride
4149  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4150  std::cout << "Neither *this nor src has constant stride" << std::endl;
4151  }
4152 
4153  if (src.getDualView ().modified_device() >= src.getDualView ().modified_host()) {
4154  // Copy from the device version of src.
4155  //
4156  // whichVectorsDst tells the kernel which vectors
4157  // (columns) of dst to copy. Fill it on the host, and
4158  // sync to device.
4159  const LO dstNumWhichVecs = static_cast<LO> (this->whichVectors_.size ());
4160  Kokkos::DualView<LO*, DT> whichVecsDst ("MV::deep_copy::whichVecsDst",
4161  dstNumWhichVecs);
4162  whichVecsDst.template modify<HMDT> ();
4163  for (LO i = 0; i < dstNumWhichVecs; ++i) {
4164  whichVecsDst.h_view(i) = static_cast<LO> (this->whichVectors_[i]);
4165  }
4166  // Sync the host version of whichVecsDst to the device.
4167  whichVecsDst.template sync<DT> ();
4168 
4169  // whichVectorsSrc tells the kernel which vectors
4170  // (columns) of src to copy. Fill it on the host, and
4171  // sync to device. Use the destination MultiVector's
4172  // LocalOrdinal type here.
4173  const LO srcNumWhichVecs = static_cast<LO> (src.whichVectors_.size ());
4174  Kokkos::DualView<LO*, DT> whichVecsSrc ("MV::deep_copy::whichVecsSrc",
4175  srcNumWhichVecs);
4176  whichVecsSrc.template modify<HMDT> ();
4177  for (LO i = 0; i < srcNumWhichVecs; ++i) {
4178  whichVecsSrc.h_view(i) = static_cast<LO> (src.whichVectors_[i]);
4179  }
4180  // Sync the host version of whichVecsSrc to the device.
4181  whichVecsSrc.template sync<DT> ();
4182 
4183  // Copy from the selected vectors of src to the selected
4184  // vectors of dst, on the device.
4185  Details::localDeepCopy (this->getDualView ().template view<DT> (),
4186  src.getDualView ().template view<DT> (),
4187  this->isConstantStride (), src.isConstantStride (),
4188  whichVecsDst.d_view, whichVecsSrc.d_view);
4189  }
4190  else {
4191  const LO dstNumWhichVecs = static_cast<LO> (this->whichVectors_.size ());
4192  Kokkos::View<LO*, HMDT> whichVectorsDst ("dstWhichVecs", dstNumWhichVecs);
4193  for (LO i = 0; i < dstNumWhichVecs; ++i) {
4194  whichVectorsDst(i) = this->whichVectors_[i];
4195  }
4196 
4197  // Use the destination MultiVector's LocalOrdinal type here.
4198  const LO srcNumWhichVecs = static_cast<LO> (src.whichVectors_.size ());
4199  Kokkos::View<LO*, HMDT> whichVectorsSrc ("srcWhichVecs", srcNumWhichVecs);
4200  for (LO i = 0; i < srcNumWhichVecs; ++i) {
4201  whichVectorsSrc(i) = src.whichVectors_[i];
4202  }
4203 
4204  // Copy from the selected vectors of src to the selected
4205  // vectors of dst, on the host.
4206  Details::localDeepCopy (this->getDualView ().template view<HMDT> (),
4207  src.getDualView ().template view<HMDT> (),
4208  this->isConstantStride (), src.isConstantStride (),
4209  whichVectorsDst, whichVectorsSrc);
4210 
4211  // We can't sync src and repeat the above copy on the
4212  // host, so sync dst back to the host.
4213  //
4214  // FIXME (mfh 29 Jul 2014) This may overwrite columns that
4215  // don't actually belong to dst's view.
4216  this->template sync<HMDT> ();
4217  }
4218  }
4219  }
4220  }
4221  }
4222 
4223  template <class Scalar, class LO, class GO, class NT, const bool classic>
4224  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT, classic> >
4225  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4226  size_t numVectors)
4227  {
4229  return Teuchos::rcp (new MV (map, numVectors));
4230  }
4231 
4232  template <class ST, class LO, class GO, class NT, const bool classic>
4235  {
4237  MV cpy (src.getMap (), src.getNumVectors (), false);
4238  cpy.assign (src);
4239  return cpy;
4240  }
4241 
4242 } // namespace Tpetra
4243 
4244 //
4245 // Explicit instantiation macro
4246 //
4247 // Must be expanded from within the Tpetra namespace!
4248 //
4249 
4250 #define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4251  template class MultiVector< SCALAR , LO , GO , NODE >; \
4252  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4253  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors);
4254 
4255 #endif // TPETRA_MULTIVECTOR_DEF_HPP
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
size_t getLocalLength() const
Local number of rows on the calling process.
EWhichNormImpl
Input argument for localNormImpl() (which see).
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector&#39;s local values.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
friend void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector&#39;s local values.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap)
Remove processes owning zero rows from the Map and their communicator.
MultiVector< ST, LO, GO, NT, classic > createCopy(const MultiVector< ST, LO, GO, NT, classic > &src)
Return a deep copy of the given MultiVector.
void normInf(const Kokkos::View< mag_type *, device_type > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a device view...
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this)
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
void replaceGlobalValue(GlobalOrdinal globalRow, size_t col, const impl_scalar_type &value) const
Replace value, using global (row) index.
One or more distributed dense vectors.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector&#39;s local values.
Node::device_type device_type
The Kokkos device type.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
size_t getOrigNumLocalCols() const
"Original" number of columns in the (local) data.
void randomize()
Set all values in the multivector to pseudorandom numbers.
void deep_copy(const LittleBlock< ST2, LO > &dst, const LittleBlock< ST1, LO > &src, typename std::enable_if< std::is_convertible< ST1, ST2 >::value &&!std::is_const< ST2 >::value, int >::type *=NULL)
Copy the LittleBlock src into the LittleBlock dst.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
virtual bool checkSizes(const SrcDistObject &sourceObj)
Whether data redistribution between sourceObj and this object is legal.
dual_view_type view_
The Kokkos::DualView containing the MultiVector&#39;s data.
Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
void norm1(const Kokkos::View< mag_type *, device_type > &norms) const
Compute the one-norm of each vector (column), storing the result in a device view.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
void reduce()
Sum values of a locally replicated multivector across all processes.
size_t global_size_t
Global size_t object.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
void norm2(const Kokkos::View< mag_type *, device_type > &norms) const
Compute the two-norm of each vector (column), storing the result in a device view.
Insert new values that don&#39;t currently exist.
void createViews() const
Hook for creating a const view.
dual_view_type getDualView() const
Get the Kokkos::DualView which implements local storage.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object with the given verbosity level to a FancyOStream.
void GEMM(const char transA[], const char transB[], const CoefficientType &alpha, const ViewType1 &A, const ViewType2 &B, const CoefficientType &beta, const ViewType3 &C)
Small dense matrix-matrix multiply: C := alpha*A*B + beta*C
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
void sumIntoGlobalValue(const GlobalOrdinal globalRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault) const
Add value to existing value, using global (row) index.
Node::execution_space execution_space
Type of the (new) Kokkos execution space.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > & operator=(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &source)
Assign the contents of source to this multivector (deep copy).
Teuchos::RCP< const map_type > map_
The Map over which this object is distributed.
void createViewsNonConst(KokkosClassic::ReadWriteOption rwo)
Hook for creating a nonconst view.
bool isDistributed() const
Whether this is a globally distributed object.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
Replace old value with maximum of magnitudes of old and new values.
Abstract base class for objects that can be the source of an Import or Export operation.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector&#39;s local values.
Replace existing values with new values.
size_t getStride() const
Stride between columns in the multivector.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,j).
KokkosClassic::MultiVector< Scalar, Node > getLocalMV() const
A view of the underlying KokkosClassic::MultiVector object.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
virtual ~MultiVector()
Destructor (virtual for memory safety of derived classes).
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
void releaseViews() const
Hook for releasing views.
Kokkos::Details::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, typename execution_space::execution_space > dual_view_type
Kokkos::DualView specialization used by this class.
Describes a parallel distribution of objects over processes.
void TPETRA_DEPRECATED normWeighted(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &weights, const Teuchos::ArrayView< mag_type > &norms) const
Compute Weighted 2-norm (RMS Norm) of each column.
Class that provides GEMM for a particular Kokkos Device.
A distributed dense vector.
Stand-alone utility functions and macros.
virtual std::string description() const
A simple one-line description of this object.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &src)
Copy the contents of src into *this (deep copy).
size_t getNumVectors() const
Number of columns in the multivector.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
EWhichNorm
Input argument for normImpl() (which see).
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
virtual size_t constantNumberOfPackets() const
Number of packets to send per LID.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
void normImpl(const Kokkos::View< mag_type *, device_type > &norms, const EWhichNorm whichNorm) const
Compute the norm of each vector (column), storing the result in a device View.
dual_view_type origView_
The "original view" of the MultiVector&#39;s data.
void sumIntoLocalValue(const LocalOrdinal localRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault) const
Add value to existing value, using local (row) index.
void replaceLocalValue(LocalOrdinal localRow, size_t col, const impl_scalar_type &value) const
Replace value, using local (row) index.