46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR 51 #include <Teuchos_ScalarTraits.hpp> 52 #include <Teuchos_SerialDenseMatrix.hpp> 53 #include <Teuchos_SerialQRDenseSolver.hpp> 57 #include "MueLu_Aggregates_kokkos.hpp" 58 #include "MueLu_AmalgamationFactory.hpp" 59 #include "MueLu_AmalgamationInfo.hpp" 60 #include "MueLu_CoarseMapFactory_kokkos.hpp" 61 #include "MueLu_NullspaceFactory_kokkos.hpp" 62 #include "MueLu_PerfUtils.hpp" 64 #include "MueLu_Utilities_kokkos.hpp" 70 template<
class LocalOrdinal,
class RowType>
73 ScanFunctor(RowType rows) : rows_(rows) { }
75 KOKKOS_INLINE_FUNCTION
76 void operator()(
const LocalOrdinal i, LocalOrdinal& upd,
const bool&
final)
const {
88 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
89 RCP<const ParameterList> TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList()
const {
90 RCP<ParameterList> validParamList = rcp(
new ParameterList());
92 validParamList->set< RCP<const FactoryBase> >(
"A", Teuchos::null,
"Generating factory of the matrix A");
93 validParamList->set< RCP<const FactoryBase> >(
"Aggregates", Teuchos::null,
"Generating factory of the aggregates");
94 validParamList->set< RCP<const FactoryBase> >(
"Nullspace", Teuchos::null,
"Generating factory of the nullspace");
95 validParamList->set< RCP<const FactoryBase> >(
"UnAmalgamationInfo", Teuchos::null,
"Generating factory of UnAmalgamationInfo");
96 validParamList->set< RCP<const FactoryBase> >(
"CoarseMap", Teuchos::null,
"Generating factory of the coarse map");
98 return validParamList;
101 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
102 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level& fineLevel, Level& coarseLevel)
const {
103 Input(fineLevel,
"A");
104 Input(fineLevel,
"Aggregates");
105 Input(fineLevel,
"Nullspace");
106 Input(fineLevel,
"UnAmalgamationInfo");
107 Input(fineLevel,
"CoarseMap");
110 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
111 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::Build(Level& fineLevel, Level& coarseLevel)
const {
112 return BuildP(fineLevel, coarseLevel);
115 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
116 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel)
const {
117 FactoryMonitor m(*
this,
"Build", coarseLevel);
119 auto A = Get< RCP<Matrix> > (fineLevel,
"A");
120 auto aggregates = Get< RCP<Aggregates_kokkos> >(fineLevel,
"Aggregates");
121 auto amalgInfo = Get< RCP<AmalgamationInfo> > (fineLevel,
"UnAmalgamationInfo");
122 auto fineNullspace = Get< RCP<MultiVector> > (fineLevel,
"Nullspace");
123 auto coarseMap = Get< RCP<const Map> > (fineLevel,
"CoarseMap");
125 RCP<Matrix> Ptentative;
126 RCP<MultiVector> coarseNullspace;
127 if (!aggregates->AggregatesCrossProcessors())
128 BuildPuncoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
130 BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
140 if (A->IsView(
"stridedMaps") ==
true)
141 Ptentative->CreateView(
"stridedMaps", A->getRowMap(
"stridedMaps"), coarseMap);
143 Ptentative->CreateView(
"stridedMaps", Ptentative->getRangeMap(), coarseMap);
145 Set(coarseLevel,
"Nullspace", coarseNullspace);
146 Set(coarseLevel,
"P", Ptentative);
149 RCP<ParameterList> params = rcp(
new ParameterList());
150 params->set(
"printLoadBalancingInfo",
true);
155 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
156 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
157 BuildPuncoupled(RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
158 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace)
const {
159 RCP<const Map> rowMap = A->getRowMap();
160 RCP<const Map> colMap = A->getColMap();
162 const size_t numRows = rowMap->getNodeNumElements();
163 const size_t NSDim = fineNullspace->getNumVectors();
165 typedef Teuchos::ScalarTraits<SC> STS;
166 const SC zero = STS::zero();
167 const SC one = STS::one();
168 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
170 auto aggGraph = aggregates->GetGraph();
171 auto aggRows = aggGraph.row_map;
172 auto aggCols = aggGraph.entries;
173 const GO numAggs = aggregates->GetNumAggregates();
178 bool goodMap = isGoodMap(*rowMap, *colMap);
180 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
"For now, need matching row and col maps");
181 TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() != 1, Exceptions::RuntimeError,
"For now, only block size 1");
187 for (LO i = 0; i < aggCols.size(); i++)
188 agg2RowMapLO(i) = aggCols(i);
190 for (LO i = 0; i < aggCols.size(); i++)
191 for (LO k = 0; k < NSDim; k++) {
193 agg2RowMapLO(i*NSDim+k) = aggCols(i)*NSDim+k;
197 ArrayRCP<LO> aggStart;
198 ArrayRCP<LO> aggToRowMapLO;
199 ArrayRCP<GO> aggToRowMapGO;
201 amalgInfo->UnamalgamateAggregatesLO(*aggregates, aggStart, aggToRowMapLO);
202 GetOStream(
Runtime1) <<
"Column map is consistent with the row map, good." << std::endl;
205 amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMapGO);
206 GetOStream(
Warnings0) <<
"Column map is not consistent with the row map\n" 207 <<
"using GO->LO conversion with performance penalty" << std::endl;
227 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
230 auto fineNS = fineNullspace ->template getLocalView<DeviceType>();
231 auto coarseNS = coarseNullspace->template getLocalView<DeviceType>();
233 size_t nnzEstimate = numRows * NSDim, nnz = 0;
235 typedef typename Matrix::local_matrix_type local_matrix_type;
236 typedef typename local_matrix_type::row_map_type rows_type;
237 typedef typename local_matrix_type::index_type cols_type;
238 typedef typename local_matrix_type::values_type vals_type;
243 typename rows_type::non_const_type rowsAux(
"Ptent_aux_rows", numRows+1), rows(
"Ptent_rows", numRows+1);
244 typename cols_type::non_const_type colsAux(
"Ptent_aux_cols", nnzEstimate);
245 typename vals_type::non_const_type valsAux(
"Ptent_aux_vals", nnzEstimate);
247 Kokkos::parallel_for(
"TentativePF:BuildPuncoupled:for1", numRows+1, KOKKOS_LAMBDA(
const LO row) {
248 rowsAux(row) = row*NSDim;
250 Kokkos::parallel_for(
"TentativePF:BuildUncoupled:for2", nnzEstimate, KOKKOS_LAMBDA(
const LO j) {
251 colsAux(j) = INVALID;
256 status_type status(
"status");
261 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
262 typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
266 Kokkos::parallel_reduce(
"TentativePF:BuildUncoupled:main_loop", numAggs, KOKKOS_LAMBDA(
const GO agg,
size_t& rowNnz) {
267 LO aggSize = aggRows(agg+1) - aggRows(agg);
274 typedef Kokkos::ArithTraits<SC> ATS;
275 typedef typename ATS::magnitudeType Magnitude;
277 Magnitude norm = ATS::magnitude(zero);
278 for (
size_t k = 0; k < aggSize; k++) {
279 Magnitude dnorm = ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
286 statusAtomic(1) =
true;
291 coarseNS(agg, 0) = norm;
294 for (LO k = 0; k < aggSize; k++) {
295 LO localRow = agg2RowMapLO(aggRows(agg)+k);
296 SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
298 size_t rowStart = rowsAux(localRow);
299 colsAux(rowStart) = agg;
300 valsAux(rowStart) = localVal;
303 rows(localRow+1) = 1;
310 statusAtomic(0) =
true;
315 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
316 for (
int i = 0; i < statusHost.size(); i++)
318 std::ostringstream oss;
319 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
321 case 0: oss <<
"!goodMap is not implemented";
break;
322 case 1: oss <<
"fine level NS part has a zero column";
break;
324 throw Exceptions::RuntimeError(oss.str());
328 throw Exceptions::RuntimeError(
"Ignore NSDim > 1 for now");
330 Kokkos::parallel_reduce(
"TentativePF:BuildUncoupled:main_loop", numAggs, KOKKOS_LAMBDA(
const GO agg,
size_t& nnz) {
331 LO aggSize = aggRows(agg+1) - aggRows(agg);
333 Xpetra::global_size_t offset = agg*NSDim;
342 for (
size_t j = 0; j < NSDim; j++)
343 for (LO k = 0; k < aggSize; k++)
344 localQR(k,j) = fineNSRandom(agg2RowMapLO(aggRows(agg)+k), j);
346 statusAtomic(0) =
true;
349 for (
size_t j = 0; j < NSDim; j++)
350 for (LO k = 0; k < aggSize; k++)
352 localQR(k,j) = fineNS(rowMap->getLocalElement(aggToRowMapGO(aggStart(agg)+k)), j);
357 for (
size_t j = 0; j < NSDim; j++) {
358 bool bIsZeroNSColumn =
true;
360 for (LO k = 0; k < aggSize; k++)
361 if (localQR(k,j) != zero)
362 bIsZeroNSColumn =
false;
364 if (bIsZeroNSColumn) {
365 statusAtomic(1) =
true;
372 if (aggSize >= NSDim) {
376 typedef Kokkos::ArithTraits<SC> ATS;
377 typedef typename ATS::magnitudeType Magnitude;
379 Magnitude norm = ATS::magnitude(zero);
380 for (
size_t k = 0; k < Teuchos::as<size_t>(aggSize); k++)
381 norm += ATS::magnitude(localQR(k,0)*localQR(k,0));
382 norm = Kokkos::ArithTraits<Magnitude>::squareroot(norm);
385 coarseNS(offset, 0) = norm;
388 for (LO i = 0; i < aggSize; i++)
389 localQR(i,0) /= norm;
393 statusAtomic(2) =
true;
397 Teuchos::SerialQRDenseSolver<LO,SC> qrSolver;
398 qrSolver.setMatrix(Teuchos::rcp(&localQR,
false));
402 for (
size_t j = 0; j < NSDim; j++)
403 for (
size_t k = 0; k <= j; k++)
404 coarseNS(offset+k,j) = localQR(k,j);
409 Teuchos::RCP<Teuchos::SerialDenseMatrix<LO,SC> > qFactor = qrSolver.getQ();
410 for (
size_t j = 0; j < NSDim; j++)
411 for (
size_t i = 0; i < Teuchos::as<size_t>(aggSize); i++)
412 localQR(i,j) = (*qFactor)(i,j);
417 statusAtomic(3) =
true;
447 for (
size_t j = 0; j < NSDim; j++)
448 for (
size_t k = 0; k < NSDim; k++)
449 if (k < as<size_t>(aggSize))
450 coarseNS[j][offset+k] = localQR(k,j);
452 coarseNS[j][offset+k] = (k == j ? one : zero);
455 for (
size_t i = 0; i < as<size_t>(aggSize); i++)
456 for (
size_t j = 0; j < NSDim; j++)
457 localQR(i,j) = (j == i ? one : zero);
463 for (LO j = 0; j < aggSize; j++) {
465 LO localRow = (goodMap ? agg2RowMapLO(aggRows(agg)+j) : -1);
467 LO localRow = (goodMap ? agg2RowMapLO[aggRows(agg)+j] : rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j]));
470 size_t rowStart = rowsAux(localRow), lnnz = 0;
471 for (
size_t k = 0; k < NSDim; k++) {
473 if (localQR(j,k) != zero) {
474 colsAux(rowStart+lnnz) = offset + k;
475 valsAux(rowStart+lnnz) = localQR(j,k);
480 rows(localRow+1) = lnnz;
484 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
485 for (
int i = 0; i < statusHost.size(); i++)
487 std::ostringstream oss;
488 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
490 case 0: oss <<
"!goodMap is not implemented";
491 case 1: oss <<
"fine level NS part has a zero column";
492 case 2: oss <<
"NSDim > 1 is not implemented";
493 case 3: oss <<
"aggSize < NSDim is not imlemented";
495 throw Exceptions::RuntimeError(oss.str());
501 ScanFunctor<LO,decltype(rows)> scanFunctor(rows);
502 Kokkos::parallel_scan(
"TentativePF:Build:compress_rows", numRows+1, scanFunctor);
505 typename cols_type::non_const_type cols(
"Ptent_cols", nnz);
506 typename vals_type::non_const_type vals(
"Ptent_vals", nnz);
507 Kokkos::parallel_for(
"TentativePF:Build:compress_cols_vals", numRows, KOKKOS_LAMBDA(
const LO i) {
508 LO rowStart = rows(i);
511 for (LO j = rowsAux(i); j < rowsAux(i+1); j++)
512 if (valsAux(j) != INVALID) {
513 cols(rowStart+lnnz) = colsAux(j);
514 vals(rowStart+lnnz) = valsAux(j);
519 GetOStream(
Runtime1) <<
"TentativePFactory : aggregates do not cross process boundaries" << std::endl;
525 Ptentative = rcp(
new CrsMatrixWrap(rowMap, coarseMap, 0, Xpetra::StaticProfile));
526 RCP<CrsMatrix> PtentCrs = rcp_dynamic_cast<CrsMatrixWrap>(Ptentative)->getCrsMatrix();
528 ArrayRCP<size_t> iaPtent;
529 ArrayRCP<LO> jaPtent;
530 ArrayRCP<SC> valPtent;
532 PtentCrs->allocateAllValues(nnz, iaPtent, jaPtent, valPtent);
534 ArrayView<size_t> ia = iaPtent();
535 ArrayView<LO> ja = jaPtent();
536 ArrayView<SC> val = valPtent();
539 typename rows_type::HostMirror rowsHost = Kokkos::create_mirror_view(rows);
540 typename cols_type::HostMirror colsHost = Kokkos::create_mirror_view(cols);
541 typename vals_type::HostMirror valsHost = Kokkos::create_mirror_view(vals);
542 for (LO i = 0; i < rowsHost.size(); i++)
544 for (LO j = 0; j < colsHost.size(); j++) {
545 ja [j] = colsHost(j);
546 val[j] = valsHost(j);
549 PtentCrs->setAllValues(iaPtent, jaPtent, valPtent);
550 PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap());
553 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
554 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
555 BuildPcoupled(RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
556 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace)
const {
557 throw Exceptions::RuntimeError(
"Construction of coupled tentative P is not implemented");
560 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
561 bool TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::isGoodMap(
const Map& rowMap,
const Map& colMap)
const {
562 ArrayView<const GO> rowElements = rowMap.getNodeElementList();
563 ArrayView<const GO> colElements = colMap.getNodeElementList();
565 const size_t numElements = rowElements.size();
568 for (
size_t i = 0; i < numElements; i++)
569 if (rowElements[i] != colElements[i]) {
579 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT 580 #endif // HAVE_MUELU_KOKKOS_REFACTOR 581 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP Important warning messages (one line)
Namespace for MueLu classes and methods.
void parallel_reduce(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< !Impl::is_integral< ExecPolicy >::value >::type *=0)
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< !Impl::is_integral< ExecPolicy >::value >::type *=0)
Description of what is happening (more verbose)