49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_ 50 #define _ZOLTAN2_ALGMultiJagged_HPP_ 57 #include <Tpetra_Distributor.hpp> 58 #include <Teuchos_ParameterList.hpp> 65 #if defined(__cplusplus) && __cplusplus >= 201103L 66 #include <unordered_map> 68 #include <Teuchos_Hashtable.hpp> 69 #endif // C++11 is enabled 71 #ifdef ZOLTAN2_USEZOLTANCOMM 72 #ifdef HAVE_ZOLTAN2_MPI 73 #define ENABLE_ZOLTAN_MIGRATION 74 #include "zoltan_comm_cpp.h" 75 #include "zoltan_types.h" 79 #ifdef HAVE_ZOLTAN2_OMP 83 #define LEAST_SIGNIFICANCE 0.0001 84 #define SIGNIFICANCE_MUL 1000 89 #define FUTURE_REDUCEALL_CUTOFF 1500000 92 #define MIN_WORK_LAST_DIM 1000 97 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x)) 99 #define imbalanceOf(Wachieved, totalW, expectedRatio) \ 100 (Wachieved) / ((totalW) * (expectedRatio)) - 1 101 #define imbalanceOf2(Wachieved, wExpected) \ 102 (Wachieved) / (wExpected) - 1 105 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp; 114 template <
typename Ordinal,
typename T>
133 size(s_), _EPSILON (
std::numeric_limits<T>::
epsilon()){}
137 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const 139 for (Ordinal i=0; i < count; i++){
140 if (
Z2_ABS(inBuffer[i]) > _EPSILON){
141 inoutBuffer[i] = inBuffer[i];
153 template <
typename T>
158 throw "cannot allocate memory";
170 template <
typename T>
181 template <
typename tt>
183 std::stringstream ss (std::stringstream::in |std::stringstream::out);
185 std::string tmp =
"";
197 template <
typename IT,
typename CT,
typename WT>
216 this->index = index_;
217 this->count = count_;
223 this->index = other.
index;
224 this->count = other.
count;
225 this->val = other.
val;
233 void set(IT index_ ,CT count_, WT *vals_){
234 this->index = index_;
235 this->count = count_;
241 this->index = other.
index;
242 this->count = other.
count;
243 this->val = other.
val;
247 bool operator<(const uMultiSortItem<IT,CT,WT>& other)
const{
248 assert (this->count == other.count);
249 for(CT i = 0; i < this->count; ++i){
251 if (
ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
255 if(this->val[i] < other.val[i]){
264 return this->index < other.index;
267 assert (this->count == other.
count);
268 for(CT i = 0; i < this->count; ++i){
274 if(this->val[i] > other.
val[i]){
284 return this->index > other.
index;
291 template <
class IT,
class WT>
302 template <
class IT,
class WT>
308 IT i, ir=n, j, k, l=1;
309 IT jstack=0, istack[50];
318 for (j=l+1;j<=ir;j++)
324 if (arr[i].val <= aval)
340 if (arr[l+1].val > arr[ir].val)
344 if (arr[l].val > arr[ir].val)
348 if (arr[l+1].val > arr[l].val)
358 do i++;
while (arr[i].val < aval);
359 do j--;
while (arr[j].val > aval);
366 if (jstack > NSTACK){
367 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
386 template <
class IT,
class WT,
class SIGN>
393 bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs)
const {
395 if (this->signbit < rhs.signbit){
399 else if (this->signbit == rhs.signbit){
401 if (this->val < rhs.val){
402 return this->signbit;
405 else if (this->val > rhs.val){
406 return !this->signbit;
421 if (this->signbit > rhs.
signbit){
425 else if (this->signbit == rhs.
signbit){
427 if (this->val < rhs.
val){
428 return !this->signbit;
431 else if (this->val > rhs.
val){
432 return this->signbit;
444 bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs){
445 return !(*
this > rhs);}
447 return !(*
this < rhs);}
453 template <
class IT,
class WT,
class SIGN>
458 IT i, ir=n, j, k, l=1;
459 IT jstack=0, istack[50];
467 for (j=l+1;j<=ir;j++)
489 if (arr[l+1] > arr[ir])
493 if (arr[l] > arr[ir])
497 if (arr[l+1] > arr[l])
506 do i++;
while (arr[i] < a);
507 do j--;
while (arr[j] > a);
514 if (jstack > NSTACK){
515 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
537 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
543 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
545 RCP<const Environment> mj_env;
546 RCP<Comm<int> > mj_problemComm;
548 double imbalance_tolerance;
549 mj_part_t *part_no_array;
551 int coord_dim, num_weights_per_coord;
553 size_t initial_num_loc_coords;
556 mj_lno_t num_local_coords;
557 mj_gno_t num_global_coords;
559 mj_scalar_t **mj_coordinates;
560 mj_scalar_t **mj_weights;
561 bool *mj_uniform_parts;
562 mj_scalar_t **mj_part_sizes;
563 bool *mj_uniform_weights;
565 ArrayView<const mj_gno_t> mj_gnos;
566 size_t num_global_parts;
568 mj_gno_t *initial_mj_gnos;
569 mj_gno_t *current_mj_gnos;
570 int *owner_of_coordinate;
572 mj_lno_t *coordinate_permutations;
573 mj_lno_t *new_coordinate_permutations;
574 mj_part_t *assigned_part_ids;
577 mj_lno_t *new_part_xadj;
580 bool distribute_points_on_cut_lines;
581 mj_part_t max_concurrent_part_calculation;
584 int mj_user_recursion_depth;
585 int mj_keep_part_boxes;
587 int check_migrate_avoid_migration_option;
588 mj_scalar_t minimum_migration_imbalance;
591 mj_part_t total_num_cut ;
592 mj_part_t total_num_part;
594 mj_part_t max_num_part_along_dim ;
595 mj_part_t max_num_cut_along_dim;
596 size_t max_num_total_part_along_dim;
598 mj_part_t total_dim_num_reduce_all;
599 mj_part_t last_dim_num_part;
603 RCP<Comm<int> > comm;
605 mj_scalar_t sEpsilon;
607 mj_scalar_t maxScalar_t;
608 mj_scalar_t minScalar_t;
610 mj_scalar_t *all_cut_coordinates;
611 mj_scalar_t *max_min_coords;
612 mj_scalar_t *process_cut_line_weight_to_put_left;
613 mj_scalar_t **thread_cut_line_weight_to_put_left;
619 mj_scalar_t *cut_coordinates_work_array;
622 mj_scalar_t *target_part_weights;
624 mj_scalar_t *cut_upper_bound_coordinates ;
625 mj_scalar_t *cut_lower_bound_coordinates ;
626 mj_scalar_t *cut_lower_bound_weights ;
627 mj_scalar_t *cut_upper_bound_weights ;
629 mj_scalar_t *process_local_min_max_coord_total_weight ;
630 mj_scalar_t *global_min_max_coord_total_weight ;
634 bool *is_cut_line_determined;
637 mj_part_t *my_incomplete_cut_count;
639 double **thread_part_weights;
641 double **thread_part_weight_work;
644 mj_scalar_t **thread_cut_left_closest_point;
646 mj_scalar_t **thread_cut_right_closest_point;
649 mj_lno_t **thread_point_counts;
651 mj_scalar_t *process_rectilinear_cut_weight;
652 mj_scalar_t *global_rectilinear_cut_weight;
658 mj_scalar_t *total_part_weight_left_right_closests ;
659 mj_scalar_t *global_total_part_weight_left_right_closests;
661 RCP<mj_partBoxVector_t> kept_boxes;
664 RCP<mj_partBox_t> global_box;
665 int myRank, myActualRank;
674 void set_part_specifications();
681 inline mj_part_t get_part_count(
682 mj_part_t num_total_future,
688 void allocate_set_work_memory();
695 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
698 void compute_global_box();
714 mj_part_t update_part_num_arrays(
715 std::vector<mj_part_t> &num_partitioning_in_current_dim,
716 std::vector<mj_part_t> *future_num_part_in_parts,
717 std::vector<mj_part_t> *next_future_num_parts_in_parts,
718 mj_part_t &future_num_parts,
719 mj_part_t current_num_parts,
720 int current_iteration,
721 RCP<mj_partBoxVector_t> input_part_boxes,
722 RCP<mj_partBoxVector_t> output_part_boxes);
735 void mj_get_local_min_max_coord_totW(
736 mj_lno_t coordinate_begin_index,
737 mj_lno_t coordinate_end_index,
738 mj_lno_t *mj_current_coordinate_permutations,
739 mj_scalar_t *mj_current_dim_coords,
740 mj_scalar_t &min_coordinate,
741 mj_scalar_t &max_coordinate,
742 mj_scalar_t &total_weight);
751 void mj_get_global_min_max_coord_totW(
752 mj_part_t current_concurrent_num_parts,
753 mj_scalar_t *local_min_max_total,
754 mj_scalar_t *global_min_max_total);
774 void mj_get_initial_cut_coords_target_weights(
775 mj_scalar_t min_coord,
776 mj_scalar_t max_coord,
778 mj_scalar_t global_weight,
779 mj_scalar_t *initial_cut_coords ,
780 mj_scalar_t *target_part_weights ,
782 std::vector <mj_part_t> *future_num_part_in_parts,
783 std::vector <mj_part_t> *next_future_num_parts_in_parts,
784 mj_part_t concurrent_current_part,
785 mj_part_t obtained_part_index);
799 void set_initial_coordinate_parts(
800 mj_scalar_t &max_coordinate,
801 mj_scalar_t &min_coordinate,
802 mj_part_t &concurrent_current_part_index,
803 mj_lno_t coordinate_begin_index,
804 mj_lno_t coordinate_end_index,
805 mj_lno_t *mj_current_coordinate_permutations,
806 mj_scalar_t *mj_current_dim_coords,
807 mj_part_t *mj_part_ids,
808 mj_part_t &partition_count);
821 mj_scalar_t *mj_current_dim_coords,
822 mj_scalar_t imbalanceTolerance,
823 mj_part_t current_work_part,
824 mj_part_t current_concurrent_num_parts,
825 mj_scalar_t *current_cut_coordinates,
826 mj_part_t total_incomplete_cut_count,
827 std::vector <mj_part_t> &num_partitioning_in_current_dim);
848 void mj_1D_part_get_thread_part_weights(
849 size_t total_part_count,
851 mj_scalar_t max_coord,
852 mj_scalar_t min_coord,
853 mj_lno_t coordinate_begin_index,
854 mj_lno_t coordinate_end_index,
855 mj_scalar_t *mj_current_dim_coords,
856 mj_scalar_t *temp_current_cut_coords,
857 bool *current_cut_status,
858 double *my_current_part_weights,
859 mj_scalar_t *my_current_left_closest,
860 mj_scalar_t *my_current_right_closest);
869 void mj_accumulate_thread_results(
870 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
871 mj_part_t current_work_part,
872 mj_part_t current_concurrent_num_parts);
904 void mj_get_new_cut_coordinates(
905 const size_t &num_total_part,
906 const mj_part_t &num_cuts,
907 const mj_scalar_t &max_coordinate,
908 const mj_scalar_t &min_coordinate,
909 const mj_scalar_t &global_total_weight,
910 const mj_scalar_t &used_imbalance_tolerance,
911 mj_scalar_t * current_global_part_weights,
912 const mj_scalar_t * current_local_part_weights,
913 const mj_scalar_t *current_part_target_weights,
914 bool *current_cut_line_determined,
915 mj_scalar_t *current_cut_coordinates,
916 mj_scalar_t *current_cut_upper_bounds,
917 mj_scalar_t *current_cut_lower_bounds,
918 mj_scalar_t *current_global_left_closest_points,
919 mj_scalar_t *current_global_right_closest_points,
920 mj_scalar_t * current_cut_lower_bound_weights,
921 mj_scalar_t * current_cut_upper_weights,
922 mj_scalar_t *new_current_cut_coordinates,
923 mj_scalar_t *current_part_cut_line_weight_to_put_left,
924 mj_part_t *rectilinear_cut_count,
925 mj_part_t &my_num_incomplete_cut);
936 void mj_calculate_new_cut_position (
937 mj_scalar_t cut_upper_bound,
938 mj_scalar_t cut_lower_bound,
939 mj_scalar_t cut_upper_weight,
940 mj_scalar_t cut_lower_weight,
941 mj_scalar_t expected_weight,
942 mj_scalar_t &new_cut_position);
954 void mj_create_new_partitions(
956 mj_scalar_t *mj_current_dim_coords,
957 mj_scalar_t *current_concurrent_cut_coordinate,
958 mj_lno_t coordinate_begin,
959 mj_lno_t coordinate_end,
960 mj_scalar_t *used_local_cut_line_weight_to_left,
961 double **used_thread_part_weight_work,
962 mj_lno_t *out_part_xadj);
986 bool mj_perform_migration(
987 mj_part_t in_num_parts,
988 mj_part_t &out_num_parts,
989 std::vector<mj_part_t> *next_future_num_parts_in_parts,
990 mj_part_t &output_part_begin_index,
991 size_t migration_reduce_all_population,
992 mj_lno_t num_coords_for_last_dim_part,
993 std::string iteration,
994 RCP<mj_partBoxVector_t> &input_part_boxes,
995 RCP<mj_partBoxVector_t> &output_part_boxes);
1006 void get_processor_num_points_in_parts(
1007 mj_part_t num_procs,
1008 mj_part_t num_parts,
1009 mj_gno_t *&num_points_in_all_processor_parts);
1023 bool mj_check_to_migrate(
1024 size_t migration_reduce_all_population,
1025 mj_lno_t num_coords_for_last_dim_part,
1026 mj_part_t num_procs,
1027 mj_part_t num_parts,
1028 mj_gno_t *num_points_in_all_processor_parts);
1048 void mj_migration_part_proc_assignment(
1049 mj_gno_t * num_points_in_all_processor_parts,
1050 mj_part_t num_parts,
1051 mj_part_t num_procs,
1052 mj_lno_t *send_count_to_each_proc,
1053 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1054 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1055 mj_part_t &out_num_part,
1056 std::vector<mj_part_t> &out_part_indices,
1057 mj_part_t &output_part_numbering_begin_index,
1058 int *coordinate_destinations);
1076 void mj_assign_proc_to_parts(
1077 mj_gno_t * num_points_in_all_processor_parts,
1078 mj_part_t num_parts,
1079 mj_part_t num_procs,
1080 mj_lno_t *send_count_to_each_proc,
1081 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1082 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1083 mj_part_t &out_part_index,
1084 mj_part_t &output_part_numbering_begin_index,
1085 int *coordinate_destinations);
1097 void assign_send_destinations(
1098 mj_part_t num_parts,
1099 mj_part_t *part_assignment_proc_begin_indices,
1100 mj_part_t *processor_chains_in_parts,
1101 mj_lno_t *send_count_to_each_proc,
1102 int *coordinate_destinations);
1116 void assign_send_destinations2(
1117 mj_part_t num_parts,
1119 int *coordinate_destinations,
1120 mj_part_t &output_part_numbering_begin_index,
1121 std::vector<mj_part_t> *next_future_num_parts_in_parts);
1139 void mj_assign_parts_to_procs(
1140 mj_gno_t * num_points_in_all_processor_parts,
1141 mj_part_t num_parts,
1142 mj_part_t num_procs,
1143 mj_lno_t *send_count_to_each_proc,
1144 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1145 mj_part_t &out_num_part,
1146 std::vector<mj_part_t> &out_part_indices,
1147 mj_part_t &output_part_numbering_begin_index,
1148 int *coordinate_destinations);
1162 void mj_migrate_coords(
1163 mj_part_t num_procs,
1164 mj_lno_t &num_new_local_points,
1165 std::string iteration,
1166 int *coordinate_destinations,
1167 mj_part_t num_parts);
1175 void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1183 void fill_permutation_array(
1184 mj_part_t output_num_parts,
1185 mj_part_t num_parts);
1195 void set_final_parts(
1196 mj_part_t current_num_parts,
1197 mj_part_t output_part_begin_index,
1198 RCP<mj_partBoxVector_t> &output_part_boxes,
1199 bool is_data_ever_migrated);
1202 void free_work_memory();
1216 void create_consistent_chunks(
1217 mj_part_t num_parts,
1218 mj_scalar_t *mj_current_dim_coords,
1219 mj_scalar_t *current_concurrent_cut_coordinate,
1220 mj_lno_t coordinate_begin,
1221 mj_lno_t coordinate_end,
1222 mj_scalar_t *used_local_cut_line_weight_to_left,
1223 mj_lno_t *out_part_xadj,
1256 void multi_jagged_part(
1257 const RCP<const Environment> &env,
1258 RCP<Comm<int> > &problemComm,
1260 double imbalance_tolerance,
1261 size_t num_global_parts,
1262 mj_part_t *part_no_array,
1263 int recursion_depth,
1266 mj_lno_t num_local_coords,
1267 mj_gno_t num_global_coords,
1268 const mj_gno_t *initial_mj_gnos,
1269 mj_scalar_t **mj_coordinates,
1271 int num_weights_per_coord,
1272 bool *mj_uniform_weights,
1273 mj_scalar_t **mj_weights,
1274 bool *mj_uniform_parts,
1275 mj_scalar_t **mj_part_sizes,
1277 mj_part_t *&result_assigned_part_ids,
1278 mj_gno_t *&result_mj_gnos
1288 void set_partitioning_parameters(
1289 bool distribute_points_on_cut_lines_,
1290 int max_concurrent_part_calculation_,
1291 int check_migrate_avoid_migration_option_,
1292 mj_scalar_t minimum_migration_imbalance_);
1296 void set_to_keep_part_boxes();
1300 RCP<mj_partBox_t> get_global_box()
const;
1302 RCP<mj_partBoxVector_t> get_kept_boxes()
const;
1304 RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1305 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1331 void sequential_task_partitioning(
1332 const RCP<const Environment> &env,
1333 mj_lno_t num_total_coords,
1334 mj_lno_t num_selected_coords,
1335 size_t num_target_part,
1337 mj_scalar_t **mj_coordinates,
1338 mj_lno_t *initial_selected_coords_output_permutation,
1339 mj_lno_t *output_xadj,
1340 int recursion_depth,
1341 const mj_part_t *part_no_array);
1369 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1372 const RCP<const Environment> &env,
1373 mj_lno_t num_total_coords,
1374 mj_lno_t num_selected_coords,
1375 size_t num_target_part,
1377 mj_scalar_t **mj_coordinates_,
1378 mj_lno_t *inital_adjList_output_adjlist,
1379 mj_lno_t *output_xadj,
1381 const mj_part_t *part_no_array_
1385 const RCP<Comm<int> > commN;
1386 this->comm = this->mj_problemComm = Teuchos::rcp_const_cast<Comm<int> >
1387 (Teuchos::DefaultComm<int>::getDefaultSerialComm(commN));
1388 this->myActualRank = this->myRank = 1;
1390 #ifdef HAVE_ZOLTAN2_OMP 1391 int actual_num_threads = omp_get_num_threads();
1392 omp_set_num_threads(1);
1400 this->imbalance_tolerance = 0;
1401 this->num_global_parts = num_target_part;
1402 this->part_no_array = (mj_part_t *)part_no_array_;
1403 this->recursion_depth = rd;
1405 this->coord_dim = coord_dim_;
1406 this->num_local_coords = num_total_coords;
1407 this->num_global_coords = num_total_coords;
1408 this->mj_coordinates = mj_coordinates_;
1412 this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1414 this->num_weights_per_coord = 0;
1415 bool *tmp_mj_uniform_weights =
new bool[1];
1416 this->mj_uniform_weights = tmp_mj_uniform_weights ;
1417 this->mj_uniform_weights[0] =
true;
1419 mj_scalar_t **tmp_mj_weights =
new mj_scalar_t *[1];
1420 this->mj_weights = tmp_mj_weights;
1422 bool *tmp_mj_uniform_parts =
new bool[1];
1423 this->mj_uniform_parts = tmp_mj_uniform_parts;
1424 this->mj_uniform_parts[0] =
true;
1426 mj_scalar_t **tmp_mj_part_sizes =
new mj_scalar_t * [1];
1427 this->mj_part_sizes = tmp_mj_part_sizes;
1428 this->mj_part_sizes[0] = NULL;
1430 this->num_threads = 1;
1431 this->set_part_specifications();
1433 this->allocate_set_work_memory();
1435 this->part_xadj[0] =
static_cast<mj_lno_t
>(num_selected_coords);
1436 for(
size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1437 this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1440 mj_part_t current_num_parts = 1;
1442 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1444 mj_part_t future_num_parts = this->total_num_part;
1446 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
1447 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
1448 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1449 RCP<mj_partBoxVector_t> t1;
1450 RCP<mj_partBoxVector_t> t2;
1452 for (
int i = 0; i < this->recursion_depth; ++i){
1456 std::vector <mj_part_t> num_partitioning_in_current_dim;
1470 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1471 future_num_part_in_parts = next_future_num_parts_in_parts;
1472 next_future_num_parts_in_parts = tmpPartVect;
1477 next_future_num_parts_in_parts->clear();
1481 mj_part_t output_part_count_in_dimension =
1482 this->update_part_num_arrays(
1483 num_partitioning_in_current_dim,
1484 future_num_part_in_parts,
1485 next_future_num_parts_in_parts,
1495 if(output_part_count_in_dimension == current_num_parts) {
1496 tmpPartVect= future_num_part_in_parts;
1497 future_num_part_in_parts = next_future_num_parts_in_parts;
1498 next_future_num_parts_in_parts = tmpPartVect;
1503 int coordInd = i % this->coord_dim;
1504 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1506 std::string istring = toString<int>(i);
1510 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1513 mj_part_t output_part_index = 0;
1516 mj_part_t output_coordinate_end_index = 0;
1518 mj_part_t current_work_part = 0;
1519 mj_part_t current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1520 this->max_concurrent_part_calculation);
1522 mj_part_t obtained_part_index = 0;
1525 for (; current_work_part < current_num_parts;
1526 current_work_part += current_concurrent_num_parts){
1528 current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1529 this->max_concurrent_part_calculation);
1531 mj_part_t actual_work_part_count = 0;
1535 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1536 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1540 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1543 ++actual_work_part_count;
1544 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1545 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
1553 this->mj_get_local_min_max_coord_totW(
1554 coordinate_begin_index,
1555 coordinate_end_index,
1556 this->coordinate_permutations,
1557 mj_current_dim_coords,
1558 this->process_local_min_max_coord_total_weight[kk],
1559 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
1560 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]
1565 if (actual_work_part_count > 0){
1567 this->mj_get_global_min_max_coord_totW(
1568 current_concurrent_num_parts,
1569 this->process_local_min_max_coord_total_weight,
1570 this->global_min_max_coord_total_weight);
1574 mj_part_t total_incomplete_cut_count = 0;
1579 mj_part_t concurrent_part_cut_shift = 0;
1580 mj_part_t concurrent_part_part_shift = 0;
1581 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1582 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1583 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1584 current_concurrent_num_parts];
1585 mj_scalar_t global_total_weight =
1586 this->global_min_max_coord_total_weight[kk +
1587 2 * current_concurrent_num_parts];
1589 mj_part_t concurrent_current_part_index = current_work_part + kk;
1591 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1593 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1594 mj_scalar_t *current_target_part_weights = this->target_part_weights +
1595 concurrent_part_part_shift;
1597 concurrent_part_cut_shift += partition_count - 1;
1599 concurrent_part_part_shift += partition_count;
1603 if(partition_count > 1 && min_coordinate <= max_coordinate){
1607 total_incomplete_cut_count += partition_count - 1;
1610 this->my_incomplete_cut_count[kk] = partition_count - 1;
1613 this->mj_get_initial_cut_coords_target_weights(
1616 partition_count - 1,
1617 global_total_weight,
1619 current_target_part_weights,
1620 future_num_part_in_parts,
1621 next_future_num_parts_in_parts,
1622 concurrent_current_part_index,
1623 obtained_part_index);
1625 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
1626 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
1629 this->set_initial_coordinate_parts(
1632 concurrent_current_part_index,
1633 coordinate_begin_index, coordinate_end_index,
1634 this->coordinate_permutations,
1635 mj_current_dim_coords,
1636 this->assigned_part_ids,
1641 this->my_incomplete_cut_count[kk] = 0;
1643 obtained_part_index += partition_count;
1647 mj_scalar_t used_imbalance = 0;
1651 mj_current_dim_coords,
1654 current_concurrent_num_parts,
1655 current_cut_coordinates,
1656 total_incomplete_cut_count,
1657 num_partitioning_in_current_dim);
1663 mj_part_t output_array_shift = 0;
1664 mj_part_t cut_shift = 0;
1665 size_t tlr_shift = 0;
1666 size_t partweight_array_shift = 0;
1668 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1669 mj_part_t current_concurrent_work_part = current_work_part + kk;
1670 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1673 if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1674 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1676 for(mj_part_t jj = 0; jj < num_parts; ++jj){
1677 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1679 cut_shift += num_parts - 1;
1680 tlr_shift += (4 *(num_parts - 1) + 1);
1681 output_array_shift += num_parts;
1682 partweight_array_shift += (2 * (num_parts - 1) + 1);
1686 mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1687 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1689 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1690 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1693 for(
int ii = 0; ii < this->num_threads; ++ii){
1694 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1699 this->create_consistent_chunks(
1701 mj_current_dim_coords,
1702 current_concurrent_cut_coordinate,
1705 used_local_cut_line_weight_to_left,
1706 this->new_part_xadj + output_part_index + output_array_shift,
1712 mj_lno_t part_size = coordinate_end - coordinate_begin;
1713 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1714 memcpy(this->new_coordinate_permutations + coordinate_begin,
1715 this->coordinate_permutations + coordinate_begin,
1716 part_size *
sizeof(mj_lno_t));
1718 cut_shift += num_parts - 1;
1719 tlr_shift += (4 *(num_parts - 1) + 1);
1720 output_array_shift += num_parts;
1721 partweight_array_shift += (2 * (num_parts - 1) + 1);
1730 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1731 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1732 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1734 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1737 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1739 output_part_index += num_parts ;
1746 current_num_parts = output_part_count_in_dimension;
1749 mj_lno_t * tmp = this->coordinate_permutations;
1750 this->coordinate_permutations = this->new_coordinate_permutations;
1751 this->new_coordinate_permutations = tmp;
1753 freeArray<mj_lno_t>(this->part_xadj);
1754 this->part_xadj = this->new_part_xadj;
1757 for(mj_lno_t i = 0; i < num_total_coords; ++i){
1758 inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1763 for(
size_t i = 0; i < this->num_global_parts ; ++i){
1764 output_xadj[i+1] = this->part_xadj[i];
1767 delete future_num_part_in_parts;
1768 delete next_future_num_parts_in_parts;
1771 freeArray<mj_part_t>(this->assigned_part_ids);
1772 freeArray<mj_gno_t>(this->initial_mj_gnos);
1773 freeArray<mj_gno_t>(this->current_mj_gnos);
1774 freeArray<bool>(tmp_mj_uniform_weights);
1775 freeArray<bool>(tmp_mj_uniform_parts);
1776 freeArray<mj_scalar_t *>(tmp_mj_weights);
1777 freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1779 this->free_work_memory();
1781 #ifdef HAVE_ZOLTAN2_OMP 1782 omp_set_num_threads(actual_num_threads);
1789 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1792 mj_env(), mj_problemComm(), imbalance_tolerance(0),
1793 part_no_array(NULL), recursion_depth(0), coord_dim(0),
1794 num_weights_per_coord(0), initial_num_loc_coords(0),
1795 initial_num_glob_coords(0),
1796 num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
1797 mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
1798 mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
1799 initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
1800 coordinate_permutations(NULL), new_coordinate_permutations(NULL),
1801 assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
1802 distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
1803 mj_run_as_rcb(0), mj_user_recursion_depth(0), mj_keep_part_boxes(0),
1804 check_migrate_avoid_migration_option(0), minimum_migration_imbalance(0.30),
1805 num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1806 max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
1807 last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
1808 all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
1809 thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
1810 target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
1811 cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
1812 process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
1813 is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
1814 thread_part_weights(NULL), thread_part_weight_work(NULL),
1815 thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
1816 thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
1817 global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
1818 global_total_part_weight_left_right_closests(NULL),
1819 kept_boxes(),global_box(),
1820 myRank(0), myActualRank(0)
1825 this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
1826 this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
1834 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1836 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
1839 return this->global_box;
1845 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1848 this->mj_keep_part_boxes = 1;
1859 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1863 this->total_num_cut = 0;
1864 this->total_num_part = 1;
1865 this->max_num_part_along_dim = 0;
1866 this->total_dim_num_reduce_all = 0;
1867 this->last_dim_num_part = 1;
1870 this->max_num_cut_along_dim = 0;
1871 this->max_num_total_part_along_dim = 0;
1873 if (this->part_no_array){
1875 for (
int i = 0; i < this->recursion_depth; ++i){
1876 this->total_dim_num_reduce_all += this->total_num_part;
1877 this->total_num_part *= this->part_no_array[i];
1878 if(this->part_no_array[i] > this->max_num_part_along_dim) {
1879 this->max_num_part_along_dim = this->part_no_array[i];
1882 this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
1883 this->num_global_parts = this->total_num_part;
1885 mj_part_t future_num_parts = this->num_global_parts;
1888 for (
int i = 0; i < this->recursion_depth; ++i){
1890 mj_part_t maxNoPartAlongI = this->get_part_count(
1891 future_num_parts, 1.0f / (this->recursion_depth - i));
1893 if (maxNoPartAlongI > this->max_num_part_along_dim){
1894 this->max_num_part_along_dim = maxNoPartAlongI;
1897 mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
1898 if (future_num_parts % maxNoPartAlongI){
1901 future_num_parts = nfutureNumParts;
1903 this->total_num_part = this->num_global_parts;
1907 for (
int i = 0; i < this->recursion_depth; ++i){
1908 this->total_dim_num_reduce_all += p;
1909 p *= this->max_num_part_along_dim;
1912 this->last_dim_num_part = p / this->max_num_part_along_dim;
1915 this->total_num_cut = this->total_num_part - 1;
1916 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
1917 this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
1921 if(this->max_concurrent_part_calculation > this->last_dim_num_part){
1922 if(this->mj_problemComm->getRank() == 0){
1923 std::cerr <<
"Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
1924 ") has been set bigger than maximum amount that can be used." <<
1925 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
1927 this->max_concurrent_part_calculation = this->last_dim_num_part;
1936 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1939 mj_part_t num_total_future,
1942 double fp = pow(num_total_future, root);
1943 mj_part_t ip = mj_part_t (fp);
1944 if (fp - ip < this->fEpsilon * 100){
1966 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1969 std::vector <mj_part_t> &num_partitioning_in_current_dim,
1970 std::vector<mj_part_t> *future_num_part_in_parts,
1971 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1972 mj_part_t &future_num_parts,
1973 mj_part_t current_num_parts,
1974 int current_iteration,
1975 RCP<mj_partBoxVector_t> input_part_boxes,
1976 RCP<mj_partBoxVector_t> output_part_boxes
1979 mj_part_t output_num_parts = 0;
1980 if(this->part_no_array){
1985 mj_part_t p = this->part_no_array[current_iteration];
1987 std::cout <<
"i:" << current_iteration <<
" p is given as:" << p << std::endl;
1991 return current_num_parts;
1994 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
1995 num_partitioning_in_current_dim.push_back(p);
2009 future_num_parts /= num_partitioning_in_current_dim[0];
2010 output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2012 if (this->mj_keep_part_boxes){
2013 for (mj_part_t k = 0; k < current_num_parts; ++k){
2015 for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2016 output_part_boxes->push_back((*input_part_boxes)[k]);
2024 for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2025 next_future_num_parts_in_parts->push_back(future_num_parts);
2035 future_num_parts = 1;
2039 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2041 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2045 mj_part_t num_partitions_in_current_dim =
2046 this->get_part_count(
2047 future_num_parts_of_part_ii,
2048 1.0 / (this->recursion_depth - current_iteration)
2051 if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2052 std::cerr <<
"ERROR: maxPartNo calculation is wrong." << std::endl;
2056 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2060 output_num_parts += num_partitions_in_current_dim;
2063 mj_part_t ideal_num_future_parts_in_part = future_num_parts_of_part_ii / num_partitions_in_current_dim;
2064 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2065 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2068 if (iii < future_num_parts_of_part_ii % num_partitions_in_current_dim){
2070 ++num_future_parts_for_part_iii;
2072 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii);
2075 if (this->mj_keep_part_boxes){
2076 output_part_boxes->push_back((*input_part_boxes)[ii]);
2080 if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2084 return output_num_parts;
2091 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2096 this->owner_of_coordinate = NULL;
2101 this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2103 #ifdef HAVE_ZOLTAN2_OMP 2104 #pragma omp parallel for 2106 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2107 this->coordinate_permutations[i] = i;
2111 this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2113 this->assigned_part_ids = NULL;
2114 if(this->num_local_coords > 0){
2115 this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2121 this->part_xadj = allocMemory<mj_lno_t>(1);
2122 this->part_xadj[0] =
static_cast<mj_lno_t
>(this->num_local_coords);
2124 this->new_part_xadj = NULL;
2130 this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2132 this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2134 this->process_cut_line_weight_to_put_left = NULL;
2135 this->thread_cut_line_weight_to_put_left = NULL;
2137 if(this->distribute_points_on_cut_lines){
2138 this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2139 this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2140 for(
int i = 0; i < this->num_threads; ++i){
2141 this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2143 this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2144 this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2152 this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2153 this->max_concurrent_part_calculation);
2157 this->target_part_weights = allocMemory<mj_scalar_t>(
2158 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2161 this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2162 this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2163 this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2164 this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2166 this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2167 this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2171 this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2174 this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2176 this->thread_part_weights = allocMemory<double *>(this->num_threads);
2178 this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2181 this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2183 this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2186 this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2188 for(
int i = 0; i < this->num_threads; ++i){
2190 this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2191 this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2192 this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2193 this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2199 this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2200 this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2203 mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2204 for (
int i=0; i < this->coord_dim; i++){
2205 coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2206 #ifdef HAVE_ZOLTAN2_OMP 2207 #pragma omp parallel for 2209 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2210 coord[i][j] = this->mj_coordinates[i][j];
2212 this->mj_coordinates = coord;
2215 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2216 mj_scalar_t **
weights = allocMemory<mj_scalar_t *>(criteria_dim);
2218 for (
int i=0; i < criteria_dim; i++){
2221 for (
int i=0; i < this->num_weights_per_coord; i++){
2222 weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2223 #ifdef HAVE_ZOLTAN2_OMP 2224 #pragma omp parallel for 2226 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2227 weights[i][j] = this->mj_weights[i][j];
2231 this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2232 #ifdef HAVE_ZOLTAN2_OMP 2233 #pragma omp parallel for 2235 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2236 this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2238 this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2240 #ifdef HAVE_ZOLTAN2_OMP 2241 #pragma omp parallel for 2243 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2244 this->owner_of_coordinate[j] = this->myActualRank;
2249 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2254 mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2256 mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2258 mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2260 mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2262 for (
int i = 0; i < this->coord_dim; ++i){
2263 mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2264 mj_scalar_t localMax = -localMin;
2265 if (localMax > 0) localMax = 0;
2268 for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2269 if (this->mj_coordinates[i][j] < localMin){
2270 localMin = this->mj_coordinates[i][j];
2272 if (this->mj_coordinates[i][j] > localMax){
2273 localMax = this->mj_coordinates[i][j];
2282 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2283 this->coord_dim, mins, gmins
2287 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2288 this->coord_dim, maxs, gmaxs
2294 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2296 freeArray<mj_scalar_t>(mins);
2297 freeArray<mj_scalar_t>(gmins);
2298 freeArray<mj_scalar_t>(maxs);
2299 freeArray<mj_scalar_t>(gmaxs);
2307 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2310 RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2314 initial_partitioning_boxes->push_back(tmp_box);
2327 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2330 mj_lno_t coordinate_begin_index,
2331 mj_lno_t coordinate_end_index,
2332 mj_lno_t *mj_current_coordinate_permutations,
2333 mj_scalar_t *mj_current_dim_coords,
2334 mj_scalar_t &min_coordinate,
2335 mj_scalar_t &max_coordinate,
2336 mj_scalar_t &total_weight){
2340 if(coordinate_begin_index >= coordinate_end_index)
2342 min_coordinate = this->maxScalar_t;
2343 max_coordinate = this->minScalar_t;
2347 mj_scalar_t my_total_weight = 0;
2348 #ifdef HAVE_ZOLTAN2_OMP 2349 #pragma omp parallel 2353 if (this->mj_uniform_weights[0]) {
2354 #ifdef HAVE_ZOLTAN2_OMP 2358 my_total_weight = coordinate_end_index - coordinate_begin_index;
2364 #ifdef HAVE_ZOLTAN2_OMP 2365 #pragma omp for reduction(+:my_total_weight) 2367 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2368 int i = mj_current_coordinate_permutations[ii];
2369 my_total_weight += this->mj_weights[0][i];
2373 int my_thread_id = 0;
2374 #ifdef HAVE_ZOLTAN2_OMP 2375 my_thread_id = omp_get_thread_num();
2377 mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2378 my_thread_min_coord=my_thread_max_coord
2379 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2382 #ifdef HAVE_ZOLTAN2_OMP 2385 for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2386 int i = mj_current_coordinate_permutations[j];
2387 if(mj_current_dim_coords[i] > my_thread_max_coord)
2388 my_thread_max_coord = mj_current_dim_coords[i];
2389 if(mj_current_dim_coords[i] < my_thread_min_coord)
2390 my_thread_min_coord = mj_current_dim_coords[i];
2392 this->max_min_coords[my_thread_id] = my_thread_min_coord;
2393 this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2395 #ifdef HAVE_ZOLTAN2_OMP 2398 #pragma omp single nowait 2401 min_coordinate = this->max_min_coords[0];
2402 for(
int i = 1; i < this->num_threads; ++i){
2403 if(this->max_min_coords[i] < min_coordinate)
2404 min_coordinate = this->max_min_coords[i];
2408 #ifdef HAVE_ZOLTAN2_OMP 2409 #pragma omp single nowait 2412 max_coordinate = this->max_min_coords[this->num_threads];
2413 for(
int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2414 if(this->max_min_coords[i] > max_coordinate)
2415 max_coordinate = this->max_min_coords[i];
2419 total_weight = my_total_weight;
2431 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2434 mj_part_t current_concurrent_num_parts,
2435 mj_scalar_t *local_min_max_total,
2436 mj_scalar_t *global_min_max_total){
2441 if(this->comm->getSize() > 1){
2444 current_concurrent_num_parts,
2445 current_concurrent_num_parts,
2446 current_concurrent_num_parts);
2448 reduceAll<int, mj_scalar_t>(
2451 3 * current_concurrent_num_parts,
2452 local_min_max_total,
2453 global_min_max_total);
2458 mj_part_t s = 3 * current_concurrent_num_parts;
2459 for (mj_part_t i = 0; i < s; ++i){
2460 global_min_max_total[i] = local_min_max_total[i];
2485 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2488 mj_scalar_t min_coord,
2489 mj_scalar_t max_coord,
2490 mj_part_t num_cuts ,
2491 mj_scalar_t global_weight,
2492 mj_scalar_t *initial_cut_coords ,
2493 mj_scalar_t *current_target_part_weights ,
2495 std::vector <mj_part_t> *future_num_part_in_parts,
2496 std::vector <mj_part_t> *next_future_num_parts_in_parts,
2497 mj_part_t concurrent_current_part,
2498 mj_part_t obtained_part_index
2501 mj_scalar_t coord_range = max_coord - min_coord;
2502 if(this->mj_uniform_parts[0]){
2504 mj_part_t cumulative = 0;
2506 mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2510 mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2516 for(mj_part_t i = 0; i < num_cuts; ++i){
2517 cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2525 current_target_part_weights[i] = cumulative * unit_part_weight;
2528 initial_cut_coords[i] = min_coord + (coord_range *
2529 cumulative) / total_future_part_count_in_part;
2531 current_target_part_weights[num_cuts] = 1;
2535 if (this->mj_uniform_weights[0]){
2536 for(mj_part_t i = 0; i < num_cuts + 1; ++i){
2537 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2542 std::cerr <<
"MJ does not support non uniform part weights" << std::endl;
2560 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2563 mj_scalar_t &max_coordinate,
2564 mj_scalar_t &min_coordinate,
2565 mj_part_t &concurrent_current_part_index,
2566 mj_lno_t coordinate_begin_index,
2567 mj_lno_t coordinate_end_index,
2568 mj_lno_t *mj_current_coordinate_permutations,
2569 mj_scalar_t *mj_current_dim_coords,
2570 mj_part_t *mj_part_ids,
2571 mj_part_t &partition_count
2573 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2577 if(
ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2578 #ifdef HAVE_ZOLTAN2_OMP 2579 #pragma omp parallel for 2581 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2582 mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2589 mj_scalar_t slice = coordinate_range / partition_count;
2591 #ifdef HAVE_ZOLTAN2_OMP 2592 #pragma omp parallel for 2594 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2596 mj_lno_t iii = mj_current_coordinate_permutations[ii];
2597 mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2598 mj_part_ids[iii] = 2 * pp;
2614 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2617 mj_scalar_t *mj_current_dim_coords,
2618 mj_scalar_t used_imbalance_tolerance,
2619 mj_part_t current_work_part,
2620 mj_part_t current_concurrent_num_parts,
2621 mj_scalar_t *current_cut_coordinates,
2622 mj_part_t total_incomplete_cut_count,
2623 std::vector <mj_part_t> &num_partitioning_in_current_dim
2627 mj_part_t rectilinear_cut_count = 0;
2628 mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2631 *reductionOp = NULL;
2633 <mj_part_t, mj_scalar_t>(
2634 &num_partitioning_in_current_dim ,
2636 current_concurrent_num_parts);
2638 size_t total_reduction_size = 0;
2639 #ifdef HAVE_ZOLTAN2_OMP 2640 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) 2644 #ifdef HAVE_ZOLTAN2_OMP 2645 me = omp_get_thread_num();
2647 double *my_thread_part_weights = this->thread_part_weights[me];
2648 mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2649 mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2651 #ifdef HAVE_ZOLTAN2_OMP 2657 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2659 mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2660 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2661 total_reduction_size += (4 * num_cut_in_dim + 1);
2663 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2664 this->is_cut_line_determined[next] =
false;
2665 this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i];
2666 this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts];
2668 this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts];
2669 this->cut_lower_bound_weights[next] = 0;
2671 if(this->distribute_points_on_cut_lines){
2672 this->process_cut_line_weight_to_put_left[next] = 0;
2683 while (total_incomplete_cut_count != 0){
2686 mj_part_t concurrent_cut_shifts = 0;
2687 size_t total_part_shift = 0;
2689 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2690 mj_part_t num_parts = -1;
2691 num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2693 mj_part_t num_cuts = num_parts - 1;
2694 size_t total_part_count = num_parts + size_t (num_cuts) ;
2695 if (this->my_incomplete_cut_count[kk] > 0){
2698 bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2699 double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2700 mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2701 mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2703 mj_part_t conccurent_current_part = current_work_part + kk;
2704 mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
2705 mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
2706 mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2708 mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2709 mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2712 this->mj_1D_part_get_thread_part_weights(
2717 coordinate_begin_index,
2718 coordinate_end_index,
2719 mj_current_dim_coords,
2720 temp_current_cut_coords,
2722 my_current_part_weights,
2723 my_current_left_closest,
2724 my_current_right_closest);
2728 concurrent_cut_shifts += num_cuts;
2729 total_part_shift += total_part_count;
2733 this->mj_accumulate_thread_results(
2734 num_partitioning_in_current_dim,
2736 current_concurrent_num_parts);
2739 #ifdef HAVE_ZOLTAN2_OMP 2743 if(this->comm->getSize() > 1){
2744 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2745 total_reduction_size,
2746 this->total_part_weight_left_right_closests,
2747 this->global_total_part_weight_left_right_closests);
2752 this->global_total_part_weight_left_right_closests,
2753 this->total_part_weight_left_right_closests,
2754 total_reduction_size *
sizeof(mj_scalar_t));
2759 mj_part_t cut_shift = 0;
2762 size_t tlr_shift = 0;
2763 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2764 mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2765 mj_part_t num_cuts = num_parts - 1;
2766 size_t num_total_part = num_parts + size_t (num_cuts) ;
2771 if (this->my_incomplete_cut_count[kk] == 0) {
2772 cut_shift += num_cuts;
2773 tlr_shift += (num_total_part + 2 * num_cuts);
2777 mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2778 mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2779 mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part;
2780 mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts;
2781 mj_scalar_t *current_global_part_weights = current_global_tlr;
2782 bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2784 mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2785 mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2787 mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2788 mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2789 mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2790 mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2791 mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2792 mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2793 mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2795 mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2798 this->mj_get_new_cut_coordinates(
2803 global_total_weight,
2804 used_imbalance_tolerance,
2805 current_global_part_weights,
2806 current_local_part_weights,
2807 current_part_target_weights,
2808 current_cut_line_determined,
2809 temp_cut_coords + cut_shift,
2810 current_cut_upper_bounds,
2811 current_cut_lower_bounds,
2812 current_global_left_closest_points,
2813 current_global_right_closest_points,
2814 current_cut_lower_bound_weights,
2815 current_cut_upper_weights,
2816 this->cut_coordinates_work_array +cut_shift,
2817 current_part_cut_line_weight_to_put_left,
2818 &rectilinear_cut_count,
2819 this->my_incomplete_cut_count[kk]);
2821 cut_shift += num_cuts;
2822 tlr_shift += (num_total_part + 2 * num_cuts);
2823 mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
2824 #ifdef HAVE_ZOLTAN2_OMP 2828 total_incomplete_cut_count -= iteration_complete_cut_count;
2832 #ifdef HAVE_ZOLTAN2_OMP 2838 mj_scalar_t *t = temp_cut_coords;
2839 temp_cut_coords = this->cut_coordinates_work_array;
2840 this->cut_coordinates_work_array = t;
2848 if (current_cut_coordinates != temp_cut_coords){
2849 #ifdef HAVE_ZOLTAN2_OMP 2854 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2855 mj_part_t num_parts = -1;
2856 num_parts = num_partitioning_in_current_dim[current_work_part + i];
2857 mj_part_t num_cuts = num_parts - 1;
2859 for(mj_part_t ii = 0; ii < num_cuts; ++ii){
2860 current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
2866 #ifdef HAVE_ZOLTAN2_OMP 2870 this->cut_coordinates_work_array = temp_cut_coords;
2897 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2900 size_t total_part_count,
2902 mj_scalar_t max_coord,
2903 mj_scalar_t min_coord,
2904 mj_lno_t coordinate_begin_index,
2905 mj_lno_t coordinate_end_index,
2906 mj_scalar_t *mj_current_dim_coords,
2907 mj_scalar_t *temp_current_cut_coords,
2908 bool *current_cut_status,
2909 double *my_current_part_weights,
2910 mj_scalar_t *my_current_left_closest,
2911 mj_scalar_t *my_current_right_closest){
2914 for (
size_t i = 0; i < total_part_count; ++i){
2915 my_current_part_weights[i] = 0;
2920 for(mj_part_t i = 0; i < num_cuts; ++i){
2921 my_current_left_closest[i] = min_coord - 1;
2922 my_current_right_closest[i] = max_coord + 1;
2925 mj_scalar_t minus_EPSILON = -this->sEpsilon;
2926 #ifdef HAVE_ZOLTAN2_OMP 2932 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2933 int i = this->coordinate_permutations[ii];
2937 mj_part_t j = this->assigned_part_ids[i] / 2;
2943 mj_part_t lower_cut_index = 0;
2944 mj_part_t upper_cut_index = num_cuts - 1;
2946 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
2947 bool is_inserted =
false;
2948 bool is_on_left_of_cut =
false;
2949 bool is_on_right_of_cut =
false;
2950 mj_part_t last_compared_part = -1;
2952 mj_scalar_t coord = mj_current_dim_coords[i];
2954 while(upper_cut_index >= lower_cut_index)
2957 last_compared_part = -1;
2958 is_on_left_of_cut =
false;
2959 is_on_right_of_cut =
false;
2960 mj_scalar_t cut = temp_current_cut_coords[j];
2961 mj_scalar_t distance_to_cut = coord - cut;
2962 mj_scalar_t abs_distance_to_cut =
ZOLTAN2_ABS(distance_to_cut);
2965 if(abs_distance_to_cut < this->sEpsilon){
2967 my_current_part_weights[j * 2 + 1] += w;
2968 this->assigned_part_ids[i] = j * 2 + 1;
2971 my_current_left_closest[j] = coord;
2972 my_current_right_closest[j] = coord;
2975 mj_part_t kk = j + 1;
2976 while(kk < num_cuts){
2978 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
2979 if(distance_to_cut < this->sEpsilon){
2980 my_current_part_weights[2 * kk + 1] += w;
2981 my_current_left_closest[kk] = coord;
2982 my_current_right_closest[kk] = coord;
2988 if(coord - my_current_left_closest[kk] > this->sEpsilon){
2989 my_current_left_closest[kk] = coord;
2999 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3000 if(distance_to_cut < this->sEpsilon){
3001 my_current_part_weights[2 * kk + 1] += w;
3003 this->assigned_part_ids[i] = kk * 2 + 1;
3004 my_current_left_closest[kk] = coord;
3005 my_current_right_closest[kk] = coord;
3011 if(my_current_right_closest[kk] - coord > this->sEpsilon){
3012 my_current_right_closest[kk] = coord;
3023 if (distance_to_cut < 0) {
3024 bool _break =
false;
3028 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3029 if(distance_to_next_cut > this->sEpsilon){
3035 upper_cut_index = j - 1;
3037 is_on_left_of_cut =
true;
3038 last_compared_part = j;
3043 bool _break =
false;
3044 if(j < num_cuts - 1){
3047 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3048 if(distance_to_next_cut < minus_EPSILON){
3055 lower_cut_index = j + 1;
3057 is_on_right_of_cut =
true;
3058 last_compared_part = j;
3063 j = (upper_cut_index + lower_cut_index) / 2;
3066 if(is_on_right_of_cut){
3069 my_current_part_weights[2 * last_compared_part + 2] += w;
3070 this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3073 if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3074 my_current_right_closest[last_compared_part] = coord;
3077 if(last_compared_part+1 < num_cuts){
3079 if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3080 my_current_left_closest[last_compared_part + 1] = coord;
3085 else if(is_on_left_of_cut){
3088 my_current_part_weights[2 * last_compared_part] += w;
3089 this->assigned_part_ids[i] = 2 * last_compared_part;
3093 if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3094 my_current_left_closest[last_compared_part] = coord;
3098 if(last_compared_part-1 >= 0){
3099 if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3100 my_current_right_closest[last_compared_part -1] = coord;
3109 for (
size_t i = 1; i < total_part_count; ++i){
3113 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3114 ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3119 my_current_part_weights[i] = my_current_part_weights[i-2];
3123 my_current_part_weights[i] += my_current_part_weights[i-1];
3135 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3138 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3139 mj_part_t current_work_part,
3140 mj_part_t current_concurrent_num_parts){
3142 #ifdef HAVE_ZOLTAN2_OMP 3149 size_t tlr_array_shift = 0;
3150 mj_part_t cut_shift = 0;
3153 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3155 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3156 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3157 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3160 for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3161 mj_part_t next = tlr_array_shift + ii;
3162 mj_part_t cut_index = cut_shift + ii;
3163 if(this->is_cut_line_determined[cut_index])
continue;
3164 mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3165 right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3168 for (
int j = 1; j < this->num_threads; ++j){
3169 if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3170 right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3172 if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3173 left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3177 this->total_part_weight_left_right_closests[num_total_part_in_part +
3178 next] = left_closest_in_process;
3179 this->total_part_weight_left_right_closests[num_total_part_in_part +
3180 num_cuts_in_part + next] = right_closest_in_process;
3183 tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3184 cut_shift += num_cuts_in_part;
3187 tlr_array_shift = 0;
3189 size_t total_part_array_shift = 0;
3192 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3194 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3195 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3196 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3198 for(
size_t j = 0; j < num_total_part_in_part; ++j){
3200 mj_part_t cut_ind = j / 2 + cut_shift;
3205 if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind])
continue;
3207 for (
int k = 0; k < this->num_threads; ++k){
3208 pwj += this->thread_part_weights[k][total_part_array_shift + j];
3211 this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3213 cut_shift += num_cuts_in_part;
3214 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3215 total_part_array_shift += num_total_part_in_part;
3233 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3236 mj_scalar_t cut_upper_bound,
3237 mj_scalar_t cut_lower_bound,
3238 mj_scalar_t cut_upper_weight,
3239 mj_scalar_t cut_lower_weight,
3240 mj_scalar_t expected_weight,
3241 mj_scalar_t &new_cut_position){
3243 if(
ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3244 new_cut_position = cut_upper_bound;
3248 if(
ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3249 new_cut_position = cut_lower_bound;
3252 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3253 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3254 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3256 mj_scalar_t required_shift = (my_weight_diff / weight_range);
3257 int scale_constant = 20;
3258 int shiftint= int (required_shift * scale_constant);
3259 if (shiftint == 0) shiftint = 1;
3260 required_shift = mj_scalar_t (shiftint) / scale_constant;
3261 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3275 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3278 mj_part_t num_parts,
3279 mj_scalar_t *mj_current_dim_coords,
3280 mj_scalar_t *current_concurrent_cut_coordinate,
3281 mj_lno_t coordinate_begin,
3282 mj_lno_t coordinate_end,
3283 mj_scalar_t *used_local_cut_line_weight_to_left,
3284 double **used_thread_part_weight_work,
3285 mj_lno_t *out_part_xadj){
3287 mj_part_t num_cuts = num_parts - 1;
3289 #ifdef HAVE_ZOLTAN2_OMP 3290 #pragma omp parallel 3294 #ifdef HAVE_ZOLTAN2_OMP 3295 me = omp_get_thread_num();
3298 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3299 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3303 if (this->distribute_points_on_cut_lines){
3304 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3306 #ifdef HAVE_ZOLTAN2_OMP 3309 for (mj_part_t i = 0; i < num_cuts; ++i){
3311 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3312 for(
int ii = 0; ii < this->num_threads; ++ii){
3313 if(left_weight > this->sEpsilon){
3315 mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3316 if(thread_ii_weight_on_cut < left_weight){
3318 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3322 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3324 left_weight -= thread_ii_weight_on_cut;
3327 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3335 for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3336 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3337 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3340 / mj_scalar_t(SIGNIFICANCE_MUL);
3345 for(mj_part_t ii = 0; ii < num_parts; ++ii){
3346 thread_num_points_in_parts[ii] = 0;
3350 #ifdef HAVE_ZOLTAN2_OMP 3354 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3356 mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3357 mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3358 mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3359 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3360 if(coordinate_assigned_place % 2 == 1){
3362 if(this->distribute_points_on_cut_lines
3363 && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3367 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3372 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3373 && coordinate_assigned_part < num_cuts - 1
3374 &&
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3375 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3376 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3378 ++thread_num_points_in_parts[coordinate_assigned_part];
3379 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3383 ++coordinate_assigned_part;
3385 while(this->distribute_points_on_cut_lines &&
3386 coordinate_assigned_part < num_cuts){
3388 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3389 current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3392 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3394 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3395 ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3396 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3399 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3400 coordinate_assigned_part < num_cuts - 1 &&
3401 ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3402 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3403 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3411 ++coordinate_assigned_part;
3413 ++thread_num_points_in_parts[coordinate_assigned_part];
3414 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3419 ++thread_num_points_in_parts[coordinate_assigned_part];
3420 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3429 #ifdef HAVE_ZOLTAN2_OMP 3432 for(mj_part_t j = 0; j < num_parts; ++j){
3433 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3434 for (
int i = 0; i < this->num_threads; ++i){
3435 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3437 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3438 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3441 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
3445 #ifdef HAVE_ZOLTAN2_OMP 3450 for(mj_part_t j = 1; j < num_parts; ++j){
3451 out_part_xadj[j] += out_part_xadj[j - 1];
3457 for(mj_part_t j = 1; j < num_parts; ++j){
3458 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3464 #ifdef HAVE_ZOLTAN2_OMP 3467 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3468 mj_lno_t i = this->coordinate_permutations[ii];
3469 mj_part_t p = this->assigned_part_ids[i];
3470 this->new_coordinate_permutations[coordinate_begin +
3471 thread_num_points_in_parts[p]++] = i;
3506 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3509 const size_t &num_total_part,
3510 const mj_part_t &num_cuts,
3511 const mj_scalar_t &max_coordinate,
3512 const mj_scalar_t &min_coordinate,
3513 const mj_scalar_t &global_total_weight,
3514 const mj_scalar_t &used_imbalance_tolerance,
3515 mj_scalar_t * current_global_part_weights,
3516 const mj_scalar_t * current_local_part_weights,
3517 const mj_scalar_t *current_part_target_weights,
3518 bool *current_cut_line_determined,
3519 mj_scalar_t *current_cut_coordinates,
3520 mj_scalar_t *current_cut_upper_bounds,
3521 mj_scalar_t *current_cut_lower_bounds,
3522 mj_scalar_t *current_global_left_closest_points,
3523 mj_scalar_t *current_global_right_closest_points,
3524 mj_scalar_t * current_cut_lower_bound_weights,
3525 mj_scalar_t * current_cut_upper_weights,
3526 mj_scalar_t *new_current_cut_coordinates,
3527 mj_scalar_t *current_part_cut_line_weight_to_put_left,
3528 mj_part_t *rectilinear_cut_count,
3529 mj_part_t &my_num_incomplete_cut){
3532 mj_scalar_t seen_weight_in_part = 0;
3534 mj_scalar_t expected_weight_in_part = 0;
3536 mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;
3539 #ifdef HAVE_ZOLTAN2_OMP 3542 for (mj_part_t i = 0; i < num_cuts; i++){
3545 if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3546 current_global_left_closest_points[i] = current_cut_coordinates[i];
3547 if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3548 current_global_right_closest_points[i] = current_cut_coordinates[i];
3551 #ifdef HAVE_ZOLTAN2_OMP 3554 for (mj_part_t i = 0; i < num_cuts; i++){
3556 if(this->distribute_points_on_cut_lines){
3558 this->global_rectilinear_cut_weight[i] = 0;
3559 this->process_rectilinear_cut_weight[i] = 0;
3563 if(current_cut_line_determined[i]) {
3564 new_current_cut_coordinates[i] = current_cut_coordinates[i];
3569 seen_weight_in_part = current_global_part_weights[i * 2];
3578 expected_weight_in_part = current_part_target_weights[i];
3580 imbalance_on_left =
imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3582 imbalance_on_right =
imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3584 bool is_left_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3585 bool is_right_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3588 if(is_left_imbalance_valid && is_right_imbalance_valid){
3589 current_cut_line_determined[i] =
true;
3590 #ifdef HAVE_ZOLTAN2_OMP 3593 my_num_incomplete_cut -= 1;
3594 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3597 else if(imbalance_on_left < 0){
3600 if(this->distribute_points_on_cut_lines){
3605 if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3607 current_cut_line_determined[i] =
true;
3608 #ifdef HAVE_ZOLTAN2_OMP 3611 my_num_incomplete_cut -= 1;
3614 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3618 current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3621 else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3625 current_cut_line_determined[i] =
true;
3626 #ifdef HAVE_ZOLTAN2_OMP 3629 *rectilinear_cut_count += 1;
3632 #ifdef HAVE_ZOLTAN2_OMP 3635 my_num_incomplete_cut -= 1;
3636 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3637 this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3638 current_local_part_weights[i * 2];
3643 current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3645 current_cut_lower_bound_weights[i] = seen_weight_in_part;
3649 for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3650 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3651 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3653 if(p_weight >= expected_weight_in_part){
3658 if(p_weight == expected_weight_in_part){
3659 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3660 current_cut_upper_weights[i] = p_weight;
3661 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3662 current_cut_lower_bound_weights[i] = p_weight;
3663 }
else if (p_weight < current_cut_upper_weights[i]){
3666 current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3667 current_cut_upper_weights[i] = p_weight;
3673 if(line_weight >= expected_weight_in_part){
3676 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3677 current_cut_upper_weights[i] = line_weight;
3678 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3679 current_cut_lower_bound_weights[i] = p_weight;
3684 if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3685 current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3686 current_cut_lower_bound_weights[i] = p_weight;
3691 mj_scalar_t new_cut_position = 0;
3692 this->mj_calculate_new_cut_position(
3693 current_cut_upper_bounds[i],
3694 current_cut_lower_bounds[i],
3695 current_cut_upper_weights[i],
3696 current_cut_lower_bound_weights[i],
3697 expected_weight_in_part, new_cut_position);
3701 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3704 current_cut_line_determined[i] =
true;
3705 #ifdef HAVE_ZOLTAN2_OMP 3708 my_num_incomplete_cut -= 1;
3711 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3713 new_current_cut_coordinates [i] = new_cut_position;
3719 current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3720 current_cut_upper_weights[i] = seen_weight_in_part;
3723 for (
int ii = i - 1; ii >= 0; --ii){
3724 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3725 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3726 if(p_weight <= expected_weight_in_part){
3727 if(p_weight == expected_weight_in_part){
3730 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3731 current_cut_upper_weights[i] = p_weight;
3732 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3733 current_cut_lower_bound_weights[i] = p_weight;
3735 else if (p_weight > current_cut_lower_bound_weights[i]){
3738 current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3739 current_cut_lower_bound_weights[i] = p_weight;
3745 if(line_weight > expected_weight_in_part){
3746 current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3747 current_cut_upper_weights[i] = line_weight;
3756 if (p_weight >= expected_weight_in_part &&
3757 (p_weight < current_cut_upper_weights[i] ||
3758 (p_weight == current_cut_upper_weights[i] &&
3759 current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3763 current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3764 current_cut_upper_weights[i] = p_weight;
3767 mj_scalar_t new_cut_position = 0;
3768 this->mj_calculate_new_cut_position(
3769 current_cut_upper_bounds[i],
3770 current_cut_lower_bounds[i],
3771 current_cut_upper_weights[i],
3772 current_cut_lower_bound_weights[i],
3773 expected_weight_in_part,
3777 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3779 current_cut_line_determined[i] =
true;
3780 #ifdef HAVE_ZOLTAN2_OMP 3783 my_num_incomplete_cut -= 1;
3785 new_current_cut_coordinates [ i] = current_cut_coordinates[i];
3787 new_current_cut_coordinates [ i] = new_cut_position;
3794 #ifdef HAVE_ZOLTAN2_OMP 3799 if(*rectilinear_cut_count > 0){
3802 Teuchos::scan<int,mj_scalar_t>(
3803 *comm, Teuchos::REDUCE_SUM,
3805 this->process_rectilinear_cut_weight,
3806 this->global_rectilinear_cut_weight
3811 for (mj_part_t i = 0; i < num_cuts; ++i){
3813 if(this->global_rectilinear_cut_weight[i] > 0) {
3815 mj_scalar_t expected_part_weight = current_part_target_weights[i];
3817 mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
3819 mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
3821 mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
3824 mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
3826 mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
3836 if(space_left_to_me < 0){
3838 current_part_cut_line_weight_to_put_left[i] = 0;
3840 else if(space_left_to_me >= my_weight_on_line){
3843 current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
3848 current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
3855 *rectilinear_cut_count = 0;
3869 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3872 mj_part_t num_procs,
3873 mj_part_t num_parts,
3874 mj_gno_t *&num_points_in_all_processor_parts){
3877 size_t allocation_size = num_parts * (num_procs + 1);
3884 mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
3889 mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
3892 mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
3895 memset(num_local_points_in_each_part_to_reduce_sum, 0,
sizeof(mj_gno_t)*allocation_size);
3898 for (mj_part_t i = 0; i < num_parts; ++i){
3899 mj_lno_t part_begin_index = 0;
3901 part_begin_index = this->new_part_xadj[i - 1];
3903 mj_lno_t part_end_index = this->new_part_xadj[i];
3904 my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
3909 memcpy (my_local_point_counts_in_each_art,
3910 my_local_points_to_reduce_sum,
3911 sizeof(mj_gno_t) * (num_parts) );
3919 reduceAll<int, mj_gno_t>(
3921 Teuchos::REDUCE_SUM,
3923 num_local_points_in_each_part_to_reduce_sum,
3924 num_points_in_all_processor_parts);
3927 freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
3944 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3947 size_t migration_reduce_all_population,
3948 mj_lno_t num_coords_for_last_dim_part,
3949 mj_part_t num_procs,
3950 mj_part_t num_parts,
3951 mj_gno_t *num_points_in_all_processor_parts){
3959 if (this->check_migrate_avoid_migration_option == 0){
3960 double global_imbalance = 0;
3962 size_t global_shift = num_procs * num_parts;
3964 for (mj_part_t ii = 0; ii < num_procs; ++ii){
3965 for (mj_part_t i = 0; i < num_parts; ++i){
3966 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
3967 / double(num_procs);
3970 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
3973 global_imbalance /= num_parts;
3974 global_imbalance /= num_procs;
3982 if(global_imbalance <= this->minimum_migration_imbalance){
4005 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4008 mj_part_t num_parts,
4009 mj_part_t *part_assignment_proc_begin_indices,
4010 mj_part_t *processor_chains_in_parts,
4011 mj_lno_t *send_count_to_each_proc,
4012 int *coordinate_destinations){
4014 for (mj_part_t p = 0; p < num_parts; ++p){
4015 mj_lno_t part_begin = 0;
4016 if (p > 0) part_begin = this->new_part_xadj[p - 1];
4017 mj_lno_t part_end = this->new_part_xadj[p];
4020 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4022 mj_lno_t num_total_send = 0;
4023 for (mj_lno_t j=part_begin; j < part_end; j++){
4024 mj_lno_t local_ind = this->new_coordinate_permutations[j];
4025 while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4029 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4031 processor_chains_in_parts[proc_to_sent] = -1;
4033 proc_to_sent = part_assignment_proc_begin_indices[p];
4036 coordinate_destinations[local_ind] = proc_to_sent;
4056 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4059 mj_gno_t * num_points_in_all_processor_parts,
4060 mj_part_t num_parts,
4061 mj_part_t num_procs,
4062 mj_lno_t *send_count_to_each_proc,
4063 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4064 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4065 mj_part_t &out_part_index,
4066 mj_part_t &output_part_numbering_begin_index,
4067 int *coordinate_destinations){
4070 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4071 mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4074 bool did_i_find_my_group =
false;
4076 mj_part_t num_free_procs = num_procs;
4077 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4079 double max_imbalance_difference = 0;
4080 mj_part_t max_differing_part = 0;
4083 for (mj_part_t i=0; i < num_parts; i++){
4086 double scalar_required_proc = num_procs *
4087 (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4090 mj_part_t required_proc =
static_cast<mj_part_t
> (0.5 + scalar_required_proc);
4094 if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4095 required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4099 num_free_procs -= required_proc;
4101 --minimum_num_procs_required_for_rest_of_parts;
4104 num_procs_assigned_to_each_part[i] = required_proc;
4109 double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4110 if (imbalance_wrt_ideal > max_imbalance_difference){
4111 max_imbalance_difference = imbalance_wrt_ideal;
4112 max_differing_part = i;
4117 if (num_free_procs > 0){
4118 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4125 mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4127 mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4128 mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4137 for (
int i = 0; i < num_procs; ++i ){
4138 processor_part_assignments[i] = -1;
4139 processor_chains_in_parts[i] = -1;
4141 for (
int i = 0; i < num_parts; ++i ){
4142 part_assignment_proc_begin_indices[i] = -1;
4148 for(mj_part_t i = 0; i < num_parts; ++i){
4152 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4153 sort_item_num_part_points_in_procs[ii].
id = ii;
4156 if (processor_part_assignments[ii] == -1){
4157 sort_item_num_part_points_in_procs[ii].
val = num_points_in_all_processor_parts[ii * num_parts + i];
4158 sort_item_num_part_points_in_procs[ii].
signbit = 1;
4168 sort_item_num_part_points_in_procs[ii].
val = num_points_in_all_processor_parts[ii * num_parts + i];
4169 sort_item_num_part_points_in_procs[ii].
signbit = 0;
4173 uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4183 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4184 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4185 mj_gno_t ideal_num_points_in_a_proc =
4186 Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part /
double (required_proc_count)));
4189 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4190 mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].
id;
4191 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].
val;
4195 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4196 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].
id;
4198 processor_part_assignments[proc_id] = i;
4201 bool did_change_sign =
false;
4203 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4206 if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4207 did_change_sign =
true;
4208 sort_item_num_part_points_in_procs[ii].
signbit = 1;
4214 if(did_change_sign){
4216 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4228 if (!did_i_find_my_group){
4229 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4231 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].
id;
4233 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4235 if(proc_id_to_assign == this->myRank){
4237 did_i_find_my_group =
true;
4239 part_assignment_proc_begin_indices[i] = this->myRank;
4240 processor_chains_in_parts[this->myRank] = -1;
4243 send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].
val;
4246 for (mj_part_t in = 0; in < i; ++in){
4247 output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4254 if (!did_i_find_my_group){
4255 processor_ranks_for_subcomm.clear();
4263 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4264 mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].
id;
4265 mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].
val;
4270 if (num_points_to_sent < 0) {
4271 cout <<
"Migration - processor assignments - for part:" << i <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:" << num_points_to_sent << std::endl;
4277 while (num_points_to_sent > 0){
4279 if (num_points_to_sent <= space_left_in_sent_proc){
4281 space_left_in_sent_proc -= num_points_to_sent;
4283 if (this->myRank == nonassigned_proc_id){
4285 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4288 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4289 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4290 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4292 num_points_to_sent = 0;
4296 if(space_left_in_sent_proc > 0){
4297 num_points_to_sent -= space_left_in_sent_proc;
4300 if (this->myRank == nonassigned_proc_id){
4302 send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4303 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4304 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4305 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4310 ++next_proc_to_send_index;
4313 if(next_part_to_send_index < nprocs - required_proc_count ){
4314 cout <<
"Migration - processor assignments - for part:" 4316 <<
" next_part_to_send :" << next_part_to_send_index
4317 <<
" nprocs:" << nprocs
4318 <<
" required_proc_count:" << required_proc_count
4319 <<
" Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4325 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].
id;
4327 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].
val;
4335 this->assign_send_destinations(
4337 part_assignment_proc_begin_indices,
4338 processor_chains_in_parts,
4339 send_count_to_each_proc,
4340 coordinate_destinations);
4342 freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4343 freeArray<mj_part_t>(processor_chains_in_parts);
4344 freeArray<mj_part_t>(processor_part_assignments);
4345 freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4346 freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4363 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4366 mj_part_t num_parts,
4368 int *coordinate_destinations,
4369 mj_part_t &output_part_numbering_begin_index,
4370 std::vector<mj_part_t> *next_future_num_parts_in_parts){
4372 mj_part_t part_shift_amount = output_part_numbering_begin_index;
4373 mj_part_t previous_processor = -1;
4374 for(mj_part_t i = 0; i < num_parts; ++i){
4375 mj_part_t p = sort_item_part_to_proc_assignment[i].
id;
4377 mj_lno_t part_begin_index = 0;
4378 if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4379 mj_lno_t part_end_index = this->new_part_xadj[p];
4381 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].
val;
4382 if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4383 output_part_numbering_begin_index = part_shift_amount;
4385 previous_processor = assigned_proc;
4386 part_shift_amount += (*next_future_num_parts_in_parts)[p];
4388 for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4389 mj_lno_t localInd = this->new_coordinate_permutations[j];
4390 coordinate_destinations[localInd] = assigned_proc;
4412 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4415 mj_gno_t * num_points_in_all_processor_parts,
4416 mj_part_t num_parts,
4417 mj_part_t num_procs,
4418 mj_lno_t *send_count_to_each_proc,
4419 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4420 mj_part_t &out_num_part,
4421 std::vector<mj_part_t> &out_part_indices,
4422 mj_part_t &output_part_numbering_begin_index,
4423 int *coordinate_destinations){
4426 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4427 out_part_indices.clear();
4436 mj_lno_t work_each = mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
4438 mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4440 for (mj_part_t i = 0; i < num_procs; ++i){
4441 space_in_each_processor[i] = work_each;
4448 mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4449 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
4450 int empty_proc_count = num_procs;
4458 for (mj_part_t i = 0; i < num_parts; ++i){
4459 sort_item_point_counts_in_parts[i].
id = i;
4460 sort_item_point_counts_in_parts[i].
val = global_num_points_in_parts[i];
4463 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4469 for (mj_part_t j = 0; j < num_parts; ++j){
4471 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].
id;
4473 mj_gno_t load = global_num_points_in_parts[i];
4476 mj_part_t assigned_proc = -1;
4478 mj_part_t best_proc_to_assign = 0;
4482 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4483 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4488 if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4490 sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4493 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4496 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4499 for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4500 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4501 mj_lno_t left_space = space_in_each_processor[ii] - load;
4503 if(left_space >= 0 ){
4508 if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
4509 best_proc_to_assign = ii;
4514 if (assigned_proc == -1){
4515 assigned_proc = best_proc_to_assign;
4518 if (num_parts_proc_assigned[assigned_proc]++ == 0){
4521 space_in_each_processor[assigned_proc] -= load;
4523 sort_item_part_to_proc_assignment[j].
id = i;
4524 sort_item_part_to_proc_assignment[j].
val = assigned_proc;
4528 if (assigned_proc == this->myRank){
4530 out_part_indices.push_back(i);
4534 send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
4536 freeArray<mj_part_t>(num_parts_proc_assigned);
4537 freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
4538 freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
4539 freeArray<mj_lno_t >(space_in_each_processor);
4543 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
4547 this->assign_send_destinations2(
4549 sort_item_part_to_proc_assignment,
4550 coordinate_destinations,
4551 output_part_numbering_begin_index,
4552 next_future_num_parts_in_parts);
4554 freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
4575 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4578 mj_gno_t * num_points_in_all_processor_parts,
4579 mj_part_t num_parts,
4580 mj_part_t num_procs,
4581 mj_lno_t *send_count_to_each_proc,
4582 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4583 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4584 mj_part_t &out_num_part,
4585 std::vector<mj_part_t> &out_part_indices,
4586 mj_part_t &output_part_numbering_begin_index,
4587 int *coordinate_destinations){
4591 processor_ranks_for_subcomm.clear();
4593 if (num_procs > num_parts){
4598 mj_part_t out_part_index = 0;
4599 this->mj_assign_proc_to_parts(
4600 num_points_in_all_processor_parts,
4603 send_count_to_each_proc,
4604 processor_ranks_for_subcomm,
4605 next_future_num_parts_in_parts,
4607 output_part_numbering_begin_index,
4608 coordinate_destinations
4612 out_part_indices.clear();
4613 out_part_indices.push_back(out_part_index);
4620 processor_ranks_for_subcomm.push_back(this->myRank);
4624 this->mj_assign_parts_to_procs(
4625 num_points_in_all_processor_parts,
4628 send_count_to_each_proc,
4629 next_future_num_parts_in_parts,
4632 output_part_numbering_begin_index,
4633 coordinate_destinations);
4649 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4652 mj_part_t num_procs,
4653 mj_lno_t &num_new_local_points,
4654 std::string iteration,
4655 int *coordinate_destinations,
4656 mj_part_t num_parts)
4658 #ifdef ENABLE_ZOLTAN_MIGRATION 4659 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
4665 ZOLTAN_COMM_OBJ *plan = NULL;
4666 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
4667 int num_incoming_gnos = 0;
4668 int message_tag = 7859;
4670 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4671 int ierr = Zoltan_Comm_Create(
4673 int(this->num_local_coords),
4674 coordinate_destinations,
4677 &num_incoming_gnos);
4679 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4681 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
4682 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4686 ierr = Zoltan_Comm_Do(
4689 (
char *) this->current_mj_gnos,
4691 (
char *) incoming_gnos);
4694 freeArray<mj_gno_t>(this->current_mj_gnos);
4695 this->current_mj_gnos = incoming_gnos;
4699 for (
int i = 0; i < this->coord_dim; ++i){
4701 mj_scalar_t *coord = this->mj_coordinates[i];
4703 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4704 ierr = Zoltan_Comm_Do(
4708 sizeof(mj_scalar_t),
4709 (
char *) this->mj_coordinates[i]);
4711 freeArray<mj_scalar_t>(coord);
4715 for (
int i = 0; i < this->num_weights_per_coord; ++i){
4717 mj_scalar_t *weight = this->mj_weights[i];
4719 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4720 ierr = Zoltan_Comm_Do(
4724 sizeof(mj_scalar_t),
4725 (
char *) this->mj_weights[i]);
4727 freeArray<mj_scalar_t>(weight);
4732 int *coord_own = allocMemory<int>(num_incoming_gnos);
4734 ierr = Zoltan_Comm_Do(
4737 (
char *) this->owner_of_coordinate,
4738 sizeof(
int), (
char *) coord_own);
4740 freeArray<int>(this->owner_of_coordinate);
4741 this->owner_of_coordinate = coord_own;
4747 mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4748 if(num_procs < num_parts){
4750 ierr = Zoltan_Comm_Do(
4753 (
char *) this->assigned_part_ids,
4755 (
char *) new_parts);
4758 freeArray<mj_part_t>(this->assigned_part_ids);
4759 this->assigned_part_ids = new_parts;
4761 ierr = Zoltan_Comm_Destroy(&plan);
4763 num_new_local_points = num_incoming_gnos;
4764 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
4769 #endif // ENABLE_ZOLTAN_MIGRATION 4771 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
4772 Tpetra::Distributor distributor(this->comm);
4773 ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
4774 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
4775 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
4777 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
4780 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
4781 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
4782 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
4783 freeArray<mj_gno_t>(this->current_mj_gnos);
4784 this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
4786 this->current_mj_gnos,
4787 received_gnos.getRawPtr(),
4788 num_incoming_gnos *
sizeof(mj_gno_t));
4791 for (
int i = 0; i < this->coord_dim; ++i){
4793 ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
4794 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
4795 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
4796 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
4797 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4799 this->mj_coordinates[i],
4800 received_coord.getRawPtr(),
4801 num_incoming_gnos *
sizeof(mj_scalar_t));
4805 for (
int i = 0; i < this->num_weights_per_coord; ++i){
4807 ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
4808 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
4809 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
4810 freeArray<mj_scalar_t>(this->mj_weights[i]);
4811 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4813 this->mj_weights[i],
4814 received_weight.getRawPtr(),
4815 num_incoming_gnos *
sizeof(mj_scalar_t));
4820 ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
4821 ArrayRCP<int> received_owners(num_incoming_gnos);
4822 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
4823 freeArray<int>(this->owner_of_coordinate);
4824 this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
4826 this->owner_of_coordinate,
4827 received_owners.getRawPtr(),
4828 num_incoming_gnos *
sizeof(int));
4834 if(num_procs < num_parts){
4835 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
4836 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
4837 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
4838 freeArray<mj_part_t>(this->assigned_part_ids);
4839 this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
4841 this->assigned_part_ids,
4842 received_partids.getRawPtr(),
4843 num_incoming_gnos *
sizeof(mj_part_t));
4846 mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
4847 freeArray<mj_part_t>(this->assigned_part_ids);
4848 this->assigned_part_ids = new_parts;
4850 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
4851 num_new_local_points = num_incoming_gnos;
4862 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4865 mj_part_t group_size = processor_ranks_for_subcomm.size();
4866 mj_part_t *ids = allocMemory<mj_part_t>(group_size);
4867 for(mj_part_t i = 0; i < group_size; ++i) {
4868 ids[i] = processor_ranks_for_subcomm[i];
4870 ArrayView<const mj_part_t> idView(ids, group_size);
4871 this->comm = this->comm->createSubcommunicator(idView);
4872 freeArray<mj_part_t>(ids);
4881 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4884 mj_part_t output_num_parts,
4885 mj_part_t num_parts){
4887 if (output_num_parts == 1){
4888 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4889 this->new_coordinate_permutations[i] = i;
4891 this->new_part_xadj[0] = this->num_local_coords;
4898 mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
4900 mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
4902 memset(num_points_in_parts, 0,
sizeof(mj_lno_t) * num_parts);
4904 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4905 mj_part_t ii = this->assigned_part_ids[i];
4906 ++num_points_in_parts[ii];
4911 mj_lno_t prev_index = 0;
4912 for(mj_part_t i = 0; i < num_parts; ++i){
4913 if(num_points_in_parts[i] > 0) {
4914 this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
4915 prev_index += num_points_in_parts[i];
4916 part_shifts[i] = p++;
4921 mj_part_t assigned_num_parts = p - 1;
4922 for (;p < num_parts; ++p){
4923 this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
4925 for(mj_part_t i = 0; i < output_num_parts; ++i){
4926 num_points_in_parts[i] = this->new_part_xadj[i];
4932 for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
4933 mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
4934 this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
4937 freeArray<mj_lno_t>(num_points_in_parts);
4938 freeArray<mj_part_t>(part_shifts);
4965 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4968 mj_part_t input_num_parts,
4969 mj_part_t &output_num_parts,
4970 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4971 mj_part_t &output_part_begin_index,
4972 size_t migration_reduce_all_population,
4973 mj_lno_t num_coords_for_last_dim_part,
4974 std::string iteration,
4975 RCP<mj_partBoxVector_t> &input_part_boxes,
4976 RCP<mj_partBoxVector_t> &output_part_boxes
4979 mj_part_t num_procs = this->comm->getSize();
4980 this->myRank = this->comm->getRank();
4986 mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
4989 this->get_processor_num_points_in_parts(
4992 num_points_in_all_processor_parts);
4996 if (!this->mj_check_to_migrate(
4997 migration_reduce_all_population,
4998 num_coords_for_last_dim_part,
5001 num_points_in_all_processor_parts)){
5002 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5007 mj_lno_t *send_count_to_each_proc = NULL;
5008 int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5009 send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5010 for (
int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5012 std::vector<mj_part_t> processor_ranks_for_subcomm;
5013 std::vector<mj_part_t> out_part_indices;
5016 this->mj_migration_part_proc_assignment(
5017 num_points_in_all_processor_parts,
5020 send_count_to_each_proc,
5021 processor_ranks_for_subcomm,
5022 next_future_num_parts_in_parts,
5025 output_part_begin_index,
5026 coordinate_destinations);
5031 freeArray<mj_lno_t>(send_count_to_each_proc);
5032 std::vector <mj_part_t> tmpv;
5034 std::sort (out_part_indices.begin(), out_part_indices.end());
5035 mj_part_t outP = out_part_indices.size();
5037 mj_gno_t new_global_num_points = 0;
5038 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5040 if (this->mj_keep_part_boxes){
5041 input_part_boxes->clear();
5046 for (mj_part_t i = 0; i < outP; ++i){
5047 mj_part_t ind = out_part_indices[i];
5048 new_global_num_points += global_num_points_in_parts[ind];
5049 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5050 if (this->mj_keep_part_boxes){
5051 input_part_boxes->push_back((*output_part_boxes)[ind]);
5055 if (this->mj_keep_part_boxes){
5056 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5057 input_part_boxes = output_part_boxes;
5058 output_part_boxes = tmpPartBoxes;
5060 next_future_num_parts_in_parts->clear();
5061 for (mj_part_t i = 0; i < outP; ++i){
5062 mj_part_t p = tmpv[i];
5063 next_future_num_parts_in_parts->push_back(p);
5066 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5068 mj_lno_t num_new_local_points = 0;
5072 this->mj_migrate_coords(
5074 num_new_local_points,
5076 coordinate_destinations,
5080 freeArray<int>(coordinate_destinations);
5082 if(this->num_local_coords != num_new_local_points){
5083 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5084 freeArray<mj_lno_t>(this->coordinate_permutations);
5086 this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5087 this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5089 this->num_local_coords = num_new_local_points;
5090 this->num_global_coords = new_global_num_points;
5095 this->create_sub_communicator(processor_ranks_for_subcomm);
5096 processor_ranks_for_subcomm.clear();
5099 this->fill_permutation_array(
5119 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5122 mj_part_t num_parts,
5123 mj_scalar_t *mj_current_dim_coords,
5124 mj_scalar_t *current_concurrent_cut_coordinate,
5125 mj_lno_t coordinate_begin,
5126 mj_lno_t coordinate_end,
5127 mj_scalar_t *used_local_cut_line_weight_to_left,
5128 mj_lno_t *out_part_xadj,
5132 mj_part_t no_cuts = num_parts - 1;
5137 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5138 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5143 if (this->distribute_points_on_cut_lines){
5145 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5146 for (mj_part_t i = 0; i < no_cuts; ++i){
5148 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5150 for(
int ii = 0; ii < this->num_threads; ++ii){
5151 if(left_weight > this->sEpsilon){
5153 mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5154 if(thread_ii_weight_on_cut < left_weight){
5155 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5158 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5160 left_weight -= thread_ii_weight_on_cut;
5163 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5171 for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5172 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5173 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5176 / mj_scalar_t(SIGNIFICANCE_MUL);
5181 for(mj_part_t ii = 0; ii < num_parts; ++ii){
5182 thread_num_points_in_parts[ii] = 0;
5194 mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5198 typedef std::vector< multiSItem > multiSVector;
5199 typedef std::vector<multiSVector> multiS2Vector;
5202 std::vector<mj_scalar_t *>allocated_memory;
5205 multiS2Vector sort_vector_points_on_cut;
5208 mj_part_t different_cut_count = 1;
5213 multiSVector tmpMultiSVector;
5214 sort_vector_points_on_cut.push_back(tmpMultiSVector);
5216 for (mj_part_t i = 1; i < no_cuts ; ++i){
5219 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5220 cut_map[i] = cut_map[i-1];
5223 cut_map[i] = different_cut_count++;
5224 multiSVector tmp2MultiSVector;
5225 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5231 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5233 mj_lno_t i = this->coordinate_permutations[ii];
5235 mj_part_t pp = this->assigned_part_ids[i];
5236 mj_part_t p = pp / 2;
5239 mj_scalar_t *
vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5240 allocated_memory.push_back(vals);
5244 for(
int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5245 vals[val_ind++] = this->mj_coordinates[dim][i];
5247 for(
int dim = 0; dim < coordInd; ++dim){
5248 vals[val_ind++] = this->mj_coordinates[dim][i];
5250 multiSItem tempSortItem(i, this->coord_dim -1, vals);
5252 mj_part_t cmap = cut_map[p];
5253 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5257 ++thread_num_points_in_parts[p];
5258 this->assigned_part_ids[i] = p;
5263 for (mj_part_t i = 0; i < different_cut_count; ++i){
5264 std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5268 mj_part_t previous_cut_map = cut_map[0];
5277 mj_scalar_t weight_stolen_from_previous_part = 0;
5278 for (mj_part_t p = 0; p < no_cuts; ++p){
5280 mj_part_t mapped_cut = cut_map[p];
5284 if (previous_cut_map != mapped_cut){
5285 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5286 for (; sort_vector_end >= 0; --sort_vector_end){
5287 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5288 mj_lno_t i = t.index;
5289 ++thread_num_points_in_parts[p];
5290 this->assigned_part_ids[i] = p;
5292 sort_vector_points_on_cut[previous_cut_map].clear();
5294 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5296 for (; sort_vector_end >= 0; --sort_vector_end){
5297 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5298 mj_lno_t i = t.index;
5299 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5303 if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5304 my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part -
ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5307 my_local_thread_cut_weights_to_put_left[p] -= w;
5308 sort_vector_points_on_cut[mapped_cut].pop_back();
5309 ++thread_num_points_in_parts[p];
5310 this->assigned_part_ids[i] = p;
5313 if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5314 if(mapped_cut == cut_map[p + 1] ){
5317 if (previous_cut_map != mapped_cut){
5318 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5323 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5327 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5336 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5337 if (previous_cut_map != mapped_cut){
5338 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5341 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5345 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5351 previous_cut_map = mapped_cut;
5354 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5355 for (; sort_vector_end >= 0; --sort_vector_end){
5356 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5357 mj_lno_t i = t.index;
5358 ++thread_num_points_in_parts[no_cuts];
5359 this->assigned_part_ids[i] = no_cuts;
5361 sort_vector_points_on_cut[previous_cut_map].clear();
5362 freeArray<mj_part_t> (cut_map);
5365 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5366 for(mj_lno_t i = 0; i < vSize; ++i){
5367 freeArray<mj_scalar_t> (allocated_memory[i]);
5371 for(mj_part_t j = 0; j < num_parts; ++j){
5372 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5373 for (
int i = 0; i < this->num_threads; ++i){
5374 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5375 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5376 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5379 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
5383 for(mj_part_t j = 1; j < num_parts; ++j){
5384 out_part_xadj[j] += out_part_xadj[j - 1];
5390 for(mj_part_t j = 1; j < num_parts; ++j){
5391 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5396 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5397 mj_lno_t i = this->coordinate_permutations[ii];
5398 mj_part_t p = this->assigned_part_ids[i];
5399 this->new_coordinate_permutations[coordinate_begin +
5400 thread_num_points_in_parts[p]++] = i;
5415 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5418 mj_part_t current_num_parts,
5419 mj_part_t output_part_begin_index,
5420 RCP<mj_partBoxVector_t> &output_part_boxes,
5421 bool is_data_ever_migrated)
5423 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5425 #ifdef HAVE_ZOLTAN2_OMP 5426 #pragma omp parallel for 5428 for(mj_part_t i = 0; i < current_num_parts;++i){
5431 mj_lno_t end = this->part_xadj[i];
5433 if(i > 0) begin = this->part_xadj[i -1];
5434 mj_part_t part_to_set_index = i + output_part_begin_index;
5435 if (this->mj_keep_part_boxes){
5436 (*output_part_boxes)[i].setpId(part_to_set_index);
5438 for (mj_lno_t ii = begin; ii < end; ++ii){
5439 mj_lno_t k = this->coordinate_permutations[ii];
5440 this->assigned_part_ids[k] = part_to_set_index;
5445 if(!is_data_ever_migrated){
5452 #ifdef ENABLE_ZOLTAN_MIGRATION 5453 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
5460 ZOLTAN_COMM_OBJ *plan = NULL;
5461 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5464 int message_tag = 7856;
5466 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating");
5467 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
5468 this->owner_of_coordinate, mpi_comm, message_tag,
5471 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating" );
5473 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5476 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5477 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->current_mj_gnos,
5478 sizeof(mj_gno_t), (
char *) incoming_gnos);
5481 freeArray<mj_gno_t>(this->current_mj_gnos);
5482 this->current_mj_gnos = incoming_gnos;
5484 mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
5487 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->assigned_part_ids,
5488 sizeof(mj_part_t), (
char *) incoming_partIds);
5490 freeArray<mj_part_t>(this->assigned_part_ids);
5491 this->assigned_part_ids = incoming_partIds;
5493 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5494 ierr = Zoltan_Comm_Destroy(&plan);
5497 this->num_local_coords = incoming;
5502 #endif // !ENABLE_ZOLTAN_MIGRATION 5505 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating");
5506 Tpetra::Distributor distributor(this->mj_problemComm);
5507 ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
5508 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
5509 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating" );
5511 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5513 ArrayRCP<mj_gno_t> received_gnos(incoming);
5514 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5515 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5516 freeArray<mj_gno_t>(this->current_mj_gnos);
5517 this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
5518 memcpy( this->current_mj_gnos,
5519 received_gnos.getRawPtr(),
5520 incoming *
sizeof(mj_gno_t));
5523 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5524 ArrayRCP<mj_part_t> received_partids(incoming);
5525 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5526 freeArray<mj_part_t>(this->assigned_part_ids);
5527 this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
5528 memcpy( this->assigned_part_ids,
5529 received_partids.getRawPtr(),
5530 incoming *
sizeof(mj_part_t));
5532 this->num_local_coords = incoming;
5533 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5538 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5540 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5545 if (this->mj_keep_part_boxes){
5550 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5555 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5558 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5560 for (
int i=0; i < this->coord_dim; i++){
5561 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5563 freeArray<mj_scalar_t *>(this->mj_coordinates);
5565 for (
int i=0; i < this->num_weights_per_coord; i++){
5566 freeArray<mj_scalar_t>(this->mj_weights[i]);
5568 freeArray<mj_scalar_t *>(this->mj_weights);
5570 freeArray<int>(this->owner_of_coordinate);
5572 for(
int i = 0; i < this->num_threads; ++i){
5573 freeArray<mj_lno_t>(this->thread_point_counts[i]);
5576 freeArray<mj_lno_t *>(this->thread_point_counts);
5577 freeArray<double *> (this->thread_part_weight_work);
5579 if(this->distribute_points_on_cut_lines){
5580 freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5581 for(
int i = 0; i < this->num_threads; ++i){
5582 freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5584 freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5585 freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5586 freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5589 freeArray<mj_part_t>(this->my_incomplete_cut_count);
5591 freeArray<mj_scalar_t>(this->max_min_coords);
5593 freeArray<mj_lno_t>(this->new_part_xadj);
5595 freeArray<mj_lno_t>(this->coordinate_permutations);
5597 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5599 freeArray<mj_scalar_t>(this->all_cut_coordinates);
5601 freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5603 freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5605 freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5607 freeArray<mj_scalar_t>(this->target_part_weights);
5609 freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5611 freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5613 freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5614 freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5615 freeArray<bool>(this->is_cut_line_determined);
5616 freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5617 freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5619 for(
int i = 0; i < this->num_threads; ++i){
5620 freeArray<double>(this->thread_part_weights[i]);
5621 freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5622 freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5625 freeArray<double *>(this->thread_part_weights);
5626 freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5627 freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5629 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5639 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5642 bool distribute_points_on_cut_lines_,
5643 int max_concurrent_part_calculation_,
5644 int check_migrate_avoid_migration_option_,
5645 mj_scalar_t minimum_migration_imbalance_){
5646 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5647 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5648 this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5649 this->minimum_migration_imbalance = minimum_migration_imbalance_;
5682 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5686 const RCP<const Environment> &env,
5687 RCP<Comm<int> > &problemComm,
5689 double imbalance_tolerance_,
5690 size_t num_global_parts_,
5691 mj_part_t *part_no_array_,
5692 int recursion_depth_,
5695 mj_lno_t num_local_coords_,
5696 mj_gno_t num_global_coords_,
5697 const mj_gno_t *initial_mj_gnos_,
5698 mj_scalar_t **mj_coordinates_,
5700 int num_weights_per_coord_,
5701 bool *mj_uniform_weights_,
5702 mj_scalar_t **mj_weights_,
5703 bool *mj_uniform_parts_,
5704 mj_scalar_t **mj_part_sizes_,
5706 mj_part_t *&result_assigned_part_ids_,
5707 mj_gno_t *&result_mj_gnos_
5712 if(comm->getRank() == 0){
5713 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
5714 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
5715 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
5720 this->mj_problemComm = problemComm;
5721 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
5723 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Total");
5724 this->mj_env->debug(3,
"In MultiJagged Jagged");
5727 this->imbalance_tolerance = imbalance_tolerance_;
5728 this->num_global_parts = num_global_parts_;
5729 this->part_no_array = part_no_array_;
5730 this->recursion_depth = recursion_depth_;
5732 this->coord_dim = coord_dim_;
5733 this->num_local_coords = num_local_coords_;
5734 this->num_global_coords = num_global_coords_;
5735 this->mj_coordinates = mj_coordinates_;
5736 this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_;
5738 this->num_weights_per_coord = num_weights_per_coord_;
5739 this->mj_uniform_weights = mj_uniform_weights_;
5740 this->mj_weights = mj_weights_;
5741 this->mj_uniform_parts = mj_uniform_parts_;
5742 this->mj_part_sizes = mj_part_sizes_;
5744 this->num_threads = 1;
5745 #ifdef HAVE_ZOLTAN2_OMP 5746 #pragma omp parallel 5749 this->num_threads = omp_get_num_threads();
5754 this->set_part_specifications();
5756 this->allocate_set_work_memory();
5760 this->comm = this->mj_problemComm->duplicate();
5763 mj_part_t current_num_parts = 1;
5764 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
5766 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
5768 mj_part_t output_part_begin_index = 0;
5769 mj_part_t future_num_parts = this->total_num_part;
5770 bool is_data_ever_migrated =
false;
5772 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
5773 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
5774 next_future_num_parts_in_parts->push_back(this->num_global_parts);
5776 RCP<mj_partBoxVector_t> input_part_boxes(
new mj_partBoxVector_t(),
true) ;
5777 RCP<mj_partBoxVector_t> output_part_boxes(
new mj_partBoxVector_t(),
true);
5779 compute_global_box();
5780 if(this->mj_keep_part_boxes){
5781 this->init_part_boxes(output_part_boxes);
5784 for (
int i = 0; i < this->recursion_depth; ++i){
5787 std::vector <mj_part_t> num_partitioning_in_current_dim;
5801 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
5802 future_num_part_in_parts = next_future_num_parts_in_parts;
5803 next_future_num_parts_in_parts = tmpPartVect;
5808 next_future_num_parts_in_parts->clear();
5810 if(this->mj_keep_part_boxes){
5811 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5812 input_part_boxes = output_part_boxes;
5813 output_part_boxes = tmpPartBoxes;
5814 output_part_boxes->clear();
5818 mj_part_t output_part_count_in_dimension =
5819 this->update_part_num_arrays(
5820 num_partitioning_in_current_dim,
5821 future_num_part_in_parts,
5822 next_future_num_parts_in_parts,
5832 if(output_part_count_in_dimension == current_num_parts) {
5834 tmpPartVect= future_num_part_in_parts;
5835 future_num_part_in_parts = next_future_num_parts_in_parts;
5836 next_future_num_parts_in_parts = tmpPartVect;
5838 if(this->mj_keep_part_boxes){
5839 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5840 input_part_boxes = output_part_boxes;
5841 output_part_boxes = tmpPartBoxes;
5848 int coordInd = i % this->coord_dim;
5849 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
5852 std::string istring = toString<int>(i);
5853 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
5857 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
5860 mj_part_t output_part_index = 0;
5863 mj_part_t output_coordinate_end_index = 0;
5865 mj_part_t current_work_part = 0;
5866 mj_part_t current_concurrent_num_parts =
5867 std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
5869 mj_part_t obtained_part_index = 0;
5872 for (; current_work_part < current_num_parts;
5873 current_work_part += current_concurrent_num_parts){
5875 current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
5876 this->max_concurrent_part_calculation);
5878 mj_part_t actual_work_part_count = 0;
5882 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
5883 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
5887 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
5890 ++actual_work_part_count;
5891 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
5892 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
5900 this->mj_get_local_min_max_coord_totW(
5901 coordinate_begin_index,
5902 coordinate_end_index,
5903 this->coordinate_permutations,
5904 mj_current_dim_coords,
5905 this->process_local_min_max_coord_total_weight[kk],
5906 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
5907 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]);
5912 if (actual_work_part_count > 0){
5914 this->mj_get_global_min_max_coord_totW(
5915 current_concurrent_num_parts,
5916 this->process_local_min_max_coord_total_weight,
5917 this->global_min_max_coord_total_weight);
5921 mj_part_t total_incomplete_cut_count = 0;
5926 mj_part_t concurrent_part_cut_shift = 0;
5927 mj_part_t concurrent_part_part_shift = 0;
5928 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
5929 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
5930 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
5931 current_concurrent_num_parts];
5933 mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
5934 2 * current_concurrent_num_parts];
5936 mj_part_t concurrent_current_part_index = current_work_part + kk;
5938 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
5940 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
5941 mj_scalar_t *current_target_part_weights = this->target_part_weights +
5942 concurrent_part_part_shift;
5944 concurrent_part_cut_shift += partition_count - 1;
5946 concurrent_part_part_shift += partition_count;
5951 if(partition_count > 1 && min_coordinate <= max_coordinate){
5955 total_incomplete_cut_count += partition_count - 1;
5958 this->my_incomplete_cut_count[kk] = partition_count - 1;
5961 this->mj_get_initial_cut_coords_target_weights(
5964 partition_count - 1,
5965 global_total_weight,
5967 current_target_part_weights,
5968 future_num_part_in_parts,
5969 next_future_num_parts_in_parts,
5970 concurrent_current_part_index,
5971 obtained_part_index);
5973 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
5974 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
5978 this->set_initial_coordinate_parts(
5981 concurrent_current_part_index,
5982 coordinate_begin_index, coordinate_end_index,
5983 this->coordinate_permutations,
5984 mj_current_dim_coords,
5985 this->assigned_part_ids,
5990 this->my_incomplete_cut_count[kk] = 0;
5992 obtained_part_index += partition_count;
5999 mj_scalar_t used_imbalance = 0;
6004 mj_current_dim_coords,
6007 current_concurrent_num_parts,
6008 current_cut_coordinates,
6009 total_incomplete_cut_count,
6010 num_partitioning_in_current_dim);
6015 mj_part_t output_array_shift = 0;
6016 mj_part_t cut_shift = 0;
6017 size_t tlr_shift = 0;
6018 size_t partweight_array_shift = 0;
6020 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6021 mj_part_t current_concurrent_work_part = current_work_part + kk;
6022 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6025 if((num_parts != 1 )
6027 this->global_min_max_coord_total_weight[kk] >
6028 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6032 for(mj_part_t jj = 0; jj < num_parts; ++jj){
6033 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6035 cut_shift += num_parts - 1;
6036 tlr_shift += (4 *(num_parts - 1) + 1);
6037 output_array_shift += num_parts;
6038 partweight_array_shift += (2 * (num_parts - 1) + 1);
6042 mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6043 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6044 current_concurrent_work_part -1];
6045 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6046 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6051 for(
int ii = 0; ii < this->num_threads; ++ii){
6052 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6056 if(this->mj_keep_part_boxes){
6058 for (mj_part_t j = 0; j < num_parts - 1; ++j){
6059 (*output_part_boxes)[output_array_shift + output_part_index +
6060 j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6063 (*output_part_boxes)[output_array_shift + output_part_index + j +
6064 1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6070 this->mj_create_new_partitions(
6072 mj_current_dim_coords,
6073 current_concurrent_cut_coordinate,
6076 used_local_cut_line_weight_to_left,
6077 this->thread_part_weight_work,
6078 this->new_part_xadj + output_part_index + output_array_shift
6085 mj_lno_t part_size = coordinate_end - coordinate_begin;
6086 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6088 this->new_coordinate_permutations + coordinate_begin,
6089 this->coordinate_permutations + coordinate_begin,
6090 part_size *
sizeof(mj_lno_t));
6092 cut_shift += num_parts - 1;
6093 tlr_shift += (4 *(num_parts - 1) + 1);
6094 output_array_shift += num_parts;
6095 partweight_array_shift += (2 * (num_parts - 1) + 1);
6105 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6106 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6107 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6109 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6112 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6114 output_part_index += num_parts ;
6121 int current_world_size = this->comm->getSize();
6122 long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6125 bool is_migrated_in_current_dimension =
false;
6130 if (future_num_parts > 1 &&
6131 this->check_migrate_avoid_migration_option >= 0 &&
6132 current_world_size > 1){
6134 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6135 mj_part_t num_parts = output_part_count_in_dimension;
6136 if ( this->mj_perform_migration(
6139 next_future_num_parts_in_parts,
6140 output_part_begin_index,
6141 migration_reduce_all_population,
6142 this->num_local_coords / (future_num_parts * current_num_parts),
6144 input_part_boxes, output_part_boxes) ) {
6145 is_migrated_in_current_dimension =
true;
6146 is_data_ever_migrated =
true;
6147 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" +
6150 this->total_dim_num_reduce_all /= num_parts;
6153 is_migrated_in_current_dimension =
false;
6154 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6159 mj_lno_t * tmp = this->coordinate_permutations;
6160 this->coordinate_permutations = this->new_coordinate_permutations;
6161 this->new_coordinate_permutations = tmp;
6163 if(!is_migrated_in_current_dimension){
6164 this->total_dim_num_reduce_all -= current_num_parts;
6165 current_num_parts = output_part_count_in_dimension;
6167 freeArray<mj_lno_t>(this->part_xadj);
6168 this->part_xadj = this->new_part_xadj;
6170 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6174 delete future_num_part_in_parts;
6175 delete next_future_num_parts_in_parts;
6177 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6184 this->set_final_parts(
6186 output_part_begin_index,
6188 is_data_ever_migrated);
6190 result_assigned_part_ids_ = this->assigned_part_ids;
6191 result_mj_gnos_ = this->current_mj_gnos;
6193 this->free_work_memory();
6194 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Total");
6195 this->mj_env->debug(3,
"Out of MultiJagged");
6203 template <
typename Adapter>
6208 #ifndef DOXYGEN_SHOULD_SKIP_THIS 6211 typedef typename Adapter::scalar_t mj_scalar_t;
6212 typedef typename Adapter::gno_t mj_gno_t;
6213 typedef typename Adapter::lno_t mj_lno_t;
6214 typedef typename Adapter::node_t mj_node_t;
6215 typedef typename Adapter::part_t mj_part_t;
6217 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6221 RCP<const Environment> mj_env;
6222 RCP<Comm<int> > mj_problemComm;
6223 RCP<const coordinateModel_t> mj_coords;
6226 double imbalance_tolerance;
6227 size_t num_global_parts;
6228 mj_part_t *part_no_array;
6229 int recursion_depth;
6232 mj_lno_t num_local_coords;
6233 mj_gno_t num_global_coords;
6234 const mj_gno_t *initial_mj_gnos;
6235 mj_scalar_t **mj_coordinates;
6237 int num_weights_per_coord;
6238 bool *mj_uniform_weights;
6239 mj_scalar_t **mj_weights;
6240 bool *mj_uniform_parts;
6241 mj_scalar_t **mj_part_sizes;
6243 bool distribute_points_on_cut_lines;
6244 mj_part_t max_concurrent_part_calculation;
6245 int check_migrate_avoid_migration_option;
6246 mj_scalar_t minimum_migration_imbalance;
6247 int mj_keep_part_boxes;
6253 ArrayRCP<mj_part_t> comXAdj_;
6254 ArrayRCP<mj_part_t> comAdj_;
6260 ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6262 void set_up_partitioning_data(
6265 void set_input_parameters(
const Teuchos::ParameterList &p);
6267 void free_work_memory();
6269 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
6274 RCP<Comm<int> > &problemComm,
6275 const RCP<const coordinateModel_t> &coords) :
6276 mj_partitioner(), mj_env(env),
6277 mj_problemComm(problemComm),
6279 imbalance_tolerance(0),
6280 num_global_parts(1), part_no_array(NULL),
6282 coord_dim(0),num_local_coords(0), num_global_coords(0),
6283 initial_mj_gnos(NULL), mj_coordinates(NULL),
6284 num_weights_per_coord(0),
6285 mj_uniform_weights(NULL), mj_weights(NULL),
6286 mj_uniform_parts(NULL),
6287 mj_part_sizes(NULL),
6288 distribute_points_on_cut_lines(true),
6289 max_concurrent_part_calculation(1),
6290 check_migrate_avoid_migration_option(0),
6291 minimum_migration_imbalance(0.30),
6292 mj_keep_part_boxes(0), num_threads(1), mj_run_as_rcb(0),
6293 comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
6296 if (coordinate_ArrayRCP_holder != NULL){
6297 delete [] this->coordinate_ArrayRCP_holder;
6298 this->coordinate_ArrayRCP_holder = NULL;
6312 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6316 mj_part_t pointAssign(
int dim, mj_scalar_t *point)
const;
6318 void boxAssign(
int dim, mj_scalar_t *lower, mj_scalar_t *upper,
6319 size_t &nPartsFound, mj_part_t **partsFound)
const;
6324 void getCommunicationGraph(
6326 ArrayRCP<mj_part_t> &comXAdj,
6327 ArrayRCP<mj_part_t> &comAdj);
6340 template <
typename Adapter>
6345 this->set_up_partitioning_data(solution);
6346 this->set_input_parameters(this->mj_env->getParameters());
6347 if (this->mj_keep_part_boxes){
6348 this->mj_partitioner.set_to_keep_part_boxes();
6350 this->mj_partitioner.set_partitioning_parameters(
6351 this->distribute_points_on_cut_lines,
6352 this->max_concurrent_part_calculation,
6353 this->check_migrate_avoid_migration_option,
6354 this->minimum_migration_imbalance);
6356 mj_part_t *result_assigned_part_ids = NULL;
6357 mj_gno_t *result_mj_gnos = NULL;
6358 this->mj_partitioner.multi_jagged_part(
6360 this->mj_problemComm,
6362 this->imbalance_tolerance,
6363 this->num_global_parts,
6364 this->part_no_array,
6365 this->recursion_depth,
6368 this->num_local_coords,
6369 this->num_global_coords,
6370 this->initial_mj_gnos,
6371 this->mj_coordinates,
6373 this->num_weights_per_coord,
6374 this->mj_uniform_weights,
6376 this->mj_uniform_parts,
6377 this->mj_part_sizes,
6379 result_assigned_part_ids,
6384 #if defined(__cplusplus) && __cplusplus >= 201103L 6385 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
6386 localGidToLid.reserve(this->num_local_coords);
6387 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6388 localGidToLid[this->initial_mj_gnos[i]] = i;
6390 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[this->num_local_coords],
6391 0, this->num_local_coords,
true);
6393 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6394 mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
6395 partId[origLID] = result_assigned_part_ids[i];
6399 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
6400 localGidToLid(this->num_local_coords);
6401 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6402 localGidToLid.put(this->initial_mj_gnos[i], i);
6404 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[this->num_local_coords],
6405 0, this->num_local_coords,
true);
6407 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6408 mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
6409 partId[origLID] = result_assigned_part_ids[i];
6412 #endif // C++11 is enabled 6414 delete [] result_mj_gnos;
6415 delete [] result_assigned_part_ids;
6417 solution->setParts(partId);
6418 this->free_work_memory();
6423 template <
typename Adapter>
6425 freeArray<mj_scalar_t *>(this->mj_coordinates);
6426 freeArray<mj_scalar_t *>(this->mj_weights);
6427 freeArray<bool>(this->mj_uniform_parts);
6428 freeArray<mj_scalar_t *>(this->mj_part_sizes);
6429 freeArray<bool>(this->mj_uniform_weights);
6435 template <
typename Adapter>
6440 this->coord_dim = this->mj_coords->getCoordinateDim();
6441 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
6442 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
6443 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
6444 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
6449 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
6452 this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
6453 this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
6456 this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
6459 this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
6461 this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
6464 ArrayView<const mj_gno_t> gnos;
6465 ArrayView<input_t> xyz;
6466 ArrayView<input_t> wgts;
6469 this->coordinate_ArrayRCP_holder =
new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
6471 this->mj_coords->getCoordinates(gnos, xyz, wgts);
6473 ArrayView<const mj_gno_t> mj_gnos = gnos;
6474 this->initial_mj_gnos = mj_gnos.getRawPtr();
6477 for (
int dim=0; dim < this->coord_dim; dim++){
6478 ArrayRCP<const mj_scalar_t> ar;
6480 this->coordinate_ArrayRCP_holder[dim] = ar;
6483 this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
6487 if (this->num_weights_per_coord == 0){
6488 this->mj_uniform_weights[0] =
true;
6489 this->mj_weights[0] = NULL;
6493 for (
int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
6494 ArrayRCP<const mj_scalar_t> ar;
6495 wgts[wdim].getInputArray(ar);
6496 this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
6497 this->mj_uniform_weights[wdim] =
false;
6498 this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
6502 for (
int wdim = 0; wdim < criteria_dim; wdim++){
6503 if (solution->criteriaHasUniformPartSizes(wdim)){
6504 this->mj_uniform_parts[wdim] =
true;
6505 this->mj_part_sizes[wdim] = NULL;
6508 std::cerr <<
"MJ does not support non uniform target part weights" << std::endl;
6517 template <
typename Adapter>
6520 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
6523 tol = pe->getValue(&tol);
6524 this->imbalance_tolerance = tol - 1.0;
6528 if (this->imbalance_tolerance <= 0)
6529 this->imbalance_tolerance= 10e-4;
6532 this->part_no_array = NULL;
6534 this->recursion_depth = 0;
6536 if (pl.getPtr<Array <mj_part_t> >(
"mj_parts")){
6537 this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >(
"mj_parts")->getRawPtr();
6538 this->recursion_depth = pl.getPtr<Array <mj_part_t> >(
"mj_parts")->size() - 1;
6539 this->mj_env->debug(2,
"mj_parts provided by user");
6543 this->distribute_points_on_cut_lines =
true;
6544 this->max_concurrent_part_calculation = 1;
6546 this->mj_run_as_rcb = 0;
6547 int mj_user_recursion_depth = -1;
6548 this->mj_keep_part_boxes = 0;
6549 this->check_migrate_avoid_migration_option = 0;
6550 this->minimum_migration_imbalance = 0.35;
6552 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
6555 imb = pe->getValue(&imb);
6556 this->minimum_migration_imbalance = imb - 1.0;
6559 pe = pl.getEntryPtr(
"mj_migration_option");
6561 this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
6563 this->check_migrate_avoid_migration_option = 0;
6565 if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
6568 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
6570 this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
6572 this->max_concurrent_part_calculation = 1;
6575 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
6577 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
6579 this->mj_keep_part_boxes = 0;
6590 if (this->mj_keep_part_boxes == 0){
6591 pe = pl.getEntryPtr(
"mapping_type");
6593 int mapping_type = -1;
6594 mapping_type = pe->getValue(&mapping_type);
6595 if (mapping_type == 0){
6596 mj_keep_part_boxes = 1;
6602 pe = pl.getEntryPtr(
"mj_enable_rcb");
6604 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
6606 this->mj_run_as_rcb = 0;
6609 pe = pl.getEntryPtr(
"mj_recursion_depth");
6611 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
6613 mj_user_recursion_depth = -1;
6617 pe = pl.getEntryPtr(
"rectilinear");
6618 if (pe) val = pe->getValue(&val);
6620 this->distribute_points_on_cut_lines =
false;
6622 this->distribute_points_on_cut_lines =
true;
6625 if (this->mj_run_as_rcb){
6626 mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
6628 if (this->recursion_depth < 1){
6629 if (mj_user_recursion_depth > 0){
6630 this->recursion_depth = mj_user_recursion_depth;
6633 this->recursion_depth = this->coord_dim;
6637 this->num_threads = 1;
6638 #ifdef HAVE_ZOLTAN2_OMP 6639 #pragma omp parallel 6641 this->num_threads = omp_get_num_threads();
6648 template <
typename Adapter>
6651 typename Adapter::scalar_t *lower,
6652 typename Adapter::scalar_t *upper,
6653 size_t &nPartsFound,
6654 typename Adapter::part_t **partsFound)
const 6663 if (this->mj_keep_part_boxes) {
6666 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6668 size_t nBoxes = (*partBoxes).size();
6670 throw std::logic_error(
"no part boxes exist");
6674 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6676 if (globalBox->boxesOverlap(dim, lower, upper)) {
6678 std::vector<typename Adapter::part_t> partlist;
6681 for (
size_t i = 0; i < nBoxes; i++) {
6683 if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
6685 partlist.push_back((*partBoxes)[i].getpId());
6706 *partsFound =
new mj_part_t[nPartsFound];
6707 for (
size_t i = 0; i < nPartsFound; i++)
6708 (*partsFound)[i] = partlist[i];
6721 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
6726 template <
typename Adapter>
6729 typename Adapter::scalar_t *point)
const 6736 if (this->mj_keep_part_boxes) {
6737 typename Adapter::part_t foundPart = -1;
6740 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6742 size_t nBoxes = (*partBoxes).size();
6744 throw std::logic_error(
"no part boxes exist");
6748 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6750 if (globalBox->pointInBox(dim, point)) {
6754 for (i = 0; i < nBoxes; i++) {
6756 if ((*partBoxes)[i].pointInBox(dim, point)) {
6757 foundPart = (*partBoxes)[i].getpId();
6771 std::ostringstream oss;
6773 for (
int j = 0; j < dim; j++) oss << point[j] <<
" ";
6774 oss <<
") not found in domain";
6775 throw std::logic_error(oss.str());
6784 size_t closestBox = 0;
6785 mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
6786 mj_scalar_t *centroid =
new mj_scalar_t[dim];
6787 for (
size_t i = 0; i < nBoxes; i++) {
6788 (*partBoxes)[i].computeCentroid(centroid);
6789 mj_scalar_t sum = 0.;
6791 for (
int j = 0; j < dim; j++) {
6792 diff = centroid[j] - point[j];
6795 if (sum < minDistance) {
6800 foundPart = (*partBoxes)[closestBox].getpId();
6807 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
6811 template <
typename Adapter>
6817 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
6818 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6819 mj_part_t ntasks = (*pBoxes).size();
6820 int dim = (*pBoxes)[0].getDim();
6829 template <
typename Adapter>
6830 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
6833 return this->mj_partitioner.get_kept_boxes();
6837 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6839 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6842 if (this->mj_keep_part_boxes)
6843 return this->kept_boxes;
6845 throw std::logic_error(
"Error: part boxes are not stored.");
6848 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6850 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6852 RCP<mj_partBoxVector_t> &localPartBoxes
6855 mj_part_t ntasks = this->num_global_parts;
6856 int dim = (*localPartBoxes)[0].getDim();
6857 mj_scalar_t *localPartBoundaries =
new mj_scalar_t[ntasks * 2 *dim];
6859 memset(localPartBoundaries, 0,
sizeof(mj_scalar_t) * ntasks * 2 *dim);
6861 mj_scalar_t *globalPartBoundaries =
new mj_scalar_t[ntasks * 2 *dim];
6862 memset(globalPartBoundaries, 0,
sizeof(mj_scalar_t) * ntasks * 2 *dim);
6864 mj_scalar_t *localPartMins = localPartBoundaries;
6865 mj_scalar_t *localPartMaxs = localPartBoundaries + ntasks * dim;
6867 mj_scalar_t *globalPartMins = globalPartBoundaries;
6868 mj_scalar_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
6870 mj_part_t boxCount = localPartBoxes->size();
6871 for (mj_part_t i = 0; i < boxCount; ++i){
6872 mj_part_t pId = (*localPartBoxes)[i].getpId();
6875 mj_scalar_t *lmins = (*localPartBoxes)[i].getlmins();
6876 mj_scalar_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
6878 for (
int j = 0; j < dim; ++j){
6879 localPartMins[dim * pId + j] = lmins[j];
6880 localPartMaxs[dim * pId + j] = lmaxs[j];
6892 reduceAll<int, mj_scalar_t>(*mj_problemComm, reductionOp,
6893 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
6894 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
6895 for (mj_part_t i = 0; i < ntasks; ++i){
6897 globalPartMins + dim * i,
6898 globalPartMaxs + dim * i);
6910 delete []localPartBoundaries;
6911 delete []globalPartBoundaries;
#define MIN_WORK_LAST_DIM
GridHash Class, Hashing Class for part boxes.
Time an algorithm (or other entity) as a whole.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, mj_scalar_t minimum_migration_imbalance_)
Multi Jagged coordinate partitioning algorithm.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Defines Parameter related enumerators, declares functions.
void boxAssign(int dim, mj_scalar_t *lower, mj_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Sort items for quick sort function.
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals...
#define imbalanceOf2(Wachieved, wExpected)
RCP< mj_partBoxVector_t > get_kept_boxes() const
void freeArray(T *&array)
Frees the given array.
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
bool operator>=(const uSignedSortItem< IT, WT, SIGN > &rhs)
#define FUTURE_REDUCEALL_CUTOFF
Multi Jagged coordinate partitioning algorithm.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
T * allocMemory(size_t size)
Allocates memory for the given size.
mj_part_t pointAssign(int dim, mj_scalar_t *point) const
uMultiSortItem< IT, CT, WT > operator=(const uMultiSortItem< IT, CT, WT > &other)
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
void getInputArray(ArrayRCP< const T > &array) const
Create a contiguous array of the required type, perhaps for a TPL.
uMultiSortItem(const uMultiSortItem< IT, CT, WT > &other)
#define LEAST_SIGNIFICANCE
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
The StridedData class manages lists of weights or coordinates.
Algorithm defines the base class for all algorithms.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
RCP< mj_partBox_t > get_global_box() const
Return the global bounding box: min/max coords of global domain.
uMultiSortItem(IT index_, CT count_, WT *vals_)
std::string toString(tt obj)
Converts the given object to string.
Defines the CoordinateModel classes.
bool operator>(const uSignedSortItem< IT, WT, SIGN > &rhs) const
void multi_jagged_part(const RCP< const Environment > &env, RCP< Comm< int > > &problemComm, double imbalance_tolerance, size_t num_global_parts, mj_part_t *part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, const mj_gno_t *initial_mj_gnos, mj_scalar_t **mj_coordinates, int num_weights_per_coord, bool *mj_uniform_weights, mj_scalar_t **mj_weights, bool *mj_uniform_parts, mj_scalar_t **mj_part_sizes, mj_part_t *&result_assigned_part_ids, mj_gno_t *&result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
bool operator>(const uMultiSortItem< IT, CT, WT > &other) const
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Multi Jagged coordinate partitioning algorithm.