Zoltan2_AlgMultiJagged.hpp
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
51 
54 #include <Zoltan2_Parameters.hpp>
55 #include <Zoltan2_Algorithm.hpp>
56 
57 #include <Tpetra_Distributor.hpp>
58 #include <Teuchos_ParameterList.hpp>
60 #include <new> // ::operator new[]
61 #include <algorithm> // std::sort
62 #include <Zoltan2_Util.hpp>
63 #include <vector>
64 
65 #if defined(__cplusplus) && __cplusplus >= 201103L
66 #include <unordered_map>
67 #else
68 #include <Teuchos_Hashtable.hpp>
69 #endif // C++11 is enabled
70 
71 #ifdef ZOLTAN2_USEZOLTANCOMM
72 #ifdef HAVE_ZOLTAN2_MPI
73 #define ENABLE_ZOLTAN_MIGRATION
74 #include "zoltan_comm_cpp.h"
75 #include "zoltan_types.h" // for error codes
76 #endif
77 #endif
78 
79 #ifdef HAVE_ZOLTAN2_OMP
80 #include <omp.h>
81 #endif
82 
83 #define LEAST_SIGNIFICANCE 0.0001
84 #define SIGNIFICANCE_MUL 1000
85 
86 //if (the last-dimension reduceAll count) x (the MPI world size) is
87 //estimated to be bigger than this number, then migration will be forced
88 //in earlier iterations.
89 #define FUTURE_REDUCEALL_CUTOFF 1500000
90 //if parts right before last dimension are estimated to have less than
91 //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
92 #define MIN_WORK_LAST_DIM 1000
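
//Illustrative magnitudes for the two heuristics above (a sketch, not from the
//sources): with last_dim_num_part = 1000 on 2048 MPI ranks, the product
//1000 * 2048 = 2,048,000 exceeds FUTURE_REDUCEALL_CUTOFF, so migration would
//be forced early; on 64 ranks the product is only 64,000 and the reduceAll
//cost is tolerated.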
93 
94 
95 
96 
97 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
98 //imbalance calculation. Wreal / Wexpected - 1
99 #define imbalanceOf(Wachieved, totalW, expectedRatio) \
100  (((Wachieved) / ((totalW) * (expectedRatio))) - 1)
101 #define imbalanceOf2(Wachieved, wExpected) \
102  (((Wachieved) / (wExpected)) - 1)
103 
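//Illustrative use of the imbalance macros (a sketch, not from the sources):
//a part that achieved weight 60 out of a total 100 with expected ratio 0.5
//carries 20% more weight than its target.
//
// double achieved = 60.0, total = 100.0, ratio = 0.5; // hypothetical values
// double imb = imbalanceOf(achieved, total, ratio); // 60/(100*0.5) - 1 = 0.2
// double imb2 = imbalanceOf2(achieved, total * ratio); // same result: 0.2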
104 
105 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
106 
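//Note that ZOLTAN2_ALGMULTIJAGGED_SWAP expands to three statements, so it is
//only safe inside a braced block (as it is used below). Hypothetical sketch:
//
// int x = 1, y = 2, tmp;
// { ZOLTAN2_ALGMULTIJAGGED_SWAP(x, y, tmp) } // now x == 2 and y == 1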
107 
108 namespace Teuchos{
109 
114 template <typename Ordinal, typename T>
115 class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
116 {
117 private:
118  Ordinal size;
119  T _EPSILON;
120 
121 public:
124  Zoltan2_BoxBoundaries ():size(0), _EPSILON (std::numeric_limits<T>::epsilon()){}
125 
132  Zoltan2_BoxBoundaries (Ordinal s_):
133  size(s_), _EPSILON (std::numeric_limits<T>::epsilon()){}
134 
137  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
138  {
139  for (Ordinal i=0; i < count; i++){
140  if (ZOLTAN2_ABS(inBuffer[i]) > _EPSILON){
141  inoutBuffer[i] = inBuffer[i];
142  }
143  }
144  }
145 };
146 } // namespace Teuchos
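
//Illustrative behavior of the reduction op above (a sketch, not from the
//sources): incoming entries whose magnitude exceeds machine epsilon overwrite
//the in/out buffer, so a box boundary set by any rank survives the reduceAll.
//
// Teuchos::Zoltan2_BoxBoundaries<int, double> op(4);
// double in[4] = {0.0, 2.5, 0.0, -1.0}; // hypothetical local contribution
// double inout[4] = {1.0, 0.0, 3.0, 0.0};
// op.reduce(4, in, inout); // inout becomes {1.0, 2.5, 3.0, -1.0}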
147 
148 namespace Zoltan2{
149 
153 template <typename T>
154 T *allocMemory(size_t size){
155  if (size > 0){
156  T * a = new T[size];
157  if (a == NULL) {
158  throw "cannot allocate memory";
159  }
160  return a;
161  }
162  else {
163  return NULL;
164  }
165 }
166 
170 template <typename T>
171 void freeArray(T *&array){
172  if(array != NULL){
173  delete [] array;
174  array = NULL;
175  }
176 }
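
//Typical paired usage of the two helpers above (a sketch, not from the
//sources):
//
// size_t n = 100; // hypothetical buffer length
// double *scratch = allocMemory<double>(n); // returns NULL only when n == 0
// // ... use scratch[0..n-1] ...
// freeArray<double>(scratch); // deletes the array and resets scratch to NULL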
177 
181 template <typename tt>
182 std::string toString(tt obj){
183  std::stringstream ss (std::stringstream::in |std::stringstream::out);
184  ss << obj;
185  std::string tmp = "";
186  ss >> tmp;
187  return tmp;
188 }
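
//e.g. (illustrative): toString<int>(42) yields "42". Note that operator>>
//stops at whitespace, so only the first token of the streamed value is kept.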
189 
197 template <typename IT, typename CT, typename WT>
198 class uMultiSortItem
199 {
200 public:
201  IT index;
202  CT count;
203  //unsigned int val;
204  WT *val;
205  WT _EPSILON;
206 
207  uMultiSortItem(){
208  this->index = 0;
209  this->count = 0;
210  this->val = NULL;
211  this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
212  }
213 
214 
215  uMultiSortItem(IT index_ ,CT count_, WT *vals_){
216  this->index = index_;
217  this->count = count_;
218  this->val = vals_;
219  this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
220  }
221 
222  uMultiSortItem(const uMultiSortItem<IT,CT,WT>& other){
223  this->index = other.index;
224  this->count = other.count;
225  this->val = other.val;
226  this->_EPSILON = other._EPSILON;
227  }
228 
229  ~uMultiSortItem(){
230  //freeArray<WT>(this->val);
231  }
232 
233  void set(IT index_ ,CT count_, WT *vals_){
234  this->index = index_;
235  this->count = count_;
236  this->val = vals_;
237  }
238 
239 
240  uMultiSortItem<IT,CT,WT> operator=(const uMultiSortItem<IT,CT,WT>& other){
241  this->index = other.index;
242  this->count = other.count;
243  this->val = other.val;
244  return *(this);
245  }
246 
247  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const{
248  assert (this->count == other.count);
249  for(CT i = 0; i < this->count; ++i){
250  //if the values are equal go to next one.
251  if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
252  continue;
253  }
254  //if next value is smaller return true;
255  if(this->val[i] < other.val[i]){
256  return true;
257  }
258  //if next value is bigger return false;
259  else {
260  return false;
261  }
262  }
263  //if they are totally equal.
264  return this->index < other.index;
265  }
266  bool operator>(const uMultiSortItem<IT,CT,WT>& other) const{
267  assert (this->count == other.count);
268  for(CT i = 0; i < this->count; ++i){
269  //if the values are equal go to next one.
270  if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
271  continue;
272  }
273  //if next value is bigger return true;
274  if(this->val[i] > other.val[i]){
275  return true;
276  }
277  //if next value is smaller return false;
278  else //(this->val[i] > other.val[i])
279  {
280  return false;
281  }
282  }
283  //if they are totally equal.
284  return this->index > other.index;
285  }
286 };// uMultiSortItem
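
//Illustrative ordering under the epsilon-tolerant comparison above (a sketch,
//not from the sources): coordinates closer than _EPSILON are treated as ties
//and the comparison falls through to the next dimension, then to the index.
//
// double a_vals[2] = {1.0, 3.0};
// double b_vals[2] = {1.0 + 1e-18, 2.0}; // first coordinate ties within eps
// uMultiSortItem<int, int, double> a(0, 2, a_vals), b(1, 2, b_vals);
// bool lt = a < b; // false: the tie falls through and 3.0 > 2.0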
287 
291 template <class IT, class WT>
292 struct uSortItem
293 {
294  IT id;
295  //unsigned int val;
296  WT val;
297 };// uSortItem;
298 
302 template <class IT, class WT>
303 void uqsort(IT n, uSortItem<IT, WT> * arr)
304 {
305 
306  int NSTACK = 50;
307  int M = 7;
308  IT i, ir=n, j, k, l=1;
309  IT jstack=0, istack[50];
310  WT aval;
311  uSortItem<IT,WT> a, temp;
312 
313  --arr;
314  for (;;)
315  {
316  if (ir-l < M)
317  {
318  for (j=l+1;j<=ir;j++)
319  {
320  a=arr[j];
321  aval = a.val;
322  for (i=j-1;i>=1;i--)
323  {
324  if (arr[i].val <= aval)
325  break;
326  arr[i+1] = arr[i];
327  }
328  arr[i+1]=a;
329  }
330  if (jstack == 0)
331  break;
332  ir=istack[jstack--];
333  l=istack[jstack--];
334  }
335  else
336  {
337  k=(l+ir) >> 1;
338 
339  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
340  if (arr[l+1].val > arr[ir].val)
341  {
342  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
343  }
344  if (arr[l].val > arr[ir].val)
345  {
346  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
347  }
348  if (arr[l+1].val > arr[l].val)
349  {
350  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
351  }
352  i=l+1;
353  j=ir;
354  a=arr[l];
355  aval = a.val;
356  for (;;)
357  {
358  do i++; while (arr[i].val < aval);
359  do j--; while (arr[j].val > aval);
360  if (j < i) break;
361  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
362  }
363  arr[l]=arr[j];
364  arr[j]=a;
365  jstack += 2;
366  if (jstack > NSTACK){
367  std::cout << "uqsort: NSTACK too small in sort." << std::endl;
368  exit(1);
369  }
370  if (ir-i+1 >= j-l)
371  {
372  istack[jstack]=ir;
373  istack[jstack-1]=i;
374  ir=j-1;
375  }
376  else
377  {
378  istack[jstack]=j-1;
379  istack[jstack-1]=l;
380  l=i;
381  }
382  }
383  }
384 }
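
//Hypothetical usage of uqsort (a sketch, not from the sources); the function
//takes an ordinary 0-based array and shifts to 1-based indexing internally:
//
// uSortItem<int, double> items[3] = {{0, 2.5}, {1, 0.5}, {2, 1.5}};
// uqsort<int, double>(3, items);
// // items is now ascending by val: ids 1 (0.5), 2 (1.5), 0 (2.5)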
385 
386 template <class IT, class WT, class SIGN>
387 struct uSignedSortItem
388 {
389  IT id;
390  //unsigned int val;
391  WT val;
392  SIGN signbit; // 1 means positive, 0 means negative.
393  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
394  /*if I am negative, the other is positive*/
395  if (this->signbit < rhs.signbit){
396  return true;
397  }
398  /*if both have the same sign*/
399  else if (this->signbit == rhs.signbit){
400 
401  if (this->val < rhs.val){//if my value is smaller,
402  return this->signbit;//then if we both are positive return true.
403  //if we both are negative, return false.
404  }
405  else if (this->val > rhs.val){//if my value is larger,
406  return !this->signbit; //then if we both are positive return false.
407  //if we both are negative, return true.
408  }
409  else { //if both are equal.
410  return false;
411  }
412  }
413  else {
414  /*if I am positive, the other is negative*/
415  return false;
416  }
417 
418  }
419  bool operator>(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
420  /*if I am positive, the other is negative*/
421  if (this->signbit > rhs.signbit){
422  return true;
423  }
424  /*if both have the same sign*/
425  else if (this->signbit == rhs.signbit){
426 
427  if (this->val < rhs.val){//if my value is smaller,
428  return !this->signbit;//then if we both are positive return false.
429  //if we both are negative, return true.
430  }
431  else if (this->val > rhs.val){//if my value is larger,
432  return this->signbit; //then if we both are positive return true.
433  //if we both are negative, return false.
434  }
435  else { // if they are equal
436  return false;
437  }
438  }
439  else {
440  /*if I am negative, the other is positive*/
441  return false;
442  }
443  }
444  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs){
445  return !(*this > rhs);}
446  bool operator>=(const uSignedSortItem<IT, WT, SIGN>& rhs){
447  return !(*this < rhs);}
448 };
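
//Illustrative ordering (a sketch, not from the sources): signbit==1 marks a
//positive value and signbit==0 a negative one, with val holding the
//magnitude, so negatives sort before all positives and, among themselves,
//by decreasing magnitude.
//
// uSignedSortItem<int, double, char> a = {0, 5.0, 0}; // represents -5.0
// uSignedSortItem<int, double, char> b = {1, 3.0, 0}; // represents -3.0
// uSignedSortItem<int, double, char> c = {2, 3.0, 1}; // represents +3.0
// bool t1 = a < b; // true: -5.0 < -3.0
// bool t2 = b < c; // true: any negative precedes any positive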
449 
453 template <class IT, class WT, class SIGN>
454 void uqSignsort(IT n, uSignedSortItem<IT, WT, SIGN> * arr){
455 
456  IT NSTACK = 50;
457  IT M = 7;
458  IT i, ir=n, j, k, l=1;
459  IT jstack=0, istack[50];
460  uSignedSortItem<IT,WT,SIGN> a, temp;
461 
462  --arr;
463  for (;;)
464  {
465  if (ir < M + l)
466  {
467  for (j=l+1;j<=ir;j++)
468  {
469  a=arr[j];
470  for (i=j-1;i>=1;i--)
471  {
472  if (arr[i] <= a)
473  {
474  break;
475  }
476  arr[i+1] = arr[i];
477  }
478  arr[i+1]=a;
479  }
480  if (jstack == 0)
481  break;
482  ir=istack[jstack--];
483  l=istack[jstack--];
484  }
485  else
486  {
487  k=(l+ir) >> 1;
488  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
489  if (arr[l+1] > arr[ir])
490  {
491  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
492  }
493  if (arr[l] > arr[ir])
494  {
495  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
496  }
497  if (arr[l+1] > arr[l])
498  {
499  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
500  }
501  i=l+1;
502  j=ir;
503  a=arr[l];
504  for (;;)
505  {
506  do i++; while (arr[i] < a);
507  do j--; while (arr[j] > a);
508  if (j < i) break;
509  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
510  }
511  arr[l]=arr[j];
512  arr[j]=a;
513  jstack += 2;
514  if (jstack > NSTACK){
515  std::cout << "uqSignsort: NSTACK too small in sort." << std::endl;
516  exit(1);
517  }
518  if (ir+l+1 >= j+i)
519  {
520  istack[jstack]=ir;
521  istack[jstack-1]=i;
522  ir=j-1;
523  }
524  else
525  {
526  istack[jstack]=j-1;
527  istack[jstack-1]=l;
528  l=i;
529  }
530  }
531  }
532 }
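
//Hypothetical usage of uqSignsort (a sketch, not from the sources), reusing
//the three items from the comment above:
//
// uSignedSortItem<int, double, char> arr[3] =
//  {{2, 3.0, 1}, {0, 5.0, 0}, {1, 3.0, 0}};
// uqSignsort<int, double, char>(3, arr);
// // arr is now ordered: ids 0 (-5.0), 1 (-3.0), 2 (+3.0)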
533 
537 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
538  typename mj_part_t>
539 class AlgMJ
540 {
541 private:
542  typedef coordinateModelPartBox<mj_scalar_t, mj_part_t> mj_partBox_t;
543  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
544 
545  RCP<const Environment> mj_env; //the environment object
546  RCP<Comm<int> > mj_problemComm; //initial comm object
547 
548  double imbalance_tolerance; //input imbalance tolerance.
549  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
550  int recursion_depth; //the number of steps that partitioning will be solved in.
551  int coord_dim, num_weights_per_coord; //coordinate dim and # of weights per coord
552 
553  size_t initial_num_loc_coords; //initial num local coords.
554  global_size_t initial_num_glob_coords; //initial num global coords.
555 
556  mj_lno_t num_local_coords; //number of local coords.
557  mj_gno_t num_global_coords; //number of global coords.
558 
559  mj_scalar_t **mj_coordinates; //two dimension coordinate array
560  mj_scalar_t **mj_weights; //two dimension weight array
561  bool *mj_uniform_parts; //if the target parts are uniform
562  mj_scalar_t **mj_part_sizes; //target part weight sizes.
563  bool *mj_uniform_weights; //if the coordinates have uniform weights.
564 
565  ArrayView<const mj_gno_t> mj_gnos; //global ids of the coordinates, comes from the input
566  size_t num_global_parts; //the targeted number of parts
567 
568  mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
569  mj_gno_t *current_mj_gnos; //current global ids of the coordinates, might change during migration.
570  int *owner_of_coordinate; //the actual processor owner of the coordinate, to track after migrations.
571 
572  mj_lno_t *coordinate_permutations; //permutation of coordinates, for partitioning.
573  mj_lno_t *new_coordinate_permutations; //permutation work array.
574  mj_part_t *assigned_part_ids; //the part ids assigned to coordinates.
575 
576  mj_lno_t *part_xadj; //beginning and end of each part.
577  mj_lno_t *new_part_xadj; // work array for beginning and end of each part.
578 
579  //get mj specific parameters.
580  bool distribute_points_on_cut_lines; //if partitioning can distribute points on the same coordinate to different parts.
581  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
582 
583  int mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
584  int mj_user_recursion_depth; //the recursion depth value provided by user.
585  int mj_keep_part_boxes; //if the boxes need to be kept.
586 
587  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
588  mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
589  int num_threads; //num threads
590 
591  mj_part_t total_num_cut ; //total number of cuts
592  mj_part_t total_num_part; //total number of parts
593 
594  mj_part_t max_num_part_along_dim ; //maximum part count along a dimension.
595  mj_part_t max_num_cut_along_dim; //maximum cut count along a dimension.
596  size_t max_num_total_part_along_dim; //maximum part+cut count along a dimension.
597 
598  mj_part_t total_dim_num_reduce_all; //estimated number of reduceAlls that will be done.
599  mj_part_t last_dim_num_part; //max no of parts that might occur
600  //during the partition before the
601  //last partitioning dimension.
602 
603  RCP<Comm<int> > comm; //comm object that can be altered during execution
604  float fEpsilon; //epsilon for float
605  mj_scalar_t sEpsilon; //epsilon for mj_scalar_t
606 
607  mj_scalar_t maxScalar_t; //max possible scalar
608  mj_scalar_t minScalar_t; //min scalar
609 
610  mj_scalar_t *all_cut_coordinates;
611  mj_scalar_t *max_min_coords;
612  mj_scalar_t *process_cut_line_weight_to_put_left; //how much weight an MPI rank should put on the left side of each cutline
613  mj_scalar_t **thread_cut_line_weight_to_put_left; //what fraction of the weight each thread within an MPI rank should put on the left side of each cutline
614 
615  // work array to manipulate coordinate of cutlines in different iterations.
616  //necessary because previous cut line information is used for determining
617  //the next cutline information. therefore, cannot update the cut work array
618  //until all cutlines are determined.
619  mj_scalar_t *cut_coordinates_work_array;
620 
621  //cumulative part weight array.
622  mj_scalar_t *target_part_weights;
623 
624  mj_scalar_t *cut_upper_bound_coordinates ; //upper bound coordinate of a cut line
625  mj_scalar_t *cut_lower_bound_coordinates ; //lower bound coordinate of a cut line
626  mj_scalar_t *cut_lower_bound_weights ; //lower bound weight of a cut line
627  mj_scalar_t *cut_upper_bound_weights ; //upper bound weight of a cut line
628 
629  mj_scalar_t *process_local_min_max_coord_total_weight ; //combined array to exchange the min and max coordinate, and total weight of part.
630  mj_scalar_t *global_min_max_coord_total_weight ;//global combined array with the results for min, max and total weight.
631 
632  //isDone is used to determine if a cutline is determined already.
633  //If a cut line is already determined, the next iterations will skip this cut line.
634  bool *is_cut_line_determined;
635  //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
636  //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
637  mj_part_t *my_incomplete_cut_count;
638  //local part weights of each thread.
639  double **thread_part_weights;
640  //the work manipulation array for partweights.
641  double **thread_part_weight_work;
642 
643  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
644  mj_scalar_t **thread_cut_left_closest_point;
645  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
646  mj_scalar_t **thread_cut_right_closest_point;
647 
648  //to store how many points in each part a thread has.
649  mj_lno_t **thread_point_counts;
650 
651  mj_scalar_t *process_rectilinear_cut_weight;
652  mj_scalar_t *global_rectilinear_cut_weight;
653 
654  //for faster communication, concatenation of
655  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
656  //leftClosest distances sized P-1, since P-1 cut lines
657  //rightClosest distances size P-1, since P-1 cut lines.
658  mj_scalar_t *total_part_weight_left_right_closests ;
659  mj_scalar_t *global_total_part_weight_left_right_closests;
660 
661  RCP<mj_partBoxVector_t> kept_boxes; // vector of all boxes for all parts;
662  // constructed only if
663  // mj_keep_part_boxes == true
664  RCP<mj_partBox_t> global_box;
665  int myRank, myActualRank; //processor rank, and initial rank
666 
667  /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
668  * the input. part_no_array takes
669  * precedence if both are provided.
670  * Depending on these parameters, total cut/part number,
671  * maximum part/cut number along a dimension, estimated number of reduceAlls,
672  * and the number of parts before the last dimension is calculated.
673  * */
674  void set_part_specifications();
675 
676  /* \brief Tries to determine the part number for current dimension,
677  * by trying to make the partitioning as square as possible.
678  * \param num_total_future how many more partitionings are required.
679  * \param root how many more recursion depth is left.
680  */
681  inline mj_part_t get_part_count(
682  mj_part_t num_total_future,
683  double root);
684 
685  /* \brief Allocates the all required memory for the mj partitioning algorithm.
686  *
687  */
688  void allocate_set_work_memory();
689 
690  /* \brief for part communication we keep track of the box boundaries.
691  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
692  * This function initializes a single box with all global min and max coordinates.
693  * \param initial_partitioning_boxes the input and output vector for boxes.
694  */
695  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
696 
697  /* \brief compute global bounding box: min/max coords of global domain */
698  void compute_global_box();
699 
700  /* \brief Function returns how many parts will be obtained after this dimension's partitioning.
701  * It sets in the num_partitioning_in_current_dim vector how many parts each current part will be partitioned into in this dimension,
702  * sets in the next_future_num_parts_in_parts vector how many total future parts each obtained part will be partitioned into,
703  * and, if part boxes are kept, initializes the output_part_boxes from their ancestors.
704  *
705  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
706  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
707  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
708  * \param future_num_parts: output, max number of future parts that will be obtained from a single part.
709  * \param current_num_parts: input, how many parts are there currently.
710  * \param current_iteration: input, current dimension iteration number.
711  * \param input_part_boxes: input, if boxes are kept, current boxes.
712  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
713  */
714  mj_part_t update_part_num_arrays(
715  std::vector<mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
716  std::vector<mj_part_t> *future_num_part_in_parts,
717  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
718  mj_part_t &future_num_parts,
719  mj_part_t current_num_parts,
720  int current_iteration,
721  RCP<mj_partBoxVector_t> input_part_boxes,
722  RCP<mj_partBoxVector_t> output_part_boxes);
723 
735  void mj_get_local_min_max_coord_totW(
736  mj_lno_t coordinate_begin_index,
737  mj_lno_t coordinate_end_index,
738  mj_lno_t *mj_current_coordinate_permutations,
739  mj_scalar_t *mj_current_dim_coords,
740  mj_scalar_t &min_coordinate,
741  mj_scalar_t &max_coordinate,
742  mj_scalar_t &total_weight);
743 
751  void mj_get_global_min_max_coord_totW(
752  mj_part_t current_concurrent_num_parts,
753  mj_scalar_t *local_min_max_total,
754  mj_scalar_t *global_min_max_total);
755 
774  void mj_get_initial_cut_coords_target_weights(
775  mj_scalar_t min_coord,
776  mj_scalar_t max_coord,
777  mj_part_t num_cuts/*p-1*/ ,
778  mj_scalar_t global_weight,
779  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
780  mj_scalar_t *target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
781 
782  std::vector <mj_part_t> *future_num_part_in_parts, //the vector of how many future parts each current part will be divided into.
783  std::vector <mj_part_t> *next_future_num_parts_in_parts,
784  mj_part_t concurrent_current_part,
785  mj_part_t obtained_part_index);
786 
799  void set_initial_coordinate_parts(
800  mj_scalar_t &max_coordinate,
801  mj_scalar_t &min_coordinate,
802  mj_part_t &concurrent_current_part_index,
803  mj_lno_t coordinate_begin_index,
804  mj_lno_t coordinate_end_index,
805  mj_lno_t *mj_current_coordinate_permutations,
806  mj_scalar_t *mj_current_dim_coords,
807  mj_part_t *mj_part_ids,
808  mj_part_t &partition_count);
809 
820  void mj_1D_part(
821  mj_scalar_t *mj_current_dim_coords,
822  mj_scalar_t imbalanceTolerance,
823  mj_part_t current_work_part,
824  mj_part_t current_concurrent_num_parts,
825  mj_scalar_t *current_cut_coordinates,
826  mj_part_t total_incomplete_cut_count,
827  std::vector <mj_part_t> &num_partitioning_in_current_dim);
828 
848  void mj_1D_part_get_thread_part_weights(
849  size_t total_part_count,
850  mj_part_t num_cuts,
851  mj_scalar_t max_coord,
852  mj_scalar_t min_coord,
853  mj_lno_t coordinate_begin_index,
854  mj_lno_t coordinate_end_index,
855  mj_scalar_t *mj_current_dim_coords,
856  mj_scalar_t *temp_current_cut_coords,
857  bool *current_cut_status,
858  double *my_current_part_weights,
859  mj_scalar_t *my_current_left_closest,
860  mj_scalar_t *my_current_right_closest);
861 
869  void mj_accumulate_thread_results(
870  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
871  mj_part_t current_work_part,
872  mj_part_t current_concurrent_num_parts);
873 
904  void mj_get_new_cut_coordinates(
905  const size_t &num_total_part,
906  const mj_part_t &num_cuts,
907  const mj_scalar_t &max_coordinate,
908  const mj_scalar_t &min_coordinate,
909  const mj_scalar_t &global_total_weight,
910  const mj_scalar_t &used_imbalance_tolerance,
911  mj_scalar_t * current_global_part_weights,
912  const mj_scalar_t * current_local_part_weights,
913  const mj_scalar_t *current_part_target_weights,
914  bool *current_cut_line_determined,
915  mj_scalar_t *current_cut_coordinates,
916  mj_scalar_t *current_cut_upper_bounds,
917  mj_scalar_t *current_cut_lower_bounds,
918  mj_scalar_t *current_global_left_closest_points,
919  mj_scalar_t *current_global_right_closest_points,
920  mj_scalar_t * current_cut_lower_bound_weights,
921  mj_scalar_t * current_cut_upper_weights,
922  mj_scalar_t *new_current_cut_coordinates,
923  mj_scalar_t *current_part_cut_line_weight_to_put_left,
924  mj_part_t *rectilinear_cut_count,
925  mj_part_t &my_num_incomplete_cut);
926 
936  void mj_calculate_new_cut_position (
937  mj_scalar_t cut_upper_bound,
938  mj_scalar_t cut_lower_bound,
939  mj_scalar_t cut_upper_weight,
940  mj_scalar_t cut_lower_weight,
941  mj_scalar_t expected_weight,
942  mj_scalar_t &new_cut_position);
943 
954  void mj_create_new_partitions(
955  mj_part_t num_parts,
956  mj_scalar_t *mj_current_dim_coords,
957  mj_scalar_t *current_concurrent_cut_coordinate,
958  mj_lno_t coordinate_begin,
959  mj_lno_t coordinate_end,
960  mj_scalar_t *used_local_cut_line_weight_to_left,
961  double **used_thread_part_weight_work,
962  mj_lno_t *out_part_xadj);
963 
986  bool mj_perform_migration(
987  mj_part_t in_num_parts, //current number of parts
988  mj_part_t &out_num_parts, //output number of parts.
989  std::vector<mj_part_t> *next_future_num_parts_in_parts,
990  mj_part_t &output_part_begin_index,
991  size_t migration_reduce_all_population,
992  mj_lno_t num_coords_for_last_dim_part,
993  std::string iteration,
994  RCP<mj_partBoxVector_t> &input_part_boxes,
995  RCP<mj_partBoxVector_t> &output_part_boxes);
996 
1006  void get_processor_num_points_in_parts(
1007  mj_part_t num_procs,
1008  mj_part_t num_parts,
1009  mj_gno_t *&num_points_in_all_processor_parts);
1010 
1023  bool mj_check_to_migrate(
1024  size_t migration_reduce_all_population,
1025  mj_lno_t num_coords_for_last_dim_part,
1026  mj_part_t num_procs,
1027  mj_part_t num_parts,
1028  mj_gno_t *num_points_in_all_processor_parts);
1029 
1030 
1048  void mj_migration_part_proc_assignment(
1049  mj_gno_t * num_points_in_all_processor_parts,
1050  mj_part_t num_parts,
1051  mj_part_t num_procs,
1052  mj_lno_t *send_count_to_each_proc,
1053  std::vector<mj_part_t> &processor_ranks_for_subcomm,
1054  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1055  mj_part_t &out_num_part,
1056  std::vector<mj_part_t> &out_part_indices,
1057  mj_part_t &output_part_numbering_begin_index,
1058  int *coordinate_destinations);
1059 
1076  void mj_assign_proc_to_parts(
1077  mj_gno_t * num_points_in_all_processor_parts,
1078  mj_part_t num_parts,
1079  mj_part_t num_procs,
1080  mj_lno_t *send_count_to_each_proc,
1081  std::vector<mj_part_t> &processor_ranks_for_subcomm,
1082  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1083  mj_part_t &out_part_index,
1084  mj_part_t &output_part_numbering_begin_index,
1085  int *coordinate_destinations);
1086 
1097  void assign_send_destinations(
1098  mj_part_t num_parts,
1099  mj_part_t *part_assignment_proc_begin_indices,
1100  mj_part_t *processor_chains_in_parts,
1101  mj_lno_t *send_count_to_each_proc,
1102  int *coordinate_destinations);
1103 
1116  void assign_send_destinations2(
1117  mj_part_t num_parts,
1118  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
1119  int *coordinate_destinations,
1120  mj_part_t &output_part_numbering_begin_index,
1121  std::vector<mj_part_t> *next_future_num_parts_in_parts);
1122 
1139  void mj_assign_parts_to_procs(
1140  mj_gno_t * num_points_in_all_processor_parts,
1141  mj_part_t num_parts,
1142  mj_part_t num_procs,
1143  mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
1144  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
1145  mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
1146  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
1147  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
1148  int *coordinate_destinations);
1149 
1162  void mj_migrate_coords(
1163  mj_part_t num_procs,
1164  mj_lno_t &num_new_local_points,
1165  std::string iteration,
1166  int *coordinate_destinations,
1167  mj_part_t num_parts);
1168 
1175  void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1176 
1177 
1183  void fill_permutation_array(
1184  mj_part_t output_num_parts,
1185  mj_part_t num_parts);
1186 
1195  void set_final_parts(
1196  mj_part_t current_num_parts,
1197  mj_part_t output_part_begin_index,
1198  RCP<mj_partBoxVector_t> &output_part_boxes,
1199  bool is_data_ever_migrated);
1202  void free_work_memory();
1216  void create_consistent_chunks(
1217  mj_part_t num_parts,
1218  mj_scalar_t *mj_current_dim_coords,
1219  mj_scalar_t *current_concurrent_cut_coordinate,
1220  mj_lno_t coordinate_begin,
1221  mj_lno_t coordinate_end,
1222  mj_scalar_t *used_local_cut_line_weight_to_left,
1223  mj_lno_t *out_part_xadj,
1224  int coordInd);
1225 public:
1226  AlgMJ();
1227 
1256  void multi_jagged_part(
1257  const RCP<const Environment> &env,
1258  RCP<Comm<int> > &problemComm,
1259 
1260  double imbalance_tolerance,
1261  size_t num_global_parts,
1262  mj_part_t *part_no_array,
1263  int recursion_depth,
1264 
1265  int coord_dim,
1266  mj_lno_t num_local_coords,
1267  mj_gno_t num_global_coords,
1268  const mj_gno_t *initial_mj_gnos,
1269  mj_scalar_t **mj_coordinates,
1270 
1271  int num_weights_per_coord,
1272  bool *mj_uniform_weights,
1273  mj_scalar_t **mj_weights,
1274  bool *mj_uniform_parts,
1275  mj_scalar_t **mj_part_sizes,
1276 
1277  mj_part_t *&result_assigned_part_ids,
1278  mj_gno_t *&result_mj_gnos
1279 
1280  );
1288  void set_partitioning_parameters(
1289  bool distribute_points_on_cut_lines_,
1290  int max_concurrent_part_calculation_,
1291  int check_migrate_avoid_migration_option_,
1292  mj_scalar_t minimum_migration_imbalance_);
1296  void set_to_keep_part_boxes();
1297 
1300  RCP<mj_partBox_t> get_global_box() const;
1301 
1302  RCP<mj_partBoxVector_t> get_kept_boxes() const;
1303 
1304  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1305  RCP<mj_partBoxVector_t> &localPartBoxes) const;
1306 
1331  void sequential_task_partitioning(
1332  const RCP<const Environment> &env,
1333  mj_lno_t num_total_coords,
1334  mj_lno_t num_selected_coords,
1335  size_t num_target_part,
1336  int coord_dim,
1337  mj_scalar_t **mj_coordinates,
1338  mj_lno_t *initial_selected_coords_output_permutation,
1339  mj_lno_t *output_xadj,
1340  int recursion_depth,
1341  const mj_part_t *part_no_array);
1342 
1343 };
1344 
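//Illustrative call sequence for the class above (a sketch under assumed types
//and variables; in Trilinos the surrounding Zoltan2 code wires env, comm,
//gnos, coords, and the weight/part-size arrays from the coordinate model):
//
// AlgMJ<double, int, long long, int> mj;
// mj.set_partitioning_parameters(true, 1, 0, 0.30);
// int *result_parts; long long *result_gnos;
// mj.multi_jagged_part(env, comm, /*imbalance_tolerance=*/1.03,
//  /*num_global_parts=*/16, /*part_no_array=*/NULL, /*recursion_depth=*/4,
//  /*coord_dim=*/3, num_local, num_global, gnos, coords,
//  /*num_weights_per_coord=*/0, uniform_weights, weights,
//  uniform_parts, part_sizes, result_parts, result_gnos);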
1369 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1370  typename mj_part_t>
1371 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::sequential_task_partitioning(
1372  const RCP<const Environment> &env,
1373  mj_lno_t num_total_coords,
1374  mj_lno_t num_selected_coords,
1375  size_t num_target_part,
1376  int coord_dim_,
1377  mj_scalar_t **mj_coordinates_,
1378  mj_lno_t *inital_adjList_output_adjlist,
1379  mj_lno_t *output_xadj,
1380  int rd,
1381  const mj_part_t *part_no_array_
1382 ){
1383 
1384  this->mj_env = env;
1385  const RCP<Comm<int> > commN;
1386  this->comm = this->mj_problemComm = Teuchos::rcp_const_cast<Comm<int> >
1387  (Teuchos::DefaultComm<int>::getDefaultSerialComm(commN));
1388  this->myActualRank = this->myRank = 1;
1389 
1390 #ifdef HAVE_ZOLTAN2_OMP
1391  int actual_num_threads = omp_get_num_threads();
1392  omp_set_num_threads(1);
1393 #endif
1394 
1395  //weights are uniform for task mapping
1396 
1397  //parts are uniform for task mapping
1398  //as input indices.
1399 
1400  this->imbalance_tolerance = 0;
1401  this->num_global_parts = num_target_part;
1402  this->part_no_array = (mj_part_t *)part_no_array_;
1403  this->recursion_depth = rd;
1404 
1405  this->coord_dim = coord_dim_;
1406  this->num_local_coords = num_total_coords;
1407  this->num_global_coords = num_total_coords;
1408  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
1409 
1412  this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1413 
1414  this->num_weights_per_coord = 0;
1415  bool *tmp_mj_uniform_weights = new bool[1];
1416  this->mj_uniform_weights = tmp_mj_uniform_weights ;
1417  this->mj_uniform_weights[0] = true;
1418 
1419  mj_scalar_t **tmp_mj_weights = new mj_scalar_t *[1];
1420  this->mj_weights = tmp_mj_weights; //will copy the memory to this->mj_weights
1421 
1422  bool *tmp_mj_uniform_parts = new bool[1];
1423  this->mj_uniform_parts = tmp_mj_uniform_parts;
1424  this->mj_uniform_parts[0] = true;
1425 
1426  mj_scalar_t **tmp_mj_part_sizes = new mj_scalar_t * [1];
1427  this->mj_part_sizes = tmp_mj_part_sizes;
1428  this->mj_part_sizes[0] = NULL;
1429 
1430  this->num_threads = 1;
1431  this->set_part_specifications();
1432 
1433  this->allocate_set_work_memory();
1434  //the end of the initial partition is the end of coordinates.
1435  this->part_xadj[0] = static_cast<mj_lno_t>(num_selected_coords);
1436  for(size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1437  this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1438  }
1439 
1440  mj_part_t current_num_parts = 1;
1441 
1442  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1443 
1444  mj_part_t future_num_parts = this->total_num_part;
1445 
1446  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
1447  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
1448  next_future_num_parts_in_parts->push_back(this->num_global_parts);
1449  RCP<mj_partBoxVector_t> t1;
1450  RCP<mj_partBoxVector_t> t2;
1451 
1452  for (int i = 0; i < this->recursion_depth; ++i){
1453 
1454  //partitioning array. its size is the current number of parts, and it
1455  //holds how many parts each part will be split into in this dimension's partitioning.
1456  std::vector <mj_part_t> num_partitioning_in_current_dim;
1457 
1458  //number of parts that will be obtained at the end of this partitioning.
1459  //future_num_part_in_parts has one entry per current part and
1460  //holds how many more parts each should be divided into in the further
1461  //iterations. it is used to calculate num_partitioning_in_current_dim,
1462  //the number of parts each part will be partitioned into
1463  //in the current dimension partitioning.
1464 
1465  //next_future_num_parts_in_parts will have one entry per output part,
1466  //holding how many more parts each output part
1467  //should be divided into. this array is also used to determine the weight
1468  //ratios of the parts.
1469  //swap the arrays to use them iteratively.
1470  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1471  future_num_part_in_parts = next_future_num_parts_in_parts;
1472  next_future_num_parts_in_parts = tmpPartVect;
1473 
1474  //clear next_future_num_parts_in_parts array as
1475  //getPartitionArrays expects it to be empty.
1476  //it also expects num_partitioning_in_current_dim to be empty as well.
1477  next_future_num_parts_in_parts->clear();
1478 
1479 
1480  //returns the total number of output parts for this dimension partitioning.
1481  mj_part_t output_part_count_in_dimension =
1482  this->update_part_num_arrays(
1483  num_partitioning_in_current_dim,
1484  future_num_part_in_parts,
1485  next_future_num_parts_in_parts,
1486  future_num_parts,
1487  current_num_parts,
1488  i,
1489  t1,
1490  t2);
1491 
1492  //if the number of obtained parts is equal to the current number of parts,
1493  //skip this dimension. For example, this happens when 1 appears in the
1494  //input part array, e.g., P=4,5,1,2.
1495  if(output_part_count_in_dimension == current_num_parts) {
1496  tmpPartVect= future_num_part_in_parts;
1497  future_num_part_in_parts = next_future_num_parts_in_parts;
1498  next_future_num_parts_in_parts = tmpPartVect;
1499  continue;
1500  }
1501 
1502  //get the coordinate axis along which the partitioning will be done.
1503  int coordInd = i % this->coord_dim;
1504  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1505  //convert i to string to be used for debugging purposes.
1506  std::string istring = toString<int>(i);
1507 
1508  //alloc Memory to point the indices
1509  //of the parts in the permutation array.
1510  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1511 
1512  //the index at which to write into outTotalCounts.
1513  mj_part_t output_part_index = 0;
1514  //whatever is written to outTotalCounts will be offset by previousEnd
1515  //so that the points will be shifted.
1516  mj_part_t output_coordinate_end_index = 0;
1517 
1518  mj_part_t current_work_part = 0;
1519  mj_part_t current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1520  this->max_concurrent_part_calculation);
1521 
1522  mj_part_t obtained_part_index = 0;
1523 
1524  //run for all available parts.
1525  for (; current_work_part < current_num_parts;
1526  current_work_part += current_concurrent_num_parts){
1527 
1528  current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1529  this->max_concurrent_part_calculation);
1530 
1531  mj_part_t actual_work_part_count = 0;
1532  //initialization for 1D partitioning.
1533  //get the min and max coordinates of each part
1534  //together with the part weights of each part.
1535  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1536  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1537 
1538  //if this part won't be partitioned any further,
1539  //don't do any work for this part.
1540  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1541  continue;
1542  }
1543  ++actual_work_part_count;
1544  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1545  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
1546 
1547  /*
1548  std::cout << "i:" << i << " j:" << current_work_part + kk
1549  << " coordinate_begin_index:" << coordinate_begin_index
1550  << " coordinate_end_index:" << coordinate_end_index
1551  << " total:" << coordinate_end_index - coordinate_begin_index<< std::endl;
1552  */
1553  this->mj_get_local_min_max_coord_totW(
1554  coordinate_begin_index,
1555  coordinate_end_index,
1556  this->coordinate_permutations,
1557  mj_current_dim_coords,
1558  this->process_local_min_max_coord_total_weight[kk], //min coordinate
1559  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max coordinate
1560  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] //total weight);
1561  );
1562  }
1563 
1564  //1D partitioning
1565  if (actual_work_part_count > 0){
1566  //obtain global Min max of the part.
1567  this->mj_get_global_min_max_coord_totW(
1568  current_concurrent_num_parts,
1569  this->process_local_min_max_coord_total_weight,
1570  this->global_min_max_coord_total_weight);
1571 
1572  //represents the total number of cutlines
1573  //whose coordinate should be determined.
1574  mj_part_t total_incomplete_cut_count = 0;
1575 
1576  //Compute weight ratios for parts & cuts:
1577  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
1578  //part0 cut0 part1 cut1 part2 cut2 part3
1579  mj_part_t concurrent_part_cut_shift = 0;
1580  mj_part_t concurrent_part_part_shift = 0;
1581  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1582  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1583  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1584  current_concurrent_num_parts];
1585  mj_scalar_t global_total_weight =
1586  this->global_min_max_coord_total_weight[kk +
1587  2 * current_concurrent_num_parts];
1588 
1589  mj_part_t concurrent_current_part_index = current_work_part + kk;
1590 
1591  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1592 
1593  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1594  mj_scalar_t *current_target_part_weights = this->target_part_weights +
1595  concurrent_part_part_shift;
1596  //shift the usedCutCoordinate array as noCuts.
1597  concurrent_part_cut_shift += partition_count - 1;
1598  //shift the partRatio array as noParts.
1599  concurrent_part_part_shift += partition_count;
1600 
1601  //calculate only if part is not empty,
1602  //and part will be further partitioned.
1603  if(partition_count > 1 && min_coordinate <= max_coordinate){
1604 
1605  //increase allDone by the number of cuts of the current
1606  //part's cut line number.
1607  total_incomplete_cut_count += partition_count - 1;
1608  //set the number of cut lines that should be determined
1609  //for this part.
1610  this->my_incomplete_cut_count[kk] = partition_count - 1;
1611 
1612  //get the target weights of the parts.
1613  this->mj_get_initial_cut_coords_target_weights(
1614  min_coordinate,
1615  max_coordinate,
1616  partition_count - 1,
1617  global_total_weight,
1618  usedCutCoordinate,
1619  current_target_part_weights,
1620  future_num_part_in_parts,
1621  next_future_num_parts_in_parts,
1622  concurrent_current_part_index,
1623  obtained_part_index);
1624 
1625  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
1626  mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
1627 
1628  //get the initial estimated part assignments of the coordinates.
1629  this->set_initial_coordinate_parts(
1630  max_coordinate,
1631  min_coordinate,
1632  concurrent_current_part_index,
1633  coordinate_begin_index, coordinate_end_index,
1634  this->coordinate_permutations,
1635  mj_current_dim_coords,
1636  this->assigned_part_ids,
1637  partition_count);
1638  }
1639  else {
1640  // e.g., if we have fewer coordinates than parts, we don't need to do the next dim.
1641  this->my_incomplete_cut_count[kk] = 0;
1642  }
1643  obtained_part_index += partition_count;
1644  }
1645 
1646  //the used imbalance is always 0, as it is difficult to estimate a range.
1647  mj_scalar_t used_imbalance = 0;
1648 
1649  // Determine cut lines for k parts here.
1650  this->mj_1D_part(
1651  mj_current_dim_coords,
1652  used_imbalance,
1653  current_work_part,
1654  current_concurrent_num_parts,
1655  current_cut_coordinates,
1656  total_incomplete_cut_count,
1657  num_partitioning_in_current_dim);
1658  }
1659 
1660  //create part chunks
1661  {
1662 
1663  mj_part_t output_array_shift = 0;
1664  mj_part_t cut_shift = 0;
1665  size_t tlr_shift = 0;
1666  size_t partweight_array_shift = 0;
1667 
1668  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1669  mj_part_t current_concurrent_work_part = current_work_part + kk;
1670  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1671 
1672  //if the part is empty, skip the part.
1673  if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1674  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1675 
1676  for(mj_part_t jj = 0; jj < num_parts; ++jj){
1677  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1678  }
1679  cut_shift += num_parts - 1;
1680  tlr_shift += (4 *(num_parts - 1) + 1);
1681  output_array_shift += num_parts;
1682  partweight_array_shift += (2 * (num_parts - 1) + 1);
1683  continue;
1684  }
1685 
1686  mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1687  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1688  -1];
1689  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1690  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1691  cut_shift;
1692 
1693  for(int ii = 0; ii < this->num_threads; ++ii){
1694  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1695  }
1696 
1697  if(num_parts > 1){
1698  // Rewrite the indices based on the computed cuts.
1699  this->create_consistent_chunks(
1700  num_parts,
1701  mj_current_dim_coords,
1702  current_concurrent_cut_coordinate,
1703  coordinate_begin,
1704  coordinate_end,
1705  used_local_cut_line_weight_to_left,
1706  this->new_part_xadj + output_part_index + output_array_shift,
1707  coordInd );
1708  }
1709  else {
1710  //if this part is partitioned into 1 then just copy
1711  //the old values.
1712  mj_lno_t part_size = coordinate_end - coordinate_begin;
1713  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1714  memcpy(this->new_coordinate_permutations + coordinate_begin,
1715  this->coordinate_permutations + coordinate_begin,
1716  part_size * sizeof(mj_lno_t));
1717  }
1718  cut_shift += num_parts - 1;
1719  tlr_shift += (4 *(num_parts - 1) + 1);
1720  output_array_shift += num_parts;
1721  partweight_array_shift += (2 * (num_parts - 1) + 1);
1722  }
1723 
1724  //shift cut coordinates so that all cut coordinates are stored.
1725  //current_cut_coordinates += cutShift;
1726 
1727  //create_consistent_chunks partitioned the parts and
1728  //wrote the indices as if there were a single part.
1729  //now we need to shift the beginning indices.
1730  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1731  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1732  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1733  //shift it by previousCount
1734  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1735  }
1736  //increase the previous count by current end.
1737  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1738  //increase the current out.
1739  output_part_index += num_parts ;
1740  }
1741  }
1742  }
1743  // end of this partitioning dimension
1744 
1745  //set the current num parts for next dim partitioning
1746  current_num_parts = output_part_count_in_dimension;
1747 
1748  //swap the coordinate permutations for the next dimension.
1749  mj_lno_t * tmp = this->coordinate_permutations;
1750  this->coordinate_permutations = this->new_coordinate_permutations;
1751  this->new_coordinate_permutations = tmp;
1752 
1753  freeArray<mj_lno_t>(this->part_xadj);
1754  this->part_xadj = this->new_part_xadj;
1755  }
1756 
1757  for(mj_lno_t i = 0; i < num_total_coords; ++i){
1758  inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1759  }
1760 
1761  // Return output_xadj in CSR format
1762  output_xadj[0] = 0;
1763  for(size_t i = 0; i < this->num_global_parts ; ++i){
1764  output_xadj[i+1] = this->part_xadj[i];
1765  }
1766 
1767  delete future_num_part_in_parts;
1768  delete next_future_num_parts_in_parts;
1769 
1770  //free the extra memory that we allocated.
1771  freeArray<mj_part_t>(this->assigned_part_ids);
1772  freeArray<mj_gno_t>(this->initial_mj_gnos);
1773  freeArray<mj_gno_t>(this->current_mj_gnos);
1774  freeArray<bool>(tmp_mj_uniform_weights);
1775  freeArray<bool>(tmp_mj_uniform_parts);
1776  freeArray<mj_scalar_t *>(tmp_mj_weights);
1777  freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1778 
1779  this->free_work_memory();
1780 
1781 #ifdef HAVE_ZOLTAN2_OMP
1782  omp_set_num_threads(actual_num_threads);
1783 #endif
1784 }
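
//Hypothetical usage of sequential_task_partitioning (a sketch, not from the
//sources): split 1000 points in 3D into 8 parts in serial; env and coords
//are assumed to exist.
//
// AlgMJ<double, int, long long, int> mj;
// int perm[1000]; // filled with 0..999 by the caller
// int xadj[8 + 1]; // CSR-style part offsets on output
// mj.sequential_task_partitioning(env, 1000, 1000, 8, 3, coords,
//  perm, xadj, /*recursion_depth=*/3, /*part_no_array=*/NULL);
// // part p owns perm[xadj[p]] .. perm[xadj[p+1]-1]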
1785 
1789 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1790  typename mj_part_t>
1791 AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::AlgMJ():
1792  mj_env(), mj_problemComm(), imbalance_tolerance(0),
1793  part_no_array(NULL), recursion_depth(0), coord_dim(0),
1794  num_weights_per_coord(0), initial_num_loc_coords(0),
1795  initial_num_glob_coords(0),
1796  num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
1797  mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
1798  mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
1799  initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
1800  coordinate_permutations(NULL), new_coordinate_permutations(NULL),
1801  assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
1802  distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
1803  mj_run_as_rcb(0), mj_user_recursion_depth(0), mj_keep_part_boxes(0),
1804  check_migrate_avoid_migration_option(0), minimum_migration_imbalance(0.30),
1805  num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1806  max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
1807  last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
1808  all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
1809  thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
1810  target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
1811  cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
1812  process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
1813  is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
1814  thread_part_weights(NULL), thread_part_weight_work(NULL),
1815  thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
1816  thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
1817  global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
1818  global_total_part_weight_left_right_closests(NULL),
1819  kept_boxes(),global_box(),
1820  myRank(0), myActualRank(0)
1821 {
1822  this->fEpsilon = std::numeric_limits<float>::epsilon();
1823  this->sEpsilon = std::numeric_limits<mj_scalar_t>::epsilon() * 100;
1824 
1825  this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
1826  this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
1827 
1828 }
1829 
1830 
1834 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1835  typename mj_part_t>
1836 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
1837 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::get_global_box() const
1838 {
1839  return this->global_box;
1840 }
1841 
1845 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1846  typename mj_part_t>
1847 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::set_to_keep_part_boxes(){
1848  this->mj_keep_part_boxes = 1;
1849 }
1850 
1851 
1852 /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
1853  * the input. part_no_array takes
1854  * precedence if both are provided.
1855  * Depending on these parameters, total cut/part number,
1856  * maximum part/cut number along a dimension, estimated number of reduceAlls,
1857  * and the number of parts before the last dimension is calculated.
1858  * */
1859 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1860  typename mj_part_t>
1861 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::set_part_specifications(){
1862 
1863  this->total_num_cut = 0; //total number of cuts
1864  this->total_num_part = 1; //total number of parts
1865  this->max_num_part_along_dim = 0; //maximum part count along a dimension.
1866  this->total_dim_num_reduce_all = 0; //estimated number of reduceAlls that will be done.
1867  this->last_dim_num_part = 1; //max no of parts that might occur
1868  //during the partition before the
1869  //last partitioning dimension.
1870  this->max_num_cut_along_dim = 0;
1871  this->max_num_total_part_along_dim = 0;
1872 
1873  if (this->part_no_array){
1874  //if user provided part array, traverse the array and set variables.
1875  for (int i = 0; i < this->recursion_depth; ++i){
1876  this->total_dim_num_reduce_all += this->total_num_part;
1877  this->total_num_part *= this->part_no_array[i];
1878  if(this->part_no_array[i] > this->max_num_part_along_dim) {
1879  this->max_num_part_along_dim = this->part_no_array[i];
1880  }
1881  }
1882  this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
1883  this->num_global_parts = this->total_num_part;
1884  } else {
1885  mj_part_t future_num_parts = this->num_global_parts;
1886 
1887  //we need to calculate the part numbers now, to determine the maximum along the dimensions.
1888  for (int i = 0; i < this->recursion_depth; ++i){
1889 
1890  mj_part_t maxNoPartAlongI = this->get_part_count(
1891  future_num_parts, 1.0f / (this->recursion_depth - i));
1892 
1893  if (maxNoPartAlongI > this->max_num_part_along_dim){
1894  this->max_num_part_along_dim = maxNoPartAlongI;
1895  }
1896 
1897  mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
1898  if (future_num_parts % maxNoPartAlongI){
1899  ++nfutureNumParts;
1900  }
1901  future_num_parts = nfutureNumParts;
1902  }
1903  this->total_num_part = this->num_global_parts;
1904  //estimate the reduceAll count here.
1905  //we find an upper bound instead.
1906  mj_part_t p = 1;
1907  for (int i = 0; i < this->recursion_depth; ++i){
1908  this->total_dim_num_reduce_all += p;
1909  p *= this->max_num_part_along_dim;
1910  }
1911 
1912  this->last_dim_num_part = p / this->max_num_part_along_dim;
1913  }
1914 
1915  this->total_num_cut = this->total_num_part - 1;
1916  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
1917  this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
1918  //maxPartNo is P, maxCutNo = P-1, maxTotalPartCount = 2P-1
1919 
1920  //reduce the concurrent part count if it is given bigger than the maximum possible part count.
1921  if(this->max_concurrent_part_calculation > this->last_dim_num_part){
1922  if(this->mj_problemComm->getRank() == 0){
1923  std::cerr << "Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
1924  ") has been set bigger than maximum amount that can be used." <<
1925  " Setting to:" << this->last_dim_num_part << "." << std::endl;
1926  }
1927  this->max_concurrent_part_calculation = this->last_dim_num_part;
1928  }
1929 
1930 }
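
//Worked example (illustrative): with part_no_array = {4, 5, 2} and
//recursion_depth = 3, the first branch above gives total_num_part = 40,
//total_dim_num_reduce_all = 1 + 4 + 20 = 25, max_num_part_along_dim = 5,
//last_dim_num_part = 40 / 2 = 20, and therefore total_num_cut = 39,
//max_num_cut_along_dim = 4, and max_num_total_part_along_dim = 9.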
1931 /* \brief Tries to determine the part number for current dimension,
1932  * by trying to make the partitioning as square as possible.
1933  * \param num_total_future how many more partitionings are required.
1934  * \param root how many more recursion depth is left.
1935  */
1936 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1937  typename mj_part_t>
1938 inline mj_part_t AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::get_part_count(
1939  mj_part_t num_total_future,
1940  double root)
1941 {
1942  double fp = pow(num_total_future, root);
1943  mj_part_t ip = mj_part_t (fp);
1944  if (fp - ip < this->fEpsilon * 100){
1945  return ip;
1946  }
1947  else {
1948  return ip + 1;
1949  }
1950 }
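
//e.g. (illustrative): get_part_count(8, 1.0/3) computes fp = 8^(1/3) = 2.0
//and returns 2, while get_part_count(10, 1.0/3) computes fp ~= 2.15 and
//rounds up to 3, so a dimension never receives fewer parts than needed to
//reach the future part count.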
1951 
1952 /* \brief Function returns how many parts will be obtained after this dimension's partitioning.
1953  * It sets in the num_partitioning_in_current_dim vector how many parts each current part will be partitioned into in this dimension,
1954  * sets in the next_future_num_parts_in_parts vector how many total future parts each obtained part will be partitioned into,
1955  * and, if part boxes are kept, initializes the output_part_boxes from their ancestors.
1956  *
1957  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
1958  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
1959  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
1960  * \param future_num_parts: output, max number of future parts that will be obtained from a single
1961  * \param current_num_parts: input, how many parts are there currently.
1962  * \param current_iteration: input, current dimension iteration number.
1963  * \param input_part_boxes: input, if boxes are kept, current boxes.
1964  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
1965  */
1966 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1967  typename mj_part_t>
1968 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
1969  std::vector <mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
1970  std::vector<mj_part_t> *future_num_part_in_parts,
1971  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
1972  mj_part_t &future_num_parts,
1973  mj_part_t current_num_parts,
1974  int current_iteration,
1975  RCP<mj_partBoxVector_t> input_part_boxes,
1976  RCP<mj_partBoxVector_t> output_part_boxes
1977 ){
1978  //how many parts that will be obtained after this dimension.
1979  mj_part_t output_num_parts = 0;
1980  if(this->part_no_array){
1981  //when the partNo array is provided as input,
1982  //each current part will be partitioned into the same number of parts.
1983  //we don't need to use the future_num_part_in_parts vector in this case.
1984 
1985  mj_part_t p = this->part_no_array[current_iteration];
1986  if (p < 1){
1987  std::cerr << "ERROR: part_no_array[" << current_iteration << "] is given as:" << p << "; each entry must be at least 1." << std::endl;
1988  exit(1);
1989  }
1990  if (p == 1){
1991  return current_num_parts;
1992  }
1993 
1994  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
1995  num_partitioning_in_current_dim.push_back(p);
1996 
1997  }
1998  //cout << "me:" << this->myRank << " current_iteration" << current_iteration <<
1999  //" current_num_parts:" << current_num_parts << std::endl;
2000  //cout << "num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0] << std::endl;
2001  //set the new value of future_num_parts.
2002 
2003  /*
2004  cout << "\tfuture_num_parts:" << future_num_parts
2005  << " num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0]
2006  << future_num_parts/ num_partitioning_in_current_dim[0] << std::endl;
2007  */
2008 
2009  future_num_parts /= num_partitioning_in_current_dim[0];
2010  output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2011 
2012  if (this->mj_keep_part_boxes){
2013  for (mj_part_t k = 0; k < current_num_parts; ++k){
2014  //initialize the output boxes as their ancestors.
2015  for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2016  output_part_boxes->push_back((*input_part_boxes)[k]);
2017  }
2018  }
2019  }
2020 
2021  //set how many more parts each obtained part will be divided into.
2022  //this is obvious when the partNo array is provided as input;
2023  //however, fill this vector so that weights will be calculated according to it.
2024  for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2025  next_future_num_parts_in_parts->push_back(future_num_parts);
2026  }
2027  }
2028  else {
2029  //if the partNo array is not provided as input,
2030  //future_num_part_in_parts holds into how many parts each current part should eventually be divided.
2031  //initially it holds a single number equal to the total number of global parts.
2032 
2033  //calculate the future_num_parts from beginning,
2034  //since each part might be divided into different number of parts.
2035  future_num_parts = 1;
2036 
2037  //cout << "i:" << i << std::endl;
2038 
2039  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2040  //get into how many future parts this part should eventually be divided.
2041  mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2042 
2043  //get the ideal number of parts, close to the
2044  //(recursion_depth - current_iteration)-th root of future_num_parts_of_part_ii.
2045  mj_part_t num_partitions_in_current_dim =
2046  this->get_part_count(
2047  future_num_parts_of_part_ii,
2048  1.0 / (this->recursion_depth - current_iteration)
2049  );
2050 
2051  if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2052  std::cerr << "ERROR: maxPartNo calculation is wrong." << std::endl;
2053  exit(1);
2054  }
2055  //add this number to num_partitioning_in_current_dim vector.
2056  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2057 
2058 
2059  //increase the output number of parts.
2060  output_num_parts += num_partitions_in_current_dim;
2061 
2062  //ideal number of future partitions for each part.
2063  mj_part_t ideal_num_future_parts_in_part = future_num_parts_of_part_ii / num_partitions_in_current_dim;
2064  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2065  mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2066 
2067  //if there is a remainder, give one extra future part to each of the first parts.
2068  if (iii < future_num_parts_of_part_ii % num_partitions_in_current_dim){
2069  //if not uniform, add 1 for the extra parts.
2070  ++num_future_parts_for_part_iii;
2071  }
2072  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii);
2073 
2074  //if part boxes are stored, initialize the box of the parts as the ancestor.
2075  if (this->mj_keep_part_boxes){
2076  output_part_boxes->push_back((*input_part_boxes)[ii]);
2077  }
2078 
2079  //set future_num_parts to the maximum over all obtained parts.
2080  if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2081  }
2082  }
2083  }
2084  return output_num_parts;
2085 }
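// Worked example (illustrative): let num_global_parts = 6, recursion_depth = 2,
// and no part_no_array. On the first call (current_iteration = 0) with
// current_num_parts = 1 and (*future_num_part_in_parts) = {6},
// get_part_count(6, 1.0/2) returns 3, so num_partitioning_in_current_dim = {3},
// output_num_parts = 3, next_future_num_parts_in_parts = {2, 2, 2}, and
// future_num_parts = 2. On the second call each of the three parts is split
// into 2, yielding the final 6 parts.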
2086 
2087 
2088 /* \brief Allocates and initializes the work memory that will be used by MJ.
2089  *
2090  * */
2091 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2092  typename mj_part_t>
2093 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
2094 
2095  //points to process that initially owns the coordinate.
2096  this->owner_of_coordinate = NULL;
2097 
2098  //Throughout the partitioning execution,
2099  //instead of the moving the coordinates, hold a permutation array for parts.
2100  //coordinate_permutations holds the current permutation.
2101  this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2102  //initial configuration, set each pointer-i to i.
2103 #ifdef HAVE_ZOLTAN2_OMP
2104 #pragma omp parallel for
2105 #endif
2106  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2107  this->coordinate_permutations[i] = i;
2108  }
2109 
2110  //new_coordinate_permutations holds the permutation that is computed for the next iteration.
2111  this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2112 
2113  this->assigned_part_ids = NULL;
2114  if(this->num_local_coords > 0){
2115  this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2116  }
2117 
2118  //single partition starts at index-0, and ends at numLocalCoords
2119  //inTotalCounts array holds the end points in coordinate_permutations array
2120  //for each partition. Initially sized 1, and single element is set to numLocalCoords.
2121  this->part_xadj = allocMemory<mj_lno_t>(1);
2122  this->part_xadj[0] = static_cast<mj_lno_t>(this->num_local_coords);//the end of the initial partition is the end of coordinates.
2123  //the ends points of the output, this is allocated later.
2124  this->new_part_xadj = NULL;
2125 
2126  // only store this much if cuts are needed to be stored.
2127  //this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->total_num_cut);
2128 
2129 
2130  this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2131 
2132  this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2133 
2134  this->process_cut_line_weight_to_put_left = NULL; //the fraction of the weight this MPI process should put on the left side of each cut line
2135  this->thread_cut_line_weight_to_put_left = NULL; //the fraction of the weight each thread of this MPI process should put on the left side of each cut line
2136  //distribute_points_on_cut_lines = false;
2137  if(this->distribute_points_on_cut_lines){
2138  this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2139  this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2140  for(int i = 0; i < this->num_threads; ++i){
2141  this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2142  }
2143  this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2144  this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2145  }
2146 
2147 
2148  // work array to manipulate coordinate of cutlines in different iterations.
2149  //necessary because the previous cut line information is used for determining
2150  //the next cut line information; therefore, the cut work array cannot be updated
2151  //until all cut lines are determined.
2152  this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2153  this->max_concurrent_part_calculation);
2154 
2155 
2156  //cumulative part weight array.
2157  this->target_part_weights = allocMemory<mj_scalar_t>(
2158  this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2159  // the cumulative weight from left to right.
2160 
2161  this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation); //upper bound coordinate of a cut line
2162  this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound coordinate of a cut line
2163  this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound weight of a cut line
2164  this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //upper bound weight of a cut line
2165 
2166  this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation); //combined array to exchange the min and max coordinate, and total weight of part.
2167  this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);//global combined array with the results for min, max and total weight.
2168 
2169  //is_cut_line_determined is used to determine if a cutline is determined already.
2170  //If a cut line is already determined, the next iterations will skip this cut line.
2171  this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2172  //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
2173  //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
2174  this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2175  //local part weights of each thread.
2176  this->thread_part_weights = allocMemory<double *>(this->num_threads);
2177  //the work manipulation array for part weights.
2178  this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2179 
2180  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
2181  this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2182  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
2183  this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2184 
2185  //to store how many points in each part a thread has.
2186  this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2187 
2188  for(int i = 0; i < this->num_threads; ++i){
2189  //partWeights[i] = allocMemory<mj_scalar_t>(maxTotalPartCount);
2190  this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2191  this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2192  this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2193  this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2194  }
2195  //for faster communication, concatenation of
2196  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines,
2197  //leftClosest distances sized P-1, since there are P-1 cut lines,
2198  //rightClosest distances sized P-1, since there are P-1 cut lines.
2199  this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2200  this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2201 
2202 
2203  mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2204  for (int i=0; i < this->coord_dim; i++){
2205  coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2206 #ifdef HAVE_ZOLTAN2_OMP
2207 #pragma omp parallel for
2208 #endif
2209  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2210  coord[i][j] = this->mj_coordinates[i][j];
2211  }
2212  this->mj_coordinates = coord;
2213 
2214 
2215  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2216  mj_scalar_t **weights = allocMemory<mj_scalar_t *>(criteria_dim);
2217 
2218  for (int i=0; i < criteria_dim; i++){
2219  weights[i] = NULL;
2220  }
2221  for (int i=0; i < this->num_weights_per_coord; i++){
2222  weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2223 #ifdef HAVE_ZOLTAN2_OMP
2224 #pragma omp parallel for
2225 #endif
2226  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2227  weights[i][j] = this->mj_weights[i][j];
2228 
2229  }
2230  this->mj_weights = weights;
2231  this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2232 #ifdef HAVE_ZOLTAN2_OMP
2233 #pragma omp parallel for
2234 #endif
2235  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2236  this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2237 
2238  this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2239 
2240 #ifdef HAVE_ZOLTAN2_OMP
2241 #pragma omp parallel for
2242 #endif
2243  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2244  this->owner_of_coordinate[j] = this->myActualRank;
2245 }
2246 
2247 /* \brief compute the global bounding box
2248  */
2249 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2250  typename mj_part_t>
2251 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::compute_global_box()
2252 {
2253  //local min coords
2254  mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2255  //global min coords
2256  mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2257  //local max coords
2258  mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2259  //global max coords
2260  mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2261 
2262  for (int i = 0; i < this->coord_dim; ++i){
2263  mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2264  mj_scalar_t localMax = -localMin;
2265  if (localMax > 0) localMax = 0;
2266 
2267 
2268  for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2269  if (this->mj_coordinates[i][j] < localMin){
2270  localMin = this->mj_coordinates[i][j];
2271  }
2272  if (this->mj_coordinates[i][j] > localMax){
2273  localMax = this->mj_coordinates[i][j];
2274  }
2275  }
2276  //cout << " localMin:" << localMin << endl;
2277  //cout << " localMax:" << localMax << endl;
2278  mins[i] = localMin;
2279  maxs[i] = localMax;
2280 
2281  }
2282  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2283  this->coord_dim, mins, gmins
2284  );
2285 
2286 
2287  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2288  this->coord_dim, maxs, gmaxs
2289  );
2290 
2291 
2292 
2293  //create single box with all areas.
2294  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2295  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2296  freeArray<mj_scalar_t>(mins);
2297  freeArray<mj_scalar_t>(gmins);
2298  freeArray<mj_scalar_t>(maxs);
2299  freeArray<mj_scalar_t>(gmaxs);
2300 }
2301 
2302 /* \brief for part communication we keep track of the box boundaries.
2303  * This is performed either when it is specifically requested, or when geometric mapping is performed afterwards.
2304  * This function initializes a single box with all global min and max coordinates.
2305  * \param initial_partitioning_boxes the input and output vector for boxes.
2306  */
2307 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2308  typename mj_part_t>
2309 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2310  RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2311 )
2312 {
2313  mj_partBox_t tmp_box(*global_box);
2314  initial_partitioning_boxes->push_back(tmp_box);
2315 }
2316 
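/* \brief Computes the local minimum and maximum coordinate, and the local total
 * weight, for the given range of coordinates along the current dimension.
 * If the range is empty, min and max are set reversed (min > max) so that the
 * subsequent global reduction still produces correct results.
 * \param coordinate_begin_index the start index of the part in the permutation array.
 * \param coordinate_end_index the end index of the part in the permutation array.
 * \param mj_current_coordinate_permutations the permutation array pointing to actual coordinate indices.
 * \param mj_current_dim_coords the coordinate values along the current dimension.
 * \param min_coordinate output: the local minimum coordinate in the given range.
 * \param max_coordinate output: the local maximum coordinate in the given range.
 * \param total_weight output: the local total weight in the given range.
 */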
2327 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2328  typename mj_part_t>
2329 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2330  mj_lno_t coordinate_begin_index,
2331  mj_lno_t coordinate_end_index,
2332  mj_lno_t *mj_current_coordinate_permutations,
2333  mj_scalar_t *mj_current_dim_coords,
2334  mj_scalar_t &min_coordinate,
2335  mj_scalar_t &max_coordinate,
2336  mj_scalar_t &total_weight){
2337 
2338  //if the part is empty,
2339  //set the min and max coordinates reversed (min > max).
2340  if(coordinate_begin_index >= coordinate_end_index)
2341  {
2342  min_coordinate = this->maxScalar_t;
2343  max_coordinate = this->minScalar_t;
2344  total_weight = 0;
2345  }
2346  else {
2347  mj_scalar_t my_total_weight = 0;
2348 #ifdef HAVE_ZOLTAN2_OMP
2349 #pragma omp parallel
2350 #endif
2351  {
2352  //if uniform weights are used, then weight is equal to count.
2353  if (this->mj_uniform_weights[0]) {
2354 #ifdef HAVE_ZOLTAN2_OMP
2355 #pragma omp single
2356 #endif
2357  {
2358  my_total_weight = coordinate_end_index - coordinate_begin_index;
2359  }
2360 
2361  }
2362  else {
2363  //if not uniform, then weights are reduced across threads.
2364 #ifdef HAVE_ZOLTAN2_OMP
2365 #pragma omp for reduction(+:my_total_weight)
2366 #endif
2367  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2368  mj_lno_t i = mj_current_coordinate_permutations[ii];
2369  my_total_weight += this->mj_weights[0][i];
2370  }
2371  }
2372 
2373  int my_thread_id = 0;
2374 #ifdef HAVE_ZOLTAN2_OMP
2375  my_thread_id = omp_get_thread_num();
2376 #endif
2377  mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2378  my_thread_min_coord=my_thread_max_coord
2379  =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2380 
2381 
2382 #ifdef HAVE_ZOLTAN2_OMP
2383 #pragma omp for
2384 #endif
2385  for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2386  mj_lno_t i = mj_current_coordinate_permutations[j];
2387  if(mj_current_dim_coords[i] > my_thread_max_coord)
2388  my_thread_max_coord = mj_current_dim_coords[i];
2389  if(mj_current_dim_coords[i] < my_thread_min_coord)
2390  my_thread_min_coord = mj_current_dim_coords[i];
2391  }
2392  this->max_min_coords[my_thread_id] = my_thread_min_coord;
2393  this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2394 
2395 #ifdef HAVE_ZOLTAN2_OMP
2396 //we need a barrier here, because max_min_array might not be filled by some of the threads.
2397 #pragma omp barrier
2398 #pragma omp single nowait
2399 #endif
2400  {
2401  min_coordinate = this->max_min_coords[0];
2402  for(int i = 1; i < this->num_threads; ++i){
2403  if(this->max_min_coords[i] < min_coordinate)
2404  min_coordinate = this->max_min_coords[i];
2405  }
2406  }
2407 
2408 #ifdef HAVE_ZOLTAN2_OMP
2409 #pragma omp single nowait
2410 #endif
2411  {
2412  max_coordinate = this->max_min_coords[this->num_threads];
2413  for(int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2414  if(this->max_min_coords[i] > max_coordinate)
2415  max_coordinate = this->max_min_coords[i];
2416  }
2417  }
2418  }
2419  total_weight = my_total_weight;
2420  }
2421 }
2422 
2423 
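/* \brief Computes the global minimum and maximum coordinates and the global total
 * weight for the concurrently processed parts, by reducing the local values over
 * all processes.
 * \param current_concurrent_num_parts the number of parts processed concurrently.
 * \param local_min_max_total the local array: first the mins, then the maxes, then the total weights.
 * \param global_min_max_total output: the globally reduced array, in the same layout.
 */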
2431 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2432  typename mj_part_t>
2433 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2434  mj_part_t current_concurrent_num_parts,
2435  mj_scalar_t *local_min_max_total,
2436  mj_scalar_t *global_min_max_total){
2437 
2438  //reduce min for the first current_concurrent_num_parts elements,
2439  //reduce max for the next current_concurrent_num_parts elements,
2440  //reduce sum for the last current_concurrent_num_parts elements.
2441  if(this->comm->getSize() > 1){
2442  Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp<int, mj_scalar_t>
2443  reductionOp(
2444  current_concurrent_num_parts,
2445  current_concurrent_num_parts,
2446  current_concurrent_num_parts);
2447  try{
2448  reduceAll<int, mj_scalar_t>(
2449  *(this->comm),
2450  reductionOp,
2451  3 * current_concurrent_num_parts,
2452  local_min_max_total,
2453  global_min_max_total);
2454  }
2455  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2456  }
2457  else {
2458  mj_part_t s = 3 * current_concurrent_num_parts;
2459  for (mj_part_t i = 0; i < s; ++i){
2460  global_min_max_total[i] = local_min_max_total[i];
2461  }
2462  }
2463 }
2464 
2465 
2466 
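/* \brief Initializes the cut coordinates and the cumulative target part weights
 * for a single part. Cuts are spaced in proportion to the number of future parts
 * each resulting slice will eventually hold.
 * \param min_coord the minimum coordinate in the part.
 * \param max_coord the maximum coordinate in the part.
 * \param num_cuts the number of cut lines (p - 1).
 * \param global_weight the global total weight in the part.
 * \param initial_cut_coords output: the coordinate of each cut line, sized p - 1.
 * \param current_target_part_weights output: the cumulative target weight at the left side of each cut.
 * \param future_num_part_in_parts how many future parts each current part will be divided into.
 * \param next_future_num_parts_in_parts how many future parts each obtained part will be divided into.
 * \param concurrent_current_part the index of the part being processed.
 * \param obtained_part_index the shift of this part's entries in the output vectors.
 */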
2485 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2486  typename mj_part_t>
2487 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
2488  mj_scalar_t min_coord,
2489  mj_scalar_t max_coord,
2490  mj_part_t num_cuts/*p-1*/ ,
2491  mj_scalar_t global_weight,
2492  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
2493  mj_scalar_t *current_target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
2494 
2495  std::vector <mj_part_t> *future_num_part_in_parts, //the vector holding how many future parts each current part will be divided into
2496  std::vector <mj_part_t> *next_future_num_parts_in_parts,
2497  mj_part_t concurrent_current_part,
2498  mj_part_t obtained_part_index
2499 ){
2500 
2501  mj_scalar_t coord_range = max_coord - min_coord;
2502  if(this->mj_uniform_parts[0]){
2503  {
2504  mj_part_t cumulative = 0;
2505  //how many total future parts the part will be partitioned into.
2506  mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2507 
2508 
2509  //how much each part should weigh in ideal case.
2510  mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2511  /*
2512  cout << "total_future_part_count_in_part:" << total_future_part_count_in_part << endl;
2513  cout << "global_weight:" << global_weight << endl;
2514  cout << "unit_part_weight" << unit_part_weight <<endl;
2515  */
2516  for(mj_part_t i = 0; i < num_cuts; ++i){
2517  cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2518 
2519  /*
2520  cout << "obtained_part_index:" << obtained_part_index <<
2521  " (*next_future_num_parts_in_parts)[i + obtained_part_index]:" << (*next_future_num_parts_in_parts)[i + obtained_part_index] <<
2522  " cumulative:" << cumulative << endl;
2523  */
2524  //set target part weight.
2525  current_target_part_weights[i] = cumulative * unit_part_weight;
2526  //cout <<"i:" << i << " current_target_part_weights:" << current_target_part_weights[i] << endl;
2527  //set initial cut coordinate.
2528  initial_cut_coords[i] = min_coord + (coord_range *
2529  cumulative) / total_future_part_count_in_part;
2530  }
2531  current_target_part_weights[num_cuts] = 1;
2532  }
2533 
2534  //round the target part weights.
2535  if (this->mj_uniform_weights[0]){
2536  for(mj_part_t i = 0; i < num_cuts + 1; ++i){
2537  current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2538  }
2539  }
2540  }
2541  else {
2542  std::cerr << "MJ does not support non-uniform part weights" << std::endl;
2543  exit(1);
2544  }
2545 }
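// Worked example (illustrative): for a part spanning [0, 12] with num_cuts = 2
// (3 future parts, one each in next_future_num_parts_in_parts) and
// global_weight = 300, unit_part_weight = 100. The loop produces cumulative
// values 1 and 2, so initial_cut_coords = {4, 8} and
// current_target_part_weights = {100, 200}.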
2546 
2547 
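/* \brief Assigns an initial part id to each coordinate in the given range,
 * assuming the points are uniformly distributed along the current dimension.
 * The guesses (stored as 2 * part) seed the binary search over the cut lines
 * in mj_1D_part_get_thread_part_weights.
 */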
2560 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2561  typename mj_part_t>
2562 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
2563  mj_scalar_t &max_coordinate,
2564  mj_scalar_t &min_coordinate,
2565  mj_part_t &concurrent_current_part_index,
2566  mj_lno_t coordinate_begin_index,
2567  mj_lno_t coordinate_end_index,
2568  mj_lno_t *mj_current_coordinate_permutations,
2569  mj_scalar_t *mj_current_dim_coords,
2570  mj_part_t *mj_part_ids,
2571  mj_part_t &partition_count
2572 ){
2573  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2574 
2575  //if there is a single point, or if all points lie on a line,
2576  //set the initial part to 0 for all.
2577  if(ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2578 #ifdef HAVE_ZOLTAN2_OMP
2579 #pragma omp parallel for
2580 #endif
2581  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2582  mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2583  }
2584  }
2585  else{
2586 
2587  //otherwise estimate an initial part for each coordinate.
2588  //assuming uniform distribution of points.
2589  mj_scalar_t slice = coordinate_range / partition_count;
2590 
2591 #ifdef HAVE_ZOLTAN2_OMP
2592 #pragma omp parallel for
2593 #endif
2594  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2595 
2596  mj_lno_t iii = mj_current_coordinate_permutations[ii];
2597  mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2598  mj_part_ids[iii] = 2 * pp;
2599  }
2600  }
2601 }
2602 
2603 
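/* \brief Performs the 1D partitioning along the current dimension for the
 * concurrently processed parts, iterating until every cut line satisfies the
 * imbalance tolerance.
 * \param mj_current_dim_coords the coordinate values along the current dimension.
 * \param used_imbalance_tolerance the allowed imbalance for this dimension.
 * \param current_work_part the index of the first part processed in this round.
 * \param current_concurrent_num_parts the number of parts processed concurrently.
 * \param current_cut_coordinates the array receiving the resulting cut coordinates.
 * \param total_incomplete_cut_count the number of cut lines not determined yet.
 * \param num_partitioning_in_current_dim how many parts each current part is split into.
 */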
2614 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2615  typename mj_part_t>
2616 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
2617  mj_scalar_t *mj_current_dim_coords,
2618  mj_scalar_t used_imbalance_tolerance,
2619  mj_part_t current_work_part,
2620  mj_part_t current_concurrent_num_parts,
2621  mj_scalar_t *current_cut_coordinates,
2622  mj_part_t total_incomplete_cut_count,
2623  std::vector <mj_part_t> &num_partitioning_in_current_dim
2624 ){
2625 
2626 
2627  mj_part_t rectilinear_cut_count = 0;
2628  mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2629 
2631  *reductionOp = NULL;
2632  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
2633  <mj_part_t, mj_scalar_t>(
2634  &num_partitioning_in_current_dim ,
2635  current_work_part ,
2636  current_concurrent_num_parts);
2637 
2638  size_t total_reduction_size = 0;
2639 #ifdef HAVE_ZOLTAN2_OMP
2640 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count)
2641 #endif
2642  {
2643  int me = 0;
2644 #ifdef HAVE_ZOLTAN2_OMP
2645  me = omp_get_thread_num();
2646 #endif
2647  double *my_thread_part_weights = this->thread_part_weights[me];
2648  mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2649  mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2650 
2651 #ifdef HAVE_ZOLTAN2_OMP
2652 #pragma omp single
2653 #endif
2654  {
2655  //initialize the lower and upper bounds of the cuts.
2656  mj_part_t next = 0;
2657  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2658 
2659  mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2660  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2661  total_reduction_size += (4 * num_cut_in_dim + 1);
2662 
2663  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2664  this->is_cut_line_determined[next] = false;
2665  this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i]; //min coordinate
2666  this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts]; //max coordinate
2667 
2668  this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts]; //total weight
2669  this->cut_lower_bound_weights[next] = 0;
2670 
2671  if(this->distribute_points_on_cut_lines){
2672  this->process_cut_line_weight_to_put_left[next] = 0;
2673  }
2674  ++next;
2675  }
2676  }
2677  }
2678 
2679  //no need to have a barrier here;
2680  //pragma omp single has an implicit barrier.
2681 
2682  int iteration = 0;
2683  while (total_incomplete_cut_count != 0){
2684  iteration += 1;
2685  //cout << "\niteration:" << iteration << " ";
2686  mj_part_t concurrent_cut_shifts = 0;
2687  size_t total_part_shift = 0;
2688 
2689  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2690  mj_part_t num_parts = -1;
2691  num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2692 
2693  mj_part_t num_cuts = num_parts - 1;
2694  size_t total_part_count = num_parts + size_t (num_cuts) ;
2695  if (this->my_incomplete_cut_count[kk] > 0){
2696 
2697  //although is_cut_line_determined is shared, current_cut_status is a private pointer that is the same for all threads.
2698  bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2699  double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2700  mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2701  mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2702 
2703  mj_part_t conccurent_current_part = current_work_part + kk;
2704  mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
2705  mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
2706  mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2707 
2708  mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2709  mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2710 
2711  // compute part weights using existing cuts
2712  this->mj_1D_part_get_thread_part_weights(
2713  total_part_count,
2714  num_cuts,
2715  max_coord,//globalMinMaxTotal[kk + concurrentPartCount],//maxScalar,
2716  min_coord,//globalMinMaxTotal[kk]//minScalar,
2717  coordinate_begin_index,
2718  coordinate_end_index,
2719  mj_current_dim_coords,
2720  temp_current_cut_coords,
2721  current_cut_status,
2722  my_current_part_weights,
2723  my_current_left_closest,
2724  my_current_right_closest);
2725 
2726  }
2727 
2728  concurrent_cut_shifts += num_cuts;
2729  total_part_shift += total_part_count;
2730  }
2731 
2732  //sum up the results of threads
2733  this->mj_accumulate_thread_results(
2734  num_partitioning_in_current_dim,
2735  current_work_part,
2736  current_concurrent_num_parts);
2737 
2738  //now sum up the results of mpi processors.
2739 #ifdef HAVE_ZOLTAN2_OMP
2740 #pragma omp single
2741 #endif
2742  {
2743  if(this->comm->getSize() > 1){
2744  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2745  total_reduction_size,
2746  this->total_part_weight_left_right_closests,
2747  this->global_total_part_weight_left_right_closests);
2748 
2749  }
2750  else {
2751  memcpy(
2752  this->global_total_part_weight_left_right_closests,
2753  this->total_part_weight_left_right_closests,
2754  total_reduction_size * sizeof(mj_scalar_t));
2755  }
2756  }
2757 
2758  //how much the cut array will be shifted for the next part in the concurrent part calculation.
2759  mj_part_t cut_shift = 0;
2760 
2761  //how much the concatenated array will be shifted for the next part in the concurrent part calculation.
2762  size_t tlr_shift = 0;
2763  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2764  mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2765  mj_part_t num_cuts = num_parts - 1;
2766  size_t num_total_part = num_parts + size_t (num_cuts) ;
2767 
2768  //if all cuts of this part have already been completed,
2769  //nothing to do for this part.
2770  //just update the shift amount and proceed.
2771  if (this->my_incomplete_cut_count[kk] == 0) {
2772  cut_shift += num_cuts;
2773  tlr_shift += (num_total_part + 2 * num_cuts);
2774  continue;
2775  }
2776 
2777  mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2778  mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2779  mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part; //left closest points
2780  mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts; //right closest points
2781  mj_scalar_t *current_global_part_weights = current_global_tlr;
2782  bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2783 
2784  mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2785  mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2786 
2787  mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2788  mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2789  mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2790  mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2791  mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2792  mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2793  mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2794 
2795  mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2796 
2797  // Now compute the new cut coordinates.
2798  this->mj_get_new_cut_coordinates(
2799  num_total_part,
2800  num_cuts,
2801  max_coordinate,
2802  min_coordinate,
2803  global_total_weight,
2804  used_imbalance_tolerance,
2805  current_global_part_weights,
2806  current_local_part_weights,
2807  current_part_target_weights,
2808  current_cut_line_determined,
2809  temp_cut_coords + cut_shift,
2810  current_cut_upper_bounds,
2811  current_cut_lower_bounds,
2812  current_global_left_closest_points,
2813  current_global_right_closest_points,
2814  current_cut_lower_bound_weights,
2815  current_cut_upper_weights,
2816  this->cut_coordinates_work_array +cut_shift, //new cut coordinates
2817  current_part_cut_line_weight_to_put_left,
2818  &rectilinear_cut_count,
2819  this->my_incomplete_cut_count[kk]);
2820 
2821  cut_shift += num_cuts;
2822  tlr_shift += (num_total_part + 2 * num_cuts);
2823  mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
2824 #ifdef HAVE_ZOLTAN2_OMP
2825 #pragma omp single
2826 #endif
2827  {
2828  total_incomplete_cut_count -= iteration_complete_cut_count;
2829  }
2830 
2831  }
2832 #ifdef HAVE_ZOLTAN2_OMP
2833 #pragma omp barrier
2834 #pragma omp single
2835 #endif
2836  {
2837  //swap the cut coordinates for next iteration.
2838  mj_scalar_t *t = temp_cut_coords;
2839  temp_cut_coords = this->cut_coordinates_work_array;
2840  this->cut_coordinates_work_array = t;
2841  }
2842  }
2843 
2844  // Needed only if keep_cuts; otherwise can simply swap array pointers
2845  // cutCoordinates and cutCoordinatesWork.
2846  // (at first iteration, cutCoordinates == cutCoordinates_tmp).
2847  // computed cuts must be in cutCoordinates.
2848  if (current_cut_coordinates != temp_cut_coords){
2849 #ifdef HAVE_ZOLTAN2_OMP
2850 #pragma omp single
2851 #endif
2852  {
2853  mj_part_t next = 0;
2854  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2855  mj_part_t num_parts = -1;
2856  num_parts = num_partitioning_in_current_dim[current_work_part + i];
2857  mj_part_t num_cuts = num_parts - 1;
2858 
2859  for(mj_part_t ii = 0; ii < num_cuts; ++ii){
2860  current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
2861  }
2862  next += num_cuts;
2863  }
2864  }
2865 
2866 #ifdef HAVE_ZOLTAN2_OMP
2867 #pragma omp single
2868 #endif
2869  {
2870  this->cut_coordinates_work_array = temp_cut_coords;
2871  }
2872  }
2873  }
2874  delete reductionOp;
2875 }
2876 
2877 
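/* \brief Computes this thread's part weights and the closest coordinate to each
 * cut line from the left and from the right, using a binary search over the cut
 * lines seeded by each coordinate's previously assigned part id.
 */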
2897 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2898  typename mj_part_t>
2899 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
2900  size_t total_part_count,
2901  mj_part_t num_cuts,
2902  mj_scalar_t max_coord,
2903  mj_scalar_t min_coord,
2904  mj_lno_t coordinate_begin_index,
2905  mj_lno_t coordinate_end_index,
2906  mj_scalar_t *mj_current_dim_coords,
2907  mj_scalar_t *temp_current_cut_coords,
2908  bool *current_cut_status,
2909  double *my_current_part_weights,
2910  mj_scalar_t *my_current_left_closest,
2911  mj_scalar_t *my_current_right_closest){
2912 
2913  // initializations for part weights, left/right closest
2914  for (size_t i = 0; i < total_part_count; ++i){
2915  my_current_part_weights[i] = 0;
2916  }
2917 
2918  //initialize the left and right closest coordinates
2919  //to their max value.
2920  for(mj_part_t i = 0; i < num_cuts; ++i){
2921  my_current_left_closest[i] = min_coord - 1;
2922  my_current_right_closest[i] = max_coord + 1;
2923  }
2924  //mj_lno_t comparison_count = 0;
2925  mj_scalar_t minus_EPSILON = -this->sEpsilon;
2926 #ifdef HAVE_ZOLTAN2_OMP
2927  //no need for a barrier, as all threads use their local memories.
2928  //don't change the static scheduling here, as it is assumed when the new
2929  //partitions are created later.
2930 #pragma omp for
2931 #endif
2932  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2933  mj_lno_t i = this->coordinate_permutations[ii];
2934 
2935  //the accesses to assigned_part_ids are thread safe
2936  //since each coordinate is assigned to only a single thread.
2937  mj_part_t j = this->assigned_part_ids[i] / 2;
2938 
2939  if(j >= num_cuts){
2940  j = num_cuts - 1;
2941  }
2942 
2943  mj_part_t lower_cut_index = 0;
2944  mj_part_t upper_cut_index = num_cuts - 1;
2945 
2946  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
2947  bool is_inserted = false;
2948  bool is_on_left_of_cut = false;
2949  bool is_on_right_of_cut = false;
2950  mj_part_t last_compared_part = -1;
2951 
2952  mj_scalar_t coord = mj_current_dim_coords[i];
2953 
2954  while(upper_cut_index >= lower_cut_index)
2955  {
2956  //comparison_count++;
2957  last_compared_part = -1;
2958  is_on_left_of_cut = false;
2959  is_on_right_of_cut = false;
2960  mj_scalar_t cut = temp_current_cut_coords[j];
2961  mj_scalar_t distance_to_cut = coord - cut;
2962  mj_scalar_t abs_distance_to_cut = ZOLTAN2_ABS(distance_to_cut);
2963 
2964  //if it is on the line.
2965  if(abs_distance_to_cut < this->sEpsilon){
2966 
2967  my_current_part_weights[j * 2 + 1] += w;
2968  this->assigned_part_ids[i] = j * 2 + 1;
2969 
2970  //assign left and right closest point to cut as the point is on the cut.
2971  my_current_left_closest[j] = coord;
2972  my_current_right_closest[j] = coord;
2973  //now we need to check if there are other cuts on the same cut coordinate.
2974  //if there are, then we add the weight of the cut to all cuts in the same coordinate.
2975  mj_part_t kk = j + 1;
2976  while(kk < num_cuts){
2977  // Needed when cuts shared the same position
2978  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
2979  if(distance_to_cut < this->sEpsilon){
2980  my_current_part_weights[2 * kk + 1] += w;
2981  my_current_left_closest[kk] = coord;
2982  my_current_right_closest[kk] = coord;
2983  kk++;
2984  }
2985  else{
2986  //cut is far away.
2987  //just check the left closest point for the next cut.
2988  if(coord - my_current_left_closest[kk] > this->sEpsilon){
2989  my_current_left_closest[kk] = coord;
2990  }
2991  break;
2992  }
2993  }
2994 
2995 
2996  kk = j - 1;
2997  //continue checking for the cuts on the left if they share the same coordinate.
2998  while(kk >= 0){
2999  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3000  if(distance_to_cut < this->sEpsilon){
3001  my_current_part_weights[2 * kk + 1] += w;
3002  //try to write the partId as the leftmost cut.
3003  this->assigned_part_ids[i] = kk * 2 + 1;
3004  my_current_left_closest[kk] = coord;
3005  my_current_right_closest[kk] = coord;
3006  kk--;
3007  }
3008  else{
3009  //if cut is far away on the left of the point.
3010  //then just compare for right closest point.
3011  if(my_current_right_closest[kk] - coord > this->sEpsilon){
3012  my_current_right_closest[kk] = coord;
3013  }
3014  break;
3015  }
3016  }
3017 
3018  is_inserted = true;
3019  break;
3020  }
3021  else {
3022  //if point is on the left of the cut.
3023  if (distance_to_cut < 0) {
3024  bool _break = false;
3025  if(j > 0){
3026  //check the distance to the cut on the left of the current cut.
3027  //if the point is on the right of that cut, we have found the point's part.
3028  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3029  if(distance_to_next_cut > this->sEpsilon){
3030  _break = true;
3031  }
3032  }
3033  //if point is not on the right of the next cut, then
3034  //set the upper bound to this cut.
3035  upper_cut_index = j - 1;
3036  //set the last part, and mark it as on the left of the last part.
3037  is_on_left_of_cut = true;
3038  last_compared_part = j;
3039  if(_break) break;
3040  }
3041  else {
3042  //if point is on the right of the cut.
3043  bool _break = false;
3044  if(j < num_cuts - 1){
3045  //check the distance to the cut on the right of the current cut.
3046  //if the point is on the left of that cut, we have found the point's part.
3047  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3048  if(distance_to_next_cut < minus_EPSILON){
3049  _break = true;
3050  }
3051  }
3052 
3053  //if point is not on the left of the next cut, then
3054  //set the lower bound to this cut.
3055  lower_cut_index = j + 1;
3056  //set the last part, and mark it as on the right of the last part.
3057  is_on_right_of_cut = true;
3058  last_compared_part = j;
3059  if(_break) break;
3060  }
3061  }
3062 
3063  j = (upper_cut_index + lower_cut_index) / 2;
3064  }
3065  if(!is_inserted){
3066  if(is_on_right_of_cut){
3067 
3068  //add it to the right of the last compared part.
3069  my_current_part_weights[2 * last_compared_part + 2] += w;
3070  this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3071 
3072  //update the right closest point of last compared cut.
3073  if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3074  my_current_right_closest[last_compared_part] = coord;
3075  }
3076  //update the left closest point of the cut on the right of the last compared cut.
3077  if(last_compared_part+1 < num_cuts){
3078 
3079  if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3080  my_current_left_closest[last_compared_part + 1] = coord;
3081  }
3082  }
3083 
3084  }
3085  else if(is_on_left_of_cut){
3086 
3087  //add it to the left of the last compared part.
3088  my_current_part_weights[2 * last_compared_part] += w;
3089  this->assigned_part_ids[i] = 2 * last_compared_part;
3090 
3091 
3092  //update the left closest point of last compared cut.
3093  if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3094  my_current_left_closest[last_compared_part] = coord;
3095  }
3096 
3097  //update the right closest point of the cut on the left of the last compared cut.
3098  if(last_compared_part-1 >= 0){
3099  if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3100  my_current_right_closest[last_compared_part -1] = coord;
3101  }
3102  }
3103  }
3104  }
3105  }
3106 
3107  // prefix sum computation.
3108  //we need prefix sum for each part to determine cut positions.
3109  for (size_t i = 1; i < total_part_count; ++i){
3110  // check for cuts sharing the same position; all cuts sharing a position
3111  // have the same weight == total weight for all cuts sharing the position.
3112  // don't want to accumulate that total weight more than once.
3113  if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3114  ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3115  < this->sEpsilon){
3116  //entry i represents a part when i % 2 == 0. if the cuts on both sides of
3117  //this part share the same coordinate, the part is empty and the shared on-cut
3118  //weight would be accumulated twice, so copy the value from two entries back instead.
3119  my_current_part_weights[i] = my_current_part_weights[i-2];
3120  continue;
3121  }
3122  //otherwise do the prefix sum.
3123  my_current_part_weights[i] += my_current_part_weights[i-1];
3124  }
3125 }
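// Layout note: my_current_part_weights has 2P - 1 entries for P parts.
// Even entry 2k accumulates the weight of part k; odd entry 2k + 1 accumulates
// the weight lying exactly on cut k. Correspondingly, assigned_part_ids stores
// 2 * part for a point inside a part and 2 * cut + 1 for a point on a cut,
// which is why the binary search above starts from assigned_part_ids[i] / 2.
// After the prefix sum at the end, entry i holds the cumulative weight from
// the left up to and including that part or cut.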
3126 
3127 
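/* \brief Accumulates the thread-local part weights and the left/right closest
 * points into the process-level arrays that are afterwards reduced across
 * MPI processes. Cut lines that are already determined are skipped.
 */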
3135 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3136  typename mj_part_t>
3137 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
3138  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3139  mj_part_t current_work_part,
3140  mj_part_t current_concurrent_num_parts){
3141 
3142 #ifdef HAVE_ZOLTAN2_OMP
3143  //needs barrier here, as it requires all threads to finish mj_1D_part_get_thread_part_weights
3144  //using parallel region here reduces the performance because of the cache invalidates.
3145 #pragma omp barrier
3146 #pragma omp single
3147 #endif
3148  {
3149  size_t tlr_array_shift = 0;
3150  mj_part_t cut_shift = 0;
3151 
3152  //iterate for all concurrent parts to find the left and right closest points in the process.
3153  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3154 
3155  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3156  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3157  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3158 
3159  //iterate for cuts in a single part.
3160  for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3161  mj_part_t next = tlr_array_shift + ii;
3162  mj_part_t cut_index = cut_shift + ii;
3163  if(this->is_cut_line_determined[cut_index]) continue;
3164  mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3165  right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3166 
3167  //find the closest points from left and right for the cut in the process.
3168  for (int j = 1; j < this->num_threads; ++j){
3169  if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3170  right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3171  }
3172  if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3173  left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3174  }
3175  }
3176  //store the left and right closest points.
3177  this->total_part_weight_left_right_closests[num_total_part_in_part +
3178  next] = left_closest_in_process;
3179  this->total_part_weight_left_right_closests[num_total_part_in_part +
3180  num_cuts_in_part + next] = right_closest_in_process;
3181  }
3182  //set the shift position in the arrays
3183  tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3184  cut_shift += num_cuts_in_part;
3185  }
3186 
3187  tlr_array_shift = 0;
3188  cut_shift = 0;
3189  size_t total_part_array_shift = 0;
3190 
3191  //iterate for all concurrent parts to find the total weight in the process.
3192  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3193 
3194  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3195  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3196  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3197 
3198  for(size_t j = 0; j < num_total_part_in_part; ++j){
3199 
3200  mj_part_t cut_ind = j / 2 + cut_shift;
3201 
3202  //need to check j != num_total_part_in_part - 1
3203  // which is same as j/2 != num_cuts_in_part.
3204  //we cannot check it using cut_ind, because of the concurrent part concatenation.
3205  if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind]) continue;
3206  double pwj = 0;
3207  for (int k = 0; k < this->num_threads; ++k){
3208  pwj += this->thread_part_weights[k][total_part_array_shift + j];
3209  }
3210  //size_t jshift = j % total_part_count + i * (total_part_count + 2 * noCuts);
3211  this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3212  }
3213  cut_shift += num_cuts_in_part;
3214  tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3215  total_part_array_shift += num_total_part_in_part;
3216  }
3217  }
3218  //the other threads need to wait here,
3219  //but we don't need a pragma omp barrier,
3220  //as omp single already has an implicit barrier.
3221 }
3222 
3223 
3233 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3234  typename mj_part_t>
3235 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position(
3236  mj_scalar_t cut_upper_bound,
3237  mj_scalar_t cut_lower_bound,
3238  mj_scalar_t cut_upper_weight,
3239  mj_scalar_t cut_lower_weight,
3240  mj_scalar_t expected_weight,
3241  mj_scalar_t &new_cut_position){
3242 
3243  if(ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3244  new_cut_position = cut_upper_bound; return; //or the lower bound; they are equal within sEpsilon. Return to avoid dividing by a near-zero range below.
3245  }
3246 
3247 
3248  if(ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3249  new_cut_position = cut_lower_bound; return; //the weight range is near zero; return to avoid the division below.
3250  }
3251 
3252  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3253  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3254  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3255 
3256  mj_scalar_t required_shift = (my_weight_diff / weight_range);
3257  int scale_constant = 20;
3258  int shiftint= int (required_shift * scale_constant);
3259  if (shiftint == 0) shiftint = 1;
3260  required_shift = mj_scalar_t (shiftint) / scale_constant;
3261  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3262 }
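// Worked example (illustrative): with cut_lower_bound = 0, cut_upper_bound = 10,
// cut_lower_weight = 100, cut_upper_weight = 300 and expected_weight = 130,
// required_shift = 30 / 200 = 0.15; quantizing with scale_constant = 20 gives
// shiftint = 3, i.e. required_shift stays 0.15, and new_cut_position = 1.5.
// A shift that would truncate to 0 is bumped to 1/20 so the bisection cannot stall.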
3263 
3264 
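/* \brief Creates the new part permutation once the cut lines are final. Points
 * lying on a cut are distributed between the neighboring parts when rectilinear
 * cuts are allowed, per-thread point counts are prefix-summed into out_part_xadj,
 * and each thread writes its points into new_coordinate_permutations.
 */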
3275 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3276  typename mj_part_t>
3277 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
3278  mj_part_t num_parts,
3279  mj_scalar_t *mj_current_dim_coords,
3280  mj_scalar_t *current_concurrent_cut_coordinate,
3281  mj_lno_t coordinate_begin,
3282  mj_lno_t coordinate_end,
3283  mj_scalar_t *used_local_cut_line_weight_to_left,
3284  double **used_thread_part_weight_work,
3285  mj_lno_t *out_part_xadj){
3286 
3287  mj_part_t num_cuts = num_parts - 1;
3288 
3289 #ifdef HAVE_ZOLTAN2_OMP
3290 #pragma omp parallel
3291 #endif
3292  {
3293  int me = 0;
3294 #ifdef HAVE_ZOLTAN2_OMP
3295  me = omp_get_thread_num();
3296 #endif
3297 
3298  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3299  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3300 
3301  //now if the rectilinear partitioning is allowed we decide how
3302  //much weight each thread should put to left and right.
3303  if (this->distribute_points_on_cut_lines){
3304  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3305  // this for assumes the static scheduling in mj_1D_part calculation.
3306 #ifdef HAVE_ZOLTAN2_OMP
3307 #pragma omp for
3308 #endif
3309  for (mj_part_t i = 0; i < num_cuts; ++i){
3310  //the weight to be put on the left of the cut.
3311  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3312  for(int ii = 0; ii < this->num_threads; ++ii){
3313  if(left_weight > this->sEpsilon){
3314  //the weight of thread ii on cut.
3315  mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3316  if(thread_ii_weight_on_cut < left_weight){
3317  //if the remaining left weight is larger than this thread's weight on the cut, the thread puts all of it to the left.
3318  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3319  }
3320  else {
3321  //if thread's weight is bigger than space, then put only a portion.
3322  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3323  }
3324  left_weight -= thread_ii_weight_on_cut;
3325  }
3326  else {
3327  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3328  }
3329  }
3330  }
3331 
3332  if(num_cuts > 0){
3333  //this is a special case. If cutlines share the same coordinate, their weights are equal.
3334  //we need to adjust the ratio for that.
3335  for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3336  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3337  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3338  }
3339  my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
3340  / mj_scalar_t(SIGNIFICANCE_MUL);
3341  }
3342  }
3343  }
3344 
3345  for(mj_part_t ii = 0; ii < num_parts; ++ii){
3346  thread_num_points_in_parts[ii] = 0;
3347  }
3348 
3349 
3350 #ifdef HAVE_ZOLTAN2_OMP
3351  //don't change the static scheduling; the same static partitioning is assumed later as well.
3352 #pragma omp for
3353 #endif
3354  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3355 
3356  mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3357  mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3358  mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3359  mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3360  if(coordinate_assigned_place % 2 == 1){
3361  //if it is on the cut.
3362  if(this->distribute_points_on_cut_lines
3363  && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3364  //if the rectilinear partitioning is allowed,
3365  //and the thread has still space to put on the left of the cut
3366  //then thread puts the vertex to left.
3367  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3368  //if putting the vertex to left increased the weight more than expected.
3369  //and if the next cut is on the same coordinate,
3370  //then we need to adjust how much weight next cut puts to its left as well,
3371  //in order to take care of the imbalance.
3372  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3373  && coordinate_assigned_part < num_cuts - 1
3374  && ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3375  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3376  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3377  }
3378  ++thread_num_points_in_parts[coordinate_assigned_part];
3379  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3380  }
3381  else{
3382  //if there is no more space on the left, put the coordinate to the right of the cut.
3383  ++coordinate_assigned_part;
3384  //this while loop is necessary when a line is partitioned into more than 2 parts.
3385  while(this->distribute_points_on_cut_lines &&
3386  coordinate_assigned_part < num_cuts){
3387  //traverse all the cut lines sharing the same coordinate.
3388  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3389  current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3390  < this->sEpsilon){
3391  //if line has enough space on left, put it there.
3392  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3393  this->sEpsilon &&
3394  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3395  ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3396  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3397  //Again if it put too much on left of the cut,
3398  //update how much the next cut sharing the same coordinate will put to its left.
3399  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3400  coordinate_assigned_part < num_cuts - 1 &&
3401  ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3402  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3403  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3404  }
3405  break;
3406  }
3407  }
3408  else {
3409  break;
3410  }
3411  ++coordinate_assigned_part;
3412  }
3413  ++thread_num_points_in_parts[coordinate_assigned_part];
3414  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3415  }
3416  }
3417  else {
3418  //if it is already assigned to a part, then just put it to the corresponding part.
3419  ++thread_num_points_in_parts[coordinate_assigned_part];
3420  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3421  }
3422  }
3423 
3424 
3425 
3426  //now we calculate where each thread will write in new_coordinate_permutations array.
3427  //first we find the out_part_xadj, by marking the begin and end points of each part found.
3428  //the below loop find the number of points in each part, and writes it to out_part_xadj
3429 #ifdef HAVE_ZOLTAN2_OMP
3430 #pragma omp for
3431 #endif
3432  for(mj_part_t j = 0; j < num_parts; ++j){
3433  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3434  for (int i = 0; i < this->num_threads; ++i){
3435  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3436  //prefix sum to thread point counts, so that each will have private space to write.
3437  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3438  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3439 
3440  }
3441  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
3442  }
3443 
3444  //now we do a prefix sum over out_part_xadj, to mark the begin and end of each part.
3445 #ifdef HAVE_ZOLTAN2_OMP
3446 #pragma omp single
3447 #endif
3448  {
3449  //perform prefix sum for num_points in parts.
3450  for(mj_part_t j = 1; j < num_parts; ++j){
3451  out_part_xadj[j] += out_part_xadj[j - 1];
3452  }
3453  }
3454 
3455  //shift the per-thread point counts to obtain the
3456  //beginning index of each thread's private space.
3457  for(mj_part_t j = 1; j < num_parts; ++j){
3458  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3459  }
3460 
3461 
3462  //now each thread takes its coordinates and writes their indices to the permutation array
3463  //using the part indices we calculated.
3464 #ifdef HAVE_ZOLTAN2_OMP
3465 #pragma omp for
3466 #endif
3467  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3468  mj_lno_t i = this->coordinate_permutations[ii];
3469  mj_part_t p = this->assigned_part_ids[i];
3470  this->new_coordinate_permutations[coordinate_begin +
3471  thread_num_points_in_parts[p]++] = i;
3472  }
3473  }
3474 }
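
The two-level counting scheme above (per-thread counts, a prefix sum over threads within each part, then a prefix sum over parts) is what lets every thread write its coordinates into new_coordinate_permutations without locks. A minimal sequential sketch of the same count / prefix-sum / stable-write pattern, with hypothetical names and no OpenMP:

#include <cstddef>
#include <vector>

void stable_partition_permutation(
    const std::vector<int> &part_of,      // part id per coordinate
    int num_parts,
    std::vector<std::size_t> &part_xadj,  // out: end offset of each part
    std::vector<std::size_t> &perm)       // out: indices grouped by part
{
  std::vector<std::size_t> counts(num_parts, 0);
  for (int p : part_of) ++counts[p];            // count points per part

  std::vector<std::size_t> write_pos(num_parts);
  part_xadj.assign(num_parts, 0);
  std::size_t running = 0;
  for (int j = 0; j < num_parts; ++j) {         // prefix sum over parts
    write_pos[j] = running;                     // begin offset of part j
    running += counts[j];
    part_xadj[j] = running;                     // end offset of part j
  }

  perm.resize(part_of.size());
  for (std::size_t i = 0; i < part_of.size(); ++i)
    perm[write_pos[part_of[i]]++] = i;          // stable scatter into own slot
}

In the threaded version each thread owns a disjoint slice of write positions, so the final scatter loop needs no synchronization.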
3475 
3476 
3477 
3506 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3507  typename mj_part_t>
3508 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3509  const size_t &num_total_part,
3510  const mj_part_t &num_cuts,
3511  const mj_scalar_t &max_coordinate,
3512  const mj_scalar_t &min_coordinate,
3513  const mj_scalar_t &global_total_weight,
3514  const mj_scalar_t &used_imbalance_tolerance,
3515  mj_scalar_t * current_global_part_weights,
3516  const mj_scalar_t * current_local_part_weights,
3517  const mj_scalar_t *current_part_target_weights,
3518  bool *current_cut_line_determined,
3519  mj_scalar_t *current_cut_coordinates,
3520  mj_scalar_t *current_cut_upper_bounds,
3521  mj_scalar_t *current_cut_lower_bounds,
3522  mj_scalar_t *current_global_left_closest_points,
3523  mj_scalar_t *current_global_right_closest_points,
3524  mj_scalar_t * current_cut_lower_bound_weights,
3525  mj_scalar_t * current_cut_upper_weights,
3526  mj_scalar_t *new_current_cut_coordinates,
3527  mj_scalar_t *current_part_cut_line_weight_to_put_left,
3528  mj_part_t *rectilinear_cut_count,
3529  mj_part_t &my_num_incomplete_cut){
3530 
3531  //seen weight in the part
3532  mj_scalar_t seen_weight_in_part = 0;
3533  //expected weight for part.
3534  mj_scalar_t expected_weight_in_part = 0;
3535  //imbalance for the left and right side of the cut.
3536  mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;
3537 
3538 
3539 #ifdef HAVE_ZOLTAN2_OMP
3540 #pragma omp for
3541 #endif
3542  for (mj_part_t i = 0; i < num_cuts; i++){
3543  //if the left and right closest points are not set yet,
3544  //set them to the cut itself.
3545  if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3546  current_global_left_closest_points[i] = current_cut_coordinates[i];
3547  if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3548  current_global_right_closest_points[i] = current_cut_coordinates[i];
3549 
3550  }
3551 #ifdef HAVE_ZOLTAN2_OMP
3552 #pragma omp for
3553 #endif
3554  for (mj_part_t i = 0; i < num_cuts; i++){
3555 
3556  if(this->distribute_points_on_cut_lines){
3557  //init the weight on the cut.
3558  this->global_rectilinear_cut_weight[i] = 0;
3559  this->process_rectilinear_cut_weight[i] = 0;
3560  }
3561  //if already determined in previous iterations,
3562  //then just write the coordinate to the new array, and proceed.
3563  if(current_cut_line_determined[i]) {
3564  new_current_cut_coordinates[i] = current_cut_coordinates[i];
3565  continue;
3566  }
3567 
3568  //current weight of the part at the left of the cut line.
3569  seen_weight_in_part = current_global_part_weights[i * 2];
3570 
3571  /*
3572  cout << "seen_weight_in_part:" << i << " is "<< seen_weight_in_part << endl;
3573  cout << "\tcut:" << current_cut_coordinates[i]
3574  << " current_cut_lower_bounds:" << current_cut_lower_bounds[i]
3575  << " current_cut_upper_bounds:" << current_cut_upper_bounds[i] << endl;
3576  */
3577  //expected weight in the part.
3578  expected_weight_in_part = current_part_target_weights[i];
3579  //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
3580  imbalance_on_left = imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3581  //rightImbalance = imbalanceOf(globalTotalWeight - seenW, globalTotalWeight, 1 - expected);
3582  imbalance_on_right = imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3583 
3584  bool is_left_imbalance_valid = ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3585  bool is_right_imbalance_valid = ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3586 
3587  //if the cut line reaches the desired imbalance.
3588  if(is_left_imbalance_valid && is_right_imbalance_valid){
3589  current_cut_line_determined[i] = true;
3590 #ifdef HAVE_ZOLTAN2_OMP
3591 #pragma omp atomic
3592 #endif
3593  my_num_incomplete_cut -= 1;
3594  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3595  continue;
3596  }
3597  else if(imbalance_on_left < 0){
3598  //if the left imbalance < 0 then we need to move the cut to the right.
3599 
3600  if(this->distribute_points_on_cut_lines){
3601  //if it is okay to distribute the coordinates lying on
3602  //the cut to the left and right,
3603  //then check if we can reach the target weight by including the
3604  //coordinates on the cut in the part.
3605  if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3606  //if it is we are done.
3607  current_cut_line_determined[i] = true;
3608 #ifdef HAVE_ZOLTAN2_OMP
3609 #pragma omp atomic
3610 #endif
3611  my_num_incomplete_cut -= 1;
3612 
3613  //then assign everything on the cut to the left of the cut.
3614  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3615 
3616  //for this cut, all the weight on the cut will be put to the left.
3617 
3618  current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3619  continue;
3620  }
3621  else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3622 
3623  //if the weight is larger than the expected weight,
3624  //then we need to distribute some points to left, some to right.
3625  current_cut_line_determined[i] = true;
3626 #ifdef HAVE_ZOLTAN2_OMP
3627 #pragma omp atomic
3628 #endif
3629  *rectilinear_cut_count += 1;
3630  //increase the num cuts to be determined with rectilinear partitioning.
3631 
3632 #ifdef HAVE_ZOLTAN2_OMP
3633 #pragma omp atomic
3634 #endif
3635  my_num_incomplete_cut -= 1;
3636  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3637  this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3638  current_local_part_weights[i * 2];
3639  continue;
3640  }
3641  }
3642  //we need to move further right, so set the lower bound to the current line, and shift it to the closest point from the right.
3643  current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3644  //set the lower bound weight to the weight we have seen.
3645  current_cut_lower_bound_weights[i] = seen_weight_in_part;
3646 
3647  //compare the upper bound with what has been found in the last iteration.
3648  //we try to make stricter bounds for the cut here.
3649  for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3650  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3651  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3652 
3653  if(p_weight >= expected_weight_in_part){
3654  //if a cut on the right has the expected weight, then we found
3655  //our cut position. Set the upper and lower coordinates to this new cut coordinate.
3656  //but we need one more iteration to finalize the cut position,
3657  //as we need to update the part ids.
3658  if(p_weight == expected_weight_in_part){
3659  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3660  current_cut_upper_weights[i] = p_weight;
3661  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3662  current_cut_lower_bound_weights[i] = p_weight;
3663  } else if (p_weight < current_cut_upper_weights[i]){
3664  //if a part weight is larger than my expected weight,
3665  //but lower than my upper bound weight, update the upper bound.
3666  current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3667  current_cut_upper_weights[i] = p_weight;
3668  }
3669  break;
3670  }
3671  //if it comes here, then p_weight < expected_weight_in_part.
3672  //then compare the weight against the line weight.
3673  if(line_weight >= expected_weight_in_part){
3674  //if the weight on the line is larger than the expected weight,
3675  //then we need to reach the balance by distributing coordinates on this line.
3676  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3677  current_cut_upper_weights[i] = line_weight;
3678  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3679  current_cut_lower_bound_weights[i] = p_weight;
3680  break;
3681  }
3682  //if a stricter lower bound is found,
3683  //update the lower bound.
3684  if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3685  current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3686  current_cut_lower_bound_weights[i] = p_weight;
3687  }
3688  }
3689 
3690 
3691  mj_scalar_t new_cut_position = 0;
3692  this->mj_calculate_new_cut_position(
3693  current_cut_upper_bounds[i],
3694  current_cut_lower_bounds[i],
3695  current_cut_upper_weights[i],
3696  current_cut_lower_bound_weights[i],
3697  expected_weight_in_part, new_cut_position);
3698 
3699  //if the cut line does not move significantly,
3700  //then finalize the search.
3701  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3702  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/
3703  ){
3704  current_cut_line_determined[i] = true;
3705 #ifdef HAVE_ZOLTAN2_OMP
3706 #pragma omp atomic
3707 #endif
3708  my_num_incomplete_cut -= 1;
3709 
3710  //set the cut coordinate and proceed.
3711  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3712  } else {
3713  new_current_cut_coordinates [i] = new_cut_position;
3714  }
3715  } else {
3716 
3717  //need to move the cut line to the left.
3718  //set the upper bound to the current line.
3719  current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3720  current_cut_upper_weights[i] = seen_weight_in_part;
3721 
3722  // compare the current cut line weights with previous upper and lower bounds.
3723  for (int ii = i - 1; ii >= 0; --ii){
3724  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3725  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3726  if(p_weight <= expected_weight_in_part){
3727  if(p_weight == expected_weight_in_part){
3728  //if the weight of the part is my expected weight,
3729  //then we have found the solution.
3730  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3731  current_cut_upper_weights[i] = p_weight;
3732  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3733  current_cut_lower_bound_weights[i] = p_weight;
3734  }
3735  else if (p_weight > current_cut_lower_bound_weights[i]){
3736  //if the found weight is bigger than the lower bound,
3737  //then update the lower bound.
3738  current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3739  current_cut_lower_bound_weights[i] = p_weight;
3740 
3741  //at the same time, if the weight of the line is bigger than the
3742  //expected weight, then update the upper bound as well.
3743  //in this case the balance will be obtained by distributing weights
3744  //on this cut position.
3745  if(line_weight > expected_weight_in_part){
3746  current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3747  current_cut_upper_weights[i] = line_weight;
3748  }
3749  }
3750  break;
3751  }
3752  //if the weight of the cut on the left is still bigger than my weight,
3753  //and also if that weight is smaller than the current upper weight,
3754  //or if it is equal to the current upper weight but lies to the left of
3755  //the current upper bound, then update the upper bound.
3756  if (p_weight >= expected_weight_in_part &&
3757  (p_weight < current_cut_upper_weights[i] ||
3758  (p_weight == current_cut_upper_weights[i] &&
3759  current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3760  )
3761  )
3762  ){
3763  current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3764  current_cut_upper_weights[i] = p_weight;
3765  }
3766  }
3767  mj_scalar_t new_cut_position = 0;
3768  this->mj_calculate_new_cut_position(
3769  current_cut_upper_bounds[i],
3770  current_cut_lower_bounds[i],
3771  current_cut_upper_weights[i],
3772  current_cut_lower_bound_weights[i],
3773  expected_weight_in_part,
3774  new_cut_position);
3775 
3776  //if the cut line does not move significantly.
3777  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3778  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ ){
3779  current_cut_line_determined[i] = true;
3780 #ifdef HAVE_ZOLTAN2_OMP
3781 #pragma omp atomic
3782 #endif
3783  my_num_incomplete_cut -= 1;
3784  //set the cut coordinate and proceed.
3785  new_current_cut_coordinates[i] = current_cut_coordinates[i];
3786  } else {
3787  new_current_cut_coordinates[i] = new_cut_position;
3788  }
3789  }
3790  }
3791 
3792  //communication to determine the ratios of processors for the distribution
3793  //of coordinates on the cut lines.
3794 #ifdef HAVE_ZOLTAN2_OMP
3795  //no barrier is needed here as it is implicit.
3796 #pragma omp single
3797 #endif
3798  {
3799  if(*rectilinear_cut_count > 0){
3800 
3801  try{
3802  Teuchos::scan<int,mj_scalar_t>(
3803  *comm, Teuchos::REDUCE_SUM,
3804  num_cuts,
3805  this->process_rectilinear_cut_weight,
3806  this->global_rectilinear_cut_weight
3807  );
3808  }
3809  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
3810 
3811  for (mj_part_t i = 0; i < num_cuts; ++i){
3812  //if the cut line has weight to be distributed.
3813  if(this->global_rectilinear_cut_weight[i] > 0) {
3814  //expected weight to go to left of the cut.
3815  mj_scalar_t expected_part_weight = current_part_target_weights[i];
3816  //the weight that should be put to left of the cut.
3817  mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
3818  //the weight of the cut in the process
3819  mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
3820  //the sum of the cut weights up to this process, including the weight of this process.
3821  mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
3822  //the space on the left side of the cut after all processes up to and including this process
3823  //put their weights on the cut to the left.
3824  mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
3825  //add my weight to this space to find out how much space is left to me.
3826  mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
3827 
3828  /*
3829  cout << "expected_part_weight:" << expected_part_weight
3830  << " necessary_weight_on_line_for_left:" << necessary_weight_on_line_for_left
3831  << " my_weight_on_line" << my_weight_on_line
3832  << " weight_on_line_upto_process_inclusive:" << weight_on_line_upto_process_inclusive
3833  << " space_to_put_left:" << space_to_put_left
3834  << " space_left_to_me" << space_left_to_me << endl;
3835  */
3836  if(space_left_to_me < 0){
3837  //space_left_to_me is negative; I do not need to put anything to the left.
3838  current_part_cut_line_weight_to_put_left[i] = 0;
3839  }
3840  else if(space_left_to_me >= my_weight_on_line){
3841  //the space left to me is bigger than this processor's weight on the cut,
3842  //so put everything to the left.
3843  current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
3844  //cout << "setting current_part_cut_line_weight_to_put_left to my_weight_on_line:" << my_weight_on_line << endl;
3845  }
3846  else {
3847  //put only as much weight as the space allows.
3848  current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
3849 
3850  //cout << "setting current_part_cut_line_weight_to_put_left to space_left_to_me:" << space_left_to_me << endl;
3851  }
3852 
3853  }
3854  }
3855  *rectilinear_cut_count = 0;
3856  }
3857  }
3858 }
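
Two pieces of arithmetic in the routine above are easy to state in isolation: the interpolation that moves a cut between its bracketing bounds, and the clamping that converts the Teuchos::scan result into the weight each rank puts to the left of a cut. A hedged sketch with hypothetical helper names; the interpolation is an assumption about mj_calculate_new_cut_position's behavior, not its exact code:

#include <algorithm>

// Weighted bisection: place the cut between its bounds proportionally
// to how far the target weight lies between the bound weights.
template <typename scalar_t>
scalar_t interpolate_cut(scalar_t lower, scalar_t upper,
                         scalar_t lower_w, scalar_t upper_w,
                         scalar_t target_w)
{
  if (upper_w <= lower_w) return (lower + upper) / 2; // degenerate bracket
  scalar_t t = (target_w - lower_w) / (upper_w - lower_w);
  return lower + t * (upper - lower);
}

// Share of my on-cut weight that goes to the left part.
// needed_left: weight still missing on the left of the cut;
// inclusive_scan: on-cut weight of ranks 0..me (inclusive prefix sum);
// my_weight: my own on-cut weight.
double weight_to_put_left(double needed_left, double inclusive_scan,
                          double my_weight)
{
  double space_for_me = needed_left - inclusive_scan + my_weight;
  return std::min(std::max(space_for_me, 0.0), my_weight);
}

The three branches in the code above (negative space, space at least my weight, partial space) collapse into the single min/max clamp.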
3859 
3869 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3870  typename mj_part_t>
3871 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
3872  mj_part_t num_procs,
3873  mj_part_t num_parts,
3874  mj_gno_t *&num_points_in_all_processor_parts){
3875 
3876  //allocation_size is num_parts * (num_procs + 1).
3877  size_t allocation_size = num_parts * (num_procs + 1);
3878 
3879  //this will be output
3880  //holds how many points each processor has in each part.
3881  //last portion is the sum of all processor points in each part.
3882 
3883  //allocate memory for the local num coordinates in each part.
3884  mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
3885 
3886 
3887  //this is the portion of the memory which will be used
3888  //in the summation to obtain the total number of points in each part across processors.
3889  mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
3890  //this is the portion of the memory where each processor stores its local counts.
3891  //this information is needed by other processors.
3892  mj_gno_t *my_local_point_counts_in_each_part = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
3893 
3894  //initialize the array with 0's.
3895  memset(num_local_points_in_each_part_to_reduce_sum, 0, sizeof(mj_gno_t)*allocation_size);
3896 
3897  //write the number of coordinates in each part.
3898  for (mj_part_t i = 0; i < num_parts; ++i){
3899  mj_lno_t part_begin_index = 0;
3900  if (i > 0){
3901  part_begin_index = this->new_part_xadj[i - 1];
3902  }
3903  mj_lno_t part_end_index = this->new_part_xadj[i];
3904  my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
3905  }
3906 
3907  //copy the local counts to this processor's own portion of the array,
3908  //so that after the reduction the last portion holds the global num points in each part.
3909  memcpy (my_local_point_counts_in_each_part,
3910  my_local_points_to_reduce_sum,
3911  sizeof(mj_gno_t) * (num_parts) );
3912 
3913 
3914  //reduceAll operation.
3915  //the portion that belongs to a processor with index p
3916  //starts at p * num_parts.
3917  //the global number of points in each part is held in the last num_parts entries.
3918  try{
3919  reduceAll<int, mj_gno_t>(
3920  *(this->comm),
3921  Teuchos::REDUCE_SUM,
3922  allocation_size,
3923  num_local_points_in_each_part_to_reduce_sum,
3924  num_points_in_all_processor_parts);
3925  }
3926  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
3927  freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
3928 }
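
The single reduceAll works because of the array layout: rank p's counts live at offset p * num_parts and are zero everywhere else, while every rank also writes its counts into the last num_parts entries, so the sum of that row is the global count. An illustration of the same layout against plain MPI (hypothetical; the code above uses Teuchos::reduceAll):

#include <mpi.h>
#include <vector>

// Returns a (num_procs + 1) x num_parts table: row p holds rank p's
// local counts, the last row holds the global per-part sums.
std::vector<long> gather_part_counts(const std::vector<long> &local_counts,
                                     int num_parts, int num_procs,
                                     int my_rank, MPI_Comm comm)
{
  std::vector<long> send((num_procs + 1) * num_parts, 0);
  for (int j = 0; j < num_parts; ++j) {
    send[my_rank * num_parts + j] = local_counts[j];   // my own row
    send[num_procs * num_parts + j] = local_counts[j]; // global row
  }
  std::vector<long> recv(send.size());
  MPI_Allreduce(send.data(), recv.data(), (int)send.size(),
                MPI_LONG, MPI_SUM, comm);
  return recv;
}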
3929 
3930 
3931 
3944 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3945  typename mj_part_t>
3946 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
3947  size_t migration_reduce_all_population,
3948  mj_lno_t num_coords_for_last_dim_part,
3949  mj_part_t num_procs,
3950  mj_part_t num_parts,
3951  mj_gno_t *num_points_in_all_processor_parts){
3952 
3953  //if reduce all count and population in the last dim is too high
3954  if (migration_reduce_all_population > FUTURE_REDUCEALL_CUTOFF) return true;
3955  //if the work in a part per processor in the last dim is too low.
3956  if (num_coords_for_last_dim_part < MIN_WORK_LAST_DIM) return true;
3957 
3958  //if migration is to be checked and the imbalance is too high
3959  if (this->check_migrate_avoid_migration_option == 0){
3960  double global_imbalance = 0;
3961  //global shift to reach the sum of coordinate counts in each part.
3962  size_t global_shift = num_procs * num_parts;
3963 
3964  for (mj_part_t ii = 0; ii < num_procs; ++ii){
3965  for (mj_part_t i = 0; i < num_parts; ++i){
3966  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
3967  / double(num_procs);
3968 
3969  global_imbalance += ZOLTAN2_ABS(ideal_num -
3970  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
3971  }
3972  }
3973  global_imbalance /= num_parts;
3974  global_imbalance /= num_procs;
3975 
3976  /*
3977  if (this->myRank == 0) {
3978  cout << "imbalance for next iteration:" << global_imbalance << endl;
3979  }
3980  */
3981 
3982  if(global_imbalance <= this->minimum_migration_imbalance){
3983  return false;
3984  }
3985  else {
3986  return true;
3987  }
3988  }
3989  else {
3990  //if migration is forced
3991  return true;
3992  }
3993 }
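
The imbalance test above averages, over all (processor, part) pairs, the relative deviation of each processor's share from an even split. A standalone sketch of that metric (hypothetical names; ideal counts of zero would need guarding in real use):

#include <cmath>
#include <vector>

// counts: the (num_procs + 1) x num_parts table described above.
bool should_migrate(const std::vector<long> &counts,
                    int num_procs, int num_parts, double tolerance)
{
  double imbalance = 0;
  for (int p = 0; p < num_procs; ++p)
    for (int k = 0; k < num_parts; ++k) {
      double ideal = counts[num_procs * num_parts + k] / double(num_procs);
      imbalance += std::abs(ideal - counts[p * num_parts + k]) / ideal;
    }
  imbalance /= double(num_parts) * double(num_procs);
  return imbalance > tolerance;   // migrate only above the threshold
}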
3994 
3995 
4005 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4006  typename mj_part_t>
4007 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4008  mj_part_t num_parts,
4009  mj_part_t *part_assignment_proc_begin_indices,
4010  mj_part_t *processor_chains_in_parts,
4011  mj_lno_t *send_count_to_each_proc,
4012  int *coordinate_destinations){
4013 
4014  for (mj_part_t p = 0; p < num_parts; ++p){
4015  mj_lno_t part_begin = 0;
4016  if (p > 0) part_begin = this->new_part_xadj[p - 1];
4017  mj_lno_t part_end = this->new_part_xadj[p];
4018 
4019  //get the first processor to which the current processor will send its points in part p.
4020  mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4021  //initialize how many points I have sent to this processor.
4022  mj_lno_t num_total_send = 0;
4023  for (mj_lno_t j=part_begin; j < part_end; j++){
4024  mj_lno_t local_ind = this->new_coordinate_permutations[j];
4025  while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4026  //then get the next processor to send the points in part p.
4027  num_total_send = 0;
4028  //assign new processor to part_assign_begin[p]
4029  part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4030  //remove the previous processor
4031  processor_chains_in_parts[proc_to_sent] = -1;
4032  //choose the next processor as the next one to send.
4033  proc_to_sent = part_assignment_proc_begin_indices[p];
4034  }
4035  //write the gno index to corresponding position in sendBuf.
4036  coordinate_destinations[local_ind] = proc_to_sent;
4037  ++num_total_send;
4038  }
4039  }
4040 }
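
The head/next arrays above implement a singly linked list of destination processors per part: the head is popped whenever the current processor's quota is filled. The same walk in a self-contained form (hypothetical names; quotas are assumed to sum to the number of points in the part):

#include <vector>

void fill_destinations(int part,
                       std::vector<int> &head,          // first proc per part
                       std::vector<int> &next,          // next-proc chain, -1 ends it
                       const std::vector<int> &quota,   // send count per proc
                       const std::vector<int> &points,  // point ids in this part
                       std::vector<int> &dest)          // out: proc per point
{
  int proc = head[part];
  int sent = 0;
  for (int pt : points) {
    while (sent >= quota[proc]) {   // quota filled: pop to the next proc
      sent = 0;
      head[part] = next[proc];
      next[proc] = -1;
      proc = head[part];
    }
    dest[pt] = proc;
    ++sent;
  }
}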
4041 
4056 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4057  typename mj_part_t>
4058 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4059  mj_gno_t * num_points_in_all_processor_parts,
4060  mj_part_t num_parts,
4061  mj_part_t num_procs,
4062  mj_lno_t *send_count_to_each_proc,
4063  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4064  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4065  mj_part_t &out_part_index,
4066  mj_part_t &output_part_numbering_begin_index,
4067  int *coordinate_destinations){
4068 
4069 
4070  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4071  mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4072 
4073  //boolean flag: set when this process finds the part it is assigned to.
4074  bool did_i_find_my_group = false;
4075 
4076  mj_part_t num_free_procs = num_procs;
4077  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4078 
4079  double max_imbalance_difference = 0;
4080  mj_part_t max_differing_part = 0;
4081 
4082  //find how many processor each part requires.
4083  for (mj_part_t i=0; i < num_parts; i++){
4084 
4085  //fractional (unrounded) number of processors required for this part.
4086  double scalar_required_proc = num_procs *
4087  (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4088 
4089  //round it to closest integer.
4090  mj_part_t required_proc = static_cast<mj_part_t> (0.5 + scalar_required_proc);
4091 
4092  //if assigning the required num procs would create problems for the rest of the parts,
4093  //then only assign {num_free_procs - (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
4094  if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4095  required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4096  }
4097 
4098  //reduce the free processor count
4099  num_free_procs -= required_proc;
4100  //reduce by 1 the minimum processor count required for the rest of the parts.
4101  --minimum_num_procs_required_for_rest_of_parts;
4102 
4103  //part (i) is assigned to (required_proc) processors.
4104  num_procs_assigned_to_each_part[i] = required_proc;
4105 
4106  //because of the rounding, some processors might be left unassigned.
4107  //we want to assign those processors to the part with the most imbalance.
4108  //find the part with the maximum imbalance here.
4109  double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4110  if (imbalance_wrt_ideal > max_imbalance_difference){
4111  max_imbalance_difference = imbalance_wrt_ideal;
4112  max_differing_part = i;
4113  }
4114  }
4115 
4116  //assign the extra processors to the part with the maximum imbalance relative to the ideal.
4117  if (num_free_procs > 0){
4118  num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4119  }
4120 
4121  //now find the best processors for each part, minimizing migration.
4122 
4123  //part_assignment_proc_begin_indices[i] holds the first processor in the
4124  //chain of processors that send their data for part i.
4125  mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4126  //the next processor send is found in processor_chains_in_parts, in linked list manner.
4127  mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4128  mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4129 
4130  //initialize the assignment of each processor.
4131  //this has a linked list implementation.
4132  //the first processor assigned
4133  //to each part is held at part_assignment_proc_begin_indices[part].
4134  //the next processor assigned to that part is located at
4135  //processor_chains_in_parts[part_assignment_proc_begin_indices[part]]; this chain
4136  //continues until the value -1 is reached.
4137  for (int i = 0; i < num_procs; ++i ){
4138  processor_part_assignments[i] = -1;
4139  processor_chains_in_parts[i] = -1;
4140  }
4141  for (int i = 0; i < num_parts; ++i ){
4142  part_assignment_proc_begin_indices[i] = -1;
4143  }
4144 
4145 
4146  //Allocate memory for sorting data structure.
4147  uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4148  for(mj_part_t i = 0; i < num_parts; ++i){
4149  //the algorithm tries to minimize the cost of migration,
4150  //by assigning to each part the processors that already hold the most coordinates of that part.
4151  //here we might want to implement a maximum weighted bipartite matching algorithm.
4152  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4153  sort_item_num_part_points_in_procs[ii].id = ii;
4154  //if processor is not assigned yet.
4155  //add its num points to the sort data structure.
4156  if (processor_part_assignments[ii] == -1){
4157  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4158  sort_item_num_part_points_in_procs[ii].signbit = 1; //indicate that the processor has positive weight.
4159  }
4160  else {
4161  //if the processor is already assigned, insert -nLocal - 1 so that it won't be selected again.
4162  //it would be the same if we simply set it to -1,
4163  //but this provides more information (which is used later) at no extra cost.
4164  //sort_item_num_part_points_in_procs[ii].val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
4165 
4166  //UPDATE: since the above produces a warning when an unsigned type is used,
4167  //we instead added an extra sign bit to the sort item: 1 for positives, 0 for negatives.
4168  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4169  sort_item_num_part_points_in_procs[ii].signbit = 0;
4170  }
4171  }
4172  //sort the processors in the part.
4173  uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4174 
4175  /*
4176  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4177  std::cout << "ii:" << ii << " " << sort_item_num_part_points_in_procs[ii].id <<
4178  " " << sort_item_num_part_points_in_procs[ii].val <<
4179  " " << int(sort_item_num_part_points_in_procs[ii].signbit) << std::endl;
4180  }
4181  */
4182 
4183  mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4184  mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4185  mj_gno_t ideal_num_points_in_a_proc =
4186  Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part / double (required_proc_count)));
4187 
4188  //start sending to the least loaded of the assigned processors.
4189  mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4190  mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4191  mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4192 
4193  //find the processors that will be assigned to this part, which are the heaviest
4194  //unassigned processors.
4195  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4196  mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4197  //assign processor to part - i.
4198  processor_part_assignments[proc_id] = i;
4199  }
4200 
4201  bool did_change_sign = false;
4202  //if a processor has a negative count (sign bit 0), flip it back to positive.
4203  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4204  // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
4205  // TODO: SEE BUG 6194
4206  if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4207  did_change_sign = true;
4208  sort_item_num_part_points_in_procs[ii].signbit = 1;
4209  }
4210  else {
4211  break;
4212  }
4213  }
4214  if(did_change_sign){
4215  //re-sort the remaining processors that are not yet assigned.
4216  uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4217  }
4218  /*
4219  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4220  std::cout << "after resort ii:" << ii << " " << sort_item_num_part_points_in_procs[ii].id <<
4221  " " << sort_item_num_part_points_in_procs[ii].val <<
4222  " " << int(sort_item_num_part_points_in_procs[ii].signbit ) << std::endl;
4223  }
4224  */
4225 
4226  //check if this processor is one of the procs assigned to this part.
4227  //if it is, then get the group.
4228  if (!did_i_find_my_group){
4229  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4230 
4231  mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4232  //add the proc to the group.
4233  processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4234 
4235  if(proc_id_to_assign == this->myRank){
4236  //if the assigned process is me, then I have found my group.
4237  did_i_find_my_group = true;
4238  //set the beginning of part i to my rank.
4239  part_assignment_proc_begin_indices[i] = this->myRank;
4240  processor_chains_in_parts[this->myRank] = -1;
4241 
4242  //set send count to myself to the number of points that I have in part i.
4243  send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4244 
4245  //calculate the shift required for the output_part_numbering_begin_index
4246  for (mj_part_t in = 0; in < i; ++in){
4247  output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4248  }
4249  out_part_index = i;
4250  }
4251  }
4252  //if this was not my group,
4253  //clear the subcommunicator processor array.
4254  if (!did_i_find_my_group){
4255  processor_ranks_for_subcomm.clear();
4256  }
4257  }
4258 
4259  //send the points of the unassigned processors to the assigned processors.
4260  //start from the heaviest unassigned processor.
4261  //TODO we might want to play with this part, as it allows more computational imbalance
4262  //in exchange for better communication balance.
4263  for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4264  mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4265  mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4266 
4267  //we set the number of points to -to_sent - 1 for the assigned processors.
4268  //negative counts should not appear here, as we have already flipped them back above.
4269 #ifdef MJ_DEBUG
4270  if (num_points_to_sent < 0) {
4271  std::cout << "Migration - processor assignments - for part:" << i << " from proc:" << nonassigned_proc_id << " num_points_to_sent:" << num_points_to_sent << std::endl;
4272  exit(1);
4273  }
4274 #endif
4275 
4276  //now send the points to the assigned processors.
4277  while (num_points_to_sent > 0){
4278  //if the processor has enough space.
4279  if (num_points_to_sent <= space_left_in_sent_proc){
4280  //reduce the space left in the processor.
4281  space_left_in_sent_proc -= num_points_to_sent;
4282  //if my rank is the one that is sending the coordinates.
4283  if (this->myRank == nonassigned_proc_id){
4284  //set my sent count to the sent processor.
4285  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4286  //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
4287  //that the processor will send its point in part-i.
4288  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4289  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4290  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4291  }
4292  num_points_to_sent = 0;
4293  }
4294  else {
4295  //there might be no space left in the processor.
4296  if(space_left_in_sent_proc > 0){
4297  num_points_to_sent -= space_left_in_sent_proc;
4298 
4299  //send as the space left in the processor.
4300  if (this->myRank == nonassigned_proc_id){
4301  //send as much as the space in this case.
4302  send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4303  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4304  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4305  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4306 
4307  }
4308  }
4309  //move on to the next processor to send to.
4310  ++next_proc_to_send_index;
4311 
4312 #ifdef MJ_DEBUG
4313  if(next_proc_to_send_index < num_procs - required_proc_count ){
4314  std::cout << "Migration - processor assignments - for part:"
4315  << i
4316  << " next_proc_to_send :" << next_proc_to_send_index
4317  << " num_procs:" << num_procs
4318  << " required_proc_count:" << required_proc_count
4319  << " Error: next_proc_to_send_index < num_procs - required_proc_count" << std::endl;
4320  exit(1);
4321 
4322  }
4323 #endif
4324  //get the id of the next processor to send to.
4325  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4326  //set the new space in the processor.
4327  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4328  }
4329  }
4330  }
4331  }
4332 
4333 
4334 
4335  this->assign_send_destinations(
4336  num_parts,
4337  part_assignment_proc_begin_indices,
4338  processor_chains_in_parts,
4339  send_count_to_each_proc,
4340  coordinate_destinations);
4341 
4342  freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4343  freeArray<mj_part_t>(processor_chains_in_parts);
4344  freeArray<mj_part_t>(processor_part_assignments);
4345  freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4346  freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4347 
4348 }
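
The processor-count rounding at the top of this routine has to stay feasible: after rounding part i's share, at least one processor must remain for each later part, and leftover processors go to the most under-allocated part. A compact sketch of just that computation (hypothetical names):

#include <cstddef>
#include <vector>

std::vector<int> procs_per_part(const std::vector<long> &part_sizes,
                                long total_points, int num_procs)
{
  std::vector<int> assigned(part_sizes.size(), 0);
  int free_procs = num_procs;
  int min_needed_for_rest = (int)part_sizes.size() - 1;
  std::size_t max_part = 0;
  double max_gap = 0;
  for (std::size_t i = 0; i < part_sizes.size(); ++i) {
    double share = num_procs * (double)part_sizes[i] / (double)total_points;
    int rounded = (int)(share + 0.5);               // round to nearest
    if (free_procs - rounded < min_needed_for_rest)
      rounded = free_procs - min_needed_for_rest;   // leave room for the rest
    free_procs -= rounded;
    --min_needed_for_rest;
    assigned[i] = rounded;
    if (rounded > 0) {
      double gap = (share - rounded) / rounded;     // under-allocation ratio
      if (gap > max_gap) { max_gap = gap; max_part = i; }
    }
  }
  assigned[max_part] += free_procs;                 // hand out the leftovers
  return assigned;
}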
4349 
4350 
4363 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4364  typename mj_part_t>
4365 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4366  mj_part_t num_parts,
4367  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
4368  int *coordinate_destinations,
4369  mj_part_t &output_part_numbering_begin_index,
4370  std::vector<mj_part_t> *next_future_num_parts_in_parts){
4371 
4372  mj_part_t part_shift_amount = output_part_numbering_begin_index;
4373  mj_part_t previous_processor = -1;
4374  for(mj_part_t i = 0; i < num_parts; ++i){
4375  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4376  //assigned processors are sorted.
4377  mj_lno_t part_begin_index = 0;
4378  if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4379  mj_lno_t part_end_index = this->new_part_xadj[p];
4380 
4381  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4382  if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4383  output_part_numbering_begin_index = part_shift_amount;
4384  }
4385  previous_processor = assigned_proc;
4386  part_shift_amount += (*next_future_num_parts_in_parts)[p];
4387 
4388  for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4389  mj_lno_t localInd = this->new_coordinate_permutations[j];
4390  coordinate_destinations[localInd] = assigned_proc;
4391  }
4392  }
4393 }
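
Because the (part, processor) pairs arrive sorted by processor, each rank's global numbering offset is simply the running sum of future sub-part counts over all parts listed before its first own part. A sketch of that bookkeeping alone (hypothetical names):

#include <cstddef>
#include <utility>
#include <vector>

int numbering_begin_for_me(
    const std::vector<std::pair<int,int> > &part_proc, // (part, proc), sorted by proc
    const std::vector<int> &future_parts,              // future sub-parts per part
    int my_rank, int begin)
{
  int shift = begin, my_begin = begin, prev_proc = -1;
  for (std::size_t i = 0; i < part_proc.size(); ++i) {
    int part = part_proc[i].first;
    int proc = part_proc[i].second;
    if (proc == my_rank && prev_proc != proc)
      my_begin = shift;            // first of my parts fixes my offset
    prev_proc = proc;
    shift += future_parts[part];   // later parts start after this one
  }
  return my_begin;
}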
4394 
4395 
4412 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4413  typename mj_part_t>
4414 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
4415  mj_gno_t * num_points_in_all_processor_parts,
4416  mj_part_t num_parts,
4417  mj_part_t num_procs,
4418  mj_lno_t *send_count_to_each_proc, //output: sized num_procs, holds the number of points to send to each proc.
4419  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
4420  mj_part_t &out_num_part, //output: how many parts the processor will have; may be more than 1 here.
4421  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
4422  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
4423  int *coordinate_destinations){
4424  out_num_part = 0;
4425 
4426  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4427  out_part_indices.clear();
4428 
4429  //to sort the parts that are assigned to the processors.
4430  //id is the part number; the sort value is the assigned processor id.
4431  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4432  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4433 
4434 
4435  //calculate the optimal number of coordinates that should be assigned to each processor.
4436  mj_lno_t work_each = mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
4437  //holds the remaining space, in coordinate counts, up to the optimal number in each proc.
4438  mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4439  //initialize left space in each.
4440  for (mj_part_t i = 0; i < num_procs; ++i){
4441  space_in_each_processor[i] = work_each;
4442  }
4443 
4444  //we keep track of how many parts each processor is assigned to,
4445  //because for some weird inputs it might be possible that some
4446  //processors are not assigned to any part. Using these variables,
4447  //we force each processor to have at least one part.
4448  mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4449  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
4450  int empty_proc_count = num_procs;
4451 
4452  //to sort the parts in decreasing order of their coordinate counts.
4453  //ids are the part numbers; the sort value is the number of points in each.
4454  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4455 
4456  //initially we will sort the parts according to the number of coordinates they have,
4457  //so that we will start assigning with the part that has the most coordinates.
4458  for (mj_part_t i = 0; i < num_parts; ++i){
4459  sort_item_point_counts_in_parts[i].id = i;
4460  sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4461  }
4462  //sort parts with increasing order of loads.
4463  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4464 
4465 
4466  //assigning parts to the processors:
4467  //traverse the parts in decreasing order of load.
4468  //first assign the heaviest part.
4469  for (mj_part_t j = 0; j < num_parts; ++j){
4470  //sorted in increasing order; traverse in reverse.
4471  mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4472  //load of the part
4473  mj_gno_t load = global_num_points_in_parts[i];
4474 
4475  //assigned processors
4476  mj_part_t assigned_proc = -1;
4477  //fallback: the best candidate processor if none fits.
4478  mj_part_t best_proc_to_assign = 0;
4479 
4480 
4481  //sort processors with increasing number of points in this part.
4482  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4483  sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4484 
4485  //if there are still enough parts to fill the empty processors, then proceed normally.
4486  //but if the empty processor count is equal to the number of remaining parts, then
4487  //we force part assignments to go only to empty processors.
4488  if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4489  //how many points processor ii has in part i?
4490  sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4491  }
4492  else {
4493  sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4494  }
4495  }
4496  uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4497 
4498  //traverse all processors with decreasing load.
4499  for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4500  mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4501  mj_lno_t left_space = space_in_each_processor[ii] - load;
4502  //if there is enough space, assign the part to this processor.
4503  if(left_space >= 0 ){
4504  assigned_proc = ii;
4505  break;
4506  }
4507  //if the space is not enough, keep track of the best candidate processor.
4508  if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
4509  best_proc_to_assign = ii;
4510  }
4511  }
4512 
4513  //if none had enough space, then assign the part to the best candidate processor.
4514  if (assigned_proc == -1){
4515  assigned_proc = best_proc_to_assign;
4516  }
4517 
4518  if (num_parts_proc_assigned[assigned_proc]++ == 0){
4519  --empty_proc_count;
4520  }
4521  space_in_each_processor[assigned_proc] -= load;
4522  //to sort later: part i is assigned to processor assigned_proc.
4523  sort_item_part_to_proc_assignment[j].id = i; //part i
4524  sort_item_part_to_proc_assignment[j].val = assigned_proc; //assigned to processor - assignment.
4525 
4526 
4527  //if the assigned processor is me, record the part.
4528  if (assigned_proc == this->myRank){
4529  out_num_part++;//assigned_part_count;
4530  out_part_indices.push_back(i);
4531  }
4532  //increase the send count to that processor by the number of points in that part,
4533  //as everyone sends their coordinates in this part to the processor assigned to it.
4534  send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
4535  }
4536  freeArray<mj_part_t>(num_parts_proc_assigned);
4537  freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
4538  freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
4539  freeArray<mj_lno_t >(space_in_each_processor);
4540 
4541 
4542  //sort assignments with respect to the assigned processors.
4543  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
4544  //fill sendBuf.
4545 
4546 
4547  this->assign_send_destinations2(
4548  num_parts,
4549  sort_item_part_to_proc_assignment,
4550  coordinate_destinations,
4551  output_part_numbering_begin_index,
4552  next_future_num_parts_in_parts);
4553 
4554  freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
4555 }
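
Stripped of the point-locality tie-breaking, the loop above is greedy first-fit-decreasing bin packing with a least-overflow fallback. A simplified sketch (hypothetical; it drops the per-part point counts and the empty-processor forcing used by the real code):

#include <algorithm>
#include <numeric>
#include <vector>

std::vector<int> parts_to_procs(const std::vector<long> &load, int num_procs)
{
  long total = std::accumulate(load.begin(), load.end(), 0L);
  std::vector<long> space(num_procs, (long)(total / (double)num_procs + 0.5));
  std::vector<int> order(load.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),          // heaviest part first
            [&](int a, int b) { return load[a] > load[b]; });
  std::vector<int> owner(load.size(), -1);
  for (int part : order) {
    int best = 0;
    for (int p = 0; p < num_procs; ++p) {
      if (space[p] >= load[part]) { best = p; break; } // first fit
      if (space[p] > space[best]) best = p;            // least-overflow fallback
    }
    owner[part] = best;
    space[best] -= load[part];
  }
  return owner;
}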
4556 
4557 
4575 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4576  typename mj_part_t>
4577 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
4578  mj_gno_t * num_points_in_all_processor_parts,
4579  mj_part_t num_parts,
4580  mj_part_t num_procs,
4581  mj_lno_t *send_count_to_each_proc,
4582  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4583  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4584  mj_part_t &out_num_part,
4585  std::vector<mj_part_t> &out_part_indices,
4586  mj_part_t &output_part_numbering_begin_index,
4587  int *coordinate_destinations){
4588 
4589 
4590 
4591  processor_ranks_for_subcomm.clear();
4592  // if (this->num_local_coords > 0)
4593  if (num_procs > num_parts){
4594  //if there are more processors than the number of current parts,
4595  //then processors share the existing parts.
4596  //at the end each processor will have a single part,
4597  //but a part may be shared by a group of processors.
4598  mj_part_t out_part_index = 0;
4599  this->mj_assign_proc_to_parts(
4600  num_points_in_all_processor_parts,
4601  num_parts,
4602  num_procs,
4603  send_count_to_each_proc,
4604  processor_ranks_for_subcomm,
4605  next_future_num_parts_in_parts,
4606  out_part_index,
4607  output_part_numbering_begin_index,
4608  coordinate_destinations
4609  );
4610 
4611  out_num_part = 1;
4612  out_part_indices.clear();
4613  out_part_indices.push_back(out_part_index);
4614  }
4615  else {
4616 
4617  //there are more parts than processors,
4618  //therefore a processor will be assigned multiple parts,
4619  //and each subcommunicator will contain only a single processor.
4620  processor_ranks_for_subcomm.push_back(this->myRank);
4621 
4622  //since there are more parts than procs,
4623  //assign multiple parts to processors.
4624  this->mj_assign_parts_to_procs(
4625  num_points_in_all_processor_parts,
4626  num_parts,
4627  num_procs,
4628  send_count_to_each_proc,
4629  next_future_num_parts_in_parts,
4630  out_num_part,
4631  out_part_indices,
4632  output_part_numbering_begin_index,
4633  coordinate_destinations);
4634  }
4635 }
4636 
4649 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4650  typename mj_part_t>
4651 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
4652  mj_part_t num_procs,
4653  mj_lno_t &num_new_local_points,
4654  std::string iteration,
4655  int *coordinate_destinations,
4656  mj_part_t num_parts)
4657 {
4658 #ifdef ENABLE_ZOLTAN_MIGRATION
4659  if (sizeof(mj_lno_t) <= sizeof(int)) {
4660 
4661  // Cannot use Zoltan_Comm with local ordinals larger than ints.
4662  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
4663  // may overflow.
4664 
4665  ZOLTAN_COMM_OBJ *plan = NULL;
4666  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
4667  int num_incoming_gnos = 0;
4668  int message_tag = 7859;
4669 
4670  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
4671  int ierr = Zoltan_Comm_Create(
4672  &plan,
4673  int(this->num_local_coords),
4674  coordinate_destinations,
4675  mpi_comm,
4676  message_tag,
4677  &num_incoming_gnos);
4678  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4679  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
4680 
4681  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
4682  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4683 
4684  //migrate gnos.
4685  message_tag++;
4686  ierr = Zoltan_Comm_Do(
4687  plan,
4688  message_tag,
4689  (char *) this->current_mj_gnos,
4690  sizeof(mj_gno_t),
4691  (char *) incoming_gnos);
4692  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4693 
4694  freeArray<mj_gno_t>(this->current_mj_gnos);
4695  this->current_mj_gnos = incoming_gnos;
4696 
4697 
4698  //migrate coordinates
4699  for (int i = 0; i < this->coord_dim; ++i){
4700  message_tag++;
4701  mj_scalar_t *coord = this->mj_coordinates[i];
4702 
4703  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4704  ierr = Zoltan_Comm_Do(
4705  plan,
4706  message_tag,
4707  (char *) coord,
4708  sizeof(mj_scalar_t),
4709  (char *) this->mj_coordinates[i]);
4710  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4711  freeArray<mj_scalar_t>(coord);
4712  }
4713 
4714  //migrate weights.
4715  for (int i = 0; i < this->num_weights_per_coord; ++i){
4716  message_tag++;
4717  mj_scalar_t *weight = this->mj_weights[i];
4718 
4719  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4720  ierr = Zoltan_Comm_Do(
4721  plan,
4722  message_tag,
4723  (char *) weight,
4724  sizeof(mj_scalar_t),
4725  (char *) this->mj_weights[i]);
4726  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4727  freeArray<mj_scalar_t>(weight);
4728  }
4729 
4730 
4731  //migrate owners.
4732  int *coord_own = allocMemory<int>(num_incoming_gnos);
4733  message_tag++;
4734  ierr = Zoltan_Comm_Do(
4735  plan,
4736  message_tag,
4737  (char *) this->owner_of_coordinate,
4738  sizeof(int), (char *) coord_own);
4739  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4740  freeArray<int>(this->owner_of_coordinate);
4741  this->owner_of_coordinate = coord_own;
4742 
4743 
4744  //if num procs is less than num parts,
4745  //we need the part assignment arrays as well, since
4746  //there will be multiple parts per processor.
4747  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4748  if(num_procs < num_parts){
4749  message_tag++;
4750  ierr = Zoltan_Comm_Do(
4751  plan,
4752  message_tag,
4753  (char *) this->assigned_part_ids,
4754  sizeof(mj_part_t),
4755  (char *) new_parts);
4756  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4757  }
4758  freeArray<mj_part_t>(this->assigned_part_ids);
4759  this->assigned_part_ids = new_parts;
4760 
4761  ierr = Zoltan_Comm_Destroy(&plan);
4762  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4763  num_new_local_points = num_incoming_gnos;
4764  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
4765  }
4766 
4767  else
4768 
4769 #endif // ENABLE_ZOLTAN_MIGRATION
4770  {
4771  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
4772  Tpetra::Distributor distributor(this->comm);
4773  ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
4774  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
4775  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
4776 
4777  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
4778  {
4779  //migrate gnos.
4780  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
4781  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
4782  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
4783  freeArray<mj_gno_t>(this->current_mj_gnos);
4784  this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
4785  memcpy(
4786  this->current_mj_gnos,
4787  received_gnos.getRawPtr(),
4788  num_incoming_gnos * sizeof(mj_gno_t));
4789  }
4790  //migrate coordinates
4791  for (int i = 0; i < this->coord_dim; ++i){
4792 
4793  ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
4794  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
4795  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
4796  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
4797  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4798  memcpy(
4799  this->mj_coordinates[i],
4800  received_coord.getRawPtr(),
4801  num_incoming_gnos * sizeof(mj_scalar_t));
4802  }
4803 
4804  //migrate weights.
4805  for (int i = 0; i < this->num_weights_per_coord; ++i){
4806 
4807  ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
4808  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
4809  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
4810  freeArray<mj_scalar_t>(this->mj_weights[i]);
4811  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4812  memcpy(
4813  this->mj_weights[i],
4814  received_weight.getRawPtr(),
4815  num_incoming_gnos * sizeof(mj_scalar_t));
4816  }
4817 
4818  {
4819  //migrate the owners of the coordinates
4820  ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
4821  ArrayRCP<int> received_owners(num_incoming_gnos);
4822  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
4823  freeArray<int>(this->owner_of_coordinate);
4824  this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
4825  memcpy(
4826  this->owner_of_coordinate,
4827  received_owners.getRawPtr(),
4828  num_incoming_gnos * sizeof(int));
4829  }
4830 
4831  //if num procs is less than num parts,
4832  //we need the part assignment arrays as well, since
4833  //there will be multiple parts per processor.
4834  if(num_procs < num_parts){
4835  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
4836  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
4837  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
4838  freeArray<mj_part_t>(this->assigned_part_ids);
4839  this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
4840  memcpy(
4841  this->assigned_part_ids,
4842  received_partids.getRawPtr(),
4843  num_incoming_gnos * sizeof(mj_part_t));
4844  }
4845  else {
4846  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4847  freeArray<mj_part_t>(this->assigned_part_ids);
4848  this->assigned_part_ids = new_parts;
4849  }
4850  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
4851  num_new_local_points = num_incoming_gnos;
4852 
4853  }
4854 }
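
Each array above is migrated with the same four-step pattern: wrap the local buffer in an ArrayView, receive into an ArrayRCP, free the old buffer, and copy the received data into a fresh allocation. That repetition could be factored as below (a sketch reusing the doPostsAndWaits signature already called above; plain new/delete stand in for allocMemory/freeArray):

#include <algorithm>
#include <Teuchos_ArrayRCP.hpp>
#include <Teuchos_ArrayView.hpp>
#include <Tpetra_Distributor.hpp>

// Migrate one array through an already-created Distributor plan.
template <typename T, typename lno_t>
T *migrate_array(Tpetra::Distributor &distributor,
                 T *local, lno_t num_local, lno_t num_incoming)
{
  Teuchos::ArrayView<T> sent(local, num_local);
  Teuchos::ArrayRCP<T> received(num_incoming);
  distributor.doPostsAndWaits<T>(sent, 1, received());
  T *fresh = new T[num_incoming];
  std::copy(received.getRawPtr(), received.getRawPtr() + num_incoming, fresh);
  delete[] local;
  return fresh;
}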
4855 
4862 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4863  typename mj_part_t>
4864 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
4865  mj_part_t group_size = processor_ranks_for_subcomm.size();
4866  mj_part_t *ids = allocMemory<mj_part_t>(group_size);
4867  for(mj_part_t i = 0; i < group_size; ++i) {
4868  ids[i] = processor_ranks_for_subcomm[i];
4869  }
4870  ArrayView<const mj_part_t> idView(ids, group_size);
4871  this->comm = this->comm->createSubcommunicator(idView);
4872  freeArray<mj_part_t>(ids);
4873 }
4874 
4875 
4881 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4882  typename mj_part_t>
4883 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
4884  mj_part_t output_num_parts,
4885  mj_part_t num_parts){
4886  //if there is a single output part, then simply fill the permutation array.
4887  if (output_num_parts == 1){
4888  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4889  this->new_coordinate_permutations[i] = i;
4890  }
4891  this->new_part_xadj[0] = this->num_local_coords;
4892  }
4893  else {
4894 
4895  //otherwise we need to count how many points there are in each part.
4896  //we allocate num_parts entries here, because the incoming part ids range up to num_parts,
4897  //although there are only output_num_parts different parts.
4898  mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
4899  //part_shifts holds which new part number an old part number corresponds to.
4900  mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
4901 
4902  memset(num_points_in_parts, 0, sizeof(mj_lno_t) * num_parts);
4903 
4904  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4905  mj_part_t ii = this->assigned_part_ids[i];
4906  ++num_points_in_parts[ii];
4907  }
4908 
4909  //write the end points of the parts.
4910  mj_part_t p = 0;
4911  mj_lno_t prev_index = 0;
4912  for(mj_part_t i = 0; i < num_parts; ++i){
4913  if(num_points_in_parts[i] > 0) {
4914  this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
4915  prev_index += num_points_in_parts[i];
4916  part_shifts[i] = p++;
4917  }
4918  }
4919 
4920  //for the remaining (empty) parts, write the last end index as their end point.
4921  mj_part_t assigned_num_parts = p - 1;
4922  for (;p < num_parts; ++p){
4923  this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
4924  }
4925  for(mj_part_t i = 0; i < output_num_parts; ++i){
4926  num_points_in_parts[i] = this->new_part_xadj[i];
4927  }
4928 
4929  //write the permutation array here.
4930  //get the part of coordinate i and shift it to obtain the new part number.
4931  //assign the coordinate to the end of its new part's range.
4932  for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
4933  mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
4934  this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
4935  }
4936 
4937  freeArray<mj_lno_t>(num_points_in_parts);
4938  freeArray<mj_part_t>(part_shifts);
4939  }
4940 }
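// Note: a hypothetical worked example (not part of the original source) of
// the counting sort above. With num_parts = 4, output_num_parts = 3 and
// assigned_part_ids = {1, 0, 1, 3}:
//   num_points_in_parts = {1, 2, 0, 1}
//   new_part_xadj       = {1, 3, 4, 4}  (empty part 2 is skipped; the trailing
//                                        entry repeats the last written end)
//   part_shifts         = {0, 1, ?, 2}  (entry 2 is never read)
// and the reverse traversal yields new_coordinate_permutations = {1, 0, 2, 3}.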
4941 
4942 
4965 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4966  typename mj_part_t>
4967 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
4968  mj_part_t input_num_parts, //current number of parts
4969  mj_part_t &output_num_parts, //output number of parts.
4970  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4971  mj_part_t &output_part_begin_index,
4972  size_t migration_reduce_all_population,
4973  mj_lno_t num_coords_for_last_dim_part,
4974  std::string iteration,
4975  RCP<mj_partBoxVector_t> &input_part_boxes,
4976  RCP<mj_partBoxVector_t> &output_part_boxes
4977 )
4978 {
4979  mj_part_t num_procs = this->comm->getSize();
4980  this->myRank = this->comm->getRank();
4981 
4982 
4983  //this array holds how many points each processor has in each part.
4984  //to access how many points processor i has on part j,
4985  //num_points_in_all_processor_parts[i * num_parts + j]
4986  mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
4987 
4988  //get the number of coordinates in each part in each processor.
4989  this->get_processor_num_points_in_parts(
4990  num_procs,
4991  input_num_parts,
4992  num_points_in_all_processor_parts);
4993 
4994 
4995  //check if migration will be performed or not.
4996  if (!this->mj_check_to_migrate(
4997  migration_reduce_all_population,
4998  num_coords_for_last_dim_part,
4999  num_procs,
5000  input_num_parts,
5001  num_points_in_all_processor_parts)){
5002  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5003  return false;
5004  }
5005 
5006 
5007  mj_lno_t *send_count_to_each_proc = NULL;
5008  int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5009  send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5010  for (int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5011 
5012  std::vector<mj_part_t> processor_ranks_for_subcomm;
5013  std::vector<mj_part_t> out_part_indices;
5014 
5015  //determine which processors are assigned to which parts
5016  this->mj_migration_part_proc_assignment(
5017  num_points_in_all_processor_parts,
5018  input_num_parts,
5019  num_procs,
5020  send_count_to_each_proc,
5021  processor_ranks_for_subcomm,
5022  next_future_num_parts_in_parts,
5023  output_num_parts,
5024  out_part_indices,
5025  output_part_begin_index,
5026  coordinate_destinations);
5027 
5028 
5029 
5030 
5031  freeArray<mj_lno_t>(send_count_to_each_proc);
5032  std::vector <mj_part_t> tmpv;
5033 
5034  std::sort (out_part_indices.begin(), out_part_indices.end());
5035  mj_part_t outP = out_part_indices.size();
5036 
5037  mj_gno_t new_global_num_points = 0;
5038  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5039 
5040  if (this->mj_keep_part_boxes){
5041  input_part_boxes->clear();
5042  }
5043 
5044  //now we calculate the new values for next_future_num_parts_in_parts.
5045  //same for the part boxes.
5046  for (mj_part_t i = 0; i < outP; ++i){
5047  mj_part_t ind = out_part_indices[i];
5048  new_global_num_points += global_num_points_in_parts[ind];
5049  tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5050  if (this->mj_keep_part_boxes){
5051  input_part_boxes->push_back((*output_part_boxes)[ind]);
5052  }
5053  }
5054  //swap the input and output part boxes.
5055  if (this->mj_keep_part_boxes){
5056  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5057  input_part_boxes = output_part_boxes;
5058  output_part_boxes = tmpPartBoxes;
5059  }
5060  next_future_num_parts_in_parts->clear();
5061  for (mj_part_t i = 0; i < outP; ++i){
5062  mj_part_t p = tmpv[i];
5063  next_future_num_parts_in_parts->push_back(p);
5064  }
5065 
5066  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5067 
5068  mj_lno_t num_new_local_points = 0;
5069 
5070 
5071  //perform the actual migration operation here.
5072  this->mj_migrate_coords(
5073  num_procs,
5074  num_new_local_points,
5075  iteration,
5076  coordinate_destinations,
5077  input_num_parts);
5078 
5079 
5080  freeArray<int>(coordinate_destinations);
5081 
5082  if(this->num_local_coords != num_new_local_points){
5083  freeArray<mj_lno_t>(this->new_coordinate_permutations);
5084  freeArray<mj_lno_t>(this->coordinate_permutations);
5085 
5086  this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5087  this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5088  }
5089  this->num_local_coords = num_new_local_points;
5090  this->num_global_coords = new_global_num_points;
5091 
5092 
5093 
5094  //create subcommunicator.
5095  this->create_sub_communicator(processor_ranks_for_subcomm);
5096  processor_ranks_for_subcomm.clear();
5097 
5098  //fill the new permutation arrays.
5099  this->fill_permutation_array(
5100  output_num_parts,
5101  input_num_parts);
5102  return true;
5103 }
5104 
5105 
5119 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5120  typename mj_part_t>
5121 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5122  mj_part_t num_parts,
5123  mj_scalar_t *mj_current_dim_coords,
5124  mj_scalar_t *current_concurrent_cut_coordinate,
5125  mj_lno_t coordinate_begin,
5126  mj_lno_t coordinate_end,
5127  mj_scalar_t *used_local_cut_line_weight_to_left,
5128  mj_lno_t *out_part_xadj,
5129  int coordInd){
5130 
5131  //mj_lno_t numCoordsInPart = coordinateEnd - coordinateBegin;
5132  mj_part_t no_cuts = num_parts - 1;
5133 
5134 
5135 
5136  int me = 0;
5137  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5138  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5139 
5140 
5141  //now, if rectilinear partitioning is allowed, we decide how
5142  //much weight each thread should put to the left and to the right.
5143  if (this->distribute_points_on_cut_lines){
5144 
5145  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5146  for (mj_part_t i = 0; i < no_cuts; ++i){
5147  //the weight to be put on the left of the cut.
5148  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5149  //cout << "i:" << i << " left_weight:" << left_weight << endl;
5150  for(int ii = 0; ii < this->num_threads; ++ii){
5151  if(left_weight > this->sEpsilon){
5152  //the weight of thread ii on cut.
5153  mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5154  if(thread_ii_weight_on_cut < left_weight){
5155  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5156  }
5157  else {
5158  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5159  }
5160  left_weight -= thread_ii_weight_on_cut;
5161  }
5162  else {
5163  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5164  }
5165  }
5166  }
5167 
5168  if(no_cuts > 0){
5169  //this is a special case. If cutlines share the same coordinate, their weights are equal.
5170  //we need to adjust the ratio for that.
5171  for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5172  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5173  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5174  }
5175  my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
5176  / mj_scalar_t(SIGNIFICANCE_MUL);
5177  }
5178  }
5179  }
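// Note: a hypothetical numeric illustration (not part of the original source)
// of the truncation above. With LEAST_SIGNIFICANCE = 0.0001 and
// SIGNIFICANCE_MUL = 1000, a weight of 1.23456 becomes
//   int((1.23456 + 0.0001) * 1000) / mj_scalar_t(1000) = 1234 / 1000.0 = 1.234
// i.e. weights are truncated to three decimal digits, apparently so that
// coincident cuts end up with identical, comparable weights despite
// floating-point noise.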
5180 
5181  for(mj_part_t ii = 0; ii < num_parts; ++ii){
5182  thread_num_points_in_parts[ii] = 0;
5183  }
5184 
5185  //for this specific case we don't want to distribute the points along the cut position
5186  //randomly, as we need a specific ordering of them. Instead,
5187  //we put the coordinates into a sort item, where we sort them
5188  //using the coordinates of the points in the other dimensions and their index.
5189 
5190 
5191  //some of the cuts might share the same position.
5192  //in this case, if cut i and cut j share the same position
5193  //cut_map[i] = cut_map[j] = sort item index.
5194  mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5195 
5196 
5197  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5198  typedef std::vector< multiSItem > multiSVector;
5199  typedef std::vector<multiSVector> multiS2Vector;
5200 
5201  //to keep track of the memory allocated.
5202  std::vector<mj_scalar_t *>allocated_memory;
5203 
5204  //vector for which the coordinates will be sorted.
5205  multiS2Vector sort_vector_points_on_cut;
5206 
5207  //the number of cuts that have different coordinates.
5208  mj_part_t different_cut_count = 1;
5209  cut_map[0] = 0;
5210 
5211  //now we insert 1 sort vector for all cuts on different
5212  //positions. If multiple cuts are on the same position, they share sort vectors.
5213  multiSVector tmpMultiSVector;
5214  sort_vector_points_on_cut.push_back(tmpMultiSVector);
5215 
5216  for (mj_part_t i = 1; i < no_cuts ; ++i){
5217  //if cuts share the same cut coordinates
5218  //set the cutmap accordingly.
5219  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5220  cut_map[i] = cut_map[i-1];
5221  }
5222  else {
5223  cut_map[i] = different_cut_count++;
5224  multiSVector tmp2MultiSVector;
5225  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5226  }
5227  }
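// Note: a hypothetical example (not part of the original source) of the
// mapping built above. With cut coordinates {1.0, 1.0, 2.5}, cuts 0 and 1
// coincide within sEpsilon, so cut_map = {0, 0, 1} and different_cut_count = 2;
// the two coincident cuts share a single sort vector.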
5228 
5229 
5230  //now the actual part assignment.
5231  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5232 
5233  mj_lno_t i = this->coordinate_permutations[ii];
5234 
5235  mj_part_t pp = this->assigned_part_ids[i];
5236  mj_part_t p = pp / 2;
5237  //if the coordinate is on a cut.
5238  if(pp % 2 == 1 ){
5239  mj_scalar_t *vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5240  allocated_memory.push_back(vals);
5241 
5242  //we insert the coordinates into the sort item here.
5243  int val_ind = 0;
5244  for(int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5245  vals[val_ind++] = this->mj_coordinates[dim][i];
5246  }
5247  for(int dim = 0; dim < coordInd; ++dim){
5248  vals[val_ind++] = this->mj_coordinates[dim][i];
5249  }
5250  multiSItem tempSortItem(i, this->coord_dim -1, vals);
5251  //insert the point into the sort vector pointed to by cut_map[p].
5252  mj_part_t cmap = cut_map[p];
5253  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5254  }
5255  else {
5256  //if it is not on a cut, simply assign it to its part; no sorting is needed.
5257  ++thread_num_points_in_parts[p];
5258  this->assigned_part_ids[i] = p;
5259  }
5260  }
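// Note (editorial, inferred from the encoding above rather than stated in the
// original source): assigned_part_ids appears to hold 2*p for a point strictly
// inside part p and 2*p + 1 for a point lying on cut p, so that pp / 2
// recovers the part/cut index and pp % 2 tests "is on a cut".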
5261 
5262  //sort all the sort vectors.
5263  for (mj_part_t i = 0; i < different_cut_count; ++i){
5264  std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5265  }
5266 
5267  //we do the part assignment for the points on cuts here.
5268  mj_part_t previous_cut_map = cut_map[0];
5269 
5270  //this is how much of the current part's weight the previous part has taken.
5271  //when the target part weight is 1.6 and the part on the left is given 2,
5272  //the left has an extra 0.4, while the right is missing 0.4 from the previous cut.
5273  //this parameter is used to balance these issues.
5274  //in the above example weight_stolen_from_previous_part will be 0.4.
5275  //if the left part target is 2.2 but it is given 2,
5276  //then weight_stolen_from_previous_part will be -0.2.
5277  mj_scalar_t weight_stolen_from_previous_part = 0;
5278  for (mj_part_t p = 0; p < no_cuts; ++p){
5279 
5280  mj_part_t mapped_cut = cut_map[p];
5281 
5282  //if the previous cut map is done and does not have the same index,
5283  //then assign all points left on that cut to the part on its right.
5284  if (previous_cut_map != mapped_cut){
5285  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5286  for (; sort_vector_end >= 0; --sort_vector_end){
5287  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5288  mj_lno_t i = t.index;
5289  ++thread_num_points_in_parts[p];
5290  this->assigned_part_ids[i] = p;
5291  }
5292  sort_vector_points_on_cut[previous_cut_map].clear();
5293  }
5294  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5295 
5296  for (; sort_vector_end >= 0; --sort_vector_end){
5297  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5298  mj_lno_t i = t.index;
5299  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5300 
5301 
5302  //if part p has enough space for point i, then assign point i to part p.
5303  if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5304  my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5305  > this->sEpsilon){
5306 
5307  my_local_thread_cut_weights_to_put_left[p] -= w;
5308  sort_vector_points_on_cut[mapped_cut].pop_back();
5309  ++thread_num_points_in_parts[p];
5310  this->assigned_part_ids[i] = p;
5311  //if putting this weight to the left fills up the left part, then
5312  //increase the space for the next cut using weight_stolen_from_previous_part.
5313  if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5314  if(mapped_cut == cut_map[p + 1] ){
5315  //if the cut before the cut indexed at p was at a different position,
5316  //this is a special case; the stolen weight is reset rather than accumulated.
5317  if (previous_cut_map != mapped_cut){
5318  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5319  }
5320  else {
5321  //if the cut before the cut indexed at p was also at the same position
5322  //we assign extra weights cumulatively in this case.
5323  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5324  }
5325  }
5326  else{
5327  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5328  }
5329  //end assignment for part p
5330  break;
5331  }
5332  } else {
5333  //if part p does not have enough space for this point,
5334  //and if there is another cut sharing the same position,
5335  //again increase the space for the next cut.
5336  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5337  if (previous_cut_map != mapped_cut){
5338  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5339  }
5340  else {
5341  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5342  }
5343  }
5344  else{
5345  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5346  }
5347  //end assignment for part p
5348  break;
5349  }
5350  }
5351  previous_cut_map = mapped_cut;
5352  }
5353  //put everything left on the last cut to the last part.
5354  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5355  for (; sort_vector_end >= 0; --sort_vector_end){
5356  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5357  mj_lno_t i = t.index;
5358  ++thread_num_points_in_parts[no_cuts];
5359  this->assigned_part_ids[i] = no_cuts;
5360  }
5361  sort_vector_points_on_cut[previous_cut_map].clear();
5362  freeArray<mj_part_t> (cut_map);
5363 
5364  //free the memory allocated for the vertex sort items.
5365  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5366  for(mj_lno_t i = 0; i < vSize; ++i){
5367  freeArray<mj_scalar_t> (allocated_memory[i]);
5368  }
5369 
5370  //creation of part_xadj as in the usual case.
5371  for(mj_part_t j = 0; j < num_parts; ++j){
5372  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5373  for (int i = 0; i < this->num_threads; ++i){
5374  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5375  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5376  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5377 
5378  }
5379  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
5380  }
5381 
5382  //perform prefix sum for num_points in parts.
5383  for(mj_part_t j = 1; j < num_parts; ++j){
5384  out_part_xadj[j] += out_part_xadj[j - 1];
5385  }
5386 
5387 
5388  //shift the per-thread point counts to obtain the
5389  //beginning index of each thread's private space.
5390  for(mj_part_t j = 1; j < num_parts; ++j){
5391  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5392  }
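// Note: a hypothetical single-thread example (not part of the original
// source) of the offset computation above. With num_parts = 2 and
// thread_point_counts[0] = {3, 1}:
//   after the per-part scan:  thread offsets {0, 0}, out_part_xadj = {3, 1}
//   after the prefix sum:     out_part_xadj = {3, 4}
//   after the shift:          thread offsets {0, 3}
// so part 0 is written at positions 0..2 and part 1 at position 3 of
// new_coordinate_permutations (relative to coordinate_begin).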
5393 
5394  //now each thread takes a coordinate and writes the coordinate's index to the permutation array
5395  //using the part index we calculated.
5396  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5397  mj_lno_t i = this->coordinate_permutations[ii];
5398  mj_part_t p = this->assigned_part_ids[i];
5399  this->new_coordinate_permutations[coordinate_begin +
5400  thread_num_points_in_parts[p]++] = i;
5401  }
5402 }
5403 
5404 
5405 
5415 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5416  typename mj_part_t>
5417 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
5418  mj_part_t current_num_parts,
5419  mj_part_t output_part_begin_index,
5420  RCP<mj_partBoxVector_t> &output_part_boxes,
5421  bool is_data_ever_migrated)
5422 {
5423  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Part_Assignment");
5424 
5425 #ifdef HAVE_ZOLTAN2_OMP
5426 #pragma omp parallel for
5427 #endif
5428  for(mj_part_t i = 0; i < current_num_parts;++i){
5429 
5430  mj_lno_t begin = 0;
5431  mj_lno_t end = this->part_xadj[i];
5432 
5433  if(i > 0) begin = this->part_xadj[i -1];
5434  mj_part_t part_to_set_index = i + output_part_begin_index;
5435  if (this->mj_keep_part_boxes){
5436  (*output_part_boxes)[i].setpId(part_to_set_index);
5437  }
5438  for (mj_lno_t ii = begin; ii < end; ++ii){
5439  mj_lno_t k = this->coordinate_permutations[ii];
5440  this->assigned_part_ids[k] = part_to_set_index;
5441  }
5442  }
5443 
5444  //ArrayRCP<const mj_gno_t> gnoList;
5445  if(!is_data_ever_migrated){
5446  //freeArray<mj_gno_t>(this->current_mj_gnos);
5447  //if(this->num_local_coords > 0){
5448  // gnoList = arcpFromArrayView(this->mj_gnos);
5449  //}
5450  }
5451  else {
5452 #ifdef ENABLE_ZOLTAN_MIGRATION
5453  if (sizeof(mj_lno_t) <= sizeof(int)) {
5454 
5455  // Cannot use Zoltan_Comm with local ordinals larger than ints.
5456  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
5457  // may overflow.
5458 
5459  //if data is migrated, then send part numbers to the original owners.
5460  ZOLTAN_COMM_OBJ *plan = NULL;
5461  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5462 
5463  int incoming = 0;
5464  int message_tag = 7856;
5465 
5466  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating");
5467  int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
5468  this->owner_of_coordinate, mpi_comm, message_tag,
5469  &incoming);
5470  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5471  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating" );
5472 
5473  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5474 
5475  message_tag++;
5476  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
5477  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->current_mj_gnos,
5478  sizeof(mj_gno_t), (char *) incoming_gnos);
5479  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5480 
5481  freeArray<mj_gno_t>(this->current_mj_gnos);
5482  this->current_mj_gnos = incoming_gnos;
5483 
5484  mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
5485 
5486  message_tag++;
5487  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->assigned_part_ids,
5488  sizeof(mj_part_t), (char *) incoming_partIds);
5489  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5490  freeArray<mj_part_t>(this->assigned_part_ids);
5491  this->assigned_part_ids = incoming_partIds;
5492 
5493  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
5494  ierr = Zoltan_Comm_Destroy(&plan);
5495  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5496 
5497  this->num_local_coords = incoming;
5498  //gnoList = arcp(this->current_mj_gnos, 0, this->num_local_coords, true);
5499  }
5500  else
5501 
5502 #endif // ENABLE_ZOLTAN_MIGRATION
5503  {
5504  //if data is migrated, then send part numbers to the original owners.
5505  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating");
5506  Tpetra::Distributor distributor(this->mj_problemComm);
5507  ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
5508  mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
5509  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating" );
5510 
5511  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
5512  //migrate gnos to actual owners.
5513  ArrayRCP<mj_gno_t> received_gnos(incoming);
5514  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5515  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5516  freeArray<mj_gno_t>(this->current_mj_gnos);
5517  this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
5518  memcpy( this->current_mj_gnos,
5519  received_gnos.getRawPtr(),
5520  incoming * sizeof(mj_gno_t));
5521 
5522  //migrate part ids to actual owners.
5523  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5524  ArrayRCP<mj_part_t> received_partids(incoming);
5525  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5526  freeArray<mj_part_t>(this->assigned_part_ids);
5527  this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
5528  memcpy( this->assigned_part_ids,
5529  received_partids.getRawPtr(),
5530  incoming * sizeof(mj_part_t));
5531 
5532  this->num_local_coords = incoming;
5533  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
5534 
5535  }
5536  }
5537 
5538  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Part_Assignment");
5539 
5540  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
5541 
5542  //ArrayRCP<mj_part_t> partId;
5543  //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
5544 
5545  if (this->mj_keep_part_boxes){
5546  this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
5547 
5548  }
5549 
5550  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
5551 }
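// Note: a minimal standalone sketch (editorial, not part of the original
// source) of the Tpetra::Distributor pattern used above; "procIds" and
// "sendBuf" are hypothetical names. Each local entry names its destination
// rank, and one packet per entry is exchanged:
//
//   Tpetra::Distributor dist(comm);               // comm: RCP<const Comm<int> >
//   size_t nRecv = dist.createFromSends(procIds); // procIds: ArrayView<const int>
//   Teuchos::ArrayRCP<long long> recvBuf(nRecv);
//   dist.doPostsAndWaits<long long>(sendBuf, 1, recvBuf());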
5552 
5555 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5556  typename mj_part_t>
5557 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
5558  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Free");
5559 
5560  for (int i=0; i < this->coord_dim; i++){
5561  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5562  }
5563  freeArray<mj_scalar_t *>(this->mj_coordinates);
5564 
5565  for (int i=0; i < this->num_weights_per_coord; i++){
5566  freeArray<mj_scalar_t>(this->mj_weights[i]);
5567  }
5568  freeArray<mj_scalar_t *>(this->mj_weights);
5569 
5570  freeArray<int>(this->owner_of_coordinate);
5571 
5572  for(int i = 0; i < this->num_threads; ++i){
5573  freeArray<mj_lno_t>(this->thread_point_counts[i]);
5574  }
5575 
5576  freeArray<mj_lno_t *>(this->thread_point_counts);
5577  freeArray<double *> (this->thread_part_weight_work);
5578 
5579  if(this->distribute_points_on_cut_lines){
5580  freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5581  for(int i = 0; i < this->num_threads; ++i){
5582  freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5583  }
5584  freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5585  freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5586  freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5587  }
5588 
5589  freeArray<mj_part_t>(this->my_incomplete_cut_count);
5590 
5591  freeArray<mj_scalar_t>(this->max_min_coords);
5592 
5593  freeArray<mj_lno_t>(this->new_part_xadj);
5594 
5595  freeArray<mj_lno_t>(this->coordinate_permutations);
5596 
5597  freeArray<mj_lno_t>(this->new_coordinate_permutations);
5598 
5599  freeArray<mj_scalar_t>(this->all_cut_coordinates);
5600 
5601  freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5602 
5603  freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5604 
5605  freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5606 
5607  freeArray<mj_scalar_t>(this->target_part_weights);
5608 
5609  freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5610 
5611  freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5612 
5613  freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5614  freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5615  freeArray<bool>(this->is_cut_line_determined);
5616  freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5617  freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5618 
5619  for(int i = 0; i < this->num_threads; ++i){
5620  freeArray<double>(this->thread_part_weights[i]);
5621  freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5622  freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5623  }
5624 
5625  freeArray<double *>(this->thread_part_weights);
5626  freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5627  freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5628 
5629  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Free");
5630 }
5631 
5639 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5640  typename mj_part_t>
5641 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_partitioning_parameters(
5642  bool distribute_points_on_cut_lines_,
5643  int max_concurrent_part_calculation_,
5644  int check_migrate_avoid_migration_option_,
5645  mj_scalar_t minimum_migration_imbalance_){
5646  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5647  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5648  this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5649  this->minimum_migration_imbalance = minimum_migration_imbalance_;
5650 
5651 }
5652 
5653 
5682 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5683  typename mj_part_t>
5684 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::multi_jagged_part(
5685 
5686  const RCP<const Environment> &env,
5687  RCP<Comm<int> > &problemComm,
5688 
5689  double imbalance_tolerance_,
5690  size_t num_global_parts_,
5691  mj_part_t *part_no_array_,
5692  int recursion_depth_,
5693 
5694  int coord_dim_,
5695  mj_lno_t num_local_coords_,
5696  mj_gno_t num_global_coords_,
5697  const mj_gno_t *initial_mj_gnos_,
5698  mj_scalar_t **mj_coordinates_,
5699 
5700  int num_weights_per_coord_,
5701  bool *mj_uniform_weights_,
5702  mj_scalar_t **mj_weights_,
5703  bool *mj_uniform_parts_,
5704  mj_scalar_t **mj_part_sizes_,
5705 
5706  mj_part_t *&result_assigned_part_ids_,
5707  mj_gno_t *&result_mj_gnos_
5708 )
5709 {
5710 
5711 #ifdef print_debug
5712  if(comm->getRank() == 0){
5713  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
5714  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
5715  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
5716  }
5717 #endif
5718 
5719  this->mj_env = env;
5720  this->mj_problemComm = problemComm;
5721  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
5722 
5723  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Total");
5724  this->mj_env->debug(3, "In MultiJagged Jagged");
5725 
5726  {
5727  this->imbalance_tolerance = imbalance_tolerance_;
5728  this->num_global_parts = num_global_parts_;
5729  this->part_no_array = part_no_array_;
5730  this->recursion_depth = recursion_depth_;
5731 
5732  this->coord_dim = coord_dim_;
5733  this->num_local_coords = num_local_coords_;
5734  this->num_global_coords = num_global_coords_;
5735  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
5736  this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_; //will copy the memory to this->current_mj_gnos[j].
5737 
5738  this->num_weights_per_coord = num_weights_per_coord_;
5739  this->mj_uniform_weights = mj_uniform_weights_;
5740  this->mj_weights = mj_weights_; //will copy the memory to this->mj_weights
5741  this->mj_uniform_parts = mj_uniform_parts_;
5742  this->mj_part_sizes = mj_part_sizes_;
5743 
5744  this->num_threads = 1;
5745 #ifdef HAVE_ZOLTAN2_OMP
5746 #pragma omp parallel
5747 
5748  {
5749  this->num_threads = omp_get_num_threads();
5750  }
5751 #endif
5752  }
5753  //this->set_input_data();
5754  this->set_part_specifications();
5755 
5756  this->allocate_set_work_memory();
5757 
5758  //We duplicate the comm as we create subcommunicators during migration.
5759  //We keep the problemComm as it is, while comm changes after each migration.
5760  this->comm = this->mj_problemComm->duplicate();
5761 
5762  //initially there is a single partition
5763  mj_part_t current_num_parts = 1;
5764  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
5765 
5766  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
5767 
5768  mj_part_t output_part_begin_index = 0;
5769  mj_part_t future_num_parts = this->total_num_part;
5770  bool is_data_ever_migrated = false;
5771 
5772  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
5773  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
5774  next_future_num_parts_in_parts->push_back(this->num_global_parts);
5775 
5776  RCP<mj_partBoxVector_t> input_part_boxes(new mj_partBoxVector_t(), true) ;
5777  RCP<mj_partBoxVector_t> output_part_boxes(new mj_partBoxVector_t(), true);
5778 
5779  compute_global_box();
5780  if(this->mj_keep_part_boxes){
5781  this->init_part_boxes(output_part_boxes);
5782  }
5783 
5784  for (int i = 0; i < this->recursion_depth; ++i){
5785  //partitioning array. its size equals the current number of parts, and it
5786  //holds how many parts each part will be divided into along the current dimension.
5787  std::vector <mj_part_t> num_partitioning_in_current_dim;
5788 
5789  //number of parts that will be obtained at the end of this partitioning.
5790  //future_num_part_in_parts has the size of the current number of parts, and
5791  //holds how many more parts each part should be divided into in the further
5792  //iterations. this will be used to calculate num_partitioning_in_current_dim,
5793  //the number of parts that each part will be partitioned into
5794  //along the current dimension.
5795 
5796  //next_future_num_parts_in_parts will have the size of the output part count,
5797  //and will hold how many more parts each output part
5798  //should be divided into. this array will also be used to determine the weight ratios
5799  //of the parts.
5800  //swap the arrays to use them iteratively.
5801  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
5802  future_num_part_in_parts = next_future_num_parts_in_parts;
5803  next_future_num_parts_in_parts = tmpPartVect;
5804 
5805  //clear the next_future_num_parts_in_parts array, as
5806  //update_part_num_arrays expects it to be empty.
5807  //it also expects num_partitioning_in_current_dim to be empty.
5808  next_future_num_parts_in_parts->clear();
5809 
5810  if(this->mj_keep_part_boxes){
5811  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5812  input_part_boxes = output_part_boxes;
5813  output_part_boxes = tmpPartBoxes;
5814  output_part_boxes->clear();
5815  }
5816 
5817  //returns the total no. of output parts for this dimension partitioning.
5818  mj_part_t output_part_count_in_dimension =
5819  this->update_part_num_arrays(
5820  num_partitioning_in_current_dim,
5821  future_num_part_in_parts,
5822  next_future_num_parts_in_parts,
5823  future_num_parts,
5824  current_num_parts,
5825  i,
5826  input_part_boxes,
5827  output_part_boxes);
5828 
5829  //if the number of obtained parts is equal to the current number of parts,
5830  //skip this dimension. For example, this happens when 1 appears in the input
5831  //part array, e.g., P=4,5,1,2.
5832  if(output_part_count_in_dimension == current_num_parts) {
5833  //still need to swap the input output arrays.
5834  tmpPartVect= future_num_part_in_parts;
5835  future_num_part_in_parts = next_future_num_parts_in_parts;
5836  next_future_num_parts_in_parts = tmpPartVect;
5837 
5838  if(this->mj_keep_part_boxes){
5839  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5840  input_part_boxes = output_part_boxes;
5841  output_part_boxes = tmpPartBoxes;
5842  }
5843  continue;
5844  }
5845 
5846 
5847  //get the coordinate axis along which the partitioning will be done.
5848  int coordInd = i % this->coord_dim;
5849  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
5850 
5851  //convert i to string to be used for debugging purposes.
5852  std::string istring = toString<int>(i);
5853  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
5854 
5855  //allocate memory to hold the indices
5856  //of the parts in the permutation array.
5857  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
5858 
5859  //the index in new_part_xadj at which the next value will be written.
5860  mj_part_t output_part_index = 0;
5861  //whatever is written at output_part_index will have output_coordinate_end_index added,
5862  //so that the points will be shifted.
5863  mj_part_t output_coordinate_end_index = 0;
5864 
5865  mj_part_t current_work_part = 0;
5866  mj_part_t current_concurrent_num_parts =
5867  std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
5868 
5869  mj_part_t obtained_part_index = 0;
5870 
5871  //run for all available parts.
5872  for (; current_work_part < current_num_parts;
5873  current_work_part += current_concurrent_num_parts){
5874 
5875  current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
5876  this->max_concurrent_part_calculation);
5877 
5878  mj_part_t actual_work_part_count = 0;
5879  //initialization for 1D partitioning.
5880  //get the min and max coordinates of each part
5881  //together with the part weights of each part.
5882  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
5883  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
5884 
5885  //if this part won't be partitioned any further,
5886  //don't do any work for this part.
5887  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
5888  continue;
5889  }
5890  ++actual_work_part_count;
5891  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
5892  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
5893 
5894 /*
5895  cout << "i:" << i << " j:" << current_work_part + kk
5896  << " coordinate_begin_index:" << coordinate_begin_index
5897  << " coordinate_end_index:" << coordinate_end_index
5898  << " total:" << coordinate_end_index - coordinate_begin_index<< endl;
5899  */
5900  this->mj_get_local_min_max_coord_totW(
5901  coordinate_begin_index,
5902  coordinate_end_index,
5903  this->coordinate_permutations,
5904  mj_current_dim_coords,
5905  this->process_local_min_max_coord_total_weight[kk], //min_coordinate
5906  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max_coordinate
5907  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]); //total_weight
5908 
5909  }
5910 
5911  //1D partitioning
5912  if (actual_work_part_count > 0){
5913  //obtain global Min max of the part.
5914  this->mj_get_global_min_max_coord_totW(
5915  current_concurrent_num_parts,
5916  this->process_local_min_max_coord_total_weight,
5917  this->global_min_max_coord_total_weight);
5918 
5919  //represents the total number of cutlines
5920  //whose coordinate should be determined.
5921  mj_part_t total_incomplete_cut_count = 0;
5922 
5923  //Compute weight ratios for parts & cuts:
5924  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
5925  //part0 cut0 part1 cut1 part2 cut2 part3
5926  mj_part_t concurrent_part_cut_shift = 0;
5927  mj_part_t concurrent_part_part_shift = 0;
5928  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
5929  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
5930  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
5931  current_concurrent_num_parts];
5932 
5933  mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
5934  2 * current_concurrent_num_parts];
5935 
5936  mj_part_t concurrent_current_part_index = current_work_part + kk;
5937 
5938  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
5939 
5940  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
5941  mj_scalar_t *current_target_part_weights = this->target_part_weights +
5942  concurrent_part_part_shift;
5943  //shift the usedCutCoordinate array by the number of cuts.
5944  concurrent_part_cut_shift += partition_count - 1;
5945  //shift the partRatio array by the number of parts.
5946  concurrent_part_part_shift += partition_count;
5947 
5948 
5949  //calculate only if part is not empty,
5950  //and part will be further partitioned.
5951  if(partition_count > 1 && min_coordinate <= max_coordinate){
5952 
5953  //increase the number of cuts to be determined by the
5954  //current part's cut line count.
5955  total_incomplete_cut_count += partition_count - 1;
5956  //set the number of cut lines that should be determined
5957  //for this part.
5958  this->my_incomplete_cut_count[kk] = partition_count - 1;
5959 
5960  //get the target weights of the parts.
5961  this->mj_get_initial_cut_coords_target_weights(
5962  min_coordinate,
5963  max_coordinate,
5964  partition_count - 1,
5965  global_total_weight,
5966  usedCutCoordinate,
5967  current_target_part_weights,
5968  future_num_part_in_parts,
5969  next_future_num_parts_in_parts,
5970  concurrent_current_part_index,
5971  obtained_part_index);
5972 
5973  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
5974  mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
5975 
5976  //get the initial estimated part assignments of the
5977  //coordinates.
5978  this->set_initial_coordinate_parts(
5979  max_coordinate,
5980  min_coordinate,
5981  concurrent_current_part_index,
5982  coordinate_begin_index, coordinate_end_index,
5983  this->coordinate_permutations,
5984  mj_current_dim_coords,
5985  this->assigned_part_ids,
5986  partition_count);
5987  }
5988  else {
5989  // e.g., if we have fewer coordinates than parts, we don't need to do the next dim.
5990  this->my_incomplete_cut_count[kk] = 0;
5991  }
5992  obtained_part_index += partition_count;
5993  }
5994 
5995 
5996 
5997  //the imbalance used here is always 0, as it is difficult to
5998  //estimate a meaningful range.
5999  mj_scalar_t used_imbalance = 0;
6000 
6001 
6002  // Determine cut lines for all concurrent parts here.
6003  this->mj_1D_part(
6004  mj_current_dim_coords,
6005  used_imbalance,
6006  current_work_part,
6007  current_concurrent_num_parts,
6008  current_cut_coordinates,
6009  total_incomplete_cut_count,
6010  num_partitioning_in_current_dim);
6011  }
6012 
6013  //create new part chunks
6014  {
6015  mj_part_t output_array_shift = 0;
6016  mj_part_t cut_shift = 0;
6017  size_t tlr_shift = 0;
6018  size_t partweight_array_shift = 0;
6019 
6020  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6021  mj_part_t current_concurrent_work_part = current_work_part + kk;
6022  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6023 
6024  //if the part is empty, skip the part.
6025  if((num_parts != 1 )
6026  &&
6027  this->global_min_max_coord_total_weight[kk] >
6028  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6029 
6030  //we still need to write the begin and end points of the
6031  //empty part. simply set them to zero; the array indices will be shifted later.
6032  for(mj_part_t jj = 0; jj < num_parts; ++jj){
6033  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6034  }
6035  cut_shift += num_parts - 1;
6036  tlr_shift += (4 *(num_parts - 1) + 1);
6037  output_array_shift += num_parts;
6038  partweight_array_shift += (2 * (num_parts - 1) + 1);
6039  continue;
6040  }
6041 
6042  mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6043  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6044  current_concurrent_work_part -1];
6045  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6046  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6047  cut_shift;
6048 
6049  //mj_scalar_t *used_tlr_array = this->total_part_weight_left_right_closests + tlr_shift;
6050 
6051  for(int ii = 0; ii < this->num_threads; ++ii){
6052  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6053  }
6054 
6055  if(num_parts > 1){
6056  if(this->mj_keep_part_boxes){
6057  //if part boxes are to be stored update the boundaries.
6058  for (mj_part_t j = 0; j < num_parts - 1; ++j){
6059  (*output_part_boxes)[output_array_shift + output_part_index +
6060  j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6061  /*update max*/, coordInd);
6062 
6063  (*output_part_boxes)[output_array_shift + output_part_index + j +
6064  1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6065  /*update min*/, coordInd);
6066  }
6067  }
6068 
6069  // Rewrite the indices based on the computed cuts.
6070  this->mj_create_new_partitions(
6071  num_parts,
6072  mj_current_dim_coords,
6073  current_concurrent_cut_coordinate,
6074  coordinate_begin,
6075  coordinate_end,
6076  used_local_cut_line_weight_to_left,
6077  this->thread_part_weight_work,
6078  this->new_part_xadj + output_part_index + output_array_shift
6079  );
6080 
6081  }
6082  else {
6083  //if this part is partitioned into 1 then just copy
6084  //the old values.
6085  mj_lno_t part_size = coordinate_end - coordinate_begin;
6086  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6087  memcpy(
6088  this->new_coordinate_permutations + coordinate_begin,
6089  this->coordinate_permutations + coordinate_begin,
6090  part_size * sizeof(mj_lno_t));
6091  }
6092  cut_shift += num_parts - 1;
6093  tlr_shift += (4 *(num_parts - 1) + 1);
6094  output_array_shift += num_parts;
6095  partweight_array_shift += (2 * (num_parts - 1) + 1);
6096  }
6097 
6098  //shift cut coordinates so that all cut coordinates are stored.
6099  //no shift now because we don't keep the cuts.
6100  //current_cut_coordinates += cut_shift;
6101 
6102  //mj_create_new_partitions partitioned the coordinates and
6103  //wrote the indices as if there were a single part.
6104  //now we need to shift the beginning indices.
6105  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6106  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6107  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6108  //shift it by previousCount
6109  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6110  }
6111  //increase the previous count by current end.
6112  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6113  //advance the current output part index.
6114  output_part_index += num_parts ;
6115  }
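// Note: a hypothetical illustration (not part of the original source) of the
// shift above. If two concurrent work parts wrote local part ends {2, 5} and
// {1, 4} (each as if starting at zero), the second group is shifted by the
// running end 5, giving a combined new_part_xadj of {2, 5, 6, 9}.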
6116  }
6117  }
6118  // end of this partitioning dimension
6119 
6120 
6121  int current_world_size = this->comm->getSize();
6122  long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6123 
6124 
6125  bool is_migrated_in_current_dimension = false;
6126 
6127  //we migrate if there are more partitionings to be done after this step,
6128  //if migration is not forced to be avoided,
6129  //and if the operation is not sequential.
6130  if (future_num_parts > 1 &&
6131  this->check_migrate_avoid_migration_option >= 0 &&
6132  current_world_size > 1){
6133 
6134  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6135  mj_part_t num_parts = output_part_count_in_dimension;
6136  if ( this->mj_perform_migration(
6137  num_parts,
6138  current_num_parts, //output
6139  next_future_num_parts_in_parts, //output
6140  output_part_begin_index,
6141  migration_reduce_all_population,
6142  this->num_local_coords / (future_num_parts * current_num_parts),
6143  istring,
6144  input_part_boxes, output_part_boxes) ) {
6145  is_migrated_in_current_dimension = true;
6146  is_data_ever_migrated = true;
6147  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" +
6148  istring);
6149  //since data is migrated, we reduce the number of reduceAll operations for the last part.
6150  this->total_dim_num_reduce_all /= num_parts;
6151  }
6152  else {
6153  is_migrated_in_current_dimension = false;
6154  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6155  }
6156  }
6157 
6158  //swap the coordinate permutations for the next dimension.
6159  mj_lno_t * tmp = this->coordinate_permutations;
6160  this->coordinate_permutations = this->new_coordinate_permutations;
6161  this->new_coordinate_permutations = tmp;
6162 
6163  if(!is_migrated_in_current_dimension){
6164  this->total_dim_num_reduce_all -= current_num_parts;
6165  current_num_parts = output_part_count_in_dimension;
6166  }
6167  freeArray<mj_lno_t>(this->part_xadj);
6168  this->part_xadj = this->new_part_xadj;
6169 
6170  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
6171  }
6172 
6173  // Partitioning is done
6174  delete future_num_part_in_parts;
6175  delete next_future_num_parts_in_parts;
6176 
6177  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6179 
6180 
6181  //get the final parts of each initial coordinate
6182  //the results will be written to
6183  //this->assigned_part_ids for gnos given in this->current_mj_gnos
6184  this->set_final_parts(
6185  current_num_parts,
6186  output_part_begin_index,
6187  output_part_boxes,
6188  is_data_ever_migrated);
6189 
6190  result_assigned_part_ids_ = this->assigned_part_ids;
6191  result_mj_gnos_ = this->current_mj_gnos;
6192 
6193  this->free_work_memory();
6194  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Total");
6195  this->mj_env->debug(3, "Out of MultiJagged");
6196 
6197 }
6198 
6199 
6203 template <typename Adapter>
6204 class Zoltan2_AlgMJ : public Algorithm<Adapter>
6205 {
6206 private:
6207 
6208 #ifndef DOXYGEN_SHOULD_SKIP_THIS
6209 
6210  typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
6211  typedef typename Adapter::scalar_t mj_scalar_t;
6212  typedef typename Adapter::gno_t mj_gno_t;
6213  typedef typename Adapter::lno_t mj_lno_t;
6214  typedef typename Adapter::node_t mj_node_t;
6215  typedef typename Adapter::part_t mj_part_t;
6216  typedef coordinateModelPartBox<mj_scalar_t, mj_part_t> mj_partBox_t;
6217  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6218 #endif
6219  AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t> mj_partitioner;
6220 
6221  RCP<const Environment> mj_env; //the environment object
6222  RCP<Comm<int> > mj_problemComm; //initial comm object
6223  RCP<const coordinateModel_t> mj_coords; //coordinate adapter
6224 
6225  //PARAMETERS
6226  double imbalance_tolerance; //input imbalance tolerance.
6227  size_t num_global_parts; //the targeted number of parts
6228  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
6229  int recursion_depth; //the number of steps that partitioning will be solved in.
6230 
6231  int coord_dim; // coordinate dimension.
6232  mj_lno_t num_local_coords; //number of local coords.
6233  mj_gno_t num_global_coords; //number of global coords.
6234  const mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
6235  mj_scalar_t **mj_coordinates; //two dimension coordinate array
6236 
6237  int num_weights_per_coord; // number of weights per coordinate
6238  bool *mj_uniform_weights; //if the coordinates have uniform weights.
6239  mj_scalar_t **mj_weights; //two dimensional weight array
6240  bool *mj_uniform_parts; //if the target parts are uniform
6241  mj_scalar_t **mj_part_sizes; //target part weight sizes.
6242 
6243  bool distribute_points_on_cut_lines; //if partitioning can distribute points on the same coordinate to different parts.
6244  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
6245  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
6246  mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
6247  int mj_keep_part_boxes; //if the boxes need to be kept.
6248 
6249  int num_threads;
6250 
6251  int mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
6252 
6253  ArrayRCP<mj_part_t> comXAdj_; //communication graph xadj
6254  ArrayRCP<mj_part_t> comAdj_; //communication graph adj.
6255 
6256 
6257  //when we have strided data, it returns unstrided data in RCP form.
6258  //we need to hold on to that data during the execution of MJ, so that the data is not released.
6259  //coordinate_ArrayRCP_holder will hold that data, and release it when MJ is deleted.
6260  ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6261 
6262  void set_up_partitioning_data(
6263  const RCP<PartitioningSolution<Adapter> >&solution);
6264 
6265  void set_input_parameters(const Teuchos::ParameterList &p);
6266 
6267  void free_work_memory();
6268 
6269  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
6270 
6271 public:
6272 
6273  Zoltan2_AlgMJ(const RCP<const Environment> &env,
6274  RCP<Comm<int> > &problemComm,
6275  const RCP<const coordinateModel_t> &coords) :
6276  mj_partitioner(), mj_env(env),
6277  mj_problemComm(problemComm),
6278  mj_coords(coords),
6279  imbalance_tolerance(0),
6280  num_global_parts(1), part_no_array(NULL),
6281  recursion_depth(0),
6282  coord_dim(0),num_local_coords(0), num_global_coords(0),
6283  initial_mj_gnos(NULL), mj_coordinates(NULL),
6284  num_weights_per_coord(0),
6285  mj_uniform_weights(NULL), mj_weights(NULL),
6286  mj_uniform_parts(NULL),
6287  mj_part_sizes(NULL),
6288  distribute_points_on_cut_lines(true),
6289  max_concurrent_part_calculation(1),
6290  check_migrate_avoid_migration_option(0),
6291  minimum_migration_imbalance(0.30),
6292  mj_keep_part_boxes(0), num_threads(1), mj_run_as_rcb(0),
6293  comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
6294  {}
6295  ~Zoltan2_AlgMJ(){
6296  if (coordinate_ArrayRCP_holder != NULL){
6297  delete [] this->coordinate_ArrayRCP_holder;
6298  this->coordinate_ArrayRCP_holder = NULL;
6299  }
6300  }
6301 
6308  void partition(const RCP<PartitioningSolution<Adapter> > &solution);
6309 
6310  mj_partBoxVector_t &getPartBoxesView() const
6311  {
6312  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6313  return *pBoxes;
6314  }
6315 
6316  mj_part_t pointAssign(int dim, mj_scalar_t *point) const;
6317 
6318  void boxAssign(int dim, mj_scalar_t *lower, mj_scalar_t *upper,
6319  size_t &nPartsFound, mj_part_t **partsFound) const;
6320 
6321 
6324  void getCommunicationGraph(
6325  const PartitioningSolution<Adapter> *solution,
6326  ArrayRCP<mj_part_t> &comXAdj,
6327  ArrayRCP<mj_part_t> &comAdj);
6328 };
6329 
6330 
6340 template <typename Adapter>
6341 void Zoltan2_AlgMJ<Adapter>::partition(
6342  const RCP<PartitioningSolution<Adapter> > &solution
6343 )
6344 {
6345  this->set_up_partitioning_data(solution);
6346  this->set_input_parameters(this->mj_env->getParameters());
6347  if (this->mj_keep_part_boxes){
6348  this->mj_partitioner.set_to_keep_part_boxes();
6349  }
6350  this->mj_partitioner.set_partitioning_parameters(
6351  this->distribute_points_on_cut_lines,
6352  this->max_concurrent_part_calculation,
6353  this->check_migrate_avoid_migration_option,
6354  this->minimum_migration_imbalance);
6355 
6356  mj_part_t *result_assigned_part_ids = NULL;
6357  mj_gno_t *result_mj_gnos = NULL;
6358  this->mj_partitioner.multi_jagged_part(
6359  this->mj_env,
6360  this->mj_problemComm,
6361 
6362  this->imbalance_tolerance,
6363  this->num_global_parts,
6364  this->part_no_array,
6365  this->recursion_depth,
6366 
6367  this->coord_dim,
6368  this->num_local_coords,
6369  this->num_global_coords,
6370  this->initial_mj_gnos,
6371  this->mj_coordinates,
6372 
6373  this->num_weights_per_coord,
6374  this->mj_uniform_weights,
6375  this->mj_weights,
6376  this->mj_uniform_parts,
6377  this->mj_part_sizes,
6378 
6379  result_assigned_part_ids,
6380  result_mj_gnos
6381  );
6382 
6383  // Reorder results so that they match the order of the input
6384 #if defined(__cplusplus) && __cplusplus >= 201103L
6385  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
6386  localGidToLid.reserve(this->num_local_coords);
6387  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6388  localGidToLid[this->initial_mj_gnos[i]] = i;
6389 
6390  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[this->num_local_coords],
6391  0, this->num_local_coords, true);
6392 
6393  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6394  mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
6395  partId[origLID] = result_assigned_part_ids[i];
6396  }
6397 
6398 #else
6399  Teuchos::Hashtable<mj_gno_t, mj_lno_t>
6400  localGidToLid(this->num_local_coords);
6401  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6402  localGidToLid.put(this->initial_mj_gnos[i], i);
6403 
6404  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[this->num_local_coords],
6405  0, this->num_local_coords, true);
6406 
6407  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6408  mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
6409  partId[origLID] = result_assigned_part_ids[i];
6410  }
6411 
6412 #endif // C++11 is enabled
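// Note: a hypothetical example (not part of the original source) of the
// reordering above. If initial_mj_gnos = {10, 11, 12} and MJ returns
// result_mj_gnos = {12, 10, 11} with result_assigned_part_ids = {0, 1, 1},
// then localGidToLid maps 12->2, 10->0 and 11->1, and the loop stores
// partId = {1, 1, 0}, i.e. part ids in the original input order.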
6413 
6414  delete [] result_mj_gnos;
6415  delete [] result_assigned_part_ids;
6416 
6417  solution->setParts(partId);
6418  this->free_work_memory();
6419 }
6420 
6421 /* \brief Frees the memory allocated for partitioning.
6422  * */
6423 template <typename Adapter>
6424 void Zoltan2_AlgMJ<Adapter>::free_work_memory(){
6425  freeArray<mj_scalar_t *>(this->mj_coordinates);
6426  freeArray<mj_scalar_t *>(this->mj_weights);
6427  freeArray<bool>(this->mj_uniform_parts);
6428  freeArray<mj_scalar_t *>(this->mj_part_sizes);
6429  freeArray<bool>(this->mj_uniform_weights);
6430 
6431 }
6432 
6433 /* \brief Sets the partitioning data for multijagged algorithm.
6434  * */
6435 template <typename Adapter>
6436 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
6437  const RCP<PartitioningSolution<Adapter> > &solution
6438 )
6439 {
6440  this->coord_dim = this->mj_coords->getCoordinateDim();
6441  this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
6442  this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
6443  this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
6444  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
6445 
6446  // From the Solution we get part information.
6447  // If the part sizes for a given criterion are not uniform,
6448  // then they are values that sum to 1.0.
6449  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
6450  //allocate only the two-dimensional pointer array.
6451  //raw pointer addresses will be obtained from the multivector.
6452  this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
6453  this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
6454 
6455  //if the partitioning results are to be uniform.
6456  this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
6457  //if uniform parts is false in a criteria dimension, this holds the ratios of
6458  //the target part weights.
6459  this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
6460  //if the weights of coordinates are uniform in a criteria dimension.
6461  this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
6462 
6463  typedef StridedData<mj_lno_t, mj_scalar_t> input_t;
6464  ArrayView<const mj_gno_t> gnos;
6465  ArrayView<input_t> xyz;
6466  ArrayView<input_t> wgts;
6467 
6468 
6469  this->coordinate_ArrayRCP_holder = new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
6470 
6471  this->mj_coords->getCoordinates(gnos, xyz, wgts);
6472  //obtain global ids.
6473  ArrayView<const mj_gno_t> mj_gnos = gnos;
6474  this->initial_mj_gnos = mj_gnos.getRawPtr();
6475 
6476  //extract coordinates from multivector.
6477  for (int dim=0; dim < this->coord_dim; dim++){
6478  ArrayRCP<const mj_scalar_t> ar;
6479  xyz[dim].getInputArray(ar);
6480  this->coordinate_ArrayRCP_holder[dim] = ar;
6481 
6482  //multiJagged coordinate values assignment
6483  this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
6484  }
6485 
6486  //if no weights are provided, set uniform weights.
6487  if (this->num_weights_per_coord == 0){
6488  this->mj_uniform_weights[0] = true;
6489  this->mj_weights[0] = NULL;
6490  }
6491  else{
6492  //if weights are provided get weights for all weight indices
6493  for (int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
6494  ArrayRCP<const mj_scalar_t> ar;
6495  wgts[wdim].getInputArray(ar);
6496  this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
6497  this->mj_uniform_weights[wdim] = false;
6498  this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
6499  }
6500  }
6501 
6502  for (int wdim = 0; wdim < criteria_dim; wdim++){
6503  if (solution->criteriaHasUniformPartSizes(wdim)){
6504  this->mj_uniform_parts[wdim] = true;
6505  this->mj_part_sizes[wdim] = NULL;
6506  }
6507  else{
6508  std::cerr << "MJ does not support non-uniform target part weights" << std::endl;
6509  exit(1);
6510  }
6511  }
6512 }
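
Note the role of coordinate_ArrayRCP_holder above: the raw pointers stored in mj_coordinates and mj_weights stay valid only while a reference-counted handle to each array is kept alive. Below is the same lifetime idiom in plain C++17, with std::shared_ptr standing in for Teuchos::ArrayRCP; the names are illustrative only.

#include <memory>
#include <vector>

int main() {
  // Plays the role of coordinate_ArrayRCP_holder: owns the buffers.
  std::vector<std::shared_ptr<double[]>> holder;
  // Plays the role of mj_coordinates: raw views handed to the kernel.
  std::vector<double*> raw;

  for (int d = 0; d < 3; ++d) {
    std::shared_ptr<double[]> buf(new double[100]());
    holder.push_back(buf);    // keeps the allocation alive
    raw.push_back(buf.get()); // raw pointer used during partitioning
  }
  // raw[d] stays valid exactly as long as holder retains its handle.
  return 0;
}
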
6513 
6514 /* \brief Sets the partitioning parameters for the multijagged algorithm.
6515  * \param pl the parameter list provided to the Zoltan2 call
6516  * */
6517 template <typename Adapter>
6518 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(const Teuchos::ParameterList &pl){
6519 
6520  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
6521  if (pe){
6522  double tol;
6523  tol = pe->getValue(&tol);
6524  this->imbalance_tolerance = tol - 1.0;
6525  }
6526 
6527  // TODO: Maybe a more relaxed tolerance is needed. RCB uses 10%.
6528  if (this->imbalance_tolerance <= 0)
6529  this->imbalance_tolerance = 10e-4; // i.e., 1e-3
6530 
6531  //the input partitioning array; NULL unless the user provides one.
6532  this->part_no_array = NULL;
6533  //the recursion depth; when mj_parts is given, its length minus one.
6534  this->recursion_depth = 0;
6535 
6536  if (pl.getPtr<Array <mj_part_t> >("mj_parts")){
6537  this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >("mj_parts")->getRawPtr();
6538  this->recursion_depth = pl.getPtr<Array <mj_part_t> >("mj_parts")->size() - 1;
6539  this->mj_env->debug(2, "mj_parts provided by user");
6540  }
6541 
6542  //get mj specific parameters.
6543  this->distribute_points_on_cut_lines = true;
6544  this->max_concurrent_part_calculation = 1;
6545 
6546  this->mj_run_as_rcb = 0;
6547  int mj_user_recursion_depth = -1;
6548  this->mj_keep_part_boxes = 0;
6549  this->check_migrate_avoid_migration_option = 0;
6550  this->minimum_migration_imbalance = 0.35;
6551 
6552  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
6553  if (pe){
6554  double imb;
6555  imb = pe->getValue(&imb);
6556  this->minimum_migration_imbalance = imb - 1.0;
6557  }
6558 
6559  pe = pl.getEntryPtr("mj_migration_option");
6560  if (pe){
6561  this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
6562  }else {
6563  this->check_migrate_avoid_migration_option = 0;
6564  }
6565  if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
6566 
6567 
6568  pe = pl.getEntryPtr("mj_concurrent_part_count");
6569  if (pe){
6570  this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
6571  }else {
6572  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
6573  }
6574 
6575  pe = pl.getEntryPtr("mj_keep_part_boxes");
6576  if (pe){
6577  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
6578  }else {
6579  this->mj_keep_part_boxes = 0; // Default: do not keep part boxes.
6580  }
6581 
6582  // For now, need keep_part_boxes to do pointAssign and boxAssign.
6583  // pe = pl.getEntryPtr("keep_cuts");
6584  // if (pe){
6585  // int tmp = pe->getValue(&tmp);
6586  // if (tmp) this->mj_keep_part_boxes = 1;
6587  // }
6588 
6589  //need to keep part boxes if mapping type is geometric.
6590  if (this->mj_keep_part_boxes == 0){
6591  pe = pl.getEntryPtr("mapping_type");
6592  if (pe){
6593  int mapping_type = -1;
6594  mapping_type = pe->getValue(&mapping_type);
6595  if (mapping_type == 0){
6596  mj_keep_part_boxes = 1;
6597  }
6598  }
6599  }
6600 
6601  //check whether the algorithm should run in RCB mode.
6602  pe = pl.getEntryPtr("mj_enable_rcb");
6603  if (pe){
6604  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
6605  }else {
6606  this->mj_run_as_rcb = 0; // Default: run as MJ, not as RCB.
6607  }
6608 
6609  pe = pl.getEntryPtr("mj_recursion_depth");
6610  if (pe){
6611  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
6612  }else {
6613  mj_user_recursion_depth = -1; // Set to invalid value
6614  }
6615 
6616  int val = 0;
6617  pe = pl.getEntryPtr("rectilinear");
6618  if (pe) val = pe->getValue(&val);
6619  if (val == 1){
6620  this->distribute_points_on_cut_lines = false;
6621  } else {
6622  this->distribute_points_on_cut_lines = true;
6623  }
6624 
6625  if (this->mj_run_as_rcb){
6626  mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
6627  }
6628  if (this->recursion_depth < 1){
6629  if (mj_user_recursion_depth > 0){
6630  this->recursion_depth = mj_user_recursion_depth;
6631  }
6632  else {
6633  this->recursion_depth = this->coord_dim;
6634  }
6635  }
6636 
6637  this->num_threads = 1;
6638 #ifdef HAVE_ZOLTAN2_OMP
6639 #pragma omp parallel
6640  {
6641  this->num_threads = omp_get_num_threads();
6642  }
6643 #endif
6644 
6645 }
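
A hedged usage sketch of the parameters parsed above, set through a Teuchos::ParameterList. Only keys that set_input_parameters actually reads are used, and the value types follow how this file reads them (a real deployment should be checked against the installed Zoltan2 parameter validators). Recall from the code above that imbalance_tolerance is stored internally minus 1.0, and that mj_enable_rcb mode derives the recursion depth as ceil(log2(num_global_parts)).

#include <Teuchos_ParameterList.hpp>

Teuchos::ParameterList make_mj_params() {
  Teuchos::ParameterList pl("zoltan2");
  pl.set("imbalance_tolerance", 1.03);   // 3% imbalance; stored as 0.03
  pl.set("mj_recursion_depth", 3);       // splits; defaults to coord_dim
  pl.set("mj_concurrent_part_count", 2); // parts computed concurrently
  pl.set("rectilinear", 1);              // never split a coord across a cut
  pl.set("mj_keep_part_boxes", 1);       // needed for point/boxAssign below
  return pl;
}
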
6646 
6648 template <typename Adapter>
6649 void Zoltan2_AlgMJ<Adapter>::boxAssign(
6650  int dim,
6651  typename Adapter::scalar_t *lower,
6652  typename Adapter::scalar_t *upper,
6653  size_t &nPartsFound,
6654  typename Adapter::part_t **partsFound) const
6655 {
6656  // TODO: Implement with cuts rather than boxes to reduce algorithmic
6657  // TODO: complexity. Or at least do a search through the boxes, using
6658  // TODO: p x q x r x ... if possible.
6659 
6660  nPartsFound = 0;
6661  *partsFound = NULL;
6662 
6663  if (this->mj_keep_part_boxes) {
6664 
6665  // Get vector of part boxes
6666  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6667 
6668  size_t nBoxes = (*partBoxes).size();
6669  if (nBoxes == 0) {
6670  throw std::logic_error("no part boxes exist");
6671  }
6672 
6673  // Determine whether the box overlaps the globalBox at all
6674  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6675 
6676  if (globalBox->boxesOverlap(dim, lower, upper)) {
6677 
6678  std::vector<typename Adapter::part_t> partlist;
6679 
6680  // box overlaps the global box; find specific overlapping boxes
6681  for (size_t i = 0; i < nBoxes; i++) {
6682  try {
6683  if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
6684  nPartsFound++;
6685  partlist.push_back((*partBoxes)[i].getpId());
6686 
6687 // std::cout << "Given box (";
6688 // for (int j = 0; j < dim; j++)
6689 // std::cout << lower[j] << " ";
6690 // std::cout << ") x (";
6691 // for (int j = 0; j < dim; j++)
6692 // std::cout << upper[j] << " ";
6693 // std::cout << ") overlaps PartBox "
6694 // << (*partBoxes)[i].getpId() << " (";
6695 // for (int j = 0; j < dim; j++)
6696 // std::cout << (*partBoxes)[i].getlmins()[j] << " ";
6697 // std::cout << ") x (";
6698 // for (int j = 0; j < dim; j++)
6699 // std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
6700 // std::cout << ")" << std::endl;
6701  }
6702  }
6703  Z2_FORWARD_EXCEPTIONS;
6704  }
6705  if (nPartsFound) {
6706  *partsFound = new mj_part_t[nPartsFound];
6707  for (size_t i = 0; i < nPartsFound; i++)
6708  (*partsFound)[i] = partlist[i];
6709  }
6710  }
6711  else {
6712  // Box does not overlap the domain at all. Find the closest part
6713  // Not sure how to perform this operation for MJ without having the
6714  // cuts. With the RCB cuts, the concept of a part extending to
6715  // infinity was natural. With the boxes, it is much more difficult.
6716  // TODO: For now, return information indicating NO OVERLAP.
6717 
6718  }
6719  }
6720  else {
6721  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
6722  }
6723 }
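
The boxesOverlap call above reduces to the standard separating-interval test: two axis-aligned boxes overlap iff their extents intersect in every dimension. A self-contained sketch of that test follows; it is a presumed equivalent, not taken from coordinateModelPartBox.

// Two axis-aligned boxes [alo,ahi] and [blo,bhi] overlap iff no
// dimension separates them.
bool boxes_overlap(int dim, const double *alo, const double *ahi,
                   const double *blo, const double *bhi) {
  for (int d = 0; d < dim; ++d)
    if (ahi[d] < blo[d] || bhi[d] < alo[d])
      return false; // separated along dimension d
  return true;
}
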
6724 
6726 template <typename Adapter>
6727 typename Adapter::part_t Zoltan2_AlgMJ<Adapter>::pointAssign(
6728  int dim,
6729  typename Adapter::scalar_t *point) const
6730 {
6731 
6732  // TODO: Implement with cuts rather than boxes to reduce algorithmic
6733  // TODO: complexity. Or at least do a search through the boxes, using
6734  // TODO: p x q x r x ... if possible.
6735 
6736  if (this->mj_keep_part_boxes) {
6737  typename Adapter::part_t foundPart = -1;
6738 
6739  // Get vector of part boxes
6740  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6741 
6742  size_t nBoxes = (*partBoxes).size();
6743  if (nBoxes == 0) {
6744  throw std::logic_error("no part boxes exist");
6745  }
6746 
6747  // Determine whether the point is within the global domain
6748  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6749 
6750  if (globalBox->pointInBox(dim, point)) {
6751 
6752  // point is in the global domain; determine in which part it is.
6753  size_t i;
6754  for (i = 0; i < nBoxes; i++) {
6755  try {
6756  if ((*partBoxes)[i].pointInBox(dim, point)) {
6757  foundPart = (*partBoxes)[i].getpId();
6758 // std::cout << "Point (";
6759 // for (int j = 0; j < dim; j++) std::cout << point[j] << " ";
6760 // std::cout << ") found in box " << i << " part " << foundPart
6761 // << std::endl;
6762 // (*partBoxes)[i].print();
6763  break;
6764  }
6765  }
6766  Z2_FORWARD_EXCEPTIONS;
6767  }
6768 
6769  if (i == nBoxes) {
6770  // This error should never occur
6771  std::ostringstream oss;
6772  oss << "Point (";
6773  for (int j = 0; j < dim; j++) oss << point[j] << " ";
6774  oss << ") not found in domain";
6775  throw std::logic_error(oss.str());
6776  }
6777  }
6778 
6779  else {
6780  // Point is outside the global domain.
6781  // Determine to which part it is closest.
6782  // TODO: with cuts, would not need this special case
6783 
6784  size_t closestBox = 0;
6785  mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
6786  mj_scalar_t *centroid = new mj_scalar_t[dim];
6787  for (size_t i = 0; i < nBoxes; i++) {
6788  (*partBoxes)[i].computeCentroid(centroid);
6789  mj_scalar_t sum = 0.;
6790  mj_scalar_t diff;
6791  for (int j = 0; j < dim; j++) {
6792  diff = centroid[j] - point[j];
6793  sum += diff * diff;
6794  }
6795  if (sum < minDistance) {
6796  minDistance = sum;
6797  closestBox = i;
6798  }
6799  }
6800  foundPart = (*partBoxes)[closestBox].getpId();
6801  delete [] centroid;
6802  }
6803 
6804  return foundPart;
6805  }
6806  else {
6807  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
6808  }
6809 }
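
The out-of-domain fallback above compares squared Euclidean distances to box centroids; the square root is unnecessary because it is monotone. A self-contained sketch of that search, with the centroids flattened into one array (a layout assumed here for illustration):

#include <cstddef>
#include <limits>

// Index of the centroid nearest to point, by squared distance.
std::size_t closest_centroid(int dim, const double *point,
                             const double *centroids, std::size_t nBoxes) {
  std::size_t best = 0;
  double bestDist = std::numeric_limits<double>::max();
  for (std::size_t i = 0; i < nBoxes; ++i) {
    double sum = 0.0;
    for (int j = 0; j < dim; ++j) {
      double diff = centroids[i * dim + j] - point[j];
      sum += diff * diff;
    }
    if (sum < bestDist) { bestDist = sum; best = i; }
  }
  return best;
}
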
6810 
6811 template <typename Adapter>
6812 void Zoltan2_AlgMJ<Adapter>::getCommunicationGraph(
6813  const PartitioningSolution<Adapter> *solution,
6814  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
6815  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
6816 {
6817  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
6818  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6819  mj_part_t ntasks = (*pBoxes).size();
6820  int dim = (*pBoxes)[0].getDim();
6821  GridHash<mj_scalar_t, mj_part_t> grid(pBoxes, ntasks, dim);
6822  grid.getAdjArrays(comXAdj_, comAdj_);
6823  }
6824  comAdj = comAdj_;
6825  comXAdj = comXAdj_;
6826 }
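
comXAdj/comAdj above form a compressed-sparse-row adjacency structure over parts. A hedged sketch of walking it, assuming the common CSR convention of nparts+1 offsets with xadj[0] == 0; verify the exact convention against GridHash::getAdjArrays before relying on it.

#include <cstddef>
#include <iostream>
#include <vector>

// Print each part's neighbors from CSR arrays (offsets, adjacency).
void print_neighbors(const std::vector<int> &xadj,
                     const std::vector<int> &adj) {
  for (std::size_t p = 0; p + 1 < xadj.size(); ++p)
    for (int k = xadj[p]; k < xadj[p + 1]; ++k)
      std::cout << "part " << p << " neighbors part " << adj[k] << "\n";
}
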
6827 
6828 
6829 template <typename Adapter>
6830 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
6831 Zoltan2_AlgMJ<Adapter>::getGlobalBoxBoundaries() const
6832 {
6833  return this->mj_partitioner.get_kept_boxes();
6834 }
6835 
6836 
6837 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6838  typename mj_part_t>
6839 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6840 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::get_kept_boxes() const
6841 {
6842  if (this->mj_keep_part_boxes)
6843  return this->kept_boxes;
6844  else
6845  throw std::logic_error("Error: part boxes are not stored.");
6846 }
6847 
6848 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6849  typename mj_part_t>
6850 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6851 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box_boundaries(
6852  RCP<mj_partBoxVector_t> &localPartBoxes
6853 ) const
6854 {
6855  mj_part_t ntasks = this->num_global_parts;
6856  int dim = (*localPartBoxes)[0].getDim();
6857  mj_scalar_t *localPartBoundaries = new mj_scalar_t[ntasks * 2 *dim];
6858 
6859  memset(localPartBoundaries, 0, sizeof(mj_scalar_t) * ntasks * 2 *dim);
6860 
6861  mj_scalar_t *globalPartBoundaries = new mj_scalar_t[ntasks * 2 *dim];
6862  memset(globalPartBoundaries, 0, sizeof(mj_scalar_t) * ntasks * 2 *dim);
6863 
6864  mj_scalar_t *localPartMins = localPartBoundaries;
6865  mj_scalar_t *localPartMaxs = localPartBoundaries + ntasks * dim;
6866 
6867  mj_scalar_t *globalPartMins = globalPartBoundaries;
6868  mj_scalar_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
6869 
6870  mj_part_t boxCount = localPartBoxes->size();
6871  for (mj_part_t i = 0; i < boxCount; ++i){
6872  mj_part_t pId = (*localPartBoxes)[i].getpId();
6873  //cout << "me:" << comm->getRank() << " has:" << pId << endl;
6874 
6875  mj_scalar_t *lmins = (*localPartBoxes)[i].getlmins();
6876  mj_scalar_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
6877 
6878  for (int j = 0; j < dim; ++j){
6879  localPartMins[dim * pId + j] = lmins[j];
6880  localPartMaxs[dim * pId + j] = lmaxs[j];
6881  /*
6882  cout << "me:" << comm->getRank() <<
6883  " dim * pId + j:"<< dim * pId + j <<
6884  " localMin:" << localPartMins[dim * pId + j] <<
6885  " localMax:" << localPartMaxs[dim * pId + j] << endl;
6886  */
6887  }
6888  }
6889 
6890  Teuchos::Zoltan2_BoxBoundaries<int, mj_scalar_t> reductionOp(ntasks * 2 *dim);
6891 
6892  reduceAll<int, mj_scalar_t>(*mj_problemComm, reductionOp,
6893  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
6894  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
6895  for (mj_part_t i = 0; i < ntasks; ++i){
6896  coordinateModelPartBox <mj_scalar_t, mj_part_t> tpb(i, dim,
6897  globalPartMins + dim * i,
6898  globalPartMaxs + dim * i);
6899 
6900  /*
6901  for (int j = 0; j < dim; ++j){
6902  cout << "me:" << comm->getRank() <<
6903  " dim * pId + j:"<< dim * i + j <<
6904  " globalMin:" << globalPartMins[dim * i + j] <<
6905  " globalMax:" << globalPartMaxs[dim * i + j] << endl;
6906  }
6907  */
6908  pB->push_back(tpb);
6909  }
6910  delete []localPartBoundaries;
6911  delete []globalPartBoundaries;
6912  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
6913  return pB;
6914 }
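
The reduceAll above relies on the Zoltan2_BoxBoundaries operator performing an elementwise merge: the first ntasks*dim entries carry mins, the second half carries maxs, and zero (from the memset) marks "no local contribution" for a part. The following self-contained sketch shows what such a merge presumably does; it is not the class's actual reduce implementation.

#include <cstddef>

// Merge one rank's boundary contributions (in) into the running global
// result (inout). Layout: [half mins | half maxs]; 0.0 means "unset".
void merge_boundaries(std::size_t half, const double *in, double *inout) {
  for (std::size_t k = 0; k < half; ++k)          // mins
    if (inout[k] == 0.0 || (in[k] != 0.0 && in[k] < inout[k]))
      inout[k] = in[k];
  for (std::size_t k = half; k < 2 * half; ++k)   // maxs
    if (in[k] > inout[k])
      inout[k] = in[k];
}
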
6915 } // namespace Zoltan2
6916 
6917 #endif