Actual source code: gs.c

  2: /***********************************gs.c***************************************

  4: Author: Henry M. Tufo III

  6: e-mail: hmt@cs.brown.edu

  8: snail-mail:
  9: Division of Applied Mathematics
 10: Brown University
 11: Providence, RI 02912

 13: Last Modification: 
 14: 6.21.97
 15: ************************************gs.c**************************************/

 17: /***********************************gs.c***************************************
 18: File Description:
 19: -----------------

 21: ************************************gs.c**************************************/

 23: #include <../src/ksp/pc/impls/tfs/tfs.h>

 25: /* default length of number of items via tree - doubles if exceeded */
 26: #define TREE_BUF_SZ 2048;
 27: #define GS_VEC_SZ   1



 31: /***********************************gs.c***************************************
 32: Type: struct gather_scatter_id 
 33: ------------------------------

 35: ************************************gs.c**************************************/
 36: typedef struct gather_scatter_id {
 37:   PetscInt id;
 38:   PetscInt nel_min;
 39:   PetscInt nel_max;
 40:   PetscInt nel_sum;
 41:   PetscInt negl;
 42:   PetscInt gl_max;
 43:   PetscInt gl_min;
 44:   PetscInt repeats;
 45:   PetscInt ordered;
 46:   PetscInt positive;
 47:   PetscScalar *vals;

 49:   /* bit mask info */
 50:   PetscInt *my_proc_mask;
 51:   PetscInt mask_sz;
 52:   PetscInt *ngh_buf;
 53:   PetscInt ngh_buf_sz;
 54:   PetscInt *nghs;
 55:   PetscInt num_nghs;
 56:   PetscInt max_nghs;
 57:   PetscInt *pw_nghs;
 58:   PetscInt num_pw_nghs;
 59:   PetscInt *tree_nghs;
 60:   PetscInt num_tree_nghs;

 62:   PetscInt num_loads;

 64:   /* repeats == true -> local info */
 65:   PetscInt nel;         /* number of unique elememts */
 66:   PetscInt *elms;       /* of size nel */
 67:   PetscInt nel_total;
 68:   PetscInt *local_elms; /* of size nel_total */
 69:   PetscInt *companion;  /* of size nel_total */

 71:   /* local info */
 72:   PetscInt num_local_total;
 73:   PetscInt local_strength;
 74:   PetscInt num_local;
 75:   PetscInt *num_local_reduce;
 76:   PetscInt **local_reduce;
 77:   PetscInt num_local_gop;
 78:   PetscInt *num_gop_local_reduce;
 79:   PetscInt **gop_local_reduce;

 81:   /* pairwise info */
 82:   PetscInt level;
 83:   PetscInt num_pairs;
 84:   PetscInt max_pairs;
 85:   PetscInt loc_node_pairs;
 86:   PetscInt max_node_pairs;
 87:   PetscInt min_node_pairs;
 88:   PetscInt avg_node_pairs;
 89:   PetscInt *pair_list;
 90:   PetscInt *msg_sizes;
 91:   PetscInt **node_list;
 92:   PetscInt len_pw_list;
 93:   PetscInt *pw_elm_list;
 94:   PetscScalar *pw_vals;

 96:   MPI_Request *msg_ids_in;
 97:   MPI_Request *msg_ids_out;

 99:   PetscScalar *out;
100:   PetscScalar *in;
101:   PetscInt msg_total;

103:   /* tree - crystal accumulator info */
104:   PetscInt max_left_over;
105:   PetscInt *pre;
106:   PetscInt *in_num;
107:   PetscInt *out_num;
108:   PetscInt **in_list;
109:   PetscInt **out_list;

111:   /* new tree work*/
112:   PetscInt  tree_nel;
113:   PetscInt *tree_elms;
114:   PetscScalar *tree_buf;
115:   PetscScalar *tree_work;

117:   PetscInt  tree_map_sz;
118:   PetscInt *tree_map_in;
119:   PetscInt *tree_map_out;

121:   /* current memory status */
122:   PetscInt gl_bss_min;
123:   PetscInt gl_perm_min;

125:   /* max segment size for gs_gop_vec() */
126:   PetscInt vec_sz;

128:   /* hack to make paul happy */
129:   MPI_Comm gs_comm;

131: } gs_id;

133: static gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
134: static PetscErrorCode gsi_via_bit_mask(gs_id *gs);
135: static PetscErrorCode get_ngh_buf(gs_id *gs);
136: static PetscErrorCode set_pairwise(gs_id *gs);
137: static gs_id * gsi_new(void);
138: static PetscErrorCode set_tree(gs_id *gs);

140: /* same for all but vector flavor */
141: static PetscErrorCode gs_gop_local_out(gs_id *gs, PetscScalar *vals);
142: /* vector flavor */
143: static PetscErrorCode gs_gop_vec_local_out(gs_id *gs, PetscScalar *vals, PetscInt step);

145: static PetscErrorCode gs_gop_vec_plus(gs_id *gs, PetscScalar *in_vals, PetscInt step);
146: static PetscErrorCode gs_gop_vec_pairwise_plus(gs_id *gs, PetscScalar *in_vals, PetscInt step);
147: static PetscErrorCode gs_gop_vec_local_plus(gs_id *gs, PetscScalar *vals, PetscInt step);
148: static PetscErrorCode gs_gop_vec_local_in_plus(gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode gs_gop_vec_tree_plus(gs_id *gs, PetscScalar *vals, PetscInt step);


152: static PetscErrorCode gs_gop_local_plus(gs_id *gs, PetscScalar *vals);
153: static PetscErrorCode gs_gop_local_in_plus(gs_id *gs, PetscScalar *vals);

155: static PetscErrorCode gs_gop_plus_hc(gs_id *gs, PetscScalar *in_vals, PetscInt dim);
156: static PetscErrorCode gs_gop_pairwise_plus_hc(gs_id *gs, PetscScalar *in_vals, PetscInt dim);
157: static PetscErrorCode gs_gop_tree_plus_hc(gs_id *gs, PetscScalar *vals, PetscInt dim);

159: /* global vars */
160: /* from comm.c module */

162: static PetscInt num_gs_ids = 0;

164: /* should make this dynamic ... later */
165: static PetscInt msg_buf=MAX_MSG_BUF;
166: static PetscInt vec_sz=GS_VEC_SZ;
167: static PetscInt *tree_buf=NULL;
168: static PetscInt tree_buf_sz=0;
169: static PetscInt ntree=0;

171: /***************************************************************************/
172: PetscErrorCode gs_init_vec_sz(PetscInt size)
173: {
175:   vec_sz = size;
176:   return(0);
177: }

179: /******************************************************************************/
180: PetscErrorCode gs_init_msg_buf_sz(PetscInt buf_size)
181: {
183:   msg_buf = buf_size;
184:   return(0);
185: }

187: /******************************************************************************/
188: gs_id *gs_init( PetscInt *elms, PetscInt nel, PetscInt level)
189: {
190:    gs_id *gs;
191:   MPI_Group gs_group;
192:   MPI_Comm  gs_comm;

196:   /* ensure that communication package has been initialized */
197:   comm_init();


200:   /* determines if we have enough dynamic/semi-static memory */
201:   /* checks input, allocs and sets gd_id template            */
202:   gs = gsi_check_args(elms,nel,level);

204:   /* only bit mask version up and working for the moment    */
205:   /* LATER :: get int list version working for sparse pblms */
206:   gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);


209:   MPI_Comm_group(MPI_COMM_WORLD,&gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
210:   MPI_Comm_create(MPI_COMM_WORLD,gs_group,&gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
211:   gs->gs_comm=gs_comm;

213:   return(gs);
214: }

216: /******************************************************************************/
217: static gs_id *gsi_new(void)
218: {
220:   gs_id *gs;
221:   gs = (gs_id *) malloc(sizeof(gs_id));
222:   PetscMemzero(gs,sizeof(gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
223:   return(gs);
224: }

226: /******************************************************************************/
227: static gs_id * gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
228: {
229:    PetscInt i, j, k, t2;
230:   PetscInt *companion, *elms, *unique, *iptr;
231:   PetscInt num_local=0, *num_to_reduce, **local_reduce;
232:   PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
233:   PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1];
234:   PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1];
235:   gs_id *gs;


239:   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
240:   if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");

242:   if (nel==0)
243:     {PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}

245:   /* get space for gs template */
246:   gs = gsi_new();
247:   gs->id = ++num_gs_ids;

249:   /* hmt 6.4.99                                            */
250:   /* caller can set global ids that don't participate to 0 */
251:   /* gs_init ignores all zeros in elm list                 */
252:   /* negative global ids are still invalid                 */
253:   for (i=j=0;i<nel;i++)
254:     {if (in_elms[i]!=0) {j++;}}

256:   k=nel; nel=j;

258:   /* copy over in_elms list and create inverse map */
259:   elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
260:   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));

262:   for (i=j=0;i<k;i++)
263:     {
264:       if (in_elms[i]!=0)
265:         {elms[j] = in_elms[i]; companion[j++] = i;}
266:     }

268:   if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");

270:   /* pre-pass ... check to see if sorted */
271:   elms[nel] = INT_MAX;
272:   iptr = elms;
273:   unique = elms+1;
274:   j=0;
275:   while (*iptr!=INT_MAX)
276:     {
277:       if (*iptr++>*unique++)
278:         {j=1; break;}
279:     }

281:   /* set up inverse map */
282:   if (j)
283:     {
284:       PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
285:       SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
286:     }
287:   else
288:     {PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}
289:   elms[nel] = INT_MIN;

291:   /* first pass */
292:   /* determine number of unique elements, check pd */
293:   for (i=k=0;i<nel;i+=j)
294:     {
295:       t2 = elms[i];
296:       j=++i;
297: 
298:       /* clump 'em for now */
299:       while (elms[j]==t2) {j++;}
300: 
301:       /* how many together and num local */
302:       if (j-=i)
303:         {num_local++; k+=j;}
304:     }

306:   /* how many unique elements? */
307:   gs->repeats=k;
308:   gs->nel = nel-k;


311:   /* number of repeats? */
312:   gs->num_local = num_local;
313:   num_local+=2;
314:   gs->local_reduce=local_reduce=(PetscInt **)malloc(num_local*sizeof(PetscInt*));
315:   gs->num_local_reduce=num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));

317:   unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
318:   gs->elms = unique;
319:   gs->nel_total = nel;
320:   gs->local_elms = elms;
321:   gs->companion = companion;

323:   /* compess map as well as keep track of local ops */
324:   for (num_local=i=j=0;i<gs->nel;i++)
325:     {
326:       k=j;
327:       t2 = unique[i] = elms[j];
328:       companion[i] = companion[j];
329: 
330:       while (elms[j]==t2) {j++;}

332:       if ((t2=(j-k))>1)
333:         {
334:           /* number together */
335:           num_to_reduce[num_local] = t2++;
336:           iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));

338:           /* to use binary searching don't remap until we check intersection */
339:           *iptr++ = i;
340: 
341:           /* note that we're skipping the first one */
342:           while (++k<j)
343:             {*(iptr++) = companion[k];}
344:           *iptr = -1;
345:         }
346:     }

348:   /* sentinel for ngh_buf */
349:   unique[gs->nel]=INT_MAX;

351:   /* for two partition sort hack */
352:   num_to_reduce[num_local] = 0;
353:   local_reduce[num_local] = NULL;
354:   num_to_reduce[++num_local] = 0;
355:   local_reduce[num_local] = NULL;

357:   /* load 'em up */
358:   /* note one extra to hold NON_UNIFORM flag!!! */
359:   vals[2] = vals[1] = vals[0] = nel;
360:   if (gs->nel>0)
361:     {
362:        vals[3] = unique[0];
363:        vals[4] = unique[gs->nel-1];
364:     }
365:   else
366:     {
367:        vals[3] = INT_MAX;
368:        vals[4] = INT_MIN;
369:     }
370:   vals[5] = level;
371:   vals[6] = num_gs_ids;

373:   /* GLOBAL: send 'em out */
374:   giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);

376:   /* must be semi-pos def - only pairwise depends on this */
377:   /* LATER - remove this restriction */
378:   if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
379:   if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");

381:   gs->nel_min = vals[0];
382:   gs->nel_max = vals[1];
383:   gs->nel_sum = vals[2];
384:   gs->gl_min  = vals[3];
385:   gs->gl_max  = vals[4];
386:   gs->negl    = vals[4]-vals[3]+1;

388:   if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");
389: 
390:   /* LATER :: add level == -1 -> program selects level */
391:   if (vals[5]<0)
392:     {vals[5]=0;}
393:   else if (vals[5]>num_nodes)
394:     {vals[5]=num_nodes;}
395:   gs->level = vals[5];

397:   return(gs);
398: }

400: /******************************************************************************/
401: static PetscErrorCode gsi_via_bit_mask(gs_id *gs)
402: {
403:    PetscInt i, nel, *elms;
404:   PetscInt t1;
405:   PetscInt **reduce;
406:   PetscInt *map;

410:   /* totally local removes ... ct_bits == 0 */
411:   get_ngh_buf(gs);

413:   if (gs->level)
414:     {set_pairwise(gs);}

416:   if (gs->max_left_over)
417:     {set_tree(gs);}

419:   /* intersection local and pairwise/tree? */
420:   gs->num_local_total = gs->num_local;
421:   gs->gop_local_reduce = gs->local_reduce;
422:   gs->num_gop_local_reduce = gs->num_local_reduce;

424:   map = gs->companion;

426:   /* is there any local compression */
427:   if (!gs->num_local) {
428:     gs->local_strength = NONE;
429:     gs->num_local_gop = 0;
430:   } else {
431:       /* ok find intersection */
432:       map = gs->companion;
433:       reduce = gs->local_reduce;
434:       for (i=0, t1=0; i<gs->num_local; i++, reduce++)
435:         {
436:           if ((ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0)
437:               ||
438:               ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0)
439:             {
440:               t1++;
441:               if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
442:               gs->num_local_reduce[i] *= -1;
443:             }
444:            **reduce=map[**reduce];
445:         }

447:       /* intersection is empty */
448:       if (!t1)
449:         {
450:           gs->local_strength = FULL;
451:           gs->num_local_gop = 0;
452:         }
453:       /* intersection not empty */
454:       else
455:         {
456:           gs->local_strength = PARTIAL;
457:           SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);

459:           gs->num_local_gop = t1;
460:           gs->num_local_total =  gs->num_local;
461:           gs->num_local    -= t1;
462:           gs->gop_local_reduce = gs->local_reduce;
463:           gs->num_gop_local_reduce = gs->num_local_reduce;

465:           for (i=0; i<t1; i++)
466:             {
467:               if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
468:               gs->num_gop_local_reduce[i] *= -1;
469:               gs->local_reduce++;
470:               gs->num_local_reduce++;
471:             }
472:           gs->local_reduce++;
473:           gs->num_local_reduce++;
474:         }
475:     }

477:   elms = gs->pw_elm_list;
478:   nel  = gs->len_pw_list;
479:   for (i=0; i<nel; i++)
480:     {elms[i] = map[elms[i]];}

482:   elms = gs->tree_map_in;
483:   nel  = gs->tree_map_sz;
484:   for (i=0; i<nel; i++)
485:     {elms[i] = map[elms[i]];}

487:   /* clean up */
488:   free((void*) gs->local_elms);
489:   free((void*) gs->companion);
490:   free((void*) gs->elms);
491:   free((void*) gs->ngh_buf);
492:   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
493:   return(0);
494: }

496: /******************************************************************************/
497: static PetscErrorCode place_in_tree( PetscInt elm)
498: {
499:    PetscInt *tp, n;

502:   if (ntree==tree_buf_sz)
503:     {
504:       if (tree_buf_sz)
505:         {
506:           tp = tree_buf;
507:           n = tree_buf_sz;
508:           tree_buf_sz<<=1;
509:           tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
510:           ivec_copy(tree_buf,tp,n);
511:           free(tp);
512:         }
513:       else
514:         {
515:           tree_buf_sz = TREE_BUF_SZ;
516:           tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
517:         }
518:     }

520:   tree_buf[ntree++] = elm;
521:   return(0);
522: }

524: /******************************************************************************/
525: static PetscErrorCode get_ngh_buf(gs_id *gs)
526: {
527:    PetscInt i, j, npw=0, ntree_map=0;
528:   PetscInt p_mask_size, ngh_buf_size, buf_size;
529:   PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
530:   PetscInt *ngh_buf, *buf1, *buf2;
531:   PetscInt offset, per_load, num_loads, or_ct, start, end;
532:   PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
533:   PetscInt oper=GL_B_OR;
534:   PetscInt *ptr3, *t_mask, level, ct1, ct2;

538:   /* to make life easier */
539:   nel   = gs->nel;
540:   elms  = gs->elms;
541:   level = gs->level;
542: 
543:   /* det #bytes needed for processor bit masks and init w/mask cor. to my_id */
544:   p_mask = (PetscInt*) malloc(p_mask_size=len_bit_mask(num_nodes));
545:   set_bit_mask(p_mask,p_mask_size,my_id);

547:   /* allocate space for masks and info bufs */
548:   gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
549:   gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
550:   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
551:   t_mask = (PetscInt*) malloc(p_mask_size);
552:   gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size);

554:   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
555:   /* had thought I could exploit rendezvous threshold */

557:   /* default is one pass */
558:   per_load = negl  = gs->negl;
559:   gs->num_loads = num_loads = 1;
560:   i=p_mask_size*negl;

562:   /* possible overflow on buffer size */
563:   /* overflow hack                    */
564:   if (i<0) {i=INT_MAX;}

566:   buf_size = PetscMin(msg_buf,i);

568:   /* can we do it? */
569:   if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);

571:   /* get giop buf space ... make *only* one malloc */
572:   buf1 = (PetscInt*) malloc(buf_size<<1);

574:   /* more than one gior exchange needed? */
575:   if (buf_size!=i)
576:     {
577:       per_load = buf_size/p_mask_size;
578:       buf_size = per_load*p_mask_size;
579:       gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
580:     }


583:   /* convert buf sizes from #bytes to #ints - 32 bit only! */
584:   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
585: 
586:   /* find giop work space */
587:   buf2 = buf1+buf_size;

589:   /* hold #ints needed for processor masks */
590:   gs->mask_sz=p_mask_size;

592:   /* init buffers */
593:   ivec_zero(sh_proc_mask,p_mask_size);
594:   ivec_zero(pw_sh_proc_mask,p_mask_size);
595:   ivec_zero(ngh_buf,ngh_buf_size);

597:   /* HACK reset tree info */
598:   tree_buf=NULL;
599:   tree_buf_sz=ntree=0;

601:   /* ok do it */
602:   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++)
603:     {
604:       /* identity for bitwise or is 000...000 */
605:       ivec_zero(buf1,buf_size);

607:       /* load msg buffer */
608:       for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++)
609:         {
610:           offset = (offset-start)*p_mask_size;
611:           ivec_copy(buf1+offset,p_mask,p_mask_size);
612:         }

614:       /* GLOBAL: pass buffer */
615:       giop(buf1,buf2,buf_size,&oper);


618:       /* unload buffer into ngh_buf */
619:       ptr2=(elms+i_start);
620:       for(ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++)
621:         {
622:           /* I own it ... may have to pairwise it */
623:           if (j==*ptr2)
624:             {
625:               /* do i share it w/anyone? */
626:               ct1 = ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
627:               /* guess not */
628:               if (ct1<2)
629:                 {ptr2++; ptr1+=p_mask_size; continue;}

631:               /* i do ... so keep info and turn off my bit */
632:               ivec_copy(ptr1,ptr3,p_mask_size);
633:               ivec_xor(ptr1,p_mask,p_mask_size);
634:               ivec_or(sh_proc_mask,ptr1,p_mask_size);
635: 
636:               /* is it to be done pairwise? */
637:               if (--ct1<=level)
638:                 {
639:                   npw++;
640: 
641:                   /* turn on high bit to indicate pw need to process */
642:                   *ptr2++ |= TOP_BIT;
643:                   ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
644:                   ptr1+=p_mask_size;
645:                   continue;
646:                 }

648:               /* get set for next and note that I have a tree contribution */
649:               /* could save exact elm index for tree here -> save a search */
650:               ptr2++; ptr1+=p_mask_size; ntree_map++;
651:             }
652:           /* i don't but still might be involved in tree */
653:           else
654:             {

656:               /* shared by how many? */
657:               ct1 = ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));

659:               /* none! */
660:               if (ct1<2) continue;

662:               /* is it going to be done pairwise? but not by me of course!*/
663:               if (--ct1<=level) continue;
664:             }
665:           /* LATER we're going to have to process it NOW */
666:           /* nope ... tree it */
667:           place_in_tree(j);
668:         }
669:     }

671:   free((void*)t_mask);
672:   free((void*)buf1);

674:   gs->len_pw_list=npw;
675:   gs->num_nghs = ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));

677:   /* expand from bit mask list to int list and save ngh list */
678:   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
679:   bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);

681:   gs->num_pw_nghs = ct_bits((char *)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));

683:   oper = GL_MAX;
684:   ct1 = gs->num_nghs;
685:   giop(&ct1,&ct2,1,&oper);
686:   gs->max_nghs = ct1;

688:   gs->tree_map_sz  = ntree_map;
689:   gs->max_left_over=ntree;

691:   free((void*)p_mask);
692:   free((void*)sh_proc_mask);
693:   return(0);
694: }

696: /******************************************************************************/
697: static PetscErrorCode set_pairwise(gs_id *gs)
698: {
699:    PetscInt i, j;
700:   PetscInt p_mask_size;
701:   PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
702:   PetscInt *ngh_buf, *buf2;
703:   PetscInt offset;
704:   PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
705:   PetscInt *pairwise_elm_list, len_pair_list=0;
706:   PetscInt *iptr, t1, i_start, nel, *elms;
707:   PetscInt ct;

711:   /* to make life easier */
712:   nel  = gs->nel;
713:   elms = gs->elms;
714:   ngh_buf = gs->ngh_buf;
715:   sh_proc_mask  = gs->pw_nghs;

717:   /* need a few temp masks */
718:   p_mask_size   = len_bit_mask(num_nodes);
719:   p_mask        = (PetscInt*) malloc(p_mask_size);
720:   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);

722:   /* set mask to my my_id's bit mask */
723:   set_bit_mask(p_mask,p_mask_size,my_id);

725:   p_mask_size /= sizeof(PetscInt);
726: 
727:   len_pair_list=gs->len_pw_list;
728:   gs->pw_elm_list=pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));

730:   /* how many processors (nghs) do we have to exchange with? */
731:   nprs=gs->num_pairs=ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));


734:   /* allocate space for gs_gop() info */
735:   gs->pair_list = msg_list = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
736:   gs->msg_sizes = msg_size  = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
737:   gs->node_list = msg_nodes = (PetscInt **) malloc(sizeof(PetscInt*)*(nprs+1));

739:   /* init msg_size list */
740:   ivec_zero(msg_size,nprs);

742:   /* expand from bit mask list to int list */
743:   bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);
744: 
745:   /* keep list of elements being handled pairwise */
746:   for (i=j=0;i<nel;i++)
747:     {
748:       if (elms[i] & TOP_BIT)
749:         {elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i;}
750:     }
751:   pairwise_elm_list[j] = -1;

753:   gs->msg_ids_out = (MPI_Request *)  malloc(sizeof(MPI_Request)*(nprs+1));
754:   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
755:   gs->msg_ids_in = (MPI_Request *)  malloc(sizeof(MPI_Request)*(nprs+1));
756:   gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
757:   gs->pw_vals = (PetscScalar *) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);

759:   /* find who goes to each processor */
760:   for (i_start=i=0;i<nprs;i++)
761:     {
762:       /* processor i's mask */
763:       set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);

765:       /* det # going to processor i */
766:       for (ct=j=0;j<len_pair_list;j++)
767:         {
768:           buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
769:           ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
770:           if (ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
771:             {ct++;}
772:         }
773:       msg_size[i] = ct;
774:       i_start = PetscMax(i_start,ct);

776:       /*space to hold nodes in message to first neighbor */
777:       msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));

779:       for (j=0;j<len_pair_list;j++)
780:         {
781:           buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
782:           ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
783:           if (ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
784:             {*iptr++ = j;}
785:         }
786:       *iptr = -1;
787:     }
788:   msg_nodes[nprs] = NULL;

790:   j=gs->loc_node_pairs=i_start;
791:   t1 = GL_MAX;
792:   giop(&i_start,&offset,1,&t1);
793:   gs->max_node_pairs = i_start;

795:   i_start=j;
796:   t1 = GL_MIN;
797:   giop(&i_start,&offset,1,&t1);
798:   gs->min_node_pairs = i_start;

800:   i_start=j;
801:   t1 = GL_ADD;
802:   giop(&i_start,&offset,1,&t1);
803:   gs->avg_node_pairs = i_start/num_nodes + 1;

805:   i_start=nprs;
806:   t1 = GL_MAX;
807:   giop(&i_start,&offset,1,&t1);
808:   gs->max_pairs = i_start;


811:   /* remap pairwise in tail of gsi_via_bit_mask() */
812:   gs->msg_total = ivec_sum(gs->msg_sizes,nprs);
813:   gs->out = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
814:   gs->in  = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);

816:   /* reset malloc pool */
817:   free((void*)p_mask);
818:   free((void*)tmp_proc_mask);
819:   return(0);
820: }

822: /* to do pruned tree just save ngh buf copy for each one and decode here!
823: ******************************************************************************/
824: static PetscErrorCode set_tree(gs_id *gs)
825: {
826:   PetscInt i, j, n, nel;
827:   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

830:   /* local work ptrs */
831:   elms = gs->elms;
832:   nel     = gs->nel;

834:   /* how many via tree */
835:   gs->tree_nel  = n = ntree;
836:   gs->tree_elms = tree_elms = iptr_in = tree_buf;
837:   gs->tree_buf  = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
838:   gs->tree_work = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
839:   j=gs->tree_map_sz;
840:   gs->tree_map_in = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
841:   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));

843:   /* search the longer of the two lists */
844:   /* note ... could save this info in get_ngh_buf and save searches */
845:   if (n<=nel)
846:     {
847:       /* bijective fct w/remap - search elm list */
848:       for (i=0; i<n; i++)
849:         {
850:           if ((j=ivec_binary_search(*tree_elms++,elms,nel))>=0)
851:             {*iptr_in++ = j; *iptr_out++ = i;}
852:         }
853:     }
854:   else
855:     {
856:       for (i=0; i<nel; i++)
857:         {
858:           if ((j=ivec_binary_search(*elms++,tree_elms,n))>=0)
859:             {*iptr_in++ = i; *iptr_out++ = j;}
860:         }
861:     }

863:   /* sentinel */
864:   *iptr_in = *iptr_out = -1;
865:   return(0);
866: }

868: /******************************************************************************/
869: static PetscErrorCode gs_gop_local_out( gs_id *gs,  PetscScalar *vals)
870: {
871:   PetscInt *num, *map, **reduce;
872:   PetscScalar tmp;

875:   num    = gs->num_gop_local_reduce;
876:   reduce = gs->gop_local_reduce;
877:   while ((map = *reduce++))
878:     {
879:       /* wall */
880:       if (*num == 2)
881:         {
882:           num ++;
883:           vals[map[1]] = vals[map[0]];
884:         }
885:       /* corner shared by three elements */
886:       else if (*num == 3)
887:         {
888:           num ++;
889:           vals[map[2]] = vals[map[1]] = vals[map[0]];
890:         }
891:       /* corner shared by four elements */
892:       else if (*num == 4)
893:         {
894:           num ++;
895:           vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
896:         }
897:       /* general case ... odd geoms ... 3D*/
898:       else
899:         {
900:           num++;
901:           tmp = *(vals + *map++);
902:           while (*map >= 0)
903:             {*(vals + *map++) = tmp;}
904:         }
905:     }
906:   return(0);
907: }

909: /******************************************************************************/
910: static PetscErrorCode gs_gop_local_plus( gs_id *gs,  PetscScalar *vals)
911: {
912:    PetscInt *num, *map, **reduce;
913:    PetscScalar tmp;

916:   num    = gs->num_local_reduce;
917:   reduce = gs->local_reduce;
918:   while ((map = *reduce))
919:     {
920:       /* wall */
921:       if (*num == 2)
922:         {
923:           num ++; reduce++;
924:           vals[map[1]] = vals[map[0]] += vals[map[1]];
925:         }
926:       /* corner shared by three elements */
927:       else if (*num == 3)
928:         {
929:           num ++; reduce++;
930:           vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
931:         }
932:       /* corner shared by four elements */
933:       else if (*num == 4)
934:         {
935:           num ++; reduce++;
936:           vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] +=
937:                                  (vals[map[1]] + vals[map[2]] + vals[map[3]]);
938:         }
939:       /* general case ... odd geoms ... 3D*/
940:       else
941:         {
942:           num ++;
943:           tmp = 0.0;
944:           while (*map >= 0)
945:             {tmp += *(vals + *map++);}

947:           map = *reduce++;
948:           while (*map >= 0)
949:             {*(vals + *map++) = tmp;}
950:         }
951:     }
952:   return(0);
953: }

955: /******************************************************************************/
956: static PetscErrorCode gs_gop_local_in_plus( gs_id *gs,  PetscScalar *vals)
957: {
958:    PetscInt *num, *map, **reduce;
959:    PetscScalar *base;

962:   num    = gs->num_gop_local_reduce;
963:   reduce = gs->gop_local_reduce;
964:   while ((map = *reduce++))
965:     {
966:       /* wall */
967:       if (*num == 2)
968:         {
969:           num ++;
970:           vals[map[0]] += vals[map[1]];
971:         }
972:       /* corner shared by three elements */
973:       else if (*num == 3)
974:         {
975:           num ++;
976:           vals[map[0]] += (vals[map[1]] + vals[map[2]]);
977:         }
978:       /* corner shared by four elements */
979:       else if (*num == 4)
980:         {
981:           num ++;
982:           vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
983:         }
984:       /* general case ... odd geoms ... 3D*/
985:       else
986:         {
987:           num++;
988:           base = vals + *map++;
989:           while (*map >= 0)
990:             {*base += *(vals + *map++);}
991:         }
992:     }
993:   return(0);
994: }

996: /******************************************************************************/
997: PetscErrorCode gs_free( gs_id *gs)
998: {
999:    PetscInt i;

1002:   if (gs->nghs) {free((void*) gs->nghs);}
1003:   if (gs->pw_nghs) {free((void*) gs->pw_nghs);}

1005:   /* tree */
1006:   if (gs->max_left_over)
1007:     {
1008:       if (gs->tree_elms) {free((void*) gs->tree_elms);}
1009:       if (gs->tree_buf) {free((void*) gs->tree_buf);}
1010:       if (gs->tree_work) {free((void*) gs->tree_work);}
1011:       if (gs->tree_map_in) {free((void*) gs->tree_map_in);}
1012:       if (gs->tree_map_out) {free((void*) gs->tree_map_out);}
1013:     }

1015:   /* pairwise info */
1016:   if (gs->num_pairs)
1017:     {
1018:       /* should be NULL already */
1019:       if (gs->ngh_buf) {free((void*) gs->ngh_buf);}
1020:       if (gs->elms) {free((void*) gs->elms);}
1021:       if (gs->local_elms) {free((void*) gs->local_elms);}
1022:       if (gs->companion) {free((void*) gs->companion);}
1023: 
1024:       /* only set if pairwise */
1025:       if (gs->vals) {free((void*) gs->vals);}
1026:       if (gs->in) {free((void*) gs->in);}
1027:       if (gs->out) {free((void*) gs->out);}
1028:       if (gs->msg_ids_in) {free((void*) gs->msg_ids_in);}
1029:       if (gs->msg_ids_out) {free((void*) gs->msg_ids_out);}
1030:       if (gs->pw_vals) {free((void*) gs->pw_vals);}
1031:       if (gs->pw_elm_list) {free((void*) gs->pw_elm_list);}
1032:       if (gs->node_list)
1033:         {
1034:           for (i=0;i<gs->num_pairs;i++)
1035:             {if (gs->node_list[i]) {free((void*) gs->node_list[i]);}}
1036:           free((void*) gs->node_list);
1037:         }
1038:       if (gs->msg_sizes) {free((void*) gs->msg_sizes);}
1039:       if (gs->pair_list) {free((void*) gs->pair_list);}
1040:     }

1042:   /* local info */
1043:   if (gs->num_local_total>=0)
1044:     {
1045:       for (i=0;i<gs->num_local_total+1;i++)
1046:         /*      for (i=0;i<gs->num_local_total;i++) */
1047:         {
1048:           if (gs->num_gop_local_reduce[i])
1049:             {free((void*) gs->gop_local_reduce[i]);}
1050:         }
1051:     }

1053:   /* if intersection tree/pairwise and local isn't empty */
1054:   if (gs->gop_local_reduce) {free((void*) gs->gop_local_reduce);}
1055:   if (gs->num_gop_local_reduce) {free((void*) gs->num_gop_local_reduce);}

1057:   free((void*) gs);
1058:   return(0);
1059: }

1061: /******************************************************************************/
1062: PetscErrorCode gs_gop_vec( gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
1063: {

1067:   switch (*op) {
1068:   case '+':
1069:     gs_gop_vec_plus(gs,vals,step);
1070:     break;
1071:   default:
1072:     PetscInfo1(0,"gs_gop_vec() :: %c is not a valid op",op[0]);
1073:     PetscInfo(0,"gs_gop_vec() :: default :: plus");
1074:     gs_gop_vec_plus(gs,vals,step);
1075:     break;
1076:   }
1077:   return(0);
1078: }

1080: /******************************************************************************/
1081: static PetscErrorCode gs_gop_vec_plus( gs_id *gs,  PetscScalar *vals,  PetscInt step)
1082: {
1084:   if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"gs_gop_vec() passed NULL gs handle!!!");

1086:   /* local only operations!!! */
1087:   if (gs->num_local)
1088:     {gs_gop_vec_local_plus(gs,vals,step);}

1090:   /* if intersection tree/pairwise and local isn't empty */
1091:   if (gs->num_local_gop)
1092:     {
1093:       gs_gop_vec_local_in_plus(gs,vals,step);

1095:       /* pairwise */
1096:       if (gs->num_pairs)
1097:         {gs_gop_vec_pairwise_plus(gs,vals,step);}

1099:       /* tree */
1100:       else if (gs->max_left_over)
1101:         {gs_gop_vec_tree_plus(gs,vals,step);}

1103:       gs_gop_vec_local_out(gs,vals,step);
1104:     }
1105:   /* if intersection tree/pairwise and local is empty */
1106:   else
1107:     {
1108:       /* pairwise */
1109:       if (gs->num_pairs)
1110:         {gs_gop_vec_pairwise_plus(gs,vals,step);}

1112:       /* tree */
1113:       else if (gs->max_left_over)
1114:         {gs_gop_vec_tree_plus(gs,vals,step);}
1115:     }
1116:   return(0);
1117: }

1119: /******************************************************************************/
1120: static PetscErrorCode gs_gop_vec_local_plus( gs_id *gs,  PetscScalar *vals, PetscInt step)
1121: {
1122:    PetscInt *num, *map, **reduce;
1123:    PetscScalar *base;

1126:   num    = gs->num_local_reduce;
1127:   reduce = gs->local_reduce;
1128:   while ((map = *reduce))
1129:     {
1130:       base = vals + map[0] * step;

1132:       /* wall */
1133:       if (*num == 2)
1134:         {
1135:           num++; reduce++;
1136:           rvec_add (base,vals+map[1]*step,step);
1137:           rvec_copy(vals+map[1]*step,base,step);
1138:         }
1139:       /* corner shared by three elements */
1140:       else if (*num == 3)
1141:         {
1142:           num++; reduce++;
1143:           rvec_add (base,vals+map[1]*step,step);
1144:           rvec_add (base,vals+map[2]*step,step);
1145:           rvec_copy(vals+map[2]*step,base,step);
1146:           rvec_copy(vals+map[1]*step,base,step);
1147:         }
1148:       /* corner shared by four elements */
1149:       else if (*num == 4)
1150:         {
1151:           num++; reduce++;
1152:           rvec_add (base,vals+map[1]*step,step);
1153:           rvec_add (base,vals+map[2]*step,step);
1154:           rvec_add (base,vals+map[3]*step,step);
1155:           rvec_copy(vals+map[3]*step,base,step);
1156:           rvec_copy(vals+map[2]*step,base,step);
1157:           rvec_copy(vals+map[1]*step,base,step);
1158:         }
1159:       /* general case ... odd geoms ... 3D */
1160:       else
1161:         {
1162:           num++;
1163:           while (*++map >= 0)
1164:             {rvec_add (base,vals+*map*step,step);}
1165: 
1166:           map = *reduce;
1167:           while (*++map >= 0)
1168:             {rvec_copy(vals+*map*step,base,step);}
1169: 
1170:           reduce++;
1171:         }
1172:     }
1173:   return(0);
1174: }

1176: /******************************************************************************/
1177: static PetscErrorCode gs_gop_vec_local_in_plus( gs_id *gs,  PetscScalar *vals, PetscInt step)
1178: {
1179:    PetscInt  *num, *map, **reduce;
1180:    PetscScalar *base;
1182:   num    = gs->num_gop_local_reduce;
1183:   reduce = gs->gop_local_reduce;
1184:   while ((map = *reduce++))
1185:     {
1186:       base = vals + map[0] * step;

1188:       /* wall */
1189:       if (*num == 2)
1190:         {
1191:           num ++;
1192:           rvec_add(base,vals+map[1]*step,step);
1193:         }
1194:       /* corner shared by three elements */
1195:       else if (*num == 3)
1196:         {
1197:           num ++;
1198:           rvec_add(base,vals+map[1]*step,step);
1199:           rvec_add(base,vals+map[2]*step,step);
1200:         }
1201:       /* corner shared by four elements */
1202:       else if (*num == 4)
1203:         {
1204:           num ++;
1205:           rvec_add(base,vals+map[1]*step,step);
1206:           rvec_add(base,vals+map[2]*step,step);
1207:           rvec_add(base,vals+map[3]*step,step);
1208:         }
1209:       /* general case ... odd geoms ... 3D*/
1210:       else
1211:         {
1212:           num++;
1213:           while (*++map >= 0)
1214:             {rvec_add(base,vals+*map*step,step);}
1215:         }
1216:     }
1217:   return(0);
1218: }

1220: /******************************************************************************/
1221: static PetscErrorCode gs_gop_vec_local_out( gs_id *gs,  PetscScalar *vals, PetscInt step)
1222: {
1223:    PetscInt *num, *map, **reduce;
1224:    PetscScalar *base;

1227:   num    = gs->num_gop_local_reduce;
1228:   reduce = gs->gop_local_reduce;
1229:   while ((map = *reduce++))
1230:     {
1231:       base = vals + map[0] * step;

1233:       /* wall */
1234:       if (*num == 2)
1235:         {
1236:           num ++;
1237:           rvec_copy(vals+map[1]*step,base,step);
1238:         }
1239:       /* corner shared by three elements */
1240:       else if (*num == 3)
1241:         {
1242:           num ++;
1243:           rvec_copy(vals+map[1]*step,base,step);
1244:           rvec_copy(vals+map[2]*step,base,step);
1245:         }
1246:       /* corner shared by four elements */
1247:       else if (*num == 4)
1248:         {
1249:           num ++;
1250:           rvec_copy(vals+map[1]*step,base,step);
1251:           rvec_copy(vals+map[2]*step,base,step);
1252:           rvec_copy(vals+map[3]*step,base,step);
1253:         }
1254:       /* general case ... odd geoms ... 3D*/
1255:       else
1256:         {
1257:           num++;
1258:           while (*++map >= 0)
1259:             {rvec_copy(vals+*map*step,base,step);}
1260:         }
1261:     }
1262:   return(0);
1263: }

1265: /******************************************************************************/
1266: static PetscErrorCode gs_gop_vec_pairwise_plus( gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1267: {
1268:   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1269:   PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1270:   PetscInt *pw, *list, *size, **nodes;
1271:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1272:   MPI_Status status;
1273:   PetscBLASInt i1 = 1,dstep;

1277:   /* strip and load s */
1278:   msg_list =list         = gs->pair_list;
1279:   msg_size =size         = gs->msg_sizes;
1280:   msg_nodes=nodes        = gs->node_list;
1281:   iptr=pw                = gs->pw_elm_list;
1282:   dptr1=dptr3            = gs->pw_vals;
1283:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1284:   msg_ids_out = ids_out  = gs->msg_ids_out;
1285:   dptr2                  = gs->out;
1286:   in1=in2                = gs->in;

1288:   /* post the receives */
1289:   /*  msg_nodes=nodes; */
1290:   do
1291:     {
1292:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1293:          second one *list and do list++ afterwards */
1294:       MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->gs_comm, msg_ids_in);
1295:       list++;msg_ids_in++;
1296:       in1 += *size++ *step;
1297:     }
1298:   while (*++msg_nodes);
1299:   msg_nodes=nodes;

1301:   /* load gs values into in out gs buffers */
1302:   while (*iptr >= 0)
1303:     {
1304:       rvec_copy(dptr3,in_vals + *iptr*step,step);
1305:       dptr3+=step;
1306:       iptr++;
1307:     }

1309:   /* load out buffers and post the sends */
1310:   while ((iptr = *msg_nodes++))
1311:     {
1312:       dptr3 = dptr2;
1313:       while (*iptr >= 0)
1314:         {
1315:           rvec_copy(dptr2,dptr1 + *iptr*step,step);
1316:           dptr2+=step;
1317:           iptr++;
1318:         }
1319:       MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+my_id, gs->gs_comm, msg_ids_out);
1320:       msg_size++; msg_list++;msg_ids_out++;
1321:     }

1323:   /* tree */
1324:   if (gs->max_left_over)
1325:     {gs_gop_vec_tree_plus(gs,in_vals,step);}

1327:   /* process the received data */
1328:   msg_nodes=nodes;
1329:   while ((iptr = *nodes++)){
1330:     PetscScalar d1 = 1.0;
1331:       /* Should I check the return value of MPI_Wait() or status? */
1332:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1333:       MPI_Wait(ids_in, &status);
1334:       ids_in++;
1335:       while (*iptr >= 0) {
1336:         dstep = PetscBLASIntCast(step);
1337:         BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1);
1338:         in2+=step;
1339:         iptr++;
1340:       }
1341:   }

1343:   /* replace vals */
1344:   while (*pw >= 0)
1345:     {
1346:       rvec_copy(in_vals + *pw*step,dptr1,step);
1347:       dptr1+=step;
1348:       pw++;
1349:     }

1351:   /* clear isend message handles */
1352:   /* This changed for clarity though it could be the same */
1353:   while (*msg_nodes++)
1354:     /* Should I check the return value of MPI_Wait() or status? */
1355:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1356:     {MPI_Wait(ids_out, &status);ids_out++;}
1357: 

1359:   return(0);
1360: }

1362: /******************************************************************************/
1363: static PetscErrorCode gs_gop_vec_tree_plus( gs_id *gs,  PetscScalar *vals,  PetscInt step)
1364: {
1365:   PetscInt size, *in, *out;
1366:   PetscScalar *buf, *work;
1367:   PetscInt op[] = {GL_ADD,0};
1368:   PetscBLASInt i1 = 1;

1371:   /* copy over to local variables */
1372:   in   = gs->tree_map_in;
1373:   out  = gs->tree_map_out;
1374:   buf  = gs->tree_buf;
1375:   work = gs->tree_work;
1376:   size = gs->tree_nel*step;

1378:   /* zero out collection buffer */
1379:   rvec_zero(buf,size);


1382:   /* copy over my contributions */
1383:   while (*in >= 0)
1384:     {
1385:       PetscBLASInt dstep = PetscBLASIntCast(step);
1386:       BLAScopy_(&dstep,vals + *in++*step,&i1,buf + *out++*step,&i1);
1387:     }

1389:   /* perform fan in/out on full buffer */
1390:   /* must change grop to handle the blas */
1391:   grop(buf,work,size,op);

1393:   /* reset */
1394:   in   = gs->tree_map_in;
1395:   out  = gs->tree_map_out;

1397:   /* get the portion of the results I need */
1398:   while (*in >= 0)
1399:     {
1400:       PetscBLASInt dstep = PetscBLASIntCast(step);
1401:       BLAScopy_(&dstep,buf + *out++*step,&i1,vals + *in++*step,&i1);
1402:     }
1403:   return(0);
1404: }

1406: /******************************************************************************/
1407: PetscErrorCode gs_gop_hc( gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1408: {

1412:   switch (*op) {
1413:   case '+':
1414:     gs_gop_plus_hc(gs,vals,dim);
1415:     break;
1416:   default:
1417:     PetscInfo1(0,"gs_gop_hc() :: %c is not a valid op",op[0]);
1418:     PetscInfo(0,"gs_gop_hc() :: default :: plus\n");
1419:     gs_gop_plus_hc(gs,vals,dim);
1420:     break;
1421:   }
1422:   return(0);
1423: }

1425: /******************************************************************************/
1426: static PetscErrorCode gs_gop_plus_hc( gs_id *gs,  PetscScalar *vals, PetscInt dim)
1427: {
1429:   /* if there's nothing to do return */
1430:   if (dim<=0)
1431:     {  return(0);}

1433:   /* can't do more dimensions then exist */
1434:   dim = PetscMin(dim,i_log2_num_nodes);

1436:   /* local only operations!!! */
1437:   if (gs->num_local)
1438:     {gs_gop_local_plus(gs,vals);}

1440:   /* if intersection tree/pairwise and local isn't empty */
1441:   if (gs->num_local_gop)
1442:     {
1443:       gs_gop_local_in_plus(gs,vals);

1445:       /* pairwise will do tree inside ... */
1446:       if (gs->num_pairs)
1447:         {gs_gop_pairwise_plus_hc(gs,vals,dim);}

1449:       /* tree only */
1450:       else if (gs->max_left_over)
1451:         {gs_gop_tree_plus_hc(gs,vals,dim);}
1452: 
1453:       gs_gop_local_out(gs,vals);
1454:     }
1455:   /* if intersection tree/pairwise and local is empty */
1456:   else
1457:     {
1458:       /* pairwise will do tree inside */
1459:       if (gs->num_pairs)
1460:         {gs_gop_pairwise_plus_hc(gs,vals,dim);}
1461: 
1462:       /* tree */
1463:       else if (gs->max_left_over)
1464:         {gs_gop_tree_plus_hc(gs,vals,dim);}
1465:     }
1466:   return(0);
1467: }

1469: /******************************************************************************/
1470: static PetscErrorCode gs_gop_pairwise_plus_hc( gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1471: {
1472:    PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1473:    PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1474:    PetscInt *pw, *list, *size, **nodes;
1475:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1476:   MPI_Status status;
1477:   PetscInt i, mask=1;

1481:   for (i=1; i<dim; i++)
1482:     {mask<<=1; mask++;}


1485:   /* strip and load s */
1486:   msg_list =list         = gs->pair_list;
1487:   msg_size =size         = gs->msg_sizes;
1488:   msg_nodes=nodes        = gs->node_list;
1489:   iptr=pw                = gs->pw_elm_list;
1490:   dptr1=dptr3            = gs->pw_vals;
1491:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1492:   msg_ids_out = ids_out  = gs->msg_ids_out;
1493:   dptr2                  = gs->out;
1494:   in1=in2                = gs->in;

1496:   /* post the receives */
1497:   /*  msg_nodes=nodes; */
1498:   do
1499:     {
1500:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1501:          second one *list and do list++ afterwards */
1502:       if ((my_id|mask)==(*list|mask))
1503:         {
1504:           MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->gs_comm, msg_ids_in);
1505:           list++; msg_ids_in++;in1 += *size++;
1506:         }
1507:       else
1508:         {list++; size++;}
1509:     }
1510:   while (*++msg_nodes);

1512:   /* load gs values into in out gs buffers */
1513:   while (*iptr >= 0)
1514:     {*dptr3++ = *(in_vals + *iptr++);}

1516:   /* load out buffers and post the sends */
1517:   msg_nodes=nodes;
1518:   list = msg_list;
1519:   while ((iptr = *msg_nodes++))
1520:     {
1521:       if ((my_id|mask)==(*list|mask))
1522:         {
1523:           dptr3 = dptr2;
1524:           while (*iptr >= 0)
1525:             {*dptr2++ = *(dptr1 + *iptr++);}
1526:           /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1527:           /* is msg_ids_out++ correct? */
1528:           MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+my_id, gs->gs_comm, msg_ids_out);
1529:           msg_size++;list++;msg_ids_out++;
1530:         }
1531:       else
1532:         {list++; msg_size++;}
1533:     }

1535:   /* do the tree while we're waiting */
1536:   if (gs->max_left_over)
1537:     {gs_gop_tree_plus_hc(gs,in_vals,dim);}

1539:   /* process the received data */
1540:   msg_nodes=nodes;
1541:   list = msg_list;
1542:   while ((iptr = *nodes++))
1543:     {
1544:       if ((my_id|mask)==(*list|mask))
1545:         {
1546:           /* Should I check the return value of MPI_Wait() or status? */
1547:           /* Can this loop be replaced by a call to MPI_Waitall()? */
1548:           MPI_Wait(ids_in, &status);
1549:           ids_in++;
1550:           while (*iptr >= 0)
1551:             {*(dptr1 + *iptr++) += *in2++;}
1552:         }
1553:       list++;
1554:     }

1556:   /* replace vals */
1557:   while (*pw >= 0)
1558:     {*(in_vals + *pw++) = *dptr1++;}

1560:   /* clear isend message handles */
1561:   /* This changed for clarity though it could be the same */
1562:   while (*msg_nodes++)
1563:     {
1564:       if ((my_id|mask)==(*msg_list|mask))
1565:         {
1566:           /* Should I check the return value of MPI_Wait() or status? */
1567:           /* Can this loop be replaced by a call to MPI_Waitall()? */
1568:           MPI_Wait(ids_out, &status);
1569:           ids_out++;
1570:         }
1571:       msg_list++;
1572:     }

1574:   return(0);
1575: }

1577: /******************************************************************************/
1578: static PetscErrorCode gs_gop_tree_plus_hc(gs_id *gs, PetscScalar *vals, PetscInt dim)
1579: {
1580:   PetscInt size;
1581:   PetscInt *in, *out;
1582:   PetscScalar *buf, *work;
1583:   PetscInt op[] = {GL_ADD,0};

1586:   in   = gs->tree_map_in;
1587:   out  = gs->tree_map_out;
1588:   buf  = gs->tree_buf;
1589:   work = gs->tree_work;
1590:   size = gs->tree_nel;

1592:   rvec_zero(buf,size);

1594:   while (*in >= 0)
1595:     {*(buf + *out++) = *(vals + *in++);}

1597:   in   = gs->tree_map_in;
1598:   out  = gs->tree_map_out;

1600:   grop_hc(buf,work,size,op,dim);

1602:   while (*in >= 0)
1603:     {*(vals + *in++) = *(buf + *out++);}
1604:   return(0);
1605: }