Intel® OpenMP* Runtime Library
kmp_runtime.c
1 /*
2  * kmp_runtime.c -- KPTS runtime support library
3  */
4 
5 /* <copyright>
6  Copyright (c) 1997-2015 Intel Corporation. All Rights Reserved.
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11 
12  * Redistributions of source code must retain the above copyright
13  notice, this list of conditions and the following disclaimer.
14  * Redistributions in binary form must reproduce the above copyright
15  notice, this list of conditions and the following disclaimer in the
16  documentation and/or other materials provided with the distribution.
17  * Neither the name of Intel Corporation nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 </copyright> */
34 
35 #include "kmp.h"
36 #include "kmp_atomic.h"
37 #include "kmp_wrapper_getpid.h"
38 #include "kmp_environment.h"
39 #include "kmp_itt.h"
40 #include "kmp_str.h"
41 #include "kmp_settings.h"
42 #include "kmp_i18n.h"
43 #include "kmp_io.h"
44 #include "kmp_error.h"
45 #include "kmp_stats.h"
46 #include "kmp_wait_release.h"
47 
48 /* these are temporary issues to be dealt with */
49 #define KMP_USE_PRCTL 0
50 #define KMP_USE_POOLED_ALLOC 0
51 
52 #if KMP_OS_WINDOWS
53 #include <process.h>
54 #endif
55 
56 
57 #if defined(KMP_GOMP_COMPAT)
58 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
59 #endif /* defined(KMP_GOMP_COMPAT) */
60 
61 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
62 #if OMP_40_ENABLED
63  "4.0 (201307)";
64 #else
65  "3.1 (201107)";
66 #endif
67 
68 #ifdef KMP_DEBUG
69 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 
73 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
74 
75 /* ------------------------------------------------------------------------ */
76 /* ------------------------------------------------------------------------ */
77 
78 kmp_info_t __kmp_monitor;
79 
80 /* ------------------------------------------------------------------------ */
81 /* ------------------------------------------------------------------------ */
82 
83 /* Forward declarations */
84 
85 void __kmp_cleanup( void );
86 
87 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
88 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
89 static void __kmp_partition_places( kmp_team_t *team );
90 static void __kmp_do_serial_initialize( void );
91 void __kmp_fork_barrier( int gtid, int tid );
92 void __kmp_join_barrier( int gtid );
93 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
94 
95 
96 #ifdef USE_LOAD_BALANCE
97 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
98 #endif
99 
100 static int __kmp_expand_threads(int nWish, int nNeed);
101 static int __kmp_unregister_root_other_thread( int gtid );
102 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
103 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
104 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
105 
106 /* ------------------------------------------------------------------------ */
107 /* ------------------------------------------------------------------------ */
108 
109 /* Calculate the identifier of the current thread */
110 /* a fast (and somewhat portable) way to get a unique */
111 /* identifier for the executing thread. */
112 /* returns KMP_GTID_DNE if we haven't been assigned a gtid */
113 
114 int
115 __kmp_get_global_thread_id( )
116 {
117  int i;
118  kmp_info_t **other_threads;
119  size_t stack_data;
120  char *stack_addr;
121  size_t stack_size;
122  char *stack_base;
123 
124  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
125  __kmp_nth, __kmp_all_nth ));
126 
127  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
128  parallel region, made it return KMP_GTID_DNE to force serial_initialize by
129  caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
130  __kmp_init_gtid for this to work. */
131 
132  if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
133 
134 #ifdef KMP_TDATA_GTID
135  if ( TCR_4(__kmp_gtid_mode) >= 3) {
136  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
137  return __kmp_gtid;
138  }
139 #endif
140  if ( TCR_4(__kmp_gtid_mode) >= 2) {
141  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
142  return __kmp_gtid_get_specific();
143  }
144  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
145 
146  stack_addr = (char*) & stack_data;
147  other_threads = __kmp_threads;
148 
149  /*
150  ATT: The code below is a source of potential bugs due to unsynchronized access to
151  __kmp_threads array. For example:
152  1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
153  2. Current thread is suspended by OS.
154  3. Another thread unregisters and finishes (debug versions of free() may fill memory
155  with something like 0xEF).
156  4. Current thread is resumed.
157  5. Current thread reads junk from *thr.
158  TODO: Fix it.
159  --ln
160  */
161 
162  for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
163 
164  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
165  if( !thr ) continue;
166 
167  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
168  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
169 
170  /* stack grows down -- search through all of the active threads */
171 
172  if( stack_addr <= stack_base ) {
173  size_t stack_diff = stack_base - stack_addr;
174 
175  if( stack_diff <= stack_size ) {
176  /* The only way we can be closer than the allocated */
177  /* stack size is if we are running on this thread. */
178  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
179  return i;
180  }
181  }
182  }
183 
184  /* use get_specific (TLS) to try to determine our gtid */
185  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
186  "thread, using TLS\n" ));
187  i = __kmp_gtid_get_specific();
188 
189  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
190 
191  /* if we haven't been assigned a gtid, then return that code */
192  if( i<0 ) return i;
193 
194  /* dynamically updated stack window for uber threads to avoid get_specific call */
195  if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
196  KMP_FATAL( StackOverflow, i );
197  }
198 
199  stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
200  if( stack_addr > stack_base ) {
201  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
202  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
204  } else {
205  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
206  }
207 
208  /* Reprint stack bounds for ubermaster since they have been refined */
209  if ( __kmp_storage_map ) {
210  char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
211  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
212  __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
213  other_threads[i]->th.th_info.ds.ds_stacksize,
214  "th_%d stack (refinement)", i );
215  }
216  return i;
217 }
218 
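/*
 * A minimal standalone sketch (kept out of the build) of the stack-window test used by
 * __kmp_get_global_thread_id() above: the current thread is identified by checking whether
 * the address of a local variable falls inside a registered thread's stack range.
 * The stack_info_t type and find_thread_by_stack() name are illustrative only, not part of
 * the runtime.
 */
#if 0
#include <stddef.h>

typedef struct { char *stack_base; size_t stack_size; } stack_info_t;

static int
find_thread_by_stack( stack_info_t *threads, int capacity )
{
    char   local;                       /* any object on the current stack */
    char  *stack_addr = &local;
    int    i;

    for ( i = 0; i < capacity; i++ ) {
        char *base = threads[ i ].stack_base;    /* highest address; the stack grows down */
        if ( base != NULL && stack_addr <= base
             && (size_t)( base - stack_addr ) <= threads[ i ].stack_size ) {
            return i;                   /* our stack pointer lies in this thread's window */
        }
    }
    return -1;                          /* no match -- analogous to KMP_GTID_DNE */
}
#endif
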
219 int
220 __kmp_get_global_thread_id_reg( )
221 {
222  int gtid;
223 
224  if ( !__kmp_init_serial ) {
225  gtid = KMP_GTID_DNE;
226  } else
227 #ifdef KMP_TDATA_GTID
228  if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
229  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
230  gtid = __kmp_gtid;
231  } else
232 #endif
233  if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
234  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
235  gtid = __kmp_gtid_get_specific();
236  } else {
237  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
238  gtid = __kmp_get_global_thread_id();
239  }
240 
241  /* we must be a new uber master sibling thread */
242  if( gtid == KMP_GTID_DNE ) {
243  KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
244  "Registering a new gtid.\n" ));
245  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
246  if( !__kmp_init_serial ) {
247  __kmp_do_serial_initialize();
248  gtid = __kmp_gtid_get_specific();
249  } else {
250  gtid = __kmp_register_root(FALSE);
251  }
252  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
253  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
254  }
255 
256  KMP_DEBUG_ASSERT( gtid >=0 );
257 
258  return gtid;
259 }
260 
261 /* caller must hold forkjoin_lock */
262 void
263 __kmp_check_stack_overlap( kmp_info_t *th )
264 {
265  int f;
266  char *stack_beg = NULL;
267  char *stack_end = NULL;
268  int gtid;
269 
270  KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
271  if ( __kmp_storage_map ) {
272  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
273  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
274 
275  gtid = __kmp_gtid_from_thread( th );
276 
277  if (gtid == KMP_GTID_MONITOR) {
278  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
279  "th_%s stack (%s)", "mon",
280  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
281  } else {
282  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
283  "th_%d stack (%s)", gtid,
284  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
285  }
286  }
287 
288  /* No point in checking ubermaster threads since they use refinement and cannot overlap */
289  gtid = __kmp_gtid_from_thread( th );
290  if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
291  {
292  KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
293  if ( stack_beg == NULL ) {
294  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
295  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296  }
297 
298  for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
299  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301  if( f_th && f_th != th ) {
302  char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303  char *other_stack_beg = other_stack_end -
304  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
305  if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307 
308  /* Print the other stack values before the abort */
309  if ( __kmp_storage_map )
310  __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
311  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
312  "th_%d stack (overlapped)",
313  __kmp_gtid_from_thread( f_th ) );
314 
315  __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
316  }
317  }
318  }
319  }
320  KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
321 }
322 
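/*
 * A minimal standalone sketch (kept out of the build) of the interval test used in
 * __kmp_check_stack_overlap() above: two stacks overlap when either endpoint of one falls
 * strictly inside the other.  ranges_overlap() is an illustrative name, not a runtime function.
 */
#if 0
static int
ranges_overlap( char *beg, char *end, char *other_beg, char *other_end )
{
    return ( beg > other_beg && beg < other_end ) ||
           ( end > other_beg && end < other_end );
}
#endif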
323 
324 /* ------------------------------------------------------------------------ */
325 
326 /* ------------------------------------------------------------------------ */
327 
328 void
329 __kmp_infinite_loop( void )
330 {
331  static int done = FALSE;
332 
333  while (! done) {
334  KMP_YIELD( 1 );
335  }
336 }
337 
338 #define MAX_MESSAGE 512
339 
340 void
341 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
342  char buffer[MAX_MESSAGE];
343  int node;
344  va_list ap;
345 
346  va_start( ap, format);
347  KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
348  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
349  __kmp_vprintf( kmp_err, buffer, ap );
350 #if KMP_PRINT_DATA_PLACEMENT
351  if(gtid >= 0) {
352  if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
353  if( __kmp_storage_map_verbose ) {
354  node = __kmp_get_host_node(p1);
355  if(node < 0) /* doesn't work, so don't try this next time */
356  __kmp_storage_map_verbose = FALSE;
357  else {
358  char *last;
359  int lastNode;
360  int localProc = __kmp_get_cpu_from_gtid(gtid);
361 
362  p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
363  p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
364  if(localProc >= 0)
365  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
366  else
367  __kmp_printf_no_lock(" GTID %d\n", gtid);
368 # if KMP_USE_PRCTL
369 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
370  do {
371  last = p1;
372  lastNode = node;
373  /* This loop collates adjacent pages with the same host node. */
374  do {
375  (char*)p1 += PAGE_SIZE;
376  } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
377  __kmp_printf_no_lock(" %p-%p memNode %d\n", last,
378  (char*)p1 - 1, lastNode);
379  } while(p1 <= p2);
380 # else
381  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
382  (char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
383  if(p1 < p2) {
384  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
385  (char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
386  }
387 # endif
388  }
389  }
390  } else
391  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
392  }
393 #endif /* KMP_PRINT_DATA_PLACEMENT */
394  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
395 }
396 
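/*
 * A minimal standalone sketch (kept out of the build) of the page-alignment arithmetic used
 * in __kmp_print_storage_map_gtid() above: masking off the low bits rounds an address down to
 * the start of its page, assuming the page size is a power of two.  page_floor() is an
 * illustrative name only.
 */
#if 0
#include <stddef.h>

static void *
page_floor( void *p, size_t page_size )
{
    /* page_size must be a power of two for the mask trick to be valid */
    return (void *)( (size_t)p & ~( page_size - 1 ) );
}
#endif
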
397 void
398 __kmp_warn( char const * format, ... )
399 {
400  char buffer[MAX_MESSAGE];
401  va_list ap;
402 
403  if ( __kmp_generate_warnings == kmp_warnings_off ) {
404  return;
405  }
406 
407  va_start( ap, format );
408 
409  KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
410  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
411  __kmp_vprintf( kmp_err, buffer, ap );
412  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
413 
414  va_end( ap );
415 }
416 
417 void
418 __kmp_abort_process()
419 {
420 
421  // Later threads may stall here, but that's ok because abort() will kill them.
422  __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
423 
424  if ( __kmp_debug_buf ) {
425  __kmp_dump_debug_buffer();
426  }; // if
427 
428  if ( KMP_OS_WINDOWS ) {
429  // Let other threads know of abnormal termination and prevent deadlock
430  // if abort happened during library initialization or shutdown
431  __kmp_global.g.g_abort = SIGABRT;
432 
433  /*
434  On Windows* OS, abort() by default causes a pop-up error box, which stalls nightly testing.
435  Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
436  works well, but this function is not available in VS7 (this is not a problem for the DLL, but
437  it is a problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does
438  not help, at least in some versions of the MS C RTL.
439 
440  It seems the following sequence is the only way to simulate abort() and avoid the pop-up error
441  box.
442  */
443  raise( SIGABRT );
444  _exit( 3 ); // Just in case, if signal ignored, exit anyway.
445  } else {
446  abort();
447  }; // if
448 
449  __kmp_infinite_loop();
450  __kmp_release_bootstrap_lock( & __kmp_exit_lock );
451 
452 } // __kmp_abort_process
453 
454 void
455 __kmp_abort_thread( void )
456 {
457  // TODO: Eliminate g_abort global variable and this function.
458  // In case of abort just call abort(), it will kill all the threads.
459  __kmp_infinite_loop();
460 } // __kmp_abort_thread
461 
462 /* ------------------------------------------------------------------------ */
463 
464 /*
465  * Print out the storage map for the major kmp_info_t thread data structures
466  * that are allocated together.
467  */
468 
469 static void
470 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
471 {
472  __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
473 
474  __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
475  "th_%d.th_info", gtid );
476 
477  __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
478  "th_%d.th_local", gtid );
479 
480  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
481  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
482 
483  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
484  &thr->th.th_bar[bs_plain_barrier+1],
485  sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
486 
487  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
488  &thr->th.th_bar[bs_forkjoin_barrier+1],
489  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
490 
491  #if KMP_FAST_REDUCTION_BARRIER
492  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
493  &thr->th.th_bar[bs_reduction_barrier+1],
494  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
495  #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /*
499  * Print out the storage map for the major kmp_team_t team data structures
500  * that are allocated together.
501  */
502 
503 static void
504 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
505 {
506  int num_disp_buff = team->t.t_max_nproc > 1 ? KMP_MAX_DISP_BUF : 2;
507  __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
508  header, team_id );
509 
510  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
511  sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
512 
513 
514  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
515  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
516 
517  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
518  sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
519 
520  #if KMP_FAST_REDUCTION_BARRIER
521  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
522  sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
523  #endif // KMP_FAST_REDUCTION_BARRIER
524 
525  __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
527 
528  __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
529  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
530 
531  __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
532  sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
533  header, team_id );
534 
535  /*
536  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nproc[0], &team->t.t_set_nproc[num_thr],
537  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
538 
539  __kmp_print_storage_map_gtid( -1, &team->t.t_set_dynamic[0], &team->t.t_set_dynamic[num_thr],
540  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
541 
542  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nested[0], &team->t.t_set_nested[num_thr],
543  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
544 
545  __kmp_print_storage_map_gtid( -1, &team->t.t_set_blocktime[0], &team->t.t_set_blocktime[num_thr],
546  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
547 
548  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_intervals[0], &team->t.t_set_bt_intervals[num_thr],
549  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
550 
551  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr],
552  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
553 
554  //__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr],
555  // sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id );
556 
557  __kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr],
558  sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id );
559 #if OMP_40_ENABLED
560  __kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr],
561  sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id );
562 #endif
563  */
564 
565  __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
566  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
567 }
568 
569 static void __kmp_init_allocator() {}
570 static void __kmp_fini_allocator() {}
571 static void __kmp_fini_allocator_thread() {}
572 
573 /* ------------------------------------------------------------------------ */
574 
575 #ifdef KMP_DYNAMIC_LIB
576 # if KMP_OS_WINDOWS
577 
578 
579 static void
580 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
581  // TODO: Change to __kmp_break_bootstrap_lock().
582  __kmp_init_bootstrap_lock( lck ); // make the lock released
583 }
584 
585 static void
586 __kmp_reset_locks_on_process_detach( int gtid_req ) {
587  int i;
588  int thread_count;
589 
590  // PROCESS_DETACH is expected to be called by a thread
591  // that executes ProcessExit() or FreeLibrary().
592  // The OS terminates the other threads (except the one calling ProcessExit or FreeLibrary),
593  // so it might seem safe to access __kmp_threads[] without taking the forkjoin_lock.
594  // In fact, however, some threads can still be alive here, although they are about to be terminated.
595  // The entries in the array with ds_thread==0 are the most suspicious;
596  // accessing __kmp_threads[] may therefore not actually be safe.
597 
598  // TODO: does it make sense to check __kmp_roots[] ?
599 
600  // Let's check that no other live threads are registered with the OMP library.
601  while( 1 ) {
602  thread_count = 0;
603  for( i = 0; i < __kmp_threads_capacity; ++i ) {
604  if( !__kmp_threads ) continue;
605  kmp_info_t* th = __kmp_threads[ i ];
606  if( th == NULL ) continue;
607  int gtid = th->th.th_info.ds.ds_gtid;
608  if( gtid == gtid_req ) continue;
609  if( gtid < 0 ) continue;
610  DWORD exit_val;
611  int alive = __kmp_is_thread_alive( th, &exit_val );
612  if( alive ) {
613  ++thread_count;
614  }
615  }
616  if( thread_count == 0 ) break; // success
617  }
618 
619  // Assume that I'm alone.
620 
621  // Now it is probably safe to check and reset the locks.
622  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
623  __kmp_reset_lock( &__kmp_forkjoin_lock );
624  #ifdef KMP_DEBUG
625  __kmp_reset_lock( &__kmp_stdio_lock );
626  #endif // KMP_DEBUG
627 
628 
629 }
630 
631 BOOL WINAPI
632 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
633  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
634 
635  switch( fdwReason ) {
636 
637  case DLL_PROCESS_ATTACH:
638  KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
639 
640  return TRUE;
641 
642  case DLL_PROCESS_DETACH:
643  KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
644  __kmp_gtid_get_specific() ));
645 
646  if( lpReserved != NULL )
647  {
648  // lpReserved is used for telling the difference:
649  // lpReserved == NULL when FreeLibrary() was called,
650  // lpReserved != NULL when the process terminates.
651  // When FreeLibrary() is called, worker threads remain alive.
652  // So they will release the forkjoin lock by themselves.
653  // When the process terminates, worker threads disappear triggering
654  // the problem of unreleased forkjoin lock as described below.
655 
656  // A worker thread can take the forkjoin lock
657  // in __kmp_suspend_template()->__kmp_rml_decrease_load_before_sleep().
658  // The problem comes up if that worker thread becomes dead
659  // before it releases the forkjoin lock.
660  // The forkjoin lock remains taken, while the thread
661  // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
662  // will try to take the forkjoin lock and will always fail,
663  // so that the application will never finish [normally].
664  // This scenario is possible if __kmpc_end() has not been executed.
665  // This is not just a corner case; it occurs in common situations:
666  // - the main function was compiled by an alternative compiler;
667  // - the main function was compiled by icl but without /Qopenmp (application with plugins);
668  // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
669  // - a foreign thread that is still alive prevented __kmpc_end from doing cleanup.
670 
671  // This is a hack to work around the problem.
672  // TODO: !!! figure out something better.
673  __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
674  }
675 
676  __kmp_internal_end_library( __kmp_gtid_get_specific() );
677 
678  return TRUE;
679 
680  case DLL_THREAD_ATTACH:
681  KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
682 
683  /* if we wanted to register new siblings all the time here call
684  * __kmp_get_gtid(); */
685  return TRUE;
686 
687  case DLL_THREAD_DETACH:
688  KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
689  __kmp_gtid_get_specific() ));
690 
691  __kmp_internal_end_thread( __kmp_gtid_get_specific() );
692  return TRUE;
693  }
694 
695  return TRUE;
696 }
697 
698 # endif /* KMP_OS_WINDOWS */
699 #endif /* KMP_DYNAMIC_LIB */
700 
701 
702 /* ------------------------------------------------------------------------ */
703 
704 /* Change the library type to "status" and return the old type */
705 /* called from within initialization routines where __kmp_initz_lock is held */
706 int
707 __kmp_change_library( int status )
708 {
709  int old_status;
710 
711  old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
712 
713  if (status) {
714  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
715  }
716  else {
717  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
718  }
719 
720  return old_status; // return previous setting of whether KMP_LIBRARY=throughput
721 }
722 
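/*
 * A minimal standalone sketch (kept out of the build) of the encoding used by
 * __kmp_change_library() above: the library mode rides in the low bit of the yield-init
 * counter (even = throughput, odd = turnaround), so it can be toggled without a separate
 * flag.  The yield_init parameter and change_mode() name here are illustrative.
 */
#if 0
static int
change_mode( unsigned *yield_init, int turnaround )
{
    int old_turnaround = *yield_init & 1;     /* current low-bit setting */
    if ( turnaround )
        *yield_init |= 1;                     /* make the count odd  (turnaround) */
    else
        *yield_init &= ~1u;                   /* make the count even (throughput) */
    return old_turnaround;
}
#endif
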
723 /* ------------------------------------------------------------------------ */
724 /* ------------------------------------------------------------------------ */
725 
726 /* __kmp_parallel_deo --
727  * Wait until it's our turn.
728  */
729 void
730 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
731 {
732  int gtid = *gtid_ref;
733 #ifdef BUILD_PARALLEL_ORDERED
734  kmp_team_t *team = __kmp_team_from_gtid( gtid );
735 #endif /* BUILD_PARALLEL_ORDERED */
736 
737  if( __kmp_env_consistency_check ) {
738  if( __kmp_threads[gtid]->th.th_root->r.r_active )
739 #if KMP_USE_DYNAMIC_LOCK
740  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
741 #else
742  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
743 #endif
744  }
745 #ifdef BUILD_PARALLEL_ORDERED
746  if( !team->t.t_serialized ) {
747  kmp_uint32 spins;
748 
749  KMP_MB();
750  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
751  KMP_MB();
752  }
753 #endif /* BUILD_PARALLEL_ORDERED */
754 }
755 
756 /* __kmp_parallel_dxo --
757  * Signal the next task.
758  */
759 
760 void
761 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
762 {
763  int gtid = *gtid_ref;
764 #ifdef BUILD_PARALLEL_ORDERED
765  int tid = __kmp_tid_from_gtid( gtid );
766  kmp_team_t *team = __kmp_team_from_gtid( gtid );
767 #endif /* BUILD_PARALLEL_ORDERED */
768 
769  if( __kmp_env_consistency_check ) {
770  if( __kmp_threads[gtid]->th.th_root->r.r_active )
771  __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
772  }
773 #ifdef BUILD_PARALLEL_ORDERED
774  if ( ! team->t.t_serialized ) {
775  KMP_MB(); /* Flush all pending memory write invalidates. */
776 
777  /* use the tid of the next thread in this team */
778  /* TODO: replace with a general release procedure */
779  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
780 
781  KMP_MB(); /* Flush all pending memory write invalidates. */
782  }
783 #endif /* BUILD_PARALLEL_ORDERED */
784 }
785 
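/*
 * A minimal standalone sketch (kept out of the build) of the hand-off protocol implemented by
 * __kmp_parallel_deo()/__kmp_parallel_dxo() above: a shared counter names the thread whose
 * turn it is; each thread spins until the counter matches its tid, does its ordered work, then
 * passes the token to (tid + 1) % nproc.  The C11 atomics here stand in for the runtime's
 * KMP_WAIT_YIELD / KMP_MB machinery and are illustrative only.
 */
#if 0
#include <stdatomic.h>

static atomic_int ordered_turn;     /* tid whose turn it is; starts at 0 (master goes first) */

static void
ordered_enter( int tid )
{
    while ( atomic_load_explicit( &ordered_turn, memory_order_acquire ) != tid )
        ;                           /* spin (the runtime yields instead of busy-waiting) */
}

static void
ordered_exit( int tid, int nproc )
{
    atomic_store_explicit( &ordered_turn, ( tid + 1 ) % nproc,
                           memory_order_release );
}
#endif
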
786 /* ------------------------------------------------------------------------ */
787 /* ------------------------------------------------------------------------ */
788 
789 /* ------------------------------------------------------------------------ */
790 /* ------------------------------------------------------------------------ */
791 
792 /* The BARRIER for a SINGLE process section is always explicit */
793 
794 int
795 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
796 {
797  int status;
798  kmp_info_t *th;
799  kmp_team_t *team;
800 
801  if( ! TCR_4(__kmp_init_parallel) )
802  __kmp_parallel_initialize();
803 
804  th = __kmp_threads[ gtid ];
805  team = th->th.th_team;
806  status = 0;
807 
808  th->th.th_ident = id_ref;
809 
810  if ( team->t.t_serialized ) {
811  status = 1;
812  } else {
813  kmp_int32 old_this = th->th.th_local.this_construct;
814 
815  ++th->th.th_local.this_construct;
816  /* try to set team count to thread count--success means thread got the
817  single block
818  */
819  /* TODO: Should this be acquire or release? */
820  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
821  th->th.th_local.this_construct);
822 #if USE_ITT_BUILD
823  if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
824 #if OMP_40_ENABLED
825  th->th.th_teams_microtask == NULL &&
826 #endif
827  team->t.t_active_level == 1 )
828  { // Only report metadata by master of active team at level 1
829  __kmp_itt_metadata_single( id_ref );
830  }
831 #endif /* USE_ITT_BUILD */
832  }
833 
834  if( __kmp_env_consistency_check ) {
835  if (status && push_ws) {
836  __kmp_push_workshare( gtid, ct_psingle, id_ref );
837  } else {
838  __kmp_check_workshare( gtid, ct_psingle, id_ref );
839  }
840  }
841 #if USE_ITT_BUILD
842  if ( status ) {
843  __kmp_itt_single_start( gtid );
844  }
845 #endif /* USE_ITT_BUILD */
846  return status;
847 }
848 
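/*
 * A minimal standalone sketch (kept out of the build) of the election performed in
 * __kmp_enter_single() above: every thread bumps its private construct count, and the one
 * whose compare-and-swap moves the shared team counter forward owns the single block.
 * The C11 atomics here stand in for KMP_COMPARE_AND_STORE_ACQ32, and the names are
 * illustrative.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>

static bool
try_win_single( atomic_int *team_construct, int *my_construct )
{
    int expected = ( *my_construct )++;     /* value before this construct */
    /* Only the first CAS moves the shared counter forward and wins. */
    return atomic_compare_exchange_strong( team_construct, &expected,
                                           *my_construct );
}
#endif
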
849 void
850 __kmp_exit_single( int gtid )
851 {
852 #if USE_ITT_BUILD
853  __kmp_itt_single_end( gtid );
854 #endif /* USE_ITT_BUILD */
855  if( __kmp_env_consistency_check )
856  __kmp_pop_workshare( gtid, ct_psingle, NULL );
857 }
858 
859 
860 /*
861  * determine if we can go parallel or must use a serialized parallel region and
862  * how many threads we can use
863  * set_nthreads is the number of threads requested for the team
864  * returns 1 if we should serialize or only use one thread,
865  * otherwise the number of threads to use
866  * The forkjoin lock is held by the caller.
867  */
868 static int
869 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
870  int master_tid, int set_nthreads
871 #if OMP_40_ENABLED
872  , int enter_teams
873 #endif /* OMP_40_ENABLED */
874 )
875 {
876  int capacity;
877  int new_nthreads;
878  int use_rml_to_adjust_nth;
879  KMP_DEBUG_ASSERT( __kmp_init_serial );
880  KMP_DEBUG_ASSERT( root && parent_team );
881 
882  //
883  // Initial check to see if we should use a serialized team.
884  //
885  if ( set_nthreads == 1 ) {
886  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d reserving 1 thread; requested %d threads\n",
887  __kmp_get_gtid(), set_nthreads ));
888  return 1;
889  }
890  if ( ( !get__nested_2(parent_team,master_tid) && (root->r.r_in_parallel
891 #if OMP_40_ENABLED
892  && !enter_teams
893 #endif /* OMP_40_ENABLED */
894  ) ) || ( __kmp_library == library_serial ) ) {
895  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team; requested %d threads\n",
896  __kmp_get_gtid(), set_nthreads ));
897  return 1;
898  }
899 
900  //
901  // If dyn-var is set, dynamically adjust the number of desired threads,
902  // according to the method specified by dynamic_mode.
903  //
904  new_nthreads = set_nthreads;
905  use_rml_to_adjust_nth = FALSE;
906  if ( ! get__dynamic_2( parent_team, master_tid ) ) {
907  ;
908  }
909 #ifdef USE_LOAD_BALANCE
910  else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
911  new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
912  if ( new_nthreads == 1 ) {
913  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
914  master_tid ));
915  return 1;
916  }
917  if ( new_nthreads < set_nthreads ) {
918  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
919  master_tid, new_nthreads ));
920  }
921  }
922 #endif /* USE_LOAD_BALANCE */
923  else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
924  new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
925  : root->r.r_hot_team->t.t_nproc);
926  if ( new_nthreads <= 1 ) {
927  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
928  master_tid ));
929  return 1;
930  }
931  if ( new_nthreads < set_nthreads ) {
932  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
933  master_tid, new_nthreads ));
934  }
935  else {
936  new_nthreads = set_nthreads;
937  }
938  }
939  else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
940  if ( set_nthreads > 2 ) {
941  new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
942  new_nthreads = ( new_nthreads % set_nthreads ) + 1;
943  if ( new_nthreads == 1 ) {
944  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
945  master_tid ));
946  return 1;
947  }
948  if ( new_nthreads < set_nthreads ) {
949  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
950  master_tid, new_nthreads ));
951  }
952  }
953  }
954  else {
955  KMP_ASSERT( 0 );
956  }
957 
958  //
959  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
960  //
961  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
962  root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
963  int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
964  root->r.r_hot_team->t.t_nproc );
965  if ( tl_nthreads <= 0 ) {
966  tl_nthreads = 1;
967  }
968 
969  //
970  // If dyn-var is false, emit a 1-time warning.
971  //
972  if ( ! get__dynamic_2( parent_team, master_tid )
973  && ( ! __kmp_reserve_warn ) ) {
974  __kmp_reserve_warn = 1;
975  __kmp_msg(
976  kmp_ms_warning,
977  KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
978  KMP_HNT( Unset_ALL_THREADS ),
979  __kmp_msg_null
980  );
981  }
982  if ( tl_nthreads == 1 ) {
983  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
984  master_tid ));
985  return 1;
986  }
987  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
988  master_tid, tl_nthreads ));
989  new_nthreads = tl_nthreads;
990  }
991 
992 
993  //
994  // Check if the threads array is large enough, or needs expanding.
995  //
996  // See comment in __kmp_register_root() about the adjustment if
997  // __kmp_threads[0] == NULL.
998  //
999  capacity = __kmp_threads_capacity;
1000  if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
1001  --capacity;
1002  }
1003  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
1004  root->r.r_hot_team->t.t_nproc ) > capacity ) {
1005  //
1006  // Expand the threads array.
1007  //
1008  int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
1009  root->r.r_hot_team->t.t_nproc ) - capacity;
1010  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
1011  if ( slotsAdded < slotsRequired ) {
1012  //
1013  // The threads array was not expanded enough.
1014  //
1015  new_nthreads -= ( slotsRequired - slotsAdded );
1016  KMP_ASSERT( new_nthreads >= 1 );
1017 
1018  //
1019  // If dyn-var is false, emit a 1-time warning.
1020  //
1021  if ( ! get__dynamic_2( parent_team, master_tid )
1022  && ( ! __kmp_reserve_warn ) ) {
1023  __kmp_reserve_warn = 1;
1024  if ( __kmp_tp_cached ) {
1025  __kmp_msg(
1026  kmp_ms_warning,
1027  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
1028  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
1029  KMP_HNT( PossibleSystemLimitOnThreads ),
1030  __kmp_msg_null
1031  );
1032  }
1033  else {
1034  __kmp_msg(
1035  kmp_ms_warning,
1036  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
1037  KMP_HNT( SystemLimitOnThreads ),
1038  __kmp_msg_null
1039  );
1040  }
1041  }
1042  }
1043  }
1044 
1045  if ( new_nthreads == 1 ) {
1046  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
1047  __kmp_get_gtid(), set_nthreads ) );
1048  return 1;
1049  }
1050 
1051  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
1052  __kmp_get_gtid(), new_nthreads, set_nthreads ));
1053  return new_nthreads;
1054 }
1055 
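/*
 * A minimal standalone sketch (kept out of the build) of the clamping performed by
 * __kmp_reserve_threads() above against the global thread limit: the new team may only add
 * threads up to the maximum, counting the master (or the existing hot team) only once.
 * The parameter names and clamp_to_thread_limit() are illustrative.
 */
#if 0
static int
clamp_to_thread_limit( int requested, int current_nth, int max_nth, int reusable )
{
    /* 'reusable' = threads already counted in current_nth that the new team reuses */
    if ( current_nth + requested - reusable > max_nth ) {
        int allowed = max_nth - current_nth + reusable;
        return ( allowed <= 0 ) ? 1 : allowed;   /* always keep at least the master */
    }
    return requested;
}
#endif
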
1056 /* ------------------------------------------------------------------------ */
1057 /* ------------------------------------------------------------------------ */
1058 
1059 /* allocate threads from the thread pool and assign them to the new team */
1060 /* we are assured that there are enough threads available, because we
1061  * checked that earlier within the forkjoin critical section */
1062 
1063 static void
1064 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1065  kmp_info_t *master_th, int master_gtid )
1066 {
1067  int i;
1068  int use_hot_team;
1069 
1070  KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1071  KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1072  KMP_MB();
1073 
1074  /* first, let's setup the master thread */
1075  master_th->th.th_info.ds.ds_tid = 0;
1076  master_th->th.th_team = team;
1077  master_th->th.th_team_nproc = team->t.t_nproc;
1078  master_th->th.th_team_master = master_th;
1079  master_th->th.th_team_serialized = FALSE;
1080  master_th->th.th_dispatch = & team->t.t_dispatch[ 0 ];
1081 
1082  /* make sure we are not the optimized hot team */
1083 #if KMP_NESTED_HOT_TEAMS
1084  use_hot_team = 0;
1085  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1086  if( hot_teams ) { // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1087  int level = team->t.t_active_level - 1; // index in array of hot teams
1088  if( master_th->th.th_teams_microtask ) { // are we inside the teams?
1089  if( master_th->th.th_teams_size.nteams > 1 ) {
1090  ++level; // level was not increased in teams construct for team_of_masters
1091  }
1092  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1093  master_th->th.th_teams_level == team->t.t_level ) {
1094  ++level; // level was not increased in teams construct for team_of_workers before the parallel
1095  } // team->t.t_level will be increased inside parallel
1096  }
1097  if( level < __kmp_hot_teams_max_level ) {
1098  if( hot_teams[level].hot_team ) {
1099  // hot team has already been allocated for given level
1100  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1101  use_hot_team = 1; // the team is ready to use
1102  } else {
1103  use_hot_team = 0; // AC: threads are not allocated yet
1104  hot_teams[level].hot_team = team; // remember new hot team
1105  hot_teams[level].hot_team_nth = team->t.t_nproc;
1106  }
1107  } else {
1108  use_hot_team = 0;
1109  }
1110  }
1111 #else
1112  use_hot_team = team == root->r.r_hot_team;
1113 #endif
1114  if ( !use_hot_team ) {
1115 
1116  /* install the master thread */
1117  team->t.t_threads[ 0 ] = master_th;
1118  __kmp_initialize_info( master_th, team, 0, master_gtid );
1119 
1120  /* now, install the worker threads */
1121  for ( i=1 ; i < team->t.t_nproc ; i++ ) {
1122 
1123  /* fork or reallocate a new thread and install it in team */
1124  kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1125  team->t.t_threads[ i ] = thr;
1126  KMP_DEBUG_ASSERT( thr );
1127  KMP_DEBUG_ASSERT( thr->th.th_team == team );
1128  /* align team and thread arrived states */
1129  KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%u, plain=%u\n",
1130  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1131  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1132  team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1133  team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1134 #if OMP_40_ENABLED
1135  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1136  thr->th.th_teams_level = master_th->th.th_teams_level;
1137  thr->th.th_teams_size = master_th->th.th_teams_size;
1138 #endif
1139  { // Initialize threads' barrier data.
1140  int b;
1141  kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1142  for ( b = 0; b < bs_last_barrier; ++ b ) {
1143  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
1144  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1145 #if USE_DEBUGGER
1146  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1147 #endif
1148  }; // for b
1149  }
1150  }
1151 
1152 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1153  __kmp_partition_places( team );
1154 #endif
1155 
1156  }
1157 
1158  KMP_MB();
1159 }
1160 
1161 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1162 //
1163 // Propagate any changes to the floating point control registers out to the team
1164 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1165 // so we don't make changes unless they are needed.
1166 //
1167 inline static void
1168 propagateFPControl(kmp_team_t * team)
1169 {
1170  if ( __kmp_inherit_fp_control ) {
1171  kmp_int16 x87_fpu_control_word;
1172  kmp_uint32 mxcsr;
1173 
1174  // Get master values of FPU control flags (both X87 and vector)
1175  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1176  __kmp_store_mxcsr( &mxcsr );
1177  mxcsr &= KMP_X86_MXCSR_MASK;
1178 
1179  // There is no point looking at t_fp_control_saved here.
1180  // If it is TRUE, we still have to update the values if they are different from those we now have.
1181  // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1182  // that the values in the team are the same as those we have.
1183  // So, this code achieves what we need whether or not t_fp_control_saved is true.
1184  // By checking whether the value needs updating we avoid unnecessary writes that would put the
1185  // cache-line into a written state, causing all threads in the team to have to read it again.
1186  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1187  team->t.t_x87_fpu_control_word = x87_fpu_control_word;
1188  }
1189  if ( team->t.t_mxcsr != mxcsr ) {
1190  team->t.t_mxcsr = mxcsr;
1191  }
1192  // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1193  // So we must ensure it is correct.
1194  if (!team->t.t_fp_control_saved) {
1195  team->t.t_fp_control_saved = TRUE;
1196  }
1197  }
1198  else {
1199  // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1200  if (team->t.t_fp_control_saved)
1201  team->t.t_fp_control_saved = FALSE;
1202  }
1203 }
1204 
1205 // Do the opposite, setting the hardware registers to the updated values from the team.
1206 inline static void
1207 updateHWFPControl(kmp_team_t * team)
1208 {
1209  if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1210  //
1211  // Only reset the fp control regs if they have been changed in the team
1212  // during the parallel region that we are exiting.
1213  //
1214  kmp_int16 x87_fpu_control_word;
1215  kmp_uint32 mxcsr;
1216  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1217  __kmp_store_mxcsr( &mxcsr );
1218  mxcsr &= KMP_X86_MXCSR_MASK;
1219 
1220  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1221  __kmp_clear_x87_fpu_status_word();
1222  __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1223  }
1224 
1225  if ( team->t.t_mxcsr != mxcsr ) {
1226  __kmp_load_mxcsr( &team->t.t_mxcsr );
1227  }
1228  }
1229 }
1230 #else
1231 # define propagateFPControl(x) ((void)0)
1232 # define updateHWFPControl(x) ((void)0)
1233 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1234 
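/*
 * A minimal standalone sketch (kept out of the build) of the write-avoidance pattern used by
 * propagateFPControl()/updateHWFPControl() above: shared fields are compared before being
 * stored, so an unchanged value never dirties the team's cache line and forces re-reads on the
 * other threads.  The fp_state_t type and field/function names here are illustrative.
 */
#if 0
#include <stdint.h>

typedef struct { uint16_t x87_cw; uint32_t mxcsr; } fp_state_t;

static void
publish_fp_state( volatile fp_state_t *shared, fp_state_t current )
{
    if ( shared->x87_cw != current.x87_cw )   /* store only on a real change */
        shared->x87_cw = current.x87_cw;
    if ( shared->mxcsr != current.mxcsr )
        shared->mxcsr = current.mxcsr;
}
#endif
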
1235 static void
1236 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1237 
1238 /*
1239  * Run a parallel region that has been serialized, so it runs only in a team consisting of the single master thread.
1240  */
1241 void
1242 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1243 {
1244  kmp_info_t *this_thr;
1245  kmp_team_t *serial_team;
1246 
1247  KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1248 
1249  /* Skip all this code for autopar serialized loops since it results in
1250  unacceptable overhead */
1251  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1252  return;
1253 
1254  if( ! TCR_4( __kmp_init_parallel ) )
1255  __kmp_parallel_initialize();
1256 
1257  this_thr = __kmp_threads[ global_tid ];
1258  serial_team = this_thr->th.th_serial_team;
1259 
1260  /* utilize the serialized team held by this thread */
1261  KMP_DEBUG_ASSERT( serial_team );
1262  KMP_MB();
1263 
1264  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1265  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1266  KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1267  KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1268  global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1269  this_thr->th.th_task_team = NULL;
1270  }
1271 
1272 #if OMP_40_ENABLED
1273  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1274  if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1275  proc_bind = proc_bind_false;
1276  }
1277  else if ( proc_bind == proc_bind_default ) {
1278  //
1279  // No proc_bind clause was specified, so use the current value
1280  // of proc-bind-var for this parallel region.
1281  //
1282  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1283  }
1284  //
1285  // Reset for next parallel region
1286  //
1287  this_thr->th.th_set_proc_bind = proc_bind_default;
1288 #endif /* OMP_40_ENABLED */
1289 
1290  if( this_thr->th.th_team != serial_team ) {
1291  // Nested level will be an index in the nested nthreads array
1292  int level = this_thr->th.th_team->t.t_level;
1293 
1294  if( serial_team->t.t_serialized ) {
1295  /* this serial team was already used
1296  * TODO: increase performance by making these locks more specific */
1297  kmp_team_t *new_team;
1298  int tid = this_thr->th.th_info.ds.ds_tid;
1299 
1300  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1301 
1302  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1303 #if OMP_40_ENABLED
1304  proc_bind,
1305 #endif
1306  & this_thr->th.th_current_task->td_icvs,
1307  0 USE_NESTED_HOT_ARG(NULL) );
1308  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1309  KMP_ASSERT( new_team );
1310 
1311  /* setup new serialized team and install it */
1312  new_team->t.t_threads[0] = this_thr;
1313  new_team->t.t_parent = this_thr->th.th_team;
1314  serial_team = new_team;
1315  this_thr->th.th_serial_team = serial_team;
1316 
1317  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1318  global_tid, serial_team ) );
1319 
1320 
1321  /* TODO the above breaks the requirement that if we run out of
1322  * resources, then we can still guarantee that serialized teams
1323  * are ok, since we may need to allocate a new one */
1324  } else {
1325  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1326  global_tid, serial_team ) );
1327  }
1328 
1329  /* we have to initialize this serial team */
1330  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1331  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1332  KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1333  serial_team->t.t_ident = loc;
1334  serial_team->t.t_serialized = 1;
1335  serial_team->t.t_nproc = 1;
1336  serial_team->t.t_parent = this_thr->th.th_team;
1337  serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1338  this_thr->th.th_team = serial_team;
1339  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1340 
1341  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
1342  global_tid, this_thr->th.th_current_task ) );
1343  KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1344  this_thr->th.th_current_task->td_flags.executing = 0;
1345 
1346  __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1347 
1348  /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
1349  each serialized task represented by team->t.t_serialized? */
1350  copy_icvs(
1351  & this_thr->th.th_current_task->td_icvs,
1352  & this_thr->th.th_current_task->td_parent->td_icvs );
1353 
1354  // Thread value exists in the nested nthreads array for the next nested level
1355  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1356  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1357  }
1358 
1359 #if OMP_40_ENABLED
1360  if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1361  this_thr->th.th_current_task->td_icvs.proc_bind
1362  = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1363  }
1364 #endif /* OMP_40_ENABLED */
1365 
1366 #if USE_DEBUGGER
1367  serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1368 #endif
1369  this_thr->th.th_info.ds.ds_tid = 0;
1370 
1371  /* set thread cache values */
1372  this_thr->th.th_team_nproc = 1;
1373  this_thr->th.th_team_master = this_thr;
1374  this_thr->th.th_team_serialized = 1;
1375 
1376  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1377  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1378 
1379  propagateFPControl (serial_team);
1380 
1381  /* check if we need to allocate dispatch buffers stack */
1382  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1383  if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1384  serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1385  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1386  }
1387  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1388 
1389  KMP_MB();
1390 
1391  } else {
1392  /* this serialized team is already being used,
1393  * that's fine, just add another nested level */
1394  KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1395  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1396  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1397  ++ serial_team->t.t_serialized;
1398  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1399 
1400  // Nested level will be an index in the nested nthreads array
1401  int level = this_thr->th.th_team->t.t_level;
1402  // Thread value exists in the nested nthreads array for the next nested level
1403  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1404  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1405  }
1406  serial_team->t.t_level++;
1407  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1408  global_tid, serial_team, serial_team->t.t_level ) );
1409 
1410  /* allocate/push dispatch buffers stack */
1411  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1412  {
1413  dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1414  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1415  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1416  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1417  }
1418  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1419 
1420  KMP_MB();
1421  }
1422 
1423  if ( __kmp_env_consistency_check )
1424  __kmp_push_parallel( global_tid, NULL );
1425 
1426 #if USE_ITT_BUILD
1427  // Mark the start of the "parallel" region for VTune. Only one frame notification scheme is used at the moment.
1428  if ( serial_team->t.t_level == 1
1429 #if OMP_40_ENABLED
1430  && this_thr->th.th_teams_microtask == NULL
1431 #endif
1432  ) {
1433 #if USE_ITT_NOTIFY
1434  // Save the start of the "parallel" region for VTune. This also serves as the frame-begin time.
1435  if ( ( __itt_get_timestamp_ptr || KMP_ITT_DEBUG ) &&
1436  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
1437  {
1438  serial_team->t.t_region_time = this_thr->th.th_frame_time_serialized = __itt_get_timestamp();
1439  } else // only one notification scheme (either "submit" or "forking/joined", not both)
1440 #endif
1441  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
1442  __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode )
1443  {
1444  this_thr->th.th_ident = loc;
1445  // 0 - no barriers; 1 - serialized parallel
1446  __kmp_itt_region_forking( global_tid, this_thr->th.th_team_nproc, 0, 1 );
1447  }
1448  }
1449 #endif /* USE_ITT_BUILD */
1450 }
1451 
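/*
 * A minimal standalone sketch (kept out of the build) of the per-nesting-level dispatch-buffer
 * stack grown in __kmp_serialized_parallel() above: each nested serialized level pushes a
 * freshly allocated buffer onto a singly linked list headed by the serial team's dispatch
 * structure.  node_t and push_buffer() are illustrative names, not runtime types.
 */
#if 0
#include <stdlib.h>

typedef struct node { struct node *next; /* per-level dispatch data ... */ } node_t;

static void
push_buffer( node_t **head )
{
    node_t *buf = (node_t *)calloc( 1, sizeof( node_t ) );
    if ( buf != NULL ) {
        buf->next = *head;      /* new buffer becomes the top of the stack */
        *head     = buf;
    }
}
#endif
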
1452 /* most of the work for a fork */
1453 /* return true if we really went parallel, false if serialized */
1454 int
1455 __kmp_fork_call(
1456  ident_t * loc,
1457  int gtid,
1458  enum fork_context_e call_context, // Intel, GNU, ...
1459  kmp_int32 argc,
1460  microtask_t microtask,
1461  launch_t invoker,
1462 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1463 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1464  va_list * ap
1465 #else
1466  va_list ap
1467 #endif
1468  )
1469 {
1470  void **argv;
1471  int i;
1472  int master_tid;
1473  int master_this_cons;
1474  kmp_team_t *team;
1475  kmp_team_t *parent_team;
1476  kmp_info_t *master_th;
1477  kmp_root_t *root;
1478  int nthreads;
1479  int master_active;
1480  int master_set_numthreads;
1481  int level;
1482 #if OMP_40_ENABLED
1483  int active_level;
1484  int teams_level;
1485 #endif
1486 #if KMP_NESTED_HOT_TEAMS
1487  kmp_hot_team_ptr_t **p_hot_teams;
1488 #endif
1489  { // KMP_TIME_BLOCK
1490  KMP_TIME_BLOCK(KMP_fork_call);
1491 
1492  KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1493  if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
1494  /* Some systems prefer the stack for the root thread(s) to start with */
1495  /* some gap from the parent stack to prevent false sharing. */
1496  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1497  /* These 2 lines below are so this does not get optimized out */
1498  if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1499  __kmp_stkpadding += (short)((kmp_int64)dummy);
1500  }
1501 
1502  /* initialize if needed */
1503  KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1504  if( ! TCR_4(__kmp_init_parallel) )
1505  __kmp_parallel_initialize();
1506 
1507  /* setup current data */
1508  master_th = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1509  parent_team = master_th->th.th_team;
1510  master_tid = master_th->th.th_info.ds.ds_tid;
1511  master_this_cons = master_th->th.th_local.this_construct;
1512  root = master_th->th.th_root;
1513  master_active = root->r.r_active;
1514  master_set_numthreads = master_th->th.th_set_nproc;
1515  // Nested level will be an index in the nested nthreads array
1516  level = parent_team->t.t_level;
1517 #if OMP_40_ENABLED
1518  active_level = parent_team->t.t_active_level; // is used to launch non-serial teams even if nested is not allowed
1519  teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
1520 #endif
1521 #if KMP_NESTED_HOT_TEAMS
1522  p_hot_teams = &master_th->th.th_hot_teams;
1523  if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1524  *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1525  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1526  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1527  (*p_hot_teams)[0].hot_team_nth = 1; // it is either actual or not needed (when active_level > 0)
1528  }
1529 #endif
1530 
1531 #if USE_DEBUGGER
1532  if ( __kmp_debugging ) { // Let debugger override number of threads.
1533  int nth = __kmp_omp_num_threads( loc );
1534  if ( nth > 0 ) { // 0 means debugger does not want to change number of threads.
1535  master_set_numthreads = nth;
1536  }; // if
1537  }; // if
1538 #endif
1539 
1540  master_th->th.th_ident = loc;
1541 
1542 #if OMP_40_ENABLED
1543  if ( master_th->th.th_teams_microtask &&
1544  ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1545  // AC: This is start of parallel that is nested inside teams construct.
1546  // The team is actual (hot), all workers are ready at the fork barrier.
1547  // No lock is needed to do a bit of team initialization, then release the workers.
1548  parent_team->t.t_ident = loc;
1549  parent_team->t.t_argc = argc;
1550  argv = (void**)parent_team->t.t_argv;
1551  for( i=argc-1; i >= 0; --i )
1552 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1553 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1554  *argv++ = va_arg( *ap, void * );
1555 #else
1556  *argv++ = va_arg( ap, void * );
1557 #endif
1558  /* Increment our nested depth level, but do not increase the serialization count */
1559  if ( parent_team == master_th->th.th_serial_team ) {
1560  // AC: we are in serialized parallel
1561  __kmpc_serialized_parallel(loc, gtid);
1562  KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
1563  parent_team->t.t_serialized--; // AC: needed so that enquiry functions
1564  // work correctly; will be restored at join time
1565  KMP_TIME_BLOCK(OMP_work);
1566  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
1567  return TRUE;
1568  }
1569  parent_team->t.t_pkfn = microtask;
1570  parent_team->t.t_invoke = invoker;
1571  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1572  parent_team->t.t_active_level ++;
1573  parent_team->t.t_level ++;
1574 
1575  /* Change number of threads in the team if requested */
1576  if ( master_set_numthreads ) { // The parallel has num_threads clause
1577  if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
1578  // AC: can only reduce the number of threads dynamically, cannot increase it
1579  kmp_info_t **other_threads = parent_team->t.t_threads;
1580  parent_team->t.t_nproc = master_set_numthreads;
1581  for ( i = 0; i < master_set_numthreads; ++i ) {
1582  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1583  }
1584  // Keep extra threads hot in the team for possible next parallels
1585  }
1586  master_th->th.th_set_nproc = 0;
1587  }
1588 
1589 
1590  KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1591  __kmp_internal_fork( loc, gtid, parent_team );
1592  KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1593 
1594  /* Invoke microtask for MASTER thread */
1595  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1596  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1597 
1598  {
1599  KMP_TIME_BLOCK(OMP_work);
1600  if (! parent_team->t.t_invoke( gtid )) {
1601  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1602  }
1603  }
1604  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1605  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1606  KMP_MB(); /* Flush all pending memory write invalidates. */
1607 
1608  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1609 
1610  return TRUE;
1611  } // Parallel closely nested in teams construct
1612 #endif /* OMP_40_ENABLED */
1613 
1614 #if KMP_DEBUG
1615  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1616  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1617  }
1618 #endif
1619 
1620  /* determine how many new threads we can use */
1621  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1622 
1623  if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1624  nthreads = 1;
1625  } else {
1626  nthreads = master_set_numthreads ?
1627  master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1628  nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1629 #if OMP_40_ENABLED
1630 /* AC: If we execute teams from a parallel region (on the host), then teams should be created
1631  but each can have only 1 thread if nesting is disabled. If teams is called from a serial region,
1632  then teams and their threads should be created regardless of the nesting setting. */
1633  , ((ap==NULL && active_level==0) ||
1634  (ap && teams_level>0 && teams_level==level))
1635 #endif /* OMP_40_ENABLED */
1636  );
1637  }
1638  KMP_DEBUG_ASSERT( nthreads > 0 );
1639 
1640  /* If we temporarily changed the set number of threads then restore it now */
1641  master_th->th.th_set_nproc = 0;
1642 
1643 
1644  /* create a serialized parallel region? */
1645  if ( nthreads == 1 ) {
1646  /* josh todo: hypothetical question: what do we do for OS X*? */
1647 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1648  void * args[ argc ];
1649 #else
1650  void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1651 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
1652 
1653  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1654  KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1655 
1656  __kmpc_serialized_parallel(loc, gtid);
1657 
1658  if ( call_context == fork_context_intel ) {
1659  /* TODO this sucks, use the compiler itself to pass args! :) */
1660  master_th->th.th_serial_team->t.t_ident = loc;
1661 #if OMP_40_ENABLED
1662  if ( !ap ) {
1663  // revert change made in __kmpc_serialized_parallel()
1664  master_th->th.th_serial_team->t.t_level--;
1665  // Get args from parent team for teams construct
1666  {
1667  KMP_TIME_BLOCK(OMP_work);
1668  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
1669  }
1670  } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1671  KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1672  team = master_th->th.th_team;
1673  //team->t.t_pkfn = microtask;
1674  team->t.t_invoke = invoker;
1675  __kmp_alloc_argv_entries( argc, team, TRUE );
1676  team->t.t_argc = argc;
1677  argv = (void**) team->t.t_argv;
1678  if ( ap ) {
1679  for( i=argc-1; i >= 0; --i )
1680 // TODO: revert workaround for Intel(R) 64 tracker #96
1681 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1682  *argv++ = va_arg( *ap, void * );
1683 # else
1684  *argv++ = va_arg( ap, void * );
1685 # endif
1686  } else {
1687  for( i=0; i < argc; ++i )
1688  // Get args from parent team for teams construct
1689  argv[i] = parent_team->t.t_argv[i];
1690  }
1691  // AC: revert change made in __kmpc_serialized_parallel()
1692  // because initial code in teams should have level=0
1693  team->t.t_level--;
1694  // AC: call special invoker for outer "parallel" of the teams construct
1695  {
1696  KMP_TIME_BLOCK(OMP_work);
1697  invoker(gtid);
1698  }
1699  } else {
1700 #endif /* OMP_40_ENABLED */
1701  argv = args;
1702  for( i=argc-1; i >= 0; --i )
1703 // TODO: revert workaround for Intel(R) 64 tracker #96
1704 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1705  *argv++ = va_arg( *ap, void * );
1706 #else
1707  *argv++ = va_arg( ap, void * );
1708 #endif
1709  KMP_MB();
1710  {
1711  KMP_TIME_BLOCK(OMP_work);
1712  __kmp_invoke_microtask( microtask, gtid, 0, argc, args );
1713  }
1714 #if OMP_40_ENABLED
1715  }
1716 #endif /* OMP_40_ENABLED */
1717  }
1718  else if ( call_context == fork_context_gnu ) {
1719  // we were called from GNU native code
1720  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1721  return FALSE;
1722  }
1723  else {
1724  KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1725  }
1726 
1727 
1728  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1729  KMP_MB();
1730  return FALSE;
1731  }
1732 
1733  // GEH: only modify the executing flag in the case when not serialized
1734  // serialized case is handled in kmpc_serialized_parallel
1735  KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1736  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1737  master_th->th.th_current_task->td_icvs.max_active_levels ) );
1738  // TODO: GEH - cannot do this assertion because root thread not set up as executing
1739  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1740  master_th->th.th_current_task->td_flags.executing = 0;
1741 
1742 #if OMP_40_ENABLED
1743  if ( !master_th->th.th_teams_microtask || level > teams_level )
1744 #endif /* OMP_40_ENABLED */
1745  {
1746  /* Increment our nested depth level */
1747  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1748  }
1749 
1750  // See if we need to make a copy of the ICVs.
1751  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1752  if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1753  nthreads_icv = __kmp_nested_nth.nth[level+1];
1754  }
1755  else {
1756  nthreads_icv = 0; // don't update
1757  }
1758 
1759 #if OMP_40_ENABLED
1760  // Figure out the proc_bind_policy for the new team.
1761  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1762  kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1763  if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1764  proc_bind = proc_bind_false;
1765  }
1766  else {
1767  if (proc_bind == proc_bind_default) {
1768  // No proc_bind clause specified; use current proc-bind-var for this parallel region
1769  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1770  }
1771  /* else: The proc_bind policy was specified explicitly on parallel clause. This
1772  overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
1773  // Figure the value of proc-bind-var for the child threads.
1774  if ((level+1 < __kmp_nested_proc_bind.used)
1775  && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
1776  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
1777  }
1778  }
1779 
1780  // Reset for next parallel region
1781  master_th->th.th_set_proc_bind = proc_bind_default;
1782 #endif /* OMP_40_ENABLED */
1783 
1784  if ((nthreads_icv > 0)
1785 #if OMP_40_ENABLED
1786  || (proc_bind_icv != proc_bind_default)
1787 #endif /* OMP_40_ENABLED */
1788  ) {
1789  kmp_internal_control_t new_icvs;
1790  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1791  new_icvs.next = NULL;
1792  if (nthreads_icv > 0) {
1793  new_icvs.nproc = nthreads_icv;
1794  }
1795 
1796 #if OMP_40_ENABLED
1797  if (proc_bind_icv != proc_bind_default) {
1798  new_icvs.proc_bind = proc_bind_icv;
1799  }
1800 #endif /* OMP_40_ENABLED */
1801 
1802  /* allocate a new parallel team */
1803  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1804  team = __kmp_allocate_team(root, nthreads, nthreads,
1805 #if OMP_40_ENABLED
1806  proc_bind,
1807 #endif
1808  &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
1809  } else {
1810  /* allocate a new parallel team */
1811  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1812  team = __kmp_allocate_team(root, nthreads, nthreads,
1813 #if OMP_40_ENABLED
1814  proc_bind,
1815 #endif
1816  &master_th->th.th_current_task->td_icvs, argc
1817  USE_NESTED_HOT_ARG(master_th) );
1818  }
1819  KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
1820 
1821  /* setup the new team */
1822  team->t.t_master_tid = master_tid;
1823  team->t.t_master_this_cons = master_this_cons;
1824  team->t.t_ident = loc;
1825  team->t.t_parent = parent_team;
1826  TCW_SYNC_PTR(team->t.t_pkfn, microtask);
1827  team->t.t_invoke = invoker; /* TODO move this to root, maybe */
1828  // TODO: parent_team->t.t_level == INT_MAX ???
1829 #if OMP_40_ENABLED
1830  if ( !master_th->th.th_teams_microtask || level > teams_level ) {
1831 #endif /* OMP_40_ENABLED */
1832  team->t.t_level = parent_team->t.t_level + 1;
1833  team->t.t_active_level = parent_team->t.t_active_level + 1;
1834 #if OMP_40_ENABLED
1835  } else {
1836  // AC: Do not increase parallel level at start of the teams construct
1837  team->t.t_level = parent_team->t.t_level;
1838  team->t.t_active_level = parent_team->t.t_active_level;
1839  }
1840 #endif /* OMP_40_ENABLED */
1841  team->t.t_sched = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule
1842 
1843  // Update the floating point rounding in the team if required.
1844  propagateFPControl(team);
1845 
1846  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1847  // Set master's task team to team's task team. Unless this is hot team, it should be NULL.
1848  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1849  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
1850  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
1851  parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
1852  if (level) {
1853  // Take a memo of master's task_state
1854  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
1855  if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
1856  kmp_uint8 *old_stack, *new_stack = (kmp_uint8 *) __kmp_allocate( 2*master_th->th.th_task_state_stack_sz );
1857  kmp_uint32 i;
1858  for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
1859  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
1860  }
1861  old_stack = master_th->th.th_task_state_memo_stack;
1862  master_th->th.th_task_state_memo_stack = new_stack;
1863  master_th->th.th_task_state_stack_sz *= 2;
1864  __kmp_free(old_stack);
1865  }
1866  // Store master's task_state on stack
1867  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
1868  master_th->th.th_task_state_top++;
1869  master_th->th.th_task_state = 0;
1870  }
1871  master_th->th.th_task_team = team->t.t_task_team[master_th->th.th_task_state];
1872 
1873 #if !KMP_NESTED_HOT_TEAMS
1874  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
1875 #endif
1876  }
1877 
1878  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
1879  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
1880  KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
1881  ( team->t.t_master_tid == 0 &&
1882  ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
1883  KMP_MB();
1884 
1885  /* now, setup the arguments */
1886  argv = (void**)team->t.t_argv;
1887 #if OMP_40_ENABLED
1888  if ( ap ) {
1889 #endif /* OMP_40_ENABLED */
1890  for ( i=argc-1; i >= 0; --i )
1891 // TODO: revert workaround for Intel(R) 64 tracker #96
1892 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1893  *argv++ = va_arg( *ap, void * );
1894 #else
1895  *argv++ = va_arg( ap, void * );
1896 #endif
1897 #if OMP_40_ENABLED
1898  } else {
1899  for ( i=0; i < argc; ++i )
1900  // Get args from parent team for teams construct
1901  argv[i] = team->t.t_parent->t.t_argv[i];
1902  }
1903 #endif /* OMP_40_ENABLED */
1904 
1905  /* now actually fork the threads */
1906  team->t.t_master_active = master_active;
1907  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
1908  root->r.r_active = TRUE;
1909 
1910  __kmp_fork_team_threads( root, team, master_th, gtid );
1911  __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
1912 
1913 
1914  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1915 
1916 
1917 #if USE_ITT_BUILD
1918  if ( team->t.t_active_level == 1 // only report frames at level 1
1919 # if OMP_40_ENABLED
1920  && !master_th->th.th_teams_microtask // not in teams construct
1921 # endif /* OMP_40_ENABLED */
1922  ) {
1923 #if USE_ITT_NOTIFY
1924  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
1925  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
1926  {
1927  kmp_uint64 tmp_time = 0;
1928  if ( __itt_get_timestamp_ptr )
1929  tmp_time = __itt_get_timestamp();
1930  // Internal fork - report frame begin
1931  master_th->th.th_frame_time = tmp_time;
1932  if ( __kmp_forkjoin_frames_mode == 3 )
1933  team->t.t_region_time = tmp_time;
1934  } else // only one notification scheme (either "submit" or "forking/joined", not both)
1935 #endif /* USE_ITT_NOTIFY */
1936  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
1937  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
1938  { // Mark start of "parallel" region for VTune.
1939  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
1940  }
1941  }
1942 #endif /* USE_ITT_BUILD */
1943 
1944  /* now go on and do the work */
1945  KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
1946  KMP_MB();
1947  KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
1948  root, team, master_th, gtid));
1949 
1950 #if USE_ITT_BUILD
1951  if ( __itt_stack_caller_create_ptr ) {
1952  team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
1953  }
1954 #endif /* USE_ITT_BUILD */
1955 
1956 #if OMP_40_ENABLED
1957  if ( ap ) // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
1958 #endif /* OMP_40_ENABLED */
1959  {
1960  __kmp_internal_fork( loc, gtid, team );
1961  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
1962  root, team, master_th, gtid));
1963  }
1964 
1965  if (call_context == fork_context_gnu) {
1966  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1967  return TRUE;
1968  }
1969 
1970  /* Invoke microtask for MASTER thread */
1971  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1972  gtid, team->t.t_id, team->t.t_pkfn ) );
1973  } // END of timer KMP_fork_call block
1974 
1975  {
1976  //KMP_TIME_BLOCK(OMP_work);
1977  KMP_TIME_BLOCK(USER_master_invoke);
1978  if (! team->t.t_invoke( gtid )) {
1979  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1980  }
1981  }
1982  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1983  gtid, team->t.t_id, team->t.t_pkfn ) );
1984  KMP_MB(); /* Flush all pending memory write invalidates. */
1985 
1986  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1987 
1988  return TRUE;
1989 }
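
An illustrative user-level view of the fork paths above (not part of this file): the outer region below goes through normal team allocation, while the inner one is either serialized through the nthreads == 1 branch or really forked, depending on whether nested parallelism is enabled.

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        omp_set_nested(1);              /* let the inner fork really go parallel */
        omp_set_max_active_levels(2);
        #pragma omp parallel num_threads(2)
        {
            #pragma omp parallel num_threads(2)
            {
                printf("level=%d ancestor@1=%d tid=%d\n",
                       omp_get_level(),
                       omp_get_ancestor_thread_num(1),
                       omp_get_thread_num());
            }
        }
        return 0;
    }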
1990 
1991 void
1992 __kmp_join_call(ident_t *loc, int gtid
1993 #if OMP_40_ENABLED
1994  , int exit_teams
1995 #endif /* OMP_40_ENABLED */
1996 )
1997 {
1998  KMP_TIME_BLOCK(KMP_join_call);
1999  kmp_team_t *team;
2000  kmp_team_t *parent_team;
2001  kmp_info_t *master_th;
2002  kmp_root_t *root;
2003  int master_active;
2004  int i;
2005 
2006  KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2007 
2008  /* setup current data */
2009  master_th = __kmp_threads[ gtid ];
2010  root = master_th->th.th_root;
2011  team = master_th->th.th_team;
2012  parent_team = team->t.t_parent;
2013 
2014  master_th->th.th_ident = loc;
2015 
2016 #if KMP_DEBUG
2017  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2018  KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2019  __kmp_gtid_from_thread( master_th ), team,
2020  team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2021  KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2022  }
2023 #endif
2024 
2025  if( team->t.t_serialized ) {
2026 #if OMP_40_ENABLED
2027  if ( master_th->th.th_teams_microtask ) {
2028  // We are in teams construct
2029  int level = team->t.t_level;
2030  int tlevel = master_th->th.th_teams_level;
2031  if ( level == tlevel ) {
2032  // AC: we haven't incremented it earlier at start of teams construct,
2033  // so do it here - at the end of teams construct
2034  team->t.t_level++;
2035  } else if ( level == tlevel + 1 ) {
2036  // AC: we are exiting parallel inside teams, need to increment serialization
2037  // in order to restore it in the next call to __kmpc_end_serialized_parallel
2038  team->t.t_serialized++;
2039  }
2040  }
2041 #endif /* OMP_40_ENABLED */
2042  __kmpc_end_serialized_parallel( loc, gtid );
2043  return;
2044  }
2045 
2046  master_active = team->t.t_master_active;
2047 
2048 #if OMP_40_ENABLED
2049  if (!exit_teams)
2050 #endif /* OMP_40_ENABLED */
2051  {
2052  // AC: No barrier for internal teams at exit from teams construct.
2053  // But there is barrier for external team (league).
2054  __kmp_internal_join( loc, gtid, team );
2055  }
2056  else {
2057  master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel)
2058  }
2059 
2060  KMP_MB();
2061 
2062 #if USE_ITT_BUILD
2063  if ( __itt_stack_caller_create_ptr ) {
2064  __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2065  }
2066 
2067  // Mark end of "parallel" region for VTune.
2068  if ( team->t.t_active_level == 1
2069 # if OMP_40_ENABLED
2070  && !master_th->th.th_teams_microtask /* not in teams construct */
2071 # endif /* OMP_40_ENABLED */
2072  ) {
2073  master_th->th.th_ident = loc;
2074  // only one notification scheme (either "submit" or "forking/joined", not both)
2075  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2076  __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2077  0, loc, master_th->th.th_team_nproc, 1 );
2078  else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2079  ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2080  __kmp_itt_region_joined( gtid );
2081  } // active_level == 1
2082 #endif /* USE_ITT_BUILD */
2083 
2084 #if OMP_40_ENABLED
2085  if ( master_th->th.th_teams_microtask &&
2086  !exit_teams &&
2087  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2088  team->t.t_level == master_th->th.th_teams_level + 1 ) {
2089  // AC: We need to leave the team structure intact at the end
2090  // of a parallel inside the teams construct, so that at the next
2091  // parallel the same (hot) team works; only adjust the nesting levels
2092 
2093  /* Decrement our nested depth level */
2094  team->t.t_level --;
2095  team->t.t_active_level --;
2096  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2097 
2098  /* Restore number of threads in the team if needed */
2099  if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2100  int old_num = master_th->th.th_team_nproc;
2101  int new_num = master_th->th.th_teams_size.nth;
2102  kmp_info_t **other_threads = team->t.t_threads;
2103  kmp_task_team_t * task_team = master_th->th.th_task_team;
2104  team->t.t_nproc = new_num;
2105  if ( task_team ) { // task team might have smaller counter values
2106  task_team->tt.tt_ref_ct = new_num - 1;
2107  task_team->tt.tt_unfinished_threads = new_num;
2108  }
2109  for ( i = 0; i < old_num; ++i ) {
2110  other_threads[i]->th.th_team_nproc = new_num;
2111  }
2112  // Adjust states of non-used threads of the team
2113  for ( i = old_num; i < new_num; ++i ) {
2114  // Re-initialize thread's barrier data.
2115  int b;
2116  kmp_balign_t * balign = other_threads[i]->th.th_bar;
2117  for ( b = 0; b < bs_last_barrier; ++ b ) {
2118  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
2119  KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2120 #if USE_DEBUGGER
2121  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2122 #endif
2123  }
2124  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2125  // Synchronize thread's task state
2126  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2127  }
2128  }
2129  }
2130  return;
2131  }
2132 #endif /* OMP_40_ENABLED */
2133 
2134  /* do cleanup and restore the parent team */
2135  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2136  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2137 
2138  master_th->th.th_dispatch =
2139  & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2140 
2141  /* jc: The following lock has instructions with REL and ACQ semantics,
2142  separating the parallel user code called in this parallel region
2143  from the serial user code called after this function returns.
2144  */
2145  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2146 
2147 #if OMP_40_ENABLED
2148  if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2149 #endif /* OMP_40_ENABLED */
2150  {
2151  /* Decrement our nested depth level */
2152  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2153  }
2154  KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2155 
2156  KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2157  0, master_th, team ) );
2158  __kmp_pop_current_task_from_thread( master_th );
2159 
2160 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2161  //
2162  // Restore master thread's partition.
2163  //
2164  master_th->th.th_first_place = team->t.t_first_place;
2165  master_th->th.th_last_place = team->t.t_last_place;
2166 #endif /* OMP_40_ENABLED */
2167 
2168  updateHWFPControl (team);
2169 
2170  if ( root->r.r_active != master_active )
2171  root->r.r_active = master_active;
2172 
2173  __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2174 
2175  /* this race was fun to find. make sure the following is in the critical
2176  * region otherwise assertions may fail occasionally since the old team
2177  * may be reallocated and the hierarchy appears inconsistent. it is
2178  * actually safe to run and won't cause any bugs, but will cause those
2179  * assertion failures. it's only one deref&assign so might as well put this
2180  * in the critical region */
2181  master_th->th.th_team = parent_team;
2182  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2183  master_th->th.th_team_master = parent_team->t.t_threads[0];
2184  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2185 
2186  /* restore serialized team, if need be */
2187  if( parent_team->t.t_serialized &&
2188  parent_team != master_th->th.th_serial_team &&
2189  parent_team != root->r.r_root_team ) {
2190  __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2191  master_th->th.th_serial_team = parent_team;
2192  }
2193 
2194  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2195  // Restore task state from memo stack
2196  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2197  if (master_th->th.th_task_state_top > 0) {
2198  --master_th->th.th_task_state_top; // pop
2199  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2200  }
2201  // Copy the first task team from the new child / old parent team to the thread and reset state flag.
2202  master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2203 
2204  KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2205  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2206  parent_team ) );
2207  }
2208 
2209  // TODO: GEH - cannot do this assertion because root thread not set up as executing
2210  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2211  master_th->th.th_current_task->td_flags.executing = 1;
2212 
2213  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2214 
2215  KMP_MB();
2216  KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2217 }
2218 
2219 /* ------------------------------------------------------------------------ */
2220 /* ------------------------------------------------------------------------ */
2221 
2222 /* Check whether we should push an internal control record onto the
2223  serial team stack. If so, do it. */
2224 void
2225 __kmp_save_internal_controls ( kmp_info_t * thread )
2226 {
2227 
2228  if ( thread->th.th_team != thread->th.th_serial_team ) {
2229  return;
2230  }
2231  if (thread->th.th_team->t.t_serialized > 1) {
2232  int push = 0;
2233 
2234  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2235  push = 1;
2236  } else {
2237  if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2238  thread->th.th_team->t.t_serialized ) {
2239  push = 1;
2240  }
2241  }
2242  if (push) { /* push a record on the serial team's stack */
2243  kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2244 
2245  copy_icvs( control, & thread->th.th_current_task->td_icvs );
2246 
2247  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2248 
2249  control->next = thread->th.th_team->t.t_control_stack_top;
2250  thread->th.th_team->t.t_control_stack_top = control;
2251  }
2252  }
2253 }
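
For context, the control records pushed above give serialized nested regions the usual OpenMP ICV scoping: a setting changed inside the region is restored when the region ends. A minimal sketch (illustrative only):

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        omp_set_num_threads(4);
        #pragma omp parallel num_threads(1)     /* serialized region */
        {
            omp_set_num_threads(2);             /* affects only this data environment */
        }
        /* The saved internal controls are restored at region exit,
           so the encountering thread still requests 4 threads here. */
        printf("nthreads-var after the region: %d\n", omp_get_max_threads());
        return 0;
    }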
2254 
2255 /* Changes set_nproc */
2256 void
2257 __kmp_set_num_threads( int new_nth, int gtid )
2258 {
2259  kmp_info_t *thread;
2260  kmp_root_t *root;
2261 
2262  KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2263  KMP_DEBUG_ASSERT( __kmp_init_serial );
2264 
2265  if (new_nth < 1)
2266  new_nth = 1;
2267  else if (new_nth > __kmp_max_nth)
2268  new_nth = __kmp_max_nth;
2269 
2270  thread = __kmp_threads[gtid];
2271 
2272  __kmp_save_internal_controls( thread );
2273 
2274  set__nproc( thread, new_nth );
2275 
2276  //
2277  // If this omp_set_num_threads() call will cause the hot team size to be
2278  // reduced (in the absence of a num_threads clause), then reduce it now,
2279  // rather than waiting for the next parallel region.
2280  //
2281  root = thread->th.th_root;
2282  if ( __kmp_init_parallel && ( ! root->r.r_active )
2283  && ( root->r.r_hot_team->t.t_nproc > new_nth )
2284 #if KMP_NESTED_HOT_TEAMS
2285  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2286 #endif
2287  ) {
2288  kmp_team_t *hot_team = root->r.r_hot_team;
2289  int f;
2290 
2291  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2292 
2293 
2294  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2295  int tt_idx;
2296  for (tt_idx=0; tt_idx<2; ++tt_idx) {
2297  kmp_task_team_t *task_team = hot_team->t.t_task_team[tt_idx];
2298  if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
2299  // Signal worker threads (esp. the extra ones) to stop looking for tasks while spin waiting.
2300  // The task teams are reference counted and will be deallocated by the last worker thread.
2301  KMP_DEBUG_ASSERT( hot_team->t.t_nproc > 1 );
2302  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
2303  KMP_MB();
2304  KA_TRACE( 20, ( "__kmp_set_num_threads: setting task_team %p to NULL\n",
2305  &hot_team->t.t_task_team[tt_idx] ) );
2306  hot_team->t.t_task_team[tt_idx] = NULL;
2307  }
2308  else {
2309  KMP_DEBUG_ASSERT( task_team == NULL );
2310  }
2311  }
2312  }
2313 
2314  //
2315  // Release the extra threads we don't need any more.
2316  //
2317  for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
2318  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2319  __kmp_free_thread( hot_team->t.t_threads[f] );
2320  hot_team->t.t_threads[f] = NULL;
2321  }
2322  hot_team->t.t_nproc = new_nth;
2323 #if KMP_NESTED_HOT_TEAMS
2324  if( thread->th.th_hot_teams ) {
2325  KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2326  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2327  }
2328 #endif
2329 
2330 
2331  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2332 
2333  //
2334  // Update the t_nproc field in the threads that are still active.
2335  //
2336  for( f=0 ; f < new_nth; f++ ) {
2337  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2338  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2339  }
2340  // Special flag in case omp_set_num_threads() call
2341  hot_team->t.t_size_changed = -1;
2342  }
2343 
2344 }
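
__kmp_set_num_threads() is the internal side of omp_set_num_threads(); the sketch below (illustrative, not part of this file) shows the situation the hot-team trimming above is aimed at: the requested thread count drops between two regions while the root is inactive.

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        omp_set_num_threads(8);
        #pragma omp parallel
        { /* the hot team grows to 8 workers */ }

        omp_set_num_threads(2);     /* extra hot-team threads can be released here */
        #pragma omp parallel
        {
            #pragma omp master
            printf("team size now: %d\n", omp_get_num_threads());
        }
        return 0;
    }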
2345 
2346 /* Changes max_active_levels */
2347 void
2348 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2349 {
2350  kmp_info_t *thread;
2351 
2352  KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2353  KMP_DEBUG_ASSERT( __kmp_init_serial );
2354 
2355  // validate max_active_levels
2356  if( max_active_levels < 0 ) {
2357  KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2358  // We ignore this call if the user has specified a negative value.
2359  // The current setting won't be changed. The last valid setting will be used.
2360  // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2361  KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2362  return;
2363  }
2364  if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2365  // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2366  // We allow a zero value. (implementation defined behavior)
2367  } else {
2368  KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
2369  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2370  // Current upper limit is MAX_INT. (implementation defined behavior)
2371  // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
2372  // Actually, the flow should never get here as long as the upper limit is MAX_INT.
2373  }
2374  KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2375 
2376  thread = __kmp_threads[ gtid ];
2377 
2378  __kmp_save_internal_controls( thread );
2379 
2380  set__max_active_levels( thread, max_active_levels );
2381 
2382 }
2383 
2384 /* Gets max_active_levels */
2385 int
2386 __kmp_get_max_active_levels( int gtid )
2387 {
2388  kmp_info_t *thread;
2389 
2390  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2391  KMP_DEBUG_ASSERT( __kmp_init_serial );
2392 
2393  thread = __kmp_threads[ gtid ];
2394  KMP_DEBUG_ASSERT( thread->th.th_current_task );
2395  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2396  gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2397  return thread->th.th_current_task->td_icvs.max_active_levels;
2398 }
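
These two routines back omp_set_max_active_levels() and omp_get_max_active_levels(). A short usage sketch, relying on the validation shown above (a negative value is ignored with a warning):

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        omp_set_max_active_levels(1);       /* nested regions beyond level 1 are serialized */
        printf("max active levels: %d\n", omp_get_max_active_levels());

        omp_set_max_active_levels(-3);      /* negative input: warning issued, old value kept */
        printf("still: %d\n", omp_get_max_active_levels());
        return 0;
    }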
2399 
2400 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2401 void
2402 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2403 {
2404  kmp_info_t *thread;
2405 // kmp_team_t *team;
2406 
2407  KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2408  KMP_DEBUG_ASSERT( __kmp_init_serial );
2409 
2410  // Check if the kind parameter is valid, correct if needed.
2411  // Valid parameters should fit in one of two intervals - standard or extended:
2412  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2413  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2414  if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2415  ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2416  {
2417  // TODO: Hint needs attention in case we change the default schedule.
2418  __kmp_msg(
2419  kmp_ms_warning,
2420  KMP_MSG( ScheduleKindOutOfRange, kind ),
2421  KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2422  __kmp_msg_null
2423  );
2424  kind = kmp_sched_default;
2425  chunk = 0; // ignore chunk value in case of bad kind
2426  }
2427 
2428  thread = __kmp_threads[ gtid ];
2429 
2430  __kmp_save_internal_controls( thread );
2431 
2432  if ( kind < kmp_sched_upper_std ) {
2433  if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
2434  // differentiate static chunked vs. unchunked:
2435  // chunk should be invalid to indicate unchunked schedule (which is the default)
2436  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2437  } else {
2438  thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2439  }
2440  } else {
2441  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2442  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2443  __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2444  }
2445  if ( kind == kmp_sched_auto ) {
2446  // ignore parameter chunk for schedule auto
2447  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2448  } else {
2449  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2450  }
2451 }
2452 
2453 /* Gets def_sched_var ICV values */
2454 void
2455 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2456 {
2457  kmp_info_t *thread;
2458  enum sched_type th_type;
2459  int i;
2460 
2461  KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2462  KMP_DEBUG_ASSERT( __kmp_init_serial );
2463 
2464  thread = __kmp_threads[ gtid ];
2465 
2466  //th_type = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type;
2467  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2468 
2469  switch ( th_type ) {
2470  case kmp_sch_static:
2471  case kmp_sch_static_greedy:
2472  case kmp_sch_static_balanced:
2473  *kind = kmp_sched_static;
2474  *chunk = 0; // chunk was not set, try to show this fact via zero value
2475  return;
2476  case kmp_sch_static_chunked:
2477  *kind = kmp_sched_static;
2478  break;
2479  case kmp_sch_dynamic_chunked:
2480  *kind = kmp_sched_dynamic;
2481  break;
2483  case kmp_sch_guided_iterative_chunked:
2484  case kmp_sch_guided_analytical_chunked:
2485  *kind = kmp_sched_guided;
2486  break;
2487  case kmp_sch_auto:
2488  *kind = kmp_sched_auto;
2489  break;
2490  case kmp_sch_trapezoidal:
2491  *kind = kmp_sched_trapezoidal;
2492  break;
2493 /*
2494  case kmp_sch_static_steal:
2495  *kind = kmp_sched_static_steal;
2496  break;
2497 */
2498  default:
2499  KMP_FATAL( UnknownSchedulingType, th_type );
2500  }
2501 
2502  //*chunk = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk;
2503  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2504 }
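
The pair above implements omp_set_schedule()/omp_get_schedule(). In the sketch below (illustrative only), a non-positive chunk requests the default chunking, matching the KMP_DEFAULT_CHUNK handling above:

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        omp_sched_t kind;
        int chunk;

        omp_set_schedule(omp_sched_dynamic, 4);   /* run-time schedule = dynamic,4 */
        omp_get_schedule(&kind, &chunk);
        printf("kind=%d chunk=%d\n", (int)kind, chunk);

        omp_set_schedule(omp_sched_static, 0);    /* non-positive chunk -> unchunked static */
        return 0;
    }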
2505 
2506 int
2507 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2508 
2509  int ii, dd;
2510  kmp_team_t *team;
2511  kmp_info_t *thr;
2512 
2513  KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2514  KMP_DEBUG_ASSERT( __kmp_init_serial );
2515 
2516  // validate level
2517  if( level == 0 ) return 0;
2518  if( level < 0 ) return -1;
2519  thr = __kmp_threads[ gtid ];
2520  team = thr->th.th_team;
2521  ii = team->t.t_level;
2522  if( level > ii ) return -1;
2523 
2524 #if OMP_40_ENABLED
2525  if( thr->th.th_teams_microtask ) {
2526  // AC: we are in teams region where multiple nested teams have same level
2527  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2528  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2529  KMP_DEBUG_ASSERT( ii >= tlevel );
2530  // AC: As we need to pass through the teams league, we artificially increase ii
2531  if ( ii == tlevel ) {
2532  ii += 2; // three teams have same level
2533  } else {
2534  ii ++; // two teams have same level
2535  }
2536  }
2537  }
2538 #endif
2539 
2540  if( ii == level ) return __kmp_tid_from_gtid( gtid );
2541 
2542  dd = team->t.t_serialized;
2543  level++;
2544  while( ii > level )
2545  {
2546  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2547  {
2548  }
2549  if( ( team->t.t_serialized ) && ( !dd ) ) {
2550  team = team->t.t_parent;
2551  continue;
2552  }
2553  if( ii > level ) {
2554  team = team->t.t_parent;
2555  dd = team->t.t_serialized;
2556  ii--;
2557  }
2558  }
2559 
2560  return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2561 }
2562 
2563 int
2564 __kmp_get_team_size( int gtid, int level ) {
2565 
2566  int ii, dd;
2567  kmp_team_t *team;
2568  kmp_info_t *thr;
2569 
2570  KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2571  KMP_DEBUG_ASSERT( __kmp_init_serial );
2572 
2573  // validate level
2574  if( level == 0 ) return 1;
2575  if( level < 0 ) return -1;
2576  thr = __kmp_threads[ gtid ];
2577  team = thr->th.th_team;
2578  ii = team->t.t_level;
2579  if( level > ii ) return -1;
2580 
2581 #if OMP_40_ENABLED
2582  if( thr->th.th_teams_microtask ) {
2583  // AC: we are in teams region where multiple nested teams have same level
2584  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2585  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2586  KMP_DEBUG_ASSERT( ii >= tlevel );
2587  // AC: As we need to pass through the teams league, we artificially increase ii
2588  if ( ii == tlevel ) {
2589  ii += 2; // three teams have same level
2590  } else {
2591  ii ++; // two teams have same level
2592  }
2593  }
2594  }
2595 #endif
2596 
2597  while( ii > level )
2598  {
2599  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2600  {
2601  }
2602  if( team->t.t_serialized && ( !dd ) ) {
2603  team = team->t.t_parent;
2604  continue;
2605  }
2606  if( ii > level ) {
2607  team = team->t.t_parent;
2608  ii--;
2609  }
2610  }
2611 
2612  return team->t.t_nproc;
2613 }
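
omp_get_ancestor_thread_num() and omp_get_team_size() resolve to the two walkers above. A hedged example for a two-level nest (assuming nested parallelism is actually granted):

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        omp_set_nested(1);
        #pragma omp parallel num_threads(3)
        {
            #pragma omp parallel num_threads(2)
            {
                #pragma omp master
                printf("ancestor@1=%d size@1=%d size@2=%d\n",
                       omp_get_ancestor_thread_num(1),  /* tid of the outer-team ancestor */
                       omp_get_team_size(1),            /* 3 */
                       omp_get_team_size(2));           /* 2 */
            }
        }
        return 0;
    }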
2614 
2615 kmp_r_sched_t
2616 __kmp_get_schedule_global() {
2617 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
2618 // may be changed by kmp_set_defaults independently, so the updated schedule can be obtained here.
2619 
2620  kmp_r_sched_t r_sched;
2621 
2622  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2623  // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
2624  // and thus have different run-time schedules in different roots (even in OMP 2.5)
2625  if ( __kmp_sched == kmp_sch_static ) {
2626  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2627  } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2628  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2629  } else {
2630  r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2631  }
2632 
2633  if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was not ever set)
2634  r_sched.chunk = KMP_DEFAULT_CHUNK;
2635  } else {
2636  r_sched.chunk = __kmp_chunk;
2637  }
2638 
2639  return r_sched;
2640 }
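
The run-time schedule assembled here is what a loop with schedule(runtime) picks up, normally seeded from the OMP_SCHEDULE (or KMP_SCHEDULE) environment variable. An illustrative sketch, assuming something like OMP_SCHEDULE="guided,4" is set in the environment:

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        double sum = 0.0;
        int i;
        /* schedule(runtime) defers the choice to the run-time schedule ICV,
           i.e. the r_sched value computed by the routine above. */
        #pragma omp parallel for schedule(runtime) reduction(+:sum)
        for (i = 0; i < 1000000; i++)
            sum += 1.0 / (i + 1);
        printf("sum = %f\n", sum);
        return 0;
    }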
2641 
2642 /* ------------------------------------------------------------------------ */
2643 /* ------------------------------------------------------------------------ */
2644 
2645 
2646 /*
2647  * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2648  * at least argc number of *t_argv entries for the requested team.
2649  */
2650 static void
2651 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2652 {
2653 
2654  KMP_DEBUG_ASSERT( team );
2655  if( !realloc || argc > team->t.t_max_argc ) {
2656 
2657  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2658  team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2659  /* if previously allocated heap space for args, free them */
2660  if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2661  __kmp_free( (void *) team->t.t_argv );
2662 
2663  if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2664  /* use unused space in the cache line for arguments */
2665  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2666  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2667  team->t.t_id, team->t.t_max_argc ));
2668  team->t.t_argv = &team->t.t_inline_argv[0];
2669  if ( __kmp_storage_map ) {
2670  __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2671  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2672  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2673  "team_%d.t_inline_argv",
2674  team->t.t_id );
2675  }
2676  } else {
2677  /* allocate space for arguments in the heap */
2678  team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2679  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2680  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2681  team->t.t_id, team->t.t_max_argc ));
2682  team->t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2683  if ( __kmp_storage_map ) {
2684  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2685  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2686  team->t.t_id );
2687  }
2688  }
2689  }
2690 }
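
Each pointer-sized argument of the outlined microtask occupies one t_argv slot, so a region with only a few shared variables fits in the inline storage while larger ones take the heap path above. A rough user-level sketch (the exact argument lowering is compiler-dependent):

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        int a = 1, b = 2, c = 3;
        /* The compiler outlines this body into a microtask; the addresses of the
           shared variables are forwarded through the team's t_argv entries. */
        #pragma omp parallel shared(a, b, c)
        {
            #pragma omp atomic
            a += b + c;
        }
        printf("a = %d\n", a);
        return 0;
    }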
2691 
2692 static void
2693 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
2694 {
2695  int i;
2696  int num_disp_buff = max_nth > 1 ? KMP_MAX_DISP_BUF : 2;
2697 #if KMP_USE_POOLED_ALLOC
2698  // AC: TODO: fix bug here: size of t_disp_buffer should not be multiplied by max_nth!
2699  char *ptr = __kmp_allocate(max_nth *
2700  ( sizeof(kmp_info_t*) + sizeof(dispatch_shared_info_t)*num_disp_buff
2701  + sizeof(kmp_disp_t) + sizeof(int)*6
2702  //+ sizeof(int)
2703  + sizeof(kmp_r_sched_t)
2704  + sizeof(kmp_taskdata_t) ) );
2705 
2706  team->t.t_threads = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth;
2707  team->t.t_disp_buffer = (dispatch_shared_info_t*) ptr;
2708  ptr += sizeof(dispatch_shared_info_t) * num_disp_buff;
2709  team->t.t_dispatch = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth;
2710  team->t.t_set_nproc = (int*) ptr; ptr += sizeof(int) * max_nth;
2711  team->t.t_set_dynamic = (int*) ptr; ptr += sizeof(int) * max_nth;
2712  team->t.t_set_nested = (int*) ptr; ptr += sizeof(int) * max_nth;
2713  team->t.t_set_blocktime = (int*) ptr; ptr += sizeof(int) * max_nth;
2714  team->t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth;
2715  team->t.t_set_bt_set = (int*) ptr;
2716  ptr += sizeof(int) * max_nth;
2717  //team->t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth;
2718  team->t.t_set_sched = (kmp_r_sched_t*) ptr;
2719  ptr += sizeof(kmp_r_sched_t) * max_nth;
2720  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr;
2721  ptr += sizeof(kmp_taskdata_t) * max_nth;
2722 #else
2723 
2724  team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
2725  team->t.t_disp_buffer = (dispatch_shared_info_t*)
2726  __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
2727  team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
2728  //team->t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth );
2729  //team->t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth );
2730  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
2731 #endif
2732  team->t.t_max_nproc = max_nth;
2733 
2734  /* setup dispatch buffers */
2735  for(i = 0 ; i < num_disp_buff; ++i)
2736  team->t.t_disp_buffer[i].buffer_index = i;
2737 }
2738 
2739 static void
2740 __kmp_free_team_arrays(kmp_team_t *team) {
2741  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
2742  int i;
2743  for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
2744  if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
2745  __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
2746  team->t.t_dispatch[ i ].th_disp_buffer = NULL;
2747  }; // if
2748  }; // for
2749  __kmp_free(team->t.t_threads);
2750  #if !KMP_USE_POOLED_ALLOC
2751  __kmp_free(team->t.t_disp_buffer);
2752  __kmp_free(team->t.t_dispatch);
2753  //__kmp_free(team->t.t_set_max_active_levels);
2754  //__kmp_free(team->t.t_set_sched);
2755  __kmp_free(team->t.t_implicit_task_taskdata);
2756  #endif
2757  team->t.t_threads = NULL;
2758  team->t.t_disp_buffer = NULL;
2759  team->t.t_dispatch = NULL;
2760  //team->t.t_set_sched = 0;
2761  //team->t.t_set_max_active_levels = 0;
2762  team->t.t_implicit_task_taskdata = 0;
2763 }
2764 
2765 static void
2766 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
2767  kmp_info_t **oldThreads = team->t.t_threads;
2768 
2769  #if !KMP_USE_POOLED_ALLOC
2770  __kmp_free(team->t.t_disp_buffer);
2771  __kmp_free(team->t.t_dispatch);
2772  //__kmp_free(team->t.t_set_max_active_levels);
2773  //__kmp_free(team->t.t_set_sched);
2774  __kmp_free(team->t.t_implicit_task_taskdata);
2775  #endif
2776  __kmp_allocate_team_arrays(team, max_nth);
2777 
2778  KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
2779 
2780  __kmp_free(oldThreads);
2781 }
2782 
2783 static kmp_internal_control_t
2784 __kmp_get_global_icvs( void ) {
2785 
2786  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
2787 
2788 #if OMP_40_ENABLED
2789  KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
2790 #endif /* OMP_40_ENABLED */
2791 
2792  kmp_internal_control_t g_icvs = {
2793  0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
2794  (kmp_int8)__kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
2795  (kmp_int8)__kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
2796  (kmp_int8)__kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
2797  __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
2798  __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
2799  __kmp_dflt_team_nth, //int nproc; //internal control for # of threads for next parallel region (per thread)
2800  // (use a max ub on value if __kmp_parallel_initialize not called yet)
2801  __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
2802  r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
2803 #if OMP_40_ENABLED
2804  __kmp_nested_proc_bind.bind_types[0],
2805 #endif /* OMP_40_ENABLED */
2806  NULL //struct kmp_internal_control *next;
2807  };
2808 
2809  return g_icvs;
2810 }
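
The global ICVs gathered here seed every new root and are mostly visible through the standard API; a small probe program (illustrative only, with values typically driven by OMP_NUM_THREADS, OMP_DYNAMIC, OMP_NESTED, OMP_MAX_ACTIVE_LEVELS, OMP_PROC_BIND and KMP_BLOCKTIME):

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
        printf("nproc             : %d\n", omp_get_max_threads());
        printf("dynamic           : %d\n", omp_get_dynamic());
        printf("nested            : %d\n", omp_get_nested());
        printf("max active levels : %d\n", omp_get_max_active_levels());
        printf("proc bind         : %d\n", (int)omp_get_proc_bind());
        return 0;
    }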
2811 
2812 static kmp_internal_control_t
2813 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
2814 
2815  kmp_internal_control_t gx_icvs;
2816  gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls
2817  copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
2818  gx_icvs.next = NULL;
2819 
2820  return gx_icvs;
2821 }
2822 
2823 static void
2824 __kmp_initialize_root( kmp_root_t *root )
2825 {
2826  int f;
2827  kmp_team_t *root_team;
2828  kmp_team_t *hot_team;
2829  size_t disp_size, dispatch_size, bar_size;
2830  int hot_team_max_nth;
2831  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
2832  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
2833  KMP_DEBUG_ASSERT( root );
2834  KMP_ASSERT( ! root->r.r_begin );
2835 
2836  /* setup the root state structure */
2837  __kmp_init_lock( &root->r.r_begin_lock );
2838  root->r.r_begin = FALSE;
2839  root->r.r_active = FALSE;
2840  root->r.r_in_parallel = 0;
2841  root->r.r_blocktime = __kmp_dflt_blocktime;
2842  root->r.r_nested = __kmp_dflt_nested;
2843 
2844  /* setup the root team for this task */
2845  /* allocate the root team structure */
2846  KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
2847  root_team =
2848  __kmp_allocate_team(
2849  root,
2850  1, // new_nproc
2851  1, // max_nproc
2852 #if OMP_40_ENABLED
2853  __kmp_nested_proc_bind.bind_types[0],
2854 #endif
2855  &r_icvs,
2856  0 // argc
2857  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
2858  );
2859 #if USE_DEBUGGER
2860  // Non-NULL value should be assigned to make the debugger display the root team.
2861  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
2862 #endif
2863 
2864  KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
2865 
2866  root->r.r_root_team = root_team;
2867  root_team->t.t_control_stack_top = NULL;
2868 
2869  /* initialize root team */
2870  root_team->t.t_threads[0] = NULL;
2871  root_team->t.t_nproc = 1;
2872  root_team->t.t_serialized = 1;
2873  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
2874  root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
2875  root_team->t.t_sched.chunk = r_sched.chunk;
2876  KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
2877  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
2878 
2879  /* setup the hot team for this task */
2880  /* allocate the hot team structure */
2881  KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
2882  hot_team =
2883  __kmp_allocate_team(
2884  root,
2885  1, // new_nproc
2886  __kmp_dflt_team_nth_ub * 2, // max_nproc
2887 #if OMP_40_ENABLED
2888  __kmp_nested_proc_bind.bind_types[0],
2889 #endif
2890  &r_icvs,
2891  0 // argc
2892  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
2893  );
2894  KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
2895 
2896  root->r.r_hot_team = hot_team;
2897  root_team->t.t_control_stack_top = NULL;
2898 
2899  /* first-time initialization */
2900  hot_team->t.t_parent = root_team;
2901 
2902  /* initialize hot team */
2903  hot_team_max_nth = hot_team->t.t_max_nproc;
2904  for ( f = 0; f < hot_team_max_nth; ++ f ) {
2905  hot_team->t.t_threads[ f ] = NULL;
2906  }; // for
2907  hot_team->t.t_nproc = 1;
2908  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
2909  hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
2910  hot_team->t.t_sched.chunk = r_sched.chunk;
2911  hot_team->t.t_size_changed = 0;
2912 
2913 }
2914 
2915 #ifdef KMP_DEBUG
2916 
2917 
2918 typedef struct kmp_team_list_item {
2919  kmp_team_p const * entry;
2920  struct kmp_team_list_item * next;
2921 } kmp_team_list_item_t;
2922 typedef kmp_team_list_item_t * kmp_team_list_t;
2923 
2924 
2925 static void
2926 __kmp_print_structure_team_accum( // Add team to list of teams.
2927  kmp_team_list_t list, // List of teams.
2928  kmp_team_p const * team // Team to add.
2929 ) {
2930 
2931  // List must terminate with item where both entry and next are NULL.
2932  // Team is added to the list only once.
2933  // List is sorted in ascending order by team id.
2934  // Team id is *not* a key.
2935 
2936  kmp_team_list_t l;
2937 
2938  KMP_DEBUG_ASSERT( list != NULL );
2939  if ( team == NULL ) {
2940  return;
2941  }; // if
2942 
2943  __kmp_print_structure_team_accum( list, team->t.t_parent );
2944  __kmp_print_structure_team_accum( list, team->t.t_next_pool );
2945 
2946  // Search list for the team.
2947  l = list;
2948  while ( l->next != NULL && l->entry != team ) {
2949  l = l->next;
2950  }; // while
2951  if ( l->next != NULL ) {
2952  return; // Team has been added before, exit.
2953  }; // if
2954 
2955  // Team is not found. Search list again for insertion point.
2956  l = list;
2957  while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
2958  l = l->next;
2959  }; // while
2960 
2961  // Insert team.
2962  {
2963  kmp_team_list_item_t * item =
2964  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
2965  * item = * l;
2966  l->entry = team;
2967  l->next = item;
2968  }
2969 
2970 }
2971 
2972 static void
2973 __kmp_print_structure_team(
2974  char const * title,
2975  kmp_team_p const * team
2976 
2977 ) {
2978  __kmp_printf( "%s", title );
2979  if ( team != NULL ) {
2980  __kmp_printf( "%2x %p\n", team->t.t_id, team );
2981  } else {
2982  __kmp_printf( " - (nil)\n" );
2983  }; // if
2984 }
2985 
2986 static void
2987 __kmp_print_structure_thread(
2988  char const * title,
2989  kmp_info_p const * thread
2990 
2991 ) {
2992  __kmp_printf( "%s", title );
2993  if ( thread != NULL ) {
2994  __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
2995  } else {
2996  __kmp_printf( " - (nil)\n" );
2997  }; // if
2998 }
2999 
3000 void
3001 __kmp_print_structure(
3002  void
3003 ) {
3004 
3005  kmp_team_list_t list;
3006 
3007  // Initialize list of teams.
3008  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3009  list->entry = NULL;
3010  list->next = NULL;
3011 
3012  __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3013  {
3014  int gtid;
3015  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3016  __kmp_printf( "%2d", gtid );
3017  if ( __kmp_threads != NULL ) {
3018  __kmp_printf( " %p", __kmp_threads[ gtid ] );
3019  }; // if
3020  if ( __kmp_root != NULL ) {
3021  __kmp_printf( " %p", __kmp_root[ gtid ] );
3022  }; // if
3023  __kmp_printf( "\n" );
3024  }; // for gtid
3025  }
3026 
3027  // Print out __kmp_threads array.
3028  __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3029  if ( __kmp_threads != NULL ) {
3030  int gtid;
3031  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3032  kmp_info_t const * thread = __kmp_threads[ gtid ];
3033  if ( thread != NULL ) {
3034  __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3035  __kmp_printf( " Our Root: %p\n", thread->th.th_root );
3036  __kmp_print_structure_team( " Our Team: ", thread->th.th_team );
3037  __kmp_print_structure_team( " Serial Team: ", thread->th.th_serial_team );
3038  __kmp_printf( " Threads: %2d\n", thread->th.th_team_nproc );
3039  __kmp_print_structure_thread( " Master: ", thread->th.th_team_master );
3040  __kmp_printf( " Serialized?: %2d\n", thread->th.th_team_serialized );
3041  __kmp_printf( " Set NProc: %2d\n", thread->th.th_set_nproc );
3042 #if OMP_40_ENABLED
3043  __kmp_printf( " Set Proc Bind: %2d\n", thread->th.th_set_proc_bind );
3044 #endif
3045  __kmp_print_structure_thread( " Next in pool: ", thread->th.th_next_pool );
3046  __kmp_printf( "\n" );
3047  __kmp_print_structure_team_accum( list, thread->th.th_team );
3048  __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3049  }; // if
3050  }; // for gtid
3051  } else {
3052  __kmp_printf( "Threads array is not allocated.\n" );
3053  }; // if
3054 
3055  // Print out __kmp_root array.
3056  __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3057  if ( __kmp_root != NULL ) {
3058  int gtid;
3059  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3060  kmp_root_t const * root = __kmp_root[ gtid ];
3061  if ( root != NULL ) {
3062  __kmp_printf( "GTID %2d %p:\n", gtid, root );
3063  __kmp_print_structure_team( " Root Team: ", root->r.r_root_team );
3064  __kmp_print_structure_team( " Hot Team: ", root->r.r_hot_team );
3065  __kmp_print_structure_thread( " Uber Thread: ", root->r.r_uber_thread );
3066  __kmp_printf( " Active?: %2d\n", root->r.r_active );
3067  __kmp_printf( " Nested?: %2d\n", root->r.r_nested );
3068  __kmp_printf( " In Parallel: %2d\n", root->r.r_in_parallel );
3069  __kmp_printf( "\n" );
3070  __kmp_print_structure_team_accum( list, root->r.r_root_team );
3071  __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3072  }; // if
3073  }; // for gtid
3074  } else {
3075  __kmp_printf( "Ubers array is not allocated.\n" );
3076  }; // if
3077 
3078  __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3079  while ( list->next != NULL ) {
3080  kmp_team_p const * team = list->entry;
3081  int i;
3082  __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3083  __kmp_print_structure_team( " Parent Team: ", team->t.t_parent );
3084  __kmp_printf( " Master TID: %2d\n", team->t.t_master_tid );
3085  __kmp_printf( " Max threads: %2d\n", team->t.t_max_nproc );
3086  __kmp_printf( " Levels of serial: %2d\n", team->t.t_serialized );
3087  __kmp_printf( " Number threads: %2d\n", team->t.t_nproc );
3088  for ( i = 0; i < team->t.t_nproc; ++ i ) {
3089  __kmp_printf( " Thread %2d: ", i );
3090  __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3091  }; // for i
3092  __kmp_print_structure_team( " Next in pool: ", team->t.t_next_pool );
3093  __kmp_printf( "\n" );
3094  list = list->next;
3095  }; // while
3096 
3097  // Print out __kmp_thread_pool and __kmp_team_pool.
3098  __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3099  __kmp_print_structure_thread( "Thread pool: ", (kmp_info_t *)__kmp_thread_pool );
3100  __kmp_print_structure_team( "Team pool: ", (kmp_team_t *)__kmp_team_pool );
3101  __kmp_printf( "\n" );
3102 
3103  // Free team list.
3104  while ( list != NULL ) {
3105  kmp_team_list_item_t * item = list;
3106  list = list->next;
3107  KMP_INTERNAL_FREE( item );
3108  }; // while
3109 
3110 }
3111 
3112 #endif
3113 
3114 
3115 //---------------------------------------------------------------------------
3116 // Stuff for per-thread fast random number generator
3117 // Table of primes
3118 
3119 static const unsigned __kmp_primes[] = {
3120  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3121  0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3122  0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3123  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3124  0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3125  0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3126  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3127  0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3128  0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3129  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3130  0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3131  0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3132  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3133  0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3134  0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3135  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3136 };
3137 
3138 //---------------------------------------------------------------------------
3139 // __kmp_get_random: Get a random number using a linear congruential method.
3140 
3141 unsigned short
3142 __kmp_get_random( kmp_info_t * thread )
3143 {
3144  unsigned x = thread->th.th_x;
3145  unsigned short r = x>>16;
3146 
3147  thread->th.th_x = x*thread->th.th_a+1;
3148 
3149  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3150  thread->th.th_info.ds.ds_tid, r) );
3151 
3152  return r;
3153 }
3154 //--------------------------------------------------------
3155 // __kmp_init_random: Initialize a random number generator
3156 
3157 void
3158 __kmp_init_random( kmp_info_t * thread )
3159 {
3160  unsigned seed = thread->th.th_info.ds.ds_tid;
3161 
3162  thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3163  thread->th.th_x = (seed+1)*thread->th.th_a+1;
3164  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3165 }
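
/* Illustrative sketch (assumption: standalone toy, not part of the runtime).
   The two routines above implement a per-thread linear congruential generator:
   th_x is advanced as x = a*x + 1 (wrapping mod 2^32 via unsigned overflow),
   with the multiplier a taken from __kmp_primes based on the thread id, and the
   high 16 bits of x returned as the random value. */
#if 0
#include <stdio.h>

int main( void ) {
    unsigned a = 0x9e3779b1;        /* first entry of __kmp_primes (seed = tid 0) */
    unsigned x = (0 + 1) * a + 1;   /* same seeding as __kmp_init_random */
    for ( int i = 0; i < 4; ++i ) {
        unsigned short r = (unsigned short)(x >> 16);   /* what __kmp_get_random would return */
        x = x * a + 1;                                  /* LCG step */
        printf( "%u\n", (unsigned)r );
    }
    return 0;
}
#endif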
3166 
3167 
3168 #if KMP_OS_WINDOWS
3169 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3170 static int
3171 __kmp_reclaim_dead_roots(void) {
3172  int i, r = 0;
3173 
3174  for(i = 0; i < __kmp_threads_capacity; ++i) {
3175  if( KMP_UBER_GTID( i ) &&
3176  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3177  !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
3178  r += __kmp_unregister_root_other_thread(i);
3179  }
3180  }
3181  return r;
3182 }
3183 #endif
3184 
3185 /*
3186  This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3187  free entries generated.
3188 
3189  For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3190  already dead.
3191 
3192  On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
3193  update to __kmp_threads_capacity. Array capacity is increased by doubling with clipping to
3194  __kmp_tp_capacity, if threadprivate cache array has been created.
3195  Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3196 
3197  After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3198  of a total of nWish free slots, the function does that expansion. If not, but the clipping value allows
3199  array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3200  Otherwise, nothing is done beyond the possible initial root thread reclamation. However, if nNeed is zero,
3201  a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3202  as many free slots as possible up to nWish.
3203 
3204  If any argument is negative, the behavior is undefined.
3205 */
3206 static int
3207 __kmp_expand_threads(int nWish, int nNeed) {
3208  int added = 0;
3209  int old_tp_cached;
3210  int __kmp_actual_max_nth;
3211 
3212  if(nNeed > nWish) /* normalize the arguments */
3213  nWish = nNeed;
3214 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3215 /* only for Windows static library */
3216  /* reclaim array entries for root threads that are already dead */
3217  added = __kmp_reclaim_dead_roots();
3218 
3219  if(nNeed) {
3220  nNeed -= added;
3221  if(nNeed < 0)
3222  nNeed = 0;
3223  }
3224  if(nWish) {
3225  nWish -= added;
3226  if(nWish < 0)
3227  nWish = 0;
3228  }
3229 #endif
3230  if(nWish <= 0)
3231  return added;
3232 
3233  while(1) {
3234  int nTarget;
3235  int minimumRequiredCapacity;
3236  int newCapacity;
3237  kmp_info_t **newThreads;
3238  kmp_root_t **newRoot;
3239 
3240  //
3241  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3242  // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3243  // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3244  // become > __kmp_max_nth in one of two ways:
3245  //
3246  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3247  may not be reused by another thread, so we may need to increase
3248  // __kmp_threads_capacity to __kmp_max_threads + 1.
3249  //
3250  // 2) New foreign root(s) are encountered. We always register new
3251  // foreign roots. This may cause a smaller # of threads to be
3252  // allocated at subsequent parallel regions, but the worker threads
3253  // hang around (and eventually go to sleep) and need slots in the
3254  // __kmp_threads[] array.
3255  //
3256  // Anyway, that is the reason for moving the check to see if
3257  // __kmp_max_threads was exceeded into __kmp_reserve_threads()
3258  // instead of having it performed here. -BB
3259  //
3260  old_tp_cached = __kmp_tp_cached;
3261  __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3262  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3263 
3264  /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3265  nTarget = nWish;
3266  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3267  /* can't fulfil nWish, so try nNeed */
3268  if(nNeed) {
3269  nTarget = nNeed;
3270  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3271  /* possible expansion too small -- give up */
3272  break;
3273  }
3274  } else {
3275  /* best-effort */
3276  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3277  if(!nTarget) {
3278  /* can't expand at all -- give up */
3279  break;
3280  }
3281  }
3282  }
3283  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3284 
3285  newCapacity = __kmp_threads_capacity;
3286  do{
3287  newCapacity =
3288  newCapacity <= (__kmp_actual_max_nth >> 1) ?
3289  (newCapacity << 1) :
3290  __kmp_actual_max_nth;
3291  } while(newCapacity < minimumRequiredCapacity);
3292  newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3293  newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3294  KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3295  KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3296  memset(newThreads + __kmp_threads_capacity, 0,
3297  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3298  memset(newRoot + __kmp_threads_capacity, 0,
3299  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3300 
3301  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3302  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3303  while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3304  cache capacity, so we should deallocate the expanded arrays and try again. This is the first check
3305  of a double-check pair.
3306  */
3307  __kmp_free(newThreads);
3308  continue; /* start over and try again */
3309  }
3310  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3311  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3312  /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3313  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3314  __kmp_free(newThreads);
3315  continue; /* start over and try again */
3316  } else {
3317  /* success */
3318  // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
3319  //
3320  *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3321  *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3322  added += newCapacity - __kmp_threads_capacity;
3323  *(volatile int*)&__kmp_threads_capacity = newCapacity;
3324  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3325  break; /* succeeded, so we can exit the loop */
3326  }
3327  }
3328  return added;
3329 }
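
/* Illustrative sketch (assumption: standalone helper, not part of the runtime;
   the helper name is made up). The growth step above doubles the current
   capacity until it covers the required minimum, clipping at __kmp_tp_capacity
   when a threadprivate cache exists and at __kmp_sys_max_nth otherwise; the
   expanded arrays are then committed under __kmp_tp_cached_lock using a
   double-checked pattern. */
#if 0
static int toy_grow_capacity( int current, int required_min, int clip_max ) {
    int cap = current;
    do {
        /* double, but never beyond the clipping value */
        cap = ( cap <= (clip_max >> 1) ) ? (cap << 1) : clip_max;
    } while ( cap < required_min );
    return cap;   /* e.g. toy_grow_capacity( 32, 70, 1024 ) returns 128 */
}
#endif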
3330 
3331 /* register the current thread as a root thread and obtain our gtid */
3332 /* we must have the __kmp_initz_lock held at this point */
3333 /* Argument TRUE only if are the thread that calls from __kmp_do_serial_initialize() */
3334 int
3335 __kmp_register_root( int initial_thread )
3336 {
3337  kmp_info_t *root_thread;
3338  kmp_root_t *root;
3339  int gtid;
3340  int capacity;
3341  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3342  KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3343  KMP_MB();
3344 
3345 
3346  /*
3347  2007-03-02:
3348 
3349  If the initial thread did not invoke the OpenMP RTL yet, and this thread is not an initial one,
3350  the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
3351  return false (meaning there is at least one empty slot in the __kmp_threads array), but it
3352  is possible that the only free slot is #0, which is reserved for the initial thread and so cannot be
3353  used for this one. The following code works around this bug.
3354 
3355  However, the right solution seems to be not to reserve slot #0 for the initial thread, because:
3356  (1) there is no magic in slot #0,
3357  (2) we cannot detect the initial thread reliably (the first thread which does serial
3358  initialization may not be a real initial thread).
3359  */
3360  capacity = __kmp_threads_capacity;
3361  if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3362  -- capacity;
3363  }; // if
3364 
3365  /* see if there are too many threads */
3366  if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3367  if ( __kmp_tp_cached ) {
3368  __kmp_msg(
3369  kmp_ms_fatal,
3370  KMP_MSG( CantRegisterNewThread ),
3371  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3372  KMP_HNT( PossibleSystemLimitOnThreads ),
3373  __kmp_msg_null
3374  );
3375  }
3376  else {
3377  __kmp_msg(
3378  kmp_ms_fatal,
3379  KMP_MSG( CantRegisterNewThread ),
3380  KMP_HNT( SystemLimitOnThreads ),
3381  __kmp_msg_null
3382  );
3383  }
3384  }; // if
3385 
3386  /* find an available thread slot */
3387  /* Don't reassign the zero slot since we need that to only be used by initial
3388  thread */
3389  for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3390  ;
3391  KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3392  KMP_ASSERT( gtid < __kmp_threads_capacity );
3393 
3394  /* update global accounting */
3395  __kmp_all_nth ++;
3396  TCW_4(__kmp_nth, __kmp_nth + 1);
3397 
3398  //
3399  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3400  // for low numbers of procs, and method #2 (keyed API call) for higher
3401  // numbers of procs.
3402  //
3403  if ( __kmp_adjust_gtid_mode ) {
3404  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3405  if ( TCR_4(__kmp_gtid_mode) != 2) {
3406  TCW_4(__kmp_gtid_mode, 2);
3407  }
3408  }
3409  else {
3410  if (TCR_4(__kmp_gtid_mode) != 1 ) {
3411  TCW_4(__kmp_gtid_mode, 1);
3412  }
3413  }
3414  }
3415 
3416 #ifdef KMP_ADJUST_BLOCKTIME
3417  /* Adjust blocktime to zero if necessary */
3418  /* Middle initialization might not have occurred yet */
3419  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3420  if ( __kmp_nth > __kmp_avail_proc ) {
3421  __kmp_zero_bt = TRUE;
3422  }
3423  }
3424 #endif /* KMP_ADJUST_BLOCKTIME */
3425 
3426  /* setup this new hierarchy */
3427  if( ! ( root = __kmp_root[gtid] )) {
3428  root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3429  KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3430  }
3431 
3432  __kmp_initialize_root( root );
3433 
3434  /* setup new root thread structure */
3435  if( root->r.r_uber_thread ) {
3436  root_thread = root->r.r_uber_thread;
3437  } else {
3438  root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3439  if ( __kmp_storage_map ) {
3440  __kmp_print_thread_storage_map( root_thread, gtid );
3441  }
3442  root_thread->th.th_info .ds.ds_gtid = gtid;
3443  root_thread->th.th_root = root;
3444  if( __kmp_env_consistency_check ) {
3445  root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3446  }
3447  #if USE_FAST_MEMORY
3448  __kmp_initialize_fast_memory( root_thread );
3449  #endif /* USE_FAST_MEMORY */
3450 
3451  #if KMP_USE_BGET
3452  KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3453  __kmp_initialize_bget( root_thread );
3454  #endif
3455  __kmp_init_random( root_thread ); // Initialize random number generator
3456  }
3457 
3458  /* setup the serial team held in reserve by the root thread */
3459  if( ! root_thread->th.th_serial_team ) {
3460  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3461  KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3462  root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3463 #if OMP_40_ENABLED
3464  proc_bind_default,
3465 #endif
3466  &r_icvs,
3467  0 USE_NESTED_HOT_ARG(NULL) );
3468  }
3469  KMP_ASSERT( root_thread->th.th_serial_team );
3470  KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3471  root_thread->th.th_serial_team ) );
3472 
3473  /* drop root_thread into place */
3474  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3475 
3476  root->r.r_root_team->t.t_threads[0] = root_thread;
3477  root->r.r_hot_team ->t.t_threads[0] = root_thread;
3478  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3479  root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for execution (it is unused for now).
3480  root->r.r_uber_thread = root_thread;
3481 
3482  /* initialize the thread, get it ready to go */
3483  __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3484 
3485  /* prepare the master thread for get_gtid() */
3486  __kmp_gtid_set_specific( gtid );
3487 
3488  __kmp_itt_thread_name( gtid );
3489 
3490  #ifdef KMP_TDATA_GTID
3491  __kmp_gtid = gtid;
3492  #endif
3493  __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3494  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3495  TCW_4(__kmp_init_gtid, TRUE);
3496 
3497  KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3498  gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3499  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3500  KMP_INIT_BARRIER_STATE ) );
3501  { // Initialize barrier data.
3502  int b;
3503  for ( b = 0; b < bs_last_barrier; ++ b ) {
3504  root_thread->th.th_bar[ b ].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3505 #if USE_DEBUGGER
3506  root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3507 #endif
3508  }; // for
3509  }
3510  KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3511 
3512 
3513 #if KMP_AFFINITY_SUPPORTED
3514  if ( TCR_4(__kmp_init_middle) ) {
3515  __kmp_affinity_set_init_mask( gtid, TRUE );
3516  }
3517 #endif /* KMP_AFFINITY_SUPPORTED */
3518 
3519  __kmp_root_counter ++;
3520 
3521  KMP_MB();
3522  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3523 
3524  return gtid;
3525 }
3526 
3527 #if KMP_NESTED_HOT_TEAMS
3528 static int
3529 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3530 {
3531  int i, n, nth;
3532  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3533  if( !hot_teams || !hot_teams[level].hot_team ) {
3534  return 0;
3535  }
3536  KMP_DEBUG_ASSERT( level < max_level );
3537  kmp_team_t *team = hot_teams[level].hot_team;
3538  nth = hot_teams[level].hot_team_nth;
3539  n = nth - 1; // master is not freed
3540  if( level < max_level - 1 ) {
3541  for( i = 0; i < nth; ++i ) {
3542  kmp_info_t *th = team->t.t_threads[i];
3543  n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3544  if( i > 0 && th->th.th_hot_teams ) {
3545  __kmp_free( th->th.th_hot_teams );
3546  th->th.th_hot_teams = NULL;
3547  }
3548  }
3549  }
3550  __kmp_free_team( root, team, NULL );
3551  return n;
3552 }
3553 #endif
3554 
3555 /* Resets a root thread and clears its root and hot teams.
3556  Returns the number of __kmp_threads entries directly and indirectly freed.
3557 */
3558 static int
3559 __kmp_reset_root(int gtid, kmp_root_t *root)
3560 {
3561  kmp_team_t * root_team = root->r.r_root_team;
3562  kmp_team_t * hot_team = root->r.r_hot_team;
3563  int n = hot_team->t.t_nproc;
3564  int i;
3565 
3566  KMP_DEBUG_ASSERT( ! root->r.r_active );
3567 
3568  root->r.r_root_team = NULL;
3569  root->r.r_hot_team = NULL;
3570  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before the call
3571  // to __kmp_free_team().
3572  __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3573 #if KMP_NESTED_HOT_TEAMS
3574  if( __kmp_hot_teams_max_level > 1 ) { // need to free nested hot teams and their threads if any
3575  for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3576  kmp_info_t *th = hot_team->t.t_threads[i];
3577  n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3578  if( th->th.th_hot_teams ) {
3579  __kmp_free( th->th.th_hot_teams );
3580  th->th.th_hot_teams = NULL;
3581  }
3582  }
3583  }
3584 #endif
3585  __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3586 
3587  //
3588  // Before we can reap the thread, we need to make certain that all
3589  // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3590  //
3591  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3592  __kmp_wait_to_unref_task_teams();
3593  }
3594 
3595  #if KMP_OS_WINDOWS
3596  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3597  KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3598  (LPVOID)&(root->r.r_uber_thread->th),
3599  root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3600  __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3601  #endif /* KMP_OS_WINDOWS */
3602 
3603  TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3604  __kmp_reap_thread( root->r.r_uber_thread, 1 );
3605 
3606  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3607  root->r.r_uber_thread = NULL;
3608  /* mark root as no longer in use */
3609  root->r.r_begin = FALSE;
3610 
3611  return n;
3612 }
3613 
3614 void
3615 __kmp_unregister_root_current_thread( int gtid )
3616 {
3617  KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
3618  /* this lock should be ok, since unregister_root_current_thread is never called during
3619  * an abort, only during a normal close. Furthermore, if you have the
3620  * forkjoin lock, you should never try to get the initz lock */
3621 
3622  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3623  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3624  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3625  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3626  return;
3627  }
3628  kmp_root_t *root = __kmp_root[gtid];
3629 
3630  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3631  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3632  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3633  KMP_ASSERT( root->r.r_active == FALSE );
3634 
3635 
3636  KMP_MB();
3637 
3638 #if OMP_41_ENABLED
3639  kmp_info_t * thread = __kmp_threads[gtid];
3640  kmp_team_t * team = thread->th.th_team;
3641  kmp_task_team_t * task_team = thread->th.th_task_team;
3642 
3643  // we need to wait for the proxy tasks before finishing the thread
3644  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
3645  __kmp_task_team_wait(thread, team, NULL );
3646 #endif
3647 
3648  __kmp_reset_root(gtid, root);
3649 
3650  /* free up this thread slot */
3651  __kmp_gtid_set_specific( KMP_GTID_DNE );
3652 #ifdef KMP_TDATA_GTID
3653  __kmp_gtid = KMP_GTID_DNE;
3654 #endif
3655 
3656  KMP_MB();
3657  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3658 
3659  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3660 }
3661 
3662 /* __kmp_forkjoin_lock must already be held.
3663  Unregisters a root thread that is not the current thread. Returns the number of
3664  __kmp_threads entries freed as a result.
3665  */
3666 static int
3667 __kmp_unregister_root_other_thread( int gtid )
3668 {
3669  kmp_root_t *root = __kmp_root[gtid];
3670  int r;
3671 
3672  KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
3673  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3674  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3675  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3676  KMP_ASSERT( root->r.r_active == FALSE );
3677 
3678  r = __kmp_reset_root(gtid, root);
3679  KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
3680  return r;
3681 }
3682 
3683 #if KMP_DEBUG
3684 void __kmp_task_info() {
3685 
3686  kmp_int32 gtid = __kmp_entry_gtid();
3687  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
3688  kmp_info_t *this_thr = __kmp_threads[ gtid ];
3689  kmp_team_t *steam = this_thr->th.th_serial_team;
3690  kmp_team_t *team = this_thr->th.th_team;
3691 
3692  __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
3693  gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
3694 }
3695 #endif // KMP_DEBUG
3696 
3697 /* TODO optimize with one big memclr, take out what isn't needed,
3698  * split responsibility to workers as much as possible, and delay
3699  * initialization of features as much as possible */
3700 static void
3701 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
3702 {
3703  /* this_thr->th.th_info.ds.ds_gtid is setup in kmp_allocate_thread/create_worker
3704  * this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
3705  kmp_info_t *master = team->t.t_threads[0];
3706  KMP_DEBUG_ASSERT( this_thr != NULL );
3707  KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
3708  KMP_DEBUG_ASSERT( team );
3709  KMP_DEBUG_ASSERT( team->t.t_threads );
3710  KMP_DEBUG_ASSERT( team->t.t_dispatch );
3711  KMP_DEBUG_ASSERT( master );
3712  KMP_DEBUG_ASSERT( master->th.th_root );
3713 
3714  KMP_MB();
3715 
3716  TCW_SYNC_PTR(this_thr->th.th_team, team);
3717 
3718  this_thr->th.th_info.ds.ds_tid = tid;
3719  this_thr->th.th_set_nproc = 0;
3720 #if OMP_40_ENABLED
3721  this_thr->th.th_set_proc_bind = proc_bind_default;
3722 # if KMP_AFFINITY_SUPPORTED
3723  this_thr->th.th_new_place = this_thr->th.th_current_place;
3724 # endif
3725 #endif
3726  this_thr->th.th_root = master->th.th_root;
3727 
3728  /* setup the thread's cache of the team structure */
3729  this_thr->th.th_team_nproc = team->t.t_nproc;
3730  this_thr->th.th_team_master = master;
3731  this_thr->th.th_team_serialized = team->t.t_serialized;
3732  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
3733 
3734  KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
3735  this_thr->th.th_task_state = 0;
3736 
3737  KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
3738  tid, gtid, this_thr, this_thr->th.th_current_task ) );
3739 
3740  __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
3741 
3742  KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
3743  tid, gtid, this_thr, this_thr->th.th_current_task ) );
3744  // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
3745 
3746  /* TODO no worksharing in speculative threads */
3747  this_thr->th.th_dispatch = &team->t.t_dispatch[ tid ];
3748 
3749  this_thr->th.th_local.this_construct = 0;
3750 
3751 #ifdef BUILD_TV
3752  this_thr->th.th_local.tv_data = 0;
3753 #endif
3754 
3755  if ( ! this_thr->th.th_pri_common ) {
3756  this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
3757  if ( __kmp_storage_map ) {
3758  __kmp_print_storage_map_gtid(
3759  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
3760  sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
3761  );
3762  }; // if
3763  this_thr->th.th_pri_head = NULL;
3764  }; // if
3765 
3766  /* Initialize dynamic dispatch */
3767  {
3768  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
3769  /*
3770  * Use team max_nproc since this will never change for the team.
3771  */
3772  size_t disp_size = sizeof( dispatch_private_info_t ) *
3773  ( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );
3774  KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
3775  KMP_ASSERT( dispatch );
3776  KMP_DEBUG_ASSERT( team->t.t_dispatch );
3777  KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
3778 
3779  dispatch->th_disp_index = 0;
3780 
3781  if( ! dispatch->th_disp_buffer ) {
3782  dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
3783 
3784  if ( __kmp_storage_map ) {
3785  __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
3786  &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ],
3787  disp_size, "th_%d.th_dispatch.th_disp_buffer "
3788  "(team_%d.t_dispatch[%d].th_disp_buffer)",
3789  gtid, team->t.t_id, gtid );
3790  }
3791  } else {
3792  memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
3793  }
3794 
3795  dispatch->th_dispatch_pr_current = 0;
3796  dispatch->th_dispatch_sh_current = 0;
3797 
3798  dispatch->th_deo_fcn = 0; /* ORDERED */
3799  dispatch->th_dxo_fcn = 0; /* END ORDERED */
3800  }
3801 
3802  this_thr->th.th_next_pool = NULL;
3803 
3804  if (!this_thr->th.th_task_state_memo_stack) {
3805  this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
3806  this_thr->th.th_task_state_top = 0;
3807  this_thr->th.th_task_state_stack_sz = 4;
3808  }
3809 
3810  KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
3811  KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
3812 
3813  KMP_MB();
3814 }
3815 
3816 
3817 /* allocate a new thread for the requesting team. this is only called from within a
3818  * forkjoin critical section. we will first try to get an available thread from the
3819  * thread pool. if none is available, we will fork a new one assuming we are able
3820  * to create a new one. this should be assured, as the caller should check on this
3821  * first.
3822  */
3823 kmp_info_t *
3824 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
3825 {
3826  kmp_team_t *serial_team;
3827  kmp_info_t *new_thr;
3828  int new_gtid;
3829 
3830  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
3831  KMP_DEBUG_ASSERT( root && team );
3832 #if !KMP_NESTED_HOT_TEAMS
3833  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
3834 #endif
3835  KMP_MB();
3836 
3837  /* first, try to get one from the thread pool */
3838  if ( __kmp_thread_pool ) {
3839 
3840  new_thr = (kmp_info_t*)__kmp_thread_pool;
3841  __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
3842  if ( new_thr == __kmp_thread_pool_insert_pt ) {
3843  __kmp_thread_pool_insert_pt = NULL;
3844  }
3845  TCW_4(new_thr->th.th_in_pool, FALSE);
3846  //
3847  // Don't touch th_active_in_pool or th_active.
3848  // The worker thread adjusts those flags as it sleeps/awakens.
3849  //
3850 
3851  __kmp_thread_pool_nth--;
3852 
3853  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
3854  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
3855  KMP_ASSERT( ! new_thr->th.th_team );
3856  KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
3857  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
3858 
3859  /* setup the thread structure */
3860  __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
3861  KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
3862 
3863  TCW_4(__kmp_nth, __kmp_nth + 1);
3864 
3865  new_thr->th.th_task_state_top = 0;
3866  new_thr->th.th_task_state_stack_sz = 4;
3867 
3868 #ifdef KMP_ADJUST_BLOCKTIME
3869  /* Adjust blocktime back to zero if necessary */
3870  /* Middle initialization might not have occurred yet */
3871  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3872  if ( __kmp_nth > __kmp_avail_proc ) {
3873  __kmp_zero_bt = TRUE;
3874  }
3875  }
3876 #endif /* KMP_ADJUST_BLOCKTIME */
3877 
3878 #if KMP_DEBUG
3879  // If thread entered pool via __kmp_free_thread, wait_flag should != KMP_BARRIER_PARENT_FLAG.
3880  int b;
3881  kmp_balign_t * balign = new_thr->th.th_bar;
3882  for( b = 0; b < bs_last_barrier; ++ b )
3883  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
3884 #endif
3885 
3886  KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
3887  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
3888 
3889  KMP_MB();
3890  return new_thr;
3891  }
3892 
3893 
3894  /* no, we'll fork a new one */
3895  KMP_ASSERT( __kmp_nth == __kmp_all_nth );
3896  KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
3897 
3898  //
3899  // If this is the first worker thread the RTL is creating, then also
3900  // launch the monitor thread. We try to do this as early as possible.
3901  //
3902  if ( ! TCR_4( __kmp_init_monitor ) ) {
3903  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
3904  if ( ! TCR_4( __kmp_init_monitor ) ) {
3905  KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
3906  TCW_4( __kmp_init_monitor, 1 );
3907  __kmp_create_monitor( & __kmp_monitor );
3908  KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
3909  #if KMP_OS_WINDOWS
3910  // AC: wait until monitor has started. This is a fix for CQ232808.
3911  // The reason is that if the library is loaded/unloaded in a loop with small (parallel)
3912  // work in between, then there is a high probability that the monitor thread starts after
3913  // the library shutdown. At shutdown it is too late to cope with the problem, because
3914  // when the master is in DllMain (process detach) the monitor has no chance to start
3915  // (it is blocked), and the master has no means to inform the monitor that the library has gone,
3916  // because all the memory which the monitor can access is going to be released/reset.
3917  while ( TCR_4(__kmp_init_monitor) < 2 ) {
3918  KMP_YIELD( TRUE );
3919  }
3920  KF_TRACE( 10, ( "after monitor thread has started\n" ) );
3921  #endif
3922  }
3923  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
3924  }
3925 
3926  KMP_MB();
3927  for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
3928  KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
3929  }
3930 
3931  /* allocate space for it. */
3932  new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3933 
3934  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
3935 
3936  if ( __kmp_storage_map ) {
3937  __kmp_print_thread_storage_map( new_thr, new_gtid );
3938  }
3939 
3940  /* add the reserve serialized team, initialized from the team's master thread */
3941  {
3942  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
3943  KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
3944  new_thr->th.th_serial_team = serial_team =
3945  (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
3946 #if OMP_40_ENABLED
3947  proc_bind_default,
3948 #endif
3949  &r_icvs,
3950  0 USE_NESTED_HOT_ARG(NULL) );
3951  }
3952  KMP_ASSERT ( serial_team );
3953  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for execution (it is unused for now).
3954  serial_team->t.t_threads[0] = new_thr;
3955  KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
3956  new_thr ) );
3957 
3958  /* setup the thread structures */
3959  __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
3960 
3961  #if USE_FAST_MEMORY
3962  __kmp_initialize_fast_memory( new_thr );
3963  #endif /* USE_FAST_MEMORY */
3964 
3965  #if KMP_USE_BGET
3966  KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
3967  __kmp_initialize_bget( new_thr );
3968  #endif
3969 
3970  __kmp_init_random( new_thr ); // Initialize random number generator
3971 
3972  /* Initialize these only once when thread is grabbed for a team allocation */
3973  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
3974  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3975 
3976  int b;
3977  kmp_balign_t * balign = new_thr->th.th_bar;
3978  for(b=0; b<bs_last_barrier; ++b) {
3979  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
3980  balign[b].bb.team = NULL;
3981  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
3982  balign[b].bb.use_oncore_barrier = 0;
3983  }
3984 
3985  new_thr->th.th_spin_here = FALSE;
3986  new_thr->th.th_next_waiting = 0;
3987 
3988 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
3989  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
3990  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
3991  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
3992  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
3993 #endif
3994 
3995  TCW_4(new_thr->th.th_in_pool, FALSE);
3996  new_thr->th.th_active_in_pool = FALSE;
3997  TCW_4(new_thr->th.th_active, TRUE);
3998 
3999  /* adjust the global counters */
4000  __kmp_all_nth ++;
4001  __kmp_nth ++;
4002 
4003  //
4004  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4005  // for low numbers of procs, and method #2 (keyed API call) for higher
4006  // numbers of procs.
4007  //
4008  if ( __kmp_adjust_gtid_mode ) {
4009  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4010  if ( TCR_4(__kmp_gtid_mode) != 2) {
4011  TCW_4(__kmp_gtid_mode, 2);
4012  }
4013  }
4014  else {
4015  if (TCR_4(__kmp_gtid_mode) != 1 ) {
4016  TCW_4(__kmp_gtid_mode, 1);
4017  }
4018  }
4019  }
4020 
4021 #ifdef KMP_ADJUST_BLOCKTIME
4022  /* Adjust blocktime back to zero if necessary */
4023  /* Middle initialization might not have occurred yet */
4024  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4025  if ( __kmp_nth > __kmp_avail_proc ) {
4026  __kmp_zero_bt = TRUE;
4027  }
4028  }
4029 #endif /* KMP_ADJUST_BLOCKTIME */
4030 
4031  /* actually fork it and create the new worker thread */
4032  KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4033  __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4034  KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4035 
4036 
4037  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4038  KMP_MB();
4039  return new_thr;
4040 }
4041 
4042 /*
4043  * reinitialize team for reuse.
4044  *
4045  * The hot team code calls this routine at every fork barrier, so EPCC barrier
4046  * tests are extremely sensitive to changes in it, esp. writes to the team
4047  * struct, which cause a cache invalidation in all threads.
4048  *
4049  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4050  */
4051 static void
4052 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4053  KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4054  team->t.t_threads[0], team ) );
4055  KMP_DEBUG_ASSERT( team && new_icvs);
4056  KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
4057  team->t.t_ident = loc;
4058 
4059  team->t.t_id = KMP_GEN_TEAM_ID();
4060 
4061  // Copy ICVs to the master thread's implicit taskdata
4062  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4063  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4064 
4065  KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4066  team->t.t_threads[0], team ) );
4067 }
4068 
4069 
4070 /* initialize the team data structure
4071  * this assumes the t_threads and t_max_nproc are already set
4072  * also, we don't touch the arguments */
4073 static void
4074 __kmp_initialize_team(
4075  kmp_team_t * team,
4076  int new_nproc,
4077  kmp_internal_control_t * new_icvs,
4078  ident_t * loc
4079 ) {
4080  KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4081 
4082  /* verify */
4083  KMP_DEBUG_ASSERT( team );
4084  KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4085  KMP_DEBUG_ASSERT( team->t.t_threads );
4086  KMP_MB();
4087 
4088  team->t.t_master_tid = 0; /* not needed */
4089  /* team->t.t_master_bar; not needed */
4090  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4091  team->t.t_nproc = new_nproc;
4092 
4093  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4094  team->t.t_next_pool = NULL;
4095  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4096 
4097  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4098  team->t.t_invoke = NULL; /* not needed */
4099 
4100  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4101  team->t.t_sched = new_icvs->sched;
4102 
4103 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4104  team->t.t_fp_control_saved = FALSE; /* not needed */
4105  team->t.t_x87_fpu_control_word = 0; /* not needed */
4106  team->t.t_mxcsr = 0; /* not needed */
4107 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4108 
4109  team->t.t_construct = 0;
4110  __kmp_init_lock( & team->t.t_single_lock );
4111 
4112  team->t.t_ordered .dt.t_value = 0;
4113  team->t.t_master_active = FALSE;
4114 
4115  memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4116 
4117 #ifdef KMP_DEBUG
4118  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4119 #endif
4120  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4121 
4122  team->t.t_control_stack_top = NULL;
4123 
4124  __kmp_reinitialize_team( team, new_icvs, loc );
4125 
4126  KMP_MB();
4127  KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4128 }
4129 
4130 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4131 /* Sets full mask for thread and returns old mask, no changes to structures. */
4132 static void
4133 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4134 {
4135  if ( KMP_AFFINITY_CAPABLE() ) {
4136  int status;
4137  if ( old_mask != NULL ) {
4138  status = __kmp_get_system_affinity( old_mask, TRUE );
4139  int error = errno;
4140  if ( status != 0 ) {
4141  __kmp_msg(
4142  kmp_ms_fatal,
4143  KMP_MSG( ChangeThreadAffMaskError ),
4144  KMP_ERR( error ),
4145  __kmp_msg_null
4146  );
4147  }
4148  }
4149  __kmp_set_system_affinity( __kmp_affinity_get_fullMask(), TRUE );
4150  }
4151 }
4152 #endif
4153 
4154 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4155 
4156 //
4157 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4158 // It calculates the worker + master thread's partition based upon the parent
4159 // thread's partition, and binds each worker to a thread in their partition.
4160 // The master thread's partition should already include its current binding.
4161 //
4162 static void
4163 __kmp_partition_places( kmp_team_t *team )
4164 {
4165  //
4166  // Copy the master thread's place partition to the team struct
4167  //
4168  kmp_info_t *master_th = team->t.t_threads[0];
4169  KMP_DEBUG_ASSERT( master_th != NULL );
4170  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4171  int first_place = master_th->th.th_first_place;
4172  int last_place = master_th->th.th_last_place;
4173  int masters_place = master_th->th.th_current_place;
4174  team->t.t_first_place = first_place;
4175  team->t.t_last_place = last_place;
4176 
4177  KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4178  proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4179  masters_place, first_place, last_place ) );
4180 
4181  switch ( proc_bind ) {
4182 
4183  case proc_bind_default:
4184  //
4185  // serial teams might have the proc_bind policy set to
4186  // proc_bind_default. It doesn't matter, as we don't
4187  // rebind the master thread for any proc_bind policy.
4188  //
4189  KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4190  break;
4191 
4192  case proc_bind_master:
4193  {
4194  int f;
4195  int n_th = team->t.t_nproc;
4196  for ( f = 1; f < n_th; f++ ) {
4197  kmp_info_t *th = team->t.t_threads[f];
4198  KMP_DEBUG_ASSERT( th != NULL );
4199  th->th.th_first_place = first_place;
4200  th->th.th_last_place = last_place;
4201  th->th.th_new_place = masters_place;
4202 
4203  KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4204  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4205  team->t.t_id, f, masters_place, first_place, last_place ) );
4206  }
4207  }
4208  break;
4209 
4210  case proc_bind_close:
4211  {
4212  int f;
4213  int n_th = team->t.t_nproc;
4214  int n_places;
4215  if ( first_place <= last_place ) {
4216  n_places = last_place - first_place + 1;
4217  }
4218  else {
4219  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4220  }
4221  if ( n_th <= n_places ) {
4222  int place = masters_place;
4223  for ( f = 1; f < n_th; f++ ) {
4224  kmp_info_t *th = team->t.t_threads[f];
4225  KMP_DEBUG_ASSERT( th != NULL );
4226 
4227  if ( place == last_place ) {
4228  place = first_place;
4229  }
4230  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4231  place = 0;
4232  }
4233  else {
4234  place++;
4235  }
4236  th->th.th_first_place = first_place;
4237  th->th.th_last_place = last_place;
4238  th->th.th_new_place = place;
4239 
4240  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4241  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4242  team->t.t_id, f, place, first_place, last_place ) );
4243  }
4244  }
4245  else {
4246  int S, rem, gap, s_count;
4247  S = n_th / n_places;
4248  s_count = 0;
4249  rem = n_th - ( S * n_places );
4250  gap = rem > 0 ? n_places/rem : n_places;
4251  int place = masters_place;
4252  int gap_ct = gap;
4253  for ( f = 0; f < n_th; f++ ) {
4254  kmp_info_t *th = team->t.t_threads[f];
4255  KMP_DEBUG_ASSERT( th != NULL );
4256 
4257  th->th.th_first_place = first_place;
4258  th->th.th_last_place = last_place;
4259  th->th.th_new_place = place;
4260  s_count++;
4261 
4262  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4263  // do nothing, add an extra thread to place on next iteration
4264  }
4265  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4266  // we added an extra thread to this place; move to next place
4267  if ( place == last_place ) {
4268  place = first_place;
4269  }
4270  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4271  place = 0;
4272  }
4273  else {
4274  place++;
4275  }
4276  s_count = 0;
4277  gap_ct = 1;
4278  rem--;
4279  }
4280  else if (s_count == S) { // place full; don't add extra
4281  if ( place == last_place ) {
4282  place = first_place;
4283  }
4284  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4285  place = 0;
4286  }
4287  else {
4288  place++;
4289  }
4290  gap_ct++;
4291  s_count = 0;
4292  }
4293 
4294  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4295  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4296  team->t.t_id, f, th->th.th_new_place, first_place,
4297  last_place ) );
4298  }
4299  KMP_DEBUG_ASSERT( place == masters_place );
4300  }
4301  }
4302  break;
4303 
4304  case proc_bind_spread:
4305  {
4306  int f;
4307  int n_th = team->t.t_nproc;
4308  int n_places;
4309  if ( first_place <= last_place ) {
4310  n_places = last_place - first_place + 1;
4311  }
4312  else {
4313  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4314  }
4315  if ( n_th <= n_places ) {
4316  int place = masters_place;
4317  int S = n_places/n_th;
4318  int s_count, rem, gap, gap_ct;
4319  rem = n_places - n_th*S;
4320  gap = rem ? n_th/rem : 1;
4321  gap_ct = gap;
4322  for ( f = 0; f < n_th; f++ ) {
4323  kmp_info_t *th = team->t.t_threads[f];
4324  KMP_DEBUG_ASSERT( th != NULL );
4325 
4326  th->th.th_first_place = place;
4327  th->th.th_new_place = place;
4328  s_count = 1;
4329  while (s_count < S) {
4330  if ( place == last_place ) {
4331  place = first_place;
4332  }
4333  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4334  place = 0;
4335  }
4336  else {
4337  place++;
4338  }
4339  s_count++;
4340  }
4341  if (rem && (gap_ct == gap)) {
4342  if ( place == last_place ) {
4343  place = first_place;
4344  }
4345  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4346  place = 0;
4347  }
4348  else {
4349  place++;
4350  }
4351  rem--;
4352  gap_ct = 0;
4353  }
4354  th->th.th_last_place = place;
4355  gap_ct++;
4356 
4357  if ( place == last_place ) {
4358  place = first_place;
4359  }
4360  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4361  place = 0;
4362  }
4363  else {
4364  place++;
4365  }
4366 
4367  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4368  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4369  team->t.t_id, f, th->th.th_new_place,
4370  th->th.th_first_place, th->th.th_last_place ) );
4371  }
4372  KMP_DEBUG_ASSERT( place == masters_place );
4373  }
4374  else {
4375  int S, rem, gap, s_count;
4376  S = n_th / n_places;
4377  s_count = 0;
4378  rem = n_th - ( S * n_places );
4379  gap = rem > 0 ? n_places/rem : n_places;
4380  int place = masters_place;
4381  int gap_ct = gap;
4382  for ( f = 0; f < n_th; f++ ) {
4383  kmp_info_t *th = team->t.t_threads[f];
4384  KMP_DEBUG_ASSERT( th != NULL );
4385 
4386  th->th.th_first_place = place;
4387  th->th.th_last_place = place;
4388  th->th.th_new_place = place;
4389  s_count++;
4390 
4391  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4392  // do nothing, add an extra thread to place on next iteration
4393  }
4394  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4395  // we added an extra thread to this place; move on to next place
4396  if ( place == last_place ) {
4397  place = first_place;
4398  }
4399  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4400  place = 0;
4401  }
4402  else {
4403  place++;
4404  }
4405  s_count = 0;
4406  gap_ct = 1;
4407  rem--;
4408  }
4409  else if (s_count == S) { // place is full; don't add extra thread
4410  if ( place == last_place ) {
4411  place = first_place;
4412  }
4413  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4414  place = 0;
4415  }
4416  else {
4417  place++;
4418  }
4419  gap_ct++;
4420  s_count = 0;
4421  }
4422 
4423  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4424  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4425  team->t.t_id, f, th->th.th_new_place,
4426  th->th.th_first_place, th->th.th_last_place) );
4427  }
4428  KMP_DEBUG_ASSERT( place == masters_place );
4429  }
4430  }
4431  break;
4432 
4433  default:
4434  break;
4435  }
4436 
4437  KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4438 }
4439 
4440 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
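
/* Illustrative sketch (assumption: standalone toy, not part of the runtime).
   When a team has more threads than places, the close/spread cases above pack
   S = n_th / n_places threads onto each place and hand out the remaining
   rem = n_th % n_places extra threads every 'gap' places. The toy below only
   counts how many threads each place would receive, starting from place 0
   instead of the master's place and wrapping with a simple modulo. */
#if 0
#include <stdio.h>

static void toy_threads_per_place( int n_th, int n_places ) {
    int S = n_th / n_places;
    int rem = n_th - S * n_places;
    int gap = rem > 0 ? n_places / rem : n_places;
    int place = 0, s_count = 0, gap_ct = gap;
    int per_place[64] = { 0 };                  /* assumes n_places <= 64 */
    for ( int f = 0; f < n_th; ++f ) {
        per_place[place]++;
        s_count++;
        if ( s_count == S && rem && gap_ct == gap ) {
            /* keep one extra thread on this place */
        } else if ( s_count == S + 1 && rem && gap_ct == gap ) {
            place = (place + 1) % n_places;     /* extra thread placed; move on */
            s_count = 0; gap_ct = 1; rem--;
        } else if ( s_count == S ) {
            place = (place + 1) % n_places;     /* place full; no extra here */
            s_count = 0; gap_ct++;
        }
    }
    for ( int p = 0; p < n_places; ++p )
        printf( "place %d: %d threads\n", p, per_place[p] );
}

int main( void ) {
    toy_threads_per_place( 10, 4 );   /* prints 3, 2, 3, 2 */
    return 0;
}
#endif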
4441 
4442 /* allocate a new team data structure to use. take one off of the free pool if available */
4443 kmp_team_t *
4444 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4445 #if OMP_40_ENABLED
4446  kmp_proc_bind_t new_proc_bind,
4447 #endif
4448  kmp_internal_control_t *new_icvs,
4449  int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4450 {
4451  KMP_TIME_BLOCK(KMP_allocate_team);
4452  int f;
4453  kmp_team_t *team;
4454  char *ptr;
4455  size_t size;
4456  int use_hot_team = ! root->r.r_active;
4457  int level = 0;
4458 
4459  KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4460  KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4461  KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4462  KMP_MB();
4463 
4464 #if KMP_NESTED_HOT_TEAMS
4465  kmp_hot_team_ptr_t *hot_teams;
4466  if( master ) {
4467  team = master->th.th_team;
4468  level = team->t.t_active_level;
4469  if( master->th.th_teams_microtask ) { // in teams construct?
4470  if( master->th.th_teams_size.nteams > 1 && ( // #teams > 1
4471  team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4472  master->th.th_teams_level < team->t.t_level ) ) { // or nested parallel inside the teams
4473  ++level; // do not increment if #teams==1, or for the outer fork of the teams; increment otherwise
4474  }
4475  }
4476  hot_teams = master->th.th_hot_teams;
4477  if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4478  { // hot team has already been allocated for given level
4479  use_hot_team = 1;
4480  } else {
4481  use_hot_team = 0;
4482  }
4483  }
4484 #endif
4485  // Optimization to use a "hot" team
4486  if( use_hot_team && new_nproc > 1 ) {
4487  KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4488 #if KMP_NESTED_HOT_TEAMS
4489  team = hot_teams[level].hot_team;
4490 #else
4491  team = root->r.r_hot_team;
4492 #endif
4493 #if KMP_DEBUG
4494  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4495  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4496  team->t.t_task_team[0], team->t.t_task_team[1] ));
4497  }
4498 #endif
4499 
4500  // Has the number of threads changed?
4501  /* Let's assume the most common case is that the number of threads is unchanged, and
4502  put that case first. */
4503  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4504  KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4505  // This case can mean that omp_set_num_threads() was called and the hot team size
4506  // was already reduced, so we check the special flag
4507  if ( team->t.t_size_changed == -1 ) {
4508  team->t.t_size_changed = 1;
4509  } else {
4510  team->t.t_size_changed = 0;
4511  }
4512 
4513  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4514  team->t.t_sched = new_icvs->sched;
4515 
4516  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4517 
4518  KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4519  0, team->t.t_threads[0], team ) );
4520  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4521 
4522 #if OMP_40_ENABLED
4523 # if KMP_AFFINITY_SUPPORTED
4524  if ( team->t.t_proc_bind == new_proc_bind ) {
4525  KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4526  team->t.t_id, new_proc_bind, team->t.t_first_place,
4527  team->t.t_last_place ) );
4528  }
4529  else {
4530  team->t.t_proc_bind = new_proc_bind;
4531  __kmp_partition_places( team );
4532  }
4533 # else
4534  if ( team->t.t_proc_bind != new_proc_bind ) {
4535  team->t.t_proc_bind = new_proc_bind;
4536  }
4537 # endif /* KMP_AFFINITY_SUPPORTED */
4538 #endif /* OMP_40_ENABLED */
4539 
4540  if (level) {
4541  for(f = 0; f < new_nproc; ++f) {
4542  team->t.t_threads[f]->th.th_task_state = 0;
4543  }
4544  }
4545  }
4546  else if( team->t.t_nproc > new_nproc ) {
4547  KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4548 
4549  team->t.t_size_changed = 1;
4550  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4551  // Signal the worker threads (esp. extra ones) to stop looking for tasks while spin waiting.
4552  // The task teams are reference counted and will be deallocated by the last worker thread.
4553  int tt_idx;
4554  for (tt_idx=0; tt_idx<2; ++tt_idx) {
4555  // We don't know which of the two task teams workers are waiting on, so deactivate both.
4556  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
4557  if ( (task_team != NULL) && TCR_SYNC_4(task_team->tt.tt_active) ) {
4558  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
4559  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
4560  KMP_MB();
4561  KA_TRACE(20, ("__kmp_allocate_team: setting task_team %p to NULL\n",
4562  &team->t.t_task_team[tt_idx]));
4563  team->t.t_task_team[tt_idx] = NULL;
4564  }
4565  else {
4566  KMP_DEBUG_ASSERT( task_team == NULL );
4567  }
4568  }
4569  }
4570 #if KMP_NESTED_HOT_TEAMS
4571  if( __kmp_hot_teams_mode == 0 ) {
4572  // AC: the saved number of threads should match the team's value in this mode;
4573  // it can be bigger in mode 1, when the hot team has some threads in reserve
4574  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4575  hot_teams[level].hot_team_nth = new_nproc;
4576 #endif // KMP_NESTED_HOT_TEAMS
4577  /* release the extra threads we don't need any more */
4578  for( f = new_nproc ; f < team->t.t_nproc ; f++ ) {
4579  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4580  __kmp_free_thread( team->t.t_threads[ f ] );
4581  team->t.t_threads[ f ] = NULL;
4582  }
4583 #if KMP_NESTED_HOT_TEAMS
4584  } // (__kmp_hot_teams_mode == 0)
4585 #endif // KMP_NESTED_HOT_TEAMS
4586  team->t.t_nproc = new_nproc;
4587  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4588  team->t.t_sched = new_icvs->sched;
4589  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4590 
4591  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4592  // Init both task teams
4593  int tt_idx;
4594  for (tt_idx=0; tt_idx<2; ++tt_idx) {
4595  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
4596  if ( task_team != NULL ) {
4597  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
4598  task_team->tt.tt_nproc = new_nproc;
4599  task_team->tt.tt_unfinished_threads = new_nproc;
4600  task_team->tt.tt_ref_ct = new_nproc - 1;
4601  }
4602  }
4603  }
4604 
4605  /* update the remaining threads */
4606  if (level) {
4607  for(f = 0; f < new_nproc; ++f) {
4608  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4609  team->t.t_threads[f]->th.th_task_state = 0;
4610  }
4611  }
4612  else {
4613  for(f = 0; f < new_nproc; ++f) {
4614  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4615  }
4616  }
4617  // restore the current task state of the master thread: should be the implicit task
4618  KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4619  0, team->t.t_threads[0], team ) );
4620 
4621  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4622 
4623 #ifdef KMP_DEBUG
4624  for ( f = 0; f < team->t.t_nproc; f++ ) {
4625  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4626  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4627  }
4628 #endif
4629 
4630 #if OMP_40_ENABLED
4631  team->t.t_proc_bind = new_proc_bind;
4632 # if KMP_AFFINITY_SUPPORTED
4633  __kmp_partition_places( team );
4634 # endif
4635 #endif
4636  }
4637  else { // team->t.t_nproc < new_nproc
4638 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4639  kmp_affin_mask_t *old_mask;
4640  if ( KMP_AFFINITY_CAPABLE() ) {
4641  KMP_CPU_ALLOC(old_mask);
4642  }
4643 #endif
4644 
4645  KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4646 
4647  team->t.t_size_changed = 1;
4648 
4649 
4650 #if KMP_NESTED_HOT_TEAMS
4651  int avail_threads = hot_teams[level].hot_team_nth;
4652  if( new_nproc < avail_threads )
4653  avail_threads = new_nproc;
4654  kmp_info_t **other_threads = team->t.t_threads;
4655  for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4656  // Adjust barrier data of reserved threads (if any) of the team
4657  // Other data will be set in __kmp_initialize_info() below.
4658  int b;
4659  kmp_balign_t * balign = other_threads[f]->th.th_bar;
4660  for ( b = 0; b < bs_last_barrier; ++ b ) {
4661  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4662  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4663 #if USE_DEBUGGER
4664  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4665 #endif
4666  }
4667  }
4668  if( hot_teams[level].hot_team_nth >= new_nproc ) {
4669  // we have all needed threads in reserve, no need to allocate any
4670  // this is only possible in mode 1; there cannot be reserved threads in mode 0
4671  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4672  team->t.t_nproc = new_nproc; // just get reserved threads involved
4673  } else {
4674  // we may have some threads in reserve, but not enough
4675  team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
4676  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
4677 #endif // KMP_NESTED_HOT_TEAMS
4678  if(team->t.t_max_nproc < new_nproc) {
4679  /* reallocate larger arrays */
4680  __kmp_reallocate_team_arrays(team, new_nproc);
4681  __kmp_reinitialize_team( team, new_icvs, NULL );
4682  }
4683 
4684 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4685  /* Temporarily set full mask for master thread before
4686  creation of workers. The reason is that workers inherit
4687  the affinity from master, so if a lot of workers are
4688  created on a single core quickly, they don't get
4689  a chance to set their own affinity for a long time.
4690  */
4691  __kmp_set_thread_affinity_mask_full_tmp( old_mask );
4692 #endif
4693 
4694  /* allocate new threads for the hot team */
4695  for( f = team->t.t_nproc ; f < new_nproc ; f++ ) {
4696  kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
4697  KMP_DEBUG_ASSERT( new_worker );
4698  team->t.t_threads[ f ] = new_worker;
4699  new_worker->th.th_team_nproc = team->t.t_nproc;
4700 
4701  KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%u, plain=%u\n",
4702  team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
4703  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4704  team->t.t_bar[bs_plain_barrier].b_arrived ) );
4705 
4706  { // Initialize barrier data for new threads.
4707  int b;
4708  kmp_balign_t * balign = new_worker->th.th_bar;
4709  for( b = 0; b < bs_last_barrier; ++ b ) {
4710  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
4711  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4712 #if USE_DEBUGGER
4713  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
4714 #endif
4715  }
4716  }
4717  }
4718 
4719 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4720  if ( KMP_AFFINITY_CAPABLE() ) {
4721  /* Restore initial master thread's affinity mask */
4722  __kmp_set_system_affinity( old_mask, TRUE );
4723  KMP_CPU_FREE(old_mask);
4724  }
4725 #endif
4726 #if KMP_NESTED_HOT_TEAMS
4727  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
4728 #endif // KMP_NESTED_HOT_TEAMS
4729  /* make sure everyone is synchronized */
4730  __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
4731 
4732  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4733  int tt_idx;
4734  for (tt_idx=0; tt_idx<2; ++tt_idx) {
4735  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
4736  if ( task_team != NULL ) {
4737  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
4738  task_team->tt.tt_nproc = new_nproc;
4739  task_team->tt.tt_unfinished_threads = new_nproc;
4740  task_team->tt.tt_ref_ct = new_nproc - 1;
4741  }
4742  }
4743  }
4744 
4745  /* reinitialize the old threads */
4746  if (level) {
4747  for( f = 0 ; f < team->t.t_nproc ; f++ ) {
4748  __kmp_initialize_info( team->t.t_threads[ f ], team, f,
4749  __kmp_gtid_from_tid( f, team ) );
4750  }
4751  }
4752  else {
4753  int old_state = team->t.t_threads[0]->th.th_task_state;
4754  for (f=0; f < team->t.t_nproc; ++f) {
4755  __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
4756  }
4757  // Old threads (except for master) and new threads need task_state and task_team reinitialized.
4758  // Master thread will make a memo of old task_state later, then it will update task_state and task_team
4759  for( f = 1 ; f < new_nproc ; f++ ) {
4760  team->t.t_threads[f]->th.th_task_state = old_state;
4761  team->t.t_threads[f]->th.th_task_team = team->t.t_task_team[team->t.t_threads[f]->th.th_task_state];
4762  }
4763  }
4764 
4765 #ifdef KMP_DEBUG
4766  for ( f = 0; f < team->t.t_nproc; ++ f ) {
4767  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4768  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4769  }
4770 #endif
4771 
4772 #if OMP_40_ENABLED
4773  team->t.t_proc_bind = new_proc_bind;
4774 # if KMP_AFFINITY_SUPPORTED
4775  __kmp_partition_places( team );
4776 # endif
4777 #endif
4778  } // Check changes in number of threads
4779 
4780 #if OMP_40_ENABLED
4781  kmp_info_t *master = team->t.t_threads[0];
4782  if( master->th.th_teams_microtask ) {
4783  for( f = 1; f < new_nproc; ++f ) {
4784  // propagate teams construct specific info to workers
4785  kmp_info_t *thr = team->t.t_threads[f];
4786  thr->th.th_teams_microtask = master->th.th_teams_microtask;
4787  thr->th.th_teams_level = master->th.th_teams_level;
4788  thr->th.th_teams_size = master->th.th_teams_size;
4789  }
4790  }
4791 #endif /* OMP_40_ENABLED */
4792 #if KMP_NESTED_HOT_TEAMS
4793  if( level ) {
4794  // Sync task (TODO: and barrier?) state for nested hot teams, not needed for outermost hot team.
4795  for( f = 1; f < new_nproc; ++f ) {
4796  kmp_info_t *thr = team->t.t_threads[f];
4797  thr->th.th_task_state = 0;
4798  int b;
4799  kmp_balign_t * balign = thr->th.th_bar;
4800  for( b = 0; b < bs_last_barrier; ++ b ) {
4801  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
4802  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4803 #if USE_DEBUGGER
4804  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
4805 #endif
4806  }
4807  }
4808  }
4809 #endif // KMP_NESTED_HOT_TEAMS
4810 
4811  /* reallocate space for arguments if necessary */
4812  __kmp_alloc_argv_entries( argc, team, TRUE );
4813  team->t.t_argc = argc;
4814  //
4815  // The hot team re-uses the previous task team,
4816  // if untouched during the previous release->gather phase.
4817  //
4818 
4819  KF_TRACE( 10, ( " hot_team = %p\n", team ) );
4820 
4821 #if KMP_DEBUG
4822  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4823  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
4824  team->t.t_task_team[0], team->t.t_task_team[1] ));
4825  }
4826 #endif
4827 
4828  KMP_MB();
4829 
4830  return team;
4831  }
4832 
4833  /* next, let's try to take one from the team pool */
4834  KMP_MB();
4835  for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
4836  {
4837  /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
4838  if ( team->t.t_max_nproc >= max_nproc ) {
4839  /* take this team from the team pool */
4840  __kmp_team_pool = team->t.t_next_pool;
4841 
4842  /* setup the team for fresh use */
4843  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
4844 
4845  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
4846  &team->t.t_task_team[0], &team->t.t_task_team[1]) );
4847  team->t.t_task_team[0] = NULL;
4848  team->t.t_task_team[1] = NULL;
4849 
4850  /* reallocate space for arguments if necessary */
4851  __kmp_alloc_argv_entries( argc, team, TRUE );
4852  team->t.t_argc = argc;
4853 
4854  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
4855  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4856  { // Initialize barrier data.
4857  int b;
4858  for ( b = 0; b < bs_last_barrier; ++ b) {
4859  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
4860 #if USE_DEBUGGER
4861  team->t.t_bar[ b ].b_master_arrived = 0;
4862  team->t.t_bar[ b ].b_team_arrived = 0;
4863 #endif
4864  }
4865  }
4866 
4867 #if OMP_40_ENABLED
4868  team->t.t_proc_bind = new_proc_bind;
4869 #endif
4870 
4871  KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
4872  KMP_MB();
4873 
4874  return team;
4875  }
4876 
4877  /* reap team if it is too small, then loop back and check the next one */
4878  /* not sure if this is wise, but this will be redone during the hot-teams rewrite. */
4879  /* TODO: Use technique to find the right size hot-team, don't reap them */
4880  team = __kmp_reap_team( team );
4881  __kmp_team_pool = team;
4882  }
4883 
4884  /* nothing available in the pool, no matter, make a new team! */
4885  KMP_MB();
4886  team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
4887 
4888  /* and set it up */
4889  team->t.t_max_nproc = max_nproc;
4890  /* NOTE: for some reason, allocating one big buffer and dividing it
4891  * up seems to hurt performance a lot on the P4, so let's not use
4892  * this approach... */
4893  __kmp_allocate_team_arrays( team, max_nproc );
4894 
4895  KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
4896  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
4897 
4898  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
4899  &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
4900  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
4901  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
4902 
4903  if ( __kmp_storage_map ) {
4904  __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
4905  }
4906 
4907  /* allocate space for arguments */
4908  __kmp_alloc_argv_entries( argc, team, FALSE );
4909  team->t.t_argc = argc;
4910 
4911  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
4912  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4913  { // Initialize barrier data.
4914  int b;
4915  for ( b = 0; b < bs_last_barrier; ++ b ) {
4916  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
4917 #if USE_DEBUGGER
4918  team->t.t_bar[ b ].b_master_arrived = 0;
4919  team->t.t_bar[ b ].b_team_arrived = 0;
4920 #endif
4921  }
4922  }
4923 
4924 #if OMP_40_ENABLED
4925  team->t.t_proc_bind = new_proc_bind;
4926 #endif
4927 
4928  KMP_MB();
4929 
4930  KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
4931 
4932  return team;
4933 }
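/*
   The allocation path above tries, in order: (1) reuse the hot team, (2) take a
   sufficiently large team from __kmp_team_pool (reaping undersized ones along the
   way), and (3) allocate a fresh team. Below is an illustrative sketch of that
   "reuse before allocate" pooling pattern using hypothetical names and plain C;
   it is not part of the runtime and does not model ICVs, barriers, or task teams.
*/
#if 0 /* illustrative sketch only -- not compiled */
#include <stdlib.h>

typedef struct pooled_team {
    int                 max_nproc;   /* capacity of this cached object       */
    struct pooled_team *next_pool;   /* intrusive link, like t_next_pool     */
} pooled_team_t;

static pooled_team_t *team_pool = NULL;   /* stand-in for __kmp_team_pool */

static pooled_team_t *
acquire_team( int max_nproc )
{
    /* Walk the pool: reuse the first team that is big enough, reap the rest. */
    while ( team_pool != NULL ) {
        pooled_team_t *team = team_pool;
        if ( team->max_nproc >= max_nproc ) {
            team_pool = team->next_pool;     /* unlink it and hand it out */
            return team;
        }
        team_pool = team->next_pool;         /* too small: reap it        */
        free( team );
    }
    /* Nothing suitable is cached: allocate a new one. */
    pooled_team_t *team = (pooled_team_t *) malloc( sizeof( *team ) );
    team->max_nproc = max_nproc;
    team->next_pool = NULL;
    return team;
}
#endif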
4934 
4935 /* TODO implement hot-teams at all levels */
4936 /* TODO implement lazy thread release on demand (disband request) */
4937 
4938 /* free the team. return it to the team pool. release all the threads
4939  * associated with it */
4940 void
4941 __kmp_free_team( kmp_root_t *root, kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master) )
4942 {
4943  int f;
4944  KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
4945 
4946  /* verify state */
4947  KMP_DEBUG_ASSERT( root );
4948  KMP_DEBUG_ASSERT( team );
4949  KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
4950  KMP_DEBUG_ASSERT( team->t.t_threads );
4951 
4952  int use_hot_team = team == root->r.r_hot_team;
4953 #if KMP_NESTED_HOT_TEAMS
4954  int level;
4955  kmp_hot_team_ptr_t *hot_teams;
4956  if( master ) {
4957  level = team->t.t_active_level - 1;
4958  if( master->th.th_teams_microtask ) { // in teams construct?
4959  if( master->th.th_teams_size.nteams > 1 ) {
4960  ++level; // level was not increased in teams construct for team_of_masters
4961  }
4962  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
4963  master->th.th_teams_level == team->t.t_level ) {
4964  ++level; // level was not increased in teams construct for team_of_workers before the parallel
4965  } // team->t.t_level will be increased inside parallel
4966  }
4967  hot_teams = master->th.th_hot_teams;
4968  if( level < __kmp_hot_teams_max_level ) {
4969  KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
4970  use_hot_team = 1;
4971  }
4972  }
4973 #endif // KMP_NESTED_HOT_TEAMS
4974 
4975  /* team is done working */
4976  TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
4977  team->t.t_copyin_counter = 0; // init counter for possible reuse
4978  // Do not reset pointer to parent team to NULL for hot teams.
4979 
4980  /* if we are non-hot team, release our threads */
4981  if( ! use_hot_team ) {
4982  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4983  int tt_idx;
4984  for (tt_idx=0; tt_idx<2; ++tt_idx) {
4985  // We don't know which of the two task teams workers are waiting on, so deactivate both.
4986  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
4987  if ( task_team != NULL ) {
4988  // Signal the worker threads to stop looking for tasks while spin waiting. The task
4989  // teams are reference counted and will be deallocated by the last worker thread via the
4990  // thread's pointer to the task team.
4991  KA_TRACE( 20, ( "__kmp_free_team: deactivating task_team %p\n", task_team ) );
4992  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
4993  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
4994  KMP_MB();
4995  team->t.t_task_team[tt_idx] = NULL;
4996  }
4997  }
4998  }
4999 
5000  // Reset pointer to parent team only for non-hot teams.
5001  team->t.t_parent = NULL;
5002 
5003 
5004  /* free the worker threads */
5005  for ( f = 1; f < team->t.t_nproc; ++ f ) {
5006  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5007  __kmp_free_thread( team->t.t_threads[ f ] );
5008  team->t.t_threads[ f ] = NULL;
5009  }
5010 
5011 
5012  /* put the team back in the team pool */
5013  /* TODO limit size of team pool, call reap_team if pool too large */
5014  team->t.t_next_pool = (kmp_team_t*) __kmp_team_pool;
5015  __kmp_team_pool = (volatile kmp_team_t*) team;
5016  }
5017 
5018  KMP_MB();
5019 }
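/*
   The counterpart of the pool scan in __kmp_allocate_team: a non-hot team is simply
   pushed onto the head of the intrusive free list. Illustrative sketch reusing the
   hypothetical pooled_team_t from the previous sketch; the real code also
   deactivates task teams and frees the worker threads first.
*/
#if 0 /* illustrative sketch only -- not compiled */
static void
release_team( pooled_team_t *team )
{
    team->next_pool = team_pool;   /* push-front, like t_next_pool / __kmp_team_pool */
    team_pool       = team;
}
#endif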
5020 
5021 
5022 /* reap the team. destroy it, reclaim all its resources and free its memory */
5023 kmp_team_t *
5024 __kmp_reap_team( kmp_team_t *team )
5025 {
5026  kmp_team_t *next_pool = team->t.t_next_pool;
5027 
5028  KMP_DEBUG_ASSERT( team );
5029  KMP_DEBUG_ASSERT( team->t.t_dispatch );
5030  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5031  KMP_DEBUG_ASSERT( team->t.t_threads );
5032  KMP_DEBUG_ASSERT( team->t.t_argv );
5033 
5034  /* TODO clean the threads that are a part of this? */
5035 
5036  /* free stuff */
5037 
5038  __kmp_free_team_arrays( team );
5039  if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5040  __kmp_free( (void*) team->t.t_argv );
5041  __kmp_free( team );
5042 
5043  KMP_MB();
5044  return next_pool;
5045 }
5046 
5047 //
5048 // Free the thread. Don't reap it, just place it on the pool of available
5049 // threads.
5050 //
5051 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5052 // binding for the affinity mechanism to be useful.
5053 //
5054 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5055 // However, we want to avoid a potential performance problem by always
5056 // scanning through the list to find the correct point at which to insert
5057 // the thread (potential N**2 behavior). To do this we keep track of the
5058 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5059 // With single-level parallelism, threads will always be added to the tail
5060 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5061 // parallelism, all bets are off and we may need to scan through the entire
5062 // free list.
5063 //
5064 // This change also has a potentially large performance benefit, for some
5065 // applications. Previously, as threads were freed from the hot team, they
5066 // would be placed back on the free list in inverse order. If the hot team
5067 // grew back to its original size, then the freed thread would be placed
5068 // back on the hot team in reverse order. This could cause bad cache
5069 // locality problems on programs where the size of the hot team regularly
5070 // grew and shrunk.
5071 //
5072 // Now, for single-level parallelism, the OMP tid is always == gtid.
5073 //
5074 void
5075 __kmp_free_thread( kmp_info_t *this_th )
5076 {
5077  int gtid;
5078  kmp_info_t **scan;
5079 
5080  KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5081  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5082 
5083  KMP_DEBUG_ASSERT( this_th );
5084 
5085  // When moving the thread to the pool, switch it to wait on its own b_go flag, and reset its barrier team pointers to NULL (uninitialized).
5086  int b;
5087  kmp_balign_t *balign = this_th->th.th_bar;
5088  for (b=0; b<bs_last_barrier; ++b) {
5089  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5090  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5091  balign[b].bb.team = NULL;
5092  }
5093 
5094 
5095  /* put thread back on the free pool */
5096  TCW_PTR(this_th->th.th_team, NULL);
5097  TCW_PTR(this_th->th.th_root, NULL);
5098  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5099 
5100  //
5101  // If the __kmp_thread_pool_insert_pt is already past the new insert
5102  // point, then we need to re-scan the entire list.
5103  //
5104  gtid = this_th->th.th_info.ds.ds_gtid;
5105  if ( __kmp_thread_pool_insert_pt != NULL ) {
5106  KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5107  if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5108  __kmp_thread_pool_insert_pt = NULL;
5109  }
5110  }
5111 
5112  //
5113  // Scan down the list to find the place to insert the thread.
5114  // scan is the address of a link in the list, possibly the address of
5115  // __kmp_thread_pool itself.
5116  //
5117  // In the absence of nested parallelism, the for loop will have 0 iterations.
5118  //
5119  if ( __kmp_thread_pool_insert_pt != NULL ) {
5120  scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5121  }
5122  else {
5123  scan = (kmp_info_t **)&__kmp_thread_pool;
5124  }
5125  for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5126  scan = &( (*scan)->th.th_next_pool ) );
5127 
5128  //
5129  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5130  // to its address.
5131  //
5132  TCW_PTR(this_th->th.th_next_pool, *scan);
5133  __kmp_thread_pool_insert_pt = *scan = this_th;
5134  KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5135  || ( this_th->th.th_info.ds.ds_gtid
5136  < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5137  TCW_4(this_th->th.th_in_pool, TRUE);
5138  __kmp_thread_pool_nth++;
5139 
5140  TCW_4(__kmp_nth, __kmp_nth - 1);
5141 
5142 #ifdef KMP_ADJUST_BLOCKTIME
5143  /* Adjust blocktime back to user setting or default if necessary */
5144  /* Middle initialization might never have occurred */
5145  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5146  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5147  if ( __kmp_nth <= __kmp_avail_proc ) {
5148  __kmp_zero_bt = FALSE;
5149  }
5150  }
5151 #endif /* KMP_ADJUST_BLOCKTIME */
5152 
5153  KMP_MB();
5154 }
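/*
   The comment block above __kmp_free_thread describes keeping the thread pool
   sorted by gtid while caching the last insertion point so the common case avoids
   an O(N**2) rescan. Below is a self-contained, illustrative sketch of that sorted
   insertion with a cached insert point; all names are hypothetical and the real
   runtime stores considerably more per-thread state.
*/
#if 0 /* illustrative sketch only -- not compiled */
#include <stddef.h>

typedef struct pooled_thread {
    int                   gtid;
    struct pooled_thread *next_pool;
} pooled_thread_t;

static pooled_thread_t *thread_pool           = NULL;  /* sorted by gtid             */
static pooled_thread_t *thread_pool_insert_pt = NULL;  /* last insertion point cache */

static void
pool_insert_sorted( pooled_thread_t *th )
{
    pooled_thread_t **scan;

    /* If the cached insert point is already past the new gtid, fall back to a full scan. */
    if ( thread_pool_insert_pt != NULL &&
         thread_pool_insert_pt->gtid > th->gtid ) {
        thread_pool_insert_pt = NULL;
    }
    scan = ( thread_pool_insert_pt != NULL )
           ? &thread_pool_insert_pt->next_pool
           : &thread_pool;

    /* Advance to the first node with a larger gtid (0 iterations in the common,
       single-level case where threads are freed in increasing gtid order). */
    for ( ; *scan != NULL && (*scan)->gtid < th->gtid; scan = &(*scan)->next_pool )
        ;

    th->next_pool         = *scan;   /* splice in, keeping the list sorted */
    *scan                 = th;
    thread_pool_insert_pt = th;      /* remember where we inserted         */
}
#endif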
5155 
5156 
5157 /* ------------------------------------------------------------------------ */
5158 
5159 void *
5160 __kmp_launch_thread( kmp_info_t *this_thr )
5161 {
5162  int gtid = this_thr->th.th_info.ds.ds_gtid;
5163 /* void *stack_data;*/
5164  kmp_team_t *(*volatile pteam);
5165 
5166  KMP_MB();
5167  KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5168 
5169  if( __kmp_env_consistency_check ) {
5170  this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak?
5171  }
5172 
5173  /* This is the place where threads wait for work */
5174  while( ! TCR_4(__kmp_global.g.g_done) ) {
5175  KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5176  KMP_MB();
5177 
5178  /* wait for work to do */
5179  KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5180 
5181  /* No tid yet since not part of a team */
5182  __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5183 
5184  pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
5185 
5186  /* have we been allocated? */
5187  if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5188  /* we were just woken up, so run our new task */
5189  if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5190  int rc;
5191  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5192  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5193 
5194  updateHWFPControl (*pteam);
5195 
5196  KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop);
5197  {
5198  KMP_TIME_BLOCK(USER_worker_invoke);
5199  rc = (*pteam)->t.t_invoke( gtid );
5200  }
5201  KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop);
5202  KMP_ASSERT( rc );
5203 
5204  KMP_MB();
5205  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5206  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5207  }
5208  /* join barrier after parallel region */
5209  __kmp_join_barrier( gtid );
5210  }
5211  }
5212  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5213 
5214  if ( TCR_PTR( this_thr->th.th_task_team ) != NULL ) {
5215  __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
5216  }
5217  /* run the destructors for the threadprivate data for this thread */
5218  __kmp_common_destroy_gtid( gtid );
5219 
5220  KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5221  KMP_MB();
5222  return this_thr;
5223 }
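/*
   The shape of the worker loop above, reduced to its essentials: park until either
   work (a team) is published or a global "done" flag is set, run the work, then go
   back to sleep. Illustrative sketch using a pthread condition variable in place of
   the fork/join barriers; names are hypothetical and error handling is omitted.
*/
#if 0 /* illustrative sketch only -- not compiled */
#include <pthread.h>

typedef void (*microtask_fn)( int gtid );

static pthread_mutex_t work_lock    = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  work_cv      = PTHREAD_COND_INITIALIZER;
static microtask_fn    pending_work = NULL;   /* stand-in for th_team / t_pkfn */
static int             global_done  = 0;      /* stand-in for g.g_done         */

static void *
worker_main( void *arg )
{
    int gtid = (int)(long) arg;
    pthread_mutex_lock( &work_lock );
    while ( ! global_done ) {
        while ( pending_work == NULL && ! global_done )   /* "fork barrier" */
            pthread_cond_wait( &work_cv, &work_lock );
        if ( pending_work != NULL ) {
            microtask_fn fn = pending_work;
            pending_work = NULL;
            pthread_mutex_unlock( &work_lock );
            fn( gtid );                                    /* invoke the microtask */
            pthread_mutex_lock( &work_lock );              /* "join barrier"       */
        }
    }
    pthread_mutex_unlock( &work_lock );
    return arg;
}
#endif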
5224 
5225 /* ------------------------------------------------------------------------ */
5226 /* ------------------------------------------------------------------------ */
5227 
5228 void
5229 __kmp_internal_end_dest( void *specific_gtid )
5230 {
5231  #if KMP_COMPILER_ICC
5232  #pragma warning( push )
5233  #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits
5234  #endif
5235  // Make sure no significant bits are lost
5236  int gtid = (kmp_intptr_t)specific_gtid - 1;
5237  #if KMP_COMPILER_ICC
5238  #pragma warning( pop )
5239  #endif
5240 
5241  KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5242  /* NOTE: the gtid is stored as gtid+1 in thread-local storage;
5243  * this is because 0 is reserved for the nothing-stored case */
5244 
5245  /* josh: One reason for setting the gtid specific data even when it is being
5246  destroyed by pthread is to allow gtid lookup through thread specific data
5247  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5248  that gets executed in the call to __kmp_internal_end_thread, actually
5249  gets the gtid through the thread specific data. Setting it here seems
5250  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5251  to run smoothly.
5252  todo: get rid of this after we remove the dependence on
5253  __kmp_gtid_get_specific
5254  */
5255  if(gtid >= 0 && KMP_UBER_GTID(gtid))
5256  __kmp_gtid_set_specific( gtid );
5257  #ifdef KMP_TDATA_GTID
5258  __kmp_gtid = gtid;
5259  #endif
5260  __kmp_internal_end_thread( gtid );
5261 }
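/*
   The +1 bias noted above exists because thread-specific data reads back as NULL
   (0) when nothing was ever stored, so gtid 0 must not be stored as 0. Illustrative
   sketch of the convention with a hypothetical pthread key; the runtime wraps this
   in __kmp_gtid_set_specific / __kmp_gtid_get_specific.
*/
#if 0 /* illustrative sketch only -- not compiled */
#include <pthread.h>
#include <stdint.h>

static pthread_key_t gtid_key;   /* assume pthread_key_create() was called once */

static void
gtid_set_specific( int gtid )
{
    /* store gtid+1 so that a stored gtid of 0 is distinguishable from "not set" */
    pthread_setspecific( gtid_key, (void *)(intptr_t)( gtid + 1 ) );
}

static int
gtid_get_specific( void )
{
    intptr_t v = (intptr_t) pthread_getspecific( gtid_key );
    return ( v == 0 ) ? -1 /* not registered */ : (int)( v - 1 );
}
#endif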
5262 
5263 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5264 
5265 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
5266 // perfectly, but in real libiomp5.so I have no evidence it is ever called. However, -fini linker
5267 // option in makefile.mk works fine.
5268 
5269 __attribute__(( destructor ))
5270 void
5271 __kmp_internal_end_dtor( void )
5272 {
5273  __kmp_internal_end_atexit();
5274 }
5275 
5276 void
5277 __kmp_internal_end_fini( void )
5278 {
5279  __kmp_internal_end_atexit();
5280 }
5281 
5282 #endif
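/*
   Two common ways to hook library cleanup when a Unix shared object is unloaded,
   as used above: a GCC/Clang destructor attribute and an explicit entry point named
   via the -fini linker option. Illustrative sketch with a hypothetical cleanup
   function; it is not the runtime's actual shutdown code.
*/
#if 0 /* illustrative sketch only -- not compiled */
static void my_lib_cleanup( void ) { /* release library-wide resources here */ }

/* Runs automatically at dlclose()/process exit when built with GCC or Clang. */
__attribute__(( destructor ))
static void my_lib_dtor( void ) { my_lib_cleanup(); }

/* Alternative: name an entry point and link with "-Wl,-fini,my_lib_fini". */
void my_lib_fini( void ) { my_lib_cleanup(); }
#endif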
5283 
5284 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5285 void
5286 __kmp_internal_end_atexit( void )
5287 {
5288  KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5289  /* [Windows]
5290  josh: ideally, we want to completely shutdown the library in this atexit handler, but
5291  stat code that depends on thread specific data for gtid fails because that data becomes
5292  unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5293  instead. We should eventually remove the dependency on __kmp_get_specific_gtid in the
5294  stat code and use __kmp_internal_end_library to cleanly shutdown the library.
5295 
5296 // TODO: Can some of this comment about GVS be removed?
5297  I suspect that the offending stat code is executed when the calling thread tries to
5298  clean up a dead root thread's data structures, resulting in GVS code trying to close
5299  the GVS structures for that thread, but since the stat code uses
5300  __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5301  cleaning up itself instead of another thread, it gets confused. This happens because
5302  allowing a thread to unregister and cleanup another thread is a recent modification for
5303  addressing an issue with Maxon Cinema4D. Based on the current design (20050722), a
5304  thread may end up trying to unregister another thread only if thread death does not
5305  trigger the calling of __kmp_internal_end_thread. For Linux* OS, there is the thread
5306  specific data destructor function to detect thread death. For Windows dynamic, there
5307  is DllMain(THREAD_DETACH). For Windows static, there is nothing. Thus, the
5308  workaround is applicable only for Windows static stat library.
5309  */
5310  __kmp_internal_end_library( -1 );
5311  #if KMP_OS_WINDOWS
5312  __kmp_close_console();
5313  #endif
5314 }
5315 
5316 static void
5317 __kmp_reap_thread(
5318  kmp_info_t * thread,
5319  int is_root
5320 ) {
5321 
5322  // It is assumed __kmp_forkjoin_lock is acquired.
5323 
5324  int gtid;
5325 
5326  KMP_DEBUG_ASSERT( thread != NULL );
5327 
5328  gtid = thread->th.th_info.ds.ds_gtid;
5329 
5330  if ( ! is_root ) {
5331 
5332  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5333  /* Assume the threads are at the fork barrier here */
5334  KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5335  /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5336  kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5337  __kmp_release_64(&flag);
5338  }; // if
5339 
5340 
5341  // Terminate OS thread.
5342  __kmp_reap_worker( thread );
5343 
5344  //
5345  // The thread was killed asynchronously. If it was actively
5346  // spinning in the thread pool, decrement the global count.
5347  //
5348  // There is a small timing hole here - if the worker thread was
5349  // just waking up after sleeping in the pool, had reset its
5350  // th_active_in_pool flag but not decremented the global counter
5351  // __kmp_thread_pool_active_nth yet, then the global counter
5352  // might not get updated.
5353  //
5354  // Currently, this can only happen as the library is unloaded,
5355  // so there are no harmful side effects.
5356  //
5357  if ( thread->th.th_active_in_pool ) {
5358  thread->th.th_active_in_pool = FALSE;
5359  KMP_TEST_THEN_DEC32(
5360  (kmp_int32 *) &__kmp_thread_pool_active_nth );
5361  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5362  }
5363 
5364  // Decrement # of [worker] threads in the pool.
5365  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5366  --__kmp_thread_pool_nth;
5367  }; // if
5368 
5369  // Free the fast memory for tasking
5370  #if USE_FAST_MEMORY
5371  __kmp_free_fast_memory( thread );
5372  #endif /* USE_FAST_MEMORY */
5373 
5374  __kmp_suspend_uninitialize_thread( thread );
5375 
5376  KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5377  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5378 
5379  -- __kmp_all_nth;
5380  // __kmp_nth was decremented when thread is added to the pool.
5381 
5382 #ifdef KMP_ADJUST_BLOCKTIME
5383  /* Adjust blocktime back to user setting or default if necessary */
5384  /* Middle initialization might never have occurred */
5385  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5386  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5387  if ( __kmp_nth <= __kmp_avail_proc ) {
5388  __kmp_zero_bt = FALSE;
5389  }
5390  }
5391 #endif /* KMP_ADJUST_BLOCKTIME */
5392 
5393  /* free the memory being used */
5394  if( __kmp_env_consistency_check ) {
5395  if ( thread->th.th_cons ) {
5396  __kmp_free_cons_stack( thread->th.th_cons );
5397  thread->th.th_cons = NULL;
5398  }; // if
5399  }
5400 
5401  if ( thread->th.th_pri_common != NULL ) {
5402  __kmp_free( thread->th.th_pri_common );
5403  thread->th.th_pri_common = NULL;
5404  }; // if
5405 
5406  if (thread->th.th_task_state_memo_stack != NULL) {
5407  __kmp_free(thread->th.th_task_state_memo_stack);
5408  thread->th.th_task_state_memo_stack = NULL;
5409  }
5410 
5411  #if KMP_USE_BGET
5412  if ( thread->th.th_local.bget_data != NULL ) {
5413  __kmp_finalize_bget( thread );
5414  }; // if
5415  #endif
5416 
5417 #if KMP_AFFINITY_SUPPORTED
5418  if ( thread->th.th_affin_mask != NULL ) {
5419  KMP_CPU_FREE( thread->th.th_affin_mask );
5420  thread->th.th_affin_mask = NULL;
5421  }; // if
5422 #endif /* KMP_AFFINITY_SUPPORTED */
5423 
5424  __kmp_reap_team( thread->th.th_serial_team );
5425  thread->th.th_serial_team = NULL;
5426  __kmp_free( thread );
5427 
5428  KMP_MB();
5429 
5430 } // __kmp_reap_thread
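/*
   The reap sequence above follows a common pattern: first wake the worker from
   whatever flag it may be sleeping on, then join the OS thread, and only then free
   its per-thread resources. Illustrative sketch with pthreads and hypothetical
   per-thread state; the real code releases a fork-barrier b_go flag instead.
*/
#if 0 /* illustrative sketch only -- not compiled */
#include <pthread.h>
#include <stdlib.h>

typedef struct worker {
    pthread_t        os_thread;
    pthread_mutex_t  lock;
    pthread_cond_t   go_cv;
    int              go;          /* worker sleeps until this becomes nonzero */
    void            *scratch;     /* per-thread allocation to release         */
} worker_t;

static void
reap_worker( worker_t *w )
{
    pthread_mutex_lock( &w->lock );
    w->go = 1;                          /* release the sleeping worker         */
    pthread_cond_signal( &w->go_cv );
    pthread_mutex_unlock( &w->lock );

    pthread_join( w->os_thread, NULL ); /* wait for the OS thread to terminate */

    free( w->scratch );                 /* now it is safe to free its resources */
    w->scratch = NULL;
}
#endif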
5431 
5432 static void
5433 __kmp_internal_end(void)
5434 {
5435  int i;
5436 
5437  /* First, unregister the library */
5438  __kmp_unregister_library();
5439 
5440  #if KMP_OS_WINDOWS
5441  /* In Win static library, we can't tell when a root actually dies, so we
5442  reclaim the data structures for any root threads that have died but not
5443  unregistered themselves, in order to shut down cleanly.
5444  In Win dynamic library we also can't tell when a thread dies.
5445  */
5446  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5447  #endif
5448 
5449  for( i=0 ; i<__kmp_threads_capacity ; i++ )
5450  if( __kmp_root[i] )
5451  if( __kmp_root[i]->r.r_active )
5452  break;
5453  KMP_MB(); /* Flush all pending memory write invalidates. */
5454  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5455 
5456  if ( i < __kmp_threads_capacity ) {
5457  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5458  KMP_MB(); /* Flush all pending memory write invalidates. */
5459 
5460  //
5461  // Need to check that monitor was initialized before reaping it.
5462  // If we are called from __kmp_atfork_child (which sets
5463  // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5464  // contain valid data, but it is only valid in the parent process,
5465  // not the child.
5466  //
5467  // One of the possible fixes for CQ138434 / CQ140126
5468  // (used in 20091103_dreamworks patch)
5469  //
5470  // New behavior (201008): instead of keying off of the flag
5471  // __kmp_init_parallel, the monitor thread creation is keyed off
5472  // of the new flag __kmp_init_monitor.
5473  //
5474  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5475  if ( TCR_4( __kmp_init_monitor ) ) {
5476  __kmp_reap_monitor( & __kmp_monitor );
5477  TCW_4( __kmp_init_monitor, 0 );
5478  }
5479  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5480  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5481  } else {
5482  /* TODO move this to cleanup code */
5483  #ifdef KMP_DEBUG
5484  /* make sure that everything has properly ended */
5485  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5486  if( __kmp_root[i] ) {
5487 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber threads alive here
5488  KMP_ASSERT( ! __kmp_root[i]->r.r_active ); // TODO: can they be active?
5489  }
5490  }
5491  #endif
5492 
5493  KMP_MB();
5494 
5495  // Reap the worker threads.
5496  // This is valid for now, but be careful if threads are reaped sooner.
5497  while ( __kmp_thread_pool != NULL ) { // Loop thru all the thread in the pool.
5498  // Get the next thread from the pool.
5499  kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5500  __kmp_thread_pool = thread->th.th_next_pool;
5501  // Reap it.
5502  thread->th.th_next_pool = NULL;
5503  thread->th.th_in_pool = FALSE;
5504  __kmp_reap_thread( thread, 0 );
5505  }; // while
5506  __kmp_thread_pool_insert_pt = NULL;
5507 
5508  // Reap teams.
5509  while ( __kmp_team_pool != NULL ) { // Loop thru all the teams in the pool.
5510  // Get the next team from the pool.
5511  kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5512  __kmp_team_pool = team->t.t_next_pool;
5513  // Reap it.
5514  team->t.t_next_pool = NULL;
5515  __kmp_reap_team( team );
5516  }; // while
5517 
5518  __kmp_reap_task_teams( );
5519 
5520  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5521  // TBD: Add some checking...
5522  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5523  }
5524 
5525  /* Make sure all threadprivate destructors get run by joining with all worker
5526  threads before resetting this flag */
5527  TCW_SYNC_4(__kmp_init_common, FALSE);
5528 
5529  KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5530  KMP_MB();
5531 
5532  //
5533  // See note above: One of the possible fixes for CQ138434 / CQ140126
5534  //
5535  // FIXME: push both code fragments down and CSE them?
5536  // push them into __kmp_cleanup() ?
5537  //
5538  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5539  if ( TCR_4( __kmp_init_monitor ) ) {
5540  __kmp_reap_monitor( & __kmp_monitor );
5541  TCW_4( __kmp_init_monitor, 0 );
5542  }
5543  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5544  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5545 
5546  } /* else !__kmp_global.t_active */
5547  TCW_4(__kmp_init_gtid, FALSE);
5548  KMP_MB(); /* Flush all pending memory write invalidates. */
5549 
5550 
5551  __kmp_cleanup();
5552 }
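/*
   The two "while pool != NULL" loops above drain intrusive singly-linked pools by
   repeatedly popping the head and reaping it. Illustrative sketch reusing the
   hypothetical pooled_thread_t from the free-thread sketch; reap_pooled_thread()
   is an assumed helper, not a runtime function.
*/
#if 0 /* illustrative sketch only -- not compiled */
extern void reap_pooled_thread( pooled_thread_t * );   /* hypothetical reap helper */

static void
drain_thread_pool( void )
{
    while ( thread_pool != NULL ) {
        pooled_thread_t *th = thread_pool;  /* pop the head           */
        thread_pool   = th->next_pool;
        th->next_pool = NULL;
        reap_pooled_thread( th );
    }
    thread_pool_insert_pt = NULL;           /* the cache is now stale */
}
#endif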
5553 
5554 void
5555 __kmp_internal_end_library( int gtid_req )
5556 {
5557  int i;
5558 
5559  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5560  /* this shouldn't be a race condition because __kmp_internal_end() is the
5561  * only place to clear __kmp_serial_init */
5562  /* we'll check this later too, after we get the lock */
5563  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5564  // because the next check will work in any case.
5565  if( __kmp_global.g.g_abort ) {
5566  KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5567  /* TODO abort? */
5568  return;
5569  }
5570  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5571  KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5572  return;
5573  }
5574 
5575 
5576  KMP_MB(); /* Flush all pending memory write invalidates. */
5577 
5578  /* find out who we are and what we should do */
5579  {
5580  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5581  KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req ));
5582  if( gtid == KMP_GTID_SHUTDOWN ) {
5583  KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5584  return;
5585  } else if( gtid == KMP_GTID_MONITOR ) {
5586  KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5587  return;
5588  } else if( gtid == KMP_GTID_DNE ) {
5589  KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
5590  /* we don't know who we are, but we may still shut down the library */
5591  } else if( KMP_UBER_GTID( gtid )) {
5592  /* unregister ourselves as an uber thread. gtid is no longer valid */
5593  if( __kmp_root[gtid]->r.r_active ) {
5594  __kmp_global.g.g_abort = -1;
5595  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5596  KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5597  return;
5598  } else {
5599  KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5600  __kmp_unregister_root_current_thread( gtid );
5601  }
5602  } else {
5603  /* worker threads may call this function through the atexit handler, if they call exit() */
5604  /* For now, skip the usual subsequent processing and just dump the debug buffer.
5605  TODO: do a thorough shutdown instead
5606  */
5607  #ifdef DUMP_DEBUG_ON_EXIT
5608  if ( __kmp_debug_buf )
5609  __kmp_dump_debug_buffer( );
5610  #endif
5611  return;
5612  }
5613  }
5614  /* synchronize the termination process */
5615  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5616 
5617  /* have we already finished */
5618  if( __kmp_global.g.g_abort ) {
5619  KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
5620  /* TODO abort? */
5621  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5622  return;
5623  }
5624  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5625  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5626  return;
5627  }
5628 
5629  /* We need this lock to enforce mutex between this reading of
5630  __kmp_threads_capacity and the writing by __kmp_register_root.
5631  Alternatively, we can use a counter of roots that is
5632  atomically updated by __kmp_get_global_thread_id_reg,
5633  __kmp_do_serial_initialize and __kmp_internal_end_*.
5634  */
5635  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
5636 
5637  /* now we can safely conduct the actual termination */
5638  __kmp_internal_end();
5639 
5640  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5641  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5642 
5643  KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
5644 
5645  #ifdef DUMP_DEBUG_ON_EXIT
5646  if ( __kmp_debug_buf )
5647  __kmp_dump_debug_buffer();
5648  #endif
5649 
5650  #if KMP_OS_WINDOWS
5651  __kmp_close_console();
5652  #endif
5653 
5654  __kmp_fini_allocator();
5655 
5656 } // __kmp_internal_end_library
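/*
   Both __kmp_internal_end_library and __kmp_internal_end_thread use the same guard
   structure: cheap unlocked checks of the abort/done flags, then take the
   initialization lock and re-check before running the real termination.
   Illustrative sketch of that double-checked shutdown with pthread primitives and
   hypothetical flags; the real code uses bootstrap locks and two flags.
*/
#if 0 /* illustrative sketch only -- not compiled */
#include <pthread.h>

static pthread_mutex_t initz_lock = PTHREAD_MUTEX_INITIALIZER;
static volatile int    lib_done   = 0;

static void
end_library( void )
{
    if ( lib_done )                 /* fast path: someone already finished */
        return;

    pthread_mutex_lock( &initz_lock );
    if ( lib_done ) {               /* re-check now that we hold the lock  */
        pthread_mutex_unlock( &initz_lock );
        return;
    }
    /* ... actual termination work goes here ... */
    lib_done = 1;
    pthread_mutex_unlock( &initz_lock );
}
#endif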
5657 
5658 void
5659 __kmp_internal_end_thread( int gtid_req )
5660 {
5661  int i;
5662 
5663  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5664  /* this shouldn't be a race condition because __kmp_internal_end() is the
5665  * only place to clear __kmp_serial_init */
5666  /* we'll check this later too, after we get the lock */
5667  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5668  // because the next check will work in any case.
5669  if( __kmp_global.g.g_abort ) {
5670  KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
5671  /* TODO abort? */
5672  return;
5673  }
5674  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5675  KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
5676  return;
5677  }
5678 
5679  KMP_MB(); /* Flush all pending memory write invalidates. */
5680 
5681  /* find out who we are and what we should do */
5682  {
5683  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5684  KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req ));
5685  if( gtid == KMP_GTID_SHUTDOWN ) {
5686  KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
5687  return;
5688  } else if( gtid == KMP_GTID_MONITOR ) {
5689  KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
5690  return;
5691  } else if( gtid == KMP_GTID_DNE ) {
5692  KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
5693  return;
5694  /* we don't know who we are */
5695  } else if( KMP_UBER_GTID( gtid )) {
5696  /* unregister ourselves as an uber thread. gtid is no longer valid */
5697  if( __kmp_root[gtid]->r.r_active ) {
5698  __kmp_global.g.g_abort = -1;
5699  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5700  KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
5701  return;
5702  } else {
5703  KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
5704  __kmp_unregister_root_current_thread( gtid );
5705  }
5706  } else {
5707  /* just a worker thread, let's leave */
5708  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
5709 
5710  if ( gtid >= 0 ) {
5711  kmp_info_t *this_thr = __kmp_threads[ gtid ];
5712  if (TCR_PTR(this_thr->th.th_task_team) != NULL) {
5713  __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
5714  }
5715  }
5716 
5717  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
5718  return;
5719  }
5720  }
5721  #if defined KMP_DYNAMIC_LIB
5722  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
5723  // because it is better to shut down later, in the library destructor.
5724  // The reason for this change is a performance problem seen when a non-OpenMP thread
5725  // in a loop forks and joins many OpenMP threads. We can save a lot of time by
5726  // keeping worker threads alive until program shutdown.
5727  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
5728  // Windows(DPD200287443) that occurs when using critical sections from foreign threads.
5729  KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
5730  return;
5731  #endif
5732  /* synchronize the termination process */
5733  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5734 
5735  /* have we already finished */
5736  if( __kmp_global.g.g_abort ) {
5737  KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
5738  /* TODO abort? */
5739  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5740  return;
5741  }
5742  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5743  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5744  return;
5745  }
5746 
5747  /* We need this lock to enforce mutex between this reading of
5748  __kmp_threads_capacity and the writing by __kmp_register_root.
5749  Alternatively, we can use a counter of roots that is
5750  atomically updated by __kmp_get_global_thread_id_reg,
5751  __kmp_do_serial_initialize and __kmp_internal_end_*.
5752  */
5753 
5754  /* should we finish the run-time? are all siblings done? */
5755  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
5756 
5757  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5758  if ( KMP_UBER_GTID( i ) ) {
5759  KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
5760  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5761  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5762  return;
5763  };
5764  }
5765 
5766  /* now we can safely conduct the actual termination */
5767 
5768  __kmp_internal_end();
5769 
5770  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5771  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5772 
5773  KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
5774 
5775  #ifdef DUMP_DEBUG_ON_EXIT
5776  if ( __kmp_debug_buf )
5777  __kmp_dump_debug_buffer();
5778  #endif
5779 } // __kmp_internal_end_thread
5780 
5781 // -------------------------------------------------------------------------------------------------
5782 // Library registration stuff.
5783 
5784 static long __kmp_registration_flag = 0;
5785  // Random value used to indicate library initialization.
5786 static char * __kmp_registration_str = NULL;
5787  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
5788 
5789 
5790 static inline
5791 char *
5792 __kmp_reg_status_name() {
5793  /*
5794  On RHEL 3u5, if linked statically, getpid() returns different values in each thread.
5795  If registration and unregistration happen in different threads (omp_misc_other_root_exit.cpp test case),
5796  the registered_lib_env env var cannot be found, because its name will contain a different pid.
5797  */
5798  return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
5799 } // __kmp_reg_status_name
5800 
5801 
5802 void
5803 __kmp_register_library_startup(
5804  void
5805 ) {
5806 
5807  char * name = __kmp_reg_status_name(); // Name of the environment variable.
5808  int done = 0;
5809  union {
5810  double dtime;
5811  long ltime;
5812  } time;
5813  #if KMP_OS_WINDOWS
5814  __kmp_initialize_system_tick();
5815  #endif
5816  __kmp_read_system_time( & time.dtime );
5817  __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
5818  __kmp_registration_str =
5819  __kmp_str_format(
5820  "%p-%lx-%s",
5821  & __kmp_registration_flag,
5822  __kmp_registration_flag,
5823  KMP_LIBRARY_FILE
5824  );
5825 
5826  KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
5827 
5828  while ( ! done ) {
5829 
5830  char * value = NULL; // Actual value of the environment variable.
5831 
5832  // Set the environment variable, but do not overwrite it if it already exists.
5833  __kmp_env_set( name, __kmp_registration_str, 0 );
5834  // Check that the variable was written.
5835  value = __kmp_env_get( name );
5836  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
5837 
5838  done = 1; // Ok, environment variable set successfully, exit the loop.
5839 
5840  } else {
5841 
5842  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
5843  // Check whether it is alive or dead.
5844  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
5845  char * tail = value;
5846  char * flag_addr_str = NULL;
5847  char * flag_val_str = NULL;
5848  char const * file_name = NULL;
5849  __kmp_str_split( tail, '-', & flag_addr_str, & tail );
5850  __kmp_str_split( tail, '-', & flag_val_str, & tail );
5851  file_name = tail;
5852  if ( tail != NULL ) {
5853  long * flag_addr = 0;
5854  long flag_val = 0;
5855  KMP_SSCANF( flag_addr_str, "%p", & flag_addr );
5856  KMP_SSCANF( flag_val_str, "%lx", & flag_val );
5857  if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
5858  // First, check whether environment-encoded address is mapped into addr space.
5859  // If so, dereference it to see if it still has the right value.
5860 
5861  if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
5862  neighbor = 1;
5863  } else {
5864  // If not, then we know the other copy of the library is no longer running.
5865  neighbor = 2;
5866  }; // if
5867  }; // if
5868  }; // if
5869  switch ( neighbor ) {
5870  case 0 : // Cannot parse environment variable -- neighbor status unknown.
5871  // Assume it is the incompatible format of a future version of the library.
5872  // Assume the other library is alive.
5873  // WARN( ... ); // TODO: Issue a warning.
5874  file_name = "unknown library";
5875  // Attention! Falling through to the next case is intentional.
5876  case 1 : { // Neighbor is alive.
5877  // Check it is allowed.
5878  char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
5879  if ( ! __kmp_str_match_true( duplicate_ok ) ) {
5880  // That's not allowed. Issue fatal error.
5881  __kmp_msg(
5882  kmp_ms_fatal,
5883  KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
5884  KMP_HNT( DuplicateLibrary ),
5885  __kmp_msg_null
5886  );
5887  }; // if
5888  KMP_INTERNAL_FREE( duplicate_ok );
5889  __kmp_duplicate_library_ok = 1;
5890  done = 1; // Exit the loop.
5891  } break;
5892  case 2 : { // Neighbor is dead.
5893  // Clear the variable and try to register library again.
5894  __kmp_env_unset( name );
5895  } break;
5896  default : {
5897  KMP_DEBUG_ASSERT( 0 );
5898  } break;
5899  }; // switch
5900 
5901  }; // if
5902  KMP_INTERNAL_FREE( (void *) value );
5903 
5904  }; // while
5905  KMP_INTERNAL_FREE( (void *) name );
5906 
5907 } // func __kmp_register_library_startup
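/*
   Sketch of the registration handshake above: publish a "%p-%lx-%s" cookie in an
   environment variable without overwriting, read it back, and if someone else's
   cookie is there, parse it to decide whether that other copy of the library is
   still alive. All names are hypothetical; the is_address_mapped() probe is
   platform specific and only declared here, and the real code mixes the system
   time (not the pid) into the flag value.
*/
#if 0 /* illustrative sketch only -- not compiled */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static unsigned long registration_flag;       /* lives at a per-instance address  */

extern int is_address_mapped( void *p );      /* platform-specific probe (assumed) */

static int /* returns 1 if we registered, 0 if a live neighbor already did */
register_library( const char *env_name, const char *lib_file )
{
    char mine[256];
    registration_flag = 0xCAFE0000UL | ( (unsigned long) getpid() & 0xFFFFUL );
    snprintf( mine, sizeof( mine ), "%p-%lx-%s",
              (void *) &registration_flag, registration_flag, lib_file );

    for ( ;; ) {
        setenv( env_name, mine, 0 );               /* do not overwrite          */
        const char *value = getenv( env_name );
        if ( value != NULL && strcmp( value, mine ) == 0 )
            return 1;                              /* our cookie won            */

        void *flag_addr = NULL;  unsigned long flag_val = 0;
        if ( value != NULL &&
             sscanf( value, "%p-%lx", &flag_addr, &flag_val ) == 2 &&
             is_address_mapped( flag_addr ) &&
             *(unsigned long *) flag_addr == flag_val )
            return 0;                              /* neighbor is alive         */

        unsetenv( env_name );                      /* neighbor is dead: retry   */
    }
}
#endif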
5908 
5909 
5910 void
5911 __kmp_unregister_library( void ) {
5912 
5913  char * name = __kmp_reg_status_name();
5914  char * value = __kmp_env_get( name );
5915 
5916  KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
5917  KMP_DEBUG_ASSERT( __kmp_registration_str != NULL );
5918  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
5919  // Ok, this is our variable. Delete it.
5920  __kmp_env_unset( name );
5921  }; // if
5922 
5923  KMP_INTERNAL_FREE( __kmp_registration_str );
5924  KMP_INTERNAL_FREE( value );
5925  KMP_INTERNAL_FREE( name );
5926 
5927  __kmp_registration_flag = 0;
5928  __kmp_registration_str = NULL;
5929 
5930 } // __kmp_unregister_library
5931 
5932 
5933 // End of Library registration stuff.
5934 // -------------------------------------------------------------------------------------------------
5935 
5936 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
5937 
5938 static void __kmp_check_mic_type()
5939 {
5940  kmp_cpuid_t cpuid_state = {0};
5941  kmp_cpuid_t * cs_p = &cpuid_state;
5942  cs_p->eax=1;
5943  cs_p->ecx=0;
5944  __asm__ __volatile__("cpuid"
5945  : "+a" (cs_p->eax), "=b" (cs_p->ebx), "+c" (cs_p->ecx), "=d" (cs_p->edx));
5946  // We don't support mic1 at the moment
5947  if( (cs_p->eax & 0xff0) == 0xB10 ) {
5948  __kmp_mic_type = mic2;
5949  } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
5950  __kmp_mic_type = mic3;
5951  } else {
5952  __kmp_mic_type = non_mic;
5953  }
5954 }
5955 
5956 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
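/*
   __kmp_check_mic_type above issues CPUID leaf 1 and keys off the family/model bits
   returned in EAX. Illustrative sketch of the same query using the GCC/Clang
   <cpuid.h> helper instead of raw inline assembly; the masks mirror the ones above,
   and the return values follow the runtime's mic2/mic3/non_mic classification.
*/
#if 0 /* illustrative sketch only -- not compiled */
#include <cpuid.h>

static int /* 0 = non_mic, 2 = mic2, 3 = mic3 (per the masks above) */
check_mic_type( void )
{
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    if ( ! __get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
        return 0;
    if ( ( eax & 0xff0 )   == 0xB10 )   return 2;   /* matches the mic2 test above */
    if ( ( eax & 0xf0ff0 ) == 0x50670 ) return 3;   /* matches the mic3 test above */
    return 0;
}
#endif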
5957 
5958 static void
5959 __kmp_do_serial_initialize( void )
5960 {
5961  int i, gtid;
5962  int size;
5963 
5964  KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
5965 
5966  KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
5967  KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
5968  KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
5969  KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
5970  KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
5971 
5972  __kmp_validate_locks();
5973 
5974  /* Initialize internal memory allocator */
5975  __kmp_init_allocator();
5976 
5977  /* Register the library startup via an environment variable
5978  and check to see whether another copy of the library is already
5979  registered. */
5980 
5981  __kmp_register_library_startup( );
5982 
5983  /* TODO reinitialization of library */
5984  if( TCR_4(__kmp_global.g.g_done) ) {
5985  KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
5986  }
5987 
5988  __kmp_global.g.g_abort = 0;
5989  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
5990 
5991  /* initialize the locks */
5992 #if KMP_USE_ADAPTIVE_LOCKS
5993 #if KMP_DEBUG_ADAPTIVE_LOCKS
5994  __kmp_init_speculative_stats();
5995 #endif
5996 #endif
5997  __kmp_init_lock( & __kmp_global_lock );
5998  __kmp_init_queuing_lock( & __kmp_dispatch_lock );
5999  __kmp_init_lock( & __kmp_debug_lock );
6000  __kmp_init_atomic_lock( & __kmp_atomic_lock );
6001  __kmp_init_atomic_lock( & __kmp_atomic_lock_1i );
6002  __kmp_init_atomic_lock( & __kmp_atomic_lock_2i );
6003  __kmp_init_atomic_lock( & __kmp_atomic_lock_4i );
6004  __kmp_init_atomic_lock( & __kmp_atomic_lock_4r );
6005  __kmp_init_atomic_lock( & __kmp_atomic_lock_8i );
6006  __kmp_init_atomic_lock( & __kmp_atomic_lock_8r );
6007  __kmp_init_atomic_lock( & __kmp_atomic_lock_8c );
6008  __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6009  __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6010  __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6011  __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6012  __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6013  __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock );
6014  __kmp_init_bootstrap_lock( & __kmp_exit_lock );
6015  __kmp_init_bootstrap_lock( & __kmp_monitor_lock );
6016  __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6017 
6018  /* conduct initialization and initial setup of configuration */
6019 
6020  __kmp_runtime_initialize();
6021 
6022 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6023  __kmp_check_mic_type();
6024 #endif
6025 
6026  // Some global variable initialization moved here from kmp_env_initialize()
6027 #ifdef KMP_DEBUG
6028  kmp_diag = 0;
6029 #endif
6030  __kmp_abort_delay = 0;
6031 
6032  // From __kmp_init_dflt_team_nth()
6033  /* assume the entire machine will be used */
6034  __kmp_dflt_team_nth_ub = __kmp_xproc;
6035  if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6036  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6037  }
6038  if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6039  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6040  }
6041  __kmp_max_nth = __kmp_sys_max_nth;
6042 
6043  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
6044  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6045  __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6046  __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6047  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6048  __kmp_library = library_throughput;
6049  // From KMP_SCHEDULE initialization
6050  __kmp_static = kmp_sch_static_balanced;
6051  // AC: do not use analytical here, because it is non-monotonic
6052  //__kmp_guided = kmp_sch_guided_iterative_chunked;
6053  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
6054  // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch bit control and barrier method
6055  // control parts
6056  #if KMP_FAST_REDUCTION_BARRIER
6057  #define kmp_reduction_barrier_gather_bb ((int)1)
6058  #define kmp_reduction_barrier_release_bb ((int)1)
6059  #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6060  #define kmp_reduction_barrier_release_pat bp_hyper_bar
6061  #endif // KMP_FAST_REDUCTION_BARRIER
6062  for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6063  __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6064  __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6065  __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6066  __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6067  #if KMP_FAST_REDUCTION_BARRIER
6068  if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6069  __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6070  __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6071  __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6072  __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6073  }
6074  #endif // KMP_FAST_REDUCTION_BARRIER
6075  }
6076  #if KMP_FAST_REDUCTION_BARRIER
6077  #undef kmp_reduction_barrier_release_pat
6078  #undef kmp_reduction_barrier_gather_pat
6079  #undef kmp_reduction_barrier_release_bb
6080  #undef kmp_reduction_barrier_gather_bb
6081  #endif // KMP_FAST_REDUCTION_BARRIER
6082 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6083  if( __kmp_mic_type != non_mic ) {
6084  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6085  __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3; // plain gather
6086  __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1; // forkjoin release
6087  __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6088  __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6089  }
6090 #if KMP_FAST_REDUCTION_BARRIER
6091  if( __kmp_mic_type != non_mic ) {
6092  __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6093  __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6094  }
6095 #endif
6096 #endif
6097 
6098  // From KMP_CHECKS initialization
6099 #ifdef KMP_DEBUG
6100  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6101 #else
6102  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6103 #endif
6104 
6105  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6106  __kmp_foreign_tp = TRUE;
6107 
6108  __kmp_global.g.g_dynamic = FALSE;
6109  __kmp_global.g.g_dynamic_mode = dynamic_default;
6110 
6111  __kmp_env_initialize( NULL );
6112 
6113  // Print all messages in message catalog for testing purposes.
6114  #ifdef KMP_DEBUG
6115  char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6116  if ( __kmp_str_match_true( val ) ) {
6117  kmp_str_buf_t buffer;
6118  __kmp_str_buf_init( & buffer );
6119  __kmp_i18n_dump_catalog( & buffer );
6120  __kmp_printf( "%s", buffer.str );
6121  __kmp_str_buf_free( & buffer );
6122  }; // if
6123  __kmp_env_free( & val );
6124  #endif
6125 
6126  __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6127  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6128  __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6129 
6130 
6131  // If the library is shut down properly, both pools must be NULL. Just in case, set them
6132  // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
6133  KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6134  KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6135  KMP_DEBUG_ASSERT( __kmp_team_pool == NULL );
6136  __kmp_thread_pool = NULL;
6137  __kmp_thread_pool_insert_pt = NULL;
6138  __kmp_team_pool = NULL;
6139 
6140  /* Allocate all of the variable sized records */
6141  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6142  /* Since allocation is cache-aligned, just add extra padding at the end */
6143  size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6144  __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6145  __kmp_root = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
6146 
6147  /* init thread counts */
6148  KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
6149  KMP_DEBUG_ASSERT( __kmp_nth == 0 ); // something went wrong during termination.
6150  __kmp_all_nth = 0;
6151  __kmp_nth = 0;
6152 
6153  /* setup the uber master thread and hierarchy */
6154  gtid = __kmp_register_root( TRUE );
6155  KA_TRACE( 10, ("__kmp_do_serial_initialize T#%d\n", gtid ));
6156  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6157  KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6158 
6159  KMP_MB(); /* Flush all pending memory write invalidates. */
6160 
6161  __kmp_common_initialize();
6162 
6163  #if KMP_OS_UNIX
6164  /* invoke the child fork handler */
6165  __kmp_register_atfork();
6166  #endif
6167 
6168  #if ! defined KMP_DYNAMIC_LIB
6169  {
6170  /* Invoke the exit handler when the program finishes, only for static library.
6171  For dynamic library, we already have _fini and DllMain.
6172  */
6173  int rc = atexit( __kmp_internal_end_atexit );
6174  if ( rc != 0 ) {
6175  __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6176  }; // if
6177  }
6178  #endif
6179 
6180  #if KMP_HANDLE_SIGNALS
6181  #if KMP_OS_UNIX
6182  /* NOTE: make sure that this is called before the user installs
6183  * their own signal handlers so that the user handlers
6184  * are called first. This way they can return false,
6185  * not call our handler, avoid terminating the library,
6186  * and continue execution where they left off. */
6187  __kmp_install_signals( FALSE );
6188  #endif /* KMP_OS_UNIX */
6189  #if KMP_OS_WINDOWS
6190  __kmp_install_signals( TRUE );
6191  #endif /* KMP_OS_WINDOWS */
6192  #endif
6193 
6194  /* we have finished the serial initialization */
6195  __kmp_init_counter ++;
6196 
6197  __kmp_init_serial = TRUE;
6198 
6199  if (__kmp_settings) {
6200  __kmp_env_print();
6201  }
6202 
6203 #if OMP_40_ENABLED
6204  if (__kmp_display_env || __kmp_display_env_verbose) {
6205  __kmp_env_print_2();
6206  }
6207 #endif // OMP_40_ENABLED
6208 
6209  KMP_MB();
6210 
6211  KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6212 }
6213 
6214 void
6215 __kmp_serial_initialize( void )
6216 {
6217  if ( __kmp_init_serial ) {
6218  return;
6219  }
6220  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6221  if ( __kmp_init_serial ) {
6222  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6223  return;
6224  }
6225  __kmp_do_serial_initialize();
6226  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6227 }
6228 
6229 static void
6230 __kmp_do_middle_initialize( void )
6231 {
6232  int i, j;
6233  int prev_dflt_team_nth;
6234 
6235  if( !__kmp_init_serial ) {
6236  __kmp_do_serial_initialize();
6237  }
6238 
6239  KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
6240 
6241  //
6242  // Save the previous value for the __kmp_dflt_team_nth so that
6243  // we can avoid some reinitialization if it hasn't changed.
6244  //
6245  prev_dflt_team_nth = __kmp_dflt_team_nth;
6246 
6247 #if KMP_AFFINITY_SUPPORTED
6248  //
6249  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6250  // number of cores on the machine.
6251  //
6252  __kmp_affinity_initialize();
6253 
6254  //
6255  // Run through the __kmp_threads array and set the affinity mask
6256  // for each root thread that is currently registered with the RTL.
6257  //
6258  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6259  if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6260  __kmp_affinity_set_init_mask( i, TRUE );
6261  }
6262  }
6263 #endif /* KMP_AFFINITY_SUPPORTED */
6264 
6265  KMP_ASSERT( __kmp_xproc > 0 );
6266  if ( __kmp_avail_proc == 0 ) {
6267  __kmp_avail_proc = __kmp_xproc;
6268  }
6269 
6270  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6271  j = 0;
6272  while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6273  __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6274  j++;
6275  }
6276 
6277  if ( __kmp_dflt_team_nth == 0 ) {
6278 #ifdef KMP_DFLT_NTH_CORES
6279  //
6280  // Default #threads = #cores
6281  //
6282  __kmp_dflt_team_nth = __kmp_ncores;
6283  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6284  __kmp_dflt_team_nth ) );
6285 #else
6286  //
6287  // Default #threads = #available OS procs
6288  //
6289  __kmp_dflt_team_nth = __kmp_avail_proc;
6290  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6291  __kmp_dflt_team_nth ) );
6292 #endif /* KMP_DFLT_NTH_CORES */
6293  }
6294 
6295  if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6296  __kmp_dflt_team_nth = KMP_MIN_NTH;
6297  }
6298  if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6299  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6300  }
6301 
6302  //
6303  // There's no harm in continuing if the following check fails,
6304  // but it indicates an error in the previous logic.
6305  //
6306  KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6307 
6308  if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6309  //
6310  // Run through the __kmp_threads array and set the num threads icv
6311  // for each root thread that is currently registered with the RTL
6312  // (which has not already explicitly set its nthreads-var with a
6313  // call to omp_set_num_threads()).
6314  //
6315  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6316  kmp_info_t *thread = __kmp_threads[ i ];
6317  if ( thread == NULL ) continue;
6318  if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6319 
6320  set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6321  }
6322  }
6323  KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6324  __kmp_dflt_team_nth) );
6325 
6326 #ifdef KMP_ADJUST_BLOCKTIME
6327  /* Adjust blocktime to zero if necessary */
6328  /* now that __kmp_avail_proc is set */
6329  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6330  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6331  if ( __kmp_nth > __kmp_avail_proc ) {
6332  __kmp_zero_bt = TRUE;
6333  }
6334  }
6335 #endif /* KMP_ADJUST_BLOCKTIME */
6336 
6337  /* we have finished middle initialization */
6338  TCW_SYNC_4(__kmp_init_middle, TRUE);
6339 
6340  KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6341 }
6342 
6343 void
6344 __kmp_middle_initialize( void )
6345 {
6346  if ( __kmp_init_middle ) {
6347  return;
6348  }
6349  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6350  if ( __kmp_init_middle ) {
6351  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6352  return;
6353  }
6354  __kmp_do_middle_initialize();
6355  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6356 }
6357 
6358 void
6359 __kmp_parallel_initialize( void )
6360 {
6361  int gtid = __kmp_entry_gtid(); // this might be a new root
6362 
6363  /* synchronize parallel initialization (for siblings) */
6364  if( TCR_4(__kmp_init_parallel) ) return;
6365  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6366  if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6367 
6368  /* TODO reinitialization after we have already shut down */
6369  if( TCR_4(__kmp_global.g.g_done) ) {
6370  KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6371  __kmp_infinite_loop();
6372  }
6373 
6374  /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
6375  would cause a deadlock. So we call __kmp_do_serial_initialize directly.
6376  */
6377  if( !__kmp_init_middle ) {
6378  __kmp_do_middle_initialize();
6379  }
6380 
6381  /* begin initialization */
6382  KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6383  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6384 
6385 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6386  //
6387  // Save the FP control regs.
6388  // Worker threads will set theirs to these values at thread startup.
6389  //
6390  __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6391  __kmp_store_mxcsr( &__kmp_init_mxcsr );
6392  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6393 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6394 
6395 #if KMP_OS_UNIX
6396 # if KMP_HANDLE_SIGNALS
6397  /* must be after __kmp_serial_initialize */
6398  __kmp_install_signals( TRUE );
6399 # endif
6400 #endif
6401 
6402  __kmp_suspend_initialize();
6403 
6404 # if defined(USE_LOAD_BALANCE)
6405  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6406  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6407  }
6408 #else
6409  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6410  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6411  }
6412 #endif
6413 
6414  if ( __kmp_version ) {
6415  __kmp_print_version_2();
6416  }
6417 
6418  /* we have finished parallel initialization */
6419  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6420 
6421  KMP_MB();
6422  KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6423 
6424  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6425 }
6426 
6427 
6428 /* ------------------------------------------------------------------------ */
6429 
6430 void
6431 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6432  kmp_team_t *team )
6433 {
6434  kmp_disp_t *dispatch;
6435 
6436  KMP_MB();
6437 
6438  /* none of the threads have encountered any constructs, yet. */
6439  this_thr->th.th_local.this_construct = 0;
6440 #if KMP_CACHE_MANAGE
6441  KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6442 #endif /* KMP_CACHE_MANAGE */
6443  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6444  KMP_DEBUG_ASSERT( dispatch );
6445  KMP_DEBUG_ASSERT( team->t.t_dispatch );
6446  //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6447 
6448  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6449 
6450  if( __kmp_env_consistency_check )
6451  __kmp_push_parallel( gtid, team->t.t_ident );
6452 
6453  KMP_MB(); /* Flush all pending memory write invalidates. */
6454 }
6455 
6456 void
6457 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6458  kmp_team_t *team )
6459 {
6460  if( __kmp_env_consistency_check )
6461  __kmp_pop_parallel( gtid, team->t.t_ident );
6462 }
6463 
6464 int
6465 __kmp_invoke_task_func( int gtid )
6466 {
6467  int rc;
6468  int tid = __kmp_tid_from_gtid( gtid );
6469  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6470  kmp_team_t *team = this_thr->th.th_team;
6471 
6472  __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6473 #if USE_ITT_BUILD
6474  if ( __itt_stack_caller_create_ptr ) {
6475  __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6476  }
6477 #endif /* USE_ITT_BUILD */
6478 #if INCLUDE_SSC_MARKS
6479  SSC_MARK_INVOKING();
6480 #endif
6481  rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6482  gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv );
6483 
6484 #if USE_ITT_BUILD
6485  if ( __itt_stack_caller_create_ptr ) {
6486  __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6487  }
6488 #endif /* USE_ITT_BUILD */
6489  __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6490 
6491  return rc;
6492 }
6493 
6494 #if OMP_40_ENABLED
6495 void
6496 __kmp_teams_master( int gtid )
6497 {
6498  // This routine is called by all master threads in the teams construct
6499  kmp_info_t *thr = __kmp_threads[ gtid ];
6500  kmp_team_t *team = thr->th.th_team;
6501  ident_t *loc = team->t.t_ident;
6502  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6503  KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6504  KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6505  KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6506  gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
6507  // Launch the league of teams now, but do not let workers execute
6508  // (they hang on the fork barrier until the next parallel region)
6509 #if INCLUDE_SSC_MARKS
6510  SSC_MARK_FORKING();
6511 #endif
6512  __kmp_fork_call( loc, gtid, fork_context_intel,
6513  team->t.t_argc,
6514  (microtask_t)thr->th.th_teams_microtask,
6515  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6516  NULL );
6517 #if INCLUDE_SSC_MARKS
6518  SSC_MARK_JOINING();
6519 #endif
6520  __kmp_join_call( loc, gtid, 1 ); // AC: last parameter "1" eliminates join barrier which won't work because
6521  // worker threads are in a fork barrier waiting for more parallel regions
6522 }
6523 
6524 int
6525 __kmp_invoke_teams_master( int gtid )
6526 {
6527  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6528  kmp_team_t *team = this_thr->th.th_team;
6529  #if KMP_DEBUG
6530  if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
6531  KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6532  #endif
6533  __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6534  __kmp_teams_master( gtid );
6535  __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6536  return 1;
6537 }
6538 #endif /* OMP_40_ENABLED */
6539 
6540 /* this sets the requested number of threads for the next parallel region
6541  * encountered by this team */
6542 /* since this should be enclosed in the forkjoin critical section it
6543  * should avoid race conditions with asymmetrical nested parallelism */
6544 
6545 void
6546 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6547 {
6548  kmp_info_t *thr = __kmp_threads[gtid];
6549 
6550  if( num_threads > 0 )
6551  thr->th.th_set_nproc = num_threads;
6552 }
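/* Editor's note: illustrative sketch only, not part of the original source.
   A num_threads clause is what normally drives this path: the compiler emits a
   call to __kmpc_push_num_threads( loc, gtid, n ) just before __kmpc_fork_call,
   which lands here and records the request in th_set_nproc for the next fork. */
#if 0
#include <omp.h>
void example_num_threads( void )
{
    /* requests 4 threads for this one region only; the nthreads-var ICV
       set via omp_set_num_threads() is left untouched */
    #pragma omp parallel num_threads(4)
    {
        /* ... */
    }
}
#endif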
6553 
6554 #if OMP_40_ENABLED
6555 
6556 /* this sets the requested number of teams for the teams region and/or
6557  * the number of threads for the next parallel region encountered */
6558 void
6559 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
6560 {
6561  kmp_info_t *thr = __kmp_threads[gtid];
6562  KMP_DEBUG_ASSERT(num_teams >= 0);
6563  KMP_DEBUG_ASSERT(num_threads >= 0);
6564  if( num_teams == 0 ) {
6565  num_teams = 1; // default number of teams is 1.
6566  }
6567  // Set number of teams (number of threads in the outer "parallel" of the teams)
6568  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
6569 
6570  // Remember the number of threads for inner parallel regions
6571  if( num_threads > 0 ) {
6572  thr->th.th_teams_size.nth = num_threads;
6573  } else {
6574  if( !TCR_4(__kmp_init_middle) )
6575  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
6576  thr->th.th_teams_size.nth = __kmp_avail_proc / num_teams;
6577  }
6578 }
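/* Editor's note: illustrative sketch only, not part of the original source.
   Under the logic above, with __kmp_avail_proc == 64 a construct such as the
   one below yields th_teams_size.nteams == 4 and, since no thread_limit is
   given, th_teams_size.nth == 64 / 4 == 16 threads per team (integer division). */
#if 0
#include <omp.h>
void example_teams( void )
{
    #pragma omp teams num_teams(4)      /* no thread_limit clause            */
    {
        /* each team's master later forks its own parallel region, handled by
           __kmp_teams_master() above                                         */
    }
}
#endif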
6579 
6580 
6581 //
6582 // Set the proc_bind var to use in the following parallel region.
6583 //
6584 void
6585 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
6586 {
6587  kmp_info_t *thr = __kmp_threads[gtid];
6588  thr->th.th_set_proc_bind = proc_bind;
6589 }
6590 
6591 #endif /* OMP_40_ENABLED */
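/* Editor's note: illustrative sketch only, not part of the original source.
   th_set_proc_bind is a one-shot request consumed by the next fork, typically
   set from the compiler-generated __kmpc_push_proc_bind call for a proc_bind
   clause, e.g.: */
#if 0
#include <omp.h>
void example_proc_bind( void )
{
    #pragma omp parallel proc_bind(close) num_threads(8)
    {
        /* threads are placed close to the master's place partition */
    }
}
#endif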
6592 
6593 /* Launch the worker threads into the microtask. */
6594 
6595 void
6596 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
6597 {
6598  kmp_info_t *this_thr = __kmp_threads[gtid];
6599 
6600 #ifdef KMP_DEBUG
6601  int f;
6602 #endif /* KMP_DEBUG */
6603 
6604  KMP_DEBUG_ASSERT( team );
6605  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
6606  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
6607  KMP_MB(); /* Flush all pending memory write invalidates. */
6608 
6609  team->t.t_construct = 0; /* no single directives seen yet */
6610  team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
6611 
6612  /* Reset the identifiers on the dispatch buffer */
6613  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
6614  if ( team->t.t_max_nproc > 1 ) {
6615  int i;
6616  for (i = 0; i < KMP_MAX_DISP_BUF; ++i)
6617  team->t.t_disp_buffer[ i ].buffer_index = i;
6618  } else {
6619  team->t.t_disp_buffer[ 0 ].buffer_index = 0;
6620  }
6621 
6622  KMP_MB(); /* Flush all pending memory write invalidates. */
6623  KMP_ASSERT( this_thr->th.th_team == team );
6624 
6625 #ifdef KMP_DEBUG
6626  for( f=0 ; f<team->t.t_nproc ; f++ ) {
6627  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
6628  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
6629  }
6630 #endif /* KMP_DEBUG */
6631 
6632  /* release the worker threads so they may begin working */
6633  __kmp_fork_barrier( gtid, 0 );
6634 }
6635 
6636 
6637 void
6638 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
6639 {
6640  kmp_info_t *this_thr = __kmp_threads[gtid];
6641 
6642  KMP_DEBUG_ASSERT( team );
6643  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
6644  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
6645  KMP_MB(); /* Flush all pending memory write invalidates. */
6646 
6647  /* Join barrier after fork */
6648 
6649 #ifdef KMP_DEBUG
6650  if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
6651  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
6652  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
6653  gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
6654  __kmp_print_structure();
6655  }
6656  KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
6657  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
6658 #endif /* KMP_DEBUG */
6659 
6660  __kmp_join_barrier( gtid ); /* wait for everyone */
6661 
6662  KMP_MB(); /* Flush all pending memory write invalidates. */
6663  KMP_ASSERT( this_thr->th.th_team == team );
6664 }
6665 
6666 
6667 /* ------------------------------------------------------------------------ */
6668 /* ------------------------------------------------------------------------ */
6669 
6670 #ifdef USE_LOAD_BALANCE
6671 
6672 //
6673  // Return the number of worker threads actively spinning in the hot team, if we
6674 // are at the outermost level of parallelism. Otherwise, return 0.
6675 //
6676 static int
6677 __kmp_active_hot_team_nproc( kmp_root_t *root )
6678 {
6679  int i;
6680  int retval;
6681  kmp_team_t *hot_team;
6682 
6683  if ( root->r.r_active ) {
6684  return 0;
6685  }
6686  hot_team = root->r.r_hot_team;
6687  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
6688  return hot_team->t.t_nproc - 1; // Don't count master thread
6689  }
6690 
6691  //
6692  // Skip the master thread - it is accounted for elsewhere.
6693  //
6694  retval = 0;
6695  for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
6696  if ( hot_team->t.t_threads[i]->th.th_active ) {
6697  retval++;
6698  }
6699  }
6700  return retval;
6701 }
6702 
6703 //
6704 // Perform an automatic adjustment to the number of
6705 // threads used by the next parallel region.
6706 //
6707 static int
6708 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
6709 {
6710  int retval;
6711  int pool_active;
6712  int hot_team_active;
6713  int team_curr_active;
6714  int system_active;
6715 
6716  KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
6717  root, set_nproc ) );
6718  KMP_DEBUG_ASSERT( root );
6719  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
6720  KMP_DEBUG_ASSERT( set_nproc > 1 );
6721 
6722  if ( set_nproc == 1) {
6723  KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
6724  return 1;
6725  }
6726 
6727  //
6728  // Threads that are active in the thread pool, active in the hot team
6729  // for this particular root (if we are at the outer par level), and
6730  // the currently executing thread (to become the master) are available
6731  // to add to the new team, but are currently contributing to the system
6732  // load, and must be accounted for.
6733  //
6734  pool_active = TCR_4(__kmp_thread_pool_active_nth);
6735  hot_team_active = __kmp_active_hot_team_nproc( root );
6736  team_curr_active = pool_active + hot_team_active + 1;
6737 
6738  //
6739  // Check the system load.
6740  //
6741  system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
6742  KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
6743  system_active, pool_active, hot_team_active ) );
6744 
6745  if ( system_active < 0 ) {
6746  //
6747  // There was an error reading the necessary info from /proc,
6748  // so use the thread limit algorithm instead. Once we set
6749  // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
6750  // we shouldn't wind up getting back here.
6751  //
6752  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6753  KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
6754 
6755  //
6756  // Make this call behave like the thread limit algorithm.
6757  //
6758  retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
6759  : root->r.r_hot_team->t.t_nproc);
6760  if ( retval > set_nproc ) {
6761  retval = set_nproc;
6762  }
6763  if ( retval < KMP_MIN_NTH ) {
6764  retval = KMP_MIN_NTH;
6765  }
6766 
6767  KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
6768  return retval;
6769  }
6770 
6771  //
6772  // There is a slight delay in the load balance algorithm in detecting
6773  // new running procs. The real system load at this instant should be
6774  // at least as large as the number of active OMP threads that are available to
6775  // add to the team.
6776  //
6777  if ( system_active < team_curr_active ) {
6778  system_active = team_curr_active;
6779  }
6780  retval = __kmp_avail_proc - system_active + team_curr_active;
6781  if ( retval > set_nproc ) {
6782  retval = set_nproc;
6783  }
6784  if ( retval < KMP_MIN_NTH ) {
6785  retval = KMP_MIN_NTH;
6786  }
6787 
6788  KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
6789  return retval;
6790 } // __kmp_load_balance_nproc()
6791 
6792 #endif /* USE_LOAD_BALANCE */
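/* Editor's note: illustrative arithmetic only, not part of the original source.
   Plugging sample numbers into the formula above shows how system load trims a
   request: */
#if 0
    int avail_proc = 16, set_nproc = 8;
    int pool_active = 2, hot_team_active = 1;
    int team_curr_active = pool_active + hot_team_active + 1;   /* == 4        */
    int system_active = 14;                                     /* load probe  */
    int retval = avail_proc - system_active + team_curr_active; /* == 6        */
    if ( retval > set_nproc ) retval = set_nproc;               /* still 6, so
                                                  a request for 8 threads is
                                                  reduced to 6 under this load */
#endif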
6793 
6794 
6795 /* ------------------------------------------------------------------------ */
6796 /* ------------------------------------------------------------------------ */
6797 
6798 /* NOTE: this is called with the __kmp_init_lock held */
6799 void
6800 __kmp_cleanup( void )
6801 {
6802  int f;
6803 
6804  KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
6805 
6806  if (TCR_4(__kmp_init_parallel)) {
6807 #if KMP_HANDLE_SIGNALS
6808  __kmp_remove_signals();
6809 #endif
6810  TCW_4(__kmp_init_parallel, FALSE);
6811  }
6812 
6813  if (TCR_4(__kmp_init_middle)) {
6814 #if KMP_AFFINITY_SUPPORTED
6815  __kmp_affinity_uninitialize();
6816 #endif /* KMP_AFFINITY_SUPPORTED */
6817  TCW_4(__kmp_init_middle, FALSE);
6818  }
6819 
6820  KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
6821 
6822  if (__kmp_init_serial) {
6823 
6824  __kmp_runtime_destroy();
6825 
6826  __kmp_init_serial = FALSE;
6827  }
6828 
6829  for ( f = 0; f < __kmp_threads_capacity; f++ ) {
6830  if ( __kmp_root[ f ] != NULL ) {
6831  __kmp_free( __kmp_root[ f ] );
6832  __kmp_root[ f ] = NULL;
6833  }
6834  }
6835  __kmp_free( __kmp_threads );
6836  // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is no need
6837  // to free __kmp_root separately.
6838  __kmp_threads = NULL;
6839  __kmp_root = NULL;
6840  __kmp_threads_capacity = 0;
6841 
6842 #if KMP_USE_DYNAMIC_LOCK
6843  __kmp_cleanup_indirect_user_locks();
6844 #else
6845  __kmp_cleanup_user_locks();
6846 #endif
6847 
6848  #if KMP_AFFINITY_SUPPORTED
6849  KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
6850  __kmp_cpuinfo_file = NULL;
6851  #endif /* KMP_AFFINITY_SUPPORTED */
6852 
6853  #if KMP_USE_ADAPTIVE_LOCKS
6854  #if KMP_DEBUG_ADAPTIVE_LOCKS
6855  __kmp_print_speculative_stats();
6856  #endif
6857  #endif
6858  KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
6859  __kmp_nested_nth.nth = NULL;
6860  __kmp_nested_nth.size = 0;
6861  __kmp_nested_nth.used = 0;
6862 
6863  __kmp_i18n_catclose();
6864 
6865 #if KMP_STATS_ENABLED
6866  __kmp_accumulate_stats_at_exit();
6867  __kmp_stats_list.deallocate();
6868 #endif
6869 
6870  KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
6871 }
6872 
6873 /* ------------------------------------------------------------------------ */
6874 /* ------------------------------------------------------------------------ */
6875 
6876 int
6877 __kmp_ignore_mppbeg( void )
6878 {
6879  char *env;
6880 
6881  if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
6882  if (__kmp_str_match_false( env ))
6883  return FALSE;
6884  }
6885  // By default __kmpc_begin() is a no-op.
6886  return TRUE;
6887 }
6888 
6889 int
6890 __kmp_ignore_mppend( void )
6891 {
6892  char *env;
6893 
6894  if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
6895  if (__kmp_str_match_false( env ))
6896  return FALSE;
6897  }
6898  // By default __kmpc_end() is a no-op.
6899  return TRUE;
6900 }
6901 
6902 void
6903 __kmp_internal_begin( void )
6904 {
6905  int gtid;
6906  kmp_root_t *root;
6907 
6908  /* this is a very important step as it will register new sibling threads
6909  * and assign each of these new uber threads a new gtid */
6910  gtid = __kmp_entry_gtid();
6911  root = __kmp_threads[ gtid ]->th.th_root;
6912  KMP_ASSERT( KMP_UBER_GTID( gtid ));
6913 
6914  if( root->r.r_begin ) return;
6915  __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
6916  if( root->r.r_begin ) {
6917  __kmp_release_lock( & root->r.r_begin_lock, gtid );
6918  return;
6919  }
6920 
6921  root->r.r_begin = TRUE;
6922 
6923  __kmp_release_lock( & root->r.r_begin_lock, gtid );
6924 }
6925 
6926 
6927 /* ------------------------------------------------------------------------ */
6928 /* ------------------------------------------------------------------------ */
6929 
6930 void
6931 __kmp_user_set_library (enum library_type arg)
6932 {
6933  int gtid;
6934  kmp_root_t *root;
6935  kmp_info_t *thread;
6936 
6937  /* first, make sure we are initialized so we can get our gtid */
6938 
6939  gtid = __kmp_entry_gtid();
6940  thread = __kmp_threads[ gtid ];
6941 
6942  root = thread->th.th_root;
6943 
6944  KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
6945  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
6946  KMP_WARNING( SetLibraryIncorrectCall );
6947  return;
6948  }
6949 
6950  switch ( arg ) {
6951  case library_serial :
6952  thread->th.th_set_nproc = 0;
6953  set__nproc( thread, 1 );
6954  break;
6955  case library_turnaround :
6956  thread->th.th_set_nproc = 0;
6957  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
6958  break;
6959  case library_throughput :
6960  thread->th.th_set_nproc = 0;
6961  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
6962  break;
6963  default:
6964  KMP_FATAL( UnknownLibraryType, arg );
6965  }
6966 
6967  __kmp_aux_set_library ( arg );
6968 }
6969 
6970 void
6971 __kmp_aux_set_stacksize( size_t arg )
6972 {
6973  if (! __kmp_init_serial)
6974  __kmp_serial_initialize();
6975 
6976 #if KMP_OS_DARWIN
6977  if (arg & (0x1000 - 1)) {
6978  arg &= ~(0x1000 - 1);
6979  if(arg + 0x1000) /* check for overflow if we round up */
6980  arg += 0x1000;
6981  }
6982 #endif
6983  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6984 
6985  /* only change the default stacksize before the first parallel region */
6986  if (! TCR_4(__kmp_init_parallel)) {
6987  size_t value = arg; /* argument is in bytes */
6988 
6989  if (value < __kmp_sys_min_stksize )
6990  value = __kmp_sys_min_stksize ;
6991  else if (value > KMP_MAX_STKSIZE)
6992  value = KMP_MAX_STKSIZE;
6993 
6994  __kmp_stksize = value;
6995 
6996  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
6997  }
6998 
6999  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7000 }
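/* Editor's note: illustrative sketch only, not part of the original source.
   On Darwin the branch above rounds a request up to the next 4 KiB page, e.g.
   arg == 0x1801 -> (arg & ~0xfff) == 0x1000 -> + 0x1000 == 0x2000; the result
   is then clamped to [__kmp_sys_min_stksize, KMP_MAX_STKSIZE] and only takes
   effect before the first parallel region. A caller-side sketch, assuming the
   documented kmp_set_stacksize_s() extension is the entry point used: */
#if 0
#include <omp.h>                             /* declares the kmp_* extensions */
void example_stacksize( void )
{
    kmp_set_stacksize_s( 4 * 1024 * 1024 );  /* request 4 MiB worker stacks   */
    #pragma omp parallel                     /* must precede the first fork   */
    { /* ... */ }
}
#endif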
7001 
7002 /* set the behaviour of the runtime library */
7003 /* TODO this can cause some odd behaviour with sibling parallelism... */
7004 void
7005 __kmp_aux_set_library (enum library_type arg)
7006 {
7007  __kmp_library = arg;
7008 
7009  switch ( __kmp_library ) {
7010  case library_serial :
7011  {
7012  KMP_INFORM( LibraryIsSerial );
7013  (void) __kmp_change_library( TRUE );
7014  }
7015  break;
7016  case library_turnaround :
7017  (void) __kmp_change_library( TRUE );
7018  break;
7019  case library_throughput :
7020  (void) __kmp_change_library( FALSE );
7021  break;
7022  default:
7023  KMP_FATAL( UnknownLibraryType, arg );
7024  }
7025 }
7026 
7027 /* ------------------------------------------------------------------------ */
7028 /* ------------------------------------------------------------------------ */
7029 
7030 void
7031 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7032 {
7033  int blocktime = arg; /* argument is in milliseconds */
7034  int bt_intervals;
7035  int bt_set;
7036 
7037  __kmp_save_internal_controls( thread );
7038 
7039  /* Normalize and set blocktime for the teams */
7040  if (blocktime < KMP_MIN_BLOCKTIME)
7041  blocktime = KMP_MIN_BLOCKTIME;
7042  else if (blocktime > KMP_MAX_BLOCKTIME)
7043  blocktime = KMP_MAX_BLOCKTIME;
7044 
7045  set__blocktime_team( thread->th.th_team, tid, blocktime );
7046  set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7047 
7048  /* Calculate and set blocktime intervals for the teams */
7049  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7050 
7051  set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7052  set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7053 
7054  /* Mark that blocktime was explicitly set */
7055  bt_set = TRUE;
7056 
7057  set__bt_set_team( thread->th.th_team, tid, bt_set );
7058  set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7059  KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n",
7060  __kmp_gtid_from_tid(tid, thread->th.th_team),
7061  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) );
7062 }
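/* Editor's note: illustrative sketch only, not part of the original source.
   This routine is what a KMP_BLOCKTIME setting ultimately configures: how long
   a worker spins after finishing a region before going to sleep. A caller-side
   sketch, assuming the documented kmp_set_blocktime() extension is used: */
#if 0
#include <omp.h>
void example_blocktime( void )
{
    kmp_set_blocktime( 0 );   /* workers sleep immediately after a region --
                                 favours oversubscribed or co-scheduled runs  */
    #pragma omp parallel
    { /* ... */ }
}
#endif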
7063 
7064 void
7065 __kmp_aux_set_defaults(
7066  char const * str,
7067  int len
7068 ) {
7069  if ( ! __kmp_init_serial ) {
7070  __kmp_serial_initialize();
7071  };
7072  __kmp_env_initialize( str );
7073 
7074  if (__kmp_settings
7075 #if OMP_40_ENABLED
7076  || __kmp_display_env || __kmp_display_env_verbose
7077 #endif // OMP_40_ENABLED
7078  ) {
7079  __kmp_env_print();
7080  }
7081 } // __kmp_aux_set_defaults
7082 
7083 /* ------------------------------------------------------------------------ */
7084 
7085 /*
7086  * internal fast reduction routines
7087  */
7088 
7089 PACKED_REDUCTION_METHOD_T
7090 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7091  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7092  kmp_critical_name *lck )
7093 {
7094 
7095  // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7096  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7097  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
7098  // Finally, it's up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
7099 
7100  PACKED_REDUCTION_METHOD_T retval;
7101 
7102  int team_size;
7103 
7104  int teamsize_cutoff = 4;
7105 
7106  KMP_DEBUG_ASSERT( loc ); // it would be nice to test ( loc != 0 )
7107  KMP_DEBUG_ASSERT( lck ); // it would be nice to test ( lck != 0 )
7108 
7109  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7110  #define FAST_REDUCTION_TREE_METHOD_GENERATED ( ( reduce_data ) && ( reduce_func ) )
7111 
7112  retval = critical_reduce_block;
7113 
7114  team_size = __kmp_get_team_num_threads( global_tid ); // another way of getting the team size ( with 1 dynamic dereference ) is slower
7115 
7116  if( team_size == 1 ) {
7117 
7118  retval = empty_reduce_block;
7119 
7120  } else {
7121 
7122  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7123  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7124 
7125  #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64
7126 
7127  #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7128 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7129  if( __kmp_mic_type != non_mic ) {
7130  teamsize_cutoff = 8;
7131  }
7132 #endif
7133  if( tree_available ) {
7134  if( team_size <= teamsize_cutoff ) {
7135  if ( atomic_available ) {
7136  retval = atomic_reduce_block;
7137  }
7138  } else {
7139  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7140  }
7141  } else if ( atomic_available ) {
7142  retval = atomic_reduce_block;
7143  }
7144  #else
7145  #error "Unknown or unsupported OS"
7146  #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7147 
7148  #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH
7149 
7150  #if KMP_OS_LINUX || KMP_OS_WINDOWS
7151 
7152  // basic tuning
7153 
7154  if( atomic_available ) {
7155  if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7156  retval = atomic_reduce_block;
7157  }
7158  } // otherwise: use critical section
7159 
7160  #elif KMP_OS_DARWIN
7161 
7162  if( atomic_available && ( num_vars <= 3 ) ) {
7163  retval = atomic_reduce_block;
7164  } else if( tree_available ) {
7165  if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7166  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7167  }
7168  } // otherwise: use critical section
7169 
7170  #else
7171  #error "Unknown or unsupported OS"
7172  #endif
7173 
7174  #else
7175  #error "Unknown or unsupported architecture"
7176  #endif
7177 
7178  }
7179 
7180  // KMP_FORCE_REDUCTION
7181 
7182  if( __kmp_force_reduction_method != reduction_method_not_defined ) {
7183 
7184  PACKED_REDUCTION_METHOD_T forced_retval;
7185 
7186  int atomic_available, tree_available;
7187 
7188  switch( ( forced_retval = __kmp_force_reduction_method ) )
7189  {
7190  case critical_reduce_block:
7191  KMP_ASSERT( lck ); // lck should be != 0
7192  if( team_size <= 1 ) {
7193  forced_retval = empty_reduce_block;
7194  }
7195  break;
7196 
7197  case atomic_reduce_block:
7198  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7199  KMP_ASSERT( atomic_available ); // atomic_available should be != 0
7200  break;
7201 
7202  case tree_reduce_block:
7203  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7204  KMP_ASSERT( tree_available ); // tree_available should be != 0
7205  #if KMP_FAST_REDUCTION_BARRIER
7206  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7207  #endif
7208  break;
7209 
7210  default:
7211  KMP_ASSERT( 0 ); // "unsupported method specified"
7212  }
7213 
7214  retval = forced_retval;
7215  }
7216 
7217  KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7218 
7219  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7220  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7221 
7222  return ( retval );
7223 }
7224 
7225 // this function is for testing set/get/determine reduce method
7226 kmp_int32
7227 __kmp_get_reduce_method( void ) {
7228  return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7229 }
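/* Editor's note: illustrative sketch only, not part of the original source.
   The shift above suggests the packing convention: the reduction method lives
   in the upper bits of packed_reduction_method and, when fast reduction
   barriers are enabled, the barrier type used by the tree methods sits in the
   low byte. A decode along those lines (field layout assumed here, not
   verified against kmp.h) would look like: */
#if 0
    int packed  = __kmp_entry_thread()->th.th_local.packed_reduction_method;
    int method  = packed >> 8;      /* critical / atomic / tree / empty block */
    int barrier = packed & 0xff;    /* barrier type for tree reductions       */
#endif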
7230 
7231 /* ------------------------------------------------------------------------ */