Intel® OpenMP* Runtime Library
kmp_runtime.c
1 /*
2  * kmp_runtime.c -- KPTS runtime support library
3  * $Revision: 43473 $
4  * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2014 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "kmp.h"
38 #include "kmp_atomic.h"
39 #include "kmp_wrapper_getpid.h"
40 #include "kmp_environment.h"
41 #include "kmp_itt.h"
42 #include "kmp_str.h"
43 #include "kmp_settings.h"
44 #include "kmp_i18n.h"
45 #include "kmp_io.h"
46 #include "kmp_error.h"
47 #include "kmp_stats.h"
48 #include "kmp_wait_release.h"
49 
50 /* these are temporary issues to be dealt with */
51 #define KMP_USE_PRCTL 0
52 #define KMP_USE_POOLED_ALLOC 0
53 
54 #if KMP_OS_WINDOWS
55 #include <process.h>
56 #endif
57 
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
64 #if OMP_40_ENABLED
65  "4.0 (201307)";
66 #else
67  "3.1 (201107)";
68 #endif
69 
70 #ifdef KMP_DEBUG
71 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 
75 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
76 
77 /* ------------------------------------------------------------------------ */
78 /* ------------------------------------------------------------------------ */
79 
80 kmp_info_t __kmp_monitor;
81 
82 /* ------------------------------------------------------------------------ */
83 /* ------------------------------------------------------------------------ */
84 
85 /* Forward declarations */
86 
87 void __kmp_cleanup( void );
88 
89 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
90 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
91 static void __kmp_partition_places( kmp_team_t *team );
92 static void __kmp_do_serial_initialize( void );
93 void __kmp_fork_barrier( int gtid, int tid );
94 void __kmp_join_barrier( int gtid );
95 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
96 
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
100 #endif
101 
102 static int __kmp_expand_threads(int nWish, int nNeed);
103 static int __kmp_unregister_root_other_thread( int gtid );
104 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
105 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
106 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
107 
108 /* ------------------------------------------------------------------------ */
109 /* ------------------------------------------------------------------------ */
110 
111 /* Calculate the identifier of the current thread */
112 /* fast (and somewhat portable) way to get unique */
113 /* identifier of executing thread. */
114 /* returns KMP_GTID_DNE if we haven't been assigned a gtid */
115 
116 int
117 __kmp_get_global_thread_id( )
118 {
119  int i;
120  kmp_info_t **other_threads;
121  size_t stack_data;
122  char *stack_addr;
123  size_t stack_size;
124  char *stack_base;
125 
126  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
127  __kmp_nth, __kmp_all_nth ));
128 
129  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
130  parallel region, made it return KMP_GTID_DNE to force serial_initialize by
131  caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
132  __kmp_init_gtid for this to work. */
133 
134  if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
135 
136 #ifdef KMP_TDATA_GTID
137  if ( TCR_4(__kmp_gtid_mode) >= 3) {
138  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
139  return __kmp_gtid;
140  }
141 #endif
142  if ( TCR_4(__kmp_gtid_mode) >= 2) {
143  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
144  return __kmp_gtid_get_specific();
145  }
146  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
147 
148  stack_addr = (char*) & stack_data;
149  other_threads = __kmp_threads;
150 
151  /*
152  ATT: The code below is a source of potential bugs due to unsynchronized access to
153  __kmp_threads array. For example:
154  1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
155  2. Current thread is suspended by OS.
156  3. Another thread unregisters and finishes (debug versions of free() may fill memory
157  with something like 0xEF).
158  4. Current thread is resumed.
159  5. Current thread reads junk from *thr.
160  TODO: Fix it.
161  --ln
162  */
163 
164  for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if( !thr ) continue;
168 
169  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
170  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
171 
172  /* stack grows down -- search through all of the active threads */
173 
174  if( stack_addr <= stack_base ) {
175  size_t stack_diff = stack_base - stack_addr;
176 
177  if( stack_diff <= stack_size ) {
178  /* The only way we can be closer than the allocated */
179  /* stack size is if we are running on this thread. */
180  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
181  return i;
182  }
183  }
184  }
185 
186  /* get specific to try and determine our gtid */
187  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
188  "thread, using TLS\n" ));
189  i = __kmp_gtid_get_specific();
190 
191  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
192 
193  /* if we haven't been assigned a gtid, then return the error code */
194  if( i<0 ) return i;
195 
196  /* dynamically updated stack window for uber threads to avoid get_specific call */
197  if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
198  KMP_FATAL( StackOverflow, i );
199  }
200 
201  stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
202  if( stack_addr > stack_base ) {
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
204  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
205  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
206  } else {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
208  }
209 
210  /* Reprint stack bounds for ubermaster since they have been refined */
211  if ( __kmp_storage_map ) {
212  char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
213  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
214  __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
215  other_threads[i]->th.th_info.ds.ds_stacksize,
216  "th_%d stack (refinement)", i );
217  }
218  return i;
219 }
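/* [Editorial sketch, not part of the library] The "internal algorithm" above
 * identifies the calling thread by testing whether an address taken from the
 * current stack falls inside another thread's recorded stack window.  The
 * helper below (the __example_* name is hypothetical) is a standalone
 * restatement of that containment test for a downward-growing stack whose
 * highest address is 'base'. */
static int
__example_stack_contains( char *base, size_t size, char *addr )
{
    /* valid addresses lie in [base - size, base]; the subtraction is only
       meaningful when addr <= base, so check that first */
    if ( addr > base )
        return 0;
    return (size_t)( base - addr ) <= size;
}
/* e.g. base = 0x7000, size = 0x1000: addr = 0x6800 is inside (diff 0x800),
   addr = 0x5000 is outside (diff 0x2000 > size). */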
220 
221 int
222 __kmp_get_global_thread_id_reg( )
223 {
224  int gtid;
225 
226  if ( !__kmp_init_serial ) {
227  gtid = KMP_GTID_DNE;
228  } else
229 #ifdef KMP_TDATA_GTID
230  if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
231  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
232  gtid = __kmp_gtid;
233  } else
234 #endif
235  if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
236  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
237  gtid = __kmp_gtid_get_specific();
238  } else {
239  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
240  gtid = __kmp_get_global_thread_id();
241  }
242 
243  /* we must be a new uber master sibling thread */
244  if( gtid == KMP_GTID_DNE ) {
245  KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
246  "Registering a new gtid.\n" ));
247  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
248  if( !__kmp_init_serial ) {
249  __kmp_do_serial_initialize();
250  gtid = __kmp_gtid_get_specific();
251  } else {
252  gtid = __kmp_register_root(FALSE);
253  }
254  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
255  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
256  }
257 
258  KMP_DEBUG_ASSERT( gtid >=0 );
259 
260  return gtid;
261 }
262 
263 /* caller must hold forkjoin_lock */
264 void
265 __kmp_check_stack_overlap( kmp_info_t *th )
266 {
267  int f;
268  char *stack_beg = NULL;
269  char *stack_end = NULL;
270  int gtid;
271 
272  KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
273  if ( __kmp_storage_map ) {
274  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
275  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
276 
277  gtid = __kmp_gtid_from_thread( th );
278 
279  if (gtid == KMP_GTID_MONITOR) {
280  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281  "th_%s stack (%s)", "mon",
282  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
283  } else {
284  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
285  "th_%d stack (%s)", gtid,
286  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
287  }
288  }
289 
290  /* No point in checking ubermaster threads since they use refinement and cannot overlap */
291  if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid = __kmp_gtid_from_thread( th )))
292  {
293  KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
294  if ( stack_beg == NULL ) {
295  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
296  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
297  }
298 
299  for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
300  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
301 
302  if( f_th && f_th != th ) {
303  char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304  char *other_stack_beg = other_stack_end -
305  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
306  if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309  /* Print the other stack values before the abort */
310  if ( __kmp_storage_map )
311  __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
312  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313  "th_%d stack (overlapped)",
314  __kmp_gtid_from_thread( f_th ) );
315 
316  __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
317  }
318  }
319  }
320  }
321  KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
322 }
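/* [Editorial sketch, not part of the library] The overlap test above treats a
 * thread's stack as the address range (stackbase - stacksize, stackbase) and
 * reports a fatal error when either boundary of the current stack falls
 * strictly inside another thread's range.  A hypothetical standalone form of
 * that predicate (the __example_* name is not part of the runtime): */
static int
__example_stacks_overlap( char *beg, char *end, char *other_beg, char *other_end )
{
    /* mirrors the two range checks in __kmp_check_stack_overlap() */
    return ( beg > other_beg && beg < other_end ) ||
           ( end > other_beg && end < other_end );
}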
323 
324 
325 /* ------------------------------------------------------------------------ */
326 
327 /* ------------------------------------------------------------------------ */
328 
329 void
330 __kmp_infinite_loop( void )
331 {
332  static int done = FALSE;
333 
334  while (! done) {
335  KMP_YIELD( 1 );
336  }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void
342 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
343  char buffer[MAX_MESSAGE];
344  int node;
345  va_list ap;
346 
347  va_start( ap, format);
348  sprintf( buffer, "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
349  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
350  __kmp_vprintf( kmp_err, buffer, ap );
351 #if KMP_PRINT_DATA_PLACEMENT
352  if(gtid >= 0) {
353  if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
354  if( __kmp_storage_map_verbose ) {
355  node = __kmp_get_host_node(p1);
356  if(node < 0) /* doesn't work, so don't try this next time */
357  __kmp_storage_map_verbose = FALSE;
358  else {
359  char *last;
360  int lastNode;
361  int localProc = __kmp_get_cpu_from_gtid(gtid);
362 
363  p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
364  p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
365  if(localProc >= 0)
366  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
367  else
368  __kmp_printf_no_lock(" GTID %d\n", gtid);
369 # if KMP_USE_PRCTL
370 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
371  do {
372  last = p1;
373  lastNode = node;
374  /* This loop collates adjacent pages with the same host node. */
375  do {
376  (char*)p1 += PAGE_SIZE;
377  } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378  __kmp_printf_no_lock(" %p-%p memNode %d\n", last,
379  (char*)p1 - 1, lastNode);
380  } while(p1 <= p2);
381 # else
382  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
383  (char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
384  if(p1 < p2) {
385  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
386  (char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
387  }
388 # endif
389  }
390  }
391  } else
392  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
393  }
394 #endif /* KMP_PRINT_DATA_PLACEMENT */
395  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
396 }
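/* [Editorial sketch, not part of the library] The data-placement branch above
 * rounds addresses down to page boundaries before querying their host node.
 * A hypothetical standalone form of that rounding, assuming the page size is
 * a power of two (as PAGE_SIZE is here): */
static void *
__example_page_align_down( void *p, size_t page_size )
{
    /* clear the low-order bits; e.g. with page_size = 4096: 0x12345 -> 0x12000 */
    return (void *)( (size_t)p & ~( page_size - 1 ) );
}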
397 
398 void
399 __kmp_warn( char const * format, ... )
400 {
401  char buffer[MAX_MESSAGE];
402  va_list ap;
403 
404  if ( __kmp_generate_warnings == kmp_warnings_off ) {
405  return;
406  }
407 
408  va_start( ap, format );
409 
410  snprintf( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
411  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
412  __kmp_vprintf( kmp_err, buffer, ap );
413  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
414 
415  va_end( ap );
416 }
417 
418 void
419 __kmp_abort_process()
420 {
421 
422  // Later threads may stall here, but that's ok because abort() will kill them.
423  __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
424 
425  if ( __kmp_debug_buf ) {
426  __kmp_dump_debug_buffer();
427  }; // if
428 
429  if ( KMP_OS_WINDOWS ) {
430  // Let other threads know of abnormal termination and prevent deadlock
431  // if abort happened during library initialization or shutdown
432  __kmp_global.g.g_abort = SIGABRT;
433 
434  /*
435  On Windows* OS, abort() by default raises a pop-up error box, which stalls nightly testing.
436  Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
437  works well, but this function is not available in VS7 (this is not a problem for the DLL, but
438  it is a problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does
439  not help, at least in some versions of the MS C RTL.
440 
441  It seems the following sequence is the only way to simulate abort() and avoid the pop-up
442  error box.
443  */
444  raise( SIGABRT );
445  _exit( 3 ); // Just in case, if signal ignored, exit anyway.
446  } else {
447  abort();
448  }; // if
449 
450  __kmp_infinite_loop();
451  __kmp_release_bootstrap_lock( & __kmp_exit_lock );
452 
453 } // __kmp_abort_process
454 
455 void
456 __kmp_abort_thread( void )
457 {
458  // TODO: Eliminate g_abort global variable and this function.
459  // In case of abort just call abort(), it will kill all the threads.
460  __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* ------------------------------------------------------------------------ */
464 
465 /*
466  * Print out the storage map for the major kmp_info_t thread data structures
467  * that are allocated together.
468  */
469 
470 static void
471 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
472 {
473  __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
474 
475  __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
476  "th_%d.th_info", gtid );
477 
478  __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
479  "th_%d.th_local", gtid );
480 
481  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
482  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
483 
484  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
485  &thr->th.th_bar[bs_plain_barrier+1],
486  sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
487 
488  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
489  &thr->th.th_bar[bs_forkjoin_barrier+1],
490  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
491 
492  #if KMP_FAST_REDUCTION_BARRIER
493  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
494  &thr->th.th_bar[bs_reduction_barrier+1],
495  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
496  #endif // KMP_FAST_REDUCTION_BARRIER
497 }
498 
499 /*
500  * Print out the storage map for the major kmp_team_t team data structures
501  * that are allocated together.
502  */
503 
504 static void
505 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
506 {
507  int num_disp_buff = team->t.t_max_nproc > 1 ? KMP_MAX_DISP_BUF : 2;
508  __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
509  header, team_id );
510 
511  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
512  sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
513 
514 
515  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
516  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
517 
518  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
519  sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
520 
521  #if KMP_FAST_REDUCTION_BARRIER
522  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
523  sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
524  #endif // KMP_FAST_REDUCTION_BARRIER
525 
526  __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
527  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
528 
529  __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
531 
532  __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
533  sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
534  header, team_id );
535 
536  /*
537  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nproc[0], &team->t.t_set_nproc[num_thr],
538  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
539 
540  __kmp_print_storage_map_gtid( -1, &team->t.t_set_dynamic[0], &team->t.t_set_dynamic[num_thr],
541  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
542 
543  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nested[0], &team->t.t_set_nested[num_thr],
544  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
545 
546  __kmp_print_storage_map_gtid( -1, &team->t.t_set_blocktime[0], &team->t.t_set_blocktime[num_thr],
547  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
548 
549  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_intervals[0], &team->t.t_set_bt_intervals[num_thr],
550  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
551 
552  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr],
553  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
554 
555  //__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr],
556  // sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id );
557 
558  __kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr],
559  sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id );
560 #if OMP_40_ENABLED
561  __kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr],
562  sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id );
563 #endif
564  */
565 
566  __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
567  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
568 }
569 
570 static void __kmp_init_allocator() {}
571 static void __kmp_fini_allocator() {}
572 static void __kmp_fini_allocator_thread() {}
573 
574 /* ------------------------------------------------------------------------ */
575 
576 #ifdef GUIDEDLL_EXPORTS
577 # if KMP_OS_WINDOWS
578 
579 
580 static void
581 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
582  // TODO: Change to __kmp_break_bootstrap_lock().
583  __kmp_init_bootstrap_lock( lck ); // make the lock released
584 }
585 
586 static void
587 __kmp_reset_locks_on_process_detach( int gtid_req ) {
588  int i;
589  int thread_count;
590 
591  // PROCESS_DETACH is expected to be called by a thread
592  // that executes ProcessExit() or FreeLibrary().
593  // OS terminates other threads (except the one calling ProcessExit or FreeLibrary).
594  // So it might seem safe to access __kmp_threads[] without taking the forkjoin_lock.
595  // However, some threads may in fact still be alive here, even though they are about to be terminated;
596  // the entries in the array with ds_thread==0 are the most suspect.
597  // In other words, accessing __kmp_threads[] may not actually be safe.
598 
599  // TODO: does it make sense to check __kmp_roots[] ?
600 
601  // Let's check that there are no other alive threads registered with the OMP lib.
602  while( 1 ) {
603  thread_count = 0;
604  for( i = 0; i < __kmp_threads_capacity; ++i ) {
605  if( !__kmp_threads ) continue;
606  kmp_info_t* th = __kmp_threads[ i ];
607  if( th == NULL ) continue;
608  int gtid = th->th.th_info.ds.ds_gtid;
609  if( gtid == gtid_req ) continue;
610  if( gtid < 0 ) continue;
611  DWORD exit_val;
612  int alive = __kmp_is_thread_alive( th, &exit_val );
613  if( alive ) {
614  ++thread_count;
615  }
616  }
617  if( thread_count == 0 ) break; // success
618  }
619 
620  // Assume that I'm alone.
621 
622  // Now it is probably safe to check and reset the locks.
623  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
624  __kmp_reset_lock( &__kmp_forkjoin_lock );
625  #ifdef KMP_DEBUG
626  __kmp_reset_lock( &__kmp_stdio_lock );
627  #endif // KMP_DEBUG
628 
629 
630 }
631 
632 BOOL WINAPI
633 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
634  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
635 
636  switch( fdwReason ) {
637 
638  case DLL_PROCESS_ATTACH:
639  KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
640 
641  return TRUE;
642 
643  case DLL_PROCESS_DETACH:
644  KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
645  __kmp_gtid_get_specific() ));
646 
647  if( lpReserved != NULL )
648  {
649  // lpReserved is used for telling the difference:
650  // lpReserved == NULL when FreeLibrary() was called,
651  // lpReserved != NULL when the process terminates.
652  // When FreeLibrary() is called, worker threads remain alive.
653  // So they will release the forkjoin lock by themselves.
654  // When the process terminates, worker threads disappear triggering
655  // the problem of unreleased forkjoin lock as described below.
656 
657  // A worker thread can take the forkjoin lock
658  // in __kmp_suspend_template()->__kmp_rml_decrease_load_before_sleep().
659  // The problem comes up if that worker thread becomes dead
660  // before it releases the forkjoin lock.
661  // The forkjoin lock remains taken, while the thread
662  // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
663  // will try to take the forkjoin lock and will always fail,
664  // so that the application will never finish [normally].
665  // This scenario is possible if __kmpc_end() has not been executed.
666  // This is not just a corner case; it covers common situations:
667  // - the main function was compiled by an alternative compiler;
668  // - the main function was compiled by icl but without /Qopenmp (application with plugins);
669  // - the application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP;
670  // - an alive foreign thread prevented __kmpc_end from doing cleanup.
671 
672  // This is a hack to work around the problem.
673  // TODO: !!! to figure out something better.
674  __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
675  }
676 
677  __kmp_internal_end_library( __kmp_gtid_get_specific() );
678 
679  return TRUE;
680 
681  case DLL_THREAD_ATTACH:
682  KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
683 
684  /* if we wanted to register new siblings all the time here call
685  * __kmp_get_gtid(); */
686  return TRUE;
687 
688  case DLL_THREAD_DETACH:
689  KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
690  __kmp_gtid_get_specific() ));
691 
692  __kmp_internal_end_thread( __kmp_gtid_get_specific() );
693  return TRUE;
694  }
695 
696  return TRUE;
697 }
698 
699 # endif /* KMP_OS_WINDOWS */
700 #endif /* GUIDEDLL_EXPORTS */
701 
702 
703 /* ------------------------------------------------------------------------ */
704 
705 /* Change the library type to "status" and return the old type */
706 /* called from within initialization routines where __kmp_initz_lock is held */
707 int
708 __kmp_change_library( int status )
709 {
710  int old_status;
711 
712  old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
713 
714  if (status) {
715  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
716  }
717  else {
718  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
719  }
720 
721  return old_status; // return previous setting of whether KMP_LIBRARY=throughput
722 }
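/* [Editorial sketch, not part of the library] __kmp_change_library() encodes
 * the library mode in the low bit of the yield-init counter: an even count
 * means throughput, an odd count means turnaround.  A hypothetical standalone
 * illustration of that bit trick (the __example_* name is not part of the
 * runtime): */
static int
__example_toggle_mode_bit( unsigned *counter, int turnaround )
{
    int old_bit = (int)( *counter & 1 );  /* previous mode bit */
    if ( turnaround )
        *counter |= 1u;                   /* make the count odd  */
    else
        *counter &= ~1u;                  /* make the count even */
    return old_bit;
}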
723 
724 /* ------------------------------------------------------------------------ */
725 /* ------------------------------------------------------------------------ */
726 
727 /* __kmp_parallel_deo --
728  * Wait until it's our turn.
729  */
730 void
731 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
732 {
733  int gtid = *gtid_ref;
734 #ifdef BUILD_PARALLEL_ORDERED
735  kmp_team_t *team = __kmp_team_from_gtid( gtid );
736 #endif /* BUILD_PARALLEL_ORDERED */
737 
738  if( __kmp_env_consistency_check ) {
739  if( __kmp_threads[gtid]->th.th_root->r.r_active )
740  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
741  }
742 #ifdef BUILD_PARALLEL_ORDERED
743  if( !team->t.t_serialized ) {
744  kmp_uint32 spins;
745 
746  KMP_MB();
747  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
748  KMP_MB();
749  }
750 #endif /* BUILD_PARALLEL_ORDERED */
751 }
752 
753 /* __kmp_parallel_dxo --
754  * Signal the next task.
755  */
756 
757 void
758 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
759 {
760  int gtid = *gtid_ref;
761 #ifdef BUILD_PARALLEL_ORDERED
762  int tid = __kmp_tid_from_gtid( gtid );
763  kmp_team_t *team = __kmp_team_from_gtid( gtid );
764 #endif /* BUILD_PARALLEL_ORDERED */
765 
766  if( __kmp_env_consistency_check ) {
767  if( __kmp_threads[gtid]->th.th_root->r.r_active )
768  __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
769  }
770 #ifdef BUILD_PARALLEL_ORDERED
771  if ( ! team->t.t_serialized ) {
772  KMP_MB(); /* Flush all pending memory write invalidates. */
773 
774  /* use the tid of the next thread in this team */
775  /* TODO: replace with a general release procedure */
776  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
777 
778  KMP_MB(); /* Flush all pending memory write invalidates. */
779  }
780 #endif /* BUILD_PARALLEL_ORDERED */
781 }
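/* [Editorial sketch, not part of the library] Together, __kmp_parallel_deo()
 * and __kmp_parallel_dxo() implement a token-passing protocol for the ordered
 * construct: each thread waits until the shared counter equals its own tid,
 * executes its ordered chunk, then hands the token to (tid + 1) % nproc.
 * A hypothetical C11 restatement of that protocol (the runtime itself uses
 * KMP_WAIT_YIELD and its own memory barriers; the __example_* names are not
 * part of the runtime): */
#include <stdatomic.h>

static void
__example_ordered_enter( _Atomic int *token, int tid )
{
    /* wait for our turn; a real implementation would yield or back off */
    while ( atomic_load_explicit( token, memory_order_acquire ) != tid )
        ;
}

static void
__example_ordered_exit( _Atomic int *token, int tid, int nproc )
{
    /* pass the token to the next thread in the team */
    atomic_store_explicit( token, ( tid + 1 ) % nproc, memory_order_release );
}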
782 
783 /* ------------------------------------------------------------------------ */
784 /* ------------------------------------------------------------------------ */
785 
786 /* ------------------------------------------------------------------------ */
787 /* ------------------------------------------------------------------------ */
788 
789 /* The BARRIER for a SINGLE process section is always explicit */
790 
791 int
792 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
793 {
794  int status;
795  kmp_info_t *th;
796  kmp_team_t *team;
797 
798  if( ! TCR_4(__kmp_init_parallel) )
799  __kmp_parallel_initialize();
800 
801  th = __kmp_threads[ gtid ];
802  team = th->th.th_team;
803  status = 0;
804 
805  th->th.th_ident = id_ref;
806 
807  if ( team->t.t_serialized ) {
808  status = 1;
809  } else {
810  kmp_int32 old_this = th->th.th_local.this_construct;
811 
812  ++th->th.th_local.this_construct;
813  /* try to set team count to thread count--success means thread got the
814  single block
815  */
816  /* TODO: Should this be acquire or release? */
817  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
818  th->th.th_local.this_construct);
819  }
820 
821  if( __kmp_env_consistency_check ) {
822  if (status && push_ws) {
823  __kmp_push_workshare( gtid, ct_psingle, id_ref );
824  } else {
825  __kmp_check_workshare( gtid, ct_psingle, id_ref );
826  }
827  }
828 #if USE_ITT_BUILD
829  if ( status ) {
830  __kmp_itt_single_start( gtid );
831  }
832  if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
833  __kmp_itt_metadata_single();
834  }
835 
836 #endif /* USE_ITT_BUILD */
837  return status;
838 }
839 
840 void
841 __kmp_exit_single( int gtid )
842 {
843 #if USE_ITT_BUILD
844  __kmp_itt_single_end( gtid );
845 #endif /* USE_ITT_BUILD */
846  if( __kmp_env_consistency_check )
847  __kmp_pop_workshare( gtid, ct_psingle, NULL );
848 }
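/* [Editorial sketch, not part of the library] __kmp_enter_single() elects the
 * winner of a SINGLE construct with one compare-and-swap: every thread
 * advances its private construct counter and tries to move the shared team
 * counter to the same value, and exactly one CAS succeeds per construct.
 * A hypothetical C11 restatement of that "first thread wins" pattern, using
 * plain int counters (the __example_* name is not part of the runtime): */
#include <stdatomic.h>

static int
__example_try_win_single( _Atomic int *team_count, int *my_count )
{
    int expected = *my_count;   /* counter value before this construct */
    ++( *my_count );            /* value identifying this construct */
    /* succeeds for exactly one thread; the others see the team counter
       already advanced and skip the single block */
    return atomic_compare_exchange_strong( team_count, &expected, *my_count );
}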
849 
850 
851 /*
852  * determine if we can go parallel or must use a serialized parallel region and
853  * how many threads we can use
854  * set_nthreads is the number of threads requested for the team
855  * returns 1 if we should serialize or only use one thread,
856  * otherwise the number of threads to use
857  * The forkjoin lock is held by the caller.
858  */
859 static int
860 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
861  int master_tid, int set_nthreads
862 #if OMP_40_ENABLED
863  , int enter_teams
864 #endif /* OMP_40_ENABLED */
865 )
866 {
867  int capacity;
868  int new_nthreads;
869  int use_rml_to_adjust_nth;
870  KMP_DEBUG_ASSERT( __kmp_init_serial );
871  KMP_DEBUG_ASSERT( root && parent_team );
872 
873  //
874  // Initial check to see if we should use a serialized team.
875  //
876  if ( set_nthreads == 1 ) {
877  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d reserving 1 thread; requested %d threads\n",
878  __kmp_get_gtid(), set_nthreads ));
879  return 1;
880  }
881  if ( ( !get__nested_2(parent_team,master_tid) && (root->r.r_in_parallel
882 #if OMP_40_ENABLED
883  && !enter_teams
884 #endif /* OMP_40_ENABLED */
885  ) ) || ( __kmp_library == library_serial ) ) {
886  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team; requested %d threads\n",
887  __kmp_get_gtid(), set_nthreads ));
888  return 1;
889  }
890 
891  //
892  // If dyn-var is set, dynamically adjust the number of desired threads,
893  // according to the method specified by dynamic_mode.
894  //
895  new_nthreads = set_nthreads;
896  use_rml_to_adjust_nth = FALSE;
897  if ( ! get__dynamic_2( parent_team, master_tid ) ) {
898  ;
899  }
900 #ifdef USE_LOAD_BALANCE
901  else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
902  new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
903  if ( new_nthreads == 1 ) {
904  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
905  master_tid ));
906  return 1;
907  }
908  if ( new_nthreads < set_nthreads ) {
909  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
910  master_tid, new_nthreads ));
911  }
912  }
913 #endif /* USE_LOAD_BALANCE */
914  else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
915  new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
916  : root->r.r_hot_team->t.t_nproc);
917  if ( new_nthreads <= 1 ) {
918  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
919  master_tid ));
920  return 1;
921  }
922  if ( new_nthreads < set_nthreads ) {
923  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
924  master_tid, new_nthreads ));
925  }
926  else {
927  new_nthreads = set_nthreads;
928  }
929  }
930  else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
931  if ( set_nthreads > 2 ) {
932  new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
933  new_nthreads = ( new_nthreads % set_nthreads ) + 1;
934  if ( new_nthreads == 1 ) {
935  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
936  master_tid ));
937  return 1;
938  }
939  if ( new_nthreads < set_nthreads ) {
940  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
941  master_tid, new_nthreads ));
942  }
943  }
944  }
945  else {
946  KMP_ASSERT( 0 );
947  }
948 
949  //
950  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
951  //
952  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
953  root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
954  int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
955  root->r.r_hot_team->t.t_nproc );
956  if ( tl_nthreads <= 0 ) {
957  tl_nthreads = 1;
958  }
959 
960  //
961  // If dyn-var is false, emit a 1-time warning.
962  //
963  if ( ! get__dynamic_2( parent_team, master_tid )
964  && ( ! __kmp_reserve_warn ) ) {
965  __kmp_reserve_warn = 1;
966  __kmp_msg(
967  kmp_ms_warning,
968  KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
969  KMP_HNT( Unset_ALL_THREADS ),
970  __kmp_msg_null
971  );
972  }
973  if ( tl_nthreads == 1 ) {
974  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
975  master_tid ));
976  return 1;
977  }
978  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
979  master_tid, tl_nthreads ));
980  new_nthreads = tl_nthreads;
981  }
982 
983 
984  //
985  // Check if the threads array is large enough, or needs expanding.
986  //
987  // See comment in __kmp_register_root() about the adjustment if
988  // __kmp_threads[0] == NULL.
989  //
990  capacity = __kmp_threads_capacity;
991  if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
992  --capacity;
993  }
994  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
995  root->r.r_hot_team->t.t_nproc ) > capacity ) {
996  //
997  // Expand the threads array.
998  //
999  int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
1000  root->r.r_hot_team->t.t_nproc ) - capacity;
1001  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
1002  if ( slotsAdded < slotsRequired ) {
1003  //
1004  // The threads array was not expanded enough.
1005  //
1006  new_nthreads -= ( slotsRequired - slotsAdded );
1007  KMP_ASSERT( new_nthreads >= 1 );
1008 
1009  //
1010  // If dyn-var is false, emit a 1-time warning.
1011  //
1012  if ( ! get__dynamic_2( parent_team, master_tid )
1013  && ( ! __kmp_reserve_warn ) ) {
1014  __kmp_reserve_warn = 1;
1015  if ( __kmp_tp_cached ) {
1016  __kmp_msg(
1017  kmp_ms_warning,
1018  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
1019  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
1020  KMP_HNT( PossibleSystemLimitOnThreads ),
1021  __kmp_msg_null
1022  );
1023  }
1024  else {
1025  __kmp_msg(
1026  kmp_ms_warning,
1027  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
1028  KMP_HNT( SystemLimitOnThreads ),
1029  __kmp_msg_null
1030  );
1031  }
1032  }
1033  }
1034  }
1035 
1036  if ( new_nthreads == 1 ) {
1037  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
1038  __kmp_get_gtid(), set_nthreads ) );
1039  return 1;
1040  }
1041 
1042  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
1043  __kmp_get_gtid(), new_nthreads, set_nthreads ));
1044  return new_nthreads;
1045 }
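/* [Editorial sketch, not part of the library] The KMP_ALL_THREADS clamping
 * above computes how many threads a new team may use without pushing the
 * process past the global cap.  A hypothetical standalone restatement, where
 * 'master_recount' stands for the (root->r.r_active ? 1 : hot team size)
 * correction: with max_nth = 8, nth = 5 and master_recount = 1, a request for
 * 6 threads is clamped to 8 - 5 + 1 = 4.  The __example_* name is not part of
 * the runtime. */
static int
__example_clamp_to_thread_limit( int requested, int max_nth, int nth, int master_recount )
{
    int headroom = max_nth - nth + master_recount;  /* slots left under the cap */
    if ( headroom <= 0 )
        headroom = 1;                               /* always allow at least the master */
    return ( requested > headroom ) ? headroom : requested;
}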
1046 
1047 /* ------------------------------------------------------------------------ */
1048 /* ------------------------------------------------------------------------ */
1049 
1050 /* allocate threads from the thread pool and assign them to the new team */
1051 /* we are assured that there are enough threads available, because we
1052  * checked that earlier while holding the forkjoin lock */
1053 
1054 static void
1055 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1056  kmp_info_t *master_th, int master_gtid )
1057 {
1058  int i;
1059  int use_hot_team;
1060 
1061  KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1062  KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1063  KMP_MB();
1064 
1065  /* first, let's setup the master thread */
1066  master_th->th.th_info.ds.ds_tid = 0;
1067  master_th->th.th_team = team;
1068  master_th->th.th_team_nproc = team->t.t_nproc;
1069  master_th->th.th_team_master = master_th;
1070  master_th->th.th_team_serialized = FALSE;
1071  master_th->th.th_dispatch = & team->t.t_dispatch[ 0 ];
1072 
1073  /* make sure we are not the optimized hot team */
1074 #if KMP_NESTED_HOT_TEAMS
1075  use_hot_team = 0;
1076  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1077  if( hot_teams ) { // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1078  int level = team->t.t_active_level - 1; // index in array of hot teams
1079  if( master_th->th.th_teams_microtask ) { // are we inside the teams?
1080  if( master_th->th.th_teams_size.nteams > 1 ) {
1081  ++level; // level was not increased in teams construct for team_of_masters
1082  }
1083  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1084  master_th->th.th_teams_level == team->t.t_level ) {
1085  ++level; // level was not increased in teams construct for team_of_workers before the parallel
1086  } // team->t.t_level will be increased inside parallel
1087  }
1088  if( level < __kmp_hot_teams_max_level ) {
1089  if( hot_teams[level].hot_team ) {
1090  // hot team has already been allocated for given level
1091  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1092  use_hot_team = 1; // the team is ready to use
1093  } else {
1094  use_hot_team = 0; // AC: threads are not allocated yet
1095  hot_teams[level].hot_team = team; // remember new hot team
1096  hot_teams[level].hot_team_nth = team->t.t_nproc;
1097  }
1098  } else {
1099  use_hot_team = 0;
1100  }
1101  }
1102 #else
1103  use_hot_team = team == root->r.r_hot_team;
1104 #endif
1105  if ( !use_hot_team ) {
1106 
1107  /* install the master thread */
1108  team->t.t_threads[ 0 ] = master_th;
1109  __kmp_initialize_info( master_th, team, 0, master_gtid );
1110 
1111  /* now, install the worker threads */
1112  for ( i=1 ; i < team->t.t_nproc ; i++ ) {
1113 
1114  /* fork or reallocate a new thread and install it in team */
1115  kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1116  team->t.t_threads[ i ] = thr;
1117  KMP_DEBUG_ASSERT( thr );
1118  KMP_DEBUG_ASSERT( thr->th.th_team == team );
1119  /* align team and thread arrived states */
1120  KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%u, plain=%u\n",
1121  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1122  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1123  team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1124  team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1125 #if OMP_40_ENABLED
1126  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1127  thr->th.th_teams_level = master_th->th.th_teams_level;
1128  thr->th.th_teams_size = master_th->th.th_teams_size;
1129 #endif
1130  { // Initialize threads' barrier data.
1131  int b;
1132  kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1133  for ( b = 0; b < bs_last_barrier; ++ b ) {
1134  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
1135  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1136  }; // for b
1137  }
1138  }
1139 
1140 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1141  __kmp_partition_places( team );
1142 #endif
1143 
1144  }
1145 
1146  KMP_MB();
1147 }
1148 
1149 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1150 //
1151 // Propagate any changes to the floating point control registers out to the team.
1152 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1153 // so we don't make changes unless they are needed.
1154 //
1155 inline static void
1156 propagateFPControl(kmp_team_t * team)
1157 {
1158  if ( __kmp_inherit_fp_control ) {
1159  kmp_int16 x87_fpu_control_word;
1160  kmp_uint32 mxcsr;
1161 
1162  // Get master values of FPU control flags (both X87 and vector)
1163  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1164  __kmp_store_mxcsr( &mxcsr );
1165  mxcsr &= KMP_X86_MXCSR_MASK;
1166 
1167  // There is no point looking at t_fp_control_saved here.
1168  // If it is TRUE, we still have to update the values if they are different from those we now have.
1169  // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1170  // that the values in the team are the same as those we have.
1171  // So, this code achieves what we need whether or not t_fp_control_saved is true.
1172  // By checking whether the value needs updating we avoid unnecessary writes that would put the
1173  // cache-line into a written state, causing all threads in the team to have to read it again.
1174  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1175  team->t.t_x87_fpu_control_word = x87_fpu_control_word;
1176  }
1177  if ( team->t.t_mxcsr != mxcsr ) {
1178  team->t.t_mxcsr = mxcsr;
1179  }
1180  // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1181  // So we must ensure it is correct.
1182  if (!team->t.t_fp_control_saved) {
1183  team->t.t_fp_control_saved = TRUE;
1184  }
1185  }
1186  else {
1187  // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1188  if (team->t.t_fp_control_saved)
1189  team->t.t_fp_control_saved = FALSE;
1190  }
1191 }
1192 
1193 // Do the opposite, setting the hardware registers to the updated values from the team.
1194 inline static void
1195 updateHWFPControl(kmp_team_t * team)
1196 {
1197  if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1198  //
1199  // Only reset the fp control regs if they have been changed in the team,
1200  // i.e. in the parallel region that we are exiting.
1201  //
1202  kmp_int16 x87_fpu_control_word;
1203  kmp_uint32 mxcsr;
1204  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1205  __kmp_store_mxcsr( &mxcsr );
1206  mxcsr &= KMP_X86_MXCSR_MASK;
1207 
1208  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1209  __kmp_clear_x87_fpu_status_word();
1210  __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1211  }
1212 
1213  if ( team->t.t_mxcsr != mxcsr ) {
1214  __kmp_load_mxcsr( &team->t.t_mxcsr );
1215  }
1216  }
1217 }
1218 #else
1219 # define propagateFPControl(x) ((void)0)
1220 # define updateHWFPControl(x) ((void)0)
1221 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
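/* [Editorial sketch, not part of the library] propagateFPControl() compares
 * before it writes, so an unchanged control word never dirties the team's
 * cache line and forces the workers to re-read it.  A hypothetical distilled
 * form of that "write only if different" pattern (the __example_* name is not
 * part of the runtime): */
static void
__example_store_if_changed( kmp_uint32 *shared, kmp_uint32 value )
{
    /* a read leaves the cache line in shared state; only write (and
       invalidate other copies) when the value actually changes */
    if ( *shared != value )
        *shared = value;
}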
1222 
1223 static void
1224 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1225 
1226 /*
1227  * Run a parallel region that has been serialized, so it runs only in a team consisting of the single master thread.
1228  */
1229 void
1230 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1231 {
1232  kmp_info_t *this_thr;
1233  kmp_team_t *serial_team;
1234 
1235  KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1236 
1237  /* Skip all this code for autopar serialized loops since it results in
1238  unacceptable overhead */
1239  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1240  return;
1241 
1242  if( ! TCR_4( __kmp_init_parallel ) )
1243  __kmp_parallel_initialize();
1244 
1245  this_thr = __kmp_threads[ global_tid ];
1246  serial_team = this_thr->th.th_serial_team;
1247 
1248  /* utilize the serialized team held by this thread */
1249  KMP_DEBUG_ASSERT( serial_team );
1250  KMP_MB();
1251 
1252  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1253  KMP_DEBUG_ASSERT( this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team );
1254  KMP_DEBUG_ASSERT( serial_team->t.t_task_team == NULL );
1255  KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1256  global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1257  this_thr->th.th_task_team = NULL;
1258  }
1259 
1260 #if OMP_40_ENABLED
1261  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1262  if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1263  proc_bind = proc_bind_false;
1264  }
1265  else if ( proc_bind == proc_bind_default ) {
1266  //
1267  // No proc_bind clause was specified, so use the current value
1268  // of proc-bind-var for this parallel region.
1269  //
1270  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1271  }
1272  //
1273  // Reset for next parallel region
1274  //
1275  this_thr->th.th_set_proc_bind = proc_bind_default;
1276 #endif /* OMP_40_ENABLED */
1277 
1278  if( this_thr->th.th_team != serial_team ) {
1279  // Nested level will be an index in the nested nthreads array
1280  int level = this_thr->th.th_team->t.t_level;
1281 
1282  if( serial_team->t.t_serialized ) {
1283  /* this serial team was already used
1284  * TODO: increase performance by making these locks more specific */
1285  kmp_team_t *new_team;
1286  int tid = this_thr->th.th_info.ds.ds_tid;
1287 
1288  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1289 
1290  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1291 #if OMP_40_ENABLED
1292  proc_bind,
1293 #endif
1294  & this_thr->th.th_current_task->td_icvs,
1295  0 USE_NESTED_HOT_ARG(NULL) );
1296  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1297  KMP_ASSERT( new_team );
1298 
1299  /* setup new serialized team and install it */
1300  new_team->t.t_threads[0] = this_thr;
1301  new_team->t.t_parent = this_thr->th.th_team;
1302  serial_team = new_team;
1303  this_thr->th.th_serial_team = serial_team;
1304 
1305  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1306  global_tid, serial_team ) );
1307 
1308 
1309  /* TODO the above breaks the requirement that if we run out of
1310  * resources, then we can still guarantee that serialized teams
1311  * are ok, since we may need to allocate a new one */
1312  } else {
1313  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1314  global_tid, serial_team ) );
1315  }
1316 
1317  /* we have to initialize this serial team */
1318  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1319  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1320  KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1321  serial_team->t.t_ident = loc;
1322  serial_team->t.t_serialized = 1;
1323  serial_team->t.t_nproc = 1;
1324  serial_team->t.t_parent = this_thr->th.th_team;
1325  serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1326  this_thr->th.th_team = serial_team;
1327  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1328 
1329  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
1330  global_tid, this_thr->th.th_current_task ) );
1331  KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1332  this_thr->th.th_current_task->td_flags.executing = 0;
1333 
1334  __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1335 
1336  /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
1337  each serialized task represented by team->t.t_serialized? */
1338  copy_icvs(
1339  & this_thr->th.th_current_task->td_icvs,
1340  & this_thr->th.th_current_task->td_parent->td_icvs );
1341 
1342  // Thread value exists in the nested nthreads array for the next nested level
1343  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1344  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1345  }
1346 
1347 #if OMP_40_ENABLED
1348  if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1349  this_thr->th.th_current_task->td_icvs.proc_bind
1350  = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1351  }
1352 #endif /* OMP_40_ENABLED */
1353 
1354  this_thr->th.th_info.ds.ds_tid = 0;
1355 
1356  /* set thread cache values */
1357  this_thr->th.th_team_nproc = 1;
1358  this_thr->th.th_team_master = this_thr;
1359  this_thr->th.th_team_serialized = 1;
1360 
1361  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1362  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1363 
1364  propagateFPControl (serial_team);
1365 
1366  /* check if we need to allocate dispatch buffers stack */
1367  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1368  if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1369  serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1370  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1371  }
1372  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1373 
1374  KMP_MB();
1375 
1376  } else {
1377  /* this serialized team is already being used,
1378  * that's fine, just add another nested level */
1379  KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1380  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1381  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1382  ++ serial_team->t.t_serialized;
1383  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1384 
1385  // Nested level will be an index in the nested nthreads array
1386  int level = this_thr->th.th_team->t.t_level;
1387  // Thread value exists in the nested nthreads array for the next nested level
1388  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1389  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1390  }
1391  serial_team->t.t_level++;
1392  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1393  global_tid, serial_team, serial_team->t.t_level ) );
1394 
1395  /* allocate/push dispatch buffers stack */
1396  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1397  {
1398  dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1399  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1400  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1401  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1402  }
1403  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1404 
1405  KMP_MB();
1406  }
1407 
1408  if ( __kmp_env_consistency_check )
1409  __kmp_push_parallel( global_tid, NULL );
1410 
1411 #if USE_ITT_BUILD
1412  // Mark the start of the "parallel" region for VTune. Only one frame notification scheme is used at the moment.
1413  if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
1414  {
1415  this_thr->th.th_ident = loc;
1416  // 0 - no barriers; 1 - serialized parallel
1417  __kmp_itt_region_forking( global_tid, this_thr->th.th_team_nproc, 0, 1 );
1418  }
1419  // Save the start of the "parallel" region for VTune. This is also the beginning of the join barrier.
1420  if( ( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) &&
1421  __itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr ) || KMP_ITT_DEBUG )
1422  {
1423  this_thr->th.th_ident = loc;
1424 #if USE_ITT_NOTIFY
1425  if( this_thr->th.th_team->t.t_level == 1 ) {
1426  serial_team->t.t_region_time = this_thr->th.th_frame_time_serialized = __itt_get_timestamp();
1427  }
1428 #endif
1429  }
1430 #endif /* USE_ITT_BUILD */
1431 }
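/* [Editorial sketch, not part of the library] Each additional nesting level of
 * a serialized parallel pushes a fresh dispatch buffer onto the singly linked
 * list headed by serial_team->t.t_dispatch->th_disp_buffer (the matching pop
 * happens when the serialized region ends).  A hypothetical restatement of
 * that push with a generic node type (the __example_* names are not part of
 * the runtime): */
struct __example_disp_node {
    struct __example_disp_node *next;
};

static void
__example_push_disp_buffer( struct __example_disp_node **head,
                            struct __example_disp_node *fresh )
{
    fresh->next = *head;   /* new buffer links to the one currently in use */
    *head       = fresh;   /* and becomes the buffer for the new nesting level */
}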
1432 
1433 /* most of the work for a fork */
1434 /* return true if we really went parallel, false if serialized */
1435 int
1436 __kmp_fork_call(
1437  ident_t * loc,
1438  int gtid,
1439  enum fork_context_e call_context, // Intel, GNU, ...
1440  kmp_int32 argc,
1441  microtask_t microtask,
1442  launch_t invoker,
1443 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1444 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX
1445  va_list * ap
1446 #else
1447  va_list ap
1448 #endif
1449  )
1450 {
1451  void **argv;
1452  int i;
1453  int master_tid;
1454  int master_this_cons;
1455  kmp_team_t *team;
1456  kmp_team_t *parent_team;
1457  kmp_info_t *master_th;
1458  kmp_root_t *root;
1459  int nthreads;
1460  int master_active;
1461  int master_set_numthreads;
1462  int level;
1463 #if OMP_40_ENABLED
1464  int active_level;
1465  int teams_level;
1466 #endif
1467 #if KMP_NESTED_HOT_TEAMS
1468  kmp_hot_team_ptr_t **p_hot_teams;
1469 #endif
1470  { // KMP_TIME_BLOCK
1471  KMP_TIME_BLOCK(KMP_fork_call);
1472 
1473  KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1474  if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
1475  /* Some systems prefer the stack for the root thread(s) to start with */
1476  /* some gap from the parent stack to prevent false sharing. */
1477  void *dummy = alloca(__kmp_stkpadding);
1478  /* These 2 lines below are so this does not get optimized out */
1479  if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1480  __kmp_stkpadding += (short)((kmp_int64)dummy);
1481  }
1482 
1483  /* initialize if needed */
1484  KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1485  if( ! TCR_4(__kmp_init_parallel) )
1486  __kmp_parallel_initialize();
1487 
1488  /* setup current data */
1489  master_th = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1490  parent_team = master_th->th.th_team;
1491  master_tid = master_th->th.th_info.ds.ds_tid;
1492  master_this_cons = master_th->th.th_local.this_construct;
1493  root = master_th->th.th_root;
1494  master_active = root->r.r_active;
1495  master_set_numthreads = master_th->th.th_set_nproc;
1496  // Nested level will be an index in the nested nthreads array
1497  level = parent_team->t.t_level;
1498 #if OMP_40_ENABLED
1499  active_level = parent_team->t.t_active_level; // is used to launch non-serial teams even if nested is not allowed
1500  teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
1501 #endif
1502 #if KMP_NESTED_HOT_TEAMS
1503  p_hot_teams = &master_th->th.th_hot_teams;
1504  if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1505  *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1506  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1507  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1508  (*p_hot_teams)[0].hot_team_nth = 1; // it is either actual or not needed (when active_level > 0)
1509  }
1510 #endif
1511 
1512 
1513  master_th->th.th_ident = loc;
1514 
1515 #if OMP_40_ENABLED
1516  if ( master_th->th.th_teams_microtask &&
1517  ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1518  // AC: This is the start of a parallel region nested inside a teams construct.
1519  // The team is the actual (hot) one, and all workers are waiting at the fork barrier,
1520  // so no lock is needed to do a little team initialization and then release the workers.
1521  parent_team->t.t_ident = loc;
1522  parent_team->t.t_argc = argc;
1523  argv = (void**)parent_team->t.t_argv;
1524  for( i=argc-1; i >= 0; --i )
1525 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1526 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX
1527  *argv++ = va_arg( *ap, void * );
1528 #else
1529  *argv++ = va_arg( ap, void * );
1530 #endif
1531  /* Increment our nested depth level, but do not increase the serialization count */
1532  if ( parent_team == master_th->th.th_serial_team ) {
1533  // AC: we are in serialized parallel
1534  __kmpc_serialized_parallel(loc, gtid);
1535  KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
1536  parent_team->t.t_serialized--; // AC: needed so that enquiry functions
1537  // work correctly; will be restored at join time
1538  KMP_TIME_BLOCK(OMP_work);
1539  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
1540  return TRUE;
1541  }
1542  parent_team->t.t_pkfn = microtask;
1543  parent_team->t.t_invoke = invoker;
1544  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1545  parent_team->t.t_active_level ++;
1546  parent_team->t.t_level ++;
1547 
1548  /* Change number of threads in the team if requested */
1549  if ( master_set_numthreads ) { // The parallel has num_threads clause
1550  if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
1551  // AC: can only reduce the number of threads dynamically, cannot increase it
1552  kmp_info_t **other_threads = parent_team->t.t_threads;
1553  parent_team->t.t_nproc = master_set_numthreads;
1554  for ( i = 0; i < master_set_numthreads; ++i ) {
1555  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1556  }
1557  // Keep extra threads hot in the team for possible next parallels
1558  }
1559  master_th->th.th_set_nproc = 0;
1560  }
1561 
1562 
1563  KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1564  __kmp_internal_fork( loc, gtid, parent_team );
1565  KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1566 
1567  /* Invoke microtask for MASTER thread */
1568  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1569  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1570 
1571  {
1572  KMP_TIME_BLOCK(OMP_work);
1573  if (! parent_team->t.t_invoke( gtid )) {
1574  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1575  }
1576  }
1577  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1578  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1579  KMP_MB(); /* Flush all pending memory write invalidates. */
1580 
1581  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1582 
1583  return TRUE;
1584  } // Parallel closely nested in teams construct
1585 #endif /* OMP_40_ENABLED */
1586 
1587 #if KMP_DEBUG
1588  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1589  KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team );
1590  }
1591 #endif
1592 
1593  /* determine how many new threads we can use */
1594  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1595 
1596  if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1597  nthreads = 1;
1598  } else {
1599  nthreads = master_set_numthreads ?
1600  master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1601  nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1602 #if OMP_40_ENABLED
1603 /* AC: If we execute teams from a parallel region (on the host), then the teams should be created,
1604  but each can only have 1 thread if nesting is disabled. If teams is called from a serial region,
1605  then the teams and their threads should be created regardless of the nesting setting. */
1606  , ((ap==NULL && active_level==0) ||
1607  (ap && teams_level>0 && teams_level==level))
1608 #endif /* OMP_40_ENABLED */
1609  );
1610  }
1611  KMP_DEBUG_ASSERT( nthreads > 0 );
1612 
1613  /* If the set number of threads was temporarily changed, reset it now */
1614  master_th->th.th_set_nproc = 0;
1615 
1616 
1617  /* create a serialized parallel region? */
1618  if ( nthreads == 1 ) {
1619  /* josh todo: hypothetical question: what do we do for OS X*? */
1620 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM )
1621  void * args[ argc ];
1622 #else
1623  void * * args = (void**) alloca( argc * sizeof( void * ) );
1624 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM ) */
1625 
1626  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1627  KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1628 
1629  __kmpc_serialized_parallel(loc, gtid);
1630 
1631  if ( call_context == fork_context_intel ) {
1632  /* TODO this sucks, use the compiler itself to pass args! :) */
1633  master_th->th.th_serial_team->t.t_ident = loc;
1634 #if OMP_40_ENABLED
1635  if ( !ap ) {
1636  // revert change made in __kmpc_serialized_parallel()
1637  master_th->th.th_serial_team->t.t_level--;
1638  // Get args from parent team for teams construct
1639  {
1640  KMP_TIME_BLOCK(OMP_work);
1641  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
1642  }
1643  } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1644  KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1645  team = master_th->th.th_team;
1646  //team->t.t_pkfn = microtask;
1647  team->t.t_invoke = invoker;
1648  __kmp_alloc_argv_entries( argc, team, TRUE );
1649  team->t.t_argc = argc;
1650  argv = (void**) team->t.t_argv;
1651  if ( ap ) {
1652  for( i=argc-1; i >= 0; --i )
1653 // TODO: revert workaround for Intel(R) 64 tracker #96
1654 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX
1655  *argv++ = va_arg( *ap, void * );
1656 # else
1657  *argv++ = va_arg( ap, void * );
1658 # endif
1659  } else {
1660  for( i=0; i < argc; ++i )
1661  // Get args from parent team for teams construct
1662  argv[i] = parent_team->t.t_argv[i];
1663  }
1664  // AC: revert change made in __kmpc_serialized_parallel()
1665  // because initial code in teams should have level=0
1666  team->t.t_level--;
1667  // AC: call special invoker for outer "parallel" of the teams construct
1668  {
1669  KMP_TIME_BLOCK(OMP_work);
1670  invoker(gtid);
1671  }
1672  } else {
1673 #endif /* OMP_40_ENABLED */
1674  argv = args;
1675  for( i=argc-1; i >= 0; --i )
1676 // TODO: revert workaround for Intel(R) 64 tracker #96
1677 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX
1678  *argv++ = va_arg( *ap, void * );
1679 #else
1680  *argv++ = va_arg( ap, void * );
1681 #endif
1682  KMP_MB();
1683  {
1684  KMP_TIME_BLOCK(OMP_work);
1685  __kmp_invoke_microtask( microtask, gtid, 0, argc, args );
1686  }
1687 #if OMP_40_ENABLED
1688  }
1689 #endif /* OMP_40_ENABLED */
1690  }
1691  else if ( call_context == fork_context_gnu ) {
1692  // we were called from GNU native code
1693  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1694  return FALSE;
1695  }
1696  else {
1697  KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1698  }
1699 
1700 
1701  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1702  KMP_MB();
1703  return FALSE;
1704  }
1705 
1706  // GEH: only modify the executing flag in the case when not serialized
1707  // serialized case is handled in kmpc_serialized_parallel
1708  KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1709  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1710  master_th->th.th_current_task->td_icvs.max_active_levels ) );
1711  // TODO: GEH - cannot do this assertion because root thread not set up as executing
1712  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1713  master_th->th.th_current_task->td_flags.executing = 0;
1714 
1715 #if OMP_40_ENABLED
1716  if ( !master_th->th.th_teams_microtask || level > teams_level )
1717 #endif /* OMP_40_ENABLED */
1718  {
1719  /* Increment our nested depth level */
1720  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1721  }
1722 
1723  // See if we need to make a copy of the ICVs.
1724  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1725  if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1726  nthreads_icv = __kmp_nested_nth.nth[level+1];
1727  }
1728  else {
1729  nthreads_icv = 0; // don't update
1730  }
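 // Note: __kmp_nested_nth holds the per-nesting-level thread counts, typically taken
 // from a comma-separated OMP_NUM_THREADS list (e.g. OMP_NUM_THREADS=4,2 yields
 // nth[0]=4, nth[1]=2). nthreads_icv stays 0 unless the next level requests a
 // different count, so the common case skips the ICV copy below.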
1731 
1732 #if OMP_40_ENABLED
1733  // Figure out the proc_bind_policy for the new team.
1734  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1735  kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1736  if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1737  proc_bind = proc_bind_false;
1738  }
1739  else {
1740  if (proc_bind == proc_bind_default) {
1741  // No proc_bind clause specified; use current proc-bind-var for this parallel region
1742  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1743  }
1744  /* else: The proc_bind policy was specified explicitly on the parallel clause. This
1745  overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
1746  // Figure out the value of proc-bind-var for the child threads.
1747  if ((level+1 < __kmp_nested_proc_bind.used)
1748  && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
1749  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
1750  }
1751  }
1752 
1753  // Reset for next parallel region
1754  master_th->th.th_set_proc_bind = proc_bind_default;
1755 #endif /* OMP_40_ENABLED */
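 // Recap of the proc_bind decision above: an explicit proc_bind clause wins, otherwise
 // the current proc-bind-var is used, and proc_bind_false disables binding entirely.
 // __kmp_nested_proc_bind is typically filled from a comma-separated OMP_PROC_BIND list
 // (e.g. OMP_PROC_BIND=spread,close), and bind_types[level+1] supplies the proc-bind-var
 // that the child threads will inherit.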
1756 
1757  if ((nthreads_icv > 0)
1758 #if OMP_40_ENABLED
1759  || (proc_bind_icv != proc_bind_default)
1760 #endif /* OMP_40_ENABLED */
1761  ) {
1762  kmp_internal_control_t new_icvs;
1763  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1764  new_icvs.next = NULL;
1765  if (nthreads_icv > 0) {
1766  new_icvs.nproc = nthreads_icv;
1767  }
1768 
1769 #if OMP_40_ENABLED
1770  if (proc_bind_icv != proc_bind_default) {
1771  new_icvs.proc_bind = proc_bind_icv;
1772  }
1773 #endif /* OMP_40_ENABLED */
1774 
1775  /* allocate a new parallel team */
1776  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1777  team = __kmp_allocate_team(root, nthreads, nthreads,
1778 #if OMP_40_ENABLED
1779  proc_bind,
1780 #endif
1781  &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
1782  } else {
1783  /* allocate a new parallel team */
1784  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1785  team = __kmp_allocate_team(root, nthreads, nthreads,
1786 #if OMP_40_ENABLED
1787  proc_bind,
1788 #endif
1789  &master_th->th.th_current_task->td_icvs, argc
1790  USE_NESTED_HOT_ARG(master_th) );
1791  }
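 // Both branches above obtain a team via __kmp_allocate_team (which may reuse the hot
 // team); they differ only in whether a locally modified copy of the ICVs (new_icvs)
 // or the master's current ICVs are handed to the new team.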
1792  KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
1793 
1794  /* setup the new team */
1795  team->t.t_master_tid = master_tid;
1796  team->t.t_master_this_cons = master_this_cons;
1797  team->t.t_ident = loc;
1798  team->t.t_parent = parent_team;
1799  TCW_SYNC_PTR(team->t.t_pkfn, microtask);
1800  team->t.t_invoke = invoker; /* TODO move this to root, maybe */
1801  // TODO: parent_team->t.t_level == INT_MAX ???
1802 #if OMP_40_ENABLED
1803  if ( !master_th->th.th_teams_microtask || level > teams_level ) {
1804 #endif /* OMP_40_ENABLED */
1805  team->t.t_level = parent_team->t.t_level + 1;
1806  team->t.t_active_level = parent_team->t.t_active_level + 1;
1807 #if OMP_40_ENABLED
1808  } else {
1809  // AC: Do not increase parallel level at start of the teams construct
1810  team->t.t_level = parent_team->t.t_level;
1811  team->t.t_active_level = parent_team->t.t_active_level;
1812  }
1813 #endif /* OMP_40_ENABLED */
1814  team->t.t_sched = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule
1815 
1816  // Update the floating point rounding in the team if required.
1817  propagateFPControl(team);
1818 
1819  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1820  // Set master's task team to the team's task team. Unless this is a hot team, it should be NULL.
1821  KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team );
1822  KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
1823  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
1824  parent_team, team->t.t_task_team, team ) );
1825  master_th->th.th_task_team = team->t.t_task_team;
1826 #if !KMP_NESTED_HOT_TEAMS
1827  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
1828 #endif
1829  }
1830 
1831  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
1832  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
1833  KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
1834  ( team->t.t_master_tid == 0 &&
1835  ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
1836  KMP_MB();
1837 
1838  /* now, setup the arguments */
1839  argv = (void**)team->t.t_argv;
1840 #if OMP_40_ENABLED
1841  if ( ap ) {
1842 #endif /* OMP_40_ENABLED */
1843  for ( i=argc-1; i >= 0; --i )
1844 // TODO: revert workaround for Intel(R) 64 tracker #96
1845 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX
1846  *argv++ = va_arg( *ap, void * );
1847 #else
1848  *argv++ = va_arg( ap, void * );
1849 #endif
1850 #if OMP_40_ENABLED
1851  } else {
1852  for ( i=0; i < argc; ++i )
1853  // Get args from parent team for teams construct
1854  argv[i] = team->t.t_parent->t.t_argv[i];
1855  }
1856 #endif /* OMP_40_ENABLED */
1857 
1858  /* now actually fork the threads */
1859  team->t.t_master_active = master_active;
1860  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
1861  root->r.r_active = TRUE;
1862 
1863  __kmp_fork_team_threads( root, team, master_th, gtid );
1864  __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
1865 
1866 
1867  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1868 
1869 
1870 #if USE_ITT_BUILD
1871  // Mark the start of the "parallel" region for VTune. Only one frame notification scheme is used at a time.
1872  if ((__itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) || KMP_ITT_DEBUG)
1873 # if OMP_40_ENABLED
1874  if (!master_th->th.th_teams_microtask || microtask == (microtask_t)__kmp_teams_master)
1875  // Either not in teams or the outer fork of the teams construct
1876 # endif /* OMP_40_ENABLED */
1877  {
1878  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
1879  }
1880  kmp_uint64 tmp_time = 0;
1881 #if USE_ITT_NOTIFY
1882  if ( __itt_get_timestamp_ptr )
1883  tmp_time = __itt_get_timestamp();
1884 #endif
1885  if ((__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode==3)|| KMP_ITT_DEBUG)
1886 # if OMP_40_ENABLED
1887  if (!master_th->th.th_teams_microtask || microtask == (microtask_t)__kmp_teams_master)
1888  // Either not in teams or the outer fork of the teams construct
1889 # endif /* OMP_40_ENABLED */
1890  team->t.t_region_time = tmp_time;
1891 
1892  // Internal fork - report frame begin
1893  if ((__kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3) && __itt_frame_submit_v3_ptr ) {
1894  if (!(team->t.t_active_level > 1)) {
1895  master_th->th.th_frame_time = tmp_time;
1896  }
1897  }
1898 #endif /* USE_ITT_BUILD */
1899 
1900  /* now go on and do the work */
1901  KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
1902  KMP_MB();
1903  KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
1904  root, team, master_th, gtid));
1905 
1906 #if USE_ITT_BUILD
1907  if ( __itt_stack_caller_create_ptr ) {
1908  team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
1909  }
1910 #endif /* USE_ITT_BUILD */
1911 
1912 #if OMP_40_ENABLED
1913  if ( ap ) // AC: skip __kmp_internal_fork for the teams construct; let only the master threads execute
1914 #endif /* OMP_40_ENABLED */
1915  {
1916  __kmp_internal_fork( loc, gtid, team );
1917  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
1918  root, team, master_th, gtid));
1919  }
1920 
1921  if (call_context == fork_context_gnu) {
1922  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1923  return TRUE;
1924  }
1925 
1926  /* Invoke microtask for MASTER thread */
1927  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1928  gtid, team->t.t_id, team->t.t_pkfn ) );
1929  } // END of timer KMP_fork_call block
1930 
1931  {
1932  //KMP_TIME_BLOCK(OMP_work);
1933  KMP_TIME_BLOCK(USER_master_invoke);
1934  if (! team->t.t_invoke( gtid )) {
1935  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1936  }
1937  }
1938  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1939  gtid, team->t.t_id, team->t.t_pkfn ) );
1940  KMP_MB(); /* Flush all pending memory write invalidates. */
1941 
1942  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1943 
1944  return TRUE;
1945 }
1946 
1947 void
1948 __kmp_join_call(ident_t *loc, int gtid
1949 #if OMP_40_ENABLED
1950  , int exit_teams
1951 #endif /* OMP_40_ENABLED */
1952 )
1953 {
1954  KMP_TIME_BLOCK(KMP_join_call);
1955  kmp_team_t *team;
1956  kmp_team_t *parent_team;
1957  kmp_info_t *master_th;
1958  kmp_root_t *root;
1959  int master_active;
1960  int i;
1961 
1962  KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
1963 
1964  /* setup current data */
1965  master_th = __kmp_threads[ gtid ];
1966  root = master_th->th.th_root;
1967  team = master_th->th.th_team;
1968  parent_team = team->t.t_parent;
1969 
1970  master_th->th.th_ident = loc;
1971 
1972 #if KMP_DEBUG
1973  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1974  KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
1975  __kmp_gtid_from_thread( master_th ), team,
1976  team->t.t_task_team, master_th->th.th_task_team) );
1977  KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team );
1978  }
1979 #endif
1980 
1981  if( team->t.t_serialized ) {
1982 #if OMP_40_ENABLED
1983  if ( master_th->th.th_teams_microtask ) {
1984  // We are in teams construct
1985  int level = team->t.t_level;
1986  int tlevel = master_th->th.th_teams_level;
1987  if ( level == tlevel ) {
1988  // AC: we haven't incremented it earlier at start of teams construct,
1989  // so do it here - at the end of teams construct
1990  team->t.t_level++;
1991  } else if ( level == tlevel + 1 ) {
1992  // AC: we are exiting parallel inside teams, need to increment serialization
1993  // in order to restore it in the next call to __kmpc_end_serialized_parallel
1994  team->t.t_serialized++;
1995  }
1996  }
1997 #endif /* OMP_40_ENABLED */
1998  __kmpc_end_serialized_parallel( loc, gtid );
1999  return;
2000  }
2001 
2002  master_active = team->t.t_master_active;
2003 
2004 #if OMP_40_ENABLED
2005  if (!exit_teams)
2006 #endif /* OMP_40_ENABLED */
2007  {
2008  // AC: No barrier for the internal teams at exit from the teams construct,
2009  // but there is a barrier for the external team (league).
2010  __kmp_internal_join( loc, gtid, team );
2011  }
2012  KMP_MB();
2013 
2014 #if USE_ITT_BUILD
2015  if ( __itt_stack_caller_create_ptr ) {
2016  __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2017  }
2018 
2019  // Mark the end of the "parallel" region for VTune. Only one frame notification scheme is used at a time.
2020  if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
2021 # if OMP_40_ENABLED
2022  if ( !master_th->th.th_teams_microtask /* not in teams */ ||
2023  ( !exit_teams && team->t.t_level == master_th->th.th_teams_level ) )
2024  // Either not in teams or exiting teams region
2025  // (teams is a frame and no other frames inside the teams)
2026 # endif /* OMP_40_ENABLED */
2027  {
2028  master_th->th.th_ident = loc;
2029  __kmp_itt_region_joined( gtid );
2030  }
2031  if ( ( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode == 3 ) || KMP_ITT_DEBUG )
2032 # if OMP_40_ENABLED
2033  if ( !master_th->th.th_teams_microtask /* not in teams */ ||
2034  ( !exit_teams && team->t.t_level == master_th->th.th_teams_level ) )
2035  // Either not in teams or exiting teams region
2036  // (teams is a frame and no other frames inside the teams)
2037 # endif /* OMP_40_ENABLED */
2038  {
2039  master_th->th.th_ident = loc;
2040  __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time, 0, loc, master_th->th.th_team_nproc, 1 );
2041  }
2042 #endif /* USE_ITT_BUILD */
2043 
2044 #if OMP_40_ENABLED
2045  if ( master_th->th.th_teams_microtask &&
2046  !exit_teams &&
2047  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2048  team->t.t_level == master_th->th.th_teams_level + 1 ) {
2049  // AC: We need to leave the team structure intact at the end
2050  // of a parallel region inside the teams construct, so that the next
2051  // parallel region reuses the same (hot) team; only adjust the nesting levels
2052 
2053  /* Decrement our nested depth level */
2054  team->t.t_level --;
2055  team->t.t_active_level --;
2056  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2057 
2058  /* Restore number of threads in the team if needed */
2059  if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2060  int old_num = master_th->th.th_team_nproc;
2061  int new_num = master_th->th.th_teams_size.nth;
2062  kmp_info_t **other_threads = team->t.t_threads;
2063  team->t.t_nproc = new_num;
2064  for ( i = 0; i < old_num; ++i ) {
2065  other_threads[i]->th.th_team_nproc = new_num;
2066  }
2067  // Adjust the states of the unused threads of the team
2068  for ( i = old_num; i < new_num; ++i ) {
2069  // Re-initialize thread's barrier data.
2070  int b;
2071  kmp_balign_t * balign = other_threads[i]->th.th_bar;
2072  for ( b = 0; b < bs_last_barrier; ++ b ) {
2073  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
2074  KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2075  }
2076  // Synchronize thread's task state
2077  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2078  }
2079  }
2080  return;
2081  }
2082 #endif /* OMP_40_ENABLED */
2083 
2084  /* do cleanup and restore the parent team */
2085  master_th->th.th_info .ds.ds_tid = team->t.t_master_tid;
2086  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2087 
2088  master_th->th.th_dispatch =
2089  & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2090 
2091  /* jc: The following lock has instructions with REL and ACQ semantics,
2092  separating the parallel user code called in this parallel region
2093  from the serial user code called after this function returns.
2094  */
2095  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2096 
2097 #if OMP_40_ENABLED
2098  if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2099 #endif /* OMP_40_ENABLED */
2100  {
2101  /* Decrement our nested depth level */
2102  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2103  }
2104  KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2105 
2106  KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2107  0, master_th, team ) );
2108  __kmp_pop_current_task_from_thread( master_th );
2109 
2110 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2111  //
2112  // Restore master thread's partition.
2113  //
2114  master_th->th.th_first_place = team->t.t_first_place;
2115  master_th->th.th_last_place = team->t.t_last_place;
2116 #endif /* OMP_40_ENABLED */
2117 
2118  updateHWFPControl (team);
2119 
2120  if ( root->r.r_active != master_active )
2121  root->r.r_active = master_active;
2122 
2123  __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2124 
2125  /* this race was fun to find. make sure the following is in the critical
2126  * region otherwise assertions may fail occasionally since the old team
2127  * may be reallocated and the hierarchy appears inconsistent. it is
2128  * actually safe to run and won't cause any bugs, but will cause those
2129  * assertion failures. it's only one deref&assign so might as well put this
2130  * in the critical region */
2131  master_th->th.th_team = parent_team;
2132  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2133  master_th->th.th_team_master = parent_team->t.t_threads[0];
2134  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2135 
2136  /* restore serialized team, if need be */
2137  if( parent_team->t.t_serialized &&
2138  parent_team != master_th->th.th_serial_team &&
2139  parent_team != root->r.r_root_team ) {
2140  __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2141  master_th->th.th_serial_team = parent_team;
2142  }
2143 
2144  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2145  //
2146  // Copy the task team from the new child / old parent team
2147  // to the thread. If non-NULL, copy the state flag also.
2148  //
2149  if ( ( master_th->th.th_task_team = parent_team->t.t_task_team ) != NULL ) {
2150  master_th->th.th_task_state = master_th->th.th_task_team->tt.tt_state;
2151  }
2152  KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2153  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2154  parent_team ) );
2155  }
2156 
2157  // TODO: GEH - cannot do this assertion because root thread not set up as executing
2158  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2159  master_th->th.th_current_task->td_flags.executing = 1;
2160 
2161  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2162 
2163  KMP_MB();
2164  KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2165 }
2166 
2167 /* ------------------------------------------------------------------------ */
2168 /* ------------------------------------------------------------------------ */
2169 
2170 /* Check whether we should push an internal control record onto the
2171  serial team stack. If so, do it. */
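/* Only serialized (nested) regions need such a record: the saved ICVs allow changes
   made inside the serialized region (e.g. via omp_set_num_threads()) to be undone
   when the region ends, instead of leaking into the enclosing level. */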
2172 void
2173 __kmp_save_internal_controls ( kmp_info_t * thread )
2174 {
2175 
2176  if ( thread->th.th_team != thread->th.th_serial_team ) {
2177  return;
2178  }
2179  if (thread->th.th_team->t.t_serialized > 1) {
2180  int push = 0;
2181 
2182  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2183  push = 1;
2184  } else {
2185  if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2186  thread->th.th_team->t.t_serialized ) {
2187  push = 1;
2188  }
2189  }
2190  if (push) { /* push a record on the serial team's stack */
2191  kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2192 
2193  copy_icvs( control, & thread->th.th_current_task->td_icvs );
2194 
2195  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2196 
2197  control->next = thread->th.th_team->t.t_control_stack_top;
2198  thread->th.th_team->t.t_control_stack_top = control;
2199  }
2200  }
2201 }
2202 
2203 /* Changes set_nproc */
2204 void
2205 __kmp_set_num_threads( int new_nth, int gtid )
2206 {
2207  kmp_info_t *thread;
2208  kmp_root_t *root;
2209 
2210  KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2211  KMP_DEBUG_ASSERT( __kmp_init_serial );
2212 
2213  if (new_nth < 1)
2214  new_nth = 1;
2215  else if (new_nth > __kmp_max_nth)
2216  new_nth = __kmp_max_nth;
2217 
2218  thread = __kmp_threads[gtid];
2219 
2220  __kmp_save_internal_controls( thread );
2221 
2222  set__nproc( thread, new_nth );
2223 
2224  //
2225  // If this omp_set_num_threads() call will cause the hot team size to be
2226  // reduced (in the absence of a num_threads clause), then reduce it now,
2227  // rather than waiting for the next parallel region.
2228  //
2229  root = thread->th.th_root;
2230  if ( __kmp_init_parallel && ( ! root->r.r_active )
2231  && ( root->r.r_hot_team->t.t_nproc > new_nth )
2232 #if KMP_NESTED_HOT_TEAMS
2233  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2234 #endif
2235  ) {
2236  kmp_team_t *hot_team = root->r.r_hot_team;
2237  int f;
2238 
2239  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2240 
2241 
2242  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2243  kmp_task_team_t *task_team = hot_team->t.t_task_team;
2244  if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
2245  //
2246  // Signal the worker threads (esp. the extra ones) to stop
2247  // looking for tasks while spin waiting. The task teams
2248  // are reference counted and will be deallocated by the
2249  // last worker thread.
2250  //
2251  KMP_DEBUG_ASSERT( hot_team->t.t_nproc > 1 );
2252  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
2253  KMP_MB();
2254 
2255  KA_TRACE( 20, ( "__kmp_set_num_threads: setting task_team %p to NULL\n",
2256  &hot_team->t.t_task_team ) );
2257  hot_team->t.t_task_team = NULL;
2258  }
2259  else {
2260  KMP_DEBUG_ASSERT( task_team == NULL );
2261  }
2262  }
2263 
2264  //
2265  // Release the extra threads we don't need any more.
2266  //
2267  for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
2268  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2269  __kmp_free_thread( hot_team->t.t_threads[f] );
2270  hot_team->t.t_threads[f] = NULL;
2271  }
2272  hot_team->t.t_nproc = new_nth;
2273 #if KMP_NESTED_HOT_TEAMS
2274  if( thread->th.th_hot_teams ) {
2275  KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2276  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2277  }
2278 #endif
2279 
2280 
2281  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2282 
2283  //
2284  // Update the t_nproc field in the threads that are still active.
2285  //
2286  for( f=0 ; f < new_nth; f++ ) {
2287  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2288  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2289  }
2290 #if KMP_MIC
2291  // Special flag to mark that the size changed due to an omp_set_num_threads() call
2292  hot_team->t.t_size_changed = -1;
2293 #endif
2294  }
2295 
2296 }
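// __kmp_set_num_threads is the internal back end of omp_set_num_threads(). Illustrative
// user code:
//   omp_set_num_threads(4);   // updates the nproc ICV of the calling thread
//   #pragma omp parallel      // the next parallel region forks at most 4 threads
// The clamping to [1, __kmp_max_nth] and the hot-team shrinking happen above.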
2297 
2298 /* Changes max_active_levels */
2299 void
2300 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2301 {
2302  kmp_info_t *thread;
2303 
2304  KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2305  KMP_DEBUG_ASSERT( __kmp_init_serial );
2306 
2307  // validate max_active_levels
2308  if( max_active_levels < 0 ) {
2309  KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2310  // We ignore this call if the user has specified a negative value.
2311  // The current setting won't be changed. The last valid setting will be used.
2312  // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2313  KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2314  return;
2315  }
2316  if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2317  // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2318  // We allow a zero value. (implementation defined behavior)
2319  } else {
2320  KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
2321  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2322  // Current upper limit is MAX_INT. (implementation defined behavior)
2323  // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
2324  // Actually, the flow should never get here as long as the upper limit is MAX_INT.
2325  }
2326  KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2327 
2328  thread = __kmp_threads[ gtid ];
2329 
2330  __kmp_save_internal_controls( thread );
2331 
2332  set__max_active_levels( thread, max_active_levels );
2333 
2334 }
2335 
2336 /* Gets max_active_levels */
2337 int
2338 __kmp_get_max_active_levels( int gtid )
2339 {
2340  kmp_info_t *thread;
2341 
2342  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2343  KMP_DEBUG_ASSERT( __kmp_init_serial );
2344 
2345  thread = __kmp_threads[ gtid ];
2346  KMP_DEBUG_ASSERT( thread->th.th_current_task );
2347  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2348  gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2349  return thread->th.th_current_task->td_icvs.max_active_levels;
2350 }
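// The two routines above back omp_set_max_active_levels() / omp_get_max_active_levels();
// the value lives in the current task's ICVs, so it follows the usual per-thread ICV
// inheritance rules for nested regions.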
2351 
2352 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2353 void
2354 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2355 {
2356  kmp_info_t *thread;
2357 // kmp_team_t *team;
2358 
2359  KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2360  KMP_DEBUG_ASSERT( __kmp_init_serial );
2361 
2362  // Check if the kind parameter is valid, correct if needed.
2363  // Valid parameters should fit in one of two intervals - standard or extended:
2364  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2365  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2366  if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2367  ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2368  {
2369  // TODO: Hint needs attention in case we change the default schedule.
2370  __kmp_msg(
2371  kmp_ms_warning,
2372  KMP_MSG( ScheduleKindOutOfRange, kind ),
2373  KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2374  __kmp_msg_null
2375  );
2376  kind = kmp_sched_default;
2377  chunk = 0; // ignore chunk value in case of bad kind
2378  }
2379 
2380  thread = __kmp_threads[ gtid ];
2381 
2382  __kmp_save_internal_controls( thread );
2383 
2384  if ( kind < kmp_sched_upper_std ) {
2385  if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
2386  // distinguish static chunked vs. unchunked:
2387  // chunk should be invalid to indicate an unchunked schedule (which is the default)
2388  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2389  } else {
2390  thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2391  }
2392  } else {
2393  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2394  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2395  __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2396  }
2397  if ( kind == kmp_sched_auto ) {
2398  // ignore parameter chunk for schedule auto
2399  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2400  } else {
2401  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2402  }
2403 }
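// Backs omp_set_schedule(); kmp_sched_t mirrors omp_sched_t. Illustrative call:
//   omp_set_schedule(omp_sched_dynamic, 100);
// ends up here as __kmp_set_schedule(gtid, kmp_sched_dynamic, 100) and stores
// kmp_sch_dynamic_chunked with chunk 100 into the run-time schedule ICV.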
2404 
2405 /* Gets def_sched_var ICV values */
2406 void
2407 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2408 {
2409  kmp_info_t *thread;
2410  enum sched_type th_type;
2411  int i;
2412 
2413  KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2414  KMP_DEBUG_ASSERT( __kmp_init_serial );
2415 
2416  thread = __kmp_threads[ gtid ];
2417 
2418  //th_type = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type;
2419  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2420 
2421  switch ( th_type ) {
2422  case kmp_sch_static:
2423  case kmp_sch_static_greedy:
2424  case kmp_sch_static_balanced:
2425  *kind = kmp_sched_static;
2426  *chunk = 0; // chunk was not set; signal this with a zero value
2427  return;
2428  case kmp_sch_static_chunked:
2429  *kind = kmp_sched_static;
2430  break;
2431  case kmp_sch_dynamic_chunked:
2432  *kind = kmp_sched_dynamic;
2433  break;
2434  case kmp_sch_guided_chunked:
2435  case kmp_sch_guided_iterative_chunked:
2436  case kmp_sch_guided_analytical_chunked:
2437  *kind = kmp_sched_guided;
2438  break;
2439  case kmp_sch_auto:
2440  *kind = kmp_sched_auto;
2441  break;
2442  case kmp_sch_trapezoidal:
2443  *kind = kmp_sched_trapezoidal;
2444  break;
2445 /*
2446  case kmp_sch_static_steal:
2447  *kind = kmp_sched_static_steal;
2448  break;
2449 */
2450  default:
2451  KMP_FATAL( UnknownSchedulingType, th_type );
2452  }
2453 
2454  //*chunk = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk;
2455  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2456 }
2457 
2458 int
2459 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2460 
2461  int ii, dd;
2462  kmp_team_t *team;
2463  kmp_info_t *thr;
2464 
2465  KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2466  KMP_DEBUG_ASSERT( __kmp_init_serial );
2467 
2468  // validate level
2469  if( level == 0 ) return 0;
2470  if( level < 0 ) return -1;
2471  thr = __kmp_threads[ gtid ];
2472  team = thr->th.th_team;
2473  ii = team->t.t_level;
2474  if( level > ii ) return -1;
2475 
2476 #if OMP_40_ENABLED
2477  if( thr->th.th_teams_microtask ) {
2478  // AC: we are in teams region where multiple nested teams have same level
2479  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2480  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2481  KMP_DEBUG_ASSERT( ii >= tlevel );
2482  // AC: Since we need to step over the teams league, artificially increase ii
2483  if ( ii == tlevel ) {
2484  ii += 2; // three teams have same level
2485  } else {
2486  ii ++; // two teams have same level
2487  }
2488  }
2489  }
2490 #endif
2491 
2492  if( ii == level ) return __kmp_tid_from_gtid( gtid );
2493 
2494  dd = team->t.t_serialized;
2495  level++;
2496  while( ii > level )
2497  {
2498  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2499  {
2500  }
2501  if( ( team->t.t_serialized ) && ( !dd ) ) {
2502  team = team->t.t_parent;
2503  continue;
2504  }
2505  if( ii > level ) {
2506  team = team->t.t_parent;
2507  dd = team->t.t_serialized;
2508  ii--;
2509  }
2510  }
2511 
2512  return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2513 }
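// Back end of omp_get_ancestor_thread_num(level): the loop above walks up the team
// hierarchy from the innermost team, counting each serialized nesting (t_serialized)
// as one level, until it reaches the requested level and returns the thread number
// of the ancestor at that level (0 if that level corresponds to a serialized region).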
2514 
2515 int
2516 __kmp_get_team_size( int gtid, int level ) {
2517 
2518  int ii, dd;
2519  kmp_team_t *team;
2520  kmp_info_t *thr;
2521 
2522  KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2523  KMP_DEBUG_ASSERT( __kmp_init_serial );
2524 
2525  // validate level
2526  if( level == 0 ) return 1;
2527  if( level < 0 ) return -1;
2528  thr = __kmp_threads[ gtid ];
2529  team = thr->th.th_team;
2530  ii = team->t.t_level;
2531  if( level > ii ) return -1;
2532 
2533 #if OMP_40_ENABLED
2534  if( thr->th.th_teams_microtask ) {
2535  // AC: we are in teams region where multiple nested teams have same level
2536  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2537  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2538  KMP_DEBUG_ASSERT( ii >= tlevel );
2539  // AC: Since we need to step over the teams league, artificially increase ii
2540  if ( ii == tlevel ) {
2541  ii += 2; // three teams have same level
2542  } else {
2543  ii ++; // two teams have same level
2544  }
2545  }
2546  }
2547 #endif
2548 
2549  while( ii > level )
2550  {
2551  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2552  {
2553  }
2554  if( team->t.t_serialized && ( !dd ) ) {
2555  team = team->t.t_parent;
2556  continue;
2557  }
2558  if( ii > level ) {
2559  team = team->t.t_parent;
2560  ii--;
2561  }
2562  }
2563 
2564  return team->t.t_nproc;
2565 }
2566 
2567 kmp_r_sched_t
2568 __kmp_get_schedule_global() {
2569 // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
2570 // may be changed independently by kmp_set_defaults, so the up-to-date schedule can be obtained here.
2571 
2572  kmp_r_sched_t r_sched;
2573 
2574  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2575  // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
2576  // and thus have different run-time schedules in different roots (even in OMP 2.5)
2577  if ( __kmp_sched == kmp_sch_static ) {
2578  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2579  } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2580  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2581  } else {
2582  r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2583  }
2584 
2585  if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was not ever set)
2586  r_sched.chunk = KMP_DEFAULT_CHUNK;
2587  } else {
2588  r_sched.chunk = __kmp_chunk;
2589  }
2590 
2591  return r_sched;
2592 }
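// Illustrative effect, assuming OMP_SCHEDULE="guided,4": the settings code records
// __kmp_sched = kmp_sch_guided_chunked and __kmp_chunk = 4, so this routine returns
// { __kmp_guided, 4 }; if the chunk was never set, it falls back to KMP_DEFAULT_CHUNK.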
2593 
2594 /* ------------------------------------------------------------------------ */
2595 /* ------------------------------------------------------------------------ */
2596 
2597 
2598 /*
2599  * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2600  * at least argc *t_argv entries for the requested team.
2601  */
2602 static void
2603 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2604 {
2605 
2606  KMP_DEBUG_ASSERT( team );
2607  if( !realloc || argc > team->t.t_max_argc ) {
2608 
2609  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2610  team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2611  /* if previously allocated heap space for args, free them */
2612  if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2613  __kmp_free( (void *) team->t.t_argv );
2614 
2615  if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2616  /* use unused space in the cache line for arguments */
2617  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2618  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2619  team->t.t_id, team->t.t_max_argc ));
2620  team->t.t_argv = &team->t.t_inline_argv[0];
2621  if ( __kmp_storage_map ) {
2622  __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2623  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2624  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2625  "team_%d.t_inline_argv",
2626  team->t.t_id );
2627  }
2628  } else {
2629  /* allocate space for arguments in the heap */
2630  team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2631  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2632  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2633  team->t.t_id, team->t.t_max_argc ));
2634  team->t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2635  if ( __kmp_storage_map ) {
2636  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2637  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2638  team->t.t_id );
2639  }
2640  }
2641  }
2642 }
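// Sizing policy recap: argc <= KMP_INLINE_ARGV_ENTRIES reuses the spare space inside
// the team structure (no allocation); a larger argc gets a heap array of at least
// KMP_MIN_MALLOC_ARGV_ENTRIES entries, or 2*argc, so repeated small growth does not
// reallocate on every fork.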
2643 
2644 static void
2645 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
2646 {
2647  int i;
2648  int num_disp_buff = max_nth > 1 ? KMP_MAX_DISP_BUF : 2;
2649 #if KMP_USE_POOLED_ALLOC
2650  // AC: TODO: fix bug here: size of t_disp_buffer should not be multiplied by max_nth!
2651  char *ptr = __kmp_allocate(max_nth *
2652  ( sizeof(kmp_info_t*) + sizeof(dispatch_shared_info_t)*num_disp_buff
2653  + sizeof(kmp_disp_t) + sizeof(int)*6
2654  //+ sizeof(int)
2655  + sizeof(kmp_r_sched_t)
2656  + sizeof(kmp_taskdata_t) ) );
2657 
2658  team->t.t_threads = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth;
2659  team->t.t_disp_buffer = (dispatch_shared_info_t*) ptr;
2660  ptr += sizeof(dispatch_shared_info_t) * num_disp_buff;
2661  team->t.t_dispatch = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth;
2662  team->t.t_set_nproc = (int*) ptr; ptr += sizeof(int) * max_nth;
2663  team->t.t_set_dynamic = (int*) ptr; ptr += sizeof(int) * max_nth;
2664  team->t.t_set_nested = (int*) ptr; ptr += sizeof(int) * max_nth;
2665  team->t.t_set_blocktime = (int*) ptr; ptr += sizeof(int) * max_nth;
2666  team->t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth;
2667  team->t.t_set_bt_set = (int*) ptr;
2668  ptr += sizeof(int) * max_nth;
2669  //team->t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth;
2670  team->t.t_set_sched = (kmp_r_sched_t*) ptr;
2671  ptr += sizeof(kmp_r_sched_t) * max_nth;
2672  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr;
2673  ptr += sizeof(kmp_taskdata_t) * max_nth;
2674 #else
2675 
2676  team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
2677  team->t.t_disp_buffer = (dispatch_shared_info_t*)
2678  __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
2679  team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
2680  //team->t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth );
2681  //team->t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth );
2682  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
2683 #endif
2684  team->t.t_max_nproc = max_nth;
2685 
2686  /* setup dispatch buffers */
2687  for(i = 0 ; i < num_disp_buff; ++i)
2688  team->t.t_disp_buffer[i].buffer_index = i;
2689 }
2690 
2691 static void
2692 __kmp_free_team_arrays(kmp_team_t *team) {
2693  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
2694  int i;
2695  for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
2696  if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
2697  __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
2698  team->t.t_dispatch[ i ].th_disp_buffer = NULL;
2699  }; // if
2700  }; // for
2701  __kmp_free(team->t.t_threads);
2702  #if !KMP_USE_POOLED_ALLOC
2703  __kmp_free(team->t.t_disp_buffer);
2704  __kmp_free(team->t.t_dispatch);
2705  //__kmp_free(team->t.t_set_max_active_levels);
2706  //__kmp_free(team->t.t_set_sched);
2707  __kmp_free(team->t.t_implicit_task_taskdata);
2708  #endif
2709  team->t.t_threads = NULL;
2710  team->t.t_disp_buffer = NULL;
2711  team->t.t_dispatch = NULL;
2712  //team->t.t_set_sched = 0;
2713  //team->t.t_set_max_active_levels = 0;
2714  team->t.t_implicit_task_taskdata = 0;
2715 }
2716 
2717 static void
2718 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
2719  kmp_info_t **oldThreads = team->t.t_threads;
2720 
2721  #if !KMP_USE_POOLED_ALLOC
2722  __kmp_free(team->t.t_disp_buffer);
2723  __kmp_free(team->t.t_dispatch);
2724  //__kmp_free(team->t.t_set_max_active_levels);
2725  //__kmp_free(team->t.t_set_sched);
2726  __kmp_free(team->t.t_implicit_task_taskdata);
2727  #endif
2728  __kmp_allocate_team_arrays(team, max_nth);
2729 
2730  memcpy(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
2731 
2732  __kmp_free(oldThreads);
2733 }
2734 
2735 static kmp_internal_control_t
2736 __kmp_get_global_icvs( void ) {
2737 
2738  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
2739 
2740 #if OMP_40_ENABLED
2741  KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
2742 #endif /* OMP_40_ENABLED */
2743 
2744  kmp_internal_control_t g_icvs = {
2745  0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
2746  (kmp_int8)__kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
2747  (kmp_int8)__kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
2748  (kmp_int8)__kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
2749  __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
2750  __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
2751  __kmp_dflt_team_nth, //int nproc; //internal control for # of threads for next parallel region (per thread)
2752  // (use a max ub on value if __kmp_parallel_initialize not called yet)
2753  __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
2754  r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
2755 #if OMP_40_ENABLED
2756  __kmp_nested_proc_bind.bind_types[0],
2757 #endif /* OMP_40_ENABLED */
2758  NULL //struct kmp_internal_control *next;
2759  };
2760 
2761  return g_icvs;
2762 }
2763 
2764 static kmp_internal_control_t
2765 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
2766 
2767  kmp_internal_control_t gx_icvs;
2768  gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls
2769  copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
2770  gx_icvs.next = NULL;
2771 
2772  return gx_icvs;
2773 }
2774 
2775 static void
2776 __kmp_initialize_root( kmp_root_t *root )
2777 {
2778  int f;
2779  kmp_team_t *root_team;
2780  kmp_team_t *hot_team;
2781  size_t disp_size, dispatch_size, bar_size;
2782  int hot_team_max_nth;
2783  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
2784  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
2785  KMP_DEBUG_ASSERT( root );
2786  KMP_ASSERT( ! root->r.r_begin );
2787 
2788  /* setup the root state structure */
2789  __kmp_init_lock( &root->r.r_begin_lock );
2790  root->r.r_begin = FALSE;
2791  root->r.r_active = FALSE;
2792  root->r.r_in_parallel = 0;
2793  root->r.r_blocktime = __kmp_dflt_blocktime;
2794  root->r.r_nested = __kmp_dflt_nested;
2795 
2796  /* setup the root team for this task */
2797  /* allocate the root team structure */
2798  KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
2799  root_team =
2800  __kmp_allocate_team(
2801  root,
2802  1, // new_nproc
2803  1, // max_nproc
2804 #if OMP_40_ENABLED
2805  __kmp_nested_proc_bind.bind_types[0],
2806 #endif
2807  &r_icvs,
2808  0 // argc
2809  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
2810  );
2811 
2812  KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
2813 
2814  root->r.r_root_team = root_team;
2815  root_team->t.t_control_stack_top = NULL;
2816 
2817  /* initialize root team */
2818  root_team->t.t_threads[0] = NULL;
2819  root_team->t.t_nproc = 1;
2820  root_team->t.t_serialized = 1;
2821  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
2822  root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
2823  root_team->t.t_sched.chunk = r_sched.chunk;
2824  KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
2825  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
2826 
2827  /* setup the hot team for this task */
2828  /* allocate the hot team structure */
2829  KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
2830  hot_team =
2831  __kmp_allocate_team(
2832  root,
2833  1, // new_nproc
2834  __kmp_dflt_team_nth_ub * 2, // max_nproc
2835 #if OMP_40_ENABLED
2836  __kmp_nested_proc_bind.bind_types[0],
2837 #endif
2838  &r_icvs,
2839  0 // argc
2840  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
2841  );
2842  KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
2843 
2844  root->r.r_hot_team = hot_team;
2845  root_team->t.t_control_stack_top = NULL;
2846 
2847  /* first-time initialization */
2848  hot_team->t.t_parent = root_team;
2849 
2850  /* initialize hot team */
2851  hot_team_max_nth = hot_team->t.t_max_nproc;
2852  for ( f = 0; f < hot_team_max_nth; ++ f ) {
2853  hot_team->t.t_threads[ f ] = NULL;
2854  }; // for
2855  hot_team->t.t_nproc = 1;
2856  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
2857  hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
2858  hot_team->t.t_sched.chunk = r_sched.chunk;
2859 #if KMP_MIC
2860  hot_team->t.t_size_changed = 0;
2861 #endif
2862 
2863 }
2864 
2865 #ifdef KMP_DEBUG
2866 
2867 
2868 typedef struct kmp_team_list_item {
2869  kmp_team_p const * entry;
2870  struct kmp_team_list_item * next;
2871 } kmp_team_list_item_t;
2872 typedef kmp_team_list_item_t * kmp_team_list_t;
2873 
2874 
2875 static void
2876 __kmp_print_structure_team_accum( // Add team to list of teams.
2877  kmp_team_list_t list, // List of teams.
2878  kmp_team_p const * team // Team to add.
2879 ) {
2880 
2881  // List must terminate with item where both entry and next are NULL.
2882  // Team is added to the list only once.
2883  // List is sorted in ascending order by team id.
2884  // Team id is *not* a key.
2885 
2886  kmp_team_list_t l;
2887 
2888  KMP_DEBUG_ASSERT( list != NULL );
2889  if ( team == NULL ) {
2890  return;
2891  }; // if
2892 
2893  __kmp_print_structure_team_accum( list, team->t.t_parent );
2894  __kmp_print_structure_team_accum( list, team->t.t_next_pool );
2895 
2896  // Search list for the team.
2897  l = list;
2898  while ( l->next != NULL && l->entry != team ) {
2899  l = l->next;
2900  }; // while
2901  if ( l->next != NULL ) {
2902  return; // Team has been added before, exit.
2903  }; // if
2904 
2905  // Team is not found. Search list again for insertion point.
2906  l = list;
2907  while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
2908  l = l->next;
2909  }; // while
2910 
2911  // Insert team.
2912  {
2913  kmp_team_list_item_t * item =
2914  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
2915  * item = * l;
2916  l->entry = team;
2917  l->next = item;
2918  }
2919 
2920 }
2921 
2922 static void
2923 __kmp_print_structure_team(
2924  char const * title,
2925  kmp_team_p const * team
2926 
2927 ) {
2928  __kmp_printf( "%s", title );
2929  if ( team != NULL ) {
2930  __kmp_printf( "%2x %p\n", team->t.t_id, team );
2931  } else {
2932  __kmp_printf( " - (nil)\n" );
2933  }; // if
2934 }
2935 
2936 static void
2937 __kmp_print_structure_thread(
2938  char const * title,
2939  kmp_info_p const * thread
2940 
2941 ) {
2942  __kmp_printf( "%s", title );
2943  if ( thread != NULL ) {
2944  __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
2945  } else {
2946  __kmp_printf( " - (nil)\n" );
2947  }; // if
2948 }
2949 
2950 void
2951 __kmp_print_structure(
2952  void
2953 ) {
2954 
2955  kmp_team_list_t list;
2956 
2957  // Initialize list of teams.
2958  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
2959  list->entry = NULL;
2960  list->next = NULL;
2961 
2962  __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
2963  {
2964  int gtid;
2965  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
2966  __kmp_printf( "%2d", gtid );
2967  if ( __kmp_threads != NULL ) {
2968  __kmp_printf( " %p", __kmp_threads[ gtid ] );
2969  }; // if
2970  if ( __kmp_root != NULL ) {
2971  __kmp_printf( " %p", __kmp_root[ gtid ] );
2972  }; // if
2973  __kmp_printf( "\n" );
2974  }; // for gtid
2975  }
2976 
2977  // Print out __kmp_threads array.
2978  __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
2979  if ( __kmp_threads != NULL ) {
2980  int gtid;
2981  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
2982  kmp_info_t const * thread = __kmp_threads[ gtid ];
2983  if ( thread != NULL ) {
2984  __kmp_printf( "GTID %2d %p:\n", gtid, thread );
2985  __kmp_printf( " Our Root: %p\n", thread->th.th_root );
2986  __kmp_print_structure_team( " Our Team: ", thread->th.th_team );
2987  __kmp_print_structure_team( " Serial Team: ", thread->th.th_serial_team );
2988  __kmp_printf( " Threads: %2d\n", thread->th.th_team_nproc );
2989  __kmp_print_structure_thread( " Master: ", thread->th.th_team_master );
2990  __kmp_printf( " Serialized?: %2d\n", thread->th.th_team_serialized );
2991  __kmp_printf( " Set NProc: %2d\n", thread->th.th_set_nproc );
2992 #if OMP_40_ENABLED
2993  __kmp_printf( " Set Proc Bind: %2d\n", thread->th.th_set_proc_bind );
2994 #endif
2995  __kmp_print_structure_thread( " Next in pool: ", thread->th.th_next_pool );
2996  __kmp_printf( "\n" );
2997  __kmp_print_structure_team_accum( list, thread->th.th_team );
2998  __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
2999  }; // if
3000  }; // for gtid
3001  } else {
3002  __kmp_printf( "Threads array is not allocated.\n" );
3003  }; // if
3004 
3005  // Print out __kmp_root array.
3006  __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3007  if ( __kmp_root != NULL ) {
3008  int gtid;
3009  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3010  kmp_root_t const * root = __kmp_root[ gtid ];
3011  if ( root != NULL ) {
3012  __kmp_printf( "GTID %2d %p:\n", gtid, root );
3013  __kmp_print_structure_team( " Root Team: ", root->r.r_root_team );
3014  __kmp_print_structure_team( " Hot Team: ", root->r.r_hot_team );
3015  __kmp_print_structure_thread( " Uber Thread: ", root->r.r_uber_thread );
3016  __kmp_printf( " Active?: %2d\n", root->r.r_active );
3017  __kmp_printf( " Nested?: %2d\n", root->r.r_nested );
3018  __kmp_printf( " In Parallel: %2d\n", root->r.r_in_parallel );
3019  __kmp_printf( "\n" );
3020  __kmp_print_structure_team_accum( list, root->r.r_root_team );
3021  __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3022  }; // if
3023  }; // for gtid
3024  } else {
3025  __kmp_printf( "Ubers array is not allocated.\n" );
3026  }; // if
3027 
3028  __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3029  while ( list->next != NULL ) {
3030  kmp_team_p const * team = list->entry;
3031  int i;
3032  __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3033  __kmp_print_structure_team( " Parent Team: ", team->t.t_parent );
3034  __kmp_printf( " Master TID: %2d\n", team->t.t_master_tid );
3035  __kmp_printf( " Max threads: %2d\n", team->t.t_max_nproc );
3036  __kmp_printf( " Levels of serial: %2d\n", team->t.t_serialized );
3037  __kmp_printf( " Number threads: %2d\n", team->t.t_nproc );
3038  for ( i = 0; i < team->t.t_nproc; ++ i ) {
3039  __kmp_printf( " Thread %2d: ", i );
3040  __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3041  }; // for i
3042  __kmp_print_structure_team( " Next in pool: ", team->t.t_next_pool );
3043  __kmp_printf( "\n" );
3044  list = list->next;
3045  }; // while
3046 
3047  // Print out __kmp_thread_pool and __kmp_team_pool.
3048  __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3049  __kmp_print_structure_thread( "Thread pool: ", (kmp_info_t *)__kmp_thread_pool );
3050  __kmp_print_structure_team( "Team pool: ", (kmp_team_t *)__kmp_team_pool );
3051  __kmp_printf( "\n" );
3052 
3053  // Free team list.
3054  while ( list != NULL ) {
3055  kmp_team_list_item_t * item = list;
3056  list = list->next;
3057  KMP_INTERNAL_FREE( item );
3058  }; // while
3059 
3060 }
3061 
3062 #endif
3063 
3064 
3065 //---------------------------------------------------------------------------
3066 // Stuff for per-thread fast random number generator
3067 // Table of primes
3068 
3069 static const unsigned __kmp_primes[] = {
3070  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3071  0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3072  0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3073  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3074  0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3075  0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3076  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3077  0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3078  0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3079  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3080  0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3081  0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3082  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3083  0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3084  0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3085  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3086 };
3087 
3088 //---------------------------------------------------------------------------
3089 // __kmp_get_random: Get a random number using a linear congruential method.
3090 
3091 unsigned short
3092 __kmp_get_random( kmp_info_t * thread )
3093 {
3094  unsigned x = thread->th.th_x;
3095  unsigned short r = x>>16;
3096 
3097  thread->th.th_x = x*thread->th.th_a+1;
3098 
3099  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3100  thread->th.th_info.ds.ds_tid, r) );
3101 
3102  return r;
3103 }
3104 //--------------------------------------------------------
3105 // __kmp_init_random: Initialize a random number generator
3106 
3107 void
3108 __kmp_init_random( kmp_info_t * thread )
3109 {
3110  unsigned seed = thread->th.th_info.ds.ds_tid;
3111 
3112  thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3113  thread->th.th_x = (seed+1)*thread->th.th_a+1;
3114  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3115 }
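// The two routines above implement a per-thread linear congruential generator:
// each thread picks its multiplier A from __kmp_primes based on its tid, the
// state is advanced as x = A*x + 1 (mod 2^32 through unsigned wrap-around), and
// the high 16 bits of the previous state are returned. The following is a
// minimal, self-contained sketch of the same scheme for illustration only; it
// is not part of the runtime and the lcg_* names are hypothetical.

typedef struct { unsigned x, a; } lcg_t;

static void
lcg_init( lcg_t *g, unsigned seed, unsigned const *primes, unsigned nprimes )
{
    g->a = primes[ seed % nprimes ];   // per-thread multiplier, as in __kmp_init_random
    g->x = ( seed + 1 ) * g->a + 1;    // initial state
}

static unsigned short
lcg_next( lcg_t *g )
{
    unsigned short r = (unsigned short)( g->x >> 16 ); // high bits of the old state
    g->x = g->x * g->a + 1;                            // x_{n+1} = A * x_n + 1
    return r;
}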
3116 
3117 
3118 #if KMP_OS_WINDOWS
3119 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3120 static int
3121 __kmp_reclaim_dead_roots(void) {
3122  int i, r = 0;
3123 
3124  for(i = 0; i < __kmp_threads_capacity; ++i) {
3125  if( KMP_UBER_GTID( i ) &&
3126  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3127  !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots that died in a non-active state
3128  r += __kmp_unregister_root_other_thread(i);
3129  }
3130  }
3131  return r;
3132 }
3133 #endif
3134 
3135 /*
3136  This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3137  free entries generated.
3138 
3139  For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3140  already dead.
3141 
3142  On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
3143  update to __kmp_threads_capacity. Array capacity is increased by doubling with clipping to
3144  __kmp_tp_capacity, if threadprivate cache array has been created.
3145  Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3146 
3147  After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3148  of a total of nWish free slots, the function does that expansion. If not, but the clipping value allows
3149  array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3150  Otherwise, nothing is done beyond the possible initial root thread reclamation. However, if nNeed is zero,
3151  a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3152  as many free slots as possible up to nWish.
3153 
3154  If any argument is negative, the behavior is undefined.
3155 */
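/*
   Illustrative worked example of the doubling-with-clipping policy implemented below
   (the numbers are hypothetical): with __kmp_threads_capacity == 32, an effective limit
   __kmp_actual_max_nth == 256 and nWish == 40, the headroom check passes (256 - 32 >= 40),
   minimumRequiredCapacity becomes 72, and the capacity is doubled 32 -> 64 -> 128, stopping
   at the first value >= 72; the arrays are then reallocated to 128 slots and 96 new free
   entries are reported. If doubling would exceed __kmp_actual_max_nth, the capacity is
   clipped to that limit instead.
*/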
3156 static int
3157 __kmp_expand_threads(int nWish, int nNeed) {
3158  int added = 0;
3159  int old_tp_cached;
3160  int __kmp_actual_max_nth;
3161 
3162  if(nNeed > nWish) /* normalize the arguments */
3163  nWish = nNeed;
3164 #if KMP_OS_WINDOWS && !defined GUIDEDLL_EXPORTS
3165 /* only for Windows static library */
3166  /* reclaim array entries for root threads that are already dead */
3167  added = __kmp_reclaim_dead_roots();
3168 
3169  if(nNeed) {
3170  nNeed -= added;
3171  if(nNeed < 0)
3172  nNeed = 0;
3173  }
3174  if(nWish) {
3175  nWish -= added;
3176  if(nWish < 0)
3177  nWish = 0;
3178  }
3179 #endif
3180  if(nWish <= 0)
3181  return added;
3182 
3183  while(1) {
3184  int nTarget;
3185  int minimumRequiredCapacity;
3186  int newCapacity;
3187  kmp_info_t **newThreads;
3188  kmp_root_t **newRoot;
3189 
3190  //
3191  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3192  // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3193  // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3194  // become > __kmp_max_nth in one of two ways:
3195  //
3196  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3197  // may not be reused by another thread, so we may need to increase
3198  // __kmp_threads_capacity to __kmp_max_threads + 1.
3199  //
3200  // 2) New foreign root(s) are encountered. We always register new
3201  // foreign roots. This may cause a smaller # of threads to be
3202  // allocated at subsequent parallel regions, but the worker threads
3203  // hang around (and eventually go to sleep) and need slots in the
3204  // __kmp_threads[] array.
3205  //
3206  // Anyway, that is the reason for moving the check to see if
3207  // __kmp_max_threads was exceeded into __kmp_reserve_threads()
3208  // instead of having it performed here. -BB
3209  //
3210  old_tp_cached = __kmp_tp_cached;
3211  __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3212  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3213 
3214  /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3215  nTarget = nWish;
3216  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3217  /* can't fulfil nWish, so try nNeed */
3218  if(nNeed) {
3219  nTarget = nNeed;
3220  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3221  /* possible expansion too small -- give up */
3222  break;
3223  }
3224  } else {
3225  /* best-effort */
3226  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3227  if(!nTarget) {
3228  /* can't expand at all -- give up */
3229  break;
3230  }
3231  }
3232  }
3233  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3234 
3235  newCapacity = __kmp_threads_capacity;
3236  do{
3237  newCapacity =
3238  newCapacity <= (__kmp_actual_max_nth >> 1) ?
3239  (newCapacity << 1) :
3240  __kmp_actual_max_nth;
3241  } while(newCapacity < minimumRequiredCapacity);
3242  newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3243  newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3244  memcpy(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3245  memcpy(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3246  memset(newThreads + __kmp_threads_capacity, 0,
3247  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3248  memset(newRoot + __kmp_threads_capacity, 0,
3249  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3250 
3251  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3252  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3253  while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3254  cache capacity, so we should deallocate the expanded arrays and try again. This is the first check
3255  of a double-check pair.
3256  */
3257  __kmp_free(newThreads);
3258  continue; /* start over and try again */
3259  }
3260  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3261  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3262  /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3263  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3264  __kmp_free(newThreads);
3265  continue; /* start over and try again */
3266  } else {
3267  /* success */
3268  // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
3269  //
3270  *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3271  *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3272  added += newCapacity - __kmp_threads_capacity;
3273  *(volatile int*)&__kmp_threads_capacity = newCapacity;
3274  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3275  break; /* succeeded, so we can exit the loop */
3276  }
3277  }
3278  return added;
3279 }
3280 
3281 /* register the current thread as a root thread and obtain our gtid */
3282 /* we must have the __kmp_initz_lock held at this point */
3283 /* Argument TRUE only if we are the thread that calls from __kmp_do_serial_initialize() */
3284 int
3285 __kmp_register_root( int initial_thread )
3286 {
3287  kmp_info_t *root_thread;
3288  kmp_root_t *root;
3289  int gtid;
3290  int capacity;
3291  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3292  KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3293  KMP_MB();
3294 
3295 
3296  /*
3297  2007-03-02:
3298 
3299  If the initial thread did not invoke the OpenMP RTL yet, and this thread is not an initial one,
3300  the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
3301  return false (meaning there is at least one empty slot in the __kmp_threads array), but it
3302  is possible that the only free slot is #0, which is reserved for the initial thread and so cannot be
3303  used for this one. The following code works around this bug.
3304 
3305  However, the right solution seems to be to not reserve slot #0 for the initial thread, because:
3306  (1) there is no magic in slot #0,
3307  (2) we cannot detect the initial thread reliably (the first thread which does serial
3308  initialization may not be a real initial thread).
3309  */
3310  capacity = __kmp_threads_capacity;
3311  if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3312  -- capacity;
3313  }; // if
3314 
3315  /* see if there are too many threads */
3316  if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3317  if ( __kmp_tp_cached ) {
3318  __kmp_msg(
3319  kmp_ms_fatal,
3320  KMP_MSG( CantRegisterNewThread ),
3321  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3322  KMP_HNT( PossibleSystemLimitOnThreads ),
3323  __kmp_msg_null
3324  );
3325  }
3326  else {
3327  __kmp_msg(
3328  kmp_ms_fatal,
3329  KMP_MSG( CantRegisterNewThread ),
3330  KMP_HNT( SystemLimitOnThreads ),
3331  __kmp_msg_null
3332  );
3333  }
3334  }; // if
3335 
3336  /* find an available thread slot */
3337  /* Don't reassign the zero slot since we want it to be used only by the initial
3338  thread */
3339  for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3340  ;
3341  KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3342  KMP_ASSERT( gtid < __kmp_threads_capacity );
3343 
3344  /* update global accounting */
3345  __kmp_all_nth ++;
3346  TCW_4(__kmp_nth, __kmp_nth + 1);
3347 
3348  //
3349  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3350  // for low numbers of procs, and method #2 (keyed API call) for higher
3351  // numbers of procs.
3352  //
3353  if ( __kmp_adjust_gtid_mode ) {
3354  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3355  if ( TCR_4(__kmp_gtid_mode) != 2) {
3356  TCW_4(__kmp_gtid_mode, 2);
3357  }
3358  }
3359  else {
3360  if (TCR_4(__kmp_gtid_mode) != 1 ) {
3361  TCW_4(__kmp_gtid_mode, 1);
3362  }
3363  }
3364  }
3365 
3366 #ifdef KMP_ADJUST_BLOCKTIME
3367  /* Adjust blocktime to zero if necessary */
3368  /* Middle initialization might not have occurred yet */
3369  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3370  if ( __kmp_nth > __kmp_avail_proc ) {
3371  __kmp_zero_bt = TRUE;
3372  }
3373  }
3374 #endif /* KMP_ADJUST_BLOCKTIME */
3375 
3376  /* setup this new hierarchy */
3377  if( ! ( root = __kmp_root[gtid] )) {
3378  root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3379  KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3380  }
3381 
3382  __kmp_initialize_root( root );
3383 
3384  /* setup new root thread structure */
3385  if( root->r.r_uber_thread ) {
3386  root_thread = root->r.r_uber_thread;
3387  } else {
3388  root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3389  if ( __kmp_storage_map ) {
3390  __kmp_print_thread_storage_map( root_thread, gtid );
3391  }
3392  root_thread->th.th_info .ds.ds_gtid = gtid;
3393  root_thread->th.th_root = root;
3394  if( __kmp_env_consistency_check ) {
3395  root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3396  }
3397  #if USE_FAST_MEMORY
3398  __kmp_initialize_fast_memory( root_thread );
3399  #endif /* USE_FAST_MEMORY */
3400 
3401  #if KMP_USE_BGET
3402  KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3403  __kmp_initialize_bget( root_thread );
3404  #endif
3405  __kmp_init_random( root_thread ); // Initialize random number generator
3406  }
3407 
3408  /* setup the serial team held in reserve by the root thread */
3409  if( ! root_thread->th.th_serial_team ) {
3410  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3411  KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3412  root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3413 #if OMP_40_ENABLED
3414  proc_bind_default,
3415 #endif
3416  &r_icvs,
3417  0 USE_NESTED_HOT_ARG(NULL) );
3418  }
3419  KMP_ASSERT( root_thread->th.th_serial_team );
3420  KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3421  root_thread->th.th_serial_team ) );
3422 
3423  /* drop root_thread into place */
3424  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3425 
3426  root->r.r_root_team->t.t_threads[0] = root_thread;
3427  root->r.r_hot_team ->t.t_threads[0] = root_thread;
3428  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3429  root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for execution (it is unused for now).
3430  root->r.r_uber_thread = root_thread;
3431 
3432  /* initialize the thread, get it ready to go */
3433  __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3434 
3435  /* prepare the master thread for get_gtid() */
3436  __kmp_gtid_set_specific( gtid );
3437 
3438  __kmp_itt_thread_name( gtid );
3439 
3440  #ifdef KMP_TDATA_GTID
3441  __kmp_gtid = gtid;
3442  #endif
3443  __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3444  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3445  TCW_4(__kmp_init_gtid, TRUE);
3446 
3447  KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3448  gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3449  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3450  KMP_INIT_BARRIER_STATE ) );
3451  { // Initialize barrier data.
3452  int b;
3453  for ( b = 0; b < bs_last_barrier; ++ b ) {
3454  root_thread->th.th_bar[ b ].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3455  }; // for
3456  }
3457  KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3458 
3459 
3460 #if KMP_AFFINITY_SUPPORTED
3461  if ( TCR_4(__kmp_init_middle) ) {
3462  __kmp_affinity_set_init_mask( gtid, TRUE );
3463  }
3464 #endif /* KMP_AFFINITY_SUPPORTED */
3465 
3466  __kmp_root_counter ++;
3467 
3468  KMP_MB();
3469  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3470 
3471  return gtid;
3472 }
3473 
3474 #if KMP_NESTED_HOT_TEAMS
3475 static int
3476 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3477 {
3478  int i, n, nth;
3479  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3480  if( !hot_teams || !hot_teams[level].hot_team ) {
3481  return 0;
3482  }
3483  KMP_DEBUG_ASSERT( level < max_level );
3484  kmp_team_t *team = hot_teams[level].hot_team;
3485  nth = hot_teams[level].hot_team_nth;
3486  n = nth - 1; // master is not freed
3487  if( level < max_level - 1 ) {
3488  for( i = 0; i < nth; ++i ) {
3489  kmp_info_t *th = team->t.t_threads[i];
3490  n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3491  if( i > 0 && th->th.th_hot_teams ) {
3492  __kmp_free( th->th.th_hot_teams );
3493  th->th.th_hot_teams = NULL;
3494  }
3495  }
3496  }
3497  __kmp_free_team( root, team, NULL );
3498  return n;
3499 }
3500 #endif
3501 
3502 /* Resets a root thread and clears its root and hot teams.
3503  Returns the number of __kmp_threads entries directly and indirectly freed.
3504 */
3505 static int
3506 __kmp_reset_root(int gtid, kmp_root_t *root)
3507 {
3508  kmp_team_t * root_team = root->r.r_root_team;
3509  kmp_team_t * hot_team = root->r.r_hot_team;
3510  int n = hot_team->t.t_nproc;
3511  int i;
3512 
3513  KMP_DEBUG_ASSERT( ! root->r.r_active );
3514 
3515  root->r.r_root_team = NULL;
3516  root->r.r_hot_team = NULL;
3517  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before the call
3518  // to __kmp_free_team().
3519  __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3520 #if KMP_NESTED_HOT_TEAMS
3521  if( __kmp_hot_teams_max_level > 1 ) { // need to free nested hot teams and their threads if any
3522  for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3523  kmp_info_t *th = hot_team->t.t_threads[i];
3524  n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3525  if( th->th.th_hot_teams ) {
3526  __kmp_free( th->th.th_hot_teams );
3527  th->th.th_hot_teams = NULL;
3528  }
3529  }
3530  }
3531 #endif
3532  __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3533 
3534  //
3535  // Before we can reap the thread, we need to make certain that all
3536  // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3537  //
3538  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3539  __kmp_wait_to_unref_task_teams();
3540  }
3541 
3542  #if KMP_OS_WINDOWS
3543  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3544  KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3545  (LPVOID)&(root->r.r_uber_thread->th),
3546  root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3547  __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3548  #endif /* KMP_OS_WINDOWS */
3549 
3550  TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3551  __kmp_reap_thread( root->r.r_uber_thread, 1 );
3552 
3553  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3554  root->r.r_uber_thread = NULL;
3555  /* mark root as no longer in use */
3556  root->r.r_begin = FALSE;
3557 
3558  return n;
3559 }
3560 
3561 void
3562 __kmp_unregister_root_current_thread( int gtid )
3563 {
3564  KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
3565  /* this lock should be ok, since unregister_root_current_thread is never called during
3566  * an abort, only during a normal close. furthermore, if you have the
3567  * forkjoin lock, you should never try to get the initz lock */
3568 
3569  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3570  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3571  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3572  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3573  return;
3574  }
3575  kmp_root_t *root = __kmp_root[gtid];
3576 
3577  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3578  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3579  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3580  KMP_ASSERT( root->r.r_active == FALSE );
3581 
3582 
3583  KMP_MB();
3584 
3585  __kmp_reset_root(gtid, root);
3586 
3587  /* free up this thread slot */
3588  __kmp_gtid_set_specific( KMP_GTID_DNE );
3589 #ifdef KMP_TDATA_GTID
3590  __kmp_gtid = KMP_GTID_DNE;
3591 #endif
3592 
3593  KMP_MB();
3594  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3595 
3596  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3597 }
3598 
3599 /* __kmp_forkjoin_lock must be already held
3600  Unregisters a root thread that is not the current thread. Returns the number of
3601  __kmp_threads entries freed as a result.
3602  */
3603 static int
3604 __kmp_unregister_root_other_thread( int gtid )
3605 {
3606  kmp_root_t *root = __kmp_root[gtid];
3607  int r;
3608 
3609  KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
3610  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3611  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3612  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3613  KMP_ASSERT( root->r.r_active == FALSE );
3614 
3615  r = __kmp_reset_root(gtid, root);
3616  KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
3617  return r;
3618 }
3619 
3620 #if KMP_DEBUG
3621 void __kmp_task_info() {
3622 
3623  kmp_int32 gtid = __kmp_entry_gtid();
3624  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
3625  kmp_info_t *this_thr = __kmp_threads[ gtid ];
3626  kmp_team_t *steam = this_thr->th.th_serial_team;
3627  kmp_team_t *team = this_thr->th.th_team;
3628 
3629  __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
3630  gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
3631 }
3632 #endif // KMP_DEBUG
3633 
3634 /* TODO optimize with one big memclr, take out what isn't needed,
3635  * split responsibility among workers as much as possible, and delay
3636  * initialization of features as much as possible */
3637 static void
3638 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
3639 {
3640  /* this_thr->th.th_info.ds.ds_gtid is set up in kmp_allocate_thread/create_worker
3641  * this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
3642  kmp_info_t *master = team->t.t_threads[0];
3643  KMP_DEBUG_ASSERT( this_thr != NULL );
3644  KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
3645  KMP_DEBUG_ASSERT( team );
3646  KMP_DEBUG_ASSERT( team->t.t_threads );
3647  KMP_DEBUG_ASSERT( team->t.t_dispatch );
3648  KMP_DEBUG_ASSERT( master );
3649  KMP_DEBUG_ASSERT( master->th.th_root );
3650 
3651  KMP_MB();
3652 
3653  TCW_SYNC_PTR(this_thr->th.th_team, team);
3654 
3655  this_thr->th.th_info.ds.ds_tid = tid;
3656  this_thr->th.th_set_nproc = 0;
3657 #if OMP_40_ENABLED
3658  this_thr->th.th_set_proc_bind = proc_bind_default;
3659 # if KMP_AFFINITY_SUPPORTED
3660  this_thr->th.th_new_place = this_thr->th.th_current_place;
3661 # endif
3662 #endif
3663  this_thr->th.th_root = master->th.th_root;
3664 
3665  /* setup the thread's cache of the team structure */
3666  this_thr->th.th_team_nproc = team->t.t_nproc;
3667  this_thr->th.th_team_master = master;
3668  this_thr->th.th_team_serialized = team->t.t_serialized;
3669  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
3670 
3671  KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
3672  this_thr->th.th_task_state = 0;
3673 
3674  KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
3675  tid, gtid, this_thr, this_thr->th.th_current_task ) );
3676 
3677  __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
3678 
3679  KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
3680  tid, gtid, this_thr, this_thr->th.th_current_task ) );
3681  // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
3682 
3683  /* TODO no worksharing in speculative threads */
3684  this_thr->th.th_dispatch = &team->t.t_dispatch[ tid ];
3685 
3686  this_thr->th.th_local.this_construct = 0;
3687 
3688 #ifdef BUILD_TV
3689  this_thr->th.th_local.tv_data = 0;
3690 #endif
3691 
3692  if ( ! this_thr->th.th_pri_common ) {
3693  this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
3694  if ( __kmp_storage_map ) {
3695  __kmp_print_storage_map_gtid(
3696  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
3697  sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
3698  );
3699  }; // if
3700  this_thr->th.th_pri_head = NULL;
3701  }; // if
3702 
3703  /* Initialize dynamic dispatch */
3704  {
3705  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
3706  /*
3707  * Use team max_nproc since this will never change for the team.
3708  */
3709  size_t disp_size = sizeof( dispatch_private_info_t ) *
3710  ( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );
3711  KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
3712  KMP_ASSERT( dispatch );
3713  KMP_DEBUG_ASSERT( team->t.t_dispatch );
3714  KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
3715 
3716  dispatch->th_disp_index = 0;
3717 
3718  if( ! dispatch->th_disp_buffer ) {
3719  dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
3720 
3721  if ( __kmp_storage_map ) {
3722  __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
3723  &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ],
3724  disp_size, "th_%d.th_dispatch.th_disp_buffer "
3725  "(team_%d.t_dispatch[%d].th_disp_buffer)",
3726  gtid, team->t.t_id, gtid );
3727  }
3728  } else {
3729  memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
3730  }
3731 
3732  dispatch->th_dispatch_pr_current = 0;
3733  dispatch->th_dispatch_sh_current = 0;
3734 
3735  dispatch->th_deo_fcn = 0; /* ORDERED */
3736  dispatch->th_dxo_fcn = 0; /* END ORDERED */
3737  }
3738 
3739  this_thr->th.th_next_pool = NULL;
3740 
3741  KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
3742  KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
3743 
3744  KMP_MB();
3745 }
3746 
3747 
3748 /* allocate a new thread for the requesting team. this is only called from within a
3749  * forkjoin critical section. we will first try to get an available thread from the
3750  * thread pool. if none is available, we will fork a new one, assuming we are able
3751  * to create one. this should be assured, as the caller should have checked on this
3752  * first.
3753  */
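/* A minimal sketch of the pool-reuse pattern used below, for illustration only
   (not part of the runtime); it assumes a simplified, hypothetical node type,
   whereas the real pool links kmp_info_t objects through th.th_next_pool and is
   only manipulated inside the forkjoin critical section. */
typedef struct pool_node {
    struct pool_node *next;
} pool_node_t;

static pool_node_t *
pool_pop( pool_node_t **head )
{
    pool_node_t *n = *head;   // take the first free entry, if any
    if ( n != NULL ) {
        *head = n->next;      // unlink it from the LIFO free list
        n->next = NULL;
    }
    return n;                 // NULL tells the caller to create a fresh thread
}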
3754 kmp_info_t *
3755 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
3756 {
3757  kmp_team_t *serial_team;
3758  kmp_info_t *new_thr;
3759  int new_gtid;
3760 
3761  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
3762  KMP_DEBUG_ASSERT( root && team );
3763 #if !KMP_NESTED_HOT_TEAMS
3764  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
3765 #endif
3766  KMP_MB();
3767 
3768  /* first, try to get one from the thread pool */
3769  if ( __kmp_thread_pool ) {
3770 
3771  new_thr = (kmp_info_t*)__kmp_thread_pool;
3772  __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
3773  if ( new_thr == __kmp_thread_pool_insert_pt ) {
3774  __kmp_thread_pool_insert_pt = NULL;
3775  }
3776  TCW_4(new_thr->th.th_in_pool, FALSE);
3777  //
3778  // Don't touch th_active_in_pool or th_active.
3779  // The worker thread adjusts those flags as it sleeps/awakens.
3780  //
3781 
3782  __kmp_thread_pool_nth--;
3783 
3784  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
3785  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
3786  KMP_ASSERT( ! new_thr->th.th_team );
3787  KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
3788  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
3789 
3790  /* setup the thread structure */
3791  __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
3792  KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
3793 
3794  TCW_4(__kmp_nth, __kmp_nth + 1);
3795 
3796 #ifdef KMP_ADJUST_BLOCKTIME
3797  /* Adjust blocktime back to zero if necessary */
3798  /* Middle initialization might not have occurred yet */
3799  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3800  if ( __kmp_nth > __kmp_avail_proc ) {
3801  __kmp_zero_bt = TRUE;
3802  }
3803  }
3804 #endif /* KMP_ADJUST_BLOCKTIME */
3805 
3806 #if KMP_DEBUG
3807  // If the thread entered the pool via __kmp_free_thread, wait_flag should not be KMP_BARRIER_PARENT_FLAG.
3808  int b;
3809  kmp_balign_t * balign = new_thr->th.th_bar;
3810  for( b = 0; b < bs_last_barrier; ++ b )
3811  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
3812 #endif
3813 
3814  KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
3815  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
3816 
3817  KMP_MB();
3818  return new_thr;
3819  }
3820 
3821 
3822  /* no, we'll fork a new one */
3823  KMP_ASSERT( __kmp_nth == __kmp_all_nth );
3824  KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
3825 
3826  //
3827  // If this is the first worker thread the RTL is creating, then also
3828  // launch the monitor thread. We try to do this as early as possible.
3829  //
3830  if ( ! TCR_4( __kmp_init_monitor ) ) {
3831  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
3832  if ( ! TCR_4( __kmp_init_monitor ) ) {
3833  KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
3834  TCW_4( __kmp_init_monitor, 1 );
3835  __kmp_create_monitor( & __kmp_monitor );
3836  KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
3837  #if KMP_OS_WINDOWS
3838  // AC: wait until monitor has started. This is a fix for CQ232808.
3839  // The reason is that if the library is loaded/unloaded in a loop with small (parallel)
3840  // work in between, then there is a high probability that the monitor thread will start after
3841  // the library shutdown. At shutdown it is too late to cope with the problem, because
3842  // when the master is in DllMain (process detach) the monitor has no chance to start
3843  // (it is blocked), and the master has no means to inform the monitor that the library has gone,
3844  // because all the memory which the monitor can access is going to be released/reset.
3845  while ( TCR_4(__kmp_init_monitor) < 2 ) {
3846  KMP_YIELD( TRUE );
3847  }
3848  KF_TRACE( 10, ( "after monitor thread has started\n" ) );
3849  #endif
3850  }
3851  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
3852  }
3853 
3854  KMP_MB();
3855  for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
3856  KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
3857  }
3858 
3859  /* allocate space for it. */
3860  new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3861 
3862  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
3863 
3864  if ( __kmp_storage_map ) {
3865  __kmp_print_thread_storage_map( new_thr, new_gtid );
3866  }
3867 
3868  /* add the reserve serialized team, initialized from the team's master thread */
3869  {
3870  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
3871  KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
3872  new_thr->th.th_serial_team = serial_team =
3873  (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
3874 #if OMP_40_ENABLED
3875  proc_bind_default,
3876 #endif
3877  &r_icvs,
3878  0 USE_NESTED_HOT_ARG(NULL) );
3879  }
3880  KMP_ASSERT ( serial_team );
3881  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for execution (it is unused for now).
3882  serial_team->t.t_threads[0] = new_thr;
3883  KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
3884  new_thr ) );
3885 
3886  /* setup the thread structures */
3887  __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
3888 
3889  #if USE_FAST_MEMORY
3890  __kmp_initialize_fast_memory( new_thr );
3891  #endif /* USE_FAST_MEMORY */
3892 
3893  #if KMP_USE_BGET
3894  KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
3895  __kmp_initialize_bget( new_thr );
3896  #endif
3897 
3898  __kmp_init_random( new_thr ); // Initialize random number generator
3899 
3900  /* Initialize these only once when thread is grabbed for a team allocation */
3901  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
3902  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3903 
3904  int b;
3905  kmp_balign_t * balign = new_thr->th.th_bar;
3906  for(b=0; b<bs_last_barrier; ++b) {
3907  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
3908  balign[b].bb.team = NULL;
3909  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
3910  balign[b].bb.use_oncore_barrier = 0;
3911  }
3912 
3913  new_thr->th.th_spin_here = FALSE;
3914  new_thr->th.th_next_waiting = 0;
3915 
3916 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
3917  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
3918  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
3919  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
3920  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
3921 #endif
3922 
3923  TCW_4(new_thr->th.th_in_pool, FALSE);
3924  new_thr->th.th_active_in_pool = FALSE;
3925  TCW_4(new_thr->th.th_active, TRUE);
3926 
3927  /* adjust the global counters */
3928  __kmp_all_nth ++;
3929  __kmp_nth ++;
3930 
3931  //
3932  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3933  // for low numbers of procs, and method #2 (keyed API call) for higher
3934  // numbers of procs.
3935  //
3936  if ( __kmp_adjust_gtid_mode ) {
3937  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3938  if ( TCR_4(__kmp_gtid_mode) != 2) {
3939  TCW_4(__kmp_gtid_mode, 2);
3940  }
3941  }
3942  else {
3943  if (TCR_4(__kmp_gtid_mode) != 1 ) {
3944  TCW_4(__kmp_gtid_mode, 1);
3945  }
3946  }
3947  }
3948 
3949 #ifdef KMP_ADJUST_BLOCKTIME
3950  /* Adjust blocktime back to zero if necessary */
3951  /* Middle initialization might not have occurred yet */
3952  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3953  if ( __kmp_nth > __kmp_avail_proc ) {
3954  __kmp_zero_bt = TRUE;
3955  }
3956  }
3957 #endif /* KMP_ADJUST_BLOCKTIME */
3958 
3959  /* actually fork it and create the new worker thread */
3960  KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
3961  __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
3962  KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
3963 
3964 
3965  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
3966  KMP_MB();
3967  return new_thr;
3968 }
3969 
3970 /*
3971  * reinitialize team for reuse.
3972  *
3973  * The hot team code calls this routine at every fork barrier, so the EPCC barrier
3974  * tests are extremely sensitive to changes in it, esp. writes to the team
3975  * struct, which cause a cache invalidation in all threads.
3976  *
3977  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
3978  */
3979 static void
3980 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
3981  KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
3982  team->t.t_threads[0], team ) );
3983  KMP_DEBUG_ASSERT( team && new_icvs);
3984  KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
3985  team->t.t_ident = loc;
3986 
3987  team->t.t_id = KMP_GEN_TEAM_ID();
3988 
3989  // Copy ICVs to the master thread's implicit taskdata
3990  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
3991  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
3992 
3993  KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
3994  team->t.t_threads[0], team ) );
3995 }
3996 
3997 
3998 /* initialize the team data structure
3999  * this assumes the t_threads and t_max_nproc are already set
4000  * also, we don't touch the arguments */
4001 static void
4002 __kmp_initialize_team(
4003  kmp_team_t * team,
4004  int new_nproc,
4005  kmp_internal_control_t * new_icvs,
4006  ident_t * loc
4007 ) {
4008  KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4009 
4010  /* verify */
4011  KMP_DEBUG_ASSERT( team );
4012  KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4013  KMP_DEBUG_ASSERT( team->t.t_threads );
4014  KMP_MB();
4015 
4016  team->t.t_master_tid = 0; /* not needed */
4017  /* team->t.t_master_bar; not needed */
4018  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4019  team->t.t_nproc = new_nproc;
4020 
4021  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4022  team->t.t_next_pool = NULL;
4023  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4024 
4025  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4026  team->t.t_invoke = NULL; /* not needed */
4027 
4028  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4029  team->t.t_sched = new_icvs->sched;
4030 
4031 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4032  team->t.t_fp_control_saved = FALSE; /* not needed */
4033  team->t.t_x87_fpu_control_word = 0; /* not needed */
4034  team->t.t_mxcsr = 0; /* not needed */
4035 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4036 
4037  team->t.t_construct = 0;
4038  __kmp_init_lock( & team->t.t_single_lock );
4039 
4040  team->t.t_ordered .dt.t_value = 0;
4041  team->t.t_master_active = FALSE;
4042 
4043  memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4044 
4045 #ifdef KMP_DEBUG
4046  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4047 #endif
4048  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4049 
4050  team->t.t_control_stack_top = NULL;
4051 
4052  __kmp_reinitialize_team( team, new_icvs, loc );
4053 
4054  KMP_MB();
4055  KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4056 }
4057 
4058 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4059 /* Sets full mask for thread and returns old mask, no changes to structures. */
4060 static void
4061 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4062 {
4063  if ( KMP_AFFINITY_CAPABLE() ) {
4064  int status;
4065  if ( old_mask != NULL ) {
4066  status = __kmp_get_system_affinity( old_mask, TRUE );
4067  int error = errno;
4068  if ( status != 0 ) {
4069  __kmp_msg(
4070  kmp_ms_fatal,
4071  KMP_MSG( ChangeThreadAffMaskError ),
4072  KMP_ERR( error ),
4073  __kmp_msg_null
4074  );
4075  }
4076  }
4077  __kmp_set_system_affinity( __kmp_affinity_get_fullMask(), TRUE );
4078  }
4079 }
4080 #endif
4081 
4082 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4083 
4084 //
4085 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4086 // It calculates the worker + master thread's partition based upon the parent
4087 // thread's partition, and binds each worker to a place in its partition.
4088 // The master thread's partition should already include its current binding.
4089 //
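//
// Illustrative example (the numbers are hypothetical): with a master partition of
// places [2,5] (4 places), masters_place == 3, and proc_bind_close requesting 4
// threads, the code below leaves the master on place 3 and assigns the workers
// places 4, 5 and 2, wrapping around inside the partition. With proc_bind_spread
// the partition itself is divided, so each thread also receives its own
// sub-partition [th_first_place, th_last_place].
//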
4090 static void
4091 __kmp_partition_places( kmp_team_t *team )
4092 {
4093  //
4094  // Copy the master thread's place partition to the team struct
4095  //
4096  kmp_info_t *master_th = team->t.t_threads[0];
4097  KMP_DEBUG_ASSERT( master_th != NULL );
4098  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4099  int first_place = master_th->th.th_first_place;
4100  int last_place = master_th->th.th_last_place;
4101  int masters_place = master_th->th.th_current_place;
4102  team->t.t_first_place = first_place;
4103  team->t.t_last_place = last_place;
4104 
4105  KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4106  proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4107  masters_place, first_place, last_place ) );
4108 
4109  switch ( proc_bind ) {
4110 
4111  case proc_bind_default:
4112  //
4113  // serial teams might have the proc_bind policy set to
4114  // proc_bind_default. It doesn't matter, as we don't
4115  // rebind the master thread for any proc_bind policy.
4116  //
4117  KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4118  break;
4119 
4120  case proc_bind_master:
4121  {
4122  int f;
4123  int n_th = team->t.t_nproc;
4124  for ( f = 1; f < n_th; f++ ) {
4125  kmp_info_t *th = team->t.t_threads[f];
4126  KMP_DEBUG_ASSERT( th != NULL );
4127  th->th.th_first_place = first_place;
4128  th->th.th_last_place = last_place;
4129  th->th.th_new_place = masters_place;
4130 
4131  KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4132  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4133  team->t.t_id, f, masters_place, first_place, last_place ) );
4134  }
4135  }
4136  break;
4137 
4138  case proc_bind_close:
4139  {
4140  int f;
4141  int n_th = team->t.t_nproc;
4142  int n_places;
4143  if ( first_place <= last_place ) {
4144  n_places = last_place - first_place + 1;
4145  }
4146  else {
4147  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4148  }
4149  if ( n_th <= n_places ) {
4150  int place = masters_place;
4151  for ( f = 1; f < n_th; f++ ) {
4152  kmp_info_t *th = team->t.t_threads[f];
4153  KMP_DEBUG_ASSERT( th != NULL );
4154 
4155  if ( place == last_place ) {
4156  place = first_place;
4157  }
4158  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4159  place = 0;
4160  }
4161  else {
4162  place++;
4163  }
4164  th->th.th_first_place = first_place;
4165  th->th.th_last_place = last_place;
4166  th->th.th_new_place = place;
4167 
4168  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4169  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4170  team->t.t_id, f, place, first_place, last_place ) );
4171  }
4172  }
4173  else {
4174  int S, rem, gap, s_count;
4175  S = n_th / n_places;
4176  s_count = 0;
4177  rem = n_th - ( S * n_places );
4178  gap = rem > 0 ? n_places/rem : n_places;
4179  int place = masters_place;
4180  int gap_ct = gap;
4181  for ( f = 0; f < n_th; f++ ) {
4182  kmp_info_t *th = team->t.t_threads[f];
4183  KMP_DEBUG_ASSERT( th != NULL );
4184 
4185  th->th.th_first_place = first_place;
4186  th->th.th_last_place = last_place;
4187  th->th.th_new_place = place;
4188  s_count++;
4189 
4190  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4191  // do nothing, add an extra thread to place on next iteration
4192  }
4193  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4194  // we added an extra thread to this place; move to next place
4195  if ( place == last_place ) {
4196  place = first_place;
4197  }
4198  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4199  place = 0;
4200  }
4201  else {
4202  place++;
4203  }
4204  s_count = 0;
4205  gap_ct = 1;
4206  rem--;
4207  }
4208  else if (s_count == S) { // place full; don't add extra
4209  if ( place == last_place ) {
4210  place = first_place;
4211  }
4212  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4213  place = 0;
4214  }
4215  else {
4216  place++;
4217  }
4218  gap_ct++;
4219  s_count = 0;
4220  }
4221 
4222  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4223  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4224  team->t.t_id, f, th->th.th_new_place, first_place,
4225  last_place ) );
4226  }
4227  KMP_DEBUG_ASSERT( place == masters_place );
4228  }
4229  }
4230  break;
4231 
4232  case proc_bind_spread:
4233  {
4234  int f;
4235  int n_th = team->t.t_nproc;
4236  int n_places;
4237  if ( first_place <= last_place ) {
4238  n_places = last_place - first_place + 1;
4239  }
4240  else {
4241  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4242  }
4243  if ( n_th <= n_places ) {
4244  int place = masters_place;
4245  int S = n_places/n_th;
4246  int s_count, rem, gap, gap_ct;
4247  rem = n_places - n_th*S;
4248  gap = rem ? n_th/rem : 1;
4249  gap_ct = gap;
4250  for ( f = 0; f < n_th; f++ ) {
4251  kmp_info_t *th = team->t.t_threads[f];
4252  KMP_DEBUG_ASSERT( th != NULL );
4253 
4254  th->th.th_first_place = place;
4255  th->th.th_new_place = place;
4256  s_count = 1;
4257  while (s_count < S) {
4258  if ( place == last_place ) {
4259  place = first_place;
4260  }
4261  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4262  place = 0;
4263  }
4264  else {
4265  place++;
4266  }
4267  s_count++;
4268  }
4269  if (rem && (gap_ct == gap)) {
4270  if ( place == last_place ) {
4271  place = first_place;
4272  }
4273  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4274  place = 0;
4275  }
4276  else {
4277  place++;
4278  }
4279  rem--;
4280  gap_ct = 0;
4281  }
4282  th->th.th_last_place = place;
4283  gap_ct++;
4284 
4285  if ( place == last_place ) {
4286  place = first_place;
4287  }
4288  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4289  place = 0;
4290  }
4291  else {
4292  place++;
4293  }
4294 
4295  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4296  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4297  team->t.t_id, f, th->th.th_new_place,
4298  th->th.th_first_place, th->th.th_last_place ) );
4299  }
4300  KMP_DEBUG_ASSERT( place == masters_place );
4301  }
4302  else {
4303  int S, rem, gap, s_count;
4304  S = n_th / n_places;
4305  s_count = 0;
4306  rem = n_th - ( S * n_places );
4307  gap = rem > 0 ? n_places/rem : n_places;
4308  int place = masters_place;
4309  int gap_ct = gap;
4310  for ( f = 0; f < n_th; f++ ) {
4311  kmp_info_t *th = team->t.t_threads[f];
4312  KMP_DEBUG_ASSERT( th != NULL );
4313 
4314  th->th.th_first_place = place;
4315  th->th.th_last_place = place;
4316  th->th.th_new_place = place;
4317  s_count++;
4318 
4319  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4320  // do nothing, add an extra thread to place on next iteration
4321  }
4322  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4323  // we added an extra thread to this place; move on to next place
4324  if ( place == last_place ) {
4325  place = first_place;
4326  }
4327  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4328  place = 0;
4329  }
4330  else {
4331  place++;
4332  }
4333  s_count = 0;
4334  gap_ct = 1;
4335  rem--;
4336  }
4337  else if (s_count == S) { // place is full; don't add extra thread
4338  if ( place == last_place ) {
4339  place = first_place;
4340  }
4341  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4342  place = 0;
4343  }
4344  else {
4345  place++;
4346  }
4347  gap_ct++;
4348  s_count = 0;
4349  }
4350 
4351  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4352  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4353  team->t.t_id, f, th->th.th_new_place,
4354  th->th.th_first_place, th->th.th_last_place) );
4355  }
4356  KMP_DEBUG_ASSERT( place == masters_place );
4357  }
4358  }
4359  break;
4360 
4361  default:
4362  break;
4363  }
4364 
4365  KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4366 }
4367 
4368 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4369 
4370 /* allocate a new team data structure to use. take one off of the free pool if available */
4371 kmp_team_t *
4372 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4373 #if OMP_40_ENABLED
4374  kmp_proc_bind_t new_proc_bind,
4375 #endif
4376  kmp_internal_control_t *new_icvs,
4377  int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4378 {
4379  KMP_TIME_BLOCK(KMP_allocate_team);
4380  int f;
4381  kmp_team_t *team;
4382  char *ptr;
4383  size_t size;
4384  int use_hot_team = ! root->r.r_active;
4385 
4386  KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4387  KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4388  KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4389  KMP_MB();
4390 
4391 #if KMP_NESTED_HOT_TEAMS
4392  int level;
4393  kmp_hot_team_ptr_t *hot_teams;
4394  if( master ) {
4395  team = master->th.th_team;
4396  level = team->t.t_active_level;
4397  if( master->th.th_teams_microtask ) { // in teams construct?
4398  if( master->th.th_teams_size.nteams > 1 && ( // #teams > 1
4399  team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4400  master->th.th_teams_level < team->t.t_level ) ) { // or nested parallel inside the teams
4401  ++level; // do not increment if #teams==1 or for the outer fork of the teams; increment otherwise
4402  }
4403  }
4404  hot_teams = master->th.th_hot_teams;
4405  if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4406  { // hot team has already been allocated for given level
4407  use_hot_team = 1;
4408  } else {
4409  use_hot_team = 0;
4410  }
4411  }
4412 #endif
4413  // Optimization to use a "hot" team
4414  if( use_hot_team && new_nproc > 1 ) {
4415  KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4416 #if KMP_NESTED_HOT_TEAMS
4417  team = hot_teams[level].hot_team;
4418 #else
4419  team = root->r.r_hot_team;
4420 #endif
4421 #if KMP_DEBUG
4422  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4423  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team = %p before reinit\n",
4424  team->t.t_task_team ));
4425  }
4426 #endif
4427 
4428  // Has the number of threads changed?
4429  /* Let's assume the most common case is that the number of threads is unchanged, and
4430  put that case first. */
4431  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4432  KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4433 #if KMP_MIC
4434  // This case can mean that omp_set_num_threads() was called and the hot team size
4435  // was already reduced, so we check the special flag
4436  if ( team->t.t_size_changed == -1 ) {
4437  team->t.t_size_changed = 1;
4438  } else {
4439  team->t.t_size_changed = 0;
4440  }
4441 #endif
4442 
4443  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4444  team->t.t_sched = new_icvs->sched;
4445 
4446  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4447 
4448  KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4449  0, team->t.t_threads[0], team ) );
4450  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4451 
4452 #if OMP_40_ENABLED
4453 # if KMP_AFFINITY_SUPPORTED
4454  if ( team->t.t_proc_bind == new_proc_bind ) {
4455  KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4456  team->t.t_id, new_proc_bind, team->t.t_first_place,
4457  team->t.t_last_place ) );
4458  }
4459  else {
4460  team->t.t_proc_bind = new_proc_bind;
4461  __kmp_partition_places( team );
4462  }
4463 # else
4464  if ( team->t.t_proc_bind != new_proc_bind ) {
4465  team->t.t_proc_bind = new_proc_bind;
4466  }
4467 # endif /* KMP_AFFINITY_SUPPORTED */
4468 #endif /* OMP_40_ENABLED */
4469  }
4470  else if( team->t.t_nproc > new_nproc ) {
4471  KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4472 
4473 #if KMP_MIC
4474  team->t.t_size_changed = 1;
4475 #endif
4476  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4477  kmp_task_team_t *task_team = team->t.t_task_team;
4478  if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
4479  //
4480  // Signal the worker threads (esp. the extra ones) to stop
4481  // looking for tasks while spin waiting. The task teams
4482  // are reference counted and will be deallocated by the
4483  // last worker thread.
4484  //
4485  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
4486  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
4487  KMP_MB();
4488 
4489  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n",
4490  &team->t.t_task_team ) );
4491  team->t.t_task_team = NULL;
4492  }
4493  else {
4494  KMP_DEBUG_ASSERT( task_team == NULL );
4495  }
4496  }
4497 #if KMP_NESTED_HOT_TEAMS
4498  if( __kmp_hot_teams_mode == 0 ) {
4499  // AC: saved number of threads should correspond to team's value in this mode,
4500  // can be bigger in mode 1, when hot team has some threads in reserve
4501  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4502  hot_teams[level].hot_team_nth = new_nproc;
4503 #endif // KMP_NESTED_HOT_TEAMS
4504  /* release the extra threads we don't need any more */
4505  for( f = new_nproc ; f < team->t.t_nproc ; f++ ) {
4506  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4507  __kmp_free_thread( team->t.t_threads[ f ] );
4508  team->t.t_threads[ f ] = NULL;
4509  }
4510 #if KMP_NESTED_HOT_TEAMS
4511  } // (__kmp_hot_teams_mode == 0)
4512 #endif // KMP_NESTED_HOT_TEAMS
4513  team->t.t_nproc = new_nproc;
4514  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4515  team->t.t_sched = new_icvs->sched;
4516  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4517 
4518  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4519  kmp_task_team_t *task_team = team->t.t_task_team;
4520  if ( task_team != NULL ) {
4521  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
4522  task_team->tt.tt_nproc = new_nproc;
4523  task_team->tt.tt_unfinished_threads = new_nproc;
4524  task_team->tt.tt_ref_ct = new_nproc - 1;
4525  }
4526  }
4527 
4528  /* update the remaining threads */
4529  for(f = 0; f < new_nproc; ++f) {
4530  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4531  }
4532 
4533  // restore the current task state of the master thread: should be the implicit task
4534  KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4535  0, team->t.t_threads[0], team ) );
4536 
4537  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4538 
4539 #ifdef KMP_DEBUG
4540  for ( f = 0; f < team->t.t_nproc; f++ ) {
4541  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4542  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4543  }
4544 #endif
4545 
4546 #if OMP_40_ENABLED
4547  team->t.t_proc_bind = new_proc_bind;
4548 # if KMP_AFFINITY_SUPPORTED
4549  __kmp_partition_places( team );
4550 # endif
4551 #endif
4552  }
4553  else { // team->t.t_nproc < new_nproc
4554 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4555  kmp_affin_mask_t *old_mask;
4556  if ( KMP_AFFINITY_CAPABLE() ) {
4557  KMP_CPU_ALLOC(old_mask);
4558  }
4559 #endif
4560 
4561  KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4562 
4563 #if KMP_MIC
4564  team->t.t_size_changed = 1;
4565 #endif
4566 
4567 
4568 #if KMP_NESTED_HOT_TEAMS
4569  int avail_threads = hot_teams[level].hot_team_nth;
4570  if( new_nproc < avail_threads )
4571  avail_threads = new_nproc;
4572  kmp_info_t **other_threads = team->t.t_threads;
4573  for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4574  // Adjust barrier data of reserved threads (if any) of the team
4575  // Other data will be set in __kmp_initialize_info() below.
4576  int b;
4577  kmp_balign_t * balign = other_threads[f]->th.th_bar;
4578  for ( b = 0; b < bs_last_barrier; ++ b ) {
4579  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4580  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4581  }
4582  }
4583  if( hot_teams[level].hot_team_nth >= new_nproc ) {
4584  // we have all needed threads in reserve, no need to allocate any
4585  // this is only possible in mode 1; there cannot be reserved threads in mode 0
4586  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4587  team->t.t_nproc = new_nproc; // just get reserved threads involved
4588  } else {
4589  // we may have some threads in reserve, but not enough
4590  team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
4591  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
4592 #endif // KMP_NESTED_HOT_TEAMS
4593  if(team->t.t_max_nproc < new_nproc) {
4594  /* reallocate larger arrays */
4595  __kmp_reallocate_team_arrays(team, new_nproc);
4596  __kmp_reinitialize_team( team, new_icvs, NULL );
4597  }
4598 
4599 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4600  /* Temporarily set full mask for master thread before
4601  creation of workers. The reason is that workers inherit
4602  the affinity from master, so if a lot of workers are
4603  created quickly on a single core, they don't get
4604  a chance to set their own affinity for a long time.
4605  */
4606  __kmp_set_thread_affinity_mask_full_tmp( old_mask );
4607 #endif
4608 
4609  /* allocate new threads for the hot team */
4610  for( f = team->t.t_nproc ; f < new_nproc ; f++ ) {
4611  kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
4612  KMP_DEBUG_ASSERT( new_worker );
4613  team->t.t_threads[ f ] = new_worker;
4614  new_worker->th.th_team_nproc = team->t.t_nproc;
4615 
4616  KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%u, plain=%u\n",
4617  team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
4618  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4619  team->t.t_bar[bs_plain_barrier].b_arrived ) );
4620 
4621  { // Initialize barrier data for new threads.
4622  int b;
4623  kmp_balign_t * balign = new_worker->th.th_bar;
4624  for( b = 0; b < bs_last_barrier; ++ b ) {
4625  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
4626  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4627  }
4628  }
4629  }
4630 
4631 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4632  if ( KMP_AFFINITY_CAPABLE() ) {
4633  /* Restore initial master thread's affinity mask */
4634  __kmp_set_system_affinity( old_mask, TRUE );
4635  KMP_CPU_FREE(old_mask);
4636  }
4637 #endif
4638 #if KMP_NESTED_HOT_TEAMS
4639  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
4640 #endif // KMP_NESTED_HOT_TEAMS
4641  /* make sure everyone is synchronized */
4642  __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
4643 
4644  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4645  kmp_task_team_t *task_team = team->t.t_task_team;
4646  if ( task_team != NULL ) {
4647  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
4648  task_team->tt.tt_nproc = new_nproc;
4649  task_team->tt.tt_unfinished_threads = new_nproc;
4650  task_team->tt.tt_ref_ct = new_nproc - 1;
4651  }
4652  }
4653 
4654  /* reinitialize the old threads */
4655  for( f = 0 ; f < team->t.t_nproc ; f++ )
4656  __kmp_initialize_info( team->t.t_threads[ f ], team, f,
4657  __kmp_gtid_from_tid( f, team ) );
4658 #ifdef KMP_DEBUG
4659  for ( f = 0; f < team->t.t_nproc; ++ f ) {
4660  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4661  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4662  }
4663 #endif
4664 
4665 #if OMP_40_ENABLED
4666  team->t.t_proc_bind = new_proc_bind;
4667 # if KMP_AFFINITY_SUPPORTED
4668  __kmp_partition_places( team );
4669 # endif
4670 #endif
4671  } // Check changes in number of threads
4672 
4673 #if OMP_40_ENABLED
4674  kmp_info_t *master = team->t.t_threads[0];
4675  if( master->th.th_teams_microtask ) {
4676  for( f = 1; f < new_nproc; ++f ) {
4677  // propagate teams construct specific info to workers
4678  kmp_info_t *thr = team->t.t_threads[f];
4679  thr->th.th_teams_microtask = master->th.th_teams_microtask;
4680  thr->th.th_teams_level = master->th.th_teams_level;
4681  thr->th.th_teams_size = master->th.th_teams_size;
4682  }
4683  }
4684 #endif /* OMP_40_ENABLED */
4685 #if KMP_NESTED_HOT_TEAMS
4686  if( level ) {
4687  // Sync task (TODO: and barrier?) state for nested hot teams, not needed for outermost hot team.
4688  for( f = 1; f < new_nproc; ++f ) {
4689  kmp_info_t *thr = team->t.t_threads[f];
4690  thr->th.th_task_state = master->th.th_task_state;
4691  int b;
4692  kmp_balign_t * balign = thr->th.th_bar;
4693  for( b = 0; b < bs_last_barrier; ++ b ) {
4694  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
4695  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4696  }
4697  }
4698  }
4699 #endif // KMP_NESTED_HOT_TEAMS
4700 
4701  /* reallocate space for arguments if necessary */
4702  __kmp_alloc_argv_entries( argc, team, TRUE );
4703  team->t.t_argc = argc;
4704  //
4705  // The hot team re-uses the previous task team,
4706  // if untouched during the previous release->gather phase.
4707  //
4708 
4709  KF_TRACE( 10, ( " hot_team = %p\n", team ) );
4710 
4711 #if KMP_DEBUG
4712  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4713  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team = %p after reinit\n",
4714  team->t.t_task_team ));
4715  }
4716 #endif
4717 
4718  KMP_MB();
4719 
4720  return team;
4721  }
4722 
4723  /* next, let's try to take one from the team pool */
4724  KMP_MB();
4725  for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
4726  {
4727  /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
4728  if ( team->t.t_max_nproc >= max_nproc ) {
4729  /* take this team from the team pool */
4730  __kmp_team_pool = team->t.t_next_pool;
4731 
4732  /* setup the team for fresh use */
4733  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
4734 
4735  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n",
4736  &team->t.t_task_team ) );
4737  team->t.t_task_team = NULL;
4738 
4739  /* reallocate space for arguments if necessary */
4740  __kmp_alloc_argv_entries( argc, team, TRUE );
4741  team->t.t_argc = argc;
4742 
4743  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
4744  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4745  { // Initialize barrier data.
4746  int b;
4747  for ( b = 0; b < bs_last_barrier; ++ b) {
4748  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
4749  }
4750  }
4751 
4752 #if OMP_40_ENABLED
4753  team->t.t_proc_bind = new_proc_bind;
4754 #endif
4755 
4756  KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
4757  KMP_MB();
4758 
4759  return team;
4760  }
4761 
4762  /* reap team if it is too small, then loop back and check the next one */
4763  /* not sure if this is wise, but, will be redone during the hot-teams rewrite. */
4764  /* TODO: Use technique to find the right size hot-team, don't reap them */
4765  team = __kmp_reap_team( team );
4766  __kmp_team_pool = team;
4767  }
4768 
4769  /* nothing available in the pool, no matter, make a new team! */
4770  KMP_MB();
4771  team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
4772 
4773  /* and set it up */
4774  team->t.t_max_nproc = max_nproc;
4775  /* NOTE well, for some reason allocating one big buffer and dividing it
4776  * up seems to really hurt performance a lot on the P4, so, let's not use
4777  * this... */
4778  __kmp_allocate_team_arrays( team, max_nproc );
4779 
4780  KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
4781  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
4782 
4783  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n",
4784  &team->t.t_task_team ) );
4785  team->t.t_task_team = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
4786 
4787  if ( __kmp_storage_map ) {
4788  __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
4789  }
4790 
4791  /* allocate space for arguments */
4792  __kmp_alloc_argv_entries( argc, team, FALSE );
4793  team->t.t_argc = argc;
4794 
4795  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
4796  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4797  { // Initialize barrier data.
4798  int b;
4799  for ( b = 0; b < bs_last_barrier; ++ b ) {
4800  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
4801  }
4802  }
4803 
4804 #if OMP_40_ENABLED
4805  team->t.t_proc_bind = new_proc_bind;
4806 #endif
4807 
4808  KMP_MB();
4809 
4810  KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
4811 
4812  return team;
4813 }
4814 
4815 /* TODO implement hot-teams at all levels */
4816 /* TODO implement lazy thread release on demand (disband request) */
4817 
4818 /* free the team. return it to the team pool. release all the threads
4819  * associated with it */
4820 void
4821 __kmp_free_team( kmp_root_t *root, kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master) )
4822 {
4823  int f;
4824  KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
4825 
4826  /* verify state */
4827  KMP_DEBUG_ASSERT( root );
4828  KMP_DEBUG_ASSERT( team );
4829  KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
4830  KMP_DEBUG_ASSERT( team->t.t_threads );
4831 
4832  int use_hot_team = team == root->r.r_hot_team;
4833 #if KMP_NESTED_HOT_TEAMS
4834  int level;
4835  kmp_hot_team_ptr_t *hot_teams;
4836  if( master ) {
4837  level = team->t.t_active_level - 1;
4838  if( master->th.th_teams_microtask ) { // in teams construct?
4839  if( master->th.th_teams_size.nteams > 1 ) {
4840  ++level; // level was not increased in teams construct for team_of_masters
4841  }
4842  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
4843  master->th.th_teams_level == team->t.t_level ) {
4844  ++level; // level was not increased in teams construct for team_of_workers before the parallel
4845  } // team->t.t_level will be increased inside parallel
4846  }
4847  hot_teams = master->th.th_hot_teams;
4848  if( level < __kmp_hot_teams_max_level ) {
4849  KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
4850  use_hot_team = 1;
4851  }
4852  }
4853 #endif // KMP_NESTED_HOT_TEAMS
4854 
4855  /* team is done working */
4856  TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
4857  team->t.t_copyin_counter = 0; // init counter for possible reuse
4858  // Do not reset pointer to parent team to NULL for hot teams.
4859 
4860  /* if we are non-hot team, release our threads */
4861  if( ! use_hot_team ) {
4862 
4863  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4864  kmp_task_team_t *task_team = team->t.t_task_team;
4865  if ( task_team != NULL ) {
4866  //
4867  // Signal the worker threads to stop looking for tasks while
4868  // spin waiting. The task teams are reference counted and will
4869  // be deallocated by the last worker thread via the thread's
4870  // pointer to the task team.
4871  //
4872  KA_TRACE( 20, ( "__kmp_free_team: deactivating task_team %p\n",
4873  task_team ) );
4874  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
4875  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
4876  KMP_MB();
4877  team->t.t_task_team = NULL;
4878  }
4879  }
4880 
4881  // Reset pointer to parent team only for non-hot teams.
4882  team->t.t_parent = NULL;
4883 
4884 
4885  /* free the worker threads */
4886  for ( f = 1; f < team->t.t_nproc; ++ f ) {
4887  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4888  __kmp_free_thread( team->t.t_threads[ f ] );
4889  team->t.t_threads[ f ] = NULL;
4890  }
4891 
4892 
4893  /* put the team back in the team pool */
4894  /* TODO limit size of team pool, call reap_team if pool too large */
4895  team->t.t_next_pool = (kmp_team_t*) __kmp_team_pool;
4896  __kmp_team_pool = (volatile kmp_team_t*) team;
4897  }
4898 
4899  KMP_MB();
4900 }
4901 
4902 
4903 /* reap the team. destroy it, reclaim all its resources and free its memory */
4904 kmp_team_t *
4905 __kmp_reap_team( kmp_team_t *team )
4906 {
4907  kmp_team_t *next_pool = team->t.t_next_pool;
4908 
4909  KMP_DEBUG_ASSERT( team );
4910  KMP_DEBUG_ASSERT( team->t.t_dispatch );
4911  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
4912  KMP_DEBUG_ASSERT( team->t.t_threads );
4913  KMP_DEBUG_ASSERT( team->t.t_argv );
4914 
4915  /* TODO clean the threads that are a part of this? */
4916 
4917  /* free stuff */
4918 
4919  __kmp_free_team_arrays( team );
4920  if ( team->t.t_argv != &team->t.t_inline_argv[0] )
4921  __kmp_free( (void*) team->t.t_argv );
4922  __kmp_free( team );
4923 
4924  KMP_MB();
4925  return next_pool;
4926 }
4927 
4928 //
4929 // Free the thread. Don't reap it, just place it on the pool of available
4930 // threads.
4931 //
4932 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
4933 // binding for the affinity mechanism to be useful.
4934 //
4935 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
4936 // However, we want to avoid a potential performance problem by always
4937 // scanning through the list to find the correct point at which to insert
4938 // the thread (potential N**2 behavior). To do this we keep track of the
4939 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
4940 // With single-level parallelism, threads will always be added to the tail
4941 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
4942 // parallelism, all bets are off and we may need to scan through the entire
4943 // free list.
4944 //
4945 // This change also has a potentially large performance benefit, for some
4946 // applications. Previously, as threads were freed from the hot team, they
4947 // would be placed back on the free list in inverse order. If the hot team
4948 // grew back to its original size, then the freed threads would be placed
4949 // back on the hot team in reverse order. This could cause bad cache
4950 // locality problems on programs where the size of the hot team regularly
4951 // grew and shrunk.
4952 //
4953 // Now, for single-level parallelism, the OMP tid is always == gtid.
4954 //
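//
// Editor's note: the following is a minimal illustrative sketch of the
// cached-insert-point technique described above, using simplified,
// hypothetical names (pool_node_t, pool_head, pool_insert_pt). It is not
// part of the runtime; it only mirrors the logic of __kmp_free_thread below.
//
#if 0
typedef struct pool_node {
    int gtid;
    struct pool_node *next;
} pool_node_t;

static pool_node_t *pool_head = NULL;       /* list kept sorted by ascending gtid */
static pool_node_t *pool_insert_pt = NULL;  /* last place a node was inserted */

static void pool_insert( pool_node_t *node )
{
    pool_node_t **scan;
    /* If the cached insert point is already past the new node, it cannot help
       for this insertion -- fall back to scanning from the head. */
    if ( pool_insert_pt != NULL && pool_insert_pt->gtid > node->gtid ) {
        pool_insert_pt = NULL;
    }
    scan = ( pool_insert_pt != NULL ) ? &pool_insert_pt->next : &pool_head;
    /* Walk forward to the first link whose gtid is >= ours; with single-level
       parallelism this loop usually performs zero iterations. */
    while ( *scan != NULL && (*scan)->gtid < node->gtid ) {
        scan = &(*scan)->next;
    }
    node->next = *scan;             /* splice in, keeping the list sorted */
    pool_insert_pt = *scan = node;  /* remember where we inserted */
}
#endif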
4955 void
4956 __kmp_free_thread( kmp_info_t *this_th )
4957 {
4958  int gtid;
4959  kmp_info_t **scan;
4960 
4961  KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
4962  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
4963 
4964  KMP_DEBUG_ASSERT( this_th );
4965 
4966  // When moving a thread to the pool, switch it to wait on its own b_go flag with an uninitialized (NULL) team.
4967  int b;
4968  kmp_balign_t *balign = this_th->th.th_bar;
4969  for (b=0; b<bs_last_barrier; ++b) {
4970  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
4971  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4972  balign[b].bb.team = NULL;
4973  }
4974 
4975 
4976  /* put thread back on the free pool */
4977  TCW_PTR(this_th->th.th_team, NULL);
4978  TCW_PTR(this_th->th.th_root, NULL);
4979  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
4980 
4981  //
4982  // If the __kmp_thread_pool_insert_pt is already past the new insert
4983  // point, then we need to re-scan the entire list.
4984  //
4985  gtid = this_th->th.th_info.ds.ds_gtid;
4986  if ( __kmp_thread_pool_insert_pt != NULL ) {
4987  KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
4988  if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
4989  __kmp_thread_pool_insert_pt = NULL;
4990  }
4991  }
4992 
4993  //
4994  // Scan down the list to find the place to insert the thread.
4995  // scan is the address of a link in the list, possibly the address of
4996  // __kmp_thread_pool itself.
4997  //
4998  // In the absence of nested parallelism, the for loop will have 0 iterations.
4999  //
5000  if ( __kmp_thread_pool_insert_pt != NULL ) {
5001  scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5002  }
5003  else {
5004  scan = (kmp_info_t **)&__kmp_thread_pool;
5005  }
5006  for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5007  scan = &( (*scan)->th.th_next_pool ) );
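  /* (the loop body above is intentionally empty -- note the trailing ';';
     the scan itself leaves 'scan' pointing at the first link whose gtid is >= ours) */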
5008 
5009  //
5010  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5011  // to its address.
5012  //
5013  TCW_PTR(this_th->th.th_next_pool, *scan);
5014  __kmp_thread_pool_insert_pt = *scan = this_th;
5015  KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5016  || ( this_th->th.th_info.ds.ds_gtid
5017  < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5018  TCW_4(this_th->th.th_in_pool, TRUE);
5019  __kmp_thread_pool_nth++;
5020 
5021  TCW_4(__kmp_nth, __kmp_nth - 1);
5022 
5023 #ifdef KMP_ADJUST_BLOCKTIME
5024  /* Adjust blocktime back to user setting or default if necessary */
5025  /* Middle initialization might never have occurred */
5026  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5027  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5028  if ( __kmp_nth <= __kmp_avail_proc ) {
5029  __kmp_zero_bt = FALSE;
5030  }
5031  }
5032 #endif /* KMP_ADJUST_BLOCKTIME */
5033 
5034  KMP_MB();
5035 }
5036 
5037 
5038 /* ------------------------------------------------------------------------ */
5039 
5040 void *
5041 __kmp_launch_thread( kmp_info_t *this_thr )
5042 {
5043  int gtid = this_thr->th.th_info.ds.ds_gtid;
5044 /* void *stack_data;*/
5045  kmp_team_t *(*volatile pteam);
5046 
5047  KMP_MB();
5048  KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5049 
5050  if( __kmp_env_consistency_check ) {
5051  this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak?
5052  }
5053 
5054  /* This is the place where threads wait for work */
5055  while( ! TCR_4(__kmp_global.g.g_done) ) {
5056  KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5057  KMP_MB();
5058 
5059  /* wait for work to do */
5060  KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5061 
5062  /* No tid yet since not part of a team */
5063  __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5064 
5065  pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
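  /* pteam is the address of this thread's th_team field; re-reading through it
     below picks up the team that was assigned to us while we waited at the fork barrier. */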
5066 
5067  /* have we been allocated? */
5068  if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5069  /* we were just woken up, so run our new task */
5070  if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5071  int rc;
5072  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5073  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5074 
5075  updateHWFPControl (*pteam);
5076 
5077  KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop);
5078  {
5079  KMP_TIME_BLOCK(USER_worker_invoke);
5080  rc = (*pteam)->t.t_invoke( gtid );
5081  }
5082  KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop);
5083  KMP_ASSERT( rc );
5084 
5085  KMP_MB();
5086  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5087  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5088  }
5089  /* join barrier after parallel region */
5090  __kmp_join_barrier( gtid );
5091  }
5092  }
5093  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5094 
5095  if ( TCR_PTR( this_thr->th.th_task_team ) != NULL ) {
5096  __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
5097  }
5098  /* run the destructors for the threadprivate data for this thread */
5099  __kmp_common_destroy_gtid( gtid );
5100 
5101  KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5102  KMP_MB();
5103  return this_thr;
5104 }
5105 
5106 /* ------------------------------------------------------------------------ */
5107 /* ------------------------------------------------------------------------ */
5108 
5109 void
5110 __kmp_internal_end_dest( void *specific_gtid )
5111 {
5112  #if KMP_COMPILER_ICC
5113  #pragma warning( push )
5114  #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits
5115  #endif
5116  // Make sure no significant bits are lost
5117  int gtid = (kmp_intptr_t)specific_gtid - 1;
5118  #if KMP_COMPILER_ICC
5119  #pragma warning( pop )
5120  #endif
5121 
5122  KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5123  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage,
5124  * because 0 is reserved for the nothing-stored case */
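  /* (for example, a stored value of 1 decodes to gtid 0, the initial thread) */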
5125 
5126  /* josh: One reason for setting the gtid specific data even when it is being
5127  destroyed by pthread is to allow gtid lookup through thread specific data
5128  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5129  that gets executed in the call to __kmp_internal_end_thread, actually
5130  gets the gtid through the thread specific data. Setting it here seems
5131  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5132  to run smoothly.
5133  todo: get rid of this after we remove the dependence on
5134  __kmp_gtid_get_specific
5135  */
5136  if(gtid >= 0 && KMP_UBER_GTID(gtid))
5137  __kmp_gtid_set_specific( gtid );
5138  #ifdef KMP_TDATA_GTID
5139  __kmp_gtid = gtid;
5140  #endif
5141  __kmp_internal_end_thread( gtid );
5142 }
5143 
5144 #if KMP_OS_UNIX && GUIDEDLL_EXPORTS
5145 
5146 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
5147 // perfectly, but in the real libiomp5.so I have no evidence it is ever called. However, the -fini linker
5148 // option in makefile.mk works fine.
5149 
5150 __attribute__(( destructor ))
5151 void
5152 __kmp_internal_end_dtor( void )
5153 {
5154  __kmp_internal_end_atexit();
5155 }
5156 
5157 void
5158 __kmp_internal_end_fini( void )
5159 {
5160  __kmp_internal_end_atexit();
5161 }
5162 
5163 #endif
5164 
5165 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5166 void
5167 __kmp_internal_end_atexit( void )
5168 {
5169  KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5170  /* [Windows]
5171  josh: ideally, we want to completely shut down the library in this atexit handler, but
5172  stat code that depends on thread specific data for gtid fails because that data becomes
5173  unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5174  instead. We should eventually remove the dependency on __kmp_get_specific_gtid in the
5175  stat code and use __kmp_internal_end_library to cleanly shut down the library.
5176 
5177 // TODO: Can some of this comment about GVS be removed?
5178  I suspect that the offending stat code is executed when the calling thread tries to
5179  clean up a dead root thread's data structures, resulting in GVS code trying to close
5180  the GVS structures for that thread, but since the stat code uses
5181  __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5182  cleaning up itself instead of another thread, it gets confused. This happens because
5183  allowing a thread to unregister and cleanup another thread is a recent modification for
5184  addressing an issue with Maxon Cinema4D. Based on the current design (20050722), a
5185  thread may end up trying to unregister another thread only if thread death does not
5186  trigger the calling of __kmp_internal_end_thread. For Linux* OS, there is the thread
5187  specific data destructor function to detect thread death. For Windows dynamic, there
5188  is DllMain(THREAD_DETACH). For Windows static, there is nothing. Thus, the
5189  workaround is applicable only for Windows static stat library.
5190  */
5191  __kmp_internal_end_library( -1 );
5192  #if KMP_OS_WINDOWS
5193  __kmp_close_console();
5194  #endif
5195 }
5196 
5197 static void
5198 __kmp_reap_thread(
5199  kmp_info_t * thread,
5200  int is_root
5201 ) {
5202 
5203  // It is assumed __kmp_forkjoin_lock is acquired.
5204 
5205  int gtid;
5206 
5207  KMP_DEBUG_ASSERT( thread != NULL );
5208 
5209  gtid = thread->th.th_info.ds.ds_gtid;
5210 
5211  if ( ! is_root ) {
5212 
5213  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5214  /* Assume the threads are at the fork barrier here */
5215  KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5216  /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5217  kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5218  __kmp_release_64(&flag);
5219  }; // if
5220 
5221 
5222  // Terminate OS thread.
5223  __kmp_reap_worker( thread );
5224 
5225  //
5226  // The thread was killed asynchronously. If it was actively
5227  // spinning in the thread pool, decrement the global count.
5228  //
5229  // There is a small timing hole here - if the worker thread was
5230  // just waking up after sleeping in the pool, had reset its
5231  // th_active_in_pool flag but not decremented the global counter
5232  // __kmp_thread_pool_active_nth yet, then the global counter
5233  // might not get updated.
5234  //
5235  // Currently, this can only happen as the library is unloaded,
5236  // so there are no harmful side effects.
5237  //
5238  if ( thread->th.th_active_in_pool ) {
5239  thread->th.th_active_in_pool = FALSE;
5240  KMP_TEST_THEN_DEC32(
5241  (kmp_int32 *) &__kmp_thread_pool_active_nth );
5242  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5243  }
5244 
5245  // Decrement # of [worker] threads in the pool.
5246  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5247  --__kmp_thread_pool_nth;
5248  }; // if
5249 
5250  // Free the fast memory for tasking
5251  #if USE_FAST_MEMORY
5252  __kmp_free_fast_memory( thread );
5253  #endif /* USE_FAST_MEMORY */
5254 
5255  __kmp_suspend_uninitialize_thread( thread );
5256 
5257  KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5258  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5259 
5260  -- __kmp_all_nth;
5261  // __kmp_nth was decremented when the thread was added to the pool.
5262 
5263 #ifdef KMP_ADJUST_BLOCKTIME
5264  /* Adjust blocktime back to user setting or default if necessary */
5265  /* Middle initialization might never have occurred */
5266  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5267  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5268  if ( __kmp_nth <= __kmp_avail_proc ) {
5269  __kmp_zero_bt = FALSE;
5270  }
5271  }
5272 #endif /* KMP_ADJUST_BLOCKTIME */
5273 
5274  /* free the memory being used */
5275  if( __kmp_env_consistency_check ) {
5276  if ( thread->th.th_cons ) {
5277  __kmp_free_cons_stack( thread->th.th_cons );
5278  thread->th.th_cons = NULL;
5279  }; // if
5280  }
5281 
5282  if ( thread->th.th_pri_common != NULL ) {
5283  __kmp_free( thread->th.th_pri_common );
5284  thread->th.th_pri_common = NULL;
5285  }; // if
5286 
5287  #if KMP_USE_BGET
5288  if ( thread->th.th_local.bget_data != NULL ) {
5289  __kmp_finalize_bget( thread );
5290  }; // if
5291  #endif
5292 
5293 #if KMP_AFFINITY_SUPPORTED
5294  if ( thread->th.th_affin_mask != NULL ) {
5295  KMP_CPU_FREE( thread->th.th_affin_mask );
5296  thread->th.th_affin_mask = NULL;
5297  }; // if
5298 #endif /* KMP_AFFINITY_SUPPORTED */
5299 
5300  __kmp_reap_team( thread->th.th_serial_team );
5301  thread->th.th_serial_team = NULL;
5302  __kmp_free( thread );
5303 
5304  KMP_MB();
5305 
5306 } // __kmp_reap_thread
5307 
5308 static void
5309 __kmp_internal_end(void)
5310 {
5311  int i;
5312 
5313  /* First, unregister the library */
5314  __kmp_unregister_library();
5315 
5316  #if KMP_OS_WINDOWS
5317  /* In Win static library, we can't tell when a root actually dies, so we
5318  reclaim the data structures for any root threads that have died but not
5319  unregistered themselves, in order to shut down cleanly.
5320  In Win dynamic library we also can't tell when a thread dies.
5321  */
5322  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5323  #endif
5324 
5325  for( i=0 ; i<__kmp_threads_capacity ; i++ )
5326  if( __kmp_root[i] )
5327  if( __kmp_root[i]->r.r_active )
5328  break;
5329  KMP_MB(); /* Flush all pending memory write invalidates. */
5330  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5331 
5332  if ( i < __kmp_threads_capacity ) {
5333  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5334  KMP_MB(); /* Flush all pending memory write invalidates. */
5335 
5336  //
5337  // Need to check that monitor was initialized before reaping it.
5338  // If we are called from __kmp_atfork_child (which sets
5339  // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5340  // contain valid data, but it is only valid in the parent process,
5341  // not the child.
5342  //
5343  // One of the possible fixes for CQ138434 / CQ140126
5344  // (used in 20091103_dreamworks patch)
5345  //
5346  // New behavior (201008): instead of keying off of the flag
5347  // __kmp_init_parallel, the monitor thread creation is keyed off
5348  // of the new flag __kmp_init_monitor.
5349  //
5350  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5351  if ( TCR_4( __kmp_init_monitor ) ) {
5352  __kmp_reap_monitor( & __kmp_monitor );
5353  TCW_4( __kmp_init_monitor, 0 );
5354  }
5355  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5356  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5357  } else {
5358  /* TODO move this to cleanup code */
5359  #ifdef KMP_DEBUG
5360  /* make sure that everything has properly ended */
5361  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5362  if( __kmp_root[i] ) {
5363 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber threads alive here
5364  KMP_ASSERT( ! __kmp_root[i]->r.r_active ); // TODO: can they be active?
5365  }
5366  }
5367  #endif
5368 
5369  KMP_MB();
5370 
5371  // Reap the worker threads.
5372  // This is valid for now, but be careful if threads are reaped sooner.
5373  while ( __kmp_thread_pool != NULL ) { // Loop thru all the threads in the pool.
5374  // Get the next thread from the pool.
5375  kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5376  __kmp_thread_pool = thread->th.th_next_pool;
5377  // Reap it.
5378  thread->th.th_next_pool = NULL;
5379  thread->th.th_in_pool = FALSE;
5380  __kmp_reap_thread( thread, 0 );
5381  }; // while
5382  __kmp_thread_pool_insert_pt = NULL;
5383 
5384  // Reap teams.
5385  while ( __kmp_team_pool != NULL ) { // Loop thru all the teams in the pool.
5386  // Get the next team from the pool.
5387  kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5388  __kmp_team_pool = team->t.t_next_pool;
5389  // Reap it.
5390  team->t.t_next_pool = NULL;
5391  __kmp_reap_team( team );
5392  }; // while
5393 
5394  __kmp_reap_task_teams( );
5395 
5396  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5397  // TBD: Add some checking...
5398  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5399  }
5400 
5401  /* Make sure all threadprivate destructors get run by joining with all worker
5402  threads before resetting this flag */
5403  TCW_SYNC_4(__kmp_init_common, FALSE);
5404 
5405  KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5406  KMP_MB();
5407 
5408  //
5409  // See note above: One of the possible fixes for CQ138434 / CQ140126
5410  //
5411  // FIXME: push both code fragments down and CSE them?
5412  // push them into __kmp_cleanup() ?
5413  //
5414  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5415  if ( TCR_4( __kmp_init_monitor ) ) {
5416  __kmp_reap_monitor( & __kmp_monitor );
5417  TCW_4( __kmp_init_monitor, 0 );
5418  }
5419  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5420  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5421 
5422  } /* else !__kmp_global.t_active */
5423  TCW_4(__kmp_init_gtid, FALSE);
5424  KMP_MB(); /* Flush all pending memory write invalidates. */
5425 
5426 
5427  __kmp_cleanup();
5428 }
5429 
5430 void
5431 __kmp_internal_end_library( int gtid_req )
5432 {
5433  int i;
5434 
5435  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5436  /* this shouldn't be a race condition because __kmp_internal_end() is the
5437  * only place to clear __kmp_serial_init */
5438  /* we'll check this later too, after we get the lock */
5439  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5440  // because the next check will work in any case.
5441  if( __kmp_global.g.g_abort ) {
5442  KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5443  /* TODO abort? */
5444  return;
5445  }
5446  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5447  KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5448  return;
5449  }
5450 
5451 
5452  KMP_MB(); /* Flush all pending memory write invalidates. */
5453 
5454  /* find out who we are and what we should do */
5455  {
5456  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5457  KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req ));
5458  if( gtid == KMP_GTID_SHUTDOWN ) {
5459  KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5460  return;
5461  } else if( gtid == KMP_GTID_MONITOR ) {
5462  KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5463  return;
5464  } else if( gtid == KMP_GTID_DNE ) {
5465  KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
5466  /* we don't know who we are, but we may still shut down the library */
5467  } else if( KMP_UBER_GTID( gtid )) {
5468  /* unregister ourselves as an uber thread. gtid is no longer valid */
5469  if( __kmp_root[gtid]->r.r_active ) {
5470  __kmp_global.g.g_abort = -1;
5471  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5472  KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5473  return;
5474  } else {
5475  KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5476  __kmp_unregister_root_current_thread( gtid );
5477  }
5478  } else {
5479  /* worker threads may call this function through the atexit handler, if they call exit() */
5480  /* For now, skip the usual subsequent processing and just dump the debug buffer.
5481  TODO: do a thorough shutdown instead
5482  */
5483  #ifdef DUMP_DEBUG_ON_EXIT
5484  if ( __kmp_debug_buf )
5485  __kmp_dump_debug_buffer( );
5486  #endif
5487  return;
5488  }
5489  }
5490  /* synchronize the termination process */
5491  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5492 
5493  /* have we already finished */
5494  if( __kmp_global.g.g_abort ) {
5495  KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
5496  /* TODO abort? */
5497  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5498  return;
5499  }
5500  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5501  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5502  return;
5503  }
5504 
5505  /* We need this lock to enforce mutual exclusion between this reading of
5506  __kmp_threads_capacity and the writing by __kmp_register_root.
5507  Alternatively, we can use a counter of roots that is
5508  atomically updated by __kmp_get_global_thread_id_reg,
5509  __kmp_do_serial_initialize and __kmp_internal_end_*.
5510  */
5511  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
5512 
5513  /* now we can safely conduct the actual termination */
5514  __kmp_internal_end();
5515 
5516  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5517  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5518 
5519  KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
5520 
5521  #ifdef DUMP_DEBUG_ON_EXIT
5522  if ( __kmp_debug_buf )
5523  __kmp_dump_debug_buffer();
5524  #endif
5525 
5526  #if KMP_OS_WINDOWS
5527  __kmp_close_console();
5528  #endif
5529 
5530  __kmp_fini_allocator();
5531 
5532 } // __kmp_internal_end_library
5533 
5534 void
5535 __kmp_internal_end_thread( int gtid_req )
5536 {
5537  int i;
5538 
5539  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5540  /* this shouldn't be a race condition because __kmp_internal_end() is the
5541  * only place to clear __kmp_serial_init */
5542  /* we'll check this later too, after we get the lock */
5543  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5544  // because the next check will work in any case.
5545  if( __kmp_global.g.g_abort ) {
5546  KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
5547  /* TODO abort? */
5548  return;
5549  }
5550  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5551  KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
5552  return;
5553  }
5554 
5555  KMP_MB(); /* Flush all pending memory write invalidates. */
5556 
5557  /* find out who we are and what we should do */
5558  {
5559  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5560  KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req ));
5561  if( gtid == KMP_GTID_SHUTDOWN ) {
5562  KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
5563  return;
5564  } else if( gtid == KMP_GTID_MONITOR ) {
5565  KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
5566  return;
5567  } else if( gtid == KMP_GTID_DNE ) {
5568  KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
5569  return;
5570  /* we don't know who we are */
5571  } else if( KMP_UBER_GTID( gtid )) {
5572  /* unregister ourselves as an uber thread. gtid is no longer valid */
5573  if( __kmp_root[gtid]->r.r_active ) {
5574  __kmp_global.g.g_abort = -1;
5575  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5576  KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
5577  return;
5578  } else {
5579  KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
5580  __kmp_unregister_root_current_thread( gtid );
5581  }
5582  } else {
5583  /* just a worker thread, let's leave */
5584  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
5585 
5586  if ( gtid >= 0 ) {
5587  kmp_info_t *this_thr = __kmp_threads[ gtid ];
5588  if (TCR_PTR(this_thr->th.th_task_team) != NULL) {
5589  __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
5590  }
5591  }
5592 
5593  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
5594  return;
5595  }
5596  }
5597  #if defined GUIDEDLL_EXPORTS
5598  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
5599  // because it is better to shut down later in the library destructor.
5600  // The reason for this change is a performance problem when a non-OpenMP thread
5601  // in a loop forks and joins many OpenMP threads. We can save a lot of time by
5602  // keeping worker threads alive until program shutdown.
5603  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
5604  // Windows (DPD200287443) that occurs when using critical sections from foreign threads.
5605  KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
5606  return;
5607  #endif
5608  /* synchronize the termination process */
5609  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5610 
5611  /* have we already finished */
5612  if( __kmp_global.g.g_abort ) {
5613  KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
5614  /* TODO abort? */
5615  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5616  return;
5617  }
5618  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5619  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5620  return;
5621  }
5622 
5623  /* We need this lock to enforce mutual exclusion between this reading of
5624  __kmp_threads_capacity and the writing by __kmp_register_root.
5625  Alternatively, we can use a counter of roots that is
5626  atomically updated by __kmp_get_global_thread_id_reg,
5627  __kmp_do_serial_initialize and __kmp_internal_end_*.
5628  */
5629 
5630  /* should we finish the run-time? are all siblings done? */
5631  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
5632 
5633  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5634  if ( KMP_UBER_GTID( i ) ) {
5635  KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
5636  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5637  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5638  return;
5639  };
5640  }
5641 
5642  /* now we can safely conduct the actual termination */
5643 
5644  __kmp_internal_end();
5645 
5646  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5647  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5648 
5649  KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
5650 
5651  #ifdef DUMP_DEBUG_ON_EXIT
5652  if ( __kmp_debug_buf )
5653  __kmp_dump_debug_buffer();
5654  #endif
5655 } // __kmp_internal_end_thread
5656 
5657 // -------------------------------------------------------------------------------------------------
5658 // Library registration stuff.
5659 
5660 static long __kmp_registration_flag = 0;
5661  // Random value used to indicate library initialization.
5662 static char * __kmp_registration_str = NULL;
5663  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
5664 
5665 
5666 static inline
5667 char *
5668 __kmp_reg_status_name() {
5669  /*
5670  On RHEL 3u5 if linked statically, getpid() returns different values in each thread.
5671  If registration and unregistration go in different threads (omp_misc_other_root_exit.cpp test case),
5672  the name of the registered_lib_env env var cannot be found, because the name will contain a different pid.
5673  */
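 /* The resulting name looks like, e.g., "__KMP_REGISTERED_LIB_12345" when the pid is 12345 (illustrative). */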
5674  return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
5675 } // __kmp_reg_status_name
5676 
5677 
5678 void
5679 __kmp_register_library_startup(
5680  void
5681 ) {
5682 
5683  char * name = __kmp_reg_status_name(); // Name of the environment variable.
5684  int done = 0;
5685  union {
5686  double dtime;
5687  long ltime;
5688  } time;
5689  #if KMP_OS_WINDOWS
5690  __kmp_initialize_system_tick();
5691  #endif
5692  __kmp_read_system_time( & time.dtime );
5693  __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
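  // The flag combines a fixed 0xCAFE marker in the upper half with the low 16 bits
  // of the current time, giving a cheap per-process tag.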
5694  __kmp_registration_str =
5695  __kmp_str_format(
5696  "%p-%lx-%s",
5697  & __kmp_registration_flag,
5698  __kmp_registration_flag,
5699  KMP_LIBRARY_FILE
5700  );
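  // The resulting value has the form "<flag address>-<flag value in hex>-<library file name>",
  // e.g. (illustrative only) "0x7f0123456789-cafe1a2b-libiomp5.so".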
5701 
5702  KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
5703 
5704  while ( ! done ) {
5705 
5706  char * value = NULL; // Actual value of the environment variable.
5707 
5708  // Set the environment variable, but do not overwrite it if it already exists.
5709  __kmp_env_set( name, __kmp_registration_str, 0 );
5710  // Check that the variable was actually written.
5711  value = __kmp_env_get( name );
5712  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
5713 
5714  done = 1; // Ok, environment variable set successfully, exit the loop.
5715 
5716  } else {
5717 
5718  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
5719  // Check whether it is alive or dead.
5720  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
5721  char * tail = value;
5722  char * flag_addr_str = NULL;
5723  char * flag_val_str = NULL;
5724  char const * file_name = NULL;
5725  __kmp_str_split( tail, '-', & flag_addr_str, & tail );
5726  __kmp_str_split( tail, '-', & flag_val_str, & tail );
5727  file_name = tail;
5728  if ( tail != NULL ) {
5729  long * flag_addr = 0;
5730  long flag_val = 0;
5731  sscanf( flag_addr_str, "%p", & flag_addr );
5732  sscanf( flag_val_str, "%lx", & flag_val );
5733  if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
5734  // First, check whether environment-encoded address is mapped into addr space.
5735  // If so, dereference it to see if it still has the right value.
5736 
5737  if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
5738  neighbor = 1;
5739  } else {
5740  // If not, then we know the other copy of the library is no longer running.
5741  neighbor = 2;
5742  }; // if
5743  }; // if
5744  }; // if
5745  switch ( neighbor ) {
5746  case 0 : // Cannot parse environment variable -- neighbor status unknown.
5747  // Assume it is the incompatible format of a future version of the library.
5748  // Assume the other library is alive.
5749  // WARN( ... ); // TODO: Issue a warning.
5750  file_name = "unknown library";
5751  // Attention! Falling through to the next case is intentional.
5752  case 1 : { // Neighbor is alive.
5753  // Check whether it is allowed.
5754  char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
5755  if ( ! __kmp_str_match_true( duplicate_ok ) ) {
5756  // That's not allowed. Issue fatal error.
5757  __kmp_msg(
5758  kmp_ms_fatal,
5759  KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
5760  KMP_HNT( DuplicateLibrary ),
5761  __kmp_msg_null
5762  );
5763  }; // if
5764  KMP_INTERNAL_FREE( duplicate_ok );
5765  __kmp_duplicate_library_ok = 1;
5766  done = 1; // Exit the loop.
5767  } break;
5768  case 2 : { // Neighbor is dead.
5769  // Clear the variable and try to register library again.
5770  __kmp_env_unset( name );
5771  } break;
5772  default : {
5773  KMP_DEBUG_ASSERT( 0 );
5774  } break;
5775  }; // switch
5776 
5777  }; // if
5778  KMP_INTERNAL_FREE( (void *) value );
5779 
5780  }; // while
5781  KMP_INTERNAL_FREE( (void *) name );
5782 
5783 } // func __kmp_register_library_startup
5784 
5785 
5786 void
5787 __kmp_unregister_library( void ) {
5788 
5789  char * name = __kmp_reg_status_name();
5790  char * value = __kmp_env_get( name );
5791 
5792  KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
5793  KMP_DEBUG_ASSERT( __kmp_registration_str != NULL );
5794  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
5795  // Ok, this is our variable. Delete it.
5796  __kmp_env_unset( name );
5797  }; // if
5798 
5799  KMP_INTERNAL_FREE( __kmp_registration_str );
5800  KMP_INTERNAL_FREE( value );
5801  KMP_INTERNAL_FREE( name );
5802 
5803  __kmp_registration_flag = 0;
5804  __kmp_registration_str = NULL;
5805 
5806 } // __kmp_unregister_library
5807 
5808 
5809 // End of Library registration stuff.
5810 // -------------------------------------------------------------------------------------------------
5811 
5812 static void
5813 __kmp_do_serial_initialize( void )
5814 {
5815  int i, gtid;
5816  int size;
5817 
5818  KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
5819 
5820  KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
5821  KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
5822  KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
5823  KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
5824  KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
5825 
5826  __kmp_validate_locks();
5827 
5828  /* Initialize internal memory allocator */
5829  __kmp_init_allocator();
5830 
5831  /* Register the library startup via an environment variable
5832  and check to see whether another copy of the library is already
5833  registered. */
5834 
5835  __kmp_register_library_startup( );
5836 
5837  /* TODO reinitialization of library */
5838  if( TCR_4(__kmp_global.g.g_done) ) {
5839  KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
5840  }
5841 
5842  __kmp_global.g.g_abort = 0;
5843  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
5844 
5845  /* initialize the locks */
5846 #if KMP_USE_ADAPTIVE_LOCKS
5847 #if KMP_DEBUG_ADAPTIVE_LOCKS
5848  __kmp_init_speculative_stats();
5849 #endif
5850 #endif
5851  __kmp_init_lock( & __kmp_global_lock );
5852  __kmp_init_queuing_lock( & __kmp_dispatch_lock );
5853  __kmp_init_lock( & __kmp_debug_lock );
5854  __kmp_init_atomic_lock( & __kmp_atomic_lock );
5855  __kmp_init_atomic_lock( & __kmp_atomic_lock_1i );
5856  __kmp_init_atomic_lock( & __kmp_atomic_lock_2i );
5857  __kmp_init_atomic_lock( & __kmp_atomic_lock_4i );
5858  __kmp_init_atomic_lock( & __kmp_atomic_lock_4r );
5859  __kmp_init_atomic_lock( & __kmp_atomic_lock_8i );
5860  __kmp_init_atomic_lock( & __kmp_atomic_lock_8r );
5861  __kmp_init_atomic_lock( & __kmp_atomic_lock_8c );
5862  __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
5863  __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
5864  __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
5865  __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
5866  __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
5867  __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock );
5868  __kmp_init_bootstrap_lock( & __kmp_exit_lock );
5869  __kmp_init_bootstrap_lock( & __kmp_monitor_lock );
5870  __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
5871 
5872  /* conduct initialization and initial setup of configuration */
5873 
5874  __kmp_runtime_initialize();
5875 
5876  // Some global variable initialization moved here from kmp_env_initialize()
5877 #ifdef KMP_DEBUG
5878  kmp_diag = 0;
5879 #endif
5880  __kmp_abort_delay = 0;
5881 
5882  // From __kmp_init_dflt_team_nth()
5883  /* assume the entire machine will be used */
5884  __kmp_dflt_team_nth_ub = __kmp_xproc;
5885  if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
5886  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
5887  }
5888  if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
5889  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
5890  }
5891  __kmp_max_nth = __kmp_sys_max_nth;
5892 
5893  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
5894  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
5895  __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
5896  __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
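  // (descriptive note: the monitor wakeup rate is derived from the blocktime, and bt_intervals
  // is how many monitor wakeups span one blocktime period -- see the macros for the exact arithmetic)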
5897  // From "KMP_LIBRARY" part of __kmp_env_initialize()
5898  __kmp_library = library_throughput;
5899  // From KMP_SCHEDULE initialization
5900  __kmp_static = kmp_sch_static_balanced;
5901  // AC: do not use analytical here, because it is non-monotonic
5902  //__kmp_guided = kmp_sch_guided_iterative_chunked;
5903  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
5904  // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch bit control and barrier method
5905  // control parts
5906  #if KMP_FAST_REDUCTION_BARRIER
5907  #define kmp_reduction_barrier_gather_bb ((int)1)
5908  #define kmp_reduction_barrier_release_bb ((int)1)
5909  #define kmp_reduction_barrier_gather_pat bp_hyper_bar
5910  #define kmp_reduction_barrier_release_pat bp_hyper_bar
5911  #endif // KMP_FAST_REDUCTION_BARRIER
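  // (note: for the tree-style patterns the branch bits control the fan-out of the barrier
  // tree -- 2^bits children per node; the reduction-barrier overrides inside the loop below
  // select the hyper pattern with branch bits of 1)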
5912  for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
5913  __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
5914  __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
5915  __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
5916  __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
5917  #if KMP_FAST_REDUCTION_BARRIER
5918  if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
5919  __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
5920  __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
5921  __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
5922  __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
5923  }
5924  #endif // KMP_FAST_REDUCTION_BARRIER
5925  }
5926  #if KMP_FAST_REDUCTION_BARRIER
5927  #undef kmp_reduction_barrier_release_pat
5928  #undef kmp_reduction_barrier_gather_pat
5929  #undef kmp_reduction_barrier_release_bb
5930  #undef kmp_reduction_barrier_gather_bb
5931  #endif // KMP_FAST_REDUCTION_BARRIER
5932  #if KMP_MIC
5933  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
5934  __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3; // plain gather
5935  __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1; // forkjoin release
5936  __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
5937  __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
5938 #if KMP_FAST_REDUCTION_BARRIER
5939  __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
5940  __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
5941 #endif
5942  #endif
5943 
5944  // From KMP_CHECKS initialization
5945 #ifdef KMP_DEBUG
5946  __kmp_env_checks = TRUE; /* development versions have the extra checks */
5947 #else
5948  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
5949 #endif
5950 
5951  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
5952  __kmp_foreign_tp = TRUE;
5953 
5954  __kmp_global.g.g_dynamic = FALSE;
5955  __kmp_global.g.g_dynamic_mode = dynamic_default;
5956 
5957  __kmp_env_initialize( NULL );
5958 
5959  // Print all messages in message catalog for testing purposes.
5960  #ifdef KMP_DEBUG
5961  char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
5962  if ( __kmp_str_match_true( val ) ) {
5963  kmp_str_buf_t buffer;
5964  __kmp_str_buf_init( & buffer );
5965  __kmp_i18n_dump_catalog( & buffer );
5966  __kmp_printf( "%s", buffer.str );
5967  __kmp_str_buf_free( & buffer );
5968  }; // if
5969  __kmp_env_free( & val );
5970  #endif
5971 
5972  __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
5973  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
5974  __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
5975 
5976 
5977  // If the library is shut down properly, both pools must be NULL. Just in case, set them
5978  // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
5979  KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
5980  KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
5981  KMP_DEBUG_ASSERT( __kmp_team_pool == NULL );
5982  __kmp_thread_pool = NULL;
5983  __kmp_thread_pool_insert_pt = NULL;
5984  __kmp_team_pool = NULL;
5985 
5986  /* Allocate all of the variable sized records */
5987  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
5988  /* Since allocation is cache-aligned, just add extra padding at the end */
5989  size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
5990  __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
5991  __kmp_root = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
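  /* __kmp_threads and __kmp_root share a single allocation: the root array begins
     immediately after the __kmp_threads_capacity thread pointers. */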
5992 
5993  /* init thread counts */
5994  KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
5995  KMP_DEBUG_ASSERT( __kmp_nth == 0 ); // something was wrong in termination.
5996  __kmp_all_nth = 0;
5997  __kmp_nth = 0;
5998 
5999  /* setup the uber master thread and hierarchy */
6000  gtid = __kmp_register_root( TRUE );
6001  KA_TRACE( 10, ("__kmp_do_serial_initialize T#%d\n", gtid ));
6002  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6003  KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6004 
6005  KMP_MB(); /* Flush all pending memory write invalidates. */
6006 
6007  __kmp_common_initialize();
6008 
6009  #if KMP_OS_UNIX
6010  /* invoke the child fork handler */
6011  __kmp_register_atfork();
6012  #endif
6013 
6014  #if ! defined GUIDEDLL_EXPORTS
6015  {
6016  /* Invoke the exit handler when the program finishes, only for static library.
6017  For dynamic library, we already have _fini and DllMain.
6018  */
6019  int rc = atexit( __kmp_internal_end_atexit );
6020  if ( rc != 0 ) {
6021  __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6022  }; // if
6023  }
6024  #endif
6025 
6026  #if KMP_HANDLE_SIGNALS
6027  #if KMP_OS_UNIX
6028  /* NOTE: make sure that this is called before the user installs
6029  * their own signal handlers so that the user handlers
6030  * are called first. That way they can return false,
6031  * skip our handler, avoid terminating the library,
6032  * and continue execution where they left off. */
6033  __kmp_install_signals( FALSE );
6034  #endif /* KMP_OS_UNIX */
6035  #if KMP_OS_WINDOWS
6036  __kmp_install_signals( TRUE );
6037  #endif /* KMP_OS_WINDOWS */
6038  #endif
6039 
6040  /* we have finished the serial initialization */
6041  __kmp_init_counter ++;
6042 
6043  __kmp_init_serial = TRUE;
6044 
6045  if (__kmp_settings) {
6046  __kmp_env_print();
6047  }
6048 
6049 #if OMP_40_ENABLED
6050  if (__kmp_display_env || __kmp_display_env_verbose) {
6051  __kmp_env_print_2();
6052  }
6053 #endif // OMP_40_ENABLED
6054 
6055  KMP_MB();
6056 
6057  KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6058 }
6059 
6060 void
6061 __kmp_serial_initialize( void )
6062 {
6063  if ( __kmp_init_serial ) {
6064  return;
6065  }
6066  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6067  if ( __kmp_init_serial ) {
6068  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6069  return;
6070  }
6071  __kmp_do_serial_initialize();
6072  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6073 }
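/* Illustrative sketch (not part of the runtime sources): the check / acquire /
 * re-check sequence in __kmp_serial_initialize() above (and in
 * __kmp_middle_initialize() below) is a double-checked initialization guarded
 * by a bootstrap lock.  A minimal stand-alone version, with hypothetical names
 * init_done, init_lock and do_init() standing in for __kmp_init_serial,
 * __kmp_initz_lock and __kmp_do_serial_initialize(); the real code also uses
 * the TCR_4/TCW_SYNC_4 macros for ordered access, omitted here:
 */
#include <pthread.h>

static volatile int    init_done = 0;
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;

static void do_init(void) { /* expensive one-time setup goes here */ }

static void ensure_initialized(void)
{
    if (init_done) return;              /* fast path: already initialized    */
    pthread_mutex_lock(&init_lock);
    if (!init_done) {                   /* re-check while holding the lock   */
        do_init();
        init_done = 1;
    }
    pthread_mutex_unlock(&init_lock);
}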
6074 
6075 static void
6076 __kmp_do_middle_initialize( void )
6077 {
6078  int i, j;
6079  int prev_dflt_team_nth;
6080 
6081  if( !__kmp_init_serial ) {
6082  __kmp_do_serial_initialize();
6083  }
6084 
6085  KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
6086 
6087  //
6088  // Save the previous value for the __kmp_dflt_team_nth so that
6089  // we can avoid some reinitialization if it hasn't changed.
6090  //
6091  prev_dflt_team_nth = __kmp_dflt_team_nth;
6092 
6093 #if KMP_AFFINITY_SUPPORTED
6094  //
6095  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6096  // number of cores on the machine.
6097  //
6098  __kmp_affinity_initialize();
6099 
6100  //
6101  // Run through the __kmp_threads array and set the affinity mask
6102  // for each root thread that is currently registered with the RTL.
6103  //
6104  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6105  if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6106  __kmp_affinity_set_init_mask( i, TRUE );
6107  }
6108  }
6109 #endif /* KMP_AFFINITY_SUPPORTED */
6110 
6111  KMP_ASSERT( __kmp_xproc > 0 );
6112  if ( __kmp_avail_proc == 0 ) {
6113  __kmp_avail_proc = __kmp_xproc;
6114  }
6115 
6116  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6117  j = 0;
6118  while ( __kmp_nested_nth.used && ! __kmp_nested_nth.nth[ j ] ) {
6119  __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6120  j++;
6121  }
6122 
6123  if ( __kmp_dflt_team_nth == 0 ) {
6124 #ifdef KMP_DFLT_NTH_CORES
6125  //
6126  // Default #threads = #cores
6127  //
6128  __kmp_dflt_team_nth = __kmp_ncores;
6129  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6130  __kmp_dflt_team_nth ) );
6131 #else
6132  //
6133  // Default #threads = #available OS procs
6134  //
6135  __kmp_dflt_team_nth = __kmp_avail_proc;
6136  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6137  __kmp_dflt_team_nth ) );
6138 #endif /* KMP_DFLT_NTH_CORES */
6139  }
6140 
6141  if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6142  __kmp_dflt_team_nth = KMP_MIN_NTH;
6143  }
6144  if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6145  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6146  }
6147 
6148  //
6149  // There's no harm in continuing if the following check fails,
6150  // but it indicates an error in the previous logic.
6151  //
6152  KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6153 
6154  if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6155  //
6156  // Run through the __kmp_threads array and set the num threads icv
6157  // for each root thread that is currently registered with the RTL
6158  // (which has not already explicitly set its nthreads-var with a
6159  // call to omp_set_num_threads()).
6160  //
6161  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6162  kmp_info_t *thread = __kmp_threads[ i ];
6163  if ( thread == NULL ) continue;
6164  if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6165 
6166  set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6167  }
6168  }
6169  KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6170  __kmp_dflt_team_nth) );
6171 
6172 #ifdef KMP_ADJUST_BLOCKTIME
6173  /* Adjust blocktime to zero if necessary */
6174  /* now that __kmp_avail_proc is set */
6175  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6176  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6177  if ( __kmp_nth > __kmp_avail_proc ) {
6178  __kmp_zero_bt = TRUE;
6179  }
6180  }
6181 #endif /* KMP_ADJUST_BLOCKTIME */
6182 
6183  /* we have finished middle initialization */
6184  TCW_SYNC_4(__kmp_init_middle, TRUE);
6185 
6186  KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6187 }
6188 
6189 void
6190 __kmp_middle_initialize( void )
6191 {
6192  if ( __kmp_init_middle ) {
6193  return;
6194  }
6195  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6196  if ( __kmp_init_middle ) {
6197  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6198  return;
6199  }
6200  __kmp_do_middle_initialize();
6201  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6202 }
6203 
6204 void
6205 __kmp_parallel_initialize( void )
6206 {
6207  int gtid = __kmp_entry_gtid(); // this might be a new root
6208 
6209  /* synchronize parallel initialization (for sibling threads) */
6210  if( TCR_4(__kmp_init_parallel) ) return;
6211  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6212  if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6213 
6214  /* TODO reinitialization after we have already shut down */
6215  if( TCR_4(__kmp_global.g.g_done) ) {
6216  KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6217  __kmp_infinite_loop();
6218  }
6219 
6220  /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
6221  or __kmp_middle_initialize would cause a deadlock. So we call __kmp_do_middle_initialize directly.
6222  */
6223  if( !__kmp_init_middle ) {
6224  __kmp_do_middle_initialize();
6225  }
6226 
6227  /* begin initialization */
6228  KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6229  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6230 
6231 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6232  //
6233  // Save the FP control regs.
6234  // Worker threads will set theirs to these values at thread startup.
6235  //
6236  __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6237  __kmp_store_mxcsr( &__kmp_init_mxcsr );
6238  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6239 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6240 
6241 #if KMP_OS_UNIX
6242 # if KMP_HANDLE_SIGNALS
6243  /* must be after __kmp_serial_initialize */
6244  __kmp_install_signals( TRUE );
6245 # endif
6246 #endif
6247 
6248  __kmp_suspend_initialize();
6249 
6250 # if defined(USE_LOAD_BALANCE)
6251  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6252  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6253  }
6254 #else
6255  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6256  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6257  }
6258 #endif
6259 
6260  if ( __kmp_version ) {
6261  __kmp_print_version_2();
6262  }
6263 
6264  /* we have finished parallel initialization */
6265  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6266 
6267  KMP_MB();
6268  KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6269 
6270  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6271 }
6272 
6273 
6274 /* ------------------------------------------------------------------------ */
6275 
6276 void
6277 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6278  kmp_team_t *team )
6279 {
6280  kmp_disp_t *dispatch;
6281 
6282  KMP_MB();
6283 
6284  /* none of the threads have encountered any constructs, yet. */
6285  this_thr->th.th_local.this_construct = 0;
6286 #if KMP_CACHE_MANAGE
6287  KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6288 #endif /* KMP_CACHE_MANAGE */
6289  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6290  KMP_DEBUG_ASSERT( dispatch );
6291  KMP_DEBUG_ASSERT( team->t.t_dispatch );
6292  //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6293 
6294  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6295 
6296  if( __kmp_env_consistency_check )
6297  __kmp_push_parallel( gtid, team->t.t_ident );
6298 
6299  KMP_MB(); /* Flush all pending memory write invalidates. */
6300 }
6301 
6302 void
6303 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6304  kmp_team_t *team )
6305 {
6306  if( __kmp_env_consistency_check )
6307  __kmp_pop_parallel( gtid, team->t.t_ident );
6308 }
6309 
6310 int
6311 __kmp_invoke_task_func( int gtid )
6312 {
6313  int rc;
6314  int tid = __kmp_tid_from_gtid( gtid );
6315  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6316  kmp_team_t *team = this_thr->th.th_team;
6317 
6318  __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6319 #if USE_ITT_BUILD
6320  if ( __itt_stack_caller_create_ptr ) {
6321  __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6322  }
6323 #endif /* USE_ITT_BUILD */
6324 #if INCLUDE_SSC_MARKS
6325  SSC_MARK_INVOKING();
6326 #endif
6327  rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6328  gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv );
6329 
6330 #if USE_ITT_BUILD
6331  if ( __itt_stack_caller_create_ptr ) {
6332  __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6333  }
6334 #endif /* USE_ITT_BUILD */
6335  __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6336 
6337  return rc;
6338 }
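/* Illustrative sketch (assumption, not taken from this file): the microtask
 * that __kmp_invoke_microtask() dispatches to is the compiler-outlined body of
 * a parallel region.  By convention it receives pointers to the global and
 * bound (team-local) thread ids, followed by the arguments stored in t_argv.
 * A hypothetical outlined body might look like this:
 */
static void example_microtask(int *global_tid, int *bound_tid, double *shared)
{
    /* *global_tid identifies the thread runtime-wide, *bound_tid within the
       team; "shared" stands for one captured shared variable from t_argv */
    shared[*bound_tid] = (double)(*global_tid);
}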
6339 
6340 #if OMP_40_ENABLED
6341 void
6342 __kmp_teams_master( int gtid )
6343 {
6344  // This routine is called by all master threads in teams construct
6345  kmp_info_t *thr = __kmp_threads[ gtid ];
6346  kmp_team_t *team = thr->th.th_team;
6347  ident_t *loc = team->t.t_ident;
6348  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6349  KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6350  KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6351  KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6352  gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
6353  // Launch the league of teams now, but do not let the workers execute
6354  // (they hang on the fork barrier until the next parallel region)
6355 #if INCLUDE_SSC_MARKS
6356  SSC_MARK_FORKING();
6357 #endif
6358  __kmp_fork_call( loc, gtid, fork_context_intel,
6359  team->t.t_argc,
6360  (microtask_t)thr->th.th_teams_microtask,
6361  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6362  NULL );
6363 #if INCLUDE_SSC_MARKS
6364  SSC_MARK_JOINING();
6365 #endif
6366  __kmp_join_call( loc, gtid, 1 ); // AC: last parameter "1" eliminates the join barrier, which won't work because
6367  // the worker threads are at the fork barrier waiting for more parallel regions
6368 }
6369 
6370 int
6371 __kmp_invoke_teams_master( int gtid )
6372 {
6373  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6374  kmp_team_t *team = this_thr->th.th_team;
6375  #if KMP_DEBUG
6376  if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
6377  KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6378  #endif
6379  __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6380  __kmp_teams_master( gtid );
6381  __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6382  return 1;
6383 }
6384 #endif /* OMP_40_ENABLED */
6385 
6386 /* this sets the requested number of threads for the next parallel region
6387  * encountered by this team */
6388 /* since this should be enclosed in the forkjoin critical section it
6389  * should avoid race conditions with asymmetrical nested parallelism */
6390 
6391 void
6392 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6393 {
6394  kmp_info_t *thr = __kmp_threads[gtid];
6395 
6396  if( num_threads > 0 )
6397  thr->th.th_set_nproc = num_threads;
6398 }
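/* Usage sketch: the value stored in th_set_nproc here is the num_threads
 * clause of the next parallel region; the compiler typically reaches this
 * routine through the __kmpc_push_num_threads() wrapper in kmp_csupport.c.
 * A minimal user-level program exercising that path (build with your
 * compiler's OpenMP flag):
 */
#include <omp.h>
#include <stdio.h>

int main(void)
{
    #pragma omp parallel num_threads(4)   /* request 4 threads for this region only */
    {
        #pragma omp single
        printf("team size: %d\n", omp_get_num_threads());
    }
    return 0;
}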
6399 
6400 #if OMP_40_ENABLED
6401 
6402 /* this sets the requested number of teams for the teams region and/or
6403  * the number of threads for the next parallel region encountered */
6404 void
6405 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
6406 {
6407  kmp_info_t *thr = __kmp_threads[gtid];
6408  KMP_DEBUG_ASSERT(num_teams >= 0);
6409  KMP_DEBUG_ASSERT(num_threads >= 0);
6410  if( num_teams == 0 ) {
6411  num_teams = 1; // default number of teams is 1.
6412  }
6413  // Set number of teams (number of threads in the outer "parallel" of the teams)
6414  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
6415 
6416  // Remember the number of threads for inner parallel regions
6417  if( num_threads > 0 ) {
6418  thr->th.th_teams_size.nth = num_threads;
6419  } else {
6420  if( !TCR_4(__kmp_init_middle) )
6421  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
6422  thr->th.th_teams_size.nth = __kmp_avail_proc / num_teams;
6423  }
6424 }
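/* Illustrative sketch of the defaulting rule above (hypothetical helper, not
 * part of the runtime): num_teams of 0 means one team, and when no thread
 * count is given the per-team size is derived from the available processors.
 */
static void example_teams_defaults(int num_teams, int num_threads,
                                   int avail_proc,
                                   int *out_nteams, int *out_nth)
{
    if (num_teams == 0)
        num_teams = 1;                      /* default number of teams          */
    *out_nteams = num_teams;
    if (num_threads > 0)
        *out_nth = num_threads;             /* honor the explicit request       */
    else
        *out_nth = avail_proc / num_teams;  /* split the available procs evenly */
}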
6425 
6426 
6427 //
6428 // Set the proc_bind var to use in the following parallel region.
6429 //
6430 void
6431 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
6432 {
6433  kmp_info_t *thr = __kmp_threads[gtid];
6434  thr->th.th_set_proc_bind = proc_bind;
6435 }
6436 
6437 #endif /* OMP_40_ENABLED */
6438 
6439 /* Launch the worker threads into the microtask. */
6440 
6441 void
6442 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
6443 {
6444  kmp_info_t *this_thr = __kmp_threads[gtid];
6445 
6446 #ifdef KMP_DEBUG
6447  int f;
6448 #endif /* KMP_DEBUG */
6449 
6450  KMP_DEBUG_ASSERT( team );
6451  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
6452  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
6453  KMP_MB(); /* Flush all pending memory write invalidates. */
6454 
6455  team->t.t_construct = 0; /* no single directives seen yet */
6456  team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
6457 
6458  /* Reset the identifiers on the dispatch buffer */
6459  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
6460  if ( team->t.t_max_nproc > 1 ) {
6461  int i;
6462  for (i = 0; i < KMP_MAX_DISP_BUF; ++i)
6463  team->t.t_disp_buffer[ i ].buffer_index = i;
6464  } else {
6465  team->t.t_disp_buffer[ 0 ].buffer_index = 0;
6466  }
6467 
6468  KMP_MB(); /* Flush all pending memory write invalidates. */
6469  KMP_ASSERT( this_thr->th.th_team == team );
6470 
6471 #ifdef KMP_DEBUG
6472  for( f=0 ; f<team->t.t_nproc ; f++ ) {
6473  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
6474  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
6475  }
6476 #endif /* KMP_DEBUG */
6477 
6478  /* release the worker threads so they may begin working */
6479  __kmp_fork_barrier( gtid, 0 );
6480 }
6481 
6482 
6483 void
6484 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
6485 {
6486  kmp_info_t *this_thr = __kmp_threads[gtid];
6487 
6488  KMP_DEBUG_ASSERT( team );
6489  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
6490  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
6491  KMP_MB(); /* Flush all pending memory write invalidates. */
6492 
6493  /* Join barrier after fork */
6494 
6495 #ifdef KMP_DEBUG
6496  if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
6497  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
6498  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
6499  gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
6500  __kmp_print_structure();
6501  }
6502  KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
6503  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
6504 #endif /* KMP_DEBUG */
6505 
6506  __kmp_join_barrier( gtid ); /* wait for everyone */
6507 
6508  KMP_MB(); /* Flush all pending memory write invalidates. */
6509  KMP_ASSERT( this_thr->th.th_team == team );
6510 }
6511 
6512 
6513 /* ------------------------------------------------------------------------ */
6514 /* ------------------------------------------------------------------------ */
6515 
6516 #ifdef USE_LOAD_BALANCE
6517 
6518 //
6519  // Return the number of worker threads actively spinning in the hot team, if we
6520 // are at the outermost level of parallelism. Otherwise, return 0.
6521 //
6522 static int
6523 __kmp_active_hot_team_nproc( kmp_root_t *root )
6524 {
6525  int i;
6526  int retval;
6527  kmp_team_t *hot_team;
6528 
6529  if ( root->r.r_active ) {
6530  return 0;
6531  }
6532  hot_team = root->r.r_hot_team;
6533  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
6534  return hot_team->t.t_nproc - 1; // Don't count master thread
6535  }
6536 
6537  //
6538  // Skip the master thread - it is accounted for elsewhere.
6539  //
6540  retval = 0;
6541  for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
6542  if ( hot_team->t.t_threads[i]->th.th_active ) {
6543  retval++;
6544  }
6545  }
6546  return retval;
6547 }
6548 
6549 //
6550 // Perform an automatic adjustment to the number of
6551 // threads used by the next parallel region.
6552 //
6553 static int
6554 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
6555 {
6556  int retval;
6557  int pool_active;
6558  int hot_team_active;
6559  int team_curr_active;
6560  int system_active;
6561 
6562  KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
6563  root, set_nproc ) );
6564  KMP_DEBUG_ASSERT( root );
6565  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
6566  KMP_DEBUG_ASSERT( set_nproc > 1 );
6567 
6568  if ( set_nproc == 1) {
6569  KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
6570  return 1;
6571  }
6572 
6573  //
6574  // Threads that are active in the thread pool, active in the hot team
6575  // for this particular root (if we are at the outer par level), and
6576  // the currently executing thread (to become the master) are available
6577  // to add to the new team, but are currently contributing to the system
6578  // load, and must be accounted for.
6579  //
6580  pool_active = TCR_4(__kmp_thread_pool_active_nth);
6581  hot_team_active = __kmp_active_hot_team_nproc( root );
6582  team_curr_active = pool_active + hot_team_active + 1;
6583 
6584  //
6585  // Check the system load.
6586  //
6587  system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
6588  KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
6589  system_active, pool_active, hot_team_active ) );
6590 
6591  if ( system_active < 0 ) {
6592  //
6593  // There was an error reading the necessary info from /proc,
6594  // so use the thread limit algorithm instead. Once we set
6595  // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
6596  // we shouldn't wind up getting back here.
6597  //
6598  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6599  KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
6600 
6601  //
6602  // Make this call behave like the thread limit algorithm.
6603  //
6604  retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
6605  : root->r.r_hot_team->t.t_nproc);
6606  if ( retval > set_nproc ) {
6607  retval = set_nproc;
6608  }
6609  if ( retval < KMP_MIN_NTH ) {
6610  retval = KMP_MIN_NTH;
6611  }
6612 
6613  KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
6614  return retval;
6615  }
6616 
6617  //
6618  // There is a slight delay in the load balance algorithm in detecting
6619  // new running procs. The real system load at this instant should be
6620  // at least as large as the number of active OpenMP threads that are available to
6621  // add to the team.
6622  //
6623  if ( system_active < team_curr_active ) {
6624  system_active = team_curr_active;
6625  }
6626  retval = __kmp_avail_proc - system_active + team_curr_active;
6627  if ( retval > set_nproc ) {
6628  retval = set_nproc;
6629  }
6630  if ( retval < KMP_MIN_NTH ) {
6631  retval = KMP_MIN_NTH;
6632  }
6633 
6634  KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
6635  return retval;
6636 } // __kmp_load_balance_nproc()
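/* Illustrative sketch of the arithmetic above (hypothetical helper with
 * hypothetical numbers): with avail_proc = 16, system_active = 10 and
 * team_curr_active = 4, the balanced team size is 16 - 10 + 4 = 10, which is
 * then clamped to [min_nth, set_nproc].
 */
static int example_load_balance(int avail_proc, int system_active,
                                int team_curr_active, int set_nproc, int min_nth)
{
    int retval;
    if (system_active < team_curr_active)        /* /proc may lag behind reality */
        system_active = team_curr_active;
    retval = avail_proc - system_active + team_curr_active;
    if (retval > set_nproc) retval = set_nproc;  /* never exceed the request     */
    if (retval < min_nth)   retval = min_nth;    /* never go below the minimum   */
    return retval;
}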
6637 
6638 #endif /* USE_LOAD_BALANCE */
6639 
6640 
6641 /* ------------------------------------------------------------------------ */
6642 /* ------------------------------------------------------------------------ */
6643 
6644 /* NOTE: this is called with the __kmp_init_lock held */
6645 void
6646 __kmp_cleanup( void )
6647 {
6648  int f;
6649 
6650  KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
6651 
6652  if (TCR_4(__kmp_init_parallel)) {
6653 #if KMP_HANDLE_SIGNALS
6654  __kmp_remove_signals();
6655 #endif
6656  TCW_4(__kmp_init_parallel, FALSE);
6657  }
6658 
6659  if (TCR_4(__kmp_init_middle)) {
6660 #if KMP_AFFINITY_SUPPORTED
6661  __kmp_affinity_uninitialize();
6662 #endif /* KMP_AFFINITY_SUPPORTED */
6663  TCW_4(__kmp_init_middle, FALSE);
6664  }
6665 
6666  KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
6667 
6668  if (__kmp_init_serial) {
6669 
6670  __kmp_runtime_destroy();
6671 
6672  __kmp_init_serial = FALSE;
6673  }
6674 
6675  for ( f = 0; f < __kmp_threads_capacity; f++ ) {
6676  if ( __kmp_root[ f ] != NULL ) {
6677  __kmp_free( __kmp_root[ f ] );
6678  __kmp_root[ f ] = NULL;
6679  }
6680  }
6681  __kmp_free( __kmp_threads );
6682  // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is no need
6683  // to free __kmp_root separately.
6684  __kmp_threads = NULL;
6685  __kmp_root = NULL;
6686  __kmp_threads_capacity = 0;
6687 
6688  __kmp_cleanup_user_locks();
6689 
6690  #if KMP_AFFINITY_SUPPORTED
6691  KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
6692  __kmp_cpuinfo_file = NULL;
6693  #endif /* KMP_AFFINITY_SUPPORTED */
6694 
6695  #if KMP_USE_ADAPTIVE_LOCKS
6696  #if KMP_DEBUG_ADAPTIVE_LOCKS
6697  __kmp_print_speculative_stats();
6698  #endif
6699  #endif
6700  KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
6701  __kmp_nested_nth.nth = NULL;
6702  __kmp_nested_nth.size = 0;
6703  __kmp_nested_nth.used = 0;
6704 
6705  __kmp_i18n_catclose();
6706 
6707 #if KMP_STATS_ENABLED
6708  __kmp_accumulate_stats_at_exit();
6709  __kmp_stats_list.deallocate();
6710 #endif
6711 
6712  KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
6713 }
6714 
6715 /* ------------------------------------------------------------------------ */
6716 /* ------------------------------------------------------------------------ */
6717 
6718 int
6719 __kmp_ignore_mppbeg( void )
6720 {
6721  char *env;
6722 
6723  if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
6724  if (__kmp_str_match_false( env ))
6725  return FALSE;
6726  }
6727  // By default __kmpc_begin() is no-op.
6728  return TRUE;
6729 }
6730 
6731 int
6732 __kmp_ignore_mppend( void )
6733 {
6734  char *env;
6735 
6736  if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
6737  if (__kmp_str_match_false( env ))
6738  return FALSE;
6739  }
6740  // By default __kmpc_end() is no-op.
6741  return TRUE;
6742 }
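/* Illustrative sketch of the policy above (simplified, hypothetical helper --
 * the real check uses __kmp_str_match_false(), which accepts several spellings
 * of "false"): an unset variable, or anything that is not an explicit false
 * value, leaves __kmpc_begin()/__kmpc_end() as no-ops.
 */
#include <stdlib.h>
#include <string.h>

static int example_ignore_marker(const char *env_name)
{
    const char *v = getenv(env_name);
    if (v != NULL && (strcmp(v, "0") == 0 || strcmp(v, "false") == 0))
        return 0;   /* explicit "false": honor the begin/end calls */
    return 1;       /* default: ignore them                        */
}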
6743 
6744 void
6745 __kmp_internal_begin( void )
6746 {
6747  int gtid;
6748  kmp_root_t *root;
6749 
6750  /* this is a very important step as it will register new sibling threads
6751  * and assign these new uber threads a new gtid */
6752  gtid = __kmp_entry_gtid();
6753  root = __kmp_threads[ gtid ]->th.th_root;
6754  KMP_ASSERT( KMP_UBER_GTID( gtid ));
6755 
6756  if( root->r.r_begin ) return;
6757  __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
6758  if( root->r.r_begin ) {
6759  __kmp_release_lock( & root->r.r_begin_lock, gtid );
6760  return;
6761  }
6762 
6763  root->r.r_begin = TRUE;
6764 
6765  __kmp_release_lock( & root->r.r_begin_lock, gtid );
6766 }
6767 
6768 
6769 /* ------------------------------------------------------------------------ */
6770 /* ------------------------------------------------------------------------ */
6771 
6772 void
6773 __kmp_user_set_library (enum library_type arg)
6774 {
6775  int gtid;
6776  kmp_root_t *root;
6777  kmp_info_t *thread;
6778 
6779  /* first, make sure we are initialized so we can get our gtid */
6780 
6781  gtid = __kmp_entry_gtid();
6782  thread = __kmp_threads[ gtid ];
6783 
6784  root = thread->th.th_root;
6785 
6786  KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
6787  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
6788  KMP_WARNING( SetLibraryIncorrectCall );
6789  return;
6790  }
6791 
6792  switch ( arg ) {
6793  case library_serial :
6794  thread->th.th_set_nproc = 0;
6795  set__nproc( thread, 1 );
6796  break;
6797  case library_turnaround :
6798  thread->th.th_set_nproc = 0;
6799  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
6800  break;
6801  case library_throughput :
6802  thread->th.th_set_nproc = 0;
6803  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
6804  break;
6805  default:
6806  KMP_FATAL( UnknownLibraryType, arg );
6807  }
6808 
6809  __kmp_aux_set_library ( arg );
6810 }
6811 
6812 void
6813 __kmp_aux_set_stacksize( size_t arg )
6814 {
6815  if (! __kmp_init_serial)
6816  __kmp_serial_initialize();
6817 
6818 #if KMP_OS_DARWIN
6819  if (arg & (0x1000 - 1)) {
6820  arg &= ~(0x1000 - 1);
6821  if(arg + 0x1000) /* check for overflow if we round up */
6822  arg += 0x1000;
6823  }
6824 #endif
6825  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6826 
6827  /* only change the default stacksize before the first parallel region */
6828  if (! TCR_4(__kmp_init_parallel)) {
6829  size_t value = arg; /* argument is in bytes */
6830 
6831  if (value < __kmp_sys_min_stksize )
6832  value = __kmp_sys_min_stksize ;
6833  else if (value > KMP_MAX_STKSIZE)
6834  value = KMP_MAX_STKSIZE;
6835 
6836  __kmp_stksize = value;
6837 
6838  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
6839  }
6840 
6841  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6842 }
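/* Illustrative sketch of the KMP_OS_DARWIN rounding above (hypothetical
 * helper): a requested stack size is rounded up to the next 4 KiB (0x1000)
 * boundary before being clamped to [__kmp_sys_min_stksize, KMP_MAX_STKSIZE].
 */
static size_t example_round_up_to_page(size_t bytes)
{
    const size_t page = 0x1000;                /* 4 KiB page size assumed */
    if (bytes & (page - 1))                    /* not already aligned?    */
        bytes = (bytes & ~(page - 1)) + page;  /* round up                */
    return bytes;                              /* e.g. 5000 -> 8192       */
}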
6843 
6844 /* set the behaviour of the runtime library */
6845 /* TODO this can cause some odd behaviour with sibling parallelism... */
6846 void
6847 __kmp_aux_set_library (enum library_type arg)
6848 {
6849  __kmp_library = arg;
6850 
6851  switch ( __kmp_library ) {
6852  case library_serial :
6853  {
6854  KMP_INFORM( LibraryIsSerial );
6855  (void) __kmp_change_library( TRUE );
6856  }
6857  break;
6858  case library_turnaround :
6859  (void) __kmp_change_library( TRUE );
6860  break;
6861  case library_throughput :
6862  (void) __kmp_change_library( FALSE );
6863  break;
6864  default:
6865  KMP_FATAL( UnknownLibraryType, arg );
6866  }
6867 }
6868 
6869 /* ------------------------------------------------------------------------ */
6870 /* ------------------------------------------------------------------------ */
6871 
6872 void
6873 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
6874 {
6875  int blocktime = arg; /* argument is in milliseconds */
6876  int bt_intervals;
6877  int bt_set;
6878 
6879  __kmp_save_internal_controls( thread );
6880 
6881  /* Normalize and set blocktime for the teams */
6882  if (blocktime < KMP_MIN_BLOCKTIME)
6883  blocktime = KMP_MIN_BLOCKTIME;
6884  else if (blocktime > KMP_MAX_BLOCKTIME)
6885  blocktime = KMP_MAX_BLOCKTIME;
6886 
6887  set__blocktime_team( thread->th.th_team, tid, blocktime );
6888  set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
6889 
6890  /* Calculate and set blocktime intervals for the teams */
6891  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
6892 
6893  set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
6894  set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
6895 
6896  /* Record that the blocktime was explicitly set */
6897  bt_set = TRUE;
6898 
6899  set__bt_set_team( thread->th.th_team, tid, bt_set );
6900  set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
6901  KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n",
6902  __kmp_gtid_from_tid(tid, thread->th.th_team),
6903  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) );
6904 }
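/* Illustrative sketch (an assumption -- the real KMP_INTERVALS_FROM_BLOCKTIME
 * macro lives in kmp.h): the blocktime in milliseconds is converted into a
 * whole number of monitor wake-up intervals, essentially a ceiling division by
 * the monitor period.  Hypothetical helper:
 */
static int example_intervals_from_blocktime(int blocktime_ms, int wakeups_per_sec)
{
    int period_ms = 1000 / wakeups_per_sec;             /* one monitor interval, in ms       */
    return (blocktime_ms + period_ms - 1) / period_ms;  /* e.g. 200 ms at 10 wake-ups/s -> 2 */
}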
6905 
6906 void
6907 __kmp_aux_set_defaults(
6908  char const * str,
6909  int len
6910 ) {
6911  if ( ! __kmp_init_serial ) {
6912  __kmp_serial_initialize();
6913  };
6914  __kmp_env_initialize( str );
6915 
6916  if (__kmp_settings
6917 #if OMP_40_ENABLED
6918  || __kmp_display_env || __kmp_display_env_verbose
6919 #endif // OMP_40_ENABLED
6920  ) {
6921  __kmp_env_print();
6922  }
6923 } // __kmp_aux_set_defaults
6924 
6925 /* ------------------------------------------------------------------------ */
6926 
6927 /*
6928  * internal fast reduction routines
6929  */
6930 
6931 PACKED_REDUCTION_METHOD_T
6932 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
6933  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
6934  kmp_critical_name *lck )
6935 {
6936 
6937  // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
6938  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
6939  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
6940  // Finally, it's up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
6941 
6942  PACKED_REDUCTION_METHOD_T retval;
6943 
6944  int team_size;
6945 
6946  KMP_DEBUG_ASSERT( loc ); // it would be nice to test ( loc != 0 )
6947  KMP_DEBUG_ASSERT( lck ); // it would be nice to test ( lck != 0 )
6948 
6949  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
6950  #define FAST_REDUCTION_TREE_METHOD_GENERATED ( ( reduce_data ) && ( reduce_func ) )
6951 
6952  retval = critical_reduce_block;
6953 
6954  team_size = __kmp_get_team_num_threads( global_tid ); // another choice of getting a team size ( with 1 dynamic dereference ) is slower
6955 
6956  if( team_size == 1 ) {
6957 
6958  retval = empty_reduce_block;
6959 
6960  } else {
6961 
6962  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
6963  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
6964 
6965  #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64
6966 
6967  #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
6968  #if KMP_MIC
6969  #define REDUCTION_TEAMSIZE_CUTOFF 8
6970  #else // KMP_MIC
6971  #define REDUCTION_TEAMSIZE_CUTOFF 4
6972  #endif // KMP_MIC
6973  if( tree_available ) {
6974  if( team_size <= REDUCTION_TEAMSIZE_CUTOFF ) {
6975  if ( atomic_available ) {
6976  retval = atomic_reduce_block;
6977  }
6978  } else {
6979  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
6980  }
6981  } else if ( atomic_available ) {
6982  retval = atomic_reduce_block;
6983  }
6984  #else
6985  #error "Unknown or unsupported OS"
6986  #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
6987 
6988  #elif KMP_ARCH_X86 || KMP_ARCH_ARM
6989 
6990  #if KMP_OS_LINUX || KMP_OS_WINDOWS
6991 
6992  // basic tuning
6993 
6994  if( atomic_available ) {
6995  if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
6996  retval = atomic_reduce_block;
6997  }
6998  } // otherwise: use critical section
6999 
7000  #elif KMP_OS_DARWIN
7001 
7002  if( atomic_available && ( num_vars <= 3 ) ) {
7003  retval = atomic_reduce_block;
7004  } else if( tree_available ) {
7005  if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7006  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7007  }
7008  } // otherwise: use critical section
7009 
7010  #else
7011  #error "Unknown or unsupported OS"
7012  #endif
7013 
7014  #else
7015  #error "Unknown or unsupported architecture"
7016  #endif
7017 
7018  }
7019 
7020  // KMP_FORCE_REDUCTION
7021 
7022  if( __kmp_force_reduction_method != reduction_method_not_defined ) {
7023 
7024  PACKED_REDUCTION_METHOD_T forced_retval;
7025 
7026  int atomic_available, tree_available;
7027 
7028  switch( ( forced_retval = __kmp_force_reduction_method ) )
7029  {
7030  case critical_reduce_block:
7031  KMP_ASSERT( lck ); // lck should be != 0
7032  if( team_size <= 1 ) {
7033  forced_retval = empty_reduce_block;
7034  }
7035  break;
7036 
7037  case atomic_reduce_block:
7038  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7039  KMP_ASSERT( atomic_available ); // atomic_available should be != 0
7040  break;
7041 
7042  case tree_reduce_block:
7043  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7044  KMP_ASSERT( tree_available ); // tree_available should be != 0
7045  #if KMP_FAST_REDUCTION_BARRIER
7046  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7047  #endif
7048  break;
7049 
7050  default:
7051  KMP_ASSERT( 0 ); // "unsupported method specified"
7052  }
7053 
7054  retval = forced_retval;
7055  }
7056 
7057  KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7058 
7059  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7060  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7061 
7062  return ( retval );
7063 }
7064 
7065 // this function is for testing set/get/determine reduce method
7066 kmp_int32
7067 __kmp_get_reduce_method( void ) {
7068  return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7069 }
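/* Illustrative sketch (assumed encoding): a PACKED_REDUCTION_METHOD_T keeps
 * the reduction method in the bits above bit 8 -- which is what the ">> 8"
 * above extracts -- while the low byte carries the barrier variant used by the
 * tree methods.  Hypothetical helpers:
 */
static int example_unpack_reduction_method (int packed) { return packed >> 8;   }
static int example_unpack_reduction_barrier(int packed) { return packed & 0xff; }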
7070 
7071 /* ------------------------------------------------------------------------ */