LLVM OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_itt.h"
19 #include "kmp_os.h"
20 #include "kmp_stats.h"
21 
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #include "tsan_annotations.h"
29 
30 #if KMP_MIC && USE_NGO_STORES
31 // ICV copying
32 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
33 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
35 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
36 #else
37 #define ngo_load(src) ((void)0)
38 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
39 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
40 #define ngo_sync() ((void)0)
41 #endif /* KMP_MIC && USE_NGO_STORES */
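// Minimal usage sketch of the ngo_* macros above (illustrative only; the real
// pattern appears in __kmp_linear_barrier_release and the hierarchical release
// below): load the master's ICV block once, stream it to each worker with
// non-globally-ordered stores, then issue a single fence before any dependent
// ordinary store.
#if 0
  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);  // Vt <- master ICVs
  for (int i = 1; i < nproc; ++i)  // broadcast the master's ICVs to each worker
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                   &team->t.t_implicit_task_taskdata[0].td_icvs);
  ngo_sync();  // order the preceding NGO stores
#endif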
42 
43 void __kmp_print_structure(void); // Forward declaration
44 
45 // ---------------------------- Barrier Algorithms ----------------------------
46 
47 // Linear Barrier
48 static void __kmp_linear_barrier_gather(
49  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
51  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52  kmp_team_t *team = this_thr->th.th_team;
53  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54  kmp_info_t **other_threads = team->t.t_threads;
55 
56  KA_TRACE(
57  20,
58  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59  gtid, team->t.t_id, tid, bt));
60  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
61 
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63  // Barrier imbalance - save arrive time to the thread
64  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66  __itt_get_timestamp();
67  }
68 #endif
69  // We now perform a linear reduction to signal that all of the threads have
70  // arrived.
71  if (!KMP_MASTER_TID(tid)) {
72  KA_TRACE(20,
73  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
74  "arrived(%p): %llu => %llu\n",
75  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
78  // Mark arrival to master thread
79  /* After performing this write, a worker thread may not assume that the team
80  is valid any more - it could be deallocated by the master thread at any
81  time. */
82  ANNOTATE_BARRIER_BEGIN(this_thr);
83  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
84  flag.release();
85  } else {
86  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87  int nproc = this_thr->th.th_team_nproc;
88  int i;
89  // No need for the sleep bit or atomics here; this is a team-level setting
90  kmp_uint64 new_state =
91  team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
92 
93  // Collect all the worker team member threads.
94  for (i = 1; i < nproc; ++i) {
95 #if KMP_CACHE_MANAGE
96  // Prefetch next thread's arrived count
97  if (i + 1 < nproc)
98  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
99 #endif /* KMP_CACHE_MANAGE */
100  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
101  "arrived(%p) == %llu\n",
102  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
103  team->t.t_id, i,
104  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
105 
106  // Wait for worker thread to arrive
107  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
108  new_state);
109  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
110  ANNOTATE_BARRIER_END(other_threads[i]);
111 #if USE_ITT_BUILD && USE_ITT_NOTIFY
112  // Barrier imbalance - write min of the thread time and the other thread
113  // time to the thread.
114  if (__kmp_forkjoin_frames_mode == 2) {
115  this_thr->th.th_bar_min_time = KMP_MIN(
116  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
117  }
118 #endif
119  if (reduce) {
120  KA_TRACE(100,
121  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
122  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
123  team->t.t_id, i));
124  ANNOTATE_REDUCE_AFTER(reduce);
125  (*reduce)(this_thr->th.th_local.reduce_data,
126  other_threads[i]->th.th_local.reduce_data);
127  ANNOTATE_REDUCE_BEFORE(reduce);
128  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
129  }
130  }
131  // No need for the sleep bit or atomics here; this is a team-level setting
132  team_bar->b_arrived = new_state;
133  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
134  "arrived(%p) = %llu\n",
135  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
136  new_state));
137  }
138  KA_TRACE(
139  20,
140  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
141  gtid, team->t.t_id, tid, bt));
142 }
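// Every algorithm in this file uses the same 64-bit flag protocol: b_arrived
// and b_go advance by KMP_BARRIER_STATE_BUMP per barrier instance, a waiter
// spins (or sleeps) until the counter reaches an expected value, and a
// releaser bumps the counter and wakes the registered waiter. A condensed
// sketch of the two roles (illustrative only; some_bar, expected_value and
// waiting_thread are placeholders, and reduction/ITT handling is omitted):
#if 0
  // Waiter side: block until the flag location reaches the expected value.
  kmp_flag_64 wait_flag(&some_bar->b_arrived, expected_value);
  wait_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));

  // Releaser side: bump the flag and wake the waiter if it went to sleep.
  kmp_flag_64 go_flag(&some_bar->b_arrived, waiting_thread);
  go_flag.release();
#endif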
143 
144 static void __kmp_linear_barrier_release(
145  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
146  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
147  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
148  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
149  kmp_team_t *team;
150 
151  if (KMP_MASTER_TID(tid)) {
152  unsigned int i;
153  kmp_uint32 nproc = this_thr->th.th_team_nproc;
154  kmp_info_t **other_threads;
155 
156  team = __kmp_threads[gtid]->th.th_team;
157  KMP_DEBUG_ASSERT(team != NULL);
158  other_threads = team->t.t_threads;
159 
160  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
161  "barrier type %d\n",
162  gtid, team->t.t_id, tid, bt));
163 
164  if (nproc > 1) {
165 #if KMP_BARRIER_ICV_PUSH
166  {
167  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
168  if (propagate_icvs) {
169  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
170  for (i = 1; i < nproc; ++i) {
171  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
172  team, i, FALSE);
173  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
174  &team->t.t_implicit_task_taskdata[0].td_icvs);
175  }
176  ngo_sync();
177  }
178  }
179 #endif // KMP_BARRIER_ICV_PUSH
180 
181  // Now, release all of the worker threads
182  for (i = 1; i < nproc; ++i) {
183 #if KMP_CACHE_MANAGE
184  // Prefetch next thread's go flag
185  if (i + 1 < nproc)
186  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
187 #endif /* KMP_CACHE_MANAGE */
188  KA_TRACE(
189  20,
190  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
191  "go(%p): %u => %u\n",
192  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
193  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
194  other_threads[i]->th.th_bar[bt].bb.b_go,
195  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
196  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
197  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
198  other_threads[i]);
199  flag.release();
200  }
201  }
202  } else { // Wait for the MASTER thread to release us
203  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
204  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
205  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
206  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
207  ANNOTATE_BARRIER_END(this_thr);
208 #if USE_ITT_BUILD && USE_ITT_NOTIFY
209  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
210  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
211  // disabled)
212  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
213  // Cancel wait on previous parallel region...
214  __kmp_itt_task_starting(itt_sync_obj);
215 
216  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
217  return;
218 
219  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
220  if (itt_sync_obj != NULL)
221  // Call prepare as early as possible for "new" barrier
222  __kmp_itt_task_finished(itt_sync_obj);
223  } else
224 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
225  // Early exit for reaping threads releasing forkjoin barrier
226  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
227  return;
228 // The worker thread may now assume that the team is valid.
229 #ifdef KMP_DEBUG
230  tid = __kmp_tid_from_gtid(gtid);
231  team = __kmp_threads[gtid]->th.th_team;
232 #endif
233  KMP_DEBUG_ASSERT(team != NULL);
234  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
235  KA_TRACE(20,
236  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
237  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
238  KMP_MB(); // Flush all pending memory write invalidates.
239  }
240  KA_TRACE(
241  20,
242  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
243  gtid, team->t.t_id, tid, bt));
244 }
245 
246 // Tree barrier
247 static void
248 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
249  int tid, void (*reduce)(void *, void *)
250  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
251  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
252  kmp_team_t *team = this_thr->th.th_team;
253  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
254  kmp_info_t **other_threads = team->t.t_threads;
255  kmp_uint32 nproc = this_thr->th.th_team_nproc;
256  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
257  kmp_uint32 branch_factor = 1 << branch_bits;
258  kmp_uint32 child;
259  kmp_uint32 child_tid;
260  kmp_uint64 new_state;
261 
262  KA_TRACE(
263  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
264  gtid, team->t.t_id, tid, bt));
265  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
266 
267 #if USE_ITT_BUILD && USE_ITT_NOTIFY
268  // Barrier imbalance - save arrive time to the thread
269  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
270  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
271  __itt_get_timestamp();
272  }
273 #endif
274  // Perform tree gather to wait until all threads have arrived; reduce any
275  // required data as we go
276  child_tid = (tid << branch_bits) + 1;
277  if (child_tid < nproc) {
278  // Parent threads wait for all their children to arrive
279  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
280  child = 1;
281  do {
282  kmp_info_t *child_thr = other_threads[child_tid];
283  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
284 #if KMP_CACHE_MANAGE
285  // Prefetch next thread's arrived count
286  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
287  KMP_CACHE_PREFETCH(
288  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
289 #endif /* KMP_CACHE_MANAGE */
290  KA_TRACE(20,
291  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
292  "arrived(%p) == %llu\n",
293  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
294  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
295  // Wait for child to arrive
296  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
297  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
298  ANNOTATE_BARRIER_END(child_thr);
299 #if USE_ITT_BUILD && USE_ITT_NOTIFY
300  // Barrier imbalance - write min of the thread time and a child time to
301  // the thread.
302  if (__kmp_forkjoin_frames_mode == 2) {
303  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
304  child_thr->th.th_bar_min_time);
305  }
306 #endif
307  if (reduce) {
308  KA_TRACE(100,
309  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
310  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
311  team->t.t_id, child_tid));
312  ANNOTATE_REDUCE_AFTER(reduce);
313  (*reduce)(this_thr->th.th_local.reduce_data,
314  child_thr->th.th_local.reduce_data);
315  ANNOTATE_REDUCE_BEFORE(reduce);
316  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
317  }
318  child++;
319  child_tid++;
320  } while (child <= branch_factor && child_tid < nproc);
321  }
322 
323  if (!KMP_MASTER_TID(tid)) { // Worker threads
324  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
325 
326  KA_TRACE(20,
327  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
328  "arrived(%p): %llu => %llu\n",
329  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
330  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
331  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
332 
333  // Mark arrival to parent thread
334  /* After performing this write, a worker thread may not assume that the team
335  is valid any more - it could be deallocated by the master thread at any
336  time. */
337  ANNOTATE_BARRIER_BEGIN(this_thr);
338  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
339  flag.release();
340  } else {
341  // The master thread needs to update the team's b_arrived counter
342  if (nproc > 1) // New value was already computed above
343  team->t.t_bar[bt].b_arrived = new_state;
344  else
345  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
346  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
347  "arrived(%p) = %llu\n",
348  gtid, team->t.t_id, tid, team->t.t_id,
349  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
350  }
351  KA_TRACE(20,
352  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
353  gtid, team->t.t_id, tid, bt));
354 }
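// Index arithmetic used by the tree gather above and the tree release below
// (a sketch with hypothetical helper names; they simply restate the
// expressions already used in the code):
#if 0
static int tree_parent(int tid, int branch_bits) {
  return (tid - 1) >> branch_bits; // see parent_tid above
}
static int tree_first_child(int tid, int branch_bits) {
  return (tid << branch_bits) + 1; // see child_tid above
}
// Worked example: branch_bits == 2 (branch_factor == 4), nproc == 10:
//   tid 0 gathers children 1..4, tid 1 gathers 5..8, tid 2 gathers 9;
//   tree_parent(7, 2) == 1, tree_parent(9, 2) == 2.
#endif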
355 
356 static void __kmp_tree_barrier_release(
357  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
358  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
359  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
360  kmp_team_t *team;
361  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
362  kmp_uint32 nproc;
363  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
364  kmp_uint32 branch_factor = 1 << branch_bits;
365  kmp_uint32 child;
366  kmp_uint32 child_tid;
367 
368  // Perform a tree release for all of the threads that have been gathered
369  if (!KMP_MASTER_TID(
370  tid)) { // Handle fork barrier workers who aren't part of a team yet
371  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
372  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
373  // Wait for parent thread to release us
374  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
375  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
376  ANNOTATE_BARRIER_END(this_thr);
377 #if USE_ITT_BUILD && USE_ITT_NOTIFY
378  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
379  // In fork barrier where we could not get the object reliably (or
380  // ITTNOTIFY is disabled)
381  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
382  // Cancel wait on previous parallel region...
383  __kmp_itt_task_starting(itt_sync_obj);
384 
385  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
386  return;
387 
388  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
389  if (itt_sync_obj != NULL)
390  // Call prepare as early as possible for "new" barrier
391  __kmp_itt_task_finished(itt_sync_obj);
392  } else
393 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
394  // Early exit for reaping threads releasing forkjoin barrier
395  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
396  return;
397 
398  // The worker thread may now assume that the team is valid.
399  team = __kmp_threads[gtid]->th.th_team;
400  KMP_DEBUG_ASSERT(team != NULL);
401  tid = __kmp_tid_from_gtid(gtid);
402 
403  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
404  KA_TRACE(20,
405  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
406  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
407  KMP_MB(); // Flush all pending memory write invalidates.
408  } else {
409  team = __kmp_threads[gtid]->th.th_team;
410  KMP_DEBUG_ASSERT(team != NULL);
411  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
412  "barrier type %d\n",
413  gtid, team->t.t_id, tid, bt));
414  }
415  nproc = this_thr->th.th_team_nproc;
416  child_tid = (tid << branch_bits) + 1;
417 
418  if (child_tid < nproc) {
419  kmp_info_t **other_threads = team->t.t_threads;
420  child = 1;
421  // Parent threads release all their children
422  do {
423  kmp_info_t *child_thr = other_threads[child_tid];
424  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
425 #if KMP_CACHE_MANAGE
426  // Prefetch next thread's go count
427  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
428  KMP_CACHE_PREFETCH(
429  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
430 #endif /* KMP_CACHE_MANAGE */
431 
432 #if KMP_BARRIER_ICV_PUSH
433  {
434  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
435  if (propagate_icvs) {
436  __kmp_init_implicit_task(team->t.t_ident,
437  team->t.t_threads[child_tid], team,
438  child_tid, FALSE);
439  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
440  &team->t.t_implicit_task_taskdata[0].td_icvs);
441  }
442  }
443 #endif // KMP_BARRIER_ICV_PUSH
444  KA_TRACE(20,
445  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
446  "go(%p): %u => %u\n",
447  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
448  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
449  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
450  // Release child from barrier
451  ANNOTATE_BARRIER_BEGIN(child_thr);
452  kmp_flag_64 flag(&child_bar->b_go, child_thr);
453  flag.release();
454  child++;
455  child_tid++;
456  } while (child <= branch_factor && child_tid < nproc);
457  }
458  KA_TRACE(
459  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
460  gtid, team->t.t_id, tid, bt));
461 }
462 
463 // Hyper Barrier
464 static void
465 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
466  int tid, void (*reduce)(void *, void *)
467  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
468  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
469  kmp_team_t *team = this_thr->th.th_team;
470  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
471  kmp_info_t **other_threads = team->t.t_threads;
472  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
473  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
474  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
475  kmp_uint32 branch_factor = 1 << branch_bits;
476  kmp_uint32 offset;
477  kmp_uint32 level;
478 
479  KA_TRACE(
480  20,
481  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
482  gtid, team->t.t_id, tid, bt));
483  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
484 
485 #if USE_ITT_BUILD && USE_ITT_NOTIFY
486  // Barrier imbalance - save arrive time to the thread
487  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
488  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
489  __itt_get_timestamp();
490  }
491 #endif
492  /* Perform a hypercube-embedded tree gather to wait until all of the threads
493  have arrived, and reduce any required data as we go. */
494  kmp_flag_64 p_flag(&thr_bar->b_arrived);
495  for (level = 0, offset = 1; offset < num_threads;
496  level += branch_bits, offset <<= branch_bits) {
497  kmp_uint32 child;
498  kmp_uint32 child_tid;
499 
500  if (((tid >> level) & (branch_factor - 1)) != 0) {
501  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
502 
503  KA_TRACE(20,
504  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
505  "arrived(%p): %llu => %llu\n",
506  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
507  team->t.t_id, parent_tid, &thr_bar->b_arrived,
508  thr_bar->b_arrived,
509  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
510  // Mark arrival to parent thread
511  /* After performing this write (in the last iteration of the enclosing for
512  loop), a worker thread may not assume that the team is valid any more
513  - it could be deallocated by the master thread at any time. */
514  ANNOTATE_BARRIER_BEGIN(this_thr);
515  p_flag.set_waiter(other_threads[parent_tid]);
516  p_flag.release();
517  break;
518  }
519 
520  // Parent threads wait for children to arrive
521  if (new_state == KMP_BARRIER_UNUSED_STATE)
522  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
523  for (child = 1, child_tid = tid + (1 << level);
524  child < branch_factor && child_tid < num_threads;
525  child++, child_tid += (1 << level)) {
526  kmp_info_t *child_thr = other_threads[child_tid];
527  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
528 #if KMP_CACHE_MANAGE
529  kmp_uint32 next_child_tid = child_tid + (1 << level);
530  // Prefetch next thread's arrived count
531  if (child + 1 < branch_factor && next_child_tid < num_threads)
532  KMP_CACHE_PREFETCH(
533  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
534 #endif /* KMP_CACHE_MANAGE */
535  KA_TRACE(20,
536  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
537  "arrived(%p) == %llu\n",
538  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
539  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
540  // Wait for child to arrive
541  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
542  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
543  ANNOTATE_BARRIER_END(child_thr);
544 #if USE_ITT_BUILD && USE_ITT_NOTIFY
545  // Barrier imbalance - write min of the thread time and a child time to
546  // the thread.
547  if (__kmp_forkjoin_frames_mode == 2) {
548  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
549  child_thr->th.th_bar_min_time);
550  }
551 #endif
552  if (reduce) {
553  KA_TRACE(100,
554  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
555  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
556  team->t.t_id, child_tid));
557  ANNOTATE_REDUCE_AFTER(reduce);
558  (*reduce)(this_thr->th.th_local.reduce_data,
559  child_thr->th.th_local.reduce_data);
560  ANNOTATE_REDUCE_BEFORE(reduce);
561  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
562  }
563  }
564  }
565 
566  if (KMP_MASTER_TID(tid)) {
567  // The master thread needs to update the team's b_arrived counter
568  if (new_state == KMP_BARRIER_UNUSED_STATE)
569  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
570  else
571  team->t.t_bar[bt].b_arrived = new_state;
572  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
573  "arrived(%p) = %llu\n",
574  gtid, team->t.t_id, tid, team->t.t_id,
575  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
576  }
577  KA_TRACE(
578  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
579  gtid, team->t.t_id, tid, bt));
580 }
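// A worked example of the hypercube pairing above (illustrative only), with
// branch_bits == 1 (branch_factor == 2) and num_threads == 8:
//   level 0: tids 1, 3, 5, 7 post b_arrived to parents 0, 2, 4, 6 (tid & ~0x1)
//   level 1: tids 2, 6 post to parents 0, 4                       (tid & ~0x3)
//   level 2: tid 4 posts to parent 0                              (tid & ~0x7)
// A thread with a non-zero digit in (tid >> level) signals its parent and
// drops out of the loop; the remaining threads wait for up to branch_factor-1
// children at tid + (1 << level), tid + 2 * (1 << level), ...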
581 
582 // The reverse versions seem to beat the forward versions overall
583 #define KMP_REVERSE_HYPER_BAR
584 static void __kmp_hyper_barrier_release(
585  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
586  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
587  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
588  kmp_team_t *team;
589  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
590  kmp_info_t **other_threads;
591  kmp_uint32 num_threads;
592  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
593  kmp_uint32 branch_factor = 1 << branch_bits;
594  kmp_uint32 child;
595  kmp_uint32 child_tid;
596  kmp_uint32 offset;
597  kmp_uint32 level;
598 
599  /* Perform a hypercube-embedded tree release for all of the threads that have
600  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
601  are released in the reverse order of the corresponding gather, otherwise
602  threads are released in the same order. */
603  if (KMP_MASTER_TID(tid)) { // master
604  team = __kmp_threads[gtid]->th.th_team;
605  KMP_DEBUG_ASSERT(team != NULL);
606  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
607  "barrier type %d\n",
608  gtid, team->t.t_id, tid, bt));
609 #if KMP_BARRIER_ICV_PUSH
610  if (propagate_icvs) { // master already has ICVs in final destination; copy
611  copy_icvs(&thr_bar->th_fixed_icvs,
612  &team->t.t_implicit_task_taskdata[tid].td_icvs);
613  }
614 #endif
615  } else { // Handle fork barrier workers who aren't part of a team yet
616  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
617  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
618  // Wait for parent thread to release us
619  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
620  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
621  ANNOTATE_BARRIER_END(this_thr);
622 #if USE_ITT_BUILD && USE_ITT_NOTIFY
623  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
624  // In fork barrier where we could not get the object reliably
625  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
626  // Cancel wait on previous parallel region...
627  __kmp_itt_task_starting(itt_sync_obj);
628 
629  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
630  return;
631 
632  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
633  if (itt_sync_obj != NULL)
634  // Call prepare as early as possible for "new" barrier
635  __kmp_itt_task_finished(itt_sync_obj);
636  } else
637 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
638  // Early exit for reaping threads releasing forkjoin barrier
639  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
640  return;
641 
642  // The worker thread may now assume that the team is valid.
643  team = __kmp_threads[gtid]->th.th_team;
644  KMP_DEBUG_ASSERT(team != NULL);
645  tid = __kmp_tid_from_gtid(gtid);
646 
647  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
648  KA_TRACE(20,
649  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
650  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
651  KMP_MB(); // Flush all pending memory write invalidates.
652  }
653  num_threads = this_thr->th.th_team_nproc;
654  other_threads = team->t.t_threads;
655 
656 #ifdef KMP_REVERSE_HYPER_BAR
657  // Count up to correct level for parent
658  for (level = 0, offset = 1;
659  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
660  level += branch_bits, offset <<= branch_bits)
661  ;
662 
663  // Now go down from there
664  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
665  level -= branch_bits, offset >>= branch_bits)
666 #else
667  // Go down the tree, level by level
668  for (level = 0, offset = 1; offset < num_threads;
669  level += branch_bits, offset <<= branch_bits)
670 #endif // KMP_REVERSE_HYPER_BAR
671  {
672 #ifdef KMP_REVERSE_HYPER_BAR
673  /* Now go in reverse order through the children, highest to lowest.
674  Initial setting of child is conservative here. */
675  child = num_threads >> ((level == 0) ? level : level - 1);
676  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
677  child_tid = tid + (child << level);
678  child >= 1; child--, child_tid -= (1 << level))
679 #else
680  if (((tid >> level) & (branch_factor - 1)) != 0)
681  // No need to go lower than this, since this is the level parent would be
682  // notified
683  break;
684  // Iterate through children on this level of the tree
685  for (child = 1, child_tid = tid + (1 << level);
686  child < branch_factor && child_tid < num_threads;
687  child++, child_tid += (1 << level))
688 #endif // KMP_REVERSE_HYPER_BAR
689  {
690  if (child_tid >= num_threads)
691  continue; // Child doesn't exist so keep going
692  else {
693  kmp_info_t *child_thr = other_threads[child_tid];
694  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
695 #if KMP_CACHE_MANAGE
696  kmp_uint32 next_child_tid = child_tid - (1 << level);
697 // Prefetch next thread's go count
698 #ifdef KMP_REVERSE_HYPER_BAR
699  if (child - 1 >= 1 && next_child_tid < num_threads)
700 #else
701  if (child + 1 < branch_factor && next_child_tid < num_threads)
702 #endif // KMP_REVERSE_HYPER_BAR
703  KMP_CACHE_PREFETCH(
704  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
705 #endif /* KMP_CACHE_MANAGE */
706 
707 #if KMP_BARRIER_ICV_PUSH
708  if (propagate_icvs) // push my fixed ICVs to my child
709  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
710 #endif // KMP_BARRIER_ICV_PUSH
711 
712  KA_TRACE(
713  20,
714  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
715  "go(%p): %u => %u\n",
716  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
717  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
718  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
719  // Release child from barrier
720  ANNOTATE_BARRIER_BEGIN(child_thr);
721  kmp_flag_64 flag(&child_bar->b_go, child_thr);
722  flag.release();
723  }
724  }
725  }
726 #if KMP_BARRIER_ICV_PUSH
727  if (propagate_icvs &&
728  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
729  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
730  FALSE);
731  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
732  &thr_bar->th_fixed_icvs);
733  }
734 #endif
735  KA_TRACE(
736  20,
737  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
738  gtid, team->t.t_id, tid, bt));
739 }
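// Release order with KMP_REVERSE_HYPER_BAR (a worked example, illustrative
// only): with branch_bits == 1 and num_threads == 8, the master (tid 0) counts
// up to its top level and then walks back down, releasing tid 4, then tid 2,
// then tid 1 -- the reverse of the order in which it gathered them (1, 2, 4).
// Each released thread does the same over its own sub-cube (e.g. tid 4
// releases 6, then 5), so the subtrees gathered last are released first.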
740 
741 // Hierarchical Barrier
742 
743 // Initialize thread barrier data
744 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
745  Performs the minimum amount of initialization required based on how the team
746  has changed. Returns true if leaf children will require both on-core and
747  traditional wake-up mechanisms. For example, if the team size increases,
748  threads already in the team will respond to on-core wakeup on their parent
749  thread, but threads newly added to the team will only be listening on
750  their local b_go. */
751 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
752  kmp_bstate_t *thr_bar,
753  kmp_uint32 nproc, int gtid,
754  int tid, kmp_team_t *team) {
755  // Checks to determine if (re-)initialization is needed
756  bool uninitialized = thr_bar->team == NULL;
757  bool team_changed = team != thr_bar->team;
758  bool team_sz_changed = nproc != thr_bar->nproc;
759  bool tid_changed = tid != thr_bar->old_tid;
760  bool retval = false;
761 
762  if (uninitialized || team_sz_changed) {
763  __kmp_get_hierarchy(nproc, thr_bar);
764  }
765 
766  if (uninitialized || team_sz_changed || tid_changed) {
767  thr_bar->my_level = thr_bar->depth - 1; // default for master
768  thr_bar->parent_tid = -1; // default for master
769  if (!KMP_MASTER_TID(
770  tid)) { // if not master, find parent thread in hierarchy
771  kmp_uint32 d = 0;
772  while (d < thr_bar->depth) { // find parent based on level of thread in
773  // hierarchy, and note level
774  kmp_uint32 rem;
775  if (d == thr_bar->depth - 2) { // reached level right below the master
776  thr_bar->parent_tid = 0;
777  thr_bar->my_level = d;
778  break;
779  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
780  0) { // TODO: can we make this op faster?
781  // thread is not a subtree root at next level, so this is max
782  thr_bar->parent_tid = tid - rem;
783  thr_bar->my_level = d;
784  break;
785  }
786  ++d;
787  }
788  }
789  thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
790  thr_bar->old_tid = tid;
791  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
792  thr_bar->team = team;
793  thr_bar->parent_bar =
794  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
795  }
796  if (uninitialized || team_changed || tid_changed) {
797  thr_bar->team = team;
798  thr_bar->parent_bar =
799  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
800  retval = true;
801  }
802  if (uninitialized || team_sz_changed || tid_changed) {
803  thr_bar->nproc = nproc;
804  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
805  if (thr_bar->my_level == 0)
806  thr_bar->leaf_kids = 0;
807  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
808  thr_bar->leaf_kids = nproc - tid - 1;
809  thr_bar->leaf_state = 0;
810  for (int i = 0; i < thr_bar->leaf_kids; ++i)
811  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
812  }
813  return retval;
814 }
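// Byte layout produced by the initialization above (a worked example,
// illustrative only): with offset = 7 - (tid - parent_tid - 1), a parent's
// leaf children map onto the high bytes of its 64-bit flag word:
//   child parent_tid+1 -> byte 7, parent_tid+2 -> byte 6, parent_tid+3 -> byte 5
// and leaf_state pre-sets exactly those bytes (for leaf_kids == 3 it marks
// bytes 7, 6 and 5). The oncore gather below waits on its own b_arrived until
// the target value (current b_arrived OR-ed with leaf_state) is reached, i.e.
// until every leaf child has written its byte, then clears the leaf bytes with
// KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state)).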
815 
816 static void __kmp_hierarchical_barrier_gather(
817  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
818  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
819  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
820  kmp_team_t *team = this_thr->th.th_team;
821  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
822  kmp_uint32 nproc = this_thr->th.th_team_nproc;
823  kmp_info_t **other_threads = team->t.t_threads;
824  kmp_uint64 new_state;
825 
826  int level = team->t.t_level;
827 #if OMP_40_ENABLED
828  if (other_threads[0]
829  ->th.th_teams_microtask) // are we inside the teams construct?
830  if (this_thr->th.th_teams_size.nteams > 1)
831  ++level; // level was not increased in teams construct for team_of_masters
832 #endif
833  if (level == 1)
834  thr_bar->use_oncore_barrier = 1;
835  else
836  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
837 
838  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
839  "barrier type %d\n",
840  gtid, team->t.t_id, tid, bt));
841  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
842 
843 #if USE_ITT_BUILD && USE_ITT_NOTIFY
844  // Barrier imbalance - save arrive time to the thread
845  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
846  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
847  }
848 #endif
849 
850  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
851  team);
852 
853  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
854  kmp_int32 child_tid;
855  new_state =
856  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
857  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
858  thr_bar->use_oncore_barrier) {
859  if (thr_bar->leaf_kids) {
860  // First, wait for leaf children to check-in on my b_arrived flag
861  kmp_uint64 leaf_state =
862  KMP_MASTER_TID(tid)
863  ? thr_bar->b_arrived | thr_bar->leaf_state
864  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
865  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
866  "for leaf kids\n",
867  gtid, team->t.t_id, tid));
868  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
869  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
870  if (reduce) {
871  ANNOTATE_REDUCE_AFTER(reduce);
872  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
873  ++child_tid) {
874  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
875  "T#%d(%d:%d)\n",
876  gtid, team->t.t_id, tid,
877  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
878  child_tid));
879  ANNOTATE_BARRIER_END(other_threads[child_tid]);
880  (*reduce)(this_thr->th.th_local.reduce_data,
881  other_threads[child_tid]->th.th_local.reduce_data);
882  }
883  ANNOTATE_REDUCE_BEFORE(reduce);
884  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
885  }
886  // clear leaf_state bits
887  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
888  }
889  // Next, wait for higher level children on each child's b_arrived flag
890  for (kmp_uint32 d = 1; d < thr_bar->my_level;
891  ++d) { // gather lowest level threads first, but skip 0
892  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
893  skip = thr_bar->skip_per_level[d];
894  if (last > nproc)
895  last = nproc;
896  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
897  kmp_info_t *child_thr = other_threads[child_tid];
898  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
899  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
900  "T#%d(%d:%d) "
901  "arrived(%p) == %llu\n",
902  gtid, team->t.t_id, tid,
903  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
904  child_tid, &child_bar->b_arrived, new_state));
905  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
906  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
907  ANNOTATE_BARRIER_END(child_thr);
908  if (reduce) {
909  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
910  "T#%d(%d:%d)\n",
911  gtid, team->t.t_id, tid,
912  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
913  child_tid));
914  ANNOTATE_REDUCE_AFTER(reduce);
915  (*reduce)(this_thr->th.th_local.reduce_data,
916  child_thr->th.th_local.reduce_data);
917  ANNOTATE_REDUCE_BEFORE(reduce);
918  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
919  }
920  }
921  }
922  } else { // Blocktime is not infinite
923  for (kmp_uint32 d = 0; d < thr_bar->my_level;
924  ++d) { // Gather lowest level threads first
925  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
926  skip = thr_bar->skip_per_level[d];
927  if (last > nproc)
928  last = nproc;
929  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
930  kmp_info_t *child_thr = other_threads[child_tid];
931  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
932  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
933  "T#%d(%d:%d) "
934  "arrived(%p) == %llu\n",
935  gtid, team->t.t_id, tid,
936  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
937  child_tid, &child_bar->b_arrived, new_state));
938  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
939  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
940  ANNOTATE_BARRIER_END(child_thr);
941  if (reduce) {
942  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
943  "T#%d(%d:%d)\n",
944  gtid, team->t.t_id, tid,
945  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
946  child_tid));
947  ANNOTATE_REDUCE_AFTER(reduce);
948  (*reduce)(this_thr->th.th_local.reduce_data,
949  child_thr->th.th_local.reduce_data);
950  ANNOTATE_REDUCE_BEFORE(reduce);
951  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
952  }
953  }
954  }
955  }
956  }
957  // All subordinates are gathered; now release parent if not master thread
958 
959  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
960  KA_TRACE(
961  20,
962  ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
963  "arrived(%p): %llu => %llu\n",
964  gtid, team->t.t_id, tid,
965  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
966  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
967  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
968  /* Mark arrival to parent: After performing this write, a worker thread may
969  not assume that the team is valid any more - it could be deallocated by
970  the master thread at any time. */
971  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
972  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
973  // flag; release it
974  ANNOTATE_BARRIER_BEGIN(this_thr);
975  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
976  flag.release();
977  } else { // Leaf does special release on the "offset" bits of parent's
978  // b_arrived flag
979  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
980  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
981  flag.set_waiter(other_threads[thr_bar->parent_tid]);
982  flag.release();
983  }
984  } else { // Master thread needs to update the team's b_arrived value
985  team->t.t_bar[bt].b_arrived = new_state;
986  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
987  "arrived(%p) = %llu\n",
988  gtid, team->t.t_id, tid, team->t.t_id,
989  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
990  }
991  // Is the team access below unsafe or just technically invalid?
992  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
993  "barrier type %d\n",
994  gtid, team->t.t_id, tid, bt));
995 }
996 
997 static void __kmp_hierarchical_barrier_release(
998  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
999  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1000  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1001  kmp_team_t *team;
1002  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1003  kmp_uint32 nproc;
1004  bool team_change = false; // indicates on-core barrier shouldn't be used
1005 
1006  if (KMP_MASTER_TID(tid)) {
1007  team = __kmp_threads[gtid]->th.th_team;
1008  KMP_DEBUG_ASSERT(team != NULL);
1009  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1010  "entered barrier type %d\n",
1011  gtid, team->t.t_id, tid, bt));
1012  } else { // Worker threads
1013  // Wait for parent thread to release me
1014  if (!thr_bar->use_oncore_barrier ||
1015  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1016  thr_bar->team == NULL) {
1017  // Use traditional method of waiting on my own b_go flag
1018  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1019  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1020  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1021  ANNOTATE_BARRIER_END(this_thr);
1022  TCW_8(thr_bar->b_go,
1023  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1024  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1025  // infinite, not nested
1026  // Wait on my "offset" bits on parent's b_go flag
1027  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1028  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1029  thr_bar->offset, bt,
1030  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1031  flag.wait(this_thr, TRUE);
1032  if (thr_bar->wait_flag ==
1033  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1034  TCW_8(thr_bar->b_go,
1035  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1036  } else { // Reset my bits on parent's b_go flag
1037  (RCAST(volatile char *,
1038  &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1039  }
1040  }
1041  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1042  // Early exit for reaping threads releasing forkjoin barrier
1043  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1044  return;
1045  // The worker thread may now assume that the team is valid.
1046  team = __kmp_threads[gtid]->th.th_team;
1047  KMP_DEBUG_ASSERT(team != NULL);
1048  tid = __kmp_tid_from_gtid(gtid);
1049 
1050  KA_TRACE(
1051  20,
1052  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1053  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1054  KMP_MB(); // Flush all pending memory write invalidates.
1055  }
1056 
1057  nproc = this_thr->th.th_team_nproc;
1058  int level = team->t.t_level;
1059 #if OMP_40_ENABLED
1060  if (team->t.t_threads[0]
1061  ->th.th_teams_microtask) { // are we inside the teams construct?
1062  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1063  this_thr->th.th_teams_level == level)
1064  ++level; // level was not increased in teams construct for team_of_workers
1065  if (this_thr->th.th_teams_size.nteams > 1)
1066  ++level; // level was not increased in teams construct for team_of_masters
1067  }
1068 #endif
1069  if (level == 1)
1070  thr_bar->use_oncore_barrier = 1;
1071  else
1072  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1073 
1074  // If the team size has increased, we still communicate with old leaves via
1075  // oncore barrier.
1076  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1077  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1078  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1079  tid, team);
1080  // But if the entire team changes, we won't use oncore barrier at all
1081  if (team_change)
1082  old_leaf_kids = 0;
1083 
1084 #if KMP_BARRIER_ICV_PUSH
1085  if (propagate_icvs) {
1086  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1087  FALSE);
1088  if (KMP_MASTER_TID(
1089  tid)) { // master already has copy in final destination; copy
1090  copy_icvs(&thr_bar->th_fixed_icvs,
1091  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1092  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1093  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1094  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1095  // leaves (on-core children) pull parent's fixed ICVs directly to local
1096  // ICV store
1097  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1098  &thr_bar->parent_bar->th_fixed_icvs);
1099  // non-leaves will get ICVs piggybacked with b_go via NGO store
1100  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1101  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1102  // access
1103  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1104  else // leaves copy parent's fixed ICVs directly to local ICV store
1105  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1106  &thr_bar->parent_bar->th_fixed_icvs);
1107  }
1108  }
1109 #endif // KMP_BARRIER_ICV_PUSH
1110 
1111  // Now, release my children
1112  if (thr_bar->my_level) { // not a leaf
1113  kmp_int32 child_tid;
1114  kmp_uint32 last;
1115  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1116  thr_bar->use_oncore_barrier) {
1117  if (KMP_MASTER_TID(tid)) { // do a flat release
1118  // Set local b_go to bump children via NGO store of the cache line
1119  // containing ICVs and b_go.
1120  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1121  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1122  // the cache line
1123  ngo_load(&thr_bar->th_fixed_icvs);
1124  // This loops over all the threads skipping only the leaf nodes in the
1125  // hierarchy
1126  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1127  child_tid += thr_bar->skip_per_level[1]) {
1128  kmp_bstate_t *child_bar =
1129  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1130  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1131  "releasing T#%d(%d:%d)"
1132  " go(%p): %u => %u\n",
1133  gtid, team->t.t_id, tid,
1134  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1135  child_tid, &child_bar->b_go, child_bar->b_go,
1136  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1137  // Use ngo store (if available) to both store ICVs and release child
1138  // via child's b_go
1139  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1140  }
1141  ngo_sync();
1142  }
1143  TCW_8(thr_bar->b_go,
1144  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1145  // Now, release leaf children
1146  if (thr_bar->leaf_kids) { // if there are any
1147  // We test team_change on the off-chance that the level 1 team changed.
1148  if (team_change ||
1149  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1150  if (old_leaf_kids) { // release old leaf kids
1151  thr_bar->b_go |= old_leaf_state;
1152  }
1153  // Release new leaf kids
1154  last = tid + thr_bar->skip_per_level[1];
1155  if (last > nproc)
1156  last = nproc;
1157  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1158  ++child_tid) { // skip_per_level[0]=1
1159  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1160  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1161  KA_TRACE(
1162  20,
1163  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1164  " T#%d(%d:%d) go(%p): %u => %u\n",
1165  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1166  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1167  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1168  // Release child using child's b_go flag
1169  ANNOTATE_BARRIER_BEGIN(child_thr);
1170  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1171  flag.release();
1172  }
1173  } else { // Release all children at once with leaf_state bits on my own
1174  // b_go flag
1175  thr_bar->b_go |= thr_bar->leaf_state;
1176  }
1177  }
1178  } else { // Blocktime is not infinite; do a simple hierarchical release
1179  for (int d = thr_bar->my_level - 1; d >= 0;
1180  --d) { // Release highest level threads first
1181  last = tid + thr_bar->skip_per_level[d + 1];
1182  kmp_uint32 skip = thr_bar->skip_per_level[d];
1183  if (last > nproc)
1184  last = nproc;
1185  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1186  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1187  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1188  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1189  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1190  gtid, team->t.t_id, tid,
1191  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1192  child_tid, &child_bar->b_go, child_bar->b_go,
1193  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1194  // Release child using child's b_go flag
1195  ANNOTATE_BARRIER_BEGIN(child_thr);
1196  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1197  flag.release();
1198  }
1199  }
1200  }
1201 #if KMP_BARRIER_ICV_PUSH
1202  if (propagate_icvs && !KMP_MASTER_TID(tid))
1203  // non-leaves copy ICVs from fixed ICVs to local dest
1204  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1205  &thr_bar->th_fixed_icvs);
1206 #endif // KMP_BARRIER_ICV_PUSH
1207  }
1208  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1209  "barrier type %d\n",
1210  gtid, team->t.t_id, tid, bt));
1211 }
1212 
1213 // End of Barrier Algorithms
1214 
1215 // Internal function to do a barrier.
1216  /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1217  If reduce is non-NULL, do a split reduction barrier; otherwise, do a
1218  split barrier without a reduction.
1219  Returns 0 for the master thread, 1 for a worker thread. */
1220 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1221  size_t reduce_size, void *reduce_data,
1222  void (*reduce)(void *, void *)) {
1223  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1224  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1225  int tid = __kmp_tid_from_gtid(gtid);
1226  kmp_info_t *this_thr = __kmp_threads[gtid];
1227  kmp_team_t *team = this_thr->th.th_team;
1228  int status = 0;
1229  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1230 #if OMPT_SUPPORT
1231  ompt_task_id_t my_task_id;
1232  ompt_parallel_id_t my_parallel_id;
1233 #endif
1234 
1235  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1236  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1237 
1238  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1239 #if OMPT_SUPPORT
1240  if (ompt_enabled) {
1241 #if OMPT_BLAME
1242  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1243  my_parallel_id = team->t.ompt_team_info.parallel_id;
1244 
1245 #if OMPT_TRACE
1246  if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1247  if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1248  ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1249  my_parallel_id, my_task_id);
1250  }
1251  }
1252 #endif
1253  if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1254  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(my_parallel_id,
1255  my_task_id);
1256  }
1257 #endif
1258  // It is OK to report the barrier state after the barrier begin callback.
1259  // According to the OMPT specification, a compliant implementation may
1260  // even delay reporting this state until the barrier begins to wait.
1261  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1262  }
1263 #endif
1264 
1265  if (!team->t.t_serialized) {
1266 #if USE_ITT_BUILD
1267  // This value will be used in itt notify events below.
1268  void *itt_sync_obj = NULL;
1269 #if USE_ITT_NOTIFY
1270  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1271  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1272 #endif
1273 #endif /* USE_ITT_BUILD */
1274  if (__kmp_tasking_mode == tskm_extra_barrier) {
1275  __kmp_tasking_barrier(team, this_thr, gtid);
1276  KA_TRACE(15,
1277  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1278  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1279  }
1280 
1281  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1282  access it when the team struct is not guaranteed to exist. */
1283  // See note about the corresponding code in __kmp_join_barrier() being
1284  // performance-critical.
1285  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1286 #if KMP_USE_MONITOR
1287  this_thr->th.th_team_bt_intervals =
1288  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1289  this_thr->th.th_team_bt_set =
1290  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1291 #else
1292  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
1293 #endif
1294  }
1295 
1296 #if USE_ITT_BUILD
1297  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1298  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1299 #endif /* USE_ITT_BUILD */
1300 #if USE_DEBUGGER
1301  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1302  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1303  team->t.t_bar[bt].b_master_arrived += 1;
1304  } else {
1305  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1306  } // if
1307 #endif /* USE_DEBUGGER */
1308  if (reduce != NULL) {
1309  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1310  this_thr->th.th_local.reduce_data = reduce_data;
1311  }
1312 
1313  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1314  __kmp_task_team_setup(
1315  this_thr, team,
1316  0); // use 0 to only setup the current team if nthreads > 1
1317 
1318  switch (__kmp_barrier_gather_pattern[bt]) {
1319  case bp_hyper_bar: {
1320  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1321  // to 0; use linear
1322  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1323  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1324  break;
1325  }
1326  case bp_hierarchical_bar: {
1327  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid,
1328  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1329  break;
1330  }
1331  case bp_tree_bar: {
1332  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1333  // to 0; use linear
1334  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1335  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1336  break;
1337  }
1338  default: {
1339  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1340  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1341  }
1342  }
1343 
1344  KMP_MB();
1345 
1346  if (KMP_MASTER_TID(tid)) {
1347  status = 0;
1348  if (__kmp_tasking_mode != tskm_immediate_exec) {
1349  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1350  }
1351 #if USE_DEBUGGER
1352  // Let the debugger know: all threads have arrived and are starting to
1353  // leave the barrier.
1354  team->t.t_bar[bt].b_team_arrived += 1;
1355 #endif
1356 
1357 #if OMP_40_ENABLED
1358  // Reset cancellation flag for worksharing constructs
1359  if (team->t.t_cancel_request == cancel_loop ||
1360  team->t.t_cancel_request == cancel_sections) {
1361  team->t.t_cancel_request = cancel_noreq;
1362  }
1363 #endif
1364 #if USE_ITT_BUILD
1365  /* TODO: In case of split reduction barrier, master thread may send
1366  acquired event early, before the final summation into the shared
1367  variable is done (final summation can be a long operation for array
1368  reductions). */
1369  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1370  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1371 #endif /* USE_ITT_BUILD */
1372 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1373  // Barrier - report frame end (only if active_level == 1)
1374  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1375  __kmp_forkjoin_frames_mode &&
1376 #if OMP_40_ENABLED
1377  this_thr->th.th_teams_microtask == NULL &&
1378 #endif
1379  team->t.t_active_level == 1) {
1380  kmp_uint64 cur_time = __itt_get_timestamp();
1381  kmp_info_t **other_threads = team->t.t_threads;
1382  int nproc = this_thr->th.th_team_nproc;
1383  int i;
1384  switch (__kmp_forkjoin_frames_mode) {
1385  case 1:
1386  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1387  loc, nproc);
1388  this_thr->th.th_frame_time = cur_time;
1389  break;
1390  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1391  // be fixed)
1392  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1393  1, loc, nproc);
1394  break;
1395  case 3:
1396  if (__itt_metadata_add_ptr) {
1397  // Initialize with master's wait time
1398  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1399  // Set arrive time to zero to be able to check it in
1400  // __kmp_invoke_task(); the same is done inside the loop below
1401  this_thr->th.th_bar_arrive_time = 0;
1402  for (i = 1; i < nproc; ++i) {
1403  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1404  other_threads[i]->th.th_bar_arrive_time = 0;
1405  }
1406  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1407  cur_time, delta,
1408  (kmp_uint64)(reduce != NULL));
1409  }
1410  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1411  loc, nproc);
1412  this_thr->th.th_frame_time = cur_time;
1413  break;
1414  }
1415  }
1416 #endif /* USE_ITT_BUILD */
1417  } else {
1418  status = 1;
1419 #if USE_ITT_BUILD
1420  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1421  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1422 #endif /* USE_ITT_BUILD */
1423  }
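// Editor's illustrative sketch (excluded from the build) of the metric
// computed in the __kmp_forkjoin_frames_mode == 3 branch above: the reported
// delta is the total time the team spent waiting at this barrier, i.e. the
// sum over all team members of (cur_time - arrive_time). The standalone
// helper below is hypothetical.
#if 0
#include <cstdint>
static uint64_t barrier_imbalance(uint64_t cur_time, uint64_t *arrive_time,
                                  int nproc) {
  uint64_t delta = 0;
  for (int i = 0; i < nproc; ++i) {
    delta += cur_time - arrive_time[i]; // time thread i sat in the barrier
    arrive_time[i] = 0; // reset, as the runtime does, so a stale value is
                        // detectable later in __kmp_invoke_task()
  }
  return delta;
}
#endif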
1424  if (status == 1 || !is_split) {
1425  switch (__kmp_barrier_release_pattern[bt]) {
1426  case bp_hyper_bar: {
1427  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1428  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1429  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1430  break;
1431  }
1432  case bp_hierarchical_bar: {
1433  __kmp_hierarchical_barrier_release(
1434  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1435  break;
1436  }
1437  case bp_tree_bar: {
1438  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1439  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1440  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1441  break;
1442  }
1443  default: {
1444  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1445  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1446  }
1447  }
1448  if (__kmp_tasking_mode != tskm_immediate_exec) {
1449  __kmp_task_team_sync(this_thr, team);
1450  }
1451  }
1452 
1453 #if USE_ITT_BUILD
1454  /* GEH: TODO: Move this under if-condition above and also include in
1455  __kmp_end_split_barrier(). This will more accurately represent the actual
1456  release time of the threads for split barriers. */
1457  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1458  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1459 #endif /* USE_ITT_BUILD */
1460  } else { // Team is serialized.
1461  status = 0;
1462  if (__kmp_tasking_mode != tskm_immediate_exec) {
1463 #if OMP_45_ENABLED
1464  if (this_thr->th.th_task_team != NULL) {
1465  void *itt_sync_obj = NULL;
1466 #if USE_ITT_NOTIFY
1467  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1468  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1469  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1470  }
1471 #endif
1472 
1473  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1474  TRUE);
1475  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1476  __kmp_task_team_setup(this_thr, team, 0);
1477 
1478 #if USE_ITT_BUILD
1479  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1480  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1481 #endif /* USE_ITT_BUILD */
1482  }
1483 #else
1484  // The task team should be NULL for serialized code (tasks will be
1485  // executed immediately)
1486  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1487  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1488 #endif
1489  }
1490  }
1491  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1492  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1493  __kmp_tid_from_gtid(gtid), status));
1494 
1495 #if OMPT_SUPPORT
1496  if (ompt_enabled) {
1497 #if OMPT_BLAME
1498  if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1499  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(my_parallel_id,
1500  my_task_id);
1501  }
1502 #endif
1503  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1504  }
1505 #endif
1506  ANNOTATE_BARRIER_END(&team->t.t_bar);
1507 
1508  return status;
1509 }
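// Editor's illustrative sketch (excluded from the build): a minimal
// gather/release barrier with "split" semantics in the spirit of
// __kmp_barrier. Every thread signals arrival in a gather step; the release
// step can then run immediately (plain barrier) or be deferred by the
// releasing thread (split barrier), which is what __kmp_end_split_barrier
// below completes. All names are hypothetical, and the synchronization is a
// simple counter/generation scheme rather than the runtime's flag machinery.
#if 0
#include <atomic>

struct mini_split_barrier {
  std::atomic<unsigned> arrived{0}; // gather counter (b_arrived analogue)
  std::atomic<unsigned> go{0};      // release generation (b_go analogue)
  unsigned nthreads;

  explicit mini_split_barrier(unsigned n) : nthreads(n) {}

  // Gather: returns 0 to the thread that observed the last arrival (it owns
  // the release), 1 to everyone else -- mirroring __kmp_barrier's status.
  int gather(unsigned &my_generation) {
    my_generation = go.load(std::memory_order_acquire);
    unsigned prev = arrived.fetch_add(1, std::memory_order_acq_rel);
    return (prev + 1 == nthreads) ? 0 : 1;
  }

  // Release: the owner resets the counter and bumps the generation; the
  // others spin until the generation changes (a real runtime would yield or
  // sleep once the blocktime interval expires).
  void release(int status, unsigned my_generation) {
    if (status == 0) {
      arrived.store(0, std::memory_order_relaxed);
      go.fetch_add(1, std::memory_order_release);
    } else {
      while (go.load(std::memory_order_acquire) == my_generation) {
      }
    }
  }
};
#endif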
1510 
1511 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1512  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1513  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1514  int tid = __kmp_tid_from_gtid(gtid);
1515  kmp_info_t *this_thr = __kmp_threads[gtid];
1516  kmp_team_t *team = this_thr->th.th_team;
1517 
1518  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1519  if (!team->t.t_serialized) {
1520  if (KMP_MASTER_GTID(gtid)) {
1521  switch (__kmp_barrier_release_pattern[bt]) {
1522  case bp_hyper_bar: {
1523  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1524  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1525  FALSE USE_ITT_BUILD_ARG(NULL));
1526  break;
1527  }
1528  case bp_hierarchical_bar: {
1529  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1530  FALSE USE_ITT_BUILD_ARG(NULL));
1531  break;
1532  }
1533  case bp_tree_bar: {
1534  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1535  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1536  FALSE USE_ITT_BUILD_ARG(NULL));
1537  break;
1538  }
1539  default: {
1540  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1541  FALSE USE_ITT_BUILD_ARG(NULL));
1542  }
1543  }
1544  if (__kmp_tasking_mode != tskm_immediate_exec) {
1545  __kmp_task_team_sync(this_thr, team);
1546  } // if
1547  }
1548  }
1549  ANNOTATE_BARRIER_END(&team->t.t_bar);
1550 }
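// Editor's illustrative sketch (excluded from the build): how the split pair
// is used. When __kmp_barrier is entered with is_split set, the master can
// return from the gather half with status 0 without releasing the workers;
// after finishing the work that must precede the release (e.g. the final
// summation of a reduction), __kmp_end_split_barrier above performs only the
// release half. In terms of the hypothetical mini_split_barrier sketched
// after __kmp_barrier:
#if 0
void split_barrier_usage(mini_split_barrier &bar) {
  unsigned gen;
  int status = bar.gather(gen); // first half: everyone signals arrival
  if (status == 0) {
    // ... owner-only work that must complete before the workers proceed ...
    bar.release(0, gen);        // second half: let the workers go
  } else {
    bar.release(1, gen);        // workers simply wait for the release
  }
}
#endif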
1551 
1552 void __kmp_join_barrier(int gtid) {
1553  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1554  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1555  kmp_info_t *this_thr = __kmp_threads[gtid];
1556  kmp_team_t *team;
1557  kmp_uint nproc;
1558  kmp_info_t *master_thread;
1559  int tid;
1560 #ifdef KMP_DEBUG
1561  int team_id;
1562 #endif /* KMP_DEBUG */
1563 #if USE_ITT_BUILD
1564  void *itt_sync_obj = NULL;
1565 #if USE_ITT_NOTIFY
1566  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine needlessly
1567  // Get object created at fork_barrier
1568  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1569 #endif
1570 #endif /* USE_ITT_BUILD */
1571  KMP_MB();
1572 
1573  // Get current info
1574  team = this_thr->th.th_team;
1575  nproc = this_thr->th.th_team_nproc;
1576  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1577  tid = __kmp_tid_from_gtid(gtid);
1578 #ifdef KMP_DEBUG
1579  team_id = team->t.t_id;
1580 #endif /* KMP_DEBUG */
1581  master_thread = this_thr->th.th_team_master;
1582 #ifdef KMP_DEBUG
1583  if (master_thread != team->t.t_threads[0]) {
1584  __kmp_print_structure();
1585  }
1586 #endif /* KMP_DEBUG */
1587  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1588  KMP_MB();
1589 
1590  // Verify state
1591  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1592  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1593  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1594  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1595  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1596  gtid, team_id, tid));
1597 
1598  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1599 #if OMPT_SUPPORT
1600 #if OMPT_TRACE
1601  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1602  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1603  team->t.ompt_team_info.parallel_id,
1604  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1605  }
1606 #endif
1607  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1608 #endif
1609 
1610  if (__kmp_tasking_mode == tskm_extra_barrier) {
1611  __kmp_tasking_barrier(team, this_thr, gtid);
1612  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1613  team_id, tid));
1614  }
1615 #ifdef KMP_DEBUG
1616  if (__kmp_tasking_mode != tskm_immediate_exec) {
1617  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1618  "%p, th_task_team = %p\n",
1619  __kmp_gtid_from_thread(this_thr), team_id,
1620  team->t.t_task_team[this_thr->th.th_task_state],
1621  this_thr->th.th_task_team));
1622  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1623  team->t.t_task_team[this_thr->th.th_task_state]);
1624  }
1625 #endif /* KMP_DEBUG */
1626 
1627  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1628  access it when the team struct is not guaranteed to exist. Doing these
1629  loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
1630  we do not perform the copy if blocktime=infinite, since the values are not
1631  used by __kmp_wait_template() in that case. */
1632  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1633 #if KMP_USE_MONITOR
1634  this_thr->th.th_team_bt_intervals =
1635  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1636  this_thr->th.th_team_bt_set =
1637  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1638 #else
1639  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
1640 #endif
1641  }
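// Editor's illustrative sketch (excluded from the build) of the caching
// pattern above: anything the wait loop may need is copied out of the team
// struct (which is not guaranteed to outlive the wait) into storage owned by
// the thread, and the copy is skipped when blocktime is "infinite" because
// the wait loop never consults it in that case. All names below are
// hypothetical.
#if 0
struct per_thread_cache {
  int bt_intervals;
  int bt_set;
};
static void cache_wait_params(const int *team_bt_intervals,
                              const int *team_bt_set, int tid, int blocktime,
                              int max_blocktime, per_thread_cache *out) {
  if (blocktime == max_blocktime) // "infinite": avoid the cache-missing loads
    return;
  out->bt_intervals = team_bt_intervals[tid]; // private copy for the wait
  out->bt_set = team_bt_set[tid];
}
#endif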
1642 
1643 #if USE_ITT_BUILD
1644  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1645  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1646 #endif /* USE_ITT_BUILD */
1647 
1648  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1649  case bp_hyper_bar: {
1650  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1651  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1652  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1653  break;
1654  }
1655  case bp_hierarchical_bar: {
1656  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1657  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1658  break;
1659  }
1660  case bp_tree_bar: {
1661  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1662  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1663  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1664  break;
1665  }
1666  default: {
1667  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1668  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1669  }
1670  }
1671 
1672  /* From this point on, the team data structure may be deallocated at any time
1673  by the master thread - it is unsafe to reference it in any of the worker
1674  threads. Any per-team data items that need to be referenced before the
1675  end of the barrier should be moved to the kmp_task_team_t structs. */
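// Editor's illustrative sketch (excluded from the build) of the rule stated
// above: a worker captures anything it still needs from the team *before*
// signalling its arrival, because the master may free the team as soon as
// every arrival has been counted. The wrapper below is hypothetical.
#if 0
static void worker_join_example(kmp_info_t *this_thr, kmp_team_t *team,
                                int gtid, int tid) {
  int cached_team_id = team->t.t_id; // capture while the team is still valid
  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(NULL));
  // From here on, use only the cached copy; `team` may already be gone.
  KA_TRACE(10, ("worker T#%d left the join barrier of team %d\n", gtid,
                cached_team_id));
}
#endif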
1676  if (KMP_MASTER_TID(tid)) {
1677  if (__kmp_tasking_mode != tskm_immediate_exec) {
1678  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1679  }
1680 #if KMP_STATS_ENABLED
1681  // Have the master thread flag the workers to indicate they are now waiting
1682  // for the next parallel region. Also wake them up so they switch their
1683  // timers to idle.
1684  for (int i = 0; i < team->t.t_nproc; ++i) {
1685  kmp_info_t *team_thread = team->t.t_threads[i];
1686  if (team_thread == this_thr)
1687  continue;
1688  team_thread->th.th_stats->setIdleFlag();
1689  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1690  team_thread->th.th_sleep_loc != NULL)
1691  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1692  team_thread->th.th_sleep_loc);
1693  }
1694 #endif
1695 #if USE_ITT_BUILD
1696  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1697  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1698 #endif /* USE_ITT_BUILD */
1699 
1700 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1701  // Join barrier - report frame end
1702  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1703  __kmp_forkjoin_frames_mode &&
1704 #if OMP_40_ENABLED
1705  this_thr->th.th_teams_microtask == NULL &&
1706 #endif
1707  team->t.t_active_level == 1) {
1708  kmp_uint64 cur_time = __itt_get_timestamp();
1709  ident_t *loc = team->t.t_ident;
1710  kmp_info_t **other_threads = team->t.t_threads;
1711  int nproc = this_thr->th.th_team_nproc;
1712  int i;
1713  switch (__kmp_forkjoin_frames_mode) {
1714  case 1:
1715  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1716  loc, nproc);
1717  break;
1718  case 2:
1719  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1720  loc, nproc);
1721  break;
1722  case 3:
1723  if (__itt_metadata_add_ptr) {
1724  // Initialize with master's wait time
1725  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1726  // Set arrive time to zero to be able to check it in
1727  // __kmp_invoke_task(); the same is done inside the loop below
1728  this_thr->th.th_bar_arrive_time = 0;
1729  for (i = 1; i < nproc; ++i) {
1730  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1731  other_threads[i]->th.th_bar_arrive_time = 0;
1732  }
1733  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1734  cur_time, delta, 0);
1735  }
1736  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1737  loc, nproc);
1738  this_thr->th.th_frame_time = cur_time;
1739  break;
1740  }
1741  }
1742 #endif /* USE_ITT_BUILD */
1743  }
1744 #if USE_ITT_BUILD
1745  else {
1746  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1747  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1748  }
1749 #endif /* USE_ITT_BUILD */
1750 
1751 #if KMP_DEBUG
1752  if (KMP_MASTER_TID(tid)) {
1753  KA_TRACE(
1754  15,
1755  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1756  gtid, team_id, tid, nproc));
1757  }
1758 #endif /* KMP_DEBUG */
1759 
1760  // TODO now, mark worker threads as done so they may be disbanded
1761  KMP_MB(); // Flush all pending memory write invalidates.
1762  KA_TRACE(10,
1763  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1764 
1765 #if OMPT_SUPPORT
1766  if (ompt_enabled) {
1767 #if OMPT_BLAME
1768  if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1769  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1770  team->t.ompt_team_info.parallel_id,
1771  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1772  }
1773 #endif
1774 
1775  // return to default state
1776  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1777  }
1778 #endif
1779  ANNOTATE_BARRIER_END(&team->t.t_bar);
1780 }
1781 
1782 // TODO release worker threads' fork barriers as we are ready instead of all at
1783 // once
1784 void __kmp_fork_barrier(int gtid, int tid) {
1785  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1786  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1787  kmp_info_t *this_thr = __kmp_threads[gtid];
1788  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1789 #if USE_ITT_BUILD
1790  void *itt_sync_obj = NULL;
1791 #endif /* USE_ITT_BUILD */
1792  if (team)
1793  ANNOTATE_BARRIER_END(&team->t.t_bar);
1794 
1795  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1796  (team != NULL) ? team->t.t_id : -1, tid));
1797 
1798  // th_team pointer only valid for master thread here
1799  if (KMP_MASTER_TID(tid)) {
1800 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1801  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1802  // Create itt barrier object
1803  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1804  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1805  }
1806 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1807 
1808 #ifdef KMP_DEBUG
1809  kmp_info_t **other_threads = team->t.t_threads;
1810  int i;
1811 
1812  // Verify state
1813  KMP_MB();
1814 
1815  for (i = 1; i < team->t.t_nproc; ++i) {
1816  KA_TRACE(500,
1817  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1818  "== %u.\n",
1819  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1820  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1821  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1822  KMP_DEBUG_ASSERT(
1823  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1824  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1825  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1826  }
1827 #endif
1828 
1829  if (__kmp_tasking_mode != tskm_immediate_exec) {
1830  // 0 indicates setup current task team if nthreads > 1
1831  __kmp_task_team_setup(this_thr, team, 0);
1832  }
1833 
1834  /* The master thread may have changed its blocktime between the join barrier
1835  and the fork barrier. Copy the blocktime info to the thread, where
1836  __kmp_wait_template() can access it when the team struct is not
1837  guaranteed to exist. */
1838  // See note about the corresponding code in __kmp_join_barrier() being
1839  // performance-critical
1840  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1841 #if KMP_USE_MONITOR
1842  this_thr->th.th_team_bt_intervals =
1843  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1844  this_thr->th.th_team_bt_set =
1845  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1846 #else
1847  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
1848 #endif
1849  }
1850  } // master
1851 
1852  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1853  case bp_hyper_bar: {
1854  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1855  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1856  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1857  break;
1858  }
1859  case bp_hierarchical_bar: {
1860  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1861  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1862  break;
1863  }
1864  case bp_tree_bar: {
1865  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1866  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1867  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1868  break;
1869  }
1870  default: {
1871  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1872  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1873  }
1874  }
1875 
1876  // Early exit for reaping threads releasing forkjoin barrier
1877  if (TCR_4(__kmp_global.g.g_done)) {
1878  this_thr->th.th_task_team = NULL;
1879 
1880 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1881  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1882  if (!KMP_MASTER_TID(tid)) {
1883  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1884  if (itt_sync_obj)
1885  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1886  }
1887  }
1888 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1889  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1890  return;
1891  }
1892 
1893  /* We can now assume that a valid team structure has been allocated by the
1894  master and propagated to all worker threads. The current thread, however,
1895  may not be part of the team, so we can't blindly assume that the team
1896  pointer is non-null. */
1897  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1898  KMP_DEBUG_ASSERT(team != NULL);
1899  tid = __kmp_tid_from_gtid(gtid);
1900 
1901 #if KMP_BARRIER_ICV_PULL
1902  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1903  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
1904  implicit task has this data before this function is called. We cannot
1905  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
1906  struct, because it is not always the case that the threads arrays have
1907  been allocated when __kmp_fork_call() is executed. */
1908  {
1909  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1910  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1911  // Copy the initial ICVs from the master's thread struct to the implicit
1912  // task for this tid.
1913  KA_TRACE(10,
1914  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1915  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
1916  tid, FALSE);
1917  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1918  &team->t.t_threads[0]
1919  ->th.th_bar[bs_forkjoin_barrier]
1920  .bb.th_fixed_icvs);
1921  }
1922  }
1923 #endif // KMP_BARRIER_ICV_PULL
1924 
1925  if (__kmp_tasking_mode != tskm_immediate_exec) {
1926  __kmp_task_team_sync(this_thr, team);
1927  }
1928 
1929 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1930  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1931  if (proc_bind == proc_bind_intel) {
1932 #endif
1933 #if KMP_AFFINITY_SUPPORTED
1934  // Call dynamic affinity settings
1935  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1936  __kmp_balanced_affinity(tid, team->t.t_nproc);
1937  }
1938 #endif // KMP_AFFINITY_SUPPORTED
1939 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1940  } else if (proc_bind != proc_bind_false) {
1941  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1942  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1943  __kmp_gtid_from_thread(this_thr),
1944  this_thr->th.th_current_place));
1945  } else {
1946  __kmp_affinity_set_place(gtid);
1947  }
1948  }
1949 #endif
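// Editor's illustrative sketch (excluded from the build): the user-level
// request that drives the placement logic above. With an explicit proc_bind
// policy (anything other than the default "intel" policy handled first),
// each thread is moved to its assigned place here at the fork barrier, and
// only if its place actually changed. The function name is hypothetical.
#if 0
#include <omp.h>
void pinned_region(void) {
#pragma omp parallel proc_bind(close) num_threads(4)
  {
    // By the time user code runs, every thread is already on its place.
    int my_place = omp_get_place_num();
    (void)my_place;
  }
}
#endif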
1950 
1951 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1952  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1953  if (!KMP_MASTER_TID(tid)) {
1954  // Get correct barrier object
1955  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1956  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1957  } // (prepare called inside barrier_release)
1958  }
1959 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1960  ANNOTATE_BARRIER_END(&team->t.t_bar);
1961  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
1962  team->t.t_id, tid));
1963 }
1964 
1965 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
1966  kmp_internal_control_t *new_icvs, ident_t *loc) {
1967  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
1968 
1969  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1970  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1971 
1972 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1973  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
1974  implicit task has this data before this function is called. */
1975 #if KMP_BARRIER_ICV_PULL
1976  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
1977  remains untouched), where all of the worker threads can access them and
1978  make their own copies after the barrier. */
1979  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
1980  // allocated at this point
1981  copy_icvs(
1982  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
1983  new_icvs);
1984  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
1985  team->t.t_threads[0], team));
1986 #elif KMP_BARRIER_ICV_PUSH
1987  // The ICVs will be propagated in the fork barrier, so nothing needs to be
1988  // done here.
1989  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
1990  team->t.t_threads[0], team));
1991 #else
1992  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
1993  // time.
1994  ngo_load(new_icvs);
1995  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
1996  // allocated at this point
1997  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
1998  // TODO: GEH - pass in better source location info since usually NULL here
1999  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2000  f, team->t.t_threads[f], team));
2001  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2002  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2003  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2004  f, team->t.t_threads[f], team));
2005  }
2006  ngo_sync();
2007 #endif // KMP_BARRIER_ICV_PULL
2008 }
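// Editor's illustrative sketch (excluded from the build) contrasting the
// three propagation strategies selected above. This is a conceptual summary
// with hypothetical names, not the runtime's data structures.
#if 0
enum icv_copy_strategy { ICV_PULL, ICV_PUSH, ICV_LINEAR };

template <class Team, class ICVs>
static void propagate_icvs(icv_copy_strategy how, Team &team,
                           const ICVs &new_icvs, int nproc) {
  switch (how) {
  case ICV_PULL:
    // Master publishes a single fixed copy; each worker copies it for itself
    // on the far side of the fork barrier (see __kmp_fork_barrier), so the
    // work done here is O(1).
    team.master_fixed_icvs = new_icvs;
    break;
  case ICV_PUSH:
    // Nothing to do here: the barrier release path itself carries the ICVs
    // down to the workers.
    break;
  case ICV_LINEAR:
    // Master writes every worker's implicit task directly: O(nproc) work,
    // optionally with non-temporal stores (the ngo_* macros) on KMP_MIC.
    for (int f = 1; f < nproc; ++f)
      team.implicit_task_icvs[f] = new_icvs;
    break;
  }
}
#endif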