Intel® OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 /* <copyright>
6  Copyright (c) 1997-2015 Intel Corporation. All Rights Reserved.
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11 
12  * Redistributions of source code must retain the above copyright
13  notice, this list of conditions and the following disclaimer.
14  * Redistributions in binary form must reproduce the above copyright
15  notice, this list of conditions and the following disclaimer in the
16  documentation and/or other materials provided with the distribution.
17  * Neither the name of Intel Corporation nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 </copyright> */
34 
35 #include "kmp.h"
36 #include "kmp_wait_release.h"
37 #include "kmp_stats.h"
38 #include "kmp_itt.h"
39 
40 #if KMP_MIC
41 #include <immintrin.h>
42 #define USE_NGO_STORES 1
43 #endif // KMP_MIC
44 
45 #if KMP_MIC && USE_NGO_STORES
46 // ICV copying
47 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
48 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
49 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
50 #define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
51 #else
52 #define ngo_load(src) ((void)0)
53 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
54 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
55 #define ngo_sync() ((void)0)
56 #endif /* KMP_MIC && USE_NGO_STORES */
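/* Illustrative sketch (editor's annotation, not part of the original source):
   the intended use of the ngo_* macros above is a load-once, store-many copy of
   the master's ICV cache line, with a fence before the release that follows.
   The type kmp_internal_control_t and the example function are assumptions for
   illustration only. */
#if 0
static void example_propagate_icvs(kmp_internal_control_t *icvs, int nproc)
{
    ngo_load(&icvs[0]);                      // read the master ICV cache line once
    for (int i = 1; i < nproc; ++i)
        ngo_store_icvs(&icvs[i], &icvs[0]);  // stream it to each worker's slot
    ngo_sync();                              // order the NGO stores before the release
}
#endif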
57 
58 void __kmp_print_structure(void); // Forward declaration
59 
60 // ---------------------------- Barrier Algorithms ----------------------------
61 
62 // Linear Barrier
63 static void
64 __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
65  void (*reduce)(void *, void *)
66  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
67 {
68  KMP_TIME_BLOCK(KMP_linear_gather);
69  register kmp_team_t *team = this_thr->th.th_team;
70  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
71  register kmp_info_t **other_threads = team->t.t_threads;
72 
73  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
74  gtid, team->t.t_id, tid, bt));
75  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
76 
77 #if USE_ITT_BUILD && USE_ITT_NOTIFY
78  // Barrier imbalance - save arrive time to the thread
79  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
80  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
81  }
82 #endif
83  // We now perform a linear reduction to signal that all of the threads have arrived.
84  if (!KMP_MASTER_TID(tid)) {
85  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
86  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
87  __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
88  thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
89  // Mark arrival to master thread
90  /* After performing this write, a worker thread may not assume that the team is valid
91  any more - it could be deallocated by the master thread at any time. */
92  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
93  flag.release();
94  } else {
95  register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
96  register int nproc = this_thr->th.th_team_nproc;
97  register int i;
98  // No sleep bit or atomic needed here, since only the master thread updates the team's arrived state
99  register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
100 
101  // Collect all the worker team member threads.
102  for (i=1; i<nproc; ++i) {
103 #if KMP_CACHE_MANAGE
104  // Prefetch next thread's arrived count
105  if (i+1 < nproc)
106  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
107 #endif /* KMP_CACHE_MANAGE */
108  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
109  "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
110  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
111  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
112 
113  // Wait for worker thread to arrive
114  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
115  flag.wait(this_thr, FALSE
116  USE_ITT_BUILD_ARG(itt_sync_obj) );
117 #if USE_ITT_BUILD && USE_ITT_NOTIFY
118  // Barrier imbalance - write min of the thread time and the other thread time to the thread.
119  if (__kmp_forkjoin_frames_mode == 2) {
120  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
121  other_threads[i]->th.th_bar_min_time);
122  }
123 #endif
124  if (reduce) {
125  KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
126  team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
127  (*reduce)(this_thr->th.th_local.reduce_data,
128  other_threads[i]->th.th_local.reduce_data);
129  }
130  }
131  // No sleep bit or atomic needed here, since only the master thread updates the team's arrived state
132  team_bar->b_arrived = new_state;
133  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
134  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
135  }
136  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
137  gtid, team->t.t_id, tid, bt));
138 }
139 
140 static void
141 __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
142  int propagate_icvs
143  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
144 {
145  KMP_TIME_BLOCK(KMP_linear_release);
146  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
147  register kmp_team_t *team;
148 
149  if (KMP_MASTER_TID(tid)) {
150  register unsigned int i;
151  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
152  register kmp_info_t **other_threads;
153 
154  team = __kmp_threads[gtid]->th.th_team;
155  KMP_DEBUG_ASSERT(team != NULL);
156  other_threads = team->t.t_threads;
157 
158  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
159  gtid, team->t.t_id, tid, bt));
160 
161  if (nproc > 1) {
162 #if KMP_BARRIER_ICV_PUSH
163  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
164  if (propagate_icvs) {
165  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
166  for (i=1; i<nproc; ++i) {
167  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
168  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
169  &team->t.t_implicit_task_taskdata[0].td_icvs);
170  }
171  ngo_sync();
172  }
173  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
174 #endif // KMP_BARRIER_ICV_PUSH
175 
176  // Now, release all of the worker threads
177  for (i=1; i<nproc; ++i) {
178 #if KMP_CACHE_MANAGE
179  // Prefetch next thread's go flag
180  if (i+1 < nproc)
181  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
182 #endif /* KMP_CACHE_MANAGE */
183  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
184  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
185  other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
186  &other_threads[i]->th.th_bar[bt].bb.b_go,
187  other_threads[i]->th.th_bar[bt].bb.b_go,
188  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
189  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
190  flag.release();
191  }
192  }
193  } else { // Wait for the MASTER thread to release us
194  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
195  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
196  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
197  flag.wait(this_thr, TRUE
198  USE_ITT_BUILD_ARG(itt_sync_obj) );
199 #if USE_ITT_BUILD && USE_ITT_NOTIFY
200  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
201  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
202  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
203  // Cancel wait on previous parallel region...
204  __kmp_itt_task_starting(itt_sync_obj);
205 
206  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
207  return;
208 
209  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
210  if (itt_sync_obj != NULL)
211  // Call prepare as early as possible for "new" barrier
212  __kmp_itt_task_finished(itt_sync_obj);
213  } else
214 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
215  // Early exit for reaping threads releasing forkjoin barrier
216  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
217  return;
218  // The worker thread may now assume that the team is valid.
219 #ifdef KMP_DEBUG
220  tid = __kmp_tid_from_gtid(gtid);
221  team = __kmp_threads[gtid]->th.th_team;
222 #endif
223  KMP_DEBUG_ASSERT(team != NULL);
224  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
225  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
226  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
227  KMP_MB(); // Flush all pending memory write invalidates.
228  }
229  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
230  gtid, team->t.t_id, tid, bt));
231 }
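/* Illustrative sketch (editor's annotation, not part of the original source):
   a minimal model of the linear gather/release handshake implemented above,
   using C++11 atomics in place of kmp_flag_64 and ignoring sleep/suspend,
   tasking, ICV propagation, and ITT instrumentation. The toy_* names are
   assumptions for illustration. */
#if 0
#include <atomic>
struct toy_bstate { std::atomic<unsigned> arrived{0}, go{0}; };

static void toy_linear_barrier(toy_bstate *bars, int nproc, int tid, unsigned epoch)
{
    if (tid != 0) {                                                    // worker
        bars[tid].arrived.store(epoch + 1, std::memory_order_release); // signal arrival
        while (bars[tid].go.load(std::memory_order_acquire) < epoch + 1)
            ;                                                          // spin until released
    } else {                                                           // master
        for (int i = 1; i < nproc; ++i)                                // gather phase
            while (bars[i].arrived.load(std::memory_order_acquire) < epoch + 1)
                ;
        for (int i = 1; i < nproc; ++i)                                // release phase
            bars[i].go.store(epoch + 1, std::memory_order_release);
    }
}
#endif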
232 
233 // Tree barrier
234 static void
235 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
236  void (*reduce)(void *, void *)
237  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
238 {
239  KMP_TIME_BLOCK(KMP_tree_gather);
240  register kmp_team_t *team = this_thr->th.th_team;
241  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
242  register kmp_info_t **other_threads = team->t.t_threads;
243  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
244  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
245  register kmp_uint32 branch_factor = 1 << branch_bits;
246  register kmp_uint32 child;
247  register kmp_uint32 child_tid;
248  register kmp_uint new_state;
249 
250  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
251  gtid, team->t.t_id, tid, bt));
252  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
253 
254 #if USE_ITT_BUILD && USE_ITT_NOTIFY
255  // Barrier imbalance - save arrive time to the thread
256  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
257  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
258  }
259 #endif
260  // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
261  child_tid = (tid << branch_bits) + 1;
262  if (child_tid < nproc) {
263  // Parent threads wait for all their children to arrive
264  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
265  child = 1;
266  do {
267  register kmp_info_t *child_thr = other_threads[child_tid];
268  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
269 #if KMP_CACHE_MANAGE
270  // Prefetch next thread's arrived count
271  if (child+1 <= branch_factor && child_tid+1 < nproc)
272  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
273 #endif /* KMP_CACHE_MANAGE */
274  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
275  "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
276  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
277  &child_bar->b_arrived, new_state));
278  // Wait for child to arrive
279  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
280  flag.wait(this_thr, FALSE
281  USE_ITT_BUILD_ARG(itt_sync_obj) );
282 #if USE_ITT_BUILD && USE_ITT_NOTIFY
283  // Barrier imbalance - write min of the thread time and a child time to the thread.
284  if (__kmp_forkjoin_frames_mode == 2) {
285  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
286  child_thr->th.th_bar_min_time);
287  }
288 #endif
289  if (reduce) {
290  KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
291  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
292  team->t.t_id, child_tid));
293  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
294  }
295  child++;
296  child_tid++;
297  }
298  while (child <= branch_factor && child_tid < nproc);
299  }
300 
301  if (!KMP_MASTER_TID(tid)) { // Worker threads
302  register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
303 
304  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
305  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
306  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
307  &thr_bar->b_arrived, thr_bar->b_arrived,
308  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
309 
310  // Mark arrival to parent thread
311  /* After performing this write, a worker thread may not assume that the team is valid
312  any more - it could be deallocated by the master thread at any time. */
313  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
314  flag.release();
315  } else {
316  // Need to update the team arrived pointer if we are the master thread
317  if (nproc > 1) // New value was already computed above
318  team->t.t_bar[bt].b_arrived = new_state;
319  else
320  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
321  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
322  gtid, team->t.t_id, tid, team->t.t_id,
323  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
324  }
325  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
326  gtid, team->t.t_id, tid, bt));
327 }
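/* Illustrative sketch (editor's annotation, not part of the original source):
   the implicit tree used by the gather above. A thread's first child is
   (tid << branch_bits) + 1 and its parent is (tid - 1) >> branch_bits, so with
   branch_bits = 2 thread 0 owns children 1..4 and thread 1 owns children 5..8. */
#if 0
static inline int toy_tree_parent(int tid, int branch_bits)
{
    return (tid - 1) >> branch_bits;      // e.g. tid 7, branch_bits 2 -> parent 1
}
static inline int toy_tree_first_child(int tid, int branch_bits)
{
    return (tid << branch_bits) + 1;      // e.g. tid 1, branch_bits 2 -> first child 5
}
#endif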
328 
329 static void
330 __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
331  int propagate_icvs
332  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
333 {
334  KMP_TIME_BLOCK(KMP_tree_release);
335  register kmp_team_t *team;
336  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
337  register kmp_uint32 nproc;
338  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
339  register kmp_uint32 branch_factor = 1 << branch_bits;
340  register kmp_uint32 child;
341  register kmp_uint32 child_tid;
342 
343  // Perform a tree release for all of the threads that have been gathered
344  if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
345  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
346  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
347  // Wait for parent thread to release us
348  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
349  flag.wait(this_thr, TRUE
350  USE_ITT_BUILD_ARG(itt_sync_obj) );
351 #if USE_ITT_BUILD && USE_ITT_NOTIFY
352  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
353  // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
354  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
355  // Cancel wait on previous parallel region...
356  __kmp_itt_task_starting(itt_sync_obj);
357 
358  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
359  return;
360 
361  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
362  if (itt_sync_obj != NULL)
363  // Call prepare as early as possible for "new" barrier
364  __kmp_itt_task_finished(itt_sync_obj);
365  } else
366 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
367  // Early exit for reaping threads releasing forkjoin barrier
368  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
369  return;
370 
371  // The worker thread may now assume that the team is valid.
372  team = __kmp_threads[gtid]->th.th_team;
373  KMP_DEBUG_ASSERT(team != NULL);
374  tid = __kmp_tid_from_gtid(gtid);
375 
376  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
377  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
378  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
379  KMP_MB(); // Flush all pending memory write invalidates.
380  } else {
381  team = __kmp_threads[gtid]->th.th_team;
382  KMP_DEBUG_ASSERT(team != NULL);
383  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
384  gtid, team->t.t_id, tid, bt));
385  }
386  nproc = this_thr->th.th_team_nproc;
387  child_tid = (tid << branch_bits) + 1;
388 
389  if (child_tid < nproc) {
390  register kmp_info_t **other_threads = team->t.t_threads;
391  child = 1;
392  // Parent threads release all their children
393  do {
394  register kmp_info_t *child_thr = other_threads[child_tid];
395  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
396 #if KMP_CACHE_MANAGE
397  // Prefetch next thread's go count
398  if (child+1 <= branch_factor && child_tid+1 < nproc)
399  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
400 #endif /* KMP_CACHE_MANAGE */
401 
402 #if KMP_BARRIER_ICV_PUSH
403  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
404  if (propagate_icvs) {
405  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
406  team, child_tid, FALSE);
407  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
408  &team->t.t_implicit_task_taskdata[0].td_icvs);
409  }
410  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
411 #endif // KMP_BARRIER_ICV_PUSH
412  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
413  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
414  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
415  child_tid, &child_bar->b_go, child_bar->b_go,
416  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
417  // Release child from barrier
418  kmp_flag_64 flag(&child_bar->b_go, child_thr);
419  flag.release();
420  child++;
421  child_tid++;
422  }
423  while (child <= branch_factor && child_tid < nproc);
424  }
425  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
426  gtid, team->t.t_id, tid, bt));
427 }
428 
429 
430 // Hyper Barrier
431 static void
432 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
433  void (*reduce)(void *, void *)
434  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
435 {
436  KMP_TIME_BLOCK(KMP_hyper_gather);
437  register kmp_team_t *team = this_thr->th.th_team;
438  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
439  register kmp_info_t **other_threads = team->t.t_threads;
440  register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
441  register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
442  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
443  register kmp_uint32 branch_factor = 1 << branch_bits;
444  register kmp_uint32 offset;
445  register kmp_uint32 level;
446 
447  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
448  gtid, team->t.t_id, tid, bt));
449 
450  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
451 
452 #if USE_ITT_BUILD && USE_ITT_NOTIFY
453  // Barrier imbalance - save arrive time to the thread
454  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
455  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
456  }
457 #endif
458  /* Perform a hypercube-embedded tree gather to wait until all of the threads have
459  arrived, and reduce any required data as we go. */
460  kmp_flag_64 p_flag(&thr_bar->b_arrived);
461  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
462  {
463  register kmp_uint32 child;
464  register kmp_uint32 child_tid;
465 
466  if (((tid >> level) & (branch_factor - 1)) != 0) {
467  register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
468 
469  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
470  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
471  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
472  &thr_bar->b_arrived, thr_bar->b_arrived,
473  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
474  // Mark arrival to parent thread
475  /* After performing this write (in the last iteration of the enclosing for loop),
476  a worker thread may not assume that the team is valid any more - it could be
477  deallocated by the master thread at any time. */
478  p_flag.set_waiter(other_threads[parent_tid]);
479  p_flag.release();
480  break;
481  }
482 
483  // Parent threads wait for children to arrive
484  if (new_state == KMP_BARRIER_UNUSED_STATE)
485  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
486  for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
487  child++, child_tid+=(1 << level))
488  {
489  register kmp_info_t *child_thr = other_threads[child_tid];
490  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
491 #if KMP_CACHE_MANAGE
492  register kmp_uint32 next_child_tid = child_tid + (1 << level);
493  // Prefetch next thread's arrived count
494  if (child+1 < branch_factor && next_child_tid < num_threads)
495  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
496 #endif /* KMP_CACHE_MANAGE */
497  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
498  "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
499  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
500  &child_bar->b_arrived, new_state));
501  // Wait for child to arrive
502  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
503  c_flag.wait(this_thr, FALSE
504  USE_ITT_BUILD_ARG(itt_sync_obj) );
505 #if USE_ITT_BUILD && USE_ITT_NOTIFY
506  // Barrier imbalance - write min of the thread time and a child time to the thread.
507  if (__kmp_forkjoin_frames_mode == 2) {
508  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
509  child_thr->th.th_bar_min_time);
510  }
511 #endif
512  if (reduce) {
513  KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
514  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
515  team->t.t_id, child_tid));
516  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
517  }
518  }
519  }
520 
521  if (KMP_MASTER_TID(tid)) {
522  // Need to update the team arrived pointer if we are the master thread
523  if (new_state == KMP_BARRIER_UNUSED_STATE)
524  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
525  else
526  team->t.t_bar[bt].b_arrived = new_state;
527  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
528  gtid, team->t.t_id, tid, team->t.t_id,
529  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
530  }
531  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
532  gtid, team->t.t_id, tid, bt));
533 }
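/* Illustrative sketch (editor's annotation, not part of the original source):
   the hypercube indexing used by the hyper barrier. At a given level, a thread
   whose digit ((tid >> level) & (branch_factor - 1)) is non-zero reports to the
   subcube root obtained by clearing that digit and all lower digits. With
   branch_bits = 1: tid 7 -> 6 at level 0, tid 6 -> 4 at level 1, tid 4 -> 0 at level 2. */
#if 0
static inline int toy_hyper_parent(int tid, int level, int branch_bits)
{
    return tid & ~((1 << (level + branch_bits)) - 1);  // same mask as the gather loop
}
#endif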
534 
535 // The reverse versions seem to beat the forward versions overall
536 #define KMP_REVERSE_HYPER_BAR
537 static void
538 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
539  int propagate_icvs
540  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
541 {
542  KMP_TIME_BLOCK(KMP_hyper_release);
543  register kmp_team_t *team;
544  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
545  register kmp_info_t **other_threads;
546  register kmp_uint32 num_threads;
547  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
548  register kmp_uint32 branch_factor = 1 << branch_bits;
549  register kmp_uint32 child;
550  register kmp_uint32 child_tid;
551  register kmp_uint32 offset;
552  register kmp_uint32 level;
553 
554  /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
555  If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
556  order of the corresponding gather, otherwise threads are released in the same order. */
557  if (KMP_MASTER_TID(tid)) { // master
558  team = __kmp_threads[gtid]->th.th_team;
559  KMP_DEBUG_ASSERT(team != NULL);
560  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
561  gtid, team->t.t_id, tid, bt));
562 #if KMP_BARRIER_ICV_PUSH
563  if (propagate_icvs) { // master already has ICVs in final destination; copy
564  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
565  }
566 #endif
567  }
568  else { // Handle fork barrier workers who aren't part of a team yet
569  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
570  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
571  // Wait for parent thread to release us
572  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
573  flag.wait(this_thr, TRUE
574  USE_ITT_BUILD_ARG(itt_sync_obj) );
575 #if USE_ITT_BUILD && USE_ITT_NOTIFY
576  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
577  // In fork barrier where we could not get the object reliably
578  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
579  // Cancel wait on previous parallel region...
580  __kmp_itt_task_starting(itt_sync_obj);
581 
582  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
583  return;
584 
585  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
586  if (itt_sync_obj != NULL)
587  // Call prepare as early as possible for "new" barrier
588  __kmp_itt_task_finished(itt_sync_obj);
589  } else
590 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
591  // Early exit for reaping threads releasing forkjoin barrier
592  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
593  return;
594 
595  // The worker thread may now assume that the team is valid.
596  team = __kmp_threads[gtid]->th.th_team;
597  KMP_DEBUG_ASSERT(team != NULL);
598  tid = __kmp_tid_from_gtid(gtid);
599 
600  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
601  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
602  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
603  KMP_MB(); // Flush all pending memory write invalidates.
604  }
605  num_threads = this_thr->th.th_team_nproc;
606  other_threads = team->t.t_threads;
607 
608 #ifdef KMP_REVERSE_HYPER_BAR
609  // Count up to correct level for parent
610  for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
611  level+=branch_bits, offset<<=branch_bits);
612 
613  // Now go down from there
614  for (level-=branch_bits, offset>>=branch_bits; offset != 0;
615  level-=branch_bits, offset>>=branch_bits)
616 #else
617  // Go down the tree, level by level
618  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
619 #endif // KMP_REVERSE_HYPER_BAR
620  {
621 #ifdef KMP_REVERSE_HYPER_BAR
622  /* Now go in reverse order through the children, highest to lowest.
623  Initial setting of child is conservative here. */
624  child = num_threads >> ((level==0)?level:level-1);
625  for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
626  child>=1; child--, child_tid-=(1<<level))
627 #else
628  if (((tid >> level) & (branch_factor - 1)) != 0)
629  // No need to go lower than this, since this is the level parent would be notified
630  break;
631  // Iterate through children on this level of the tree
632  for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
633  child++, child_tid+=(1<<level))
634 #endif // KMP_REVERSE_HYPER_BAR
635  {
636  if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
637  else {
638  register kmp_info_t *child_thr = other_threads[child_tid];
639  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
640 #if KMP_CACHE_MANAGE
641  register kmp_uint32 next_child_tid = child_tid - (1 << level);
642  // Prefetch next thread's go count
643 # ifdef KMP_REVERSE_HYPER_BAR
644  if (child-1 >= 1 && next_child_tid < num_threads)
645 # else
646  if (child+1 < branch_factor && next_child_tid < num_threads)
647 # endif // KMP_REVERSE_HYPER_BAR
648  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
649 #endif /* KMP_CACHE_MANAGE */
650 
651 #if KMP_BARRIER_ICV_PUSH
652  if (propagate_icvs) // push my fixed ICVs to my child
653  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
654 #endif // KMP_BARRIER_ICV_PUSH
655 
656  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
657  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
658  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
659  child_tid, &child_bar->b_go, child_bar->b_go,
660  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
661  // Release child from barrier
662  kmp_flag_64 flag(&child_bar->b_go, child_thr);
663  flag.release();
664  }
665  }
666  }
667 #if KMP_BARRIER_ICV_PUSH
668  if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
669  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
670  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
671  }
672 #endif
673  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
674  gtid, team->t.t_id, tid, bt));
675 }
676 
677 // Hierarchical Barrier
678 
679 // Initialize thread barrier data
680 /* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
681  minimum amount of initialization required based on how the team has changed. Returns true if
682  leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
683  team size increases, threads already in the team will respond to on-core wakeup on their parent
684  thread, but threads newly added to the team will only be listening on their local b_go. */
685 static bool
686 __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
687  int gtid, int tid, kmp_team_t *team)
688 {
689  // Checks to determine if (re-)initialization is needed
690  bool uninitialized = thr_bar->team == NULL;
691  bool team_changed = team != thr_bar->team;
692  bool team_sz_changed = nproc != thr_bar->nproc;
693  bool tid_changed = tid != thr_bar->old_tid;
694  bool retval = false;
695 
696  if (uninitialized || team_sz_changed) {
697  __kmp_get_hierarchy(nproc, thr_bar);
698  }
699 
700  if (uninitialized || team_sz_changed || tid_changed) {
701  thr_bar->my_level = thr_bar->depth-1; // default for master
702  thr_bar->parent_tid = -1; // default for master
703  if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
704  kmp_uint32 d=0;
705  while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
706  kmp_uint32 rem;
707  if (d == thr_bar->depth-2) { // reached level right below the master
708  thr_bar->parent_tid = 0;
709  thr_bar->my_level = d;
710  break;
711  }
712  else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
713  // thread is not a subtree root at next level, so this is max
714  thr_bar->parent_tid = tid - rem;
715  thr_bar->my_level = d;
716  break;
717  }
718  ++d;
719  }
720  }
721  thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
722  thr_bar->old_tid = tid;
723  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
724  }
725  if (uninitialized || team_changed || tid_changed) {
726  thr_bar->team = team;
727  thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
728  retval = true;
729  }
730  if (uninitialized || team_sz_changed || tid_changed) {
731  thr_bar->nproc = nproc;
732  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
733  if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
734  if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
735  thr_bar->leaf_kids = nproc - tid - 1;
736  thr_bar->leaf_state = 0;
737  for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
738  }
739  return retval;
740 }
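/* Illustrative sketch (editor's annotation, not part of the original source):
   the byte layout behind leaf_state and offset above, assuming a little-endian
   64-bit flag as on x86. Leaf kid i owns byte (7 - i) of its parent's flag,
   which is what offset = 7 - (tid - parent_tid - 1) indexes. */
#if 0
#include <stdint.h>
static inline uint64_t toy_leaf_state(int leaf_kids)
{
    uint64_t state = 0;
    for (int i = 0; i < leaf_kids; ++i)
        ((char *)&state)[7 - i] = 1;       // same byte poke as the loop above
    return state;                          // e.g. 3 leaf kids -> bytes 7, 6, 5 set
}
#endif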
741 
742 static void
743 __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
744  int gtid, int tid, void (*reduce) (void *, void *)
745  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
746 {
747  KMP_TIME_BLOCK(KMP_hier_gather);
748  register kmp_team_t *team = this_thr->th.th_team;
749  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
750  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
751  register kmp_info_t **other_threads = team->t.t_threads;
752  register kmp_uint64 new_state;
753 
754  int level = team->t.t_level;
755  if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
756  if (this_thr->th.th_teams_size.nteams > 1)
757  ++level; // level was not increased in teams construct for team_of_masters
758  if (level == 1) thr_bar->use_oncore_barrier = 1;
759  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
760 
761  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
762  gtid, team->t.t_id, tid, bt));
763  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
764 
765 #if USE_ITT_BUILD && USE_ITT_NOTIFY
766  // Barrier imbalance - save arrive time to the thread
767  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
768  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
769  }
770 #endif
771 
772  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
773 
774  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
775  register kmp_int32 child_tid;
776  new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
777  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
778  if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
779  kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
780  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
781  flag.wait(this_thr, FALSE
782  USE_ITT_BUILD_ARG(itt_sync_obj) );
783  if (reduce) {
784  for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
785  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
786  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
787  team->t.t_id, child_tid));
788  (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
789  }
790  }
791  (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
792  }
793  // Next, wait for higher level children on each child's b_arrived flag
794  for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
795  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
796  if (last > nproc) last = nproc;
797  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
798  register kmp_info_t *child_thr = other_threads[child_tid];
799  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
800  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
801  "arrived(%p) == %u\n",
802  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
803  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
804  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
805  flag.wait(this_thr, FALSE
806  USE_ITT_BUILD_ARG(itt_sync_obj) );
807  if (reduce) {
808  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
809  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
810  team->t.t_id, child_tid));
811  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
812  }
813  }
814  }
815  }
816  else { // Blocktime is not infinite
817  for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
818  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
819  if (last > nproc) last = nproc;
820  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
821  register kmp_info_t *child_thr = other_threads[child_tid];
822  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
823  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
824  "arrived(%p) == %u\n",
825  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
826  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
827  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
828  flag.wait(this_thr, FALSE
829  USE_ITT_BUILD_ARG(itt_sync_obj) );
830  if (reduce) {
831  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
832  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
833  team->t.t_id, child_tid));
834  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
835  }
836  }
837  }
838  }
839  }
840  // All subordinates are gathered; now release parent if not master thread
841 
842  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
843  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
844  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
845  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
846  &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
847  /* Mark arrival to parent: After performing this write, a worker thread may not assume that
848  the team is valid any more - it could be deallocated by the master thread at any time. */
849  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
850  || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
851  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
852  flag.release();
853  }
854  else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
855  thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
856  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
857  flag.set_waiter(other_threads[thr_bar->parent_tid]);
858  flag.release();
859  }
860  } else { // Master thread needs to update the team's b_arrived value
861  team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
862  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
863  gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
864  }
865  // Is the team access below unsafe or just technically invalid?
866  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
867  gtid, team->t.t_id, tid, bt));
868 }
869 
870 static void
871 __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
872  int propagate_icvs
873  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
874 {
875  KMP_TIME_BLOCK(KMP_hier_release);
876  register kmp_team_t *team;
877  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
878  register kmp_uint32 nproc;
879  bool team_change = false; // indicates on-core barrier shouldn't be used
880 
881  if (KMP_MASTER_TID(tid)) {
882  team = __kmp_threads[gtid]->th.th_team;
883  KMP_DEBUG_ASSERT(team != NULL);
884  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
885  gtid, team->t.t_id, tid, bt));
886  }
887  else { // Worker threads
888  // Wait for parent thread to release me
889  if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
890  || thr_bar->my_level != 0 || thr_bar->team == NULL) {
891  // Use traditional method of waiting on my own b_go flag
892  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
893  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
894  flag.wait(this_thr, TRUE
895  USE_ITT_BUILD_ARG(itt_sync_obj) );
896  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
897  }
898  else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
899  // Wait on my "offset" bits on parent's b_go flag
900  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
901  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
902  bt, this_thr
903  USE_ITT_BUILD_ARG(itt_sync_obj) );
904  flag.wait(this_thr, TRUE
905  USE_ITT_BUILD_ARG(itt_sync_obj) );
906  if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
907  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
908  }
909  else { // Reset my bits on parent's b_go flag
910  ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
911  }
912  }
913  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
914  // Early exit for reaping threads releasing forkjoin barrier
915  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
916  return;
917  // The worker thread may now assume that the team is valid.
918  team = __kmp_threads[gtid]->th.th_team;
919  KMP_DEBUG_ASSERT(team != NULL);
920  tid = __kmp_tid_from_gtid(gtid);
921 
922  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
923  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
924  KMP_MB(); // Flush all pending memory write invalidates.
925  }
926 
927  int level = team->t.t_level;
928  if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
929  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
930  ++level; // level was not increased in teams construct for team_of_workers
931  if( this_thr->th.th_teams_size.nteams > 1 )
932  ++level; // level was not increased in teams construct for team_of_masters
933  }
934  if (level == 1) thr_bar->use_oncore_barrier = 1;
935  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
936  nproc = this_thr->th.th_team_nproc;
937 
938  // If the team size has increased, we still communicate with old leaves via oncore barrier.
939  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
940  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
941  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
942  // But if the entire team changes, we won't use oncore barrier at all
943  if (team_change) old_leaf_kids = 0;
944 
945 #if KMP_BARRIER_ICV_PUSH
946  if (propagate_icvs) {
947  if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
948  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
949  }
950  else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
951  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
952  // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
953  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
954  &thr_bar->parent_bar->th_fixed_icvs);
955  // non-leaves will get ICVs piggybacked with b_go via NGO store
956  }
957  else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
958  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
959  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
960  else // leaves copy parent's fixed ICVs directly to local ICV store
961  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
962  &thr_bar->parent_bar->th_fixed_icvs);
963  }
964  }
965 #endif // KMP_BARRIER_ICV_PUSH
966 
967  // Now, release my children
968  if (thr_bar->my_level) { // not a leaf
969  register kmp_int32 child_tid;
970  kmp_uint32 last;
971  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
972  if (KMP_MASTER_TID(tid)) { // do a flat release
973  // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
974  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
975  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
976  ngo_load(&thr_bar->th_fixed_icvs);
977  // This loops over all the threads skipping only the leaf nodes in the hierarchy
978  for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
979  register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
980  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
981  " go(%p): %u => %u\n",
982  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
983  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
984  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
985  // Use ngo store (if available) to both store ICVs and release child via child's b_go
986  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
987  }
988  ngo_sync();
989  }
990  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
991  // Now, release leaf children
992  if (thr_bar->leaf_kids) { // if there are any
993  // We test team_change on the off-chance that the level 1 team changed.
994  if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
995  if (old_leaf_kids) { // release old leaf kids
996  thr_bar->b_go |= old_leaf_state;
997  }
998  // Release new leaf kids
999  last = tid+thr_bar->skip_per_level[1];
1000  if (last > nproc) last = nproc;
1001  for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
1002  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1003  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1004  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1005  " T#%d(%d:%d) go(%p): %u => %u\n",
1006  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1007  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1008  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1009  // Release child using child's b_go flag
1010  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1011  flag.release();
1012  }
1013  }
1014  else { // Release all children at once with leaf_state bits on my own b_go flag
1015  thr_bar->b_go |= thr_bar->leaf_state;
1016  }
1017  }
1018  }
1019  else { // Blocktime is not infinite; do a simple hierarchical release
1020  for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1021  last = tid+thr_bar->skip_per_level[d+1];
1022  kmp_uint32 skip = thr_bar->skip_per_level[d];
1023  if (last > nproc) last = nproc;
1024  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1025  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1026  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1027  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1028  " go(%p): %u => %u\n",
1029  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1030  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1031  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1032  // Release child using child's b_go flag
1033  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1034  flag.release();
1035  }
1036  }
1037  }
1038 #if KMP_BARRIER_ICV_PUSH
1039  if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1040  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1041 #endif // KMP_BARRIER_ICV_PUSH
1042  }
1043  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1044  gtid, team->t.t_id, tid, bt));
1045 }
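/* Illustrative sketch (editor's annotation, not part of the original source):
   the on-core leaf release used above, modeled with a plain 64-bit word instead
   of kmp_flag_oncore. The parent wakes all of its leaves with one OR of
   leaf_state into its own b_go; each leaf polls only its own byte (its offset)
   and clears that byte when it proceeds. */
#if 0
#include <stdint.h>
static inline void toy_oncore_release_leaves(volatile uint64_t *parent_b_go,
                                             uint64_t leaf_state)
{
    *parent_b_go |= leaf_state;            // one store releases every leaf at once
}
static inline int toy_oncore_leaf_released(const volatile uint64_t *parent_b_go,
                                           int offset)
{
    return ((const volatile char *)parent_b_go)[offset] != 0;  // poll my byte only
}
#endif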
1046 
1047 // ---------------------------- End of Barrier Algorithms ----------------------------
1048 
1049 // Internal function to do a barrier.
1050 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1051  If reduce is non-NULL, do a split reduction barrier; otherwise, do a split barrier without a reduction.
1052  Returns 0 if master thread, 1 if worker thread. */
1053 int
1054 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1055  void *reduce_data, void (*reduce)(void *, void *))
1056 {
1057  KMP_TIME_BLOCK(KMP_barrier);
1058  register int tid = __kmp_tid_from_gtid(gtid);
1059  register kmp_info_t *this_thr = __kmp_threads[gtid];
1060  register kmp_team_t *team = this_thr->th.th_team;
1061  register int status = 0;
1062  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1063 
1064  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1065  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1066 
1067  if (! team->t.t_serialized) {
1068 #if USE_ITT_BUILD
1069  // This value will be used in itt notify events below.
1070  void *itt_sync_obj = NULL;
1071 # if USE_ITT_NOTIFY
1072  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1073  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1074 # endif
1075 #endif /* USE_ITT_BUILD */
1076  if (__kmp_tasking_mode == tskm_extra_barrier) {
1077  __kmp_tasking_barrier(team, this_thr, gtid);
1078  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1079  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1080  }
1081 
1082  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1083  the team struct is not guaranteed to exist. */
1084  // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1085  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1086  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1087  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1088  }
1089 
1090 #if USE_ITT_BUILD
1091  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1092  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1093 #endif /* USE_ITT_BUILD */
1094 #if USE_DEBUGGER
1095  // Let the debugger know: the thread arrived at the barrier and is waiting.
1096  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1097  team->t.t_bar[bt].b_master_arrived += 1;
1098  } else {
1099  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1100  } // if
1101 #endif /* USE_DEBUGGER */
1102  if (reduce != NULL) {
1103  //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1104  this_thr->th.th_local.reduce_data = reduce_data;
1105  }
1106  switch (__kmp_barrier_gather_pattern[bt]) {
1107  case bp_hyper_bar: {
1108  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1109  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1110  USE_ITT_BUILD_ARG(itt_sync_obj) );
1111  break;
1112  }
1113  case bp_hierarchical_bar: {
1114  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1115  USE_ITT_BUILD_ARG(itt_sync_obj));
1116  break;
1117  }
1118  case bp_tree_bar: {
1119  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1120  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1121  USE_ITT_BUILD_ARG(itt_sync_obj) );
1122  break;
1123  }
1124  default: {
1125  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1126  USE_ITT_BUILD_ARG(itt_sync_obj) );
1127  }
1128  }
1129 
1130  KMP_MB();
1131 
1132  if (KMP_MASTER_TID(tid)) {
1133  status = 0;
1134  if (__kmp_tasking_mode != tskm_immediate_exec) {
1135  __kmp_task_team_wait(this_thr, team
1136  USE_ITT_BUILD_ARG(itt_sync_obj) );
1137  __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
1138  }
1139 #if USE_DEBUGGER
1140  // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1141  team->t.t_bar[bt].b_team_arrived += 1;
1142 #endif
1143 
1144 #if USE_ITT_BUILD
1145  /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1146  before the final summation into the shared variable is done (final summation can be a
1147  long operation for array reductions). */
1148  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1149  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1150 #endif /* USE_ITT_BUILD */
1151 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1152  // Barrier - report frame end (only if active_level == 1)
1153  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1154 #if OMP_40_ENABLED
1155  this_thr->th.th_teams_microtask == NULL &&
1156 #endif
1157  team->t.t_active_level == 1)
1158  {
1159  kmp_uint64 cur_time = __itt_get_timestamp();
1160  kmp_info_t **other_threads = team->t.t_threads;
1161  int nproc = this_thr->th.th_team_nproc;
1162  int i;
1163  switch(__kmp_forkjoin_frames_mode) {
1164  case 1:
1165  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1166  this_thr->th.th_frame_time = cur_time;
1167  break;
1168  case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1169  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1170  break;
1171  case 3:
1172  if( __itt_metadata_add_ptr ) {
1173  // Initialize with master's wait time
1174  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1175  for (i=1; i<nproc; ++i) {
1176  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1177  }
1178  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1179  }
1180  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1181  this_thr->th.th_frame_time = cur_time;
1182  break;
1183  }
1184  }
1185 #endif /* USE_ITT_BUILD */
1186  } else {
1187  status = 1;
1188 #if USE_ITT_BUILD
1189  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1190  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1191 #endif /* USE_ITT_BUILD */
1192  }
1193  if (status == 1 || ! is_split) {
1194  switch (__kmp_barrier_release_pattern[bt]) {
1195  case bp_hyper_bar: {
1196  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1197  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1198  USE_ITT_BUILD_ARG(itt_sync_obj) );
1199  break;
1200  }
1201  case bp_hierarchical_bar: {
1202  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1203  USE_ITT_BUILD_ARG(itt_sync_obj) );
1204  break;
1205  }
1206  case bp_tree_bar: {
1207  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1208  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1209  USE_ITT_BUILD_ARG(itt_sync_obj) );
1210  break;
1211  }
1212  default: {
1213  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1214  USE_ITT_BUILD_ARG(itt_sync_obj) );
1215  }
1216  }
1217  if (__kmp_tasking_mode != tskm_immediate_exec) {
1218  __kmp_task_team_sync(this_thr, team);
1219  }
1220  }
1221 
1222 #if USE_ITT_BUILD
1223  /* GEH: TODO: Move this under if-condition above and also include in
1224  __kmp_end_split_barrier(). This will more accurately represent the actual release time
1225  of the threads for split barriers. */
1226  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1227  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1228 #endif /* USE_ITT_BUILD */
1229  } else { // Team is serialized.
1230  status = 0;
1231  if (__kmp_tasking_mode != tskm_immediate_exec) {
1232 #if OMP_41_ENABLED
1233  if ( this_thr->th.th_task_team != NULL ) {
1234  void *itt_sync_obj = NULL;
1235 #if USE_ITT_NOTIFY
1236  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1237  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1238  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1239  }
1240 #endif
1241 
1242  kmp_task_team_t * task_team = this_thr->th.th_task_team;
1243  KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
1244  __kmp_task_team_wait(this_thr, team
1245  USE_ITT_BUILD_ARG(itt_sync_obj));
1246  __kmp_task_team_setup(this_thr, team, 0, 0);
1247 
1248 #if USE_ITT_BUILD
1249  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1250  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1251 #endif /* USE_ITT_BUILD */
1252  }
1253 #else
1254  // The task team should be NULL for serialized code (tasks will be executed immediately)
1255  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1256  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1257 #endif
1258  }
1259  }
1260  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1261  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1262  return status;
1263 }
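Each release routine dispatched in the switch above (and the matching gather routine earlier in __kmp_barrier) implements one half of the same two-phase scheme: workers signal arrival and the master waits for all of them, then the master lets everyone go. Below is a minimal sketch of that shape, using std::atomic and hypothetical names (simple_barrier, arrive_and_wait) rather than anything from this runtime; the real gather phase additionally folds in the optional reduce callback, which the sketch omits.

// Illustrative sketch only (not part of kmp_barrier.cpp): a centralized
// two-phase barrier. The gather phase counts arrivals; the release phase bumps a
// generation counter that waiting threads spin on, so the barrier is reusable.
#include <atomic>
#include <cstddef>

struct simple_barrier {
    std::atomic<std::size_t> arrived{0};   // gather: number of threads that have arrived
    std::atomic<unsigned>    go{0};        // release: generation counter workers wait on
    std::size_t              nthreads;

    explicit simple_barrier(std::size_t n) : nthreads(n) {}

    // Returns true on the thread that completed the gather (the "master" of this episode).
    bool arrive_and_wait() {
        unsigned gen = go.load(std::memory_order_acquire);
        if (arrived.fetch_add(1, std::memory_order_acq_rel) + 1 == nthreads) {
            arrived.store(0, std::memory_order_relaxed);  // reset before releasing
            go.fetch_add(1, std::memory_order_release);   // release phase: wake the spinners
            return true;
        }
        while (go.load(std::memory_order_acquire) == gen) { /* spin */ }
        return false;
    }
};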
1264 
1265 
1266 void
1267 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1268 {
1269  KMP_TIME_BLOCK(KMP_end_split_barrier);
1270  int tid = __kmp_tid_from_gtid(gtid);
1271  kmp_info_t *this_thr = __kmp_threads[gtid];
1272  kmp_team_t *team = this_thr->th.th_team;
1273 
1274  if (!team->t.t_serialized) {
1275  if (KMP_MASTER_GTID(gtid)) {
1276  switch (__kmp_barrier_release_pattern[bt]) {
1277  case bp_hyper_bar: {
1278  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1279  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1280  USE_ITT_BUILD_ARG(NULL) );
1281  break;
1282  }
1283  case bp_hierarchical_bar: {
1284  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1285  USE_ITT_BUILD_ARG(NULL));
1286  break;
1287  }
1288  case bp_tree_bar: {
1289  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1290  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1291  USE_ITT_BUILD_ARG(NULL) );
1292  break;
1293  }
1294  default: {
1295  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1296  USE_ITT_BUILD_ARG(NULL) );
1297  }
1298  }
1299  if (__kmp_tasking_mode != tskm_immediate_exec) {
1300  __kmp_task_team_sync(this_thr, team);
1301  } // if
1302  }
1303  }
1304 }
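For a split barrier, __kmp_barrier returns to the master with status 1 without performing the release, and __kmp_end_split_barrier above supplies the deferred release once the master is ready. A sketch of that split follows, again with hypothetical names (split_barrier, arrive, release) and none of the tasking or ITT bookkeeping; in the runtime it is always the master thread, not merely the last arriver, that owns the release.

// Illustrative sketch only: the same two-phase barrier as above, but with the
// release factored out so the gather ("arrive") and the release can be separated
// by master-only work, which is what the split-barrier path defers.
#include <atomic>
#include <cstddef>

struct split_barrier {
    std::atomic<std::size_t> arrived{0};
    std::atomic<unsigned>    go{0};
    std::size_t              nthreads;

    explicit split_barrier(std::size_t n) : nthreads(n) {}

    // Gather only. Returns true on the thread that completed the gather, which
    // owns the deferred release; every other thread blocks until release() is called.
    bool arrive() {
        unsigned gen = go.load(std::memory_order_acquire);
        if (arrived.fetch_add(1, std::memory_order_acq_rel) + 1 == nthreads) {
            arrived.store(0, std::memory_order_relaxed);
            return true;                                   // caller must release() later
        }
        while (go.load(std::memory_order_acquire) == gen) { /* spin */ }
        return false;
    }

    // Deferred release, e.g. after the master has finished its serial section.
    void release() { go.fetch_add(1, std::memory_order_release); }
};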
1305 
1306 
1307 void
1308 __kmp_join_barrier(int gtid)
1309 {
1310  KMP_TIME_BLOCK(KMP_join_barrier);
1311  register kmp_info_t *this_thr = __kmp_threads[gtid];
1312  register kmp_team_t *team;
1313  register kmp_uint nproc;
1314  kmp_info_t *master_thread;
1315  int tid;
1316 #ifdef KMP_DEBUG
1317  int team_id;
1318 #endif /* KMP_DEBUG */
1319 #if USE_ITT_BUILD
1320  void *itt_sync_obj = NULL;
1321 # if USE_ITT_NOTIFY
1322  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1323  // Get object created at fork_barrier
1324  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1325 # endif
1326 #endif /* USE_ITT_BUILD */
1327  KMP_MB();
1328 
1329  // Get current info
1330  team = this_thr->th.th_team;
1331  nproc = this_thr->th.th_team_nproc;
1332  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1333  tid = __kmp_tid_from_gtid(gtid);
1334 #ifdef KMP_DEBUG
1335  team_id = team->t.t_id;
1336 #endif /* KMP_DEBUG */
1337  master_thread = this_thr->th.th_team_master;
1338 #ifdef KMP_DEBUG
1339  if (master_thread != team->t.t_threads[0]) {
1340  __kmp_print_structure();
1341  }
1342 #endif /* KMP_DEBUG */
1343  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1344  KMP_MB();
1345 
1346  // Verify state
1347  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1348  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1349  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1350  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1351  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1352 
1353  if (__kmp_tasking_mode == tskm_extra_barrier) {
1354  __kmp_tasking_barrier(team, this_thr, gtid);
1355  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1356  }
1357 # ifdef KMP_DEBUG
1358  if (__kmp_tasking_mode != tskm_immediate_exec) {
1359  KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1360  __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1361  this_thr->th.th_task_team));
1362  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1363  }
1364 # endif /* KMP_DEBUG */
1365 
1366  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1367  team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1368  down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1369  since the values are not used by __kmp_wait_template() in that case. */
1370  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1371  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1372  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1373  }
1374 
1375 #if USE_ITT_BUILD
1376  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1377  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1378 #endif /* USE_ITT_BUILD */
1379 
1380  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1381  case bp_hyper_bar: {
1382  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1383  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1384  USE_ITT_BUILD_ARG(itt_sync_obj) );
1385  break;
1386  }
1387  case bp_hierarchical_bar: {
1388  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1389  USE_ITT_BUILD_ARG(itt_sync_obj) );
1390  break;
1391  }
1392  case bp_tree_bar: {
1393  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1394  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1395  USE_ITT_BUILD_ARG(itt_sync_obj) );
1396  break;
1397  }
1398  default: {
1399  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1400  USE_ITT_BUILD_ARG(itt_sync_obj) );
1401  }
1402  }
1403 
1404  /* From this point on, the team data structure may be deallocated at any time by the
1405  master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1406  data items that need to be referenced before the end of the barrier should be moved to
1407  the kmp_task_team_t structs. */
1408  if (KMP_MASTER_TID(tid)) {
1409  if (__kmp_tasking_mode != tskm_immediate_exec) {
1410  // Master shouldn't call decrease_load(). // TODO: enable master threads.
1411  // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1412  __kmp_task_team_wait(this_thr, team
1413  USE_ITT_BUILD_ARG(itt_sync_obj) );
1414  }
1415 #if USE_ITT_BUILD
1416  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1417  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1418 #endif /* USE_ITT_BUILD */
1419 
1420 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1421  // Join barrier - report frame end
1422  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1423 #if OMP_40_ENABLED
1424  this_thr->th.th_teams_microtask == NULL &&
1425 #endif
1426  team->t.t_active_level == 1)
1427  {
1428  kmp_uint64 cur_time = __itt_get_timestamp();
1429  ident_t * loc = team->t.t_ident;
1430  kmp_info_t **other_threads = team->t.t_threads;
1431  int nproc = this_thr->th.th_team_nproc;
1432  int i;
1433  switch(__kmp_forkjoin_frames_mode) {
1434  case 1:
1435  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1436  break;
1437  case 2:
1438  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1439  break;
1440  case 3:
1441  if( __itt_metadata_add_ptr ) {
1442  // Initialize with master's wait time
1443  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1444  for (i=1; i<nproc; ++i) {
1445  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1446  }
1447  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1448  }
1449  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1450  this_thr->th.th_frame_time = cur_time;
1451  break;
1452  }
1453  }
1454 # endif /* USE_ITT_BUILD */
1455  }
1456 #if USE_ITT_BUILD
1457  else {
1458  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1459  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1460  }
1461 #endif /* USE_ITT_BUILD */
1462 
1463 #if KMP_DEBUG
1464  if (KMP_MASTER_TID(tid)) {
1465  KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1466  gtid, team_id, tid, nproc));
1467  }
1468 #endif /* KMP_DEBUG */
1469 
1470  // TODO now, mark worker threads as done so they may be disbanded
1471  KMP_MB(); // Flush all pending memory write invalidates.
1472  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1473 }
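The imbalance value reported in frames mode 3 above is the total wait time accumulated across the team: for each thread, the gap between the barrier end timestamp and that thread's arrival timestamp, summed over all nproc threads (the master contributes the i == 0 term). Here is a minimal sketch of that arithmetic with plain arrays in place of the kmp_info_t structures; compute_barrier_imbalance is a hypothetical name.

// Illustrative sketch only: the accumulation behind the delta passed to
// __kmp_itt_metadata_imbalance(), reduced to raw 64-bit timestamps.
#include <cstddef>
#include <cstdint>

std::uint64_t compute_barrier_imbalance(const std::uint64_t *arrive_time, // one entry per thread
                                        std::size_t nproc,
                                        std::uint64_t cur_time) {         // barrier end timestamp
    std::uint64_t delta = 0;
    for (std::size_t i = 0; i < nproc; ++i)
        delta += cur_time - arrive_time[i];  // per-thread wait inside the barrier
    return delta;
}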
1474 
1475 
1476 // TODO release worker threads' fork barriers as we are ready instead of all at once
1477 void
1478 __kmp_fork_barrier(int gtid, int tid)
1479 {
1480  KMP_TIME_BLOCK(KMP_fork_barrier);
1481  kmp_info_t *this_thr = __kmp_threads[gtid];
1482  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1483 #if USE_ITT_BUILD
1484  void * itt_sync_obj = NULL;
1485 #endif /* USE_ITT_BUILD */
1486 
1487  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1488  gtid, (team != NULL) ? team->t.t_id : -1, tid));
1489 
1490  // th_team pointer only valid for master thread here
1491  if (KMP_MASTER_TID(tid)) {
1492 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1493  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1494  // Create itt barrier object
1495  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1496  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1497  }
1498 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1499 
1500 #ifdef KMP_DEBUG
1501  register kmp_info_t **other_threads = team->t.t_threads;
1502  register int i;
1503 
1504  // Verify state
1505  KMP_MB();
1506 
1507  for(i=1; i<team->t.t_nproc; ++i) {
1508  KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1509  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1510  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1511  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1512  KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1513  & ~(KMP_BARRIER_SLEEP_STATE))
1514  == KMP_INIT_BARRIER_STATE);
1515  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1516  }
1517 #endif
1518 
1519  if (__kmp_tasking_mode != tskm_immediate_exec) {
1520  __kmp_task_team_setup(this_thr, team, 1, 0); // 1 indicates setup both task teams
1521  }
1522 
1523  /* The master thread may have changed its blocktime between the join barrier and the
1524  fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1525  access it when the team struct is not guaranteed to exist. */
1526  // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1527  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1528  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1529  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1530  }
1531  } // master
1532 
1533  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1534  case bp_hyper_bar: {
1535  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1536  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1537  USE_ITT_BUILD_ARG(itt_sync_obj) );
1538  break;
1539  }
1540  case bp_hierarchical_bar: {
1541  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1542  USE_ITT_BUILD_ARG(itt_sync_obj) );
1543  break;
1544  }
1545  case bp_tree_bar: {
1546  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1547  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1548  USE_ITT_BUILD_ARG(itt_sync_obj) );
1549  break;
1550  }
1551  default: {
1552  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1553  USE_ITT_BUILD_ARG(itt_sync_obj) );
1554  }
1555  }
1556 
1557  // Early exit for reaping threads releasing forkjoin barrier
1558  if (TCR_4(__kmp_global.g.g_done)) {
1559  if (this_thr->th.th_task_team != NULL) {
1560  if (KMP_MASTER_TID(tid)) {
1561  TCW_PTR(this_thr->th.th_task_team, NULL);
1562  }
1563  else {
1564  __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1565  }
1566  }
1567 
1568 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1569  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1570  if (!KMP_MASTER_TID(tid)) {
1571  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1572  if (itt_sync_obj)
1573  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1574  }
1575  }
1576 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1577  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1578  return;
1579  }
1580 
1581  /* We can now assume that a valid team structure has been allocated by the master and
1582  propagated to all worker threads. The current thread, however, may not be part of the
1583  team, so we can't blindly assume that the team pointer is non-null. */
1584  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1585  KMP_DEBUG_ASSERT(team != NULL);
1586  tid = __kmp_tid_from_gtid(gtid);
1587 
1588 
1589 #if KMP_BARRIER_ICV_PULL
1590  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1591  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1592  this data before this function is called. We cannot modify __kmp_fork_call() to look at
1593  the fixed ICVs in the master's thread struct, because it is not always the case that the
1594  threads arrays have been allocated when __kmp_fork_call() is executed. */
1595  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1596  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1597  // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1598  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1599  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1600  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1601  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1602  }
1603  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1604 #endif // KMP_BARRIER_ICV_PULL
1605 
1606  if (__kmp_tasking_mode != tskm_immediate_exec) {
1607  __kmp_task_team_sync(this_thr, team);
1608  }
1609 
1610 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1611  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1612  if (proc_bind == proc_bind_intel) {
1613 #endif
1614 #if KMP_AFFINITY_SUPPORTED
1615  // Call dynamic affinity settings
1616  if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1617  __kmp_balanced_affinity(tid, team->t.t_nproc);
1618  }
1619 #endif // KMP_AFFINITY_SUPPORTED
1620 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1621  }
1622  else if (proc_bind != proc_bind_false) {
1623  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1624  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1625  __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1626  }
1627  else {
1628  __kmp_affinity_set_place(gtid);
1629  }
1630  }
1631 #endif
1632 
1633 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1634  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1635  if (!KMP_MASTER_TID(tid)) {
1636  // Get correct barrier object
1637  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1638  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1639  } // (prepare called inside barrier_release)
1640  }
1641 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1642  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1643 }
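The debug check in this function masks KMP_BARRIER_SLEEP_STATE out of b_go before comparing it with KMP_INIT_BARRIER_STATE because the go flag doubles as a sleep indicator: a waiter that gives up spinning sets a bit in the same word so the releasing thread knows it must actually wake it. Below is a generic sketch of that idea, with hypothetical names and bit values (go_flag, SLEEP_BIT) and a condition variable standing in for the runtime's suspend/resume machinery.

// Illustrative sketch only: a go flag whose value carries the barrier state and
// whose top bit records that the waiter went to sleep, so release() only pays for
// an OS wakeup when one is needed. State checks mask the sleep bit out.
#include <atomic>
#include <condition_variable>
#include <mutex>

constexpr unsigned SLEEP_BIT = 1u << 31;   // waiter is blocked and must be notified

struct go_flag {
    std::atomic<unsigned>   value{0};
    std::mutex              mtx;
    std::condition_variable cv;

    bool released(unsigned released_state) const {
        return (value.load(std::memory_order_acquire) & ~SLEEP_BIT) == released_state;
    }

    // Waiter: spin for a while, then advertise the sleep bit and block.
    void wait(unsigned released_state) {
        for (int i = 0; i < 1000; ++i)
            if (released(released_state))
                return;
        std::unique_lock<std::mutex> lk(mtx);
        value.fetch_or(SLEEP_BIT, std::memory_order_acq_rel);
        cv.wait(lk, [&] { return released(released_state); });
    }

    // Releaser: publish the new state; wake the waiter only if it advertised sleep.
    void release(unsigned released_state) {
        unsigned old = value.exchange(released_state, std::memory_order_acq_rel);
        if (old & SLEEP_BIT) {
            std::lock_guard<std::mutex> lk(mtx);
            cv.notify_all();
        }
    }
};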
1644 
1645 
1646 void
1647 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1648 {
1649  KMP_TIME_BLOCK(KMP_setup_icv_copy);
1650  int f;
1651 
1652  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1653  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1654 
1655  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1656  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1657  this data before this function is called. */
1658 #if KMP_BARRIER_ICV_PULL
1659  /* Copy the ICVs into th_fixed_icvs in the master's thread structure (which remains untouched), where
1660  all of the worker threads can access them and make their own copies after the barrier. */
1661  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1662  copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1663  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1664  0, team->t.t_threads[0], team));
1665 #elif KMP_BARRIER_ICV_PUSH
1666  // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1667  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1668  0, team->t.t_threads[0], team));
1669 #else
1670  // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1671  ngo_load(new_icvs);
1672  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1673  for (f=1; f<new_nproc; ++f) { // Skip the master thread
1674  // TODO: GEH - pass in better source location info since usually NULL here
1675  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1676  f, team->t.t_threads[f], team));
1677  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1678  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1679  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1680  f, team->t.t_threads[f], team));
1681  }
1682  ngo_sync();
1683 #endif // KMP_BARRIER_ICV_PULL
1684 }
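Of the three strategies above, the pull variant is the cheapest on the master: it stores the new ICVs once into th_fixed_icvs, and each worker copies them into its own implicit task after coming out of the fork barrier (the code guarded by KMP_BARRIER_ICV_PULL in __kmp_fork_barrier). A minimal sketch of that division of labor follows, using hypothetical stand-ins (icv_set, team_sketch, publish, pull) rather than the runtime's types.

// Illustrative sketch only: ICV propagation in "pull" style. The master publishes
// one copy; every worker copies it for itself after the fork barrier, keeping the
// master-side cost independent of the team size.
#include <cstddef>
#include <vector>

struct icv_set {                 // stand-in for kmp_internal_control_t
    int nproc;
    int dynamic;
    int blocktime;
};

struct team_sketch {
    icv_set fixed_icvs{};             // stand-in for the master's th_fixed_icvs
    std::vector<icv_set> task_icvs;   // stand-in for td_icvs, one per implicit task

    explicit team_sketch(std::size_t n) : task_icvs(n) {}

    // Master, before the workers are released from the fork barrier.
    void publish(const icv_set &new_icvs) { fixed_icvs = new_icvs; }

    // Worker tid, after it has been released from the fork barrier.
    void pull(std::size_t tid) { task_icvs[tid] = fixed_icvs; }
};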