Intel® OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  * $Revision: 43473 $
4  * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2014 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "kmp.h"
38 #include "kmp_wait_release.h"
39 #include "kmp_stats.h"
40 #include "kmp_itt.h"
41 
42 #if KMP_MIC
43 #include <immintrin.h>
44 #define USE_NGO_STORES 1
45 #endif // KMP_MIC
46 
47 #if KMP_MIC && USE_NGO_STORES
48 // ICV copying
49 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
50 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
51 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
52 #define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
53 #else
54 #define ngo_load(src) ((void)0)
55 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
56 #define ngo_store_go(dst, src) memcpy((dst), (src), CACHE_LINE)
57 #define ngo_sync() ((void)0)
58 #endif /* KMP_MIC && USE_NGO_STORES */
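/* Usage sketch (illustrative, not part of the original source): the ngo_* macros implement a
   load-once / store-many pattern for broadcasting a cache line of ICVs from the master to its
   children, as in __kmp_linear_barrier_release() below:

       ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);          // read the source line once
       for (i = 1; i < nproc; ++i)
           ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, // non-temporal store per child
                          &team->t.t_implicit_task_taskdata[0].td_icvs);
       ngo_sync();                                                      // fence before releasing children

   On KMP_MIC builds with USE_NGO_STORES these expand to 512-bit non-globally-ordered stores;
   elsewhere they fall back to copy_icvs()/memcpy() with no-op load/sync. */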
59 
60 void __kmp_print_structure(void); // Forward declaration
61 
62 // ---------------------------- Barrier Algorithms ----------------------------
63 
64 // Linear Barrier
65 static void
66 __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
67  void (*reduce)(void *, void *)
68  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
69 {
70  KMP_TIME_BLOCK(KMP_linear_gather);
71  register kmp_team_t *team = this_thr->th.th_team;
72  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
73  register kmp_info_t **other_threads = team->t.t_threads;
74 
75  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
76  gtid, team->t.t_id, tid, bt));
77  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
78 
79 #if USE_ITT_BUILD && USE_ITT_NOTIFY
80  // Barrier imbalance - save arrive time to the thread
81  if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
82  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
83  }
84 #endif
85  // We now perform a linear gather: workers signal arrival to the master, which collects all arrivals (and any reduction data) in turn.
86  if (!KMP_MASTER_TID(tid)) {
87  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
88  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
89  __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
90  thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
91  // Mark arrival to master thread
92  /* After performing this write, a worker thread may not assume that the team is valid
93  any more - it could be deallocated by the master thread at any time. */
94  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
95  flag.release();
96  } else {
97  register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
98  register int nproc = this_thr->th.th_team_nproc;
99  register int i;
100  // No need to worry about the sleep bit or atomics here, since this is a team-level setting
101  register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
102 
103  // Collect all the worker team member threads.
104  for (i=1; i<nproc; ++i) {
105 #if KMP_CACHE_MANAGE
106  // Prefetch next thread's arrived count
107  if (i+1 < nproc)
108  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
109 #endif /* KMP_CACHE_MANAGE */
110  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
111  "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
112  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
113  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
114 
115  // Wait for worker thread to arrive
116  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
117  flag.wait(this_thr, FALSE
118  USE_ITT_BUILD_ARG(itt_sync_obj) );
119 #if USE_ITT_BUILD && USE_ITT_NOTIFY
120  // Barrier imbalance - write min of the thread time and the other thread time to the thread.
121  if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
122  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
123  other_threads[i]->th.th_bar_min_time);
124  }
125 #endif
126  if (reduce) {
127  KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
128  team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
129  (*reduce)(this_thr->th.th_local.reduce_data,
130  other_threads[i]->th.th_local.reduce_data);
131  }
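 /* Illustrative sketch (not part of the original source): the 'reduce' callback combines the
    second argument's reduce_data into the first argument's reduce_data in place. For an
    integer sum it could look like the following (hypothetical helper, name is ours):

        static void example_int_sum_reduce(void *lhs_data, void *rhs_data) {
            *(int *)lhs_data += *(int *)rhs_data;   // accumulate the worker's partial result
        }
 */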
132  }
133  // No need to worry about the sleep bit or atomics here, since this is a team-level setting
134  team_bar->b_arrived = new_state;
135  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
136  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
137  }
138  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
139  gtid, team->t.t_id, tid, bt));
140 }
141 
142 static void
143 __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
144  int propagate_icvs
145  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
146 {
147  KMP_TIME_BLOCK(KMP_linear_release);
148  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
149  register kmp_team_t *team;
150 
151  if (KMP_MASTER_TID(tid)) {
152  register unsigned int i;
153  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
154  register kmp_info_t **other_threads;
155 
156  team = __kmp_threads[gtid]->th.th_team;
157  KMP_DEBUG_ASSERT(team != NULL);
158  other_threads = team->t.t_threads;
159 
160  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
161  gtid, team->t.t_id, tid, bt));
162 
163  if (nproc > 1) {
164 #if KMP_BARRIER_ICV_PUSH
165  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
166  if (propagate_icvs) {
167  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
168  for (i=1; i<nproc; ++i) {
169  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
170  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
171  &team->t.t_implicit_task_taskdata[0].td_icvs);
172  }
173  ngo_sync();
174  }
175  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
176 #endif // KMP_BARRIER_ICV_PUSH
177 
178  // Now, release all of the worker threads
179  for (i=1; i<nproc; ++i) {
180 #if KMP_CACHE_MANAGE
181  // Prefetch next thread's go flag
182  if (i+1 < nproc)
183  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
184 #endif /* KMP_CACHE_MANAGE */
185  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
186  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
187  other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
188  &other_threads[i]->th.th_bar[bt].bb.b_go,
189  other_threads[i]->th.th_bar[bt].bb.b_go,
190  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
191  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
192  flag.release();
193  }
194  }
195  } else { // Wait for the MASTER thread to release us
196  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
197  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
198  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
199  flag.wait(this_thr, TRUE
200  USE_ITT_BUILD_ARG(itt_sync_obj) );
201 #if USE_ITT_BUILD && USE_ITT_NOTIFY
202  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
203  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
204  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
205  // Cancel wait on previous parallel region...
206  __kmp_itt_task_starting(itt_sync_obj);
207 
208  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
209  return;
210 
211  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
212  if (itt_sync_obj != NULL)
213  // Call prepare as early as possible for "new" barrier
214  __kmp_itt_task_finished(itt_sync_obj);
215  } else
216 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
217  // Early exit for reaping threads releasing forkjoin barrier
218  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
219  return;
220  // The worker thread may now assume that the team is valid.
221 #ifdef KMP_DEBUG
222  tid = __kmp_tid_from_gtid(gtid);
223  team = __kmp_threads[gtid]->th.th_team;
224 #endif
225  KMP_DEBUG_ASSERT(team != NULL);
226  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
227  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
228  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
229  KMP_MB(); // Flush all pending memory write invalidates.
230  }
231  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
232  gtid, team->t.t_id, tid, bt));
233 }
234 
235 // Tree barrier
236 static void
237 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
238  void (*reduce)(void *, void *)
239  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
240 {
241  KMP_TIME_BLOCK(KMP_tree_gather);
242  register kmp_team_t *team = this_thr->th.th_team;
243  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
244  register kmp_info_t **other_threads = team->t.t_threads;
245  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
246  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
247  register kmp_uint32 branch_factor = 1 << branch_bits;
248  register kmp_uint32 child;
249  register kmp_uint32 child_tid;
250  register kmp_uint new_state;
251 
252  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
253  gtid, team->t.t_id, tid, bt));
254  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
255 
256 #if USE_ITT_BUILD && USE_ITT_NOTIFY
257  // Barrier imbalance - save arrive time to the thread
258  if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
259  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
260  }
261 #endif
262  // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
263  child_tid = (tid << branch_bits) + 1;
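 /* Indexing sketch (illustrative, not part of the original source): with branch_bits == 2
    (branch_factor == 4), tid 0 waits on children 1..4, tid 1 on children 5..8, and in general
    tid t owns tids (t << branch_bits) + 1 .. (t << branch_bits) + branch_factor.  The inverse
    mapping, used by the workers below, is parent_tid = (tid - 1) >> branch_bits. */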
264  if (child_tid < nproc) {
265  // Parent threads wait for all their children to arrive
266  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
267  child = 1;
268  do {
269  register kmp_info_t *child_thr = other_threads[child_tid];
270  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
271 #if KMP_CACHE_MANAGE
272  // Prefetch next thread's arrived count
273  if (child+1 <= branch_factor && child_tid+1 < nproc)
274  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
275 #endif /* KMP_CACHE_MANAGE */
276  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
277  "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
278  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
279  &child_bar->b_arrived, new_state));
280  // Wait for child to arrive
281  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
282  flag.wait(this_thr, FALSE
283  USE_ITT_BUILD_ARG(itt_sync_obj) );
284 #if USE_ITT_BUILD && USE_ITT_NOTIFY
285  // Barrier imbalance - write min of the thread time and a child time to the thread.
286  if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
287  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
288  child_thr->th.th_bar_min_time);
289  }
290 #endif
291  if (reduce) {
292  KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
293  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
294  team->t.t_id, child_tid));
295  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
296  }
297  child++;
298  child_tid++;
299  }
300  while (child <= branch_factor && child_tid < nproc);
301  }
302 
303  if (!KMP_MASTER_TID(tid)) { // Worker threads
304  register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
305 
306  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
307  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
308  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
309  &thr_bar->b_arrived, thr_bar->b_arrived,
310  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
311 
312  // Mark arrival to parent thread
313  /* After performing this write, a worker thread may not assume that the team is valid
314  any more - it could be deallocated by the master thread at any time. */
315  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
316  flag.release();
317  } else {
318  // Need to update the team arrived pointer if we are the master thread
319  if (nproc > 1) // New value was already computed above
320  team->t.t_bar[bt].b_arrived = new_state;
321  else
322  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
323  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
324  gtid, team->t.t_id, tid, team->t.t_id,
325  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
326  }
327  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
328  gtid, team->t.t_id, tid, bt));
329 }
330 
331 static void
332 __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
333  int propagate_icvs
334  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
335 {
336  KMP_TIME_BLOCK(KMP_tree_release);
337  register kmp_team_t *team;
338  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
339  register kmp_uint32 nproc;
340  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
341  register kmp_uint32 branch_factor = 1 << branch_bits;
342  register kmp_uint32 child;
343  register kmp_uint32 child_tid;
344 
345  // Perform a tree release for all of the threads that have been gathered
346  if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
347  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
348  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
349  // Wait for parent thread to release us
350  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
351  flag.wait(this_thr, TRUE
352  USE_ITT_BUILD_ARG(itt_sync_obj) );
353 #if USE_ITT_BUILD && USE_ITT_NOTIFY
354  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
355  // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
356  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
357  // Cancel wait on previous parallel region...
358  __kmp_itt_task_starting(itt_sync_obj);
359 
360  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
361  return;
362 
363  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
364  if (itt_sync_obj != NULL)
365  // Call prepare as early as possible for "new" barrier
366  __kmp_itt_task_finished(itt_sync_obj);
367  } else
368 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
369  // Early exit for reaping threads releasing forkjoin barrier
370  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
371  return;
372 
373  // The worker thread may now assume that the team is valid.
374  team = __kmp_threads[gtid]->th.th_team;
375  KMP_DEBUG_ASSERT(team != NULL);
376  tid = __kmp_tid_from_gtid(gtid);
377 
378  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
379  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
380  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
381  KMP_MB(); // Flush all pending memory write invalidates.
382  } else {
383  team = __kmp_threads[gtid]->th.th_team;
384  KMP_DEBUG_ASSERT(team != NULL);
385  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
386  gtid, team->t.t_id, tid, bt));
387  }
388  nproc = this_thr->th.th_team_nproc;
389  child_tid = (tid << branch_bits) + 1;
390 
391  if (child_tid < nproc) {
392  register kmp_info_t **other_threads = team->t.t_threads;
393  child = 1;
394  // Parent threads release all their children
395  do {
396  register kmp_info_t *child_thr = other_threads[child_tid];
397  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
398 #if KMP_CACHE_MANAGE
399  // Prefetch next thread's go count
400  if (child+1 <= branch_factor && child_tid+1 < nproc)
401  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
402 #endif /* KMP_CACHE_MANAGE */
403 
404 #if KMP_BARRIER_ICV_PUSH
405  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
406  if (propagate_icvs) {
407  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
408  team, child_tid, FALSE);
409  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
410  &team->t.t_implicit_task_taskdata[0].td_icvs);
411  }
412  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
413 #endif // KMP_BARRIER_ICV_PUSH
414  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
415  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
416  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
417  child_tid, &child_bar->b_go, child_bar->b_go,
418  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
419  // Release child from barrier
420  kmp_flag_64 flag(&child_bar->b_go, child_thr);
421  flag.release();
422  child++;
423  child_tid++;
424  }
425  while (child <= branch_factor && child_tid < nproc);
426  }
427  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
428  gtid, team->t.t_id, tid, bt));
429 }
430 
431 
432 // Hyper Barrier
433 static void
434 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
435  void (*reduce)(void *, void *)
436  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
437 {
438  KMP_TIME_BLOCK(KMP_hyper_gather);
439  register kmp_team_t *team = this_thr->th.th_team;
440  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
441  register kmp_info_t **other_threads = team->t.t_threads;
442  register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
443  register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
444  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
445  register kmp_uint32 branch_factor = 1 << branch_bits;
446  register kmp_uint32 offset;
447  register kmp_uint32 level;
448 
449  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
450  gtid, team->t.t_id, tid, bt));
451 
452  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
453 
454 #if USE_ITT_BUILD && USE_ITT_NOTIFY
455  // Barrier imbalance - save arrive time to the thread
456  if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
457  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
458  }
459 #endif
460  /* Perform a hypercube-embedded tree gather to wait until all of the threads have
461  arrived, and reduce any required data as we go. */
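 /* Worked example (illustrative, not part of the original source): with branch_bits == 1
    (branch_factor == 2) and 8 threads, at level 0 tids 1,3,5,7 signal parents 0,2,4,6
    (parent_tid = tid & ~1) and leave the loop; at level 1 tids 2 and 6 signal 0 and 4
    (parent_tid = tid & ~3); at level 2 tid 4 signals tid 0.  A thread drops out of the loop
    as soon as ((tid >> level) & (branch_factor - 1)) != 0, i.e. once it has signaled. */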
462  kmp_flag_64 p_flag(&thr_bar->b_arrived);
463  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
464  {
465  register kmp_uint32 child;
466  register kmp_uint32 child_tid;
467 
468  if (((tid >> level) & (branch_factor - 1)) != 0) {
469  register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
470 
471  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
472  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
473  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
474  &thr_bar->b_arrived, thr_bar->b_arrived,
475  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
476  // Mark arrival to parent thread
477  /* After performing this write (in the last iteration of the enclosing for loop),
478  a worker thread may not assume that the team is valid any more - it could be
479  deallocated by the master thread at any time. */
480  p_flag.set_waiter(other_threads[parent_tid]);
481  p_flag.release();
482  break;
483  }
484 
485  // Parent threads wait for children to arrive
486  if (new_state == KMP_BARRIER_UNUSED_STATE)
487  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
488  for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
489  child++, child_tid+=(1 << level))
490  {
491  register kmp_info_t *child_thr = other_threads[child_tid];
492  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
493 #if KMP_CACHE_MANAGE
494  register kmp_uint32 next_child_tid = child_tid + (1 << level);
495  // Prefetch next thread's arrived count
496  if (child+1 < branch_factor && next_child_tid < num_threads)
497  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
498 #endif /* KMP_CACHE_MANAGE */
499  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
500  "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
501  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
502  &child_bar->b_arrived, new_state));
503  // Wait for child to arrive
504  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
505  c_flag.wait(this_thr, FALSE
506  USE_ITT_BUILD_ARG(itt_sync_obj) );
507 #if USE_ITT_BUILD && USE_ITT_NOTIFY
508  // Barrier imbalance - write min of the thread time and a child time to the thread.
509  if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
510  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
511  child_thr->th.th_bar_min_time);
512  }
513 #endif
514  if (reduce) {
515  KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
516  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
517  team->t.t_id, child_tid));
518  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
519  }
520  }
521  }
522 
523  if (KMP_MASTER_TID(tid)) {
524  // Need to update the team arrived pointer if we are the master thread
525  if (new_state == KMP_BARRIER_UNUSED_STATE)
526  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
527  else
528  team->t.t_bar[bt].b_arrived = new_state;
529  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
530  gtid, team->t.t_id, tid, team->t.t_id,
531  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
532  }
533  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
534  gtid, team->t.t_id, tid, bt));
535 }
536 
537 // The reverse versions seem to beat the forward versions overall
538 #define KMP_REVERSE_HYPER_BAR
539 static void
540 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
541  int propagate_icvs
542  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
543 {
544  KMP_TIME_BLOCK(KMP_hyper_release);
545  register kmp_team_t *team;
546  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
547  register kmp_info_t **other_threads;
548  register kmp_uint32 num_threads;
549  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
550  register kmp_uint32 branch_factor = 1 << branch_bits;
551  register kmp_uint32 child;
552  register kmp_uint32 child_tid;
553  register kmp_uint32 offset;
554  register kmp_uint32 level;
555 
556  /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
557  If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
558  order of the corresponding gather, otherwise threads are released in the same order. */
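 /* Ordering sketch (illustrative, not part of the original source): with branch_bits == 1 and
    8 threads, the gather above had tid 0 collect children 1, 2, 4 (lowest level first).  With
    KMP_REVERSE_HYPER_BAR, tid 0 releases 4, then 2, then 1, and each released subtree root
    repeats the pattern for its own children (tid 4 releases 6, then 5), so the largest
    subtrees start waking earliest. */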
559  if (KMP_MASTER_TID(tid)) { // master
560  team = __kmp_threads[gtid]->th.th_team;
561  KMP_DEBUG_ASSERT(team != NULL);
562  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
563  gtid, team->t.t_id, tid, bt));
564 #if KMP_BARRIER_ICV_PUSH
565  if (propagate_icvs) { // master already has ICVs in final destination; copy
566  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
567  }
568 #endif
569  }
570  else { // Handle fork barrier workers who aren't part of a team yet
571  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
572  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
573  // Wait for parent thread to release us
574  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
575  flag.wait(this_thr, TRUE
576  USE_ITT_BUILD_ARG(itt_sync_obj) );
577 #if USE_ITT_BUILD && USE_ITT_NOTIFY
578  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
579  // In fork barrier where we could not get the object reliably
580  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
581  // Cancel wait on previous parallel region...
582  __kmp_itt_task_starting(itt_sync_obj);
583 
584  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
585  return;
586 
587  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
588  if (itt_sync_obj != NULL)
589  // Call prepare as early as possible for "new" barrier
590  __kmp_itt_task_finished(itt_sync_obj);
591  } else
592 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
593  // Early exit for reaping threads releasing forkjoin barrier
594  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
595  return;
596 
597  // The worker thread may now assume that the team is valid.
598  team = __kmp_threads[gtid]->th.th_team;
599  KMP_DEBUG_ASSERT(team != NULL);
600  tid = __kmp_tid_from_gtid(gtid);
601 
602  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
603  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
604  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
605  KMP_MB(); // Flush all pending memory write invalidates.
606  }
607  num_threads = this_thr->th.th_team_nproc;
608  other_threads = team->t.t_threads;
609 
610 #ifdef KMP_REVERSE_HYPER_BAR
611  // Count up to correct level for parent
612  for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
613  level+=branch_bits, offset<<=branch_bits);
614 
615  // Now go down from there
616  for (level-=branch_bits, offset>>=branch_bits; offset != 0;
617  level-=branch_bits, offset>>=branch_bits)
618 #else
619  // Go down the tree, level by level
620  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
621 #endif // KMP_REVERSE_HYPER_BAR
622  {
623 #ifdef KMP_REVERSE_HYPER_BAR
624  /* Now go in reverse order through the children, highest to lowest.
625  Initial setting of child is conservative here. */
626  child = num_threads >> ((level==0)?level:level-1);
627  for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
628  child>=1; child--, child_tid-=(1<<level))
629 #else
630  if (((tid >> level) & (branch_factor - 1)) != 0)
631  // No need to go lower than this, since this is the level at which the parent would be notified
632  break;
633  // Iterate through children on this level of the tree
634  for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
635  child++, child_tid+=(1<<level))
636 #endif // KMP_REVERSE_HYPER_BAR
637  {
638  if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
639  else {
640  register kmp_info_t *child_thr = other_threads[child_tid];
641  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
642 #if KMP_CACHE_MANAGE
643  register kmp_uint32 next_child_tid = child_tid - (1 << level);
644  // Prefetch next thread's go count
645 # ifdef KMP_REVERSE_HYPER_BAR
646  if (child-1 >= 1 && next_child_tid < num_threads)
647 # else
648  if (child+1 < branch_factor && next_child_tid < num_threads)
649 # endif // KMP_REVERSE_HYPER_BAR
650  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
651 #endif /* KMP_CACHE_MANAGE */
652 
653 #if KMP_BARRIER_ICV_PUSH
654  if (propagate_icvs) // push my fixed ICVs to my child
655  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
656 #endif // KMP_BARRIER_ICV_PUSH
657 
658  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
659  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
660  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
661  child_tid, &child_bar->b_go, child_bar->b_go,
662  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
663  // Release child from barrier
664  kmp_flag_64 flag(&child_bar->b_go, child_thr);
665  flag.release();
666  }
667  }
668  }
669 #if KMP_BARRIER_ICV_PUSH
670  if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
671  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
672  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
673  }
674 #endif
675  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
676  gtid, team->t.t_id, tid, bt));
677 }
678 
679 // Hierarchical Barrier
680 
681 // Initialize thread barrier data
682 /* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
683  minimum amount of initialization required based on how the team has changed. Returns true if
684  leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
685  team size increases, threads already in the team will respond to on-core wakeup on their parent
686  thread, but threads newly added to the team will only be listening on their local b_go. */
687 static bool
688 __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
689  int gtid, int tid, kmp_team_t *team)
690 {
691  // Checks to determine if (re-)initialization is needed
692  bool uninitialized = thr_bar->team == NULL;
693  bool team_changed = team != thr_bar->team;
694  bool team_sz_changed = nproc != thr_bar->nproc;
695  bool tid_changed = tid != thr_bar->old_tid;
696  bool retval = false;
697 
698  if (uninitialized || team_sz_changed) {
699  __kmp_get_hierarchy(nproc, thr_bar);
700  }
701 
702  if (uninitialized || team_sz_changed || tid_changed) {
703  thr_bar->my_level = thr_bar->depth-1; // default for master
704  thr_bar->parent_tid = -1; // default for master
705  if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
706  kmp_uint32 d=0;
707  while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
708  kmp_uint32 rem;
709  if (d == thr_bar->depth-2) { // reached level right below the master
710  thr_bar->parent_tid = 0;
711  thr_bar->my_level = d;
712  break;
713  }
714  else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
715  // thread is not a subtree root at the next level, so this is its max level
716  thr_bar->parent_tid = tid - rem;
717  thr_bar->my_level = d;
718  break;
719  }
720  ++d;
721  }
722  }
723  thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
724  thr_bar->old_tid = tid;
725  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
726  }
727  if (uninitialized || team_changed || tid_changed) {
728  thr_bar->team = team;
729  thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
730  retval = true;
731  }
732  if (uninitialized || team_sz_changed || tid_changed) {
733  thr_bar->nproc = nproc;
734  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
735  if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
736  if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
737  thr_bar->leaf_kids = nproc - tid - 1;
738  thr_bar->leaf_state = 0;
739  for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
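 /* Layout sketch (illustrative, not part of the original source): each leaf child is assigned
    one byte of its parent's 64-bit flag, highest byte first, matching offset = 7-(tid-parent_tid-1)
    above.  With 3 leaf kids the loop sets bytes 7, 6 and 5, i.e. leaf_state == 0x0101010000000000
    when read as a little-endian integer; a leaf later checks in / is released through the single
    byte at its offset via the kmp_flag_oncore paths below. */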
740  }
741  return retval;
742 }
743 
744 static void
745 __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
746  int gtid, int tid, void (*reduce) (void *, void *)
747  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
748 {
749  KMP_TIME_BLOCK(KMP_hier_gather);
750  register kmp_team_t *team = this_thr->th.th_team;
751  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
752  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
753  register kmp_info_t **other_threads = team->t.t_threads;
754  register kmp_uint64 new_state;
755 
756  if (this_thr->th.th_team->t.t_level == 1) thr_bar->use_oncore_barrier = 1;
757  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
758 
759  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
760  gtid, team->t.t_id, tid, bt));
761  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
762 
763  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
764 
765  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
766  register kmp_int32 child_tid;
767  new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
768  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
769  if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
770  kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
771  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
772  flag.wait(this_thr, FALSE
773  USE_ITT_BUILD_ARG(itt_sync_obj) );
774  if (reduce) {
775  for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
776  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
777  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
778  team->t.t_id, child_tid));
779  (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
780  }
781  }
782  (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
783  }
784  // Next, wait for higher level children on each child's b_arrived flag
785  for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
786  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
787  if (last > nproc) last = nproc;
788  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
789  register kmp_info_t *child_thr = other_threads[child_tid];
790  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
791  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
792  "arrived(%p) == %u\n",
793  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
794  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
795  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
796  flag.wait(this_thr, FALSE
797  USE_ITT_BUILD_ARG(itt_sync_obj) );
798  if (reduce) {
799  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
800  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
801  team->t.t_id, child_tid));
802  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
803  }
804  }
805  }
806  }
807  else { // Blocktime is not infinite
808  for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
809  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
810  if (last > nproc) last = nproc;
811  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
812  register kmp_info_t *child_thr = other_threads[child_tid];
813  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
814  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
815  "arrived(%p) == %u\n",
816  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
817  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
818  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
819  flag.wait(this_thr, FALSE
820  USE_ITT_BUILD_ARG(itt_sync_obj) );
821  if (reduce) {
822  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
823  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
824  team->t.t_id, child_tid));
825  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
826  }
827  }
828  }
829  }
830  }
831  // All subordinates are gathered; now release parent if not master thread
832 
833  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
834  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
835  "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
836  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
837  &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
838  /* Mark arrival to parent: After performing this write, a worker thread may not assume that
839  the team is valid any more - it could be deallocated by the master thread at any time. */
840  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
841  || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
842  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
843  flag.release();
844  }
845  else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
846  thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
847  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
848  flag.set_waiter(other_threads[thr_bar->parent_tid]);
849  flag.release();
850  }
851  } else { // Master thread needs to update the team's b_arrived value
852  team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
853  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
854  gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
855  }
856  // Is the team access below unsafe or just technically invalid?
857  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
858  gtid, team->t.t_id, tid, bt));
859 }
860 
861 static void
862 __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
863  int propagate_icvs
864  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
865 {
866  KMP_TIME_BLOCK(KMP_hier_release);
867  register kmp_team_t *team;
868  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
869  register kmp_uint32 nproc;
870  bool team_change = false; // indicates on-core barrier shouldn't be used
871 
872  if (KMP_MASTER_TID(tid)) {
873  team = __kmp_threads[gtid]->th.th_team;
874  KMP_DEBUG_ASSERT(team != NULL);
875  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
876  gtid, team->t.t_id, tid, bt));
877  }
878  else { // Worker threads
879  // Wait for parent thread to release me
880  if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
881  || thr_bar->my_level != 0 || thr_bar->team == NULL) {
882  // Use traditional method of waiting on my own b_go flag
883  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
884  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
885  flag.wait(this_thr, TRUE
886  USE_ITT_BUILD_ARG(itt_sync_obj) );
887  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
888  }
889  else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
890  // Wait on my "offset" bits on parent's b_go flag
891  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
892  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
893  bt, this_thr
894  USE_ITT_BUILD_ARG(itt_sync_obj) );
895  flag.wait(this_thr, TRUE
896  USE_ITT_BUILD_ARG(itt_sync_obj) );
897  if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
898  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
899  }
900  else { // Reset my bits on parent's b_go flag
901  ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
902  }
903  }
904  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
905  // Early exit for reaping threads releasing forkjoin barrier
906  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
907  return;
908  // The worker thread may now assume that the team is valid.
909  team = __kmp_threads[gtid]->th.th_team;
910  KMP_DEBUG_ASSERT(team != NULL);
911  tid = __kmp_tid_from_gtid(gtid);
912 
913  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
914  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
915  KMP_MB(); // Flush all pending memory write invalidates.
916  }
917 
918  if (this_thr->th.th_team->t.t_level == 1) thr_bar->use_oncore_barrier = 1;
919  else thr_bar->use_oncore_barrier = 0;
920  nproc = this_thr->th.th_team_nproc;
921 
922  // If the team size has increased, we still communicate with old leaves via oncore barrier.
923  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
924  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
925  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
926  // But if the entire team changes, we won't use oncore barrier at all
927  if (team_change) old_leaf_kids = 0;
928 
929 #if KMP_BARRIER_ICV_PUSH
930  if (propagate_icvs) {
931  if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
932  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
933  }
934  else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
935  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
936  // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
937  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
938  &thr_bar->parent_bar->th_fixed_icvs);
939  // non-leaves will get ICVs piggybacked with b_go via NGO store
940  }
941  else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
942  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
943  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
944  else // leaves copy parent's fixed ICVs directly to local ICV store
945  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
946  &thr_bar->parent_bar->th_fixed_icvs);
947  }
948  }
949 #endif // KMP_BARRIER_ICV_PUSH
950 
951  // Now, release my children
952  if (thr_bar->my_level) { // not a leaf
953  register kmp_int32 child_tid;
954  kmp_uint32 last;
955  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
956  if (KMP_MASTER_TID(tid)) { // do a flat release
957  // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
958  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
959  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
960  ngo_load(&thr_bar->th_fixed_icvs);
961  // This loops over all the threads skipping only the leaf nodes in the hierarchy
962  for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
963  register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
964  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
965  " go(%p): %u => %u\n",
966  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
967  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
968  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
969  // Use ngo store (if available) to both store ICVs and release child via child's b_go
970  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
971  }
972  ngo_sync();
973  }
974  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
975  // Now, release leaf children
976  if (thr_bar->leaf_kids) { // if there are any
977  // We test team_change on the off-chance that the level 1 team changed.
978  if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
979  if (old_leaf_kids) { // release old leaf kids
980  thr_bar->b_go |= old_leaf_state;
981  }
982  // Release new leaf kids
983  last = tid+thr_bar->skip_per_level[1];
984  if (last > nproc) last = nproc;
985  for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
986  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
987  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
988  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
989  " T#%d(%d:%d) go(%p): %u => %u\n",
990  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
991  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
992  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
993  // Release child using child's b_go flag
994  kmp_flag_64 flag(&child_bar->b_go, child_thr);
995  flag.release();
996  }
997  }
998  else { // Release all children at once with leaf_state bits on my own b_go flag
999  thr_bar->b_go |= thr_bar->leaf_state;
1000  }
1001  }
1002  }
1003  else { // Blocktime is not infinite; do a simple hierarchical release
1004  for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1005  last = tid+thr_bar->skip_per_level[d+1];
1006  kmp_uint32 skip = thr_bar->skip_per_level[d];
1007  if (last > nproc) last = nproc;
1008  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1009  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1010  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1011  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1012  " go(%p): %u => %u\n",
1013  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1014  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1015  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1016  // Release child using child's b_go flag
1017  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1018  flag.release();
1019  }
1020  }
1021  }
1022 #if KMP_BARRIER_ICV_PUSH
1023  if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1024  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1025 #endif // KMP_BARRIER_ICV_PUSH
1026  }
1027  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1028  gtid, team->t.t_id, tid, bt));
1029 }
1030 
1031 // ---------------------------- End of Barrier Algorithms ----------------------------
1032 
1033 // Internal function to do a barrier.
1034 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1035  If reduce is non-NULL, perform a reduction during the gather phase, combining each thread's reduce_data via the reduce callback.
1036  Returns 0 if master thread, 1 if worker thread. */
1037 int
1038 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1039  void *reduce_data, void (*reduce)(void *, void *))
1040 {
1041  KMP_TIME_BLOCK(KMP_barrier);
1042  register int tid = __kmp_tid_from_gtid(gtid);
1043  register kmp_info_t *this_thr = __kmp_threads[gtid];
1044  register kmp_team_t *team = this_thr->th.th_team;
1045  register int status = 0;
1046  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1047 
1048  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1049  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1050 
1051  if (! team->t.t_serialized) {
1052 #if USE_ITT_BUILD
1053  // This value will be used in itt notify events below.
1054  void *itt_sync_obj = NULL;
1055 # if USE_ITT_NOTIFY
1056  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1057  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1058 # endif
1059 #endif /* USE_ITT_BUILD */
1060  if (__kmp_tasking_mode == tskm_extra_barrier) {
1061  __kmp_tasking_barrier(team, this_thr, gtid);
1062  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1063  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1064  }
1065 
1066  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1067  the team struct is not guaranteed to exist. */
1068  // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1069  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1070  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1071  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1072  }
1073 
1074 #if USE_ITT_BUILD
1075  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1076  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1077 #endif /* USE_ITT_BUILD */
1078 
1079  if (reduce != NULL) {
1080  //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1081  this_thr->th.th_local.reduce_data = reduce_data;
1082  }
1083  switch (__kmp_barrier_gather_pattern[bt]) {
1084  case bp_hyper_bar: {
1085  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1086  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1087  USE_ITT_BUILD_ARG(itt_sync_obj) );
1088  break;
1089  }
1090  case bp_hierarchical_bar: {
1091  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1092  USE_ITT_BUILD_ARG(itt_sync_obj));
1093  break;
1094  }
1095  case bp_tree_bar: {
1096  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1097  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1098  USE_ITT_BUILD_ARG(itt_sync_obj) );
1099  break;
1100  }
1101  default: {
1102  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1103  USE_ITT_BUILD_ARG(itt_sync_obj) );
1104  }
1105  }
1106 
1107  KMP_MB();
1108 
1109  if (KMP_MASTER_TID(tid)) {
1110  status = 0;
1111  if (__kmp_tasking_mode != tskm_immediate_exec) {
1112  __kmp_task_team_wait(this_thr, team
1113  USE_ITT_BUILD_ARG(itt_sync_obj) );
1114  __kmp_task_team_setup(this_thr, team);
1115  }
1116 
1117 
1118 #if USE_ITT_BUILD
1119  /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1120  before the final summation into the shared variable is done (final summation can be a
1121  long operation for array reductions). */
1122  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1123  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1124 #endif /* USE_ITT_BUILD */
1125 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1126  // Barrier - report frame end
1127  if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1128  kmp_uint64 cur_time = __itt_get_timestamp();
1129  kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1130  int nproc = this_thr->th.th_team_nproc;
1131  int i;
1132  // Initialize with master's wait time
1133  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1134  switch(__kmp_forkjoin_frames_mode) {
1135  case 1:
1136  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1137  this_thr->th.th_frame_time = cur_time;
1138  break;
1139  case 2:
1140  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1141  break;
1142  case 3:
1143  if( __itt_metadata_add_ptr ) {
1144  for (i=1; i<nproc; ++i) {
1145  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1146  }
1147  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1148  }
1149  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1150  this_thr->th.th_frame_time = cur_time;
1151  break;
1152  }
1153  }
1154 #endif /* USE_ITT_BUILD */
1155  } else {
1156  status = 1;
1157 #if USE_ITT_BUILD
1158  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1159  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1160 #endif /* USE_ITT_BUILD */
1161  }
1162  if (status == 1 || ! is_split) {
1163  switch (__kmp_barrier_release_pattern[bt]) {
1164  case bp_hyper_bar: {
1165  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1166  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1167  USE_ITT_BUILD_ARG(itt_sync_obj) );
1168  break;
1169  }
1170  case bp_hierarchical_bar: {
1171  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1172  USE_ITT_BUILD_ARG(itt_sync_obj) );
1173  break;
1174  }
1175  case bp_tree_bar: {
1176  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1177  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1178  USE_ITT_BUILD_ARG(itt_sync_obj) );
1179  break;
1180  }
1181  default: {
1182  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1183  USE_ITT_BUILD_ARG(itt_sync_obj) );
1184  }
1185  }
1186  if (__kmp_tasking_mode != tskm_immediate_exec) {
1187  __kmp_task_team_sync(this_thr, team);
1188  }
1189  }
1190 
1191 #if USE_ITT_BUILD
1192  /* GEH: TODO: Move this under if-condition above and also include in
1193  __kmp_end_split_barrier(). This will more accurately represent the actual release time
1194  of the threads for split barriers. */
1195  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1196  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1197 #endif /* USE_ITT_BUILD */
1198  } else { // Team is serialized.
1199  status = 0;
1200  if (__kmp_tasking_mode != tskm_immediate_exec) {
1201  // The task team should be NULL for serialized code (tasks will be executed immediately)
1202  KMP_DEBUG_ASSERT(team->t.t_task_team == NULL);
1203  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1204  }
1205  }
1206  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1207  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1208  return status;
1209 }
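// Call sketch (illustrative, not part of the original source): a plain explicit barrier typically
// reaches this routine as
//
//     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
//
// whereas a reduction entry point passes is_split = TRUE together with reduce_size, reduce_data
// and a combiner.  The master (return value 0) then skips the release phase, finishes the
// reduction, and calls __kmp_end_split_barrier() below to release the waiting workers.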
1210 
1211 
1212 void
1213 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1214 {
1215  KMP_TIME_BLOCK(KMP_end_split_barrier);
1216  int tid = __kmp_tid_from_gtid(gtid);
1217  kmp_info_t *this_thr = __kmp_threads[gtid];
1218  kmp_team_t *team = this_thr->th.th_team;
1219 
1220  if (!team->t.t_serialized) {
1221  if (KMP_MASTER_GTID(gtid)) {
1222  switch (__kmp_barrier_release_pattern[bt]) {
1223  case bp_hyper_bar: {
1224  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1225  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1226  USE_ITT_BUILD_ARG(NULL) );
1227  break;
1228  }
1229  case bp_hierarchical_bar: {
1230  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1231  USE_ITT_BUILD_ARG(NULL));
1232  break;
1233  }
1234  case bp_tree_bar: {
1235  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1236  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1237  USE_ITT_BUILD_ARG(NULL) );
1238  break;
1239  }
1240  default: {
1241  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1242  USE_ITT_BUILD_ARG(NULL) );
1243  }
1244  }
1245  if (__kmp_tasking_mode != tskm_immediate_exec) {
1246  __kmp_task_team_sync(this_thr, team);
1247  } // if
1248  }
1249  }
1250 }
1251 
1252 
1253 void
1254 __kmp_join_barrier(int gtid)
1255 {
1256  KMP_TIME_BLOCK(KMP_join_barrier);
1257  register kmp_info_t *this_thr = __kmp_threads[gtid];
1258  register kmp_team_t *team;
1259  register kmp_uint nproc;
1260  kmp_info_t *master_thread;
1261  int tid;
1262 #ifdef KMP_DEBUG
1263  int team_id;
1264 #endif /* KMP_DEBUG */
1265 #if USE_ITT_BUILD
1266  void *itt_sync_obj = NULL;
1267 # if USE_ITT_NOTIFY
1268  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1269  // Get object created at fork_barrier
1270  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1271 # endif
1272 #endif /* USE_ITT_BUILD */
1273  KMP_MB();
1274 
1275  // Get current info
1276  team = this_thr->th.th_team;
1277  nproc = this_thr->th.th_team_nproc;
1278  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1279  tid = __kmp_tid_from_gtid(gtid);
1280 #ifdef KMP_DEBUG
1281  team_id = team->t.t_id;
1282 #endif /* KMP_DEBUG */
1283  master_thread = this_thr->th.th_team_master;
1284 #ifdef KMP_DEBUG
1285  if (master_thread != team->t.t_threads[0]) {
1286  __kmp_print_structure();
1287  }
1288 #endif /* KMP_DEBUG */
1289  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1290  KMP_MB();
1291 
1292  // Verify state
1293  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1294  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1295  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1296  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1297  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1298 
1299  if (__kmp_tasking_mode == tskm_extra_barrier) {
1300  __kmp_tasking_barrier(team, this_thr, gtid);
1301  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1302  }
1303 # ifdef KMP_DEBUG
1304  if (__kmp_tasking_mode != tskm_immediate_exec) {
1305  KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1306  __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team,
1307  this_thr->th.th_task_team));
1308  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team);
1309  }
1310 # endif /* KMP_DEBUG */
1311 
1312  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1313  team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1314  down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1315  since the values are not used by __kmp_wait_template() in that case. */
1316  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1317  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1318  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1319  }
1320 
1321 #if USE_ITT_BUILD
1322  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1323  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1324 #endif /* USE_ITT_BUILD */
1325 
1326  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1327  case bp_hyper_bar: {
1328  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1329  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1330  USE_ITT_BUILD_ARG(itt_sync_obj) );
1331  break;
1332  }
1333  case bp_hierarchical_bar: {
1334  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1335  USE_ITT_BUILD_ARG(itt_sync_obj) );
1336  break;
1337  }
1338  case bp_tree_bar: {
1339  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1340  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1341  USE_ITT_BUILD_ARG(itt_sync_obj) );
1342  break;
1343  }
1344  default: {
1345  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1346  USE_ITT_BUILD_ARG(itt_sync_obj) );
1347  }
1348  }
1349 
1350  /* From this point on, the team data structure may be deallocated at any time by the
1351  master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1352  data items that need to be referenced before the end of the barrier should be moved to
1353  the kmp_task_team_t structs. */
1354  if (KMP_MASTER_TID(tid)) {
1355  if (__kmp_tasking_mode != tskm_immediate_exec) {
1356  // Master shouldn't call decrease_load(). // TODO: enable master threads.
1357  // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1358  __kmp_task_team_wait(this_thr, team
1359  USE_ITT_BUILD_ARG(itt_sync_obj) );
1360  }
1361 #if USE_ITT_BUILD
1362  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1363  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1364 #endif /* USE_ITT_BUILD */
1365 
1366 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1367  // Join barrier - report frame end
1368  if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
1369  kmp_uint64 cur_time = __itt_get_timestamp();
1370  ident_t * loc = team->t.t_ident;
1371  kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
1372  int nproc = this_thr->th.th_team_nproc;
1373  int i;
1374  // Initialize with master's wait time
1375  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1376  switch(__kmp_forkjoin_frames_mode) {
1377  case 1:
1378  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1379  break;
1380  case 2:
1381  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1382  break;
1383  case 3:
1384  if( __itt_metadata_add_ptr ) {
1385  for (i=1; i<nproc; ++i) {
1386  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1387  }
1388  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1389  }
1390  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1391  this_thr->th.th_frame_time = cur_time;
1392  break;
1393  }
1394  }
1395 # endif /* USE_ITT_BUILD */
1396  }
1397 #if USE_ITT_BUILD
1398  else {
1399  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1400  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1401  }
1402 #endif /* USE_ITT_BUILD */
1403 
1404 #if KMP_DEBUG
1405  if (KMP_MASTER_TID(tid)) {
1406  KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1407  gtid, team_id, tid, nproc));
1408  }
1409 #endif /* KMP_DEBUG */
1410 
1411  // TODO now, mark worker threads as done so they may be disbanded
1412  KMP_MB(); // Flush all pending memory write invalidates.
1413  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1414 }
1415 
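/* In the __kmp_forkjoin_frames_mode == 3 branch above, the imbalance metric is
   the sum over all team threads of (cur_time - th_bar_arrive_time): the total
   time the team spent waiting at the join barrier after arriving. A standalone
   sketch of that accumulation with plain arrays; timestamps are arbitrary
   ticks and all names here are illustrative only. */
#include <stdio.h>
#include <stdint.h>

/* Aggregate wait = sum over threads of (release time - arrival time). */
static uint64_t barrier_imbalance(uint64_t cur_time, const uint64_t *arrive_time, int nproc) {
    uint64_t delta = cur_time - arrive_time[0];   /* start with the master's wait time */
    for (int i = 1; i < nproc; ++i)
        delta += cur_time - arrive_time[i];
    return delta;
}

int main(void) {
    uint64_t arrive[4] = {100, 140, 160, 190};    /* per-thread arrival timestamps */
    uint64_t cur_time  = 200;                     /* timestamp taken by the master at release */
    printf("aggregate wait = %llu ticks\n",
           (unsigned long long)barrier_imbalance(cur_time, arrive, 4));
    return 0;
}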
1416 
1417 // TODO release worker threads' fork barriers as we are ready instead of all at once
1418 void
1419 __kmp_fork_barrier(int gtid, int tid)
1420 {
1421  KMP_TIME_BLOCK(KMP_fork_barrier);
1422  kmp_info_t *this_thr = __kmp_threads[gtid];
1423  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1424 #if USE_ITT_BUILD
1425  void * itt_sync_obj = NULL;
1426 #endif /* USE_ITT_BUILD */
1427 
1428  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1429  gtid, (team != NULL) ? team->t.t_id : -1, tid));
1430 
1431  // th_team pointer only valid for master thread here
1432  if (KMP_MASTER_TID(tid)) {
1433 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1434  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1435  // Create itt barrier object
1436  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1437  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1438  }
1439 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1440 
1441 #ifdef KMP_DEBUG
1442  register kmp_info_t **other_threads = team->t.t_threads;
1443  register int i;
1444 
1445  // Verify state
1446  KMP_MB();
1447 
1448  for(i=1; i<team->t.t_nproc; ++i) {
1449  KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1450  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1451  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1452  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1453  KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1454  & ~(KMP_BARRIER_SLEEP_STATE))
1455  == KMP_INIT_BARRIER_STATE);
1456  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1457  }
1458 #endif
1459 
1460  if (__kmp_tasking_mode != tskm_immediate_exec) {
1461  __kmp_task_team_setup(this_thr, team);
1462  }
1463 
1464  /* The master thread may have changed its blocktime between the join barrier and the
1465  fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1466  access it when the team struct is not guaranteed to exist. */
1467  // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1468  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1469  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1470  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1471  }
1472  } // master
1473 
1474  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1475  case bp_hyper_bar: {
1476  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1477  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1478  USE_ITT_BUILD_ARG(itt_sync_obj) );
1479  break;
1480  }
1481  case bp_hierarchical_bar: {
1482  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1483  USE_ITT_BUILD_ARG(itt_sync_obj) );
1484  break;
1485  }
1486  case bp_tree_bar: {
1487  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1488  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1489  USE_ITT_BUILD_ARG(itt_sync_obj) );
1490  break;
1491  }
1492  default: {
1493  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1494  USE_ITT_BUILD_ARG(itt_sync_obj) );
1495  }
1496  }
1497 
1498  // Early exit for reaping threads releasing forkjoin barrier
1499  if (TCR_4(__kmp_global.g.g_done)) {
1500  if (this_thr->th.th_task_team != NULL) {
1501  if (KMP_MASTER_TID(tid)) {
1502  TCW_PTR(this_thr->th.th_task_team, NULL);
1503  }
1504  else {
1505  __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1506  }
1507  }
1508 
1509 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1510  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1511  if (!KMP_MASTER_TID(tid)) {
1512  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1513  if (itt_sync_obj)
1514  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1515  }
1516  }
1517 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1518  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1519  return;
1520  }
1521 
1522  /* We can now assume that a valid team structure has been allocated by the master and
1523  propagated to all worker threads. The current thread, however, may not be part of the
1524  team, so we can't blindly assume that the team pointer is non-null. */
1525  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1526  KMP_DEBUG_ASSERT(team != NULL);
1527  tid = __kmp_tid_from_gtid(gtid);
1528 
1529 
1530 #if KMP_BARRIER_ICV_PULL
1531  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1532  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1533  this data before this function is called. We cannot modify __kmp_fork_call() to look at
1534  the fixed ICVs in the master's thread struct, because it is not always the case that the
1535  threads arrays have been allocated when __kmp_fork_call() is executed. */
1536  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1537  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1538  // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1539  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1540  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1541  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1542  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1543  }
1544  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1545 #endif // KMP_BARRIER_ICV_PULL
1546 
1547  if (__kmp_tasking_mode != tskm_immediate_exec) {
1548  __kmp_task_team_sync(this_thr, team);
1549  }
1550 
1551 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1552  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1553  if (proc_bind == proc_bind_intel) {
1554 #endif
1555 #if KMP_MIC
1556  // Apply dynamic affinity settings
1557  if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1558  __kmp_balanced_affinity(tid, team->t.t_nproc);
1559  }
1560 #endif
1561 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1562  }
1563  else if ((proc_bind != proc_bind_false)
1564  && (proc_bind != proc_bind_disabled)) {
1565  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1566  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1567  __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1568  }
1569  else {
1570  __kmp_affinity_set_place(gtid);
1571  }
1572  }
1573 #endif
1574 
1575 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1576  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1577  if (!KMP_MASTER_TID(tid)) {
1578  // Get correct barrier object
1579  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1580  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1581  } // (prepare called inside barrier_release)
1582  }
1583 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1584  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1585 }
1586 
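/* Under KMP_BARRIER_ICV_PULL (above), the master publishes the team's ICVs in
   its own thread struct (th_fixed_icvs) and each released worker copies them
   into its implicit task. A standalone sketch of that publish-then-pull shape
   with a simplified ICV struct; field and function names are illustrative,
   not the library's own. */
#include <string.h>
#include <stdio.h>

typedef struct {                   /* a few representative internal control variables */
    int nproc;
    int dynamic;
    int blocktime;
} sketch_icvs_t;

typedef struct {
    sketch_icvs_t fixed_icvs;      /* master publishes here before releasing workers */
    sketch_icvs_t task_icvs;       /* each thread's implicit-task copy */
} sketch_thread_t;

/* Master side: publish once, O(1), regardless of team size. */
static void sketch_publish_icvs(sketch_thread_t *master, const sketch_icvs_t *src) {
    memcpy(&master->fixed_icvs, src, sizeof(*src));
}

/* Worker side: after release, pull from the master's published copy. */
static void sketch_pull_icvs(sketch_thread_t *worker, const sketch_thread_t *master) {
    memcpy(&worker->task_icvs, &master->fixed_icvs, sizeof(master->fixed_icvs));
}

int main(void) {
    sketch_thread_t team[4] = {{{0}}};
    sketch_icvs_t new_icvs = { 4, 1, 200 };

    sketch_publish_icvs(&team[0], &new_icvs);       /* master = team[0] */
    for (int tid = 1; tid < 4; ++tid)               /* each worker pulls for itself */
        sketch_pull_icvs(&team[tid], &team[0]);

    printf("worker 3 sees blocktime=%d\n", team[3].task_icvs.blocktime);
    return 0;
}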
1587 
1588 void
1589 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1590 {
1591  KMP_TIME_BLOCK(KMP_setup_icv_copy);
1592  int f;
1593 
1594  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1595  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1596 
1597  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1598  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1599  this data before this function is called. */
1600 #if KMP_BARRIER_ICV_PULL
1601  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which remains untouched), where
1602  all of the worker threads can access them and make their own copies after the barrier. */
1603  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1604  copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1605  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1606  0, team->t.t_threads[0], team));
1607 #elif KMP_BARRIER_ICV_PUSH
1608  // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1609  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1610  0, team->t.t_threads[0], team));
1611 #else
1612  // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1613  ngo_load(new_icvs);
1614  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1615  for (f=1; f<new_nproc; ++f) { // Skip the master thread
1616  // TODO: GEH - pass in better source location info since usually NULL here
1617  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1618  f, team->t.t_threads[f], team));
1619  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1620  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1621  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1622  f, team->t.t_threads[f], team));
1623  }
1624  ngo_sync();
1625 #endif // KMP_BARRIER_ICV_PULL
1626 }
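
/* A user-level view of what the three propagation strategies above implement:
   ICVs set on the encountering (master) thread before a parallel region, such
   as dyn-var via omp_set_dynamic(), must be visible in every worker's implicit
   task inside the region, whichever copy strategy the runtime uses.
   Standalone illustrative example; assumes an OpenMP-capable compiler. */
#include <omp.h>
#include <stdio.h>

int main(void) {
    omp_set_dynamic(0);          /* ICV update on the master thread */
    omp_set_num_threads(4);      /* another ICV update on the master thread */

    #pragma omp parallel
    {
        /* Every thread's implicit task reflects the master's ICVs. */
        printf("T#%d: dyn-var=%d, team size=%d\n",
               omp_get_thread_num(), omp_get_dynamic(), omp_get_num_threads());
    }
    return 0;
}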