#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"
#include "kmp_os.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC
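// On Intel MIC the ICVs (and the piggybacked b_go flag) are broadcast a full
// cache line at a time with the non-globally-ordered (NGO) store intrinsics
// below; elsewhere the same macros fall back to copy_icvs()/memcpy() and no-ops.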
#if KMP_MIC && USE_NGO_STORES
#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src)            ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src)   memcpy((dst), (src), CACHE_LINE)
#define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
void __kmp_print_structure(void); // Forward declaration
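// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier: each worker posts its arrival on its own b_arrived flag;
// the master polls every worker's flag in turn, folding in any pending
// reduction data as the workers check in.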
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
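// Linear release: the master bumps every worker's b_go flag in turn (pushing
// ICVs first if requested); each worker simply waits on its own b_go flag.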
static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) {
            ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
            for (i=1; i<nproc; ++i) {
                __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                               &team->t.t_implicit_task_taskdata[0].td_icvs);
            }
            ngo_sync();
        }
#endif // KMP_BARRIER_ICV_PUSH

        // Now, release all of the worker threads
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go flag
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_go,
                          other_threads[i]->th.th_bar[bt].bb.b_go,
                          other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
            // Release worker thread from the barrier
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
            flag.release();
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
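// Tree Barrier: threads gather up a tree with branching factor 2^branch_bits;
// each inner node waits on its children's b_arrived flags (reducing data as it
// goes) and then reports to its own parent.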
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
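// Tree release: the mirror image of the gather -- each node, once released,
// bumps the b_go flags of its children, optionally pushing ICVs to them first.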
static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            if (propagate_icvs) {
                __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                         team, child_tid, FALSE);
                copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                          &team->t.t_implicit_task_taskdata[0].td_icvs);
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
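// Hyper Barrier: a hypercube-embedded tree.  At each level a thread either
// signals the peer that owns it for that level or collects the peers it owns,
// so the gather completes in about log_{branch_factor}(nproc) rounds.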
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go.  */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
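// Hyper barrier release: traverses the same hypercube as the gather.  With
// KMP_REVERSE_HYPER_BAR defined (the default) children are released in the
// reverse of the gather order, which tends to perform better overall.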
#define KMP_REVERSE_HYPER_BAR
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
       If KMP_REVERSE_HYPER_BAR is defined (default), the threads are released in the reverse
       order of the corresponding gather, otherwise threads are released in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
# else
                if (child+1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final destination
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
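// Hierarchical Barrier
//
// The hierarchical barrier shapes the gather/release trees to the machine
// topology reported by __kmp_get_hierarchy().  Threads in the same leaf group
// can use the "oncore" path, where each child checks in by writing a single
// byte of its parent's 64-bit flag.  The helper below (re)computes a thread's
// position in that hierarchy whenever the team, its size, or the thread's tid
// changes, and reports whether the team itself changed.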
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1;             // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d = 0;
            while (d < thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) {
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids = 0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i)
            ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}
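// Hierarchical gather: with infinite blocktime and a non-nested team, leaf
// children check in by setting their byte in the parent's b_arrived word;
// children at higher levels, and all children otherwise, are waited for with
// ordinary 64-bit flags, level by level.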
static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce) (void *, void *)
                                  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    // The oncore (byte-flag) path is only usable for a non-nested team
    if (this_thr->th.th_team->t.t_level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state
                                                            : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                }
                // Clear leaf_state bits
                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state));
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // skip the leaf level (d=0)
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
        else { // Blocktime is not infinite; wait on each child's b_arrived flag at every level
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        // Mark arrival to parent
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
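// Hierarchical release: the master propagates the go signal (and, when
// requested, ICVs) down the hierarchy.  On the oncore path a whole leaf group
// is released at once by setting the leaf_state bytes of the parent's b_go
// word; otherwise every child is released through its own 64-bit b_go flag.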
static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to its own b_go flag
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }

    // The oncore (byte-flag) path is only usable for a non-nested team
    if (this_thr->th.th_team->t.t_level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
    nproc = this_thr->th.th_team_nproc;

    // If the team size has increased, we still communicate with old leaves if they exist
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (!thr_bar->my_level) // leaf: copy parent's fixed ICVs directly to the local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
        else {
            if (thr_bar->my_level) // not a leaf: copy ICVs to my fixed ICVs so children can access them
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else                   // leaf: copy parent's fixed ICVs directly to the local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing the fixed ICVs
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // Release all subtree roots (the leaf kids are released below)
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) {
                // We test team_change on the off-chance that the level-1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf kids, some new
                    if (old_leaf_kids) // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    // Release new leaf kids individually
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) {
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local destination
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
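// ---------------------- End of Barrier Algorithms ----------------------

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier.
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier.
   Returns 0 if master thread, 1 if worker thread.  */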
int
__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
              void *reduce_data, void (*reduce)(void *, void *))
{
    register int tid = __kmp_tid_from_gtid(gtid);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team = this_thr->th.th_team;
    register int status = 0;
    ident_t *loc = __kmp_threads[gtid]->th.th_ident;

    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

    if (!team->t.t_serialized) {
#if USE_ITT_BUILD
        // This value will be used in itt notify events below.
        void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
# endif
#endif /* USE_ITT_BUILD */
        if (__kmp_tasking_mode == tskm_extra_barrier) {
            __kmp_tasking_barrier(team, this_thr, gtid);
            KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
                          gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
        }

        /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
           the team struct is not guaranteed to exist. */
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

        if (reduce != NULL) {
            this_thr->th.th_local.reduce_data = reduce_data;
        }
        switch (__kmp_barrier_gather_pattern[bt]) {
        case bp_hyper_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        case bp_hierarchical_bar: {
            __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
                                              USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        case bp_tree_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        default: {
            __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
        }

        if (KMP_MASTER_TID(tid)) {
            status = 0;
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
                __kmp_task_team_setup(this_thr, team);
            }

#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier - report frame end
            if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
                kmp_uint64 cur_time = __itt_get_timestamp();
                kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
                int nproc = this_thr->th.th_team_nproc;
                int i;
                kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                switch(__kmp_forkjoin_frames_mode) {
                case 1:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                case 2:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                    break;
                case 3:
                    if (__itt_metadata_add_ptr) {
                        for (i=1; i<nproc; ++i) {
                            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                        }
                        __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)(reduce != NULL));
                    }
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                }
            }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        } else {
            status = 1;
#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
        }

        if (status == 1 || !is_split) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    } else { // Team is serialized.
        if (__kmp_tasking_mode != tskm_immediate_exec) {
            // The task team should be NULL for serialized code (tasks will be executed immediately)
            KMP_DEBUG_ASSERT(team->t.t_task_team == NULL);
            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
        }
    }
    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
    return status;
}
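// Finish the second half of a split barrier: the master runs the release
// pattern here once it has completed the work it overlapped with the barrier,
// letting the workers (still waiting since __kmp_barrier) proceed.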
void
__kmp_end_split_barrier(enum barrier_type bt, int gtid)
{
    int tid = __kmp_tid_from_gtid(gtid);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = this_thr->th.th_team;

    if (!team->t.t_serialized) {
        if (KMP_MASTER_GTID(gtid)) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(NULL));
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(NULL) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }
    }
}
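// The join barrier: executed by every thread at the end of a parallel region.
// Only the gather phase happens here; workers are next released from the fork
// barrier when the master starts another parallel region (or shuts down).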
void
__kmp_join_barrier(int gtid)
{
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team;
    register kmp_uint nproc;
    kmp_info_t *master_thread;
    int tid;
#ifdef KMP_DEBUG
    int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
        // Get object created at fork_barrier
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
# endif
#endif /* USE_ITT_BUILD */
    KMP_MB();

    // Get current info
    team = this_thr->th.th_team;
    nproc = this_thr->th.th_team_nproc;
    KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
    tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
    team_id = team->t.t_id;
#endif /* KMP_DEBUG */
    master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
    if (master_thread != team->t.t_threads[0]) {
        __kmp_print_structure();
    }
#endif /* KMP_DEBUG */
    KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
    KMP_MB();

    // Verify state
    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
    KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));

    if (__kmp_tasking_mode == tskm_extra_barrier) {
        __kmp_tasking_barrier(team, this_thr, gtid);
        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
    }
#ifdef KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
        KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
                      __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team,
                      this_thr->th.th_task_team));
        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team);
    }
#endif /* KMP_DEBUG */

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
       team struct is not guaranteed to exist. Skip the copy if blocktime is infinite, since the
       values are not used in that case. */
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
        this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                          USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    /* From this point on, the team data structure may be deallocated at any time by the
       master thread - it is unsafe to reference it in any of the worker threads. */
    if (KMP_MASTER_TID(tid)) {
        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_wait(this_thr, team
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

# if USE_ITT_BUILD && USE_ITT_NOTIFY
        // Join barrier - report frame end
        if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) {
            kmp_uint64 cur_time = __itt_get_timestamp();
            ident_t *loc = team->t.t_ident;
            kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads;
            int nproc = this_thr->th.th_team_nproc;
            int i;
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            switch(__kmp_forkjoin_frames_mode) {
            case 1:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                break;
            case 2:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                break;
            case 3:
                if (__itt_metadata_add_ptr) {
                    for (i=1; i<nproc; ++i) {
                        delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                    }
                    __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
                }
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                this_thr->th.th_frame_time = cur_time;
                break;
            }
        }
# endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    }
#if USE_ITT_BUILD
    else {
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
    }
#endif /* USE_ITT_BUILD */

#ifdef KMP_DEBUG
    if (KMP_MASTER_TID(tid)) {
        KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
                      gtid, team_id, tid, nproc));
    }
#endif /* KMP_DEBUG */

    KMP_MB(); // Flush all pending memory write invalidates.
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
}
void
__kmp_fork_barrier(int gtid, int tid)
{
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */

    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, (team != NULL) ? team->t.t_id : -1, tid));

    // th_team pointer is only valid for the master thread here
    if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            // Create itt barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
            __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
        register kmp_info_t **other_threads = team->t.t_threads;
        register int i;

        // Verify state
        KMP_MB();

        for (i=1; i<team->t.t_nproc; ++i) {
            KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
                           gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                           team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                           other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
            KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
                              & ~(KMP_BARRIER_SLEEP_STATE))
                             == KMP_INIT_BARRIER_STATE);
            KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
        }
#endif /* KMP_DEBUG */

        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_setup(this_thr, team);
        }

        /* The master thread may have changed its blocktime between the join barrier and the
           fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
           access it when the team struct is not guaranteed to exist. */
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }
    } // master

    switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    // Early exit for reaping threads releasing forkjoin barrier
    if (TCR_4(__kmp_global.g.g_done)) {
        if (this_thr->th.th_task_team != NULL) {
            if (KMP_MASTER_TID(tid)) {
                TCW_PTR(this_thr->th.th_task_team, NULL);
            }
            else {
                __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
            }
        }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            if (!KMP_MASTER_TID(tid)) {
                itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
                if (itt_sync_obj)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
            }
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
        return;
    }

    /* We can now assume that a valid team structure has been allocated by the master and
       propagated to all worker threads. The current thread, however, may not be part of the
       team, so we can't blindly assume that the team pointer is non-null.  */
    team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
    /* The master thread's copy of the ICVs was set up on its implicit taskdata before this point;
       workers pull their copy from the master's th_fixed_icvs staging area. */
    if (!KMP_MASTER_TID(tid)) { // master thread's ICVs are already in place
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
    }
#endif // KMP_BARRIER_ICV_PULL

    if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
    if (proc_bind == proc_bind_intel) {
#endif
#if KMP_AFFINITY_SUPPORTED
        // Call dynamic affinity settings
        if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
            __kmp_balanced_affinity(tid, team->t.t_nproc);
        }
#endif // KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    }
    else if ((proc_bind != proc_bind_false)
             && (proc_bind != proc_bind_disabled)) {
        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
            KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                           __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
        }
        else {
            __kmp_affinity_set_place(gtid);
        }
    }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (!KMP_MASTER_TID(tid)) {
            // Get correct barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
        } // (prepare called inside barrier_release)
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
}
void
__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc)
{
    int f;

    KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
    KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

#if KMP_BARRIER_ICV_PULL
    /* Copy the ICVs into the master thread's th_fixed_icvs, where all of the worker threads
       can access them and make their own copies after the barrier. */
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated.
    copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
    // The ICVs will be pushed to the workers at the next fork barrier.
    KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#else
    // Copy the ICVs to each of the non-master threads.  This takes O(nthreads) time.
    ngo_load(new_icvs);
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated.
    for (f=1; f<new_nproc; ++f) { // Skip the master thread
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
        __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
        ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
    }
    ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}