#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"
#include "kmp_os.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// Non-globally-ordered (streaming) stores for ICV copying
#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src)            ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
void __kmp_print_structure(void); // Forward declaration
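
// ---------------------------- Barrier Algorithms ----------------------------

/* Linear barrier: gather phase.
   Each worker signals arrival on its own b_arrived flag and is then done; the master waits
   for every worker in turn, folding each worker's reduce_data into its own when a reduction
   callback is supplied, and finally bumps the team's arrived counter. */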
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear gather to signal that all of the threads have arrived
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to master thread.  After performing this write, a worker thread may not
           assume that the team is valid any more - it could be deallocated at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for the worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
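
/* Linear barrier: release phase.
   The master pushes updated ICVs to each worker's implicit task (when propagate_icvs is set)
   and then bumps every worker's b_go flag; each worker simply waits on its own b_go. */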
static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) {
            ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
            for (i=1; i<nproc; ++i) {
                __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                               &team->t.t_implicit_task_taskdata[0].td_icvs);
            }
            ngo_sync();
        }
#endif // KMP_BARRIER_ICV_PUSH

        // Now, release all of the worker threads
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go flag
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_go,
                          other_threads[i]->th.th_bar[bt].bb.b_go,
                          other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
            flag.release();
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                __kmp_itt_task_finished(itt_sync_obj); // Call prepare as early as possible for "new" barrier
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
            return;

        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
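
/* Tree barrier: gather phase.
   Arrival propagates bottom-up: each parent waits on the b_arrived flags of up to
   branch_factor children (reducing as it goes), then reports its own arrival to its parent;
   the master finally updates the team's arrived counter. */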
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent thread.  After performing this write, a worker thread may not
           assume that the team is valid any more - it could be deallocated at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
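
/* Tree barrier: release phase.
   The master wakes its children top-down; each released parent in turn wakes its own
   children, optionally pushing updated ICVs to each child's implicit task on the way. */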
static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                __kmp_itt_task_finished(itt_sync_obj); // Call prepare as early as possible for "new" barrier
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            if (propagate_icvs) {
                __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                         team, child_tid, FALSE);
                copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                          &team->t.t_implicit_task_taskdata[0].td_icvs);
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
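
/* Hyper (hypercube-embedded tree) barrier: gather phase.
   At each level a thread either reports its arrival to its parent for that level and stops,
   or waits for the children it owns at that level, folding in their reduce_data as they
   arrive.  The fan-in per level is controlled by the gather branch bits for this barrier
   type. */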
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go. */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            /* Mark arrival to parent thread.  After performing this write (in the last
               iteration of the enclosing for loop), a worker thread may not assume that the
               team is valid any more - it could be deallocated at any time. */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
#define KMP_REVERSE_HYPER_BAR
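
/* Hyper barrier: release phase.
   Mirrors the gather: each thread waits on its b_go flag and then releases the children it
   owns, level by level.  With KMP_REVERSE_HYPER_BAR defined (as above), children are released
   in the reverse of the gather order; the reverse variant seems to beat the forward one on
   average, hence it is enabled by default. */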
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been
       gathered.  If KMP_REVERSE_HYPER_BAR is defined (default), the threads are released in
       the reverse order of the corresponding gather, otherwise in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                __kmp_itt_task_finished(itt_sync_obj); // Call prepare as early as possible for "new" barrier
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
#else
                if (child+1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final destination
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
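
// Hierarchical Barrier

/* Initializes/re-initializes the hierarchical barrier data stored on a thread, performing
   only the minimum work required based on how the team has changed: the thread's level in
   the hierarchy, its parent, its byte offset in the parent's 64-bit flags, and the number of
   leaf children that check in on this thread's own flags.  Returns true when the team/thread
   mapping had to be rebuilt, so the caller knows the old leaf bookkeeping is stale. */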
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1;             // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d = 0;
            while (d < thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) {
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids = 0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i)
            ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}
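
/* Hierarchical barrier: gather phase.
   When blocktime is infinite and the team is not nested, leaf children check in on dedicated
   bytes of their parent's 64-bit b_arrived word (the on-core optimization); deeper subtrees
   are gathered through each child's own b_arrived flag.  Reductions are folded in as
   children arrive. */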
static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce)(void *, void *)
                                  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
    if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid)
                    ? thr_bar->b_arrived | thr_bar->leaf_state
                    : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                }
                // Clear leaf_state bits
                (void)KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state));
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
        else { // Blocktime is not infinite; wait on each child's own b_arrived flag
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread
    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent.  After performing this write, a worker thread may not assume
           that the team is valid any more - it could be deallocated at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
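
/* Hierarchical barrier: release phase.
   Workers wait either on their own b_go flag or, for leaves under the on-core optimization,
   on their byte of the parent's b_go word.  Each inner node then wakes its subtree: with
   infinite blocktime the cache line holding the fixed ICVs and b_go is pushed to each
   non-leaf child with an NGO store, and leaf children are released in bulk by setting the
   leaf_state bits of this thread's own b_go flag. */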
static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr,
                                   int gtid, int tid, int propagate_icvs
                                   USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB();  // Flush all pending memory write invalidates.
    }

    int level = team->t.t_level;
    if (team->t.t_threads[0]->th.th_teams_microtask) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    }
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
    nproc = this_thr->th.th_team_nproc;

    // If the team size has increased, we still communicate with old leaves via oncore barrier
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // Leaves pull parent's fixed ICVs directly to the local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // Non-leaves get ICVs piggybacked with b_go via the NGO store below
        }
        else { // blocktime is not infinite
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs so children can access them
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaf; pull parent's fixed ICVs directly to the local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loop releases all the non-leaf children
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc;
                     child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                                  "go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old, some new leaf kids
                    if (old_leaf_kids) // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) {
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing "
                                      "T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                                  "go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) // copy ICVs from fixed ICVs to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
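
// ---------------------------- End of Barrier Algorithms ----------------------------

/* Internal function to do a barrier.
   If is_split is true, do a split barrier (the master may defer the release to
   __kmp_end_split_barrier); otherwise, do a plain barrier.  If reduce is non-NULL, do a
   split reduction barrier, combining reduce_data with the given callback.
   Returns 0 if the master thread, 1 otherwise. */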
int
__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
              void *reduce_data, void (*reduce)(void *, void *))
{
    register int tid = __kmp_tid_from_gtid(gtid);
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team = this_thr->th.th_team;
    register int status = 0;
    ident_t *loc = __kmp_threads[gtid]->th.th_ident;

    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

    if (! team->t.t_serialized) {
#if USE_ITT_BUILD
        // This value will be used in itt notify events below
        void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
# endif
#endif /* USE_ITT_BUILD */
        if (__kmp_tasking_mode == tskm_extra_barrier) {
            __kmp_tasking_barrier(team, this_thr, gtid);
            KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
                          gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
        }

        /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it
           when the team struct is not guaranteed to exist. */
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
        // Let the debugger know: the thread arrived to the barrier and is waiting
        if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure
            team->t.t_bar[bt].b_master_arrived += 1;
        } else {
            this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
        }
#endif /* USE_DEBUGGER */
        if (reduce != NULL) {
            this_thr->th.th_local.reduce_data = reduce_data;
        }
        switch (__kmp_barrier_gather_pattern[bt]) {
        case bp_hyper_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        case bp_hierarchical_bar: {
            __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
                                              USE_ITT_BUILD_ARG(itt_sync_obj));
            break;
        }
        case bp_tree_bar: {
            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
            __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            break;
        }
        default: {
            __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
        }

        KMP_MB();

        if (KMP_MASTER_TID(tid)) {
            status = 0;
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
                __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only set up the current team
            }
#if USE_DEBUGGER
            // Let the debugger know: all threads have arrived and are starting to leave the barrier
            team->t.t_bar[bt].b_team_arrived += 1;
#endif

#if USE_ITT_BUILD
            /* TODO: In case of split reduction barrier, master thread may send acquired event
               early, before the final summation into the shared variable is done (final
               summation can be a long operation for array reductions). */
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier - report frame end (only if active_level == 1)
            if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
                this_thr->th.th_teams_microtask == NULL &&
#endif
                team->t.t_active_level == 1)
            {
                kmp_uint64 cur_time = __itt_get_timestamp();
                kmp_info_t **other_threads = team->t.t_threads;
                int nproc = this_thr->th.th_team_nproc;
                int i;
                switch (__kmp_forkjoin_frames_mode) {
                case 1:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                case 2:
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                    break;
                case 3:
                    if (__itt_metadata_add_ptr) {
                        // Initialize with master's wait time
                        kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                        for (i=1; i<nproc; ++i) {
                            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                        }
                        __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)(reduce != NULL));
                    }
                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                    this_thr->th.th_frame_time = cur_time;
                    break;
                }
            }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        } else {
            status = 1;
#if USE_ITT_BUILD
            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
        }
        if (status == 1 || ! is_split) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }

#if USE_ITT_BUILD
        /* GEH: TODO: Move this under the if-condition above and also include it in
           __kmp_end_split_barrier().  This will more accurately represent the actual release
           time of the threads for split barriers. */
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    } else { // Team is serialized
        status = 0;
        if (__kmp_tasking_mode != tskm_immediate_exec) {
#if OMP_41_ENABLED
            if ( this_thr->th.th_task_team != NULL ) {
                void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
                    itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
                    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
                }
#endif
                kmp_task_team_t * task_team = this_thr->th.th_task_team;
                KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
                __kmp_task_team_wait(this_thr, team
                                     USE_ITT_BUILD_ARG(itt_sync_obj));
                __kmp_task_team_setup(this_thr, team, 0, 0);
#if USE_ITT_BUILD
                if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
            }
#else
            // The task team should be NULL for serialized code (tasks will be executed immediately)
            KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
#endif
        }
    }
    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
    return status;
}
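
/* Releases the threads from a split barrier whose gather has already completed.  Only the
   master needs to act here; workers already ran the release side in __kmp_barrier(). */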
void
__kmp_end_split_barrier(enum barrier_type bt, int gtid)
{
    int tid = __kmp_tid_from_gtid(gtid);
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = this_thr->th.th_team;

    if (!team->t.t_serialized) {
        if (KMP_MASTER_GTID(gtid)) {
            switch (__kmp_barrier_release_pattern[bt]) {
            case bp_hyper_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
                                            USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            case bp_hierarchical_bar: {
                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
                                                   USE_ITT_BUILD_ARG(NULL));
                break;
            }
            case bp_tree_bar: {
                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
                                           USE_ITT_BUILD_ARG(NULL) );
                break;
            }
            default: {
                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
                                             USE_ITT_BUILD_ARG(NULL) );
            }
            }
            if (__kmp_tasking_mode != tskm_immediate_exec) {
                __kmp_task_team_sync(this_thr, team);
            }
        }
    }
}
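
/* Join barrier: gathers the team at the end of a parallel region.  Workers may not touch the
   team structure after signaling arrival, since the master can deallocate it at any time;
   the matching release happens in the fork barrier of the next parallel region. */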
void
__kmp_join_barrier(int gtid)
{
    register kmp_info_t *this_thr = __kmp_threads[gtid];
    register kmp_team_t *team;
    register kmp_uint nproc;
    kmp_info_t *master_thread;
    int tid;
#ifdef KMP_DEBUG
    int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
# if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
        // Get the object created at fork_barrier
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
# endif
#endif /* USE_ITT_BUILD */
    KMP_MB();

    // Get current info
    team = this_thr->th.th_team;
    nproc = this_thr->th.th_team_nproc;
    KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
    tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
    team_id = team->t.t_id;
#endif /* KMP_DEBUG */
    master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
    if (master_thread != team->t.t_threads[0]) {
        __kmp_print_structure();
    }
#endif /* KMP_DEBUG */
    KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
    KMP_MB();

    // Verify state
    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
    KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));

    if (__kmp_tasking_mode == tskm_extra_barrier) {
        __kmp_tasking_barrier(team, this_thr, gtid);
        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
    }
# ifdef KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
        KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
                      __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
                      this_thr->th.th_task_team));
        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
    }
# endif /* KMP_DEBUG */

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
       the team struct is not guaranteed to exist.  We do not perform the copy if
       blocktime=infinite, since the values are not used in that case. */
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
        this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                          USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    /* From this point on, the team data structure may be deallocated at any time by the
       master thread - it is unsafe to reference it in any of the worker threads. */
    if (KMP_MASTER_TID(tid)) {
        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_wait(this_thr, team
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
        }
#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

# if USE_ITT_BUILD && USE_ITT_NOTIFY
        // Join barrier - report frame end
        if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
#if OMP_40_ENABLED
            this_thr->th.th_teams_microtask == NULL &&
#endif
            team->t.t_active_level == 1)
        {
            kmp_uint64 cur_time = __itt_get_timestamp();
            ident_t * loc = team->t.t_ident;
            kmp_info_t **other_threads = team->t.t_threads;
            int nproc = this_thr->th.th_team_nproc;
            int i;
            switch (__kmp_forkjoin_frames_mode) {
            case 1:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                break;
            case 2:
                __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
                break;
            case 3:
                if (__itt_metadata_add_ptr) {
                    // Initialize with master's wait time
                    kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
                    for (i=1; i<nproc; ++i) {
                        delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
                    }
                    __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
                }
                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
                this_thr->th.th_frame_time = cur_time;
                break;
            }
        }
# endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    }
#if USE_ITT_BUILD
    else {
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
    }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
    if (KMP_MASTER_TID(tid)) {
        KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
                      gtid, team_id, tid, nproc));
    }
#endif /* KMP_DEBUG */

    KMP_MB(); // Flush all pending memory write invalidates.
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
}
void
__kmp_fork_barrier(int gtid, int tid)
{
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
    void * itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */

    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
                  gtid, (team != NULL) ? team->t.t_id : -1, tid));

    // th_team pointer is only valid for the master thread here
    if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            // Create itt barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
            __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
        register kmp_info_t **other_threads = team->t.t_threads;
        register int i;

        // Verify state
        KMP_MB();

        for (i=1; i<team->t.t_nproc; ++i) {
            KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
                           gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                           team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                           other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
            KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
                              & ~(KMP_BARRIER_SLEEP_STATE))
                             == KMP_INIT_BARRIER_STATE);
            KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
        }
#endif

        if (__kmp_tasking_mode != tskm_immediate_exec) {
            __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates set up both task teams if nthreads > 1
        }

        /* The master thread may have changed its blocktime between the join barrier and the
           fork barrier.  Copy the blocktime info to the thread, where __kmp_wait_template()
           can access it when the team struct is not guaranteed to exist. */
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
        }
    } // master

    switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
    case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
        __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
        break;
    }
    default: {
        __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
    }
    }

    // Early exit for reaping threads releasing forkjoin barrier
    if (TCR_4(__kmp_global.g.g_done)) {
        if (this_thr->th.th_task_team != NULL) {
            if (KMP_MASTER_TID(tid)) {
                TCW_PTR(this_thr->th.th_task_team, NULL);
            }
            else {
                __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
            }
        }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
            if (!KMP_MASTER_TID(tid)) {
                itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
                if (itt_sync_obj)
                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
            }
        }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
        return;
    }

    /* We can now assume that a valid team structure has been allocated by the master and
       propagated to all worker threads.  The current thread, however, may not be part of the
       team, so we can't blindly assume that the team pointer is non-null. */
    team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
    /* The master thread's copy of the ICVs was set up on the implicit taskdata in
       __kmp_reinitialize_team.  __kmp_fork_call() assumes the master's implicit task has this
       data before this function is called.  Workers copy the initial ICVs from the master's
       thread struct to the implicit task for this tid. */
    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
        KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
    }
#endif // KMP_BARRIER_ICV_PULL

    if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
    if (proc_bind == proc_bind_intel) {
#endif
#if KMP_AFFINITY_SUPPORTED
        // Call dynamic affinity settings
        if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
            __kmp_balanced_affinity(tid, team->t.t_nproc);
        }
#endif // KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    }
    else if (proc_bind != proc_bind_false) {
        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
            KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                           __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
        }
        else {
            __kmp_affinity_set_place(gtid);
        }
    }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (!KMP_MASTER_TID(tid)) {
            // Get the correct barrier object
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
        } // (prepare is called inside barrier_release)
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
}
void
__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc)
{
    KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
    KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

#if KMP_BARRIER_ICV_PULL
    /* Copy the ICVs into th_fixed_icvs on the master's thread structure (which remains
       untouched), where all of the worker threads can access them and make their own copies
       after the barrier. */
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated.
    copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
    // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
    KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
                  0, team->t.t_threads[0], team));
#else
    // Copy the ICVs to each of the non-master threads.  This takes O(nthreads) time.
    int f;
    ngo_load(new_icvs);
    KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated.
    for (f=1; f<new_nproc; ++f) { // Skip the master thread
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
        __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
        ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                      f, team->t.t_threads[f], team));
    }
    ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}