#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
# include <float.h>   // _control87() is used below to force 64-bit x87 precision
#endif
// Integer min/max limits for the index types used by the dispatcher.
template< typename T >
struct i_maxmin {
    static const T mx;
    static const T mn;
};
template<>
struct i_maxmin< int > {
    static const int mx = 0x7fffffff;
    static const int mn = 0x80000000;
};
template<>
struct i_maxmin< unsigned int > {
    static const unsigned int mx = 0xffffffff;
    static const unsigned int mn = 0x00000000;
};
template<>
struct i_maxmin< long long > {
    static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = 0x8000000000000000LL;
};
template<>
struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffLL;
    static const unsigned long long mn = 0x0000000000000000LL;
};
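/*
 * Illustrative sketch only (added for exposition, not part of the original
 * runtime source): the i_maxmin<> specializations above give the dispatcher a
 * per-type saturation value.  Assuming plain C++ outside the library, a
 * caller-side clamp after a possibly-overflowing upper-bound computation could
 * look like this (mirroring the clamping in __kmp_dist_get_bounds further down):
 */
#if 0
template< typename T >
static T __example_saturate_upper( T computed_upper, T old_upper, bool ascending )
{
    // If the chunked upper bound wrapped past the representable range,
    // pin it to the type's extreme instead of returning a wrapped value.
    if ( ascending && computed_upper < old_upper )
        return i_maxmin< T >::mx;
    if ( !ascending && computed_upper > old_upper )
        return i_maxmin< T >::mn;
    return computed_upper;
}
#endif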
#ifdef KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64}_t (per-thread loop bookkeeping), static-steal variant
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t UT;
    typedef typename traits_t< T >::signed_t   ST;
    UT count;   T ub;   T lb;   ST st;   UT tc;   // chunk counter, bounds, stride, trip count
    T  static_steal_counter;                      // static_steal only; bumped once stealing may start
    // parm1..parm4 are used together, so keep them in one cache line
    struct KMP_ALIGN( 32 ) { T parm1, parm2, parm3, parm4; };
    UT ordered_lower;  UT ordered_upper;          // bounds of the current chunk for ordered loops
    T  last_upper;                                // upper bound of the last chunk handed out
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64}_t, variant without static stealing:
// same bookkeeping fields as above, minus static_steal_counter
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t UT;
    typedef typename traits_t< T >::signed_t   ST;
    T  lb;   T ub;   ST st;   UT tc;              // bounds, stride, trip count
    UT count;                                     // chunk counter
    struct KMP_ALIGN( 32 ) { T parm1, parm2, parm3, parm4; };
    UT ordered_lower;  UT ordered_upper;
    T  last_upper;
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template< typename T >
struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate the alignment on the union to keep the structure size stable
    union KMP_ALIGN_CACHE private_info_tmpl {
        dispatch_private_infoXX_template< T > p;
        dispatch_private_info64_t             p64;
    } u;
    enum sched_type schedule;   // scheduling algorithm
    kmp_uint32      ordered;    // nonzero if the ordered clause was specified
    kmp_uint32      ordered_bumped;
    kmp_int32       ordered_dummy[KMP_MAX_ORDERED-3]; // padding to retain the structure size
    dispatch_private_info * next;  // stack of buffers for nested serialized regions
    kmp_uint32      nomerge;    // don't merge iterations if serialized
    kmp_uint32      type_size;
    enum cons_type  pushed_ws;
};
// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
template< typename UT >
struct dispatch_shared_infoXX_template {
    /* chunk index under dynamic; iteration index otherwise */
    volatile UT iteration;
    volatile UT num_done;
    volatile UT ordered_iteration;
    UT ordered_dummy[KMP_MAX_ORDERED-1];   // padding to retain the structure size
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template< typename UT >
struct dispatch_shared_info_template {
    // the union keeps the structure size equal to the untyped original
    union shared_info_tmpl {
        dispatch_shared_infoXX_template< UT > s;
        dispatch_shared_info64_t              s64;
    } u;
    volatile kmp_uint32 buffer_index;
};
#undef USE_TEST_LOCKS

// test_then_add template (the general template should NOT be instantiated)
template< typename T >
static __forceinline T
test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }

template<>
__forceinline kmp_int32
test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_ADD32( p, d );
    return r;
}

template<>
__forceinline kmp_int64
test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_ADD64( p, d );
    return r;
}

// test_then_inc_acq template (the general template should NOT be instantiated)
template< typename T >
static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (the general template should NOT be instantiated)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); }

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (the general template should NOT be instantiated)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}
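/*
 * Illustrative sketch only (added for exposition, not part of the original
 * runtime source): the typed wrappers above let template code pick the
 * correct-width atomic without naming the 32/64-bit macro directly.  For
 * example, a template bumping a shared ticket counter of signed type ST:
 */
#if 0
template< typename ST >
static ST __example_next_ticket( volatile ST * counter )
{
    // Resolves to KMP_TEST_THEN_INC32 or KMP_TEST_THEN_INC64 through the
    // specializations above; returns the value the counter had before the
    // increment.  Only kmp_int32/kmp_int64 instantiations are meaningful.
    return test_then_inc< ST >( counter );
}
#endif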
template< typename UT >
static UT   // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG( void * obj )   // higher-level synchronization object, or NULL
                  )
{
    // note: we may not belong to a team at this point
    register volatile UT *        spin  = spinner;
    register UT                   check = checker;
    register kmp_uint32           spins;
    register kmp_uint32 (*f) ( UT, UT ) = pred;
    register UT                   r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        // if we are oversubscribed, or have waited a bit, yield;
        // the pause is inside the following macros
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
template< typename UT >
static kmp_uint32 __kmp_eq( UT value, UT checker) {
    return value == checker;
}

template< typename UT >
static kmp_uint32 __kmp_neq( UT value, UT checker) {
    return value != checker;
}

template< typename UT >
static kmp_uint32 __kmp_lt( UT value, UT checker) {
    return value < checker;
}

template< typename UT >
static kmp_uint32 __kmp_ge( UT value, UT checker) {
    return value >= checker;
}

template< typename UT >
static kmp_uint32 __kmp_le( UT value, UT checker) {
    return value <= checker;
}
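/*
 * Illustrative sketch only (added for exposition, not part of the original
 * runtime source): how the spin helper and the predicate templates above
 * combine.  A thread that must not proceed until a shared ticket counter
 * reaches its own value would spin like this (the names are made up for the
 * example; the call shape mirrors the real uses later in this file):
 */
#if 0
static void __example_wait_for_ticket( volatile kmp_uint32 * ticket, kmp_uint32 mine )
{
    // blocks (yielding when oversubscribed) until *ticket >= mine
    __kmp_wait_yield< kmp_uint32 >( ticket, mine, __kmp_ge< kmp_uint32 >
                                    USE_ITT_BUILD_ARG( NULL )
                                    );
}
#endif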
static void
__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    KMP_DEBUG_ASSERT( gtid_ref );
    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_root -> r.r_active
          && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
        }
    }
}
template< typename UT >
static void
__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh =
            reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );
        UT lower;

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }
        lower = pr->u.p.ordered_lower;

#if ! defined( KMP_GOMP_COMPAT )
        if ( __kmp_env_consistency_check ) {
            if ( pr->ordered_bumped ) {
                struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                __kmp_error_construct2(
                    kmp_i18n_msg_CnsMultipleNesting,
                    ct_ordered_in_pdo, loc_ref,
                    & p->stack_data[ p->w_top ] );
            }
        }
#endif /* !KMP_GOMP_COMPAT */

#ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
#endif

        __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                USE_ITT_BUILD_ARG( NULL )
                                );
#ifdef KMP_DEBUG
        {
            const char * buff;
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
#endif
    }
    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
}
static void
__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
            __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
        }
    }
}
template< typename UT >
static void
__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
            __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh =
            reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }

        KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
#if ! defined( KMP_GOMP_COMPAT )
        if ( __kmp_env_consistency_check ) {
            if ( pr->ordered_bumped != 0 ) {
                struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                __kmp_error_construct2(
                    kmp_i18n_msg_CnsMultipleNesting,
                    ct_ordered_in_pdo, loc_ref,
                    & p->stack_data[ p->w_top ] );
            }
        }
#endif /* !KMP_GOMP_COMPAT */

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        /* release the next iteration of the ordered section */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}
// Computes x^y for 0 < x < 1 by binary exponentiation.
template< typename UT >
static __forceinline long double
__kmp_pow( long double x, UT y ) {
    long double s = 1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}
/* Returns the number of iterations still unassigned after idx chunks have been
   handed out, i.e. ceil( tc * base^idx ). */
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    typedef typename traits_t< T >::unsigned_t UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   parm2 = guided_int_param * nproc * ( chunk + 1 )   -- point of switching to dynamic
//   parm3 = guided_flt_param / nproc                   -- multiplier applied to the remaining iterations
static int    guided_int_param = 2;
static double guided_flt_param = 0.5;   // = 1.0 / guided_int_param
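/*
 * Illustrative sketch only (added for exposition, not part of the original
 * runtime source): with base = 1 - guided_flt_param / nproc, the guided
 * schedule hands out chunk k of roughly remaining(k) - remaining(k+1)
 * iterations, where remaining(k) = ceil( tc * base^k ) is exactly what
 * __kmp_dispatch_guided_remaining() computes.  A quick standalone check of
 * that recurrence in plain C++:
 */
#if 0
#include <cstdio>
static void __example_guided_chunks( unsigned long long tc, int nproc )
{
    long double base = 1.0L - 0.5L / nproc;     // = 1 - guided_flt_param / nproc
    long double powk = 1.0L;                    // base^(idx+1), kept incrementally
    unsigned long long remaining = tc;
    for ( unsigned idx = 0; remaining > 0 && idx < 8; ++idx ) {
        powk *= base;
        long double x = tc * powk;              // remaining(idx+1) before rounding up
        unsigned long long next = (unsigned long long) x;
        if ( x != (long double) next )
            ++next;                             // ceil, as in __kmp_dispatch_guided_remaining
        printf( "chunk %u: %llu iterations\n", idx, remaining - next );
        remaining = next;
    }
}
#endif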
template< typename T >
static void
__kmp_dispatch_init(
    ident_t                        * loc,
    int                              gtid,
    enum sched_type                  schedule,
    T                                lb,
    T                                ub,
    typename traits_t< T >::signed_t st,
    typename traits_t< T >::signed_t chunk,
    int                              push_ws )
{
    typedef typename traits_t< T >::unsigned_t UT;
    typedef typename traits_t< T >::signed_t   ST;
    typedef typename traits_t< T >::floating_t DBL;
    static const int ___kmp_size_type = sizeof( UT );

    int           active;
    T             tc;
    kmp_info_t *  th;
    kmp_team_t *  team;
    kmp_uint32    my_buffer_index;
    dispatch_private_info_template< T >          * pr;
    dispatch_shared_info_template< UT > volatile * sh;

    KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
    KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );

    if ( ! TCR_4( __kmp_init_parallel ) )
        __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
            traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
        __kmp_str_free( &buff );
    }
#endif
    /* setup data */
    th     = __kmp_threads[ gtid ];
    team   = th -> th.th_team;
    active = ! team -> t.t_serialized;
    th->th.th_ident = loc;

    kmp_uint64 cur_chunk = chunk;   // chunk size used for the ITT metadata report
    if ( ! active ) {
        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
    } else {
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        my_buffer_index = th->th.th_dispatch->th_disp_index ++;

        pr = reinterpret_cast< dispatch_private_info_template< T > * >
            ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
        sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
            ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
    }
    pr->type_size = ___kmp_size_type;   // remember the size of the index type

    /* Pick up the schedule actually to be used */
    if ( schedule == kmp_sch_static ) {
        schedule = __kmp_static;
    } else {
        if ( schedule == kmp_sch_runtime ) {
            // Use the scheduling specified by OMP_SCHEDULE (or the default if not specified)
            schedule = team -> t.t_sched.r_sched_type;
            // Detail the schedule if needed (global controls are differentiated appropriately)
            if ( schedule == kmp_sch_guided_chunked ) {
                schedule = __kmp_guided;
            } else if ( schedule == kmp_sch_static ) {
                schedule = __kmp_static;
            }
            // Use the chunk size specified by OMP_SCHEDULE (or the default if not specified)
            chunk = team -> t.t_sched.chunk;
#ifdef KMP_DEBUG
            {
                const char * buff;
                buff = __kmp_str_format(
                    "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
                    traits_t< ST >::spec );
                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
                __kmp_str_free( &buff );
            }
#endif
        } else {
            if ( schedule == kmp_sch_guided_chunked ) {
                schedule = __kmp_guided;
            }
            if ( chunk <= 0 ) {
                chunk = KMP_DEFAULT_CHUNK;
            }
        }

        if ( schedule == kmp_sch_auto ) {
            // the mapping is set up in __kmp_do_serial_initialize()
            schedule = __kmp_auto;
#ifdef KMP_DEBUG
            {
                const char * buff;
                buff = __kmp_str_format(
                    "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
                    traits_t< ST >::spec );
                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
                __kmp_str_free( &buff );
            }
#endif
        }

        /* guided analytical is not safe for too many threads */
        if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
            schedule = kmp_sch_guided_iterative_chunked;
            KMP_WARNING( DispatchManyThreads );
        }
        pr->u.p.parm1 = chunk;
    }
    KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
                 "unknown scheduling type" );
    if ( __kmp_env_consistency_check ) {
        if ( st == 0 ) {
            __kmp_error_construct(
                kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc );
        }
    }

    /* compute the trip count */
    if ( st == 1 ) {
        tc = ( ub - lb + st );
    } else if ( ub < lb ) {
        tc = 0;   // zero-trip loop
    }

    pr->u.p.last_upper = ub + st;
    if ( pr->ordered == 0 ) {
        th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
        th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
        pr->ordered_bumped = 0;

        pr->u.p.ordered_lower = 1;
        pr->u.p.ordered_upper = 0;

        th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
        th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
    }

    if ( __kmp_env_consistency_check ) {
        enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
        if ( push_ws ) {
            __kmp_push_workshare( gtid, ws, loc );
            pr->pushed_ws = ws;
        } else {
            __kmp_check_workshare( gtid, ws, loc );
            pr->pushed_ws = ct_none;
        }
    }
    switch ( schedule ) {
#if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
    case kmp_sch_static_steal:
        {
            T nproc = team->t.t_nproc;
            T ntc, init;

            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );

            ntc = (tc % chunk ? 1 : 0) + tc / chunk;
            if ( nproc > 1 && ntc >= nproc ) {
                T id = __kmp_tid_from_gtid(gtid);
                T small_chunk, extras;

                small_chunk = ntc / nproc;
                extras = ntc % nproc;

                init = id * small_chunk + ( id < extras ? id : extras );
                pr->u.p.count = init;
                pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
                break;
            } else {
                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
                               gtid ) );
                schedule = kmp_sch_static_balanced;
                /* too few iterations: fall through to kmp_sch_static_balanced */
            }
        } // case
        // fall through
#endif
    case kmp_sch_static_balanced:
        {
            T nproc = team->t.t_nproc;
            T init, limit;

            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
                           gtid ) );

            if ( nproc > 1 ) {
                T id = __kmp_tid_from_gtid(gtid);

                if ( tc < nproc ) {
                    if ( id < tc ) {
                        init  = id;
                        limit = id;
                        pr->u.p.parm1 = ( id == tc - 1 );   /* parm1 stores *plastiter */
                    } else {
                        pr->u.p.count = 1;                  /* no chunks to execute */
                        pr->u.p.parm1 = FALSE;
                        break;
                    }
                } else {
                    T small_chunk = tc / nproc;
                    T extras = tc % nproc;
                    init  = id * small_chunk + ( id < extras ? id : extras );
                    limit = init + small_chunk - ( id < extras ? 0 : 1 );
                    pr->u.p.parm1 = ( id == nproc - 1 );
                }
            } else {
                if ( tc > 0 ) {
                    init  = 0;
                    limit = tc - 1;
                    pr->u.p.parm1 = TRUE;
                } else {
                    // zero trip count
                    pr->u.p.count = 1;                      /* no chunks to execute */
                    pr->u.p.parm1 = FALSE;
                    break;
                }
            }
            if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
                cur_chunk = limit - init + 1;   // chunk size for the ITT metadata report
            }
            if ( st == 1 ) {
                pr->u.p.lb = lb + init;
                pr->u.p.ub = lb + limit;
            } else {
                T ub_tmp = lb + limit * st;     // calculated upper bound; "ub" is the user-defined one
                pr->u.p.lb = lb + init * st;
                // adjust the upper bound to "ub" if needed, so lastprivate matches it exactly
                if ( st > 0 ) {
                    pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
                } else {
                    pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
                }
            }
            if ( pr->ordered ) {
                pr->u.p.ordered_lower = init;
                pr->u.p.ordered_upper = limit;
            }
            break;
        } // case
    case kmp_sch_guided_iterative_chunked :
        {
            T nproc = team->t.t_nproc;
            KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));

            if ( nproc > 1 ) {
                if ( (2L * chunk + 1 ) * nproc >= tc ) {
                    /* chunk size too large: switch to dynamic */
                    schedule = kmp_sch_dynamic_chunked;
                } else {
                    // when the remaining iterations drop below parm2, switch to dynamic
                    pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
                    *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
                }
            } else {
                KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
                schedule = kmp_sch_static_greedy;
                /* team->t.t_nproc == 1: behave as kmp_sch_static_greedy */
                KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
                pr->u.p.parm1 = tc;
            }
        }
        break;
    case kmp_sch_guided_analytical_chunked:
        {
            T nproc = team->t.t_nproc;
            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));

            if ( nproc > 1 ) {
                if ( (2L * chunk + 1 ) * nproc >= tc ) {
                    /* chunk size too large: switch to dynamic */
                    schedule = kmp_sch_dynamic_chunked;
                } else {
                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
                    DBL x;
                    UT  cross;

#if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Windows* OS on IA-32 defaults to 53-bit x87 precision;
                       the computation below needs full 64-bit precision */
                    unsigned int oldFpcw = _control87(0,0);
                    _control87(_PC_64,_MCW_PC);
#endif
                    /* value used for comparison in the solver for the crossover point */
                    long double target = ((long double)chunk * 2 + 1) * nproc / tc;

                    /* crossover point: chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
                    x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
                    { // check natural alignment of parm3 before storing a DBL into it
                        struct _test_a { char a; union { char b; DBL d; }; } t;
                        ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
                        KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
                    }
#endif
                    /* save the term in the thread-private dispatch structure */
                    *(DBL*)&pr->u.p.parm3 = x;

                    /* solve for the crossover point: the nearest integer i for which C_i <= chunk */
                    {
                        UT left, right, mid;
                        long double p;

                        /* estimate initial upper and lower bound; any positive starting
                           value works, it only affects the speed of the solver */
                        right = 229;
                        p = __kmp_pow< UT >(x,right);
                        if ( p > target ) {
                            do {
                                p *= p;
                                right <<= 1;
                            } while(p>target && right < (1<<27));
                            left = right >> 1;   /* lower bound is the previous (failed) estimate */
                        } else {
                            left = 0;
                        }

                        /* bisection root-finding method */
                        while ( left + 1 < right ) {
                            mid = (left + right) / 2;
                            if ( __kmp_pow< UT >(x,mid) > target ) {
                                left = mid;
                            } else {
                                right = mid;
                            }
                        }
                        cross = right;
                    }
                    /* assert sanity of the computed crossover point */
                    KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);

                    /* save the crossover point in the thread-private dispatch structure */
                    pr->u.p.parm2 = cross;
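                    /* Reading of the computation above (added for orientation, not from the
                       original source): with x = 1 - 1/(2*nproc) and
                       target = ((2*chunk + 1) * nproc) / tc, the bisection finds the smallest
                       chunk index "cross" at which the analytically guided chunk size
                       tc * x^cross / (2*nproc) has dropped to roughly the specified chunk.
                       parm2 records it so that __kmp_dispatch_next() can switch to plain
                       chunked dispatch from that index on. */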
#if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
                    // on IA-32 without 8-byte indexes, re-read the stored (possibly rounded) value
#define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
                    /* dynamic-style scheduling offset */
                    pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* restore the FPCW */
                    _control87(oldFpcw,_MCW_PC);
#endif
                }
            } else {
                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
                               gtid ) );
                schedule = kmp_sch_static_greedy;
                /* team->t.t_nproc == 1: behave as kmp_sch_static_greedy */
                pr->u.p.parm1 = tc;
            }
        } // case
        break;
    case kmp_sch_static_greedy:
        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
        pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
            ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
            tc;
        break;
    case kmp_sch_static_chunked :
    case kmp_sch_dynamic_chunked :
        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
        break;
    case kmp_sch_trapezoidal :
        {
            /* TSS: trapezoid self-scheduling, minimum chunk size = parm1 */
            T parm1, parm2, parm3, parm4;
            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );

            parm1 = chunk;

            /* F : size of the first cycle */
            parm2 = ( tc / (2 * team->t.t_nproc) );
            if ( parm2 < 1 ) {
                parm2 = 1;
            }

            /* L : size of the last cycle; it must not be larger than the first one */
            if ( parm1 < 1 ) {
                parm1 = 1;
            } else if ( parm1 > parm2 ) {
                parm1 = parm2;
            }

            /* N : number of cycles */
            parm3 = ( parm2 + parm1 );
            parm3 = ( 2 * tc + parm3 - 1) / parm3;
            if ( parm3 < 2 ) {
                parm3 = 2;
            }

            /* sigma : decreasing increment of the trapezoid */
            parm4 = ( parm3 - 1 );
            parm4 = ( parm2 - parm1 ) / parm4;

            pr->u.p.parm1 = parm1;
            pr->u.p.parm2 = parm2;
            pr->u.p.parm3 = parm3;
            pr->u.p.parm4 = parm4;
        }
        break;
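        /* Worked example (reading of the formulas above, added for exposition and
           not taken from the original source): tc = 1000, nproc = 4, chunk = 1 gives
               parm2 = 1000 / (2*4)           = 125   (first chunk size)
               parm1 = 1                              (last chunk size)
               parm3 = (2*1000 + 126 - 1)/126 = 16    (number of chunks)
               parm4 = (125 - 1)/(16 - 1)     = 8     (per-chunk decrement)
           so __kmp_dispatch_next() below hands out chunks of 125, 117, 109, ...
           iterations until the trip count is exhausted. */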
    default:
        {
            __kmp_msg(
                kmp_ms_fatal,                        // Severity
                KMP_MSG( UnknownSchedTypeDetected ), // Primary message
                KMP_HNT( GetNewerLibrary ),          // Hint
                __kmp_msg_null                       // Variadic argument list terminator
            );
        }
        break;
    } // switch

    pr->schedule = schedule;

    if ( active ) {
        /* the buffer becomes ours when sh->buffer_index reaches my_buffer_index */
        KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
                       gtid, my_buffer_index, sh->buffer_index) );
        __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
                                        USE_ITT_BUILD_ARG( NULL )
                                        );
        KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
                       gtid, my_buffer_index, sh->buffer_index) );

        th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
        th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
        if ( pr->ordered ) {
            __kmp_itt_ordered_init( gtid );
        }
        /* report the loop metadata to ITT */
        if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
            kmp_uint32 tid = __kmp_tid_from_gtid( gtid );
            if (KMP_MASTER_TID(tid)) {
                kmp_uint64 schedtype = 0;

                switch ( schedule ) {
                case kmp_sch_static_chunked:
                case kmp_sch_static_balanced:   // chunk was calculated above
                    break;
                case kmp_sch_static_greedy:
                    cur_chunk = pr->u.p.parm1;
                    break;
                case kmp_sch_dynamic_chunked:
                    schedtype = 1;
                    break;
                case kmp_sch_guided_iterative_chunked:
                case kmp_sch_guided_analytical_chunked:
                    schedtype = 2;
                    break;
                default:
                    schedtype = 3;
                    break;
                }
                __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
            }
        }
    } // if ( active )
#ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
            " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
            " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
            traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
            traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
            traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
            traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
        KD_TRACE(10, ( buff,
            gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
            pr->u.p.st, pr->u.p.tc, pr->u.p.count,
            pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
            pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
        __kmp_str_free( &buff );
    }
#endif
#if ( KMP_STATIC_STEAL_ENABLED )
    if ( ___kmp_size_type < 8 ) {
        // A dedicated per-loop flag is needed so that potential victims can tell
        // whether this thread has finished initializing its static_steal data.
        if ( schedule == kmp_sch_static_steal ) {
            // Other threads will inspect this variable when searching for a victim.
            volatile T * p = &pr->u.p.static_steal_counter;
            *p = *p + 1;
        }
    }
#endif // ( KMP_STATIC_STEAL_ENABLED )
}
template< typename UT >
static void
__kmp_dispatch_finish( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
    if ( ! th -> th.th_team -> t.t_serialized ) {

        dispatch_private_info_template< UT > * pr =
            reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th->th.th_dispatch->th_dispatch_pr_current );
        dispatch_shared_info_template< UT > volatile * sh =
            reinterpret_cast< dispatch_shared_info_template< UT > volatile* >
                ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( pr );
        KMP_DEBUG_ASSERT( sh );
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        if ( pr->ordered_bumped ) {
            KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                            gtid ) );
            pr->ordered_bumped = 0;
        } else {
            UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
            {
                const char * buff;
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
                __kmp_str_free( &buff );
            }
#endif

            __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                    USE_ITT_BUILD_ARG(NULL)
                                    );
#ifdef KMP_DEBUG
            {
                const char * buff;
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
                __kmp_str_free( &buff );
            }
#endif

            test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
        }
    }
    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
}
#ifdef KMP_GOMP_COMPAT

template< typename UT >
static void
__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
    if ( ! th -> th.th_team -> t.t_serialized ) {

        dispatch_private_info_template< UT > * pr =
            reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th->th.th_dispatch->th_dispatch_pr_current );
        dispatch_shared_info_template< UT > volatile * sh =
            reinterpret_cast< dispatch_shared_info_template< UT > volatile* >
                ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( pr );
        KMP_DEBUG_ASSERT( sh );
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        UT lower = pr->u.p.ordered_lower;
        UT upper = pr->u.p.ordered_upper;
        UT inc = upper - lower + 1;

        if ( pr->ordered_bumped == inc ) {
            KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
                            gtid ) );
            pr->ordered_bumped = 0;
        } else {
            inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
            {
                const char * buff;
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
                    "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
                __kmp_str_free( &buff );
            }
#endif

            __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                    USE_ITT_BUILD_ARG(NULL)
                                    );

            KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
                            gtid ) );
            pr->ordered_bumped = 0;

#ifdef KMP_DEBUG
            {
                const char * buff;
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
                    "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
                __kmp_str_free( &buff );
            }
#endif

            test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc );
        }
    }
    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
}

#endif /* KMP_GOMP_COMPAT */
template< typename T >
static int
__kmp_dispatch_next(
    ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub,
    typename traits_t< T >::signed_t *p_st )
{
    typedef typename traits_t< T >::unsigned_t UT;
    typedef typename traits_t< T >::signed_t   ST;
    typedef typename traits_t< T >::floating_t DBL;
    static const int ___kmp_size_type = sizeof( UT );

    int                                   status;
    dispatch_private_info_template< T > * pr;
    kmp_info_t                          * th   = __kmp_threads[ gtid ];
    kmp_team_t                          * team = th -> th.th_team;

    KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st );   // these cannot be NULL

#ifdef KMP_DEBUG
    {
        const char * buff;
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
        __kmp_str_free( &buff );
    }
#endif
    if ( team -> t.t_serialized ) {
        /* NOTE: we are not at the active level, so dispatch is serialized */
        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
        KMP_DEBUG_ASSERT( pr );

        if ( (status = (pr->u.p.tc != 0)) == 0 ) {
            if ( __kmp_env_consistency_check ) {
                if ( pr->pushed_ws != ct_none ) {
                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                }
            }
        } else if ( pr->nomerge ) {
            kmp_int32 last;
            T         start;
            UT        limit, trip, init;
            ST        incr;
            T         chunk = pr->u.p.parm1;

            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );

            init = chunk * pr->u.p.count++;
            trip = pr->u.p.tc - 1;

            if ( (status = (init <= trip)) == 0 ) {
                if ( __kmp_env_consistency_check ) {
                    if ( pr->pushed_ws != ct_none ) {
                        pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                    }
                }
            } else {
                start = pr->u.p.lb;
                incr  = pr->u.p.st;
                limit = chunk + init - 1;

                if ( (last = (limit >= trip)) != 0 ) {
                    limit = trip;
                    pr->u.p.last_upper = pr->u.p.ub;
                }
                if ( p_last != NULL )
                    *p_last = last;
                if ( p_st != NULL )
                    *p_st = incr;
                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                } else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                    {
                        const char * buff;
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
#endif
                }
            }
        } else {
            /* chunks are merged: the whole remaining loop goes to this thread */
            pr->u.p.tc = 0;
            *p_lb = pr->u.p.lb;
            *p_ub = pr->u.p.ub;
            pr->u.p.last_upper = *p_ub;
            if ( p_last != NULL )
                *p_last = TRUE;
            if ( p_st != NULL )
                *p_st = pr->u.p.st;
        }
#ifdef KMP_DEBUG
        {
            const char * buff;
            buff = __kmp_str_format(
                "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
                "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
                traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
            KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
            __kmp_str_free( &buff );
        }
#endif
#if INCLUDE_SSC_MARKS
        SSC_MARK_DISPATCH_NEXT();
#endif
        return status;
    } else {
        kmp_int32 last = 0;
        dispatch_shared_info_template< UT > *sh;
        T  start;
        ST incr;
        UT limit, trip, init;

        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th->th.th_dispatch->th_dispatch_pr_current );
        KMP_DEBUG_ASSERT( pr );
        sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( sh );

        if ( pr->u.p.tc == 0 ) {
            // zero trip count: nothing to hand out
            status = 0;
        } else {
        switch (pr->schedule) {
#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
        case kmp_sch_static_steal:
            {
                T chunk = pr->u.p.parm1;

                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );

                trip = pr->u.p.tc - 1;

                if ( ___kmp_size_type > 4 ) {
                    // stealing is not implemented for 8-byte induction variables:
                    // fall back to a plain increment of the private chunk counter
                    init = ( pr->u.p.count )++;
                    status = ( init < (UT)pr->u.p.ub );
                } else {
                    typedef union {
                        struct {
                            UT count;
                            T  ub;
                        } p;
                        kmp_int64 b;
                    } union_i4;
                    // all operations on 'count' and 'ub' must be combined atomically
                    // together, hence the 64-bit view 'b' of the pair
                    {
                        union_i4 vold, vnew;
                        vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
                        vnew = vold;
                        vnew.p.count++;
                        while( ! KMP_COMPARE_AND_STORE_ACQ64(
                                    ( volatile kmp_int64* )&pr->u.p.count,
                                    *VOLATILE_CAST(kmp_int64 *)&vold.b,
                                    *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
                            KMP_CPU_PAUSE();
                            vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
                            vnew = vold;
                            vnew.p.count++;
                        }
                        vnew = vold;
                        init   = vnew.p.count;
                        status = ( init < (UT)vnew.p.ub ) ;
                    }

                    if( !status ) {
                        // nothing left locally: try to steal from another thread
                        kmp_info_t **other_threads = team->t.t_threads;
                        int          while_limit = 10;
                        int          while_index = 0;

                        while ( ( !status ) && ( while_limit != ++while_index ) ) {
                            union_i4  vold, vnew;
                            kmp_int32 remaining;
                            T         victimIdx    = pr->u.p.parm4;
                            T         oldVictimIdx = victimIdx;
                            dispatch_private_info_template< T > * victim;

                            do {
                                if( !victimIdx ) {
                                    victimIdx = team->t.t_nproc - 1;
                                } else {
                                    --victimIdx;
                                }
                                victim = reinterpret_cast< dispatch_private_info_template< T >* >
                                    ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
                            } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );

                            if ( ( !victim ) ||
                               ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
                                 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
                                // the victim has not yet initialized its static_steal data for this loop
                                continue;
                            }
                            if ( oldVictimIdx == victimIdx ) {
                                break;
                            }
                            pr->u.p.parm4 = victimIdx;

                            while( 1 ) {
                                vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
                                vnew = vold;

                                KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
                                if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
                                    break;   // not enough chunks left to be worth stealing
                                }
                                vnew.p.ub -= (remaining >> 2);   // steal a quarter of the remaining chunks
                                KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
#pragma warning( push )
                                // stripping off 'volatile' generates warning 186
#pragma warning( disable: 186 )
                                KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
#pragma warning( pop )
                                if ( KMP_COMPARE_AND_STORE_ACQ64(
                                        ( volatile kmp_int64 * )&victim->u.p.count,
                                        *VOLATILE_CAST(kmp_int64 *)&vold.b,
                                        *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
                                    // stealing succeeded: adopt the stolen range as our own count/ub
                                    status = 1;
                                    while_index = 0;
#if KMP_ARCH_X86
                                    // no atomic 64-bit store on IA-32: update the fields in steps
                                    init = vold.p.count;
                                    pr->u.p.ub = 0;
                                    pr->u.p.count = init + 1;
                                    pr->u.p.ub = vnew.p.count;
#else
                                    init = vnew.p.ub;
                                    vold.p.count = init + 1;
                                    *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
#endif // KMP_ARCH_X86
                                    break;
                                }
                                KMP_CPU_PAUSE();
                            } // while (1)
                        } // while
                    } // if ( !status )
                }

                if ( !status ) {
                    *p_lb = 0;
                    *p_ub = 0;
                    if ( p_st != NULL ) *p_st = 0;
                } else {
                    start = pr->u.p.parm2;
                    init *= chunk;
                    limit = chunk + init - 1;
                    incr  = pr->u.p.st;

                    KMP_DEBUG_ASSERT(init <= trip);
                    if ( (last = (limit >= trip)) != 0 )
                        limit = trip;
                    if ( p_st != NULL ) *p_st = incr;

                    if ( incr == 1 ) {
                        *p_lb = start + init;
                        *p_ub = start + limit;
                    } else {
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                    }

                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                        {
                            const char * buff;
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
#endif
                    }
                }
                break;
            } // case
#endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
        case kmp_sch_static_balanced:
            {
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
                if ( (status = !pr->u.p.count) != 0 ) {   /* our single chunk has not been claimed yet */
                    pr->u.p.count = 1;
                    *p_lb = pr->u.p.lb;
                    *p_ub = pr->u.p.ub;
                    last  = pr->u.p.parm1;
                    if ( p_st != NULL )
                        *p_st = pr->u.p.st;
                } else {   /* no iterations left */
                    pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
                }
                if ( pr->ordered ) {
#ifdef KMP_DEBUG
                    {
                        const char * buff;
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
#endif
                }
                break;
            } // case
        case kmp_sch_static_greedy:   /* the original code for kmp_sch_static_greedy was merged here */
        case kmp_sch_static_chunked:
            {
                T parm1;

                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
                               gtid ) );
                parm1 = pr->u.p.parm1;

                trip = pr->u.p.tc - 1;
                init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));

                if ( (status = (init <= trip)) != 0 ) {
                    start = pr->u.p.lb;
                    incr  = pr->u.p.st;
                    limit = parm1 + init - 1;

                    if ( (last = (limit >= trip)) != 0 )
                        limit = trip;

                    if ( p_st != NULL ) *p_st = incr;

                    pr->u.p.count += team->t.t_nproc;

                    if ( incr == 1 ) {
                        *p_lb = start + init;
                        *p_ub = start + limit;
                    } else {
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                    }

                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                        {
                            const char * buff;
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
#endif
                    }
                }
                break;
            } // case
        case kmp_sch_dynamic_chunked:
            {
                T chunk = pr->u.p.parm1;

                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                               gtid ) );

                init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                trip = pr->u.p.tc - 1;

                if ( (status = (init <= trip)) == 0 ) {
                    if ( p_st != NULL ) *p_st = 0;
                } else {
                    start = pr->u.p.lb;
                    limit = chunk + init - 1;
                    incr  = pr->u.p.st;

                    if ( (last = (limit >= trip)) != 0 )
                        limit = trip;

                    if ( p_st != NULL ) *p_st = incr;

                    if ( incr == 1 ) {
                        *p_lb = start + init;
                        *p_ub = start + limit;
                    } else {
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                    }

                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                        {
                            const char * buff;
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
#endif
                    }
                }
                break;
            } // case
        case kmp_sch_guided_iterative_chunked:
            {
                T chunkspec = pr->u.p.parm1;
                KD_TRACE(100,
                    ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
                trip = pr->u.p.tc;
                /* start the atomic part of the computation */
                while(1) {
                    ST remaining;                 // signed, because it can become negative
                    init = sh->u.s.iteration;     // shared value
                    remaining = trip - init;
                    if ( remaining <= 0 ) {       // all iterations have been claimed
                        status = 0;               // nothing to do, don't try the atomic op
                        break;
                    }
                    if ( (T)remaining < pr->u.p.parm2 ) {
                        // below K*nproc*(chunk+1): use dynamic-style scheduling;
                        // atomically increment the iteration count and take the old value
                        init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
                        remaining = trip - init;
                        if (remaining <= 0) {
                            status = 0;           // all iterations were taken by other threads
                        } else {
                            status = 1;           // got some iterations to work on
                            if ( (T)remaining > chunkspec ) {
                                limit = init + chunkspec - 1;
                            } else {
                                last  = 1;        // the last chunk
                                limit = init + remaining - 1;
                            }
                        }
                        break;
                    }
                    limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 );   // guided chunk
                    if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
                        // CAS succeeded: the chunk [init, limit) is ours
                        status = 1;
                        --limit;
                        break;
                    }
                } // while
                if ( status != 0 ) {
                    start = pr->u.p.lb;
                    incr  = pr->u.p.st;
                    if ( p_st != NULL )
                        *p_st = incr;
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                        {
                            const char * buff;
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
#endif
                    }
                }
                break;
            } // case
        case kmp_sch_guided_analytical_chunked:
            {
                T  chunkspec = pr->u.p.parm1;
                UT chunkIdx;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
                /* for storing the original FPCW value on Windows* OS on IA-32 */
                unsigned int oldFpcw;
                unsigned int fpcwSet = 0;
#endif
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
                               gtid ) );

                trip = pr->u.p.tc;

                KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
                KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);

                while(1) {   /* this while loop is a safeguard against unexpected zero chunk sizes */
                    chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                    if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
                        --trip;
                        /* use dynamic-style scheduling */
                        init = chunkIdx * chunkspec + pr->u.p.count;
                        /* verify init > 0 in case of overflow in the above calculation */
                        if ( (status = (init > 0 && init <= trip)) != 0 ) {
                            limit = init + chunkspec - 1;

                            if ( (last = (limit >= trip)) != 0 )
                                limit = trip;
                        }
                        break;
                    } else {
                        /* use exponential-style scheduling */
#if KMP_OS_WINDOWS && KMP_ARCH_X86
                        /* Windows* OS on IA-32 defaults to 53-bit x87 precision; switch to
                           64-bit for the long double computation below (save the FPCW once) */
                        if ( !fpcwSet ) {
                            oldFpcw = _control87(0,0);
                            _control87(_PC_64,_MCW_PC);
                            fpcwSet = 0x30000;
                        }
#endif
                        if ( chunkIdx ) {
                            init = __kmp_dispatch_guided_remaining< T >(
                                       trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
                            KMP_DEBUG_ASSERT(init);
                            init = trip - init;
                        } else
                            init = 0;
                        limit = trip - __kmp_dispatch_guided_remaining< T >(
                                           trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
                        KMP_ASSERT(init <= limit);
                        if ( init < limit ) {
                            KMP_DEBUG_ASSERT(limit <= trip);
                            --limit;
                            status = 1;
                            break;
                        }
                    }
                } // while (1)
#if KMP_OS_WINDOWS && KMP_ARCH_X86
                /* restore the FPCW if we changed it (check fpcwSet first:
                   oldFpcw may be uninitialized here) */
                if ( fpcwSet && ( oldFpcw & fpcwSet ) )
                    _control87(oldFpcw,_MCW_PC);
#endif
                if ( status != 0 ) {
                    start = pr->u.p.lb;
                    incr  = pr->u.p.st;
                    if ( p_st != NULL )
                        *p_st = incr;
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                        {
                            const char * buff;
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
#endif
                    }
                }
                break;
            } // case
        case kmp_sch_trapezoidal:
            {
                UT index;
                T parm2 = pr->u.p.parm2;
                T parm3 = pr->u.p.parm3;
                T parm4 = pr->u.p.parm4;
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
                               gtid ) );

                index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );

                init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
                trip = pr->u.p.tc - 1;

                if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
                    if ( p_st != NULL ) *p_st = 0;
                } else {
                    start = pr->u.p.lb;
                    limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
                    incr  = pr->u.p.st;

                    if ( (last = (limit >= trip)) != 0 )
                        limit = trip;

                    if ( p_st != NULL ) *p_st = incr;

                    if ( incr == 1 ) {
                        *p_lb = start + init;
                        *p_ub = start + limit;
                    } else {
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                    }

                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                        {
                            const char * buff;
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
#endif
                    }
                }
                break;
            } // case
        default:
            {
                status = 0;   // avoid complaints about uninitialized variable use
                __kmp_msg(
                    kmp_ms_fatal,                        // Severity
                    KMP_MSG( UnknownSchedTypeDetected ), // Primary message
                    KMP_HNT( GetNewerLibrary ),          // Hint
                    __kmp_msg_null                       // Variadic argument list terminator
                );
            }
            break;
        } // switch
        } // if (pr->u.p.tc != 0)

        if ( status == 0 ) {
            UT num_done;

            num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
#ifdef KMP_DEBUG
            {
                const char * buff;
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
                    traits_t< UT >::spec );
                KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
                __kmp_str_free( &buff );
            }
#endif

            if ( (ST)num_done == team->t.t_nproc-1 ) {
                /* NOTE: last thread done: release the buffer to be reused */
                sh->u.s.num_done = 0;
                sh->u.s.iteration = 0;

                if ( pr->ordered ) {
                    sh->u.s.ordered_iteration = 0;
                }

                sh -> buffer_index += KMP_MAX_DISP_BUF;
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                               gtid, sh->buffer_index) );
            }

            if ( __kmp_env_consistency_check ) {
                if ( pr->pushed_ws != ct_none ) {
                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                }
            }

            th -> th.th_dispatch -> th_deo_fcn = NULL;
            th -> th.th_dispatch -> th_dxo_fcn = NULL;
            th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
            th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
        } // if (status == 0)
        else if ( last ) {
            pr->u.p.last_upper = pr->u.p.ub;
        }
        if ( p_last != NULL && status != 0 )
            *p_last = last;
    } // if

#ifdef KMP_DEBUG
    {
        const char * buff;
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d normal case: " \
            "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
        __kmp_str_free( &buff );
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    return status;
}
template< typename T >
static void
__kmp_dist_get_bounds(
    ident_t                          *loc,
    kmp_int32                         gtid,
    kmp_int32                        *plastiter,
    T                                *plower,
    T                                *pupper,
    typename traits_t< T >::signed_t  incr
) {
    typedef typename traits_t< T >::unsigned_t UT;
    typedef typename traits_t< T >::signed_t   ST;
    register kmp_uint32  team_id;
    register kmp_uint32  nteams;
    register UT          trip_count;
    register kmp_team_t *team;
    kmp_info_t          *th;

    KMP_DEBUG_ASSERT( plastiter && plower && pupper );
    KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
            traits_t< T >::spec );
        KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
        __kmp_str_free( &buff );
    }
#endif
    if( __kmp_env_consistency_check ) {
        if( incr == 0 ) {
            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
        }
        if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
            // the loop is illegal; the compile-time or run-time zero-trip checks missed it
            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
        }
    }
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
    team = th->th.th_team;
    nteams = th->th.th_teams_size.nteams;
    team_id = team->t.t_master_tid;
    KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);

    // compute the global trip count
    if( incr == 1 ) {
        trip_count = *pupper - *plower + 1;
    } else if(incr == -1) {
        trip_count = *plower - *pupper + 1;
    } else {
        trip_count = (ST)(*pupper - *plower) / incr + 1;   // cast to signed to cover incr < 0
    }
    if( trip_count <= nteams ) {
        KMP_DEBUG_ASSERT(
            __kmp_static == kmp_sch_static_greedy || \
            __kmp_static == kmp_sch_static_balanced
        );   // unknown static scheduling type
        // only some teams get a single iteration, the rest get nothing
        if( team_id < trip_count ) {
            *pupper = *plower = *plower + team_id * incr;
        } else {
            *plower = *pupper + incr;   // zero-trip loop for this team
        }
        if( plastiter != NULL )
            *plastiter = ( team_id == trip_count - 1 );
    } else {
        if( __kmp_static == kmp_sch_static_balanced ) {
            register UT chunk  = trip_count / nteams;
            register UT extras = trip_count % nteams;
            *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
            *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
            if( plastiter != NULL )
                *plastiter = ( team_id == nteams - 1 );
        } else {
            register T chunk_inc_count =
                ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
            register T upper = *pupper;
            KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
            *plower += team_id * chunk_inc_count;
            *pupper = *plower + chunk_inc_count - incr;
            // check and correct the bounds if needed
            if( incr > 0 ) {
                if( *pupper < *plower )
                    *pupper = i_maxmin< T >::mx;   // the computed bound overflowed
                if( plastiter != NULL )
                    *plastiter = *plower <= upper && *pupper > upper - incr;
                if( *pupper > upper )
                    *pupper = upper;   // clamp back to the user upper bound
            } else {
                if( *pupper > *plower )
                    *pupper = i_maxmin< T >::mn;   // the computed bound underflowed
                if( plastiter != NULL )
                    *plastiter = *plower >= upper && *pupper < upper - incr;
                if( *pupper < upper )
                    *pupper = upper;   // clamp back to the user upper bound
            }
        }
    }
}
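/*
 * Illustrative sketch only (added for exposition, not part of the original
 * runtime source): the effect of the static_balanced branch above, computed
 * outside the runtime for stride +1.  Team `team_id` of `nteams` gets a
 * near-equal contiguous slice of [lower, upper]:
 */
#if 0
#include <cstdio>
static void __example_team_bounds( long long lower, long long upper,
                                   unsigned team_id, unsigned nteams )
{
    unsigned long long trip   = (unsigned long long)( upper - lower ) + 1;
    unsigned long long chunk  = trip / nteams;
    unsigned long long extras = trip % nteams;
    long long lo = lower + (long long)( team_id * chunk + ( team_id < extras ? team_id : extras ) );
    long long hi = lo + (long long)chunk - ( team_id < extras ? 0 : 1 );
    printf( "team %u: [%lld, %lld]\n", team_id, lo, hi );
}
#endif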
// __kmpc_dispatch_init_{4,4u,8,8u}: public entry points forwarding to the
// templated __kmp_dispatch_init() above.
void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int64 lb, kmp_int64 ub,
                        kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint64 lb, kmp_uint64 ub,
                         kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

// The dist_dispatch variants additionally compute the per-team bounds of a
// distribute parallel for loop before initializing dispatch.
void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                             kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                              kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                             kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                              kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
// __kmpc_dispatch_next_{4,4u,8,8u}: return nonzero while there is another chunk to execute.
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

int
__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}
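/*
 * Illustrative sketch only (added for exposition, not from this file): the
 * calling convention the wrappers above implement.  A compiler-generated
 * dynamically scheduled loop typically initializes dispatch once and then
 * pulls chunks until __kmpc_dispatch_next_4() returns 0.  The driver function
 * and its name are hypothetical:
 */
#if 0
static void __example_driver( ident_t * loc, kmp_int32 gtid,
                              kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    kmp_int32 my_lb, my_ub, my_st, last;
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, chunk );
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &my_lb, &my_ub, &my_st ) ) {
        for ( kmp_int32 i = my_lb; i <= my_ub; i += my_st ) {
            /* loop body for iteration i */
        }
    }
}
#endif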
void
__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

void
__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}

void
__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

void
__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}
kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
    return value <= checker;
}

kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
    return value <= checker;
}
kmp_uint32
__kmp_wait_yield_4( volatile kmp_uint32 * spinner,
                    kmp_uint32            checker,
                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
                    USE_ITT_BUILD_ARG( void * obj )
                    )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32 * spin  = spinner;
    register kmp_uint32            check = checker;
    register kmp_uint32            spins;
    register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register kmp_uint32            r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        // if we are oversubscribed, or have waited a bit, yield
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
kmp_uint64
__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
                    kmp_uint64            checker,
                    kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
                    USE_ITT_BUILD_ARG( void * obj )
                    )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint64 * spin  = spinner;
    register kmp_uint64            check = checker;
    register kmp_uint32            spins;
    register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
    register kmp_uint64            r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
#ifdef KMP_GOMP_COMPAT

void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */