Intel® OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 /* <copyright>
6  Copyright (c) 1997-2015 Intel Corporation. All Rights Reserved.
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11 
12  * Redistributions of source code must retain the above copyright
13  notice, this list of conditions and the following disclaimer.
14  * Redistributions in binary form must reproduce the above copyright
15  notice, this list of conditions and the following disclaimer in the
16  documentation and/or other materials provided with the distribution.
17  * Neither the name of Intel Corporation nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 </copyright> */
34 
35 /*
36  * Dynamic scheduling initialization and dispatch.
37  *
38  * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
39  * it may change value between parallel regions. __kmp_max_nth
40  * is the largest value __kmp_nth may take; 1 is the smallest.
41  *
42  */
43 
44 /* ------------------------------------------------------------------------ */
45 /* ------------------------------------------------------------------------ */
46 
47 #include "kmp.h"
48 #include "kmp_i18n.h"
49 #include "kmp_itt.h"
50 #include "kmp_str.h"
51 #include "kmp_error.h"
52 #include "kmp_stats.h"
53 #if KMP_OS_WINDOWS && KMP_ARCH_X86
54  #include <float.h>
55 #endif
56 
57 /* ------------------------------------------------------------------------ */
58 /* ------------------------------------------------------------------------ */
59 
60 // template for type limits
61 template< typename T >
62 struct i_maxmin {
63  static const T mx;
64  static const T mn;
65 };
66 template<>
67 struct i_maxmin< int > {
68  static const int mx = 0x7fffffff;
69  static const int mn = 0x80000000;
70 };
71 template<>
72 struct i_maxmin< unsigned int > {
73  static const unsigned int mx = 0xffffffff;
74  static const unsigned int mn = 0x00000000;
75 };
76 template<>
77 struct i_maxmin< long long > {
78  static const long long mx = 0x7fffffffffffffffLL;
79  static const long long mn = 0x8000000000000000LL;
80 };
81 template<>
82 struct i_maxmin< unsigned long long > {
83  static const unsigned long long mx = 0xffffffffffffffffLL;
84  static const unsigned long long mn = 0x0000000000000000LL;
85 };
86 //-------------------------------------------------------------------------
87 
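// ---------------------------------------------------------------------------
// Editorial sketch, not part of the original source: the hand-written limits
// above can be cross-checked against the standard library, assuming a C++11
// compiler is available (the runtime keeps its own table, presumably for the
// sake of older compilers).
#include <limits>
static_assert( i_maxmin< int >::mx == std::numeric_limits< int >::max(),
               "i_maxmin<int>::mx should equal INT_MAX" );
static_assert( i_maxmin< unsigned long long >::mx ==
               std::numeric_limits< unsigned long long >::max(),
               "i_maxmin<unsigned long long>::mx should equal ULLONG_MAX" );
// ---------------------------------------------------------------------------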
88 #ifdef KMP_STATIC_STEAL_ENABLED
89 
90  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
91  template< typename T >
92  struct dispatch_private_infoXX_template {
93  typedef typename traits_t< T >::unsigned_t UT;
94  typedef typename traits_t< T >::signed_t ST;
95  UT count; // unsigned
96  T ub;
97  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
98  T lb;
99  ST st; // signed
100  UT tc; // unsigned
101  T static_steal_counter; // for static_steal only; maybe better to put after ub
102 
103  /* parm[1-4] are used in different ways by different scheduling algorithms */
104 
105  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
106  // a) parm3 is properly aligned and
107  // b) all parm1-4 are in the same cache line.
108  // Because parm1-4 are used together, performance seems to be better
109  // if they are in the same cache line (not measured, though).
110 
111  struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
112  T parm1;
113  T parm2;
114  T parm3;
115  T parm4;
116  };
117 
118  UT ordered_lower; // unsigned
119  UT ordered_upper; // unsigned
120  #if KMP_OS_WINDOWS
121  T last_upper;
122  #endif /* KMP_OS_WINDOWS */
123  };
124 
125 #else /* KMP_STATIC_STEAL_ENABLED */
126 
127  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
128  template< typename T >
129  struct dispatch_private_infoXX_template {
130  typedef typename traits_t< T >::unsigned_t UT;
131  typedef typename traits_t< T >::signed_t ST;
132  T lb;
133  T ub;
134  ST st; // signed
135  UT tc; // unsigned
136 
137  T parm1;
138  T parm2;
139  T parm3;
140  T parm4;
141 
142  UT count; // unsigned
143 
144  UT ordered_lower; // unsigned
145  UT ordered_upper; // unsigned
146  #if KMP_OS_WINDOWS
147  T last_upper;
148  #endif /* KMP_OS_WINDOWS */
149  };
150 
151 #endif /* KMP_STATIC_STEAL_ENABLED */
152 
153 // replaces dispatch_private_info structure and dispatch_private_info_t type
154 template< typename T >
155 struct KMP_ALIGN_CACHE dispatch_private_info_template {
156  // duplicate the alignment here; otherwise the structure size is not computed correctly by our compiler
157  union KMP_ALIGN_CACHE private_info_tmpl {
158  dispatch_private_infoXX_template< T > p;
159  dispatch_private_info64_t p64;
160  } u;
161  enum sched_type schedule; /* scheduling algorithm */
162  kmp_uint32 ordered; /* ordered clause specified */
163  kmp_uint32 ordered_bumped;
164  kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
165  dispatch_private_info * next; /* stack of buffers for nest of serial regions */
166  kmp_uint32 nomerge; /* don't merge iters if serialized */
167  kmp_uint32 type_size;
168  enum cons_type pushed_ws;
169 };
170 
171 
172 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
173 template< typename UT >
174 struct dispatch_shared_infoXX_template {
175  /* chunk index under dynamic, number of idle threads under static-steal;
176  iteration index otherwise */
177  volatile UT iteration;
178  volatile UT num_done;
179  volatile UT ordered_iteration;
180  UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
181 };
182 
183 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
184 template< typename UT >
185 struct dispatch_shared_info_template {
186  // we need union here to keep the structure size
187  union shared_info_tmpl {
188  dispatch_shared_infoXX_template< UT > s;
189  dispatch_shared_info64_t s64;
190  } u;
191  volatile kmp_uint32 buffer_index;
192 };
193 
194 /* ------------------------------------------------------------------------ */
195 /* ------------------------------------------------------------------------ */
196 
197 #undef USE_TEST_LOCKS
198 
199 // test_then_add template (general template should NOT be used)
200 template< typename T >
201 static __forceinline T
202 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
203 
204 template<>
205 __forceinline kmp_int32
206 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
207 {
208  kmp_int32 r;
209  r = KMP_TEST_THEN_ADD32( p, d );
210  return r;
211 }
212 
213 template<>
214 __forceinline kmp_int64
215 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
216 {
217  kmp_int64 r;
218  r = KMP_TEST_THEN_ADD64( p, d );
219  return r;
220 }
221 
222 // test_then_inc_acq template (general template should NOT be used)
223 template< typename T >
224 static __forceinline T
225 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
226 
227 template<>
228 __forceinline kmp_int32
229 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
230 {
231  kmp_int32 r;
232  r = KMP_TEST_THEN_INC_ACQ32( p );
233  return r;
234 }
235 
236 template<>
237 __forceinline kmp_int64
238 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
239 {
240  kmp_int64 r;
241  r = KMP_TEST_THEN_INC_ACQ64( p );
242  return r;
243 }
244 
245 // test_then_inc template (general template should NOT be used)
246 template< typename T >
247 static __forceinline T
248 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
249 
250 template<>
251 __forceinline kmp_int32
252 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
253 {
254  kmp_int32 r;
255  r = KMP_TEST_THEN_INC32( p );
256  return r;
257 }
258 
259 template<>
260 __forceinline kmp_int64
261 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
262 {
263  kmp_int64 r;
264  r = KMP_TEST_THEN_INC64( p );
265  return r;
266 }
267 
268 // compare_and_swap template (general template should NOT be used)
269 template< typename T >
270 static __forceinline kmp_int32
271 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
272 
273 template<>
274 __forceinline kmp_int32
275 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
276 {
277  return KMP_COMPARE_AND_STORE_REL32( p, c, s );
278 }
279 
280 template<>
281 __forceinline kmp_int32
282 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
283 {
284  return KMP_COMPARE_AND_STORE_REL64( p, c, s );
285 }
286 
287 /*
288  Spin wait loop that first does pause, then yield.
289  Waits until function returns non-zero when called with *spinner and check.
290  Does NOT put threads to sleep.
291 #if USE_ITT_BUILD
292  Arguments:
293  obj -- the higher-level synchronization object to report to ittnotify. It is used to report
294  locks consistently. For example, if the lock is acquired immediately, its address is
295  reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
296  immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
297  address, not the address of the low-level spinner.
298 #endif // USE_ITT_BUILD
299 */
300 template< typename UT >
301 // ToDo: make inline function (move to header file for icl)
302 static UT // unsigned 4- or 8-byte type
303 __kmp_wait_yield( volatile UT * spinner,
304  UT checker,
305  kmp_uint32 (* pred)( UT, UT )
306  USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
307  )
308 {
309  // note: we may not belong to a team at this point
310  register volatile UT * spin = spinner;
311  register UT check = checker;
312  register kmp_uint32 spins;
313  register kmp_uint32 (*f) ( UT, UT ) = pred;
314  register UT r;
315 
316  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
317  KMP_INIT_YIELD( spins );
318  // main wait spin loop
319  while(!f(r = *spin, check))
320  {
321  KMP_FSYNC_SPIN_PREPARE( obj );
322  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
323  It causes problems with infinite recursion because of exit lock */
324  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
325  __kmp_abort_thread(); */
326 
327  // if we are oversubscribed,
328  // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
329  // the pause is in the following code
330  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
331  KMP_YIELD_SPIN( spins );
332  }
333  KMP_FSYNC_SPIN_ACQUIRED( obj );
334  return r;
335 }
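/* Editorial sketch, not part of the original source: the same "pause first,
   then yield" pattern expressed with plain C++11 primitives, independent of
   the KMP_* macros used above. The function name and the 4096-spin threshold
   are illustrative assumptions, not values taken from the runtime. */
#include <atomic>
#include <thread>
template< typename UT >
static UT
spin_wait_sketch( const std::atomic< UT > * spinner, UT checker,
                  bool (* pred)( UT, UT ) )
{
    UT r;
    unsigned spins = 0;
    while ( ! pred( r = spinner->load( std::memory_order_acquire ), checker ) ) {
        if ( ++spins >= 4096 ) {        // spun long enough: give the CPU away
            std::this_thread::yield();
            spins = 0;
        }                               // otherwise just re-read (a PAUSE would go here)
    }
    return r;
}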
336 
337 template< typename UT >
338 static kmp_uint32 __kmp_eq( UT value, UT checker) {
339  return value == checker;
340 }
341 
342 template< typename UT >
343 static kmp_uint32 __kmp_neq( UT value, UT checker) {
344  return value != checker;
345 }
346 
347 template< typename UT >
348 static kmp_uint32 __kmp_lt( UT value, UT checker) {
349  return value < checker;
350 }
351 
352 template< typename UT >
353 static kmp_uint32 __kmp_ge( UT value, UT checker) {
354  return value >= checker;
355 }
356 
357 template< typename UT >
358 static kmp_uint32 __kmp_le( UT value, UT checker) {
359  return value <= checker;
360 }
361 
362 
363 /* ------------------------------------------------------------------------ */
364 /* ------------------------------------------------------------------------ */
365 
366 static void
367 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
368 {
369  kmp_info_t *th;
370 
371  KMP_DEBUG_ASSERT( gtid_ref );
372 
373  if ( __kmp_env_consistency_check ) {
374  th = __kmp_threads[*gtid_ref];
375  if ( th -> th.th_root -> r.r_active
376  && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
377 #if KMP_USE_DYNAMIC_LOCK
378  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
379 #else
380  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
381 #endif
382  }
383  }
384 }
385 
386 template< typename UT >
387 static void
388 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
389 {
390  typedef typename traits_t< UT >::signed_t ST;
391  dispatch_private_info_template< UT > * pr;
392 
393  int gtid = *gtid_ref;
394 // int cid = *cid_ref;
395  kmp_info_t *th = __kmp_threads[ gtid ];
396  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
397 
398  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
399  if ( __kmp_env_consistency_check ) {
400  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
401  ( th -> th.th_dispatch -> th_dispatch_pr_current );
402  if ( pr -> pushed_ws != ct_none ) {
403 #if KMP_USE_DYNAMIC_LOCK
404  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
405 #else
406  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
407 #endif
408  }
409  }
410 
411  if ( ! th -> th.th_team -> t.t_serialized ) {
412  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
413  ( th -> th.th_dispatch -> th_dispatch_sh_current );
414  UT lower;
415 
416  if ( ! __kmp_env_consistency_check ) {
417  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
418  ( th -> th.th_dispatch -> th_dispatch_pr_current );
419  }
420  lower = pr->u.p.ordered_lower;
421 
422  #if ! defined( KMP_GOMP_COMPAT )
423  if ( __kmp_env_consistency_check ) {
424  if ( pr->ordered_bumped ) {
425  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
426  __kmp_error_construct2(
427  kmp_i18n_msg_CnsMultipleNesting,
428  ct_ordered_in_pdo, loc_ref,
429  & p->stack_data[ p->w_top ]
430  );
431  }
432  }
433  #endif /* !defined(KMP_GOMP_COMPAT) */
434 
435  KMP_MB();
436  #ifdef KMP_DEBUG
437  {
438  const char * buff;
439  // create format specifiers before the debug output
440  buff = __kmp_str_format(
441  "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
442  traits_t< UT >::spec, traits_t< UT >::spec );
443  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
444  __kmp_str_free( &buff );
445  }
446  #endif
447 
448  __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
449  USE_ITT_BUILD_ARG( NULL )
450  );
451  KMP_MB(); /* is this necessary? */
452  #ifdef KMP_DEBUG
453  {
454  const char * buff;
455  // create format specifiers before the debug output
456  buff = __kmp_str_format(
457  "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
458  traits_t< UT >::spec, traits_t< UT >::spec );
459  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
460  __kmp_str_free( &buff );
461  }
462  #endif
463  }
464  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
465 }
466 
467 static void
468 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469 {
470  kmp_info_t *th;
471 
472  if ( __kmp_env_consistency_check ) {
473  th = __kmp_threads[*gtid_ref];
474  if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
475  __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
476  }
477  }
478 }
479 
480 template< typename UT >
481 static void
482 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
483 {
484  typedef typename traits_t< UT >::signed_t ST;
485  dispatch_private_info_template< UT > * pr;
486 
487  int gtid = *gtid_ref;
488 // int cid = *cid_ref;
489  kmp_info_t *th = __kmp_threads[ gtid ];
490  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
491 
492  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
493  if ( __kmp_env_consistency_check ) {
494  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
495  ( th -> th.th_dispatch -> th_dispatch_pr_current );
496  if ( pr -> pushed_ws != ct_none ) {
497  __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
498  }
499  }
500 
501  if ( ! th -> th.th_team -> t.t_serialized ) {
502  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
503  ( th -> th.th_dispatch -> th_dispatch_sh_current );
504 
505  if ( ! __kmp_env_consistency_check ) {
506  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
507  ( th -> th.th_dispatch -> th_dispatch_pr_current );
508  }
509 
510  KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
511  #if ! defined( KMP_GOMP_COMPAT )
512  if ( __kmp_env_consistency_check ) {
513  if ( pr->ordered_bumped != 0 ) {
514  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
515  /* How to test it? - OM */
516  __kmp_error_construct2(
517  kmp_i18n_msg_CnsMultipleNesting,
518  ct_ordered_in_pdo, loc_ref,
519  & p->stack_data[ p->w_top ]
520  );
521  }
522  }
523  #endif /* !defined(KMP_GOMP_COMPAT) */
524 
525  KMP_MB(); /* Flush all pending memory write invalidates. */
526 
527  pr->ordered_bumped += 1;
528 
529  KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
530  gtid, pr->ordered_bumped ) );
531 
532  KMP_MB(); /* Flush all pending memory write invalidates. */
533 
534  /* TODO use general release procedure? */
535  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
536 
537  KMP_MB(); /* Flush all pending memory write invalidates. */
538  }
539  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
540 }
541 
542 /* Computes and returns x to the power of y, where y must be a non-negative integer */
543 template< typename UT >
544 static __forceinline long double
545 __kmp_pow(long double x, UT y) {
546  long double s=1.0L;
547 
548  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
549  //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
550  while(y) {
551  if ( y & 1 )
552  s *= x;
553  x *= x;
554  y >>= 1;
555  }
556  return s;
557 }
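/* Editorial worked example, not part of the original source:
   __kmp_pow walks the bits of y (exponentiation by squaring). For x = 0.5 and
   y = 5 (binary 101) the loop multiplies s by x (bit 0) and by x*x*x*x (bit 2):
       y = 5: s = 0.5,     x -> 0.25
       y = 2:              x -> 0.0625
       y = 1: s = 0.03125, x -> 0.00390625
   giving 0.5^5 = 0.03125 in O(log y) multiplications instead of O(y). */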
558 
559 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
560  (the total number of unassigned iterations in chunks with index greater than or equal to idx).
561  __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
562  (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
563 */
564 template< typename T >
565 static __inline typename traits_t< T >::unsigned_t
566 __kmp_dispatch_guided_remaining(
567  T tc,
568  typename traits_t< T >::floating_t base,
569  typename traits_t< T >::unsigned_t idx
570 ) {
571  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
572  least for ICL 8.1, long double arithmetic may not really have
573  long double precision, even with /Qlong_double. Currently, we
574  workaround that in the caller code, by manipulating the FPCW for
575  Windows* OS on IA-32 architecture. The lack of precision is not
576  expected to be a correctness issue, though.
577  */
578  typedef typename traits_t< T >::unsigned_t UT;
579 
580  long double x = tc * __kmp_pow< UT >(base, idx);
581  UT r = (UT) x;
582  if ( x == r )
583  return r;
584  return r + 1;
585 }
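/* Editorial worked example, not part of the original source: the value
   computed above is ceil( tc * base^idx ), i.e. the iterations still
   unassigned once idx guided chunks have been handed out. For example, with
   tc = 1000, base = 0.875 (nproc = 4, so 1 - 0.5/4) and idx = 8:
   1000 * 0.875^8 = 343.6..., so 344 iterations remain. */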
586 
587 // Parameters of the guided-iterative algorithm:
588 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
589 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
590 // by default n = 2. For example, with n = 3 the chunk distribution will be flatter.
591 // With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
592 static int guided_int_param = 2;
593 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
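/* Editorial sketch, not part of the original source: a rough illustration of
   how the two parameters above shape chunk sizes in the guided-iterative
   scheme. Each grab takes roughly a p3 fraction of the remaining iterations
   (never less than the requested chunk) until fewer than p2 iterations are
   left, after which plain dynamic chunking takes over. The function name and
   the exact loop below are illustrative assumptions, not the dispatcher's
   actual code path. */
static void
guided_chunks_sketch( unsigned long long tc, unsigned nproc, unsigned long long chunk )
{
    unsigned long long p2 = (unsigned long long)guided_int_param * nproc * ( chunk + 1 );
    double             p3 = guided_flt_param / nproc;
    unsigned long long remaining = tc;
    while ( remaining > p2 ) {
        unsigned long long next = (unsigned long long)( remaining * p3 );
        if ( next < chunk )
            next = chunk;                          // never hand out less than 'chunk'
        remaining -= ( next < remaining ? next : remaining );
    }
    // remaining <= p2: fall back to fixed chunks of size 'chunk'
}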
594 
595 // UT - unsigned flavor of T, ST - signed flavor of T,
596 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
597 template< typename T >
598 static void
599 __kmp_dispatch_init(
600  ident_t * loc,
601  int gtid,
602  enum sched_type schedule,
603  T lb,
604  T ub,
605  typename traits_t< T >::signed_t st,
606  typename traits_t< T >::signed_t chunk,
607  int push_ws
608 ) {
609  typedef typename traits_t< T >::unsigned_t UT;
610  typedef typename traits_t< T >::signed_t ST;
611  typedef typename traits_t< T >::floating_t DBL;
612  static const int ___kmp_size_type = sizeof( UT );
613 
614  int active;
615  T tc;
616  kmp_info_t * th;
617  kmp_team_t * team;
618  kmp_uint32 my_buffer_index;
619  dispatch_private_info_template< T > * pr;
620  dispatch_shared_info_template< UT > volatile * sh;
621 
622  KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
623  KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
624 
625  if ( ! TCR_4( __kmp_init_parallel ) )
626  __kmp_parallel_initialize();
627 
628 #if INCLUDE_SSC_MARKS
629  SSC_MARK_DISPATCH_INIT();
630 #endif
631  #ifdef KMP_DEBUG
632  {
633  const char * buff;
634  // create format specifiers before the debug output
635  buff = __kmp_str_format(
636  "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
637  traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
638  KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
639  __kmp_str_free( &buff );
640  }
641  #endif
642  /* setup data */
643  th = __kmp_threads[ gtid ];
644  team = th -> th.th_team;
645  active = ! team -> t.t_serialized;
646  th->th.th_ident = loc;
647 
648 #if USE_ITT_BUILD
649  kmp_uint64 cur_chunk = chunk;
650  int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
651  KMP_MASTER_GTID(gtid) &&
652 #if OMP_40_ENABLED
653  th->th.th_teams_microtask == NULL &&
654 #endif
655  team->t.t_active_level == 1;
656 #endif
657  if ( ! active ) {
658  pr = reinterpret_cast< dispatch_private_info_template< T >* >
659  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
660  } else {
661  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
662  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
663 
664  my_buffer_index = th->th.th_dispatch->th_disp_index ++;
665 
666  /* What happens when the number of threads changes? Do we need to resize the buffer? */
667  pr = reinterpret_cast< dispatch_private_info_template< T > * >
668  ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
669  sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
670  ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
671  }
672 
673  /* Pick up the nomerge/ordered bits from the scheduling type */
674  if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
675  pr->nomerge = TRUE;
676  schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
677  } else {
678  pr->nomerge = FALSE;
679  }
680  pr->type_size = ___kmp_size_type; // remember the size of variables
681  if ( kmp_ord_lower & schedule ) {
682  pr->ordered = TRUE;
683  schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
684  } else {
685  pr->ordered = FALSE;
686  }
687  if ( schedule == kmp_sch_static ) {
688  schedule = __kmp_static;
689  } else {
690  if ( schedule == kmp_sch_runtime ) {
691  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
692  schedule = team -> t.t_sched.r_sched_type;
693  // Detail the schedule if needed (global controls are differentiated appropriately)
694  if ( schedule == kmp_sch_guided_chunked ) {
695  schedule = __kmp_guided;
696  } else if ( schedule == kmp_sch_static ) {
697  schedule = __kmp_static;
698  }
699  // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
700  chunk = team -> t.t_sched.chunk;
701 
702  #ifdef KMP_DEBUG
703  {
704  const char * buff;
705  // create format specifiers before the debug output
706  buff = __kmp_str_format(
707  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
708  traits_t< ST >::spec );
709  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
710  __kmp_str_free( &buff );
711  }
712  #endif
713  } else {
714  if ( schedule == kmp_sch_guided_chunked ) {
715  schedule = __kmp_guided;
716  }
717  if ( chunk <= 0 ) {
718  chunk = KMP_DEFAULT_CHUNK;
719  }
720  }
721 
722  if ( schedule == kmp_sch_auto ) {
723  // mapping and differentiation are done in __kmp_do_serial_initialize()
724  schedule = __kmp_auto;
725  #ifdef KMP_DEBUG
726  {
727  const char * buff;
728  // create format specifiers before the debug output
729  buff = __kmp_str_format(
730  "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
731  traits_t< ST >::spec );
732  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
733  __kmp_str_free( &buff );
734  }
735  #endif
736  }
737 
738  /* guided analytical not safe for too many threads */
739  if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
740  schedule = kmp_sch_guided_iterative_chunked;
741  KMP_WARNING( DispatchManyThreads );
742  }
743  pr->u.p.parm1 = chunk;
744  }
745  KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
746  "unknown scheduling type" );
747 
748  pr->u.p.count = 0;
749 
750  if ( __kmp_env_consistency_check ) {
751  if ( st == 0 ) {
752  __kmp_error_construct(
753  kmp_i18n_msg_CnsLoopIncrZeroProhibited,
754  ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
755  );
756  }
757  }
758 
759  tc = ( ub - lb + st );
760  if ( st != 1 ) {
761  if ( st < 0 ) {
762  if ( lb < ub ) {
763  tc = 0; // zero-trip
764  } else { // lb >= ub
765  tc = (ST)tc / st; // convert to signed division
766  }
767  } else { // st > 0
768  if ( ub < lb ) {
769  tc = 0; // zero-trip
770  } else { // ub >= lb
771  tc /= st;
772  }
773  }
774  } else if ( ub < lb ) { // st == 1
775  tc = 0; // zero-trip
776  }
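 /* Editorial worked examples, not part of the original source:
      lb = 0,  ub = 9, st = 2  ->  tc = (9 - 0 + 2) / 2    = 5  iterations (0,2,4,6,8)
      lb = 10, ub = 1, st = -3 ->  tc = (1 - 10 - 3) / -3  = 4  iterations (10,7,4,1)
      lb = 0,  ub = 9, st = 1  ->  tc = 9 - 0 + 1          = 10 iterations
    and tc = 0 whenever the bounds and the stride disagree (zero-trip). */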
777 
778  pr->u.p.lb = lb;
779  pr->u.p.ub = ub;
780  pr->u.p.st = st;
781  pr->u.p.tc = tc;
782 
783  #if KMP_OS_WINDOWS
784  pr->u.p.last_upper = ub + st;
785  #endif /* KMP_OS_WINDOWS */
786 
787  /* NOTE: only the active parallel region(s) have active ordered sections */
788 
789  if ( active ) {
790  if ( pr->ordered == 0 ) {
791  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
792  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
793  } else {
794  pr->ordered_bumped = 0;
795 
796  pr->u.p.ordered_lower = 1;
797  pr->u.p.ordered_upper = 0;
798 
799  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
800  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
801  }
802  }
803 
804  if ( __kmp_env_consistency_check ) {
805  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
806  if ( push_ws ) {
807  __kmp_push_workshare( gtid, ws, loc );
808  pr->pushed_ws = ws;
809  } else {
810  __kmp_check_workshare( gtid, ws, loc );
811  pr->pushed_ws = ct_none;
812  }
813  }
814 
815  switch ( schedule ) {
816  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
817  case kmp_sch_static_steal:
818  {
819  T nproc = team->t.t_nproc;
820  T ntc, init;
821 
822  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
823 
824  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
825  if ( nproc > 1 && ntc >= nproc ) {
826  T id = __kmp_tid_from_gtid(gtid);
827  T small_chunk, extras;
828 
829  small_chunk = ntc / nproc;
830  extras = ntc % nproc;
831 
832  init = id * small_chunk + ( id < extras ? id : extras );
833  pr->u.p.count = init;
834  pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
835 
836  pr->u.p.parm2 = lb;
837  //pr->pfields.parm3 = 0; // it's not used in static_steal
838  pr->u.p.parm4 = id;
839  pr->u.p.st = st;
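 /* Editorial worked example, not part of the original source:
    tc = 103, chunk = 10, nproc = 4  ->  ntc = 11 chunks, small_chunk = 2,
    extras = 3. Thread ids 0..3 start with chunk ranges [0,3), [3,6), [6,9),
    [9,11): the first 'extras' threads own one extra chunk, and 'ub' here is
    the (exclusive) upper chunk index that a thief may later shrink. */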
840  break;
841  } else {
842  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
843  gtid ) );
844  schedule = kmp_sch_static_balanced;
845  /* too few iterations: fall-through to kmp_sch_static_balanced */
846  } // if
847  /* FALL-THROUGH to static balanced */
848  } // case
849  #endif
850  case kmp_sch_static_balanced:
851  {
852  T nproc = team->t.t_nproc;
853  T init, limit;
854 
855  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
856  gtid ) );
857 
858  if ( nproc > 1 ) {
859  T id = __kmp_tid_from_gtid(gtid);
860 
861  if ( tc < nproc ) {
862  if ( id < tc ) {
863  init = id;
864  limit = id;
865  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
866  } else {
867  pr->u.p.count = 1; /* means no more chunks to execute */
868  pr->u.p.parm1 = FALSE;
869  break;
870  }
871  } else {
872  T small_chunk = tc / nproc;
873  T extras = tc % nproc;
874  init = id * small_chunk + (id < extras ? id : extras);
875  limit = init + small_chunk - (id < extras ? 0 : 1);
876  pr->u.p.parm1 = (id == nproc - 1);
877  }
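 /* Editorial worked example, not part of the original source:
    tc = 10, nproc = 4  ->  small_chunk = 2, extras = 2; threads 0..3 get the
    iteration ranges [0,2], [3,5], [6,7], [8,9] (the first 'extras' threads
    take one extra iteration) and parm1 (*plastiter) is TRUE only for the
    last thread. */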
878  } else {
879  if ( tc > 0 ) {
880  init = 0;
881  limit = tc - 1;
882  pr->u.p.parm1 = TRUE;
883  } else {
884  // zero trip count
885  pr->u.p.count = 1; /* means no more chunks to execute */
886  pr->u.p.parm1 = FALSE;
887  break;
888  }
889  }
890 #if USE_ITT_BUILD
891  // Calculate chunk for metadata report
892  if ( itt_need_metadata_reporting )
893  cur_chunk = limit - init + 1;
894 #endif
895  if ( st == 1 ) {
896  pr->u.p.lb = lb + init;
897  pr->u.p.ub = lb + limit;
898  } else {
899  T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
900  pr->u.p.lb = lb + init * st;
901  // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
902  if ( st > 0 ) {
903  pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
904  } else {
905  pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
906  }
907  }
908  if ( pr->ordered ) {
909  pr->u.p.ordered_lower = init;
910  pr->u.p.ordered_upper = limit;
911  }
912  break;
913  } // case
914  case kmp_sch_guided_iterative_chunked :
915  {
916  T nproc = team->t.t_nproc;
917  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
918 
919  if ( nproc > 1 ) {
920  if ( (2L * chunk + 1 ) * nproc >= tc ) {
921  /* chunk size too large, switch to dynamic */
922  schedule = kmp_sch_dynamic_chunked;
923  } else {
924  // when the remaining iterations become less than parm2, switch to dynamic
925  pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
926  *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
927  }
928  } else {
929  KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
930  schedule = kmp_sch_static_greedy;
931  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
932  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
933  pr->u.p.parm1 = tc;
934  } // if
935  } // case
936  break;
937  case kmp_sch_guided_analytical_chunked:
938  {
939  T nproc = team->t.t_nproc;
940  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
941 
942  if ( nproc > 1 ) {
943  if ( (2L * chunk + 1 ) * nproc >= tc ) {
944  /* chunk size too large, switch to dynamic */
945  schedule = kmp_sch_dynamic_chunked;
946  } else {
947  /* commonly used term: (2 nproc - 1)/(2 nproc) */
948  DBL x;
949 
950  #if KMP_OS_WINDOWS && KMP_ARCH_X86
951  /* Linux* OS already has 64-bit computation by default for
952  long double, and on Windows* OS on Intel(R) 64,
953  /Qlong_double doesn't work. On Windows* OS
954  on IA-32 architecture, we need to set precision to
955  64-bit instead of the default 53-bit. Even though long
956  double doesn't work on Windows* OS on Intel(R) 64, the
957  resulting lack of precision is not expected to impact
958  the correctness of the algorithm, but this has not been
959  mathematically proven.
960  */
961  // save original FPCW and set precision to 64-bit, as
962  // Windows* OS on IA-32 architecture defaults to 53-bit
963  unsigned int oldFpcw = _control87(0,0);
964  _control87(_PC_64,_MCW_PC); // 0,0x30000
965  #endif
966  /* value used for comparison in solver for cross-over point */
967  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
968 
969  /* crossover point--chunk indexes equal to or greater than
970  this point switch to dynamic-style scheduling */
971  UT cross;
972 
973  /* commonly used term: (2 nproc - 1)/(2 nproc) */
974  x = (long double)1.0 - (long double)0.5 / nproc;
975 
976  #ifdef KMP_DEBUG
977  { // test natural alignment
978  struct _test_a {
979  char a;
980  union {
981  char b;
982  DBL d;
983  };
984  } t;
985  ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
986  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
987  KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
988  }
989  #endif // KMP_DEBUG
990 
991  /* save the term in thread private dispatch structure */
992  *(DBL*)&pr->u.p.parm3 = x;
993 
994  /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
995  {
996  UT left, right, mid;
997  long double p;
998 
999  /* estimate initial upper and lower bound */
1000 
1001  /* it doesn't matter what value 'right' starts with, as long as it is positive;
1002  it only affects the performance of the solver
1003  */
1004  right = 229;
1005  p = __kmp_pow< UT >(x,right);
1006  if ( p > target ) {
1007  do{
1008  p *= p;
1009  right <<= 1;
1010  } while(p>target && right < (1<<27));
1011  left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1012  } else {
1013  left = 0;
1014  }
1015 
1016  /* bisection root-finding method */
1017  while ( left + 1 < right ) {
1018  mid = (left + right) / 2;
1019  if ( __kmp_pow< UT >(x,mid) > target ) {
1020  left = mid;
1021  } else {
1022  right = mid;
1023  }
1024  } // while
1025  cross = right;
1026  }
1027  /* assert sanity of computed crossover point */
1028  KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
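 /* Editorial worked example, not part of the original source:
    nproc = 4, chunk = 7, tc = 1000  ->  x = 1 - 0.5/4 = 0.875 and
    target = (2*7 + 1) * 4 / 1000 = 0.06. Since 0.875^21 = 0.0606 > 0.06 and
    0.875^22 = 0.0530 <= 0.06, the bisection yields cross = 22: chunk indexes
    >= 22 are handed out dynamic-style. */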
1029 
1030  /* save the crossover point in thread private dispatch structure */
1031  pr->u.p.parm2 = cross;
1032 
1033  // C75803
1034  #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1035  #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1036  #else
1037  #define GUIDED_ANALYTICAL_WORKAROUND (x)
1038  #endif
1039  /* dynamic-style scheduling offset */
1040  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1041  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1042  // restore FPCW
1043  _control87(oldFpcw,_MCW_PC);
1044  #endif
1045  } // if
1046  } else {
1047  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1048  gtid ) );
1049  schedule = kmp_sch_static_greedy;
1050  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1051  pr->u.p.parm1 = tc;
1052  } // if
1053  } // case
1054  break;
1055  case kmp_sch_static_greedy:
1056  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1057  pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1058  ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1059  tc;
1060  break;
1061  case kmp_sch_static_chunked :
1062  case kmp_sch_dynamic_chunked :
1063  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1064  break;
1065  case kmp_sch_trapezoidal :
1066  {
1067  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1068 
1069  T parm1, parm2, parm3, parm4;
1070  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1071 
1072  parm1 = chunk;
1073 
1074  /* F : size of the first cycle */
1075  parm2 = ( tc / (2 * team->t.t_nproc) );
1076 
1077  if ( parm2 < 1 ) {
1078  parm2 = 1;
1079  }
1080 
1081  /* L : size of the last cycle. Make sure the last cycle
1082  * is not larger than the first cycle.
1083  */
1084  if ( parm1 < 1 ) {
1085  parm1 = 1;
1086  } else if ( parm1 > parm2 ) {
1087  parm1 = parm2;
1088  }
1089 
1090  /* N : number of cycles */
1091  parm3 = ( parm2 + parm1 );
1092  parm3 = ( 2 * tc + parm3 - 1) / parm3;
1093 
1094  if ( parm3 < 2 ) {
1095  parm3 = 2;
1096  }
1097 
1098  /* sigma : decreasing increment of the trapezoid */
1099  parm4 = ( parm3 - 1 );
1100  parm4 = ( parm2 - parm1 ) / parm4;
1101 
1102  // pointless check, because parm4 >= 0 always
1103  //if ( parm4 < 0 ) {
1104  // parm4 = 0;
1105  //}
1106 
1107  pr->u.p.parm1 = parm1;
1108  pr->u.p.parm2 = parm2;
1109  pr->u.p.parm3 = parm3;
1110  pr->u.p.parm4 = parm4;
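 /* Editorial worked example, not part of the original source:
    tc = 1000, nproc = 4, chunk = 1  ->  first cycle parm2 = 125, last cycle
    parm1 = 1, number of cycles parm3 = (2000 + 125) / 126 = 16, decrement
    parm4 = (125 - 1) / 15 = 8, so the cycle sizes run 125, 117, 109, ...
    down to 5, covering 1040 >= 1000 iterations in total. */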
1111  } // case
1112  break;
1113 
1114  default:
1115  {
1116  __kmp_msg(
1117  kmp_ms_fatal, // Severity
1118  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1119  KMP_HNT( GetNewerLibrary ), // Hint
1120  __kmp_msg_null // Variadic argument list terminator
1121  );
1122  }
1123  break;
1124  } // switch
1125  pr->schedule = schedule;
1126  if ( active ) {
1127  /* The name of this buffer should be my_buffer_index when it's free to use it */
1128 
1129  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1130  gtid, my_buffer_index, sh->buffer_index) );
1131  __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1132  USE_ITT_BUILD_ARG( NULL )
1133  );
1134  // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and my_buffer_index are
1135  // *always* 32-bit integers.
1136  KMP_MB(); /* is this necessary? */
1137  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1138  gtid, my_buffer_index, sh->buffer_index) );
1139 
1140  th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1141  th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1142 #if USE_ITT_BUILD
1143  if ( pr->ordered ) {
1144  __kmp_itt_ordered_init( gtid );
1145  }; // if
1146  // Report loop metadata
1147  if ( itt_need_metadata_reporting ) {
1148  // Only report metadata by master of active team at level 1
1149  kmp_uint64 schedtype = 0;
1150  switch ( schedule ) {
1151  case kmp_sch_static_chunked:
1152  case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1153  break;
1154  case kmp_sch_static_greedy:
1155  cur_chunk = pr->u.p.parm1;
1156  break;
1157  case kmp_sch_dynamic_chunked:
1158  schedtype = 1;
1159  break;
1160  case kmp_sch_guided_iterative_chunked:
1161  case kmp_sch_guided_analytical_chunked:
1162  schedtype = 2;
1163  break;
1164  default:
1165 // Should we put this case under "static"?
1166 // case kmp_sch_static_steal:
1167  schedtype = 3;
1168  break;
1169  }
1170  __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1171  }
1172 #endif /* USE_ITT_BUILD */
1173  }; // if
1174 
1175  #ifdef KMP_DEBUG
1176  {
1177  const char * buff;
1178  // create format specifiers before the debug output
1179  buff = __kmp_str_format(
1180  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1181  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1182  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1183  traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1184  traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1185  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1186  traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1187  KD_TRACE(10, ( buff,
1188  gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1189  pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1190  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1191  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1192  __kmp_str_free( &buff );
1193  }
1194  #endif
1195  #if ( KMP_STATIC_STEAL_ENABLED )
1196  if ( ___kmp_size_type < 8 ) {
1197  // It cannot be guaranteed that after execution of a loop with some other schedule kind
1198  // all the parm3 variables will contain the same value.
1199  // Even if all parm3 were the same, there would still be a bad case, such as using 0 and 1
1200  // rather than a program-lifetime increment.
1201  // So a dedicated variable is required; 'static_steal_counter' is used.
1202  if( schedule == kmp_sch_static_steal ) {
1203  // Other threads will inspect this variable when searching for a victim.
1204  // This is a flag showing that, from now on, other threads may steal from this thread.
1205  volatile T * p = &pr->u.p.static_steal_counter;
1206  *p = *p + 1;
1207  }
1208  }
1209  #endif // ( KMP_STATIC_STEAL_ENABLED )
1210 }
1211 
1212 /*
1213  * For ordered loops, either __kmp_dispatch_finish() should be called after
1214  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1215  * every chunk of iterations. If the ordered section(s) were not executed
1216  * for this iteration (or every iteration in this chunk), we need to set the
1217  * ordered iteration counters so that the next thread can proceed.
1218  */
1219 template< typename UT >
1220 static void
1221 __kmp_dispatch_finish( int gtid, ident_t *loc )
1222 {
1223  typedef typename traits_t< UT >::signed_t ST;
1224  kmp_info_t *th = __kmp_threads[ gtid ];
1225 
1226  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1227  if ( ! th -> th.th_team -> t.t_serialized ) {
1228 
1229  dispatch_private_info_template< UT > * pr =
1230  reinterpret_cast< dispatch_private_info_template< UT >* >
1231  ( th->th.th_dispatch->th_dispatch_pr_current );
1232  dispatch_shared_info_template< UT > volatile * sh =
1233  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1234  ( th->th.th_dispatch->th_dispatch_sh_current );
1235  KMP_DEBUG_ASSERT( pr );
1236  KMP_DEBUG_ASSERT( sh );
1237  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1238  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1239 
1240  if ( pr->ordered_bumped ) {
1241  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1242  gtid ) );
1243  pr->ordered_bumped = 0;
1244  } else {
1245  UT lower = pr->u.p.ordered_lower;
1246 
1247  #ifdef KMP_DEBUG
1248  {
1249  const char * buff;
1250  // create format specifiers before the debug output
1251  buff = __kmp_str_format(
1252  "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1253  traits_t< UT >::spec, traits_t< UT >::spec );
1254  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1255  __kmp_str_free( &buff );
1256  }
1257  #endif
1258 
1259  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1260  USE_ITT_BUILD_ARG(NULL)
1261  );
1262  KMP_MB(); /* is this necessary? */
1263  #ifdef KMP_DEBUG
1264  {
1265  const char * buff;
1266  // create format specifiers before the debug output
1267  buff = __kmp_str_format(
1268  "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1269  traits_t< UT >::spec, traits_t< UT >::spec );
1270  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1271  __kmp_str_free( &buff );
1272  }
1273  #endif
1274 
1275  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1276  } // if
1277  } // if
1278  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1279 }
1280 
1281 #ifdef KMP_GOMP_COMPAT
1282 
1283 template< typename UT >
1284 static void
1285 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1286 {
1287  typedef typename traits_t< UT >::signed_t ST;
1288  kmp_info_t *th = __kmp_threads[ gtid ];
1289 
1290  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1291  if ( ! th -> th.th_team -> t.t_serialized ) {
1292 // int cid;
1293  dispatch_private_info_template< UT > * pr =
1294  reinterpret_cast< dispatch_private_info_template< UT >* >
1295  ( th->th.th_dispatch->th_dispatch_pr_current );
1296  dispatch_shared_info_template< UT > volatile * sh =
1297  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1298  ( th->th.th_dispatch->th_dispatch_sh_current );
1299  KMP_DEBUG_ASSERT( pr );
1300  KMP_DEBUG_ASSERT( sh );
1301  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1302  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1303 
1304 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1305  UT lower = pr->u.p.ordered_lower;
1306  UT upper = pr->u.p.ordered_upper;
1307  UT inc = upper - lower + 1;
1308 
1309  if ( pr->ordered_bumped == inc ) {
1310  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1311  gtid ) );
1312  pr->ordered_bumped = 0;
1313  } else {
1314  inc -= pr->ordered_bumped;
1315 
1316  #ifdef KMP_DEBUG
1317  {
1318  const char * buff;
1319  // create format specifiers before the debug output
1320  buff = __kmp_str_format(
1321  "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1322  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1323  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1324  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1325  __kmp_str_free( &buff );
1326  }
1327  #endif
1328 
1329  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1330  USE_ITT_BUILD_ARG(NULL)
1331  );
1332 
1333  KMP_MB(); /* is this necessary? */
1334  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1335  gtid ) );
1336  pr->ordered_bumped = 0;
1338  #ifdef KMP_DEBUG
1339  {
1340  const char * buff;
1341  // create format specifiers before the debug output
1342  buff = __kmp_str_format(
1343  "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1344  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1345  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1346  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1347  __kmp_str_free( &buff );
1348  }
1349  #endif
1350 
1351  test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1352  }
1353 // }
1354  }
1355  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1356 }
1357 
1358 #endif /* KMP_GOMP_COMPAT */
1359 
1360 template< typename T >
1361 static int
1362 __kmp_dispatch_next(
1363  ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1364 ) {
1365 
1366  typedef typename traits_t< T >::unsigned_t UT;
1367  typedef typename traits_t< T >::signed_t ST;
1368  typedef typename traits_t< T >::floating_t DBL;
1369  static const int ___kmp_size_type = sizeof( UT );
1370 
1371  int status;
1372  dispatch_private_info_template< T > * pr;
1373  kmp_info_t * th = __kmp_threads[ gtid ];
1374  kmp_team_t * team = th -> th.th_team;
1375 
1376  KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1377  #ifdef KMP_DEBUG
1378  {
1379  const char * buff;
1380  // create format specifiers before the debug output
1381  buff = __kmp_str_format(
1382  "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1383  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1384  KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1385  __kmp_str_free( &buff );
1386  }
1387  #endif
1388 
1389  if ( team -> t.t_serialized ) {
1390  /* NOTE: serialize this dispatch because we are not at the active level */
1391  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1392  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1393  KMP_DEBUG_ASSERT( pr );
1394 
1395  if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1396  *p_lb = 0;
1397  *p_ub = 0;
1398 // if ( p_last != NULL )
1399 // *p_last = 0;
1400  if ( p_st != NULL )
1401  *p_st = 0;
1402  if ( __kmp_env_consistency_check ) {
1403  if ( pr->pushed_ws != ct_none ) {
1404  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1405  }
1406  }
1407  } else if ( pr->nomerge ) {
1408  kmp_int32 last;
1409  T start;
1410  UT limit, trip, init;
1411  ST incr;
1412  T chunk = pr->u.p.parm1;
1413 
1414  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1415 
1416  init = chunk * pr->u.p.count++;
1417  trip = pr->u.p.tc - 1;
1418 
1419  if ( (status = (init <= trip)) == 0 ) {
1420  *p_lb = 0;
1421  *p_ub = 0;
1422 // if ( p_last != NULL )
1423 // *p_last = 0;
1424  if ( p_st != NULL )
1425  *p_st = 0;
1426  if ( __kmp_env_consistency_check ) {
1427  if ( pr->pushed_ws != ct_none ) {
1428  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1429  }
1430  }
1431  } else {
1432  start = pr->u.p.lb;
1433  limit = chunk + init - 1;
1434  incr = pr->u.p.st;
1435 
1436  if ( (last = (limit >= trip)) != 0 ) {
1437  limit = trip;
1438  #if KMP_OS_WINDOWS
1439  pr->u.p.last_upper = pr->u.p.ub;
1440  #endif /* KMP_OS_WINDOWS */
1441  }
1442  if ( p_last != NULL )
1443  *p_last = last;
1444  if ( p_st != NULL )
1445  *p_st = incr;
1446  if ( incr == 1 ) {
1447  *p_lb = start + init;
1448  *p_ub = start + limit;
1449  } else {
1450  *p_lb = start + init * incr;
1451  *p_ub = start + limit * incr;
1452  }
1453 
1454  if ( pr->ordered ) {
1455  pr->u.p.ordered_lower = init;
1456  pr->u.p.ordered_upper = limit;
1457  #ifdef KMP_DEBUG
1458  {
1459  const char * buff;
1460  // create format specifiers before the debug output
1461  buff = __kmp_str_format(
1462  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1463  traits_t< UT >::spec, traits_t< UT >::spec );
1464  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1465  __kmp_str_free( &buff );
1466  }
1467  #endif
1468  } // if
1469  } // if
1470  } else {
1471  pr->u.p.tc = 0;
1472  *p_lb = pr->u.p.lb;
1473  *p_ub = pr->u.p.ub;
1474  #if KMP_OS_WINDOWS
1475  pr->u.p.last_upper = *p_ub;
1476  #endif /* KMP_OS_WINDOWS */
1477  if ( p_last != NULL )
1478  *p_last = TRUE;
1479  if ( p_st != NULL )
1480  *p_st = pr->u.p.st;
1481  } // if
1482  #ifdef KMP_DEBUG
1483  {
1484  const char * buff;
1485  // create format specifiers before the debug output
1486  buff = __kmp_str_format(
1487  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1488  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1489  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1490  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1491  __kmp_str_free( &buff );
1492  }
1493  #endif
1494 #if INCLUDE_SSC_MARKS
1495  SSC_MARK_DISPATCH_NEXT();
1496 #endif
1497  return status;
1498  } else {
1499  kmp_int32 last = 0;
1500  dispatch_shared_info_template< UT > *sh;
1501  T start;
1502  ST incr;
1503  UT limit, trip, init;
1504 
1505  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1506  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1507 
1508  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1509  ( th->th.th_dispatch->th_dispatch_pr_current );
1510  KMP_DEBUG_ASSERT( pr );
1511  sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1512  ( th->th.th_dispatch->th_dispatch_sh_current );
1513  KMP_DEBUG_ASSERT( sh );
1514 
1515  if ( pr->u.p.tc == 0 ) {
1516  // zero trip count
1517  status = 0;
1518  } else {
1519  switch (pr->schedule) {
1520  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1521  case kmp_sch_static_steal:
1522  {
1523  T chunk = pr->u.p.parm1;
1524 
1525  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1526 
1527  trip = pr->u.p.tc - 1;
1528 
1529  if ( ___kmp_size_type > 4 ) {
1530  // Other threads do not look into the data of this thread,
1531  // so a volatile cast is not necessary.
1532  init = ( pr->u.p.count )++;
1533  status = ( init < (UT)pr->u.p.ub );
1534  } else {
1535  typedef union {
1536  struct {
1537  UT count;
1538  T ub;
1539  } p;
1540  kmp_int64 b;
1541  } union_i4;
1542  // All operations on 'count' or 'ub' must be combined atomically together.
1543  // stealing implemented only for 4-byte indexes
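 /* Editorial worked example, not part of the original source: packing 'count'
    and 'ub' into one 64-bit word lets a single CAS keep them consistent while
    the owner bumps 'count' and a thief shrinks 'ub'. E.g. with count = 10,
    ub = 30 a thief sees remaining = 20, takes remaining >> 2 = 5 chunks and
    CASes the victim to {count = 10, ub = 25}; the thief then works on chunk
    indexes 25..29. */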
1544  {
1545  union_i4 vold, vnew;
1546  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1547  vnew = vold;
1548  vnew.p.count++;
1549  while( ! KMP_COMPARE_AND_STORE_ACQ64(
1550  ( volatile kmp_int64* )&pr->u.p.count,
1551  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1552  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1553  KMP_CPU_PAUSE();
1554  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1555  vnew = vold;
1556  vnew.p.count++;
1557  }
1558  vnew = vold;
1559  init = vnew.p.count;
1560  status = ( init < (UT)vnew.p.ub ) ;
1561  }
1562 
1563  if( !status ) {
1564  kmp_info_t **other_threads = team->t.t_threads;
1565  int while_limit = 10;
1566  int while_index = 0;
1567 
1568  // TODO: algorithm of searching for a victim
1569  // should be cleaned up and measured
1570  while ( ( !status ) && ( while_limit != ++while_index ) ) {
1571  union_i4 vold, vnew;
1572  kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1573  T victimIdx = pr->u.p.parm4;
1574  T oldVictimIdx = victimIdx;
1575  dispatch_private_info_template< T > * victim;
1576 
1577  do {
1578  if( !victimIdx ) {
1579  victimIdx = team->t.t_nproc - 1;
1580  } else {
1581  --victimIdx;
1582  }
1583  victim = reinterpret_cast< dispatch_private_info_template< T >* >
1584  ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1585  } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1586  // TODO: think about a proper place of this test
1587  if ( ( !victim ) ||
1588  ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1589  (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1590  // TODO: delay would be nice
1591  continue;
1592  // the victim is not yet ready to participate in stealing
1593  // because it is still in __kmp_dispatch_init
1594  }
1595  if ( oldVictimIdx == victimIdx ) {
1596  break;
1597  }
1598  pr->u.p.parm4 = victimIdx;
1599 
1600  while( 1 ) {
1601  vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1602  vnew = vold;
1603 
1604  KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1605  if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1606  break;
1607  }
1608  vnew.p.ub -= (remaining >> 2);
1609  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1610  #pragma warning( push )
1611  // disable warning on pointless comparison of unsigned with 0
1612  #pragma warning( disable: 186 )
1613  KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1614  #pragma warning( pop )
1615  // TODO: Should this be acquire or release?
1616  if ( KMP_COMPARE_AND_STORE_ACQ64(
1617  ( volatile kmp_int64 * )&victim->u.p.count,
1618  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1619  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1620  status = 1;
1621  while_index = 0;
1622  // now update own count and ub
1623  #if KMP_ARCH_X86
1624  // stealing executed on non-KMP_ARCH_X86 only
1625  // Atomic 64-bit write on ia32 is
1626  // unavailable, so we do this in steps.
1627  // This code is not tested.
1628  init = vold.p.count;
1629  pr->u.p.ub = 0;
1630  pr->u.p.count = init + 1;
1631  pr->u.p.ub = vnew.p.count;
1632  #else
1633  init = vnew.p.ub;
1634  vold.p.count = init + 1;
1635  // TODO: is it safe and enough?
1636  *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1637  #endif // KMP_ARCH_X86
1638  break;
1639  } // if
1640  KMP_CPU_PAUSE();
1641  } // while (1)
1642  } // while
1643  } // if
1644  } // if
1645  if ( !status ) {
1646  *p_lb = 0;
1647  *p_ub = 0;
1648  if ( p_st != NULL ) *p_st = 0;
1649  } else {
1650  start = pr->u.p.parm2;
1651  init *= chunk;
1652  limit = chunk + init - 1;
1653  incr = pr->u.p.st;
1654 
1655  KMP_DEBUG_ASSERT(init <= trip);
1656  if ( (last = (limit >= trip)) != 0 )
1657  limit = trip;
1658  if ( p_st != NULL ) *p_st = incr;
1659 
1660  if ( incr == 1 ) {
1661  *p_lb = start + init;
1662  *p_ub = start + limit;
1663  } else {
1664  *p_lb = start + init * incr;
1665  *p_ub = start + limit * incr;
1666  }
1667 
1668  if ( pr->ordered ) {
1669  pr->u.p.ordered_lower = init;
1670  pr->u.p.ordered_upper = limit;
1671  #ifdef KMP_DEBUG
1672  {
1673  const char * buff;
1674  // create format specifiers before the debug output
1675  buff = __kmp_str_format(
1676  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1677  traits_t< UT >::spec, traits_t< UT >::spec );
1678  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1679  __kmp_str_free( &buff );
1680  }
1681  #endif
1682  } // if
1683  } // if
1684  break;
1685  } // case
1686  #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
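 // [Editorial sketch, not part of the original source] The stealing path above
 // packs the victim's (count, ub) pair into one 64-bit word and claims roughly a
 // quarter of the victim's remaining chunk range with a single 64-bit CAS. The
 // standalone illustration below shows the same idea with std::atomic; the names
 // (packed_range, try_steal) are hypothetical and simplified, and it reuses the
 // same union-based packing trick the runtime itself relies on.
 #if 0
 #include <atomic>
 #include <cstdint>

 union packed_range {                       // mirrors the (count, ub) packing
     uint64_t b;
     struct { uint32_t count, ub; } p;      // chunk indices, half-open [count, ub)
 };

 // Try to steal about a quarter of the victim's remaining chunks.
 // On success the thief owns chunk indices [*lo, *hi) taken from the tail.
 static bool try_steal(std::atomic<uint64_t> &victim, uint32_t *lo, uint32_t *hi) {
     packed_range vold, vnew;
     vold.b = victim.load(std::memory_order_acquire);
     for (;;) {
         if (vold.p.count >= vold.p.ub)
             return false;                              // victim already empty
         uint32_t remaining = vold.p.ub - vold.p.count;
         if (remaining < 4)
             return false;                              // too little left to bother
         vnew = vold;
         vnew.p.ub -= remaining >> 2;                   // leave the head to the victim
         if (victim.compare_exchange_weak(vold.b, vnew.b,
                                          std::memory_order_acq_rel)) {
             *lo = vnew.p.ub;                           // stolen tail quarter
             *hi = vold.p.ub;
             return true;
         }
         // CAS failed: vold.b now holds the current value, retry.
     }
 }
 #endif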
1687  case kmp_sch_static_balanced:
1688  {
1689  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1690  if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1691  pr->u.p.count = 1;
1692  *p_lb = pr->u.p.lb;
1693  *p_ub = pr->u.p.ub;
1694  last = pr->u.p.parm1;
1695  if ( p_st != NULL )
1696  *p_st = pr->u.p.st;
1697  } else { /* no iterations to do */
1698  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1699  }
1700  if ( pr->ordered ) {
1701  #ifdef KMP_DEBUG
1702  {
1703  const char * buff;
1704  // create format specifiers before the debug output
1705  buff = __kmp_str_format(
1706  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1707  traits_t< UT >::spec, traits_t< UT >::spec );
1708  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1709  __kmp_str_free( &buff );
1710  }
1711  #endif
1712  } // if
1713  } // case
1714  break;
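 // [Editorial sketch, not part of the original source] Under static_balanced each
 // thread is handed exactly one contiguous block, precomputed in dispatch_init;
 // the case above merely returns it on the first call and reports "done" after
 // that. A hypothetical standalone computation of such a block (helper name and
 // exact rounding are illustrative only):
 #if 0
 static void balanced_block(unsigned tid, unsigned nproc, unsigned tc,
                            unsigned *lo, unsigned *hi /* inclusive */) {
     unsigned small  = tc / nproc;              // base block size
     unsigned extras = tc % nproc;              // first `extras` threads get one more
     *lo = tid * small + (tid < extras ? tid : extras);
     *hi = *lo + small - 1 + (tid < extras ? 1 : 0);
 }
 #endif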
1715  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1716  case kmp_sch_static_chunked:
1717  {
1718  T parm1;
1719 
1720  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1721  gtid ) );
1722  parm1 = pr->u.p.parm1;
1723 
1724  trip = pr->u.p.tc - 1;
1725  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1726 
1727  if ( (status = (init <= trip)) != 0 ) {
1728  start = pr->u.p.lb;
1729  incr = pr->u.p.st;
1730  limit = parm1 + init - 1;
1731 
1732  if ( (last = (limit >= trip)) != 0 )
1733  limit = trip;
1734 
1735  if ( p_st != NULL ) *p_st = incr;
1736 
1737  pr->u.p.count += team->t.t_nproc;
1738 
1739  if ( incr == 1 ) {
1740  *p_lb = start + init;
1741  *p_ub = start + limit;
1742  }
1743  else {
1744  *p_lb = start + init * incr;
1745  *p_ub = start + limit * incr;
1746  }
1747 
1748  if ( pr->ordered ) {
1749  pr->u.p.ordered_lower = init;
1750  pr->u.p.ordered_upper = limit;
1751  #ifdef KMP_DEBUG
1752  {
1753  const char * buff;
1754  // create format specifiers before the debug output
1755  buff = __kmp_str_format(
1756  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1757  traits_t< UT >::spec, traits_t< UT >::spec );
1758  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1759  __kmp_str_free( &buff );
1760  }
1761  #endif
1762  } // if
1763  } // if
1764  } // case
1765  break;
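 // [Editorial sketch, not part of the original source] The chunked case above
 // deals fixed-size blocks out round-robin: on its k-th call a thread starts at
 // chunk * (k*nproc + tid), because u.p.count advances by nproc per call. A
 // hypothetical standalone version of that arithmetic (names illustrative only):
 #if 0
 // e.g. nproc = 4, chunk = 10: thread 2 gets [20,29], [60,69], [100,109], ...
 static void static_chunked_block(unsigned k, unsigned tid, unsigned nproc,
                                  unsigned chunk, unsigned tc,
                                  unsigned *lo, unsigned *hi /* inclusive */) {
     unsigned init  = chunk * (k * nproc + tid);   // assumes init <= tc - 1
     unsigned limit = init + chunk - 1;
     if (limit > tc - 1) limit = tc - 1;           // clip the final block
     *lo = init;
     *hi = limit;
 }
 #endif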
1766 
1767  case kmp_sch_dynamic_chunked:
1768  {
1769  T chunk = pr->u.p.parm1;
1770 
1771  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1772  gtid ) );
1773 
1774  init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1775  trip = pr->u.p.tc - 1;
1776 
1777  if ( (status = (init <= trip)) == 0 ) {
1778  *p_lb = 0;
1779  *p_ub = 0;
1780  if ( p_st != NULL ) *p_st = 0;
1781  } else {
1782  start = pr->u.p.lb;
1783  limit = chunk + init - 1;
1784  incr = pr->u.p.st;
1785 
1786  if ( (last = (limit >= trip)) != 0 )
1787  limit = trip;
1788 
1789  if ( p_st != NULL ) *p_st = incr;
1790 
1791  if ( incr == 1 ) {
1792  *p_lb = start + init;
1793  *p_ub = start + limit;
1794  } else {
1795  *p_lb = start + init * incr;
1796  *p_ub = start + limit * incr;
1797  }
1798 
1799  if ( pr->ordered ) {
1800  pr->u.p.ordered_lower = init;
1801  pr->u.p.ordered_upper = limit;
1802  #ifdef KMP_DEBUG
1803  {
1804  const char * buff;
1805  // create format specifiers before the debug output
1806  buff = __kmp_str_format(
1807  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1808  traits_t< UT >::spec, traits_t< UT >::spec );
1809  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1810  __kmp_str_free( &buff );
1811  }
1812  #endif
1813  } // if
1814  } // if
1815  } // case
1816  break;
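 // [Editorial sketch, not part of the original source] kmp_sch_dynamic_chunked
 // above is a fetch-and-increment on the shared chunk counter: the winner of
 // index i takes iterations [i*chunk, i*chunk + chunk - 1], clipped to the trip
 // count. The same pattern in standalone form with std::atomic (names
 // illustrative only):
 #if 0
 #include <atomic>
 #include <cstdint>

 static bool next_dynamic_chunk(std::atomic<uint64_t> &iteration,  // shared chunk counter
                                uint64_t chunk, uint64_t tc,       // chunk size, trip count > 0
                                uint64_t *lo, uint64_t *hi) {      // [*lo, *hi] inclusive
     uint64_t init = chunk * iteration.fetch_add(1, std::memory_order_acquire);
     if (init > tc - 1)
         return false;                     // all chunks already handed out
     *lo = init;
     *hi = init + chunk - 1;
     if (*hi > tc - 1) *hi = tc - 1;       // the last chunk may be short
     return true;
 }
 #endif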
1817 
1818  case kmp_sch_guided_iterative_chunked:
1819  {
1820  T chunkspec = pr->u.p.parm1;
1821  KD_TRACE(100,
1822  ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1823  trip = pr->u.p.tc;
1824  // Start atomic part of calculations
1825  while(1) {
1826  ST remaining; // signed, because can be < 0
1827  init = sh->u.s.iteration; // shared value
1828  remaining = trip - init;
1829  if ( remaining <= 0 ) { // AC: need to compare with 0 first
1830  // nothing to do, don't try atomic op
1831  status = 0;
1832  break;
1833  }
1834  if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
 1835  // use dynamic-style schedule
 1836  // atomically increment iterations, get old value
1837  init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1838  remaining = trip - init;
1839  if (remaining <= 0) {
1840  status = 0; // all iterations got by other threads
1841  } else {
1842  // got some iterations to work on
1843  status = 1;
1844  if ( (T)remaining > chunkspec ) {
1845  limit = init + chunkspec - 1;
1846  } else {
1847  last = 1; // the last chunk
1848  limit = init + remaining - 1;
1849  } // if
1850  } // if
1851  break;
1852  } // if
1853  limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1854  if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1855  // CAS was successful, chunk obtained
1856  status = 1;
1857  --limit;
1858  break;
1859  } // if
1860  } // while
1861  if ( status != 0 ) {
1862  start = pr->u.p.lb;
1863  incr = pr->u.p.st;
1864  if ( p_st != NULL )
1865  *p_st = incr;
1866  *p_lb = start + init * incr;
1867  *p_ub = start + limit * incr;
1868  if ( pr->ordered ) {
1869  pr->u.p.ordered_lower = init;
1870  pr->u.p.ordered_upper = limit;
1871  #ifdef KMP_DEBUG
1872  {
1873  const char * buff;
1874  // create format specifiers before the debug output
1875  buff = __kmp_str_format(
1876  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1877  traits_t< UT >::spec, traits_t< UT >::spec );
1878  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1879  __kmp_str_free( &buff );
1880  }
1881  #endif
1882  } // if
1883  } else {
1884  *p_lb = 0;
1885  *p_ub = 0;
1886  if ( p_st != NULL )
1887  *p_st = 0;
1888  } // if
1889  } // case
1890  break;
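 // [Editorial sketch, not part of the original source] The guided (iterative)
 // case above repeatedly CAS-es the shared position forward by a fixed fraction
 // of the remaining iterations (parm3 holds ~1/(K*nproc)), and switches to plain
 // atomic-add chunks once little work is left. A simplified standalone sketch of
 // the CAS half of that scheme (the parm2 threshold handling is omitted; all
 // names are illustrative):
 #if 0
 #include <atomic>
 #include <cstdint>

 static bool next_guided_chunk(std::atomic<int64_t> &iteration,   // shared position
                               int64_t tc, int64_t chunk,         // trip count, minimum grab >= 1
                               double frac,                       // ~1/(K*nproc), cf. parm3
                               int64_t *lo, int64_t *hi) {        // [*lo, *hi] inclusive
     for (;;) {
         int64_t init = iteration.load(std::memory_order_acquire);
         int64_t remaining = tc - init;
         if (remaining <= 0)
             return false;                                 // nothing left
         int64_t grab = (int64_t)(remaining * frac);       // proportional piece
         if (grab < chunk) grab = chunk;                   // never below one chunk
         int64_t limit = init + grab;
         if (limit > tc) limit = tc;
         if (iteration.compare_exchange_weak(init, limit,
                                             std::memory_order_acq_rel)) {
             *lo = init;
             *hi = limit - 1;                              // claimed [init, limit)
             return true;
         }
         // lost the race: `init` was reloaded by the CAS, just retry
     }
 }
 #endif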
1891 
1892  case kmp_sch_guided_analytical_chunked:
1893  {
1894  T chunkspec = pr->u.p.parm1;
1895  UT chunkIdx;
1896  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1897  /* for storing original FPCW value for Windows* OS on
1898  IA-32 architecture 8-byte version */
1899  unsigned int oldFpcw;
1900  unsigned int fpcwSet = 0;
1901  #endif
1902  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1903  gtid ) );
1904 
1905  trip = pr->u.p.tc;
1906 
1907  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1908  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1909 
1910  while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1911  chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1912  if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1913  --trip;
1914  /* use dynamic-style scheduling */
1915  init = chunkIdx * chunkspec + pr->u.p.count;
1916  /* need to verify init > 0 in case of overflow in the above calculation */
1917  if ( (status = (init > 0 && init <= trip)) != 0 ) {
1918  limit = init + chunkspec -1;
1919 
1920  if ( (last = (limit >= trip)) != 0 )
1921  limit = trip;
1922  }
1923  break;
1924  } else {
1925  /* use exponential-style scheduling */
1926  /* The following check is to workaround the lack of long double precision on Windows* OS.
1927  This check works around the possible effect that init != 0 for chunkIdx == 0.
1928  */
1929  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1930  /* If we haven't already done so, save original
1931  FPCW and set precision to 64-bit, as Windows* OS
1932  on IA-32 architecture defaults to 53-bit */
1933  if ( !fpcwSet ) {
1934  oldFpcw = _control87(0,0);
1935  _control87(_PC_64,_MCW_PC);
1936  fpcwSet = 0x30000;
1937  }
1938  #endif
1939  if ( chunkIdx ) {
1940  init = __kmp_dispatch_guided_remaining< T >(
1941  trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1942  KMP_DEBUG_ASSERT(init);
1943  init = trip - init;
1944  } else
1945  init = 0;
1946  limit = trip - __kmp_dispatch_guided_remaining< T >(
1947  trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1948  KMP_ASSERT(init <= limit);
1949  if ( init < limit ) {
1950  KMP_DEBUG_ASSERT(limit <= trip);
1951  --limit;
1952  status = 1;
1953  break;
1954  } // if
1955  } // if
1956  } // while (1)
1957  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1958  /* restore FPCW if necessary
1959  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1960  */
1961  if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1962  _control87(oldFpcw,_MCW_PC);
1963  #endif
1964  if ( status != 0 ) {
1965  start = pr->u.p.lb;
1966  incr = pr->u.p.st;
1967  if ( p_st != NULL )
1968  *p_st = incr;
1969  *p_lb = start + init * incr;
1970  *p_ub = start + limit * incr;
1971  if ( pr->ordered ) {
1972  pr->u.p.ordered_lower = init;
1973  pr->u.p.ordered_upper = limit;
1974  #ifdef KMP_DEBUG
1975  {
1976  const char * buff;
1977  // create format specifiers before the debug output
1978  buff = __kmp_str_format(
1979  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1980  traits_t< UT >::spec, traits_t< UT >::spec );
1981  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1982  __kmp_str_free( &buff );
1983  }
1984  #endif
1985  }
1986  } else {
1987  *p_lb = 0;
1988  *p_ub = 0;
1989  if ( p_st != NULL )
1990  *p_st = 0;
1991  }
1992  } // case
1993  break;
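 // [Editorial sketch, not part of the original source] In the analytical guided
 // case the chunk boundaries follow a geometric progression: writing x for the
 // per-chunk decay factor stored via parm3, roughly remaining(k) ~ trip * x^k,
 // so chunk k spans [trip - remaining(k), trip - remaining(k+1)), until the
 // chunks shrink and the chunkIdx >= parm2 branch falls back to dynamic-style
 // chunks. A hypothetical illustration of that arithmetic only; the real
 // __kmp_dispatch_guided_remaining rounds far more carefully:
 #if 0
 #include <cmath>
 #include <cstdint>

 static void analytical_chunk(uint64_t trip, double x, uint64_t k,
                              uint64_t *lo, uint64_t *hi /* inclusive */) {
     uint64_t rem_k  = (uint64_t)(trip * std::pow(x, (double)k));        // left after k chunks
     uint64_t rem_k1 = (uint64_t)(trip * std::pow(x, (double)(k + 1)));  // left after k+1 chunks
     *lo = trip - rem_k;
     *hi = trip - rem_k1 - 1;   // may be empty; callers need an init < limit guard as above
 }
 #endif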
1994 
1995  case kmp_sch_trapezoidal:
1996  {
1997  UT index;
1998  T parm2 = pr->u.p.parm2;
1999  T parm3 = pr->u.p.parm3;
2000  T parm4 = pr->u.p.parm4;
2001  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2002  gtid ) );
2003 
2004  index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2005 
2006  init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2007  trip = pr->u.p.tc - 1;
2008 
2009  if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2010  *p_lb = 0;
2011  *p_ub = 0;
2012  if ( p_st != NULL ) *p_st = 0;
2013  } else {
2014  start = pr->u.p.lb;
2015  limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2016  incr = pr->u.p.st;
2017 
2018  if ( (last = (limit >= trip)) != 0 )
2019  limit = trip;
2020 
2021  if ( p_st != NULL ) *p_st = incr;
2022 
2023  if ( incr == 1 ) {
2024  *p_lb = start + init;
2025  *p_ub = start + limit;
2026  } else {
2027  *p_lb = start + init * incr;
2028  *p_ub = start + limit * incr;
2029  }
2030 
2031  if ( pr->ordered ) {
2032  pr->u.p.ordered_lower = init;
2033  pr->u.p.ordered_upper = limit;
2034  #ifdef KMP_DEBUG
2035  {
2036  const char * buff;
2037  // create format specifiers before the debug output
2038  buff = __kmp_str_format(
2039  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2040  traits_t< UT >::spec, traits_t< UT >::spec );
2041  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2042  __kmp_str_free( &buff );
2043  }
2044  #endif
2045  } // if
2046  } // if
2047  } // case
2048  break;
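 // [Editorial sketch, not part of the original source] The trapezoidal schedule
 // above uses linearly shrinking chunk sizes, size(i) = first - i*step, so chunk
 // i starts at the arithmetic-series prefix sum i*(2*first - (i-1)*step)/2, which
 // is exactly the init/limit arithmetic in the case body. The same formulas in
 // standalone form (first/step stand for parm2/parm4 and are illustrative only;
 // valid for the chunk indices the schedule actually issues, i.e. i < parm3):
 #if 0
 #include <cstdint>

 static void trapezoid_chunk(uint64_t i, uint64_t first /* parm2 */,
                             uint64_t step /* parm4 */, uint64_t tc,
                             uint64_t *lo, uint64_t *hi /* inclusive */) {
     uint64_t init  = ( i      * (2 * first - i * step + step)) / 2;
     uint64_t limit = ((i + 1) * (2 * first - i * step       )) / 2 - 1;
     if (limit > tc - 1) limit = tc - 1;   // clip the final chunk
     *lo = init;
     *hi = limit;
 }
 #endif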
2049  default:
2050  {
2051  status = 0; // to avoid complaints on uninitialized variable use
2052  __kmp_msg(
2053  kmp_ms_fatal, // Severity
2054  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2055  KMP_HNT( GetNewerLibrary ), // Hint
2056  __kmp_msg_null // Variadic argument list terminator
2057  );
2058  }
2059  break;
2060  } // switch
2061  } // if tc == 0;
2062 
2063  if ( status == 0 ) {
2064  UT num_done;
2065 
2066  num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2067  #ifdef KMP_DEBUG
2068  {
2069  const char * buff;
2070  // create format specifiers before the debug output
2071  buff = __kmp_str_format(
2072  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2073  traits_t< UT >::spec );
2074  KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2075  __kmp_str_free( &buff );
2076  }
2077  #endif
2078 
2079  if ( (ST)num_done == team->t.t_nproc-1 ) {
2080  /* NOTE: release this buffer to be reused */
2081 
2082  KMP_MB(); /* Flush all pending memory write invalidates. */
2083 
2084  sh->u.s.num_done = 0;
2085  sh->u.s.iteration = 0;
2086 
2087  /* TODO replace with general release procedure? */
2088  if ( pr->ordered ) {
2089  sh->u.s.ordered_iteration = 0;
2090  }
2091 
2092  KMP_MB(); /* Flush all pending memory write invalidates. */
2093 
2094  sh -> buffer_index += KMP_MAX_DISP_BUF;
2095  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2096  gtid, sh->buffer_index) );
2097 
2098  KMP_MB(); /* Flush all pending memory write invalidates. */
2099 
2100  } // if
2101  if ( __kmp_env_consistency_check ) {
2102  if ( pr->pushed_ws != ct_none ) {
2103  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2104  }
2105  }
2106 
2107  th -> th.th_dispatch -> th_deo_fcn = NULL;
2108  th -> th.th_dispatch -> th_dxo_fcn = NULL;
2109  th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2110  th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2111  } // if (status == 0)
2112 #if KMP_OS_WINDOWS
2113  else if ( last ) {
2114  pr->u.p.last_upper = pr->u.p.ub;
2115  }
2116 #endif /* KMP_OS_WINDOWS */
2117  if ( p_last != NULL && status != 0 )
2118  *p_last = last;
2119  } // if
2120 
2121  #ifdef KMP_DEBUG
2122  {
2123  const char * buff;
2124  // create format specifiers before the debug output
2125  buff = __kmp_str_format(
2126  "__kmp_dispatch_next: T#%%d normal case: " \
2127  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2128  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2129  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2130  __kmp_str_free( &buff );
2131  }
2132  #endif
2133 #if INCLUDE_SSC_MARKS
2134  SSC_MARK_DISPATCH_NEXT();
2135 #endif
2136  return status;
2137 }
2138 
2139 template< typename T >
2140 static void
2141 __kmp_dist_get_bounds(
2142  ident_t *loc,
2143  kmp_int32 gtid,
2144  kmp_int32 *plastiter,
2145  T *plower,
2146  T *pupper,
2147  typename traits_t< T >::signed_t incr
2148 ) {
2149  KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2150  typedef typename traits_t< T >::unsigned_t UT;
2151  typedef typename traits_t< T >::signed_t ST;
2152  register kmp_uint32 team_id;
2153  register kmp_uint32 nteams;
2154  register UT trip_count;
2155  register kmp_team_t *team;
2156  kmp_info_t * th;
2157 
2158  KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2159  KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2160  #ifdef KMP_DEBUG
2161  {
2162  const char * buff;
2163  // create format specifiers before the debug output
2164  buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2165  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2166  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2167  traits_t< T >::spec );
2168  KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2169  __kmp_str_free( &buff );
2170  }
2171  #endif
2172 
2173  if( __kmp_env_consistency_check ) {
2174  if( incr == 0 ) {
2175  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2176  }
2177  if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2178  // The loop is illegal.
2179  // Some zero-trip loops maintained by compiler, e.g.:
 2180  // Some zero-trip loops are maintained by the compiler, e.g.:
2181  // for(i=0;i>10;--i) // lower <= upper - run-time check
2182  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2183  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2184  // Compiler does not check the following illegal loops:
2185  // for(i=0;i<10;i+=incr) // where incr<0
2186  // for(i=10;i>0;i-=incr) // where incr<0
2187  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2188  }
2189  }
2190  th = __kmp_threads[gtid];
2191  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2192  team = th->th.th_team;
2193  #if OMP_40_ENABLED
2194  nteams = th->th.th_teams_size.nteams;
2195  #endif
2196  team_id = team->t.t_master_tid;
2197  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2198 
2199  // compute global trip count
2200  if( incr == 1 ) {
2201  trip_count = *pupper - *plower + 1;
2202  } else if(incr == -1) {
2203  trip_count = *plower - *pupper + 1;
2204  } else {
2205  trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2206  }
2207  if( trip_count <= nteams ) {
2208  KMP_DEBUG_ASSERT(
2209  __kmp_static == kmp_sch_static_greedy || \
2210  __kmp_static == kmp_sch_static_balanced
2211  ); // Unknown static scheduling type.
 2212  // only some teams get a single iteration, the others get nothing
2213  if( team_id < trip_count ) {
2214  *pupper = *plower = *plower + team_id * incr;
2215  } else {
2216  *plower = *pupper + incr; // zero-trip loop
2217  }
2218  if( plastiter != NULL )
2219  *plastiter = ( team_id == trip_count - 1 );
2220  } else {
2221  if( __kmp_static == kmp_sch_static_balanced ) {
2222  register UT chunk = trip_count / nteams;
2223  register UT extras = trip_count % nteams;
2224  *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2225  *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2226  if( plastiter != NULL )
2227  *plastiter = ( team_id == nteams - 1 );
2228  } else {
2229  register T chunk_inc_count =
2230  ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2231  register T upper = *pupper;
2232  KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2233  // Unknown static scheduling type.
2234  *plower += team_id * chunk_inc_count;
2235  *pupper = *plower + chunk_inc_count - incr;
2236  // Check/correct bounds if needed
2237  if( incr > 0 ) {
2238  if( *pupper < *plower )
2239  *pupper = i_maxmin< T >::mx;
2240  if( plastiter != NULL )
2241  *plastiter = *plower <= upper && *pupper > upper - incr;
2242  if( *pupper > upper )
2243  *pupper = upper; // tracker C73258
2244  } else {
2245  if( *pupper > *plower )
2246  *pupper = i_maxmin< T >::mn;
2247  if( plastiter != NULL )
2248  *plastiter = *plower >= upper && *pupper < upper - incr;
2249  if( *pupper < upper )
2250  *pupper = upper; // tracker C73258
2251  }
2252  }
2253  }
2254 }
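 // [Editorial sketch, not part of the original source] A worked example of the
 // balanced per-team split computed above: trip_count = 100, nteams = 4, incr = 1
 // gives chunk = 25 and extras = 0, so team_id 2 receives [50, 74]. The same
 // arithmetic as a standalone helper (hypothetical name, signed 64-bit bounds):
 #if 0
 #include <cstdint>

 static void balanced_team_bounds(int64_t lower, int64_t incr,
                                  uint64_t trip_count, uint32_t nteams,
                                  uint32_t team_id,
                                  int64_t *plower, int64_t *pupper) {
     uint64_t chunk  = trip_count / nteams;
     uint64_t extras = trip_count % nteams;    // first `extras` teams get one extra iteration
     *plower = lower + incr * (int64_t)(team_id * chunk +
                                        (team_id < extras ? team_id : extras));
     *pupper = *plower + (int64_t)chunk * incr - (team_id < extras ? 0 : incr);
 }
 #endif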
2255 
2256 //-----------------------------------------------------------------------------------------
2257 // Dispatch routines
2258 // Transfer call to template< type T >
2259 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2260 // T lb, T ub, ST st, ST chunk )
2261 extern "C" {
2262 
2278 void
2279 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2280  kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2281 {
2282  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2283  KMP_DEBUG_ASSERT( __kmp_init_serial );
2284  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2285 }
2289 void
2290 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2291  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2292 {
2293  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2294  KMP_DEBUG_ASSERT( __kmp_init_serial );
2295  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2296 }
2297 
2301 void
2302 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2303  kmp_int64 lb, kmp_int64 ub,
2304  kmp_int64 st, kmp_int64 chunk )
2305 {
2306  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2307  KMP_DEBUG_ASSERT( __kmp_init_serial );
2308  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2309 }
2310 
2314 void
2315 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2316  kmp_uint64 lb, kmp_uint64 ub,
2317  kmp_int64 st, kmp_int64 chunk )
2318 {
2319  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2320  KMP_DEBUG_ASSERT( __kmp_init_serial );
2321  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2322 }
2323 
2333 void
2334 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2335  kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2336 {
2337  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2338  KMP_DEBUG_ASSERT( __kmp_init_serial );
2339  __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2340  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2341 }
2342 
2343 void
2344 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2345  kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2346 {
2347  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2348  KMP_DEBUG_ASSERT( __kmp_init_serial );
2349  __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2350  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2351 }
2352 
2353 void
2354 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2355  kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2356 {
2357  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2358  KMP_DEBUG_ASSERT( __kmp_init_serial );
2359  __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2360  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2361 }
2362 
2363 void
2364 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2365  kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2366 {
2367  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2368  KMP_DEBUG_ASSERT( __kmp_init_serial );
2369  __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2370  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2371 }
2372 
2385 int
2386 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2387  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2388 {
2389  return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2390 }
2391 
2395 int
2396 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2397  kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2398 {
2399  return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2400 }
2401 
2405 int
2406 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2407  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2408 {
2409  return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2410 }
2411 
2415 int
2416 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2417  kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2418 {
2419  return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2420 }
2421 
2428 void
2429 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2430 {
2431  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2432 }
2433 
2437 void
2438 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2439 {
2440  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2441 }
2442 
2446 void
2447 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2448 {
2449  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2450 }
2451 
2455 void
2456 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2457 {
2458  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2459 }
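 // [Editorial sketch, not part of the original source] The entry points above are
 // normally driven by compiler-generated code. For a loop like
 //   #pragma omp for schedule(dynamic, 4)   over i = 0..99
 // the generated pattern is roughly the following (loc and gtid are assumed to
 // come from the usual __kmpc_* bookkeeping; ordered loops additionally call
 // __kmpc_dispatch_fini_4 after each chunk):
 #if 0
 kmp_int32 lb, ub, st, last;
 __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                        0 /*lb*/, 99 /*ub*/, 1 /*st*/, 4 /*chunk*/);
 while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
     for (kmp_int32 i = lb; i <= ub; i += st) {
         /* loop body */
     }
 }
 #endif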
2462 //-----------------------------------------------------------------------------------------
 2463 // Non-template routines from kmp_dispatch.cpp used in other sources
2464 
2465 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2466  return value == checker;
2467 }
2468 
2469 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2470  return value != checker;
2471 }
2472 
2473 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2474  return value < checker;
2475 }
2476 
2477 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2478  return value >= checker;
2479 }
2480 
2481 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2482  return value <= checker;
2483 }
2484 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2485  return value == checker;
2486 }
2487 
2488 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2489  return value != checker;
2490 }
2491 
2492 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2493  return value < checker;
2494 }
2495 
2496 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2497  return value >= checker;
2498 }
2499 
2500 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2501  return value <= checker;
2502 }
2503 
2504 kmp_uint32
2505 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2506  kmp_uint32 checker,
2507  kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2508  , void * obj // Higher-level synchronization object, or NULL.
2509  )
2510 {
2511  // note: we may not belong to a team at this point
2512  register volatile kmp_uint32 * spin = spinner;
2513  register kmp_uint32 check = checker;
2514  register kmp_uint32 spins;
2515  register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2516  register kmp_uint32 r;
2517 
2518  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2519  KMP_INIT_YIELD( spins );
2520  // main wait spin loop
2521  while(!f(r = TCR_4(*spin), check)) {
2522  KMP_FSYNC_SPIN_PREPARE( obj );
2523  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2524  It causes problems with infinite recursion because of exit lock */
2525  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2526  __kmp_abort_thread(); */
2527 
2528  /* if we have waited a bit, or are oversubscribed, yield */
2529  /* pause is in the following code */
2530  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2531  KMP_YIELD_SPIN( spins );
2532  }
2533  KMP_FSYNC_SPIN_ACQUIRED( obj );
2534  return r;
2535 }
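 // [Editorial sketch, not part of the original source] The predicate helpers
 // above are meant to be passed to __kmp_wait_yield_4/_8. A minimal hypothetical
 // use: spin (yielding as appropriate) until a shared 32-bit flag reaches 7.
 #if 0
 volatile kmp_uint32 flag = 0;
 /* ... some other thread eventually stores 7 into flag ... */
 kmp_uint32 seen = __kmp_wait_yield_4(&flag, 7, __kmp_eq_4, NULL);
 /* here seen == 7, since __kmp_eq_4(flag, 7) held when the spin exited */
 #endif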
2536 
2537 kmp_uint64
2538 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2539  kmp_uint64 checker,
2540  kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2541  , void * obj // Higher-level synchronization object, or NULL.
2542  )
2543 {
2544  // note: we may not belong to a team at this point
2545  register volatile kmp_uint64 * spin = spinner;
2546  register kmp_uint64 check = checker;
2547  register kmp_uint32 spins;
2548  register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2549  register kmp_uint64 r;
2550 
2551  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2552  KMP_INIT_YIELD( spins );
2553  // main wait spin loop
2554  while(!f(r = *spin, check))
2555  {
2556  KMP_FSYNC_SPIN_PREPARE( obj );
2557  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2558  It causes problems with infinite recursion because of exit lock */
2559  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2560  __kmp_abort_thread(); */
2561 
 2562  // if we are oversubscribed,
 2563  // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
 2564  // the pause is in the following code
2565  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2566  KMP_YIELD_SPIN( spins );
2567  }
2568  KMP_FSYNC_SPIN_ACQUIRED( obj );
2569  return r;
2570 }
2571 
2572 } // extern "C"
2573 
2574 #ifdef KMP_GOMP_COMPAT
2575 
2576 void
2577 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2578  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2579  kmp_int32 chunk, int push_ws )
2580 {
2581  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2582  push_ws );
2583 }
2584 
2585 void
2586 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2587  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2588  kmp_int32 chunk, int push_ws )
2589 {
2590  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2591  push_ws );
2592 }
2593 
2594 void
2595 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2596  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2597  kmp_int64 chunk, int push_ws )
2598 {
2599  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2600  push_ws );
2601 }
2602 
2603 void
2604 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2605  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2606  kmp_int64 chunk, int push_ws )
2607 {
2608  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2609  push_ws );
2610 }
2611 
2612 void
2613 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2614 {
2615  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2616 }
2617 
2618 void
2619 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2620 {
2621  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2622 }
2623 
2624 void
2625 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2626 {
2627  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2628 }
2629 
2630 void
2631 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2632 {
2633  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2634 }
2635 
2636 #endif /* KMP_GOMP_COMPAT */
2637 
2638 /* ------------------------------------------------------------------------ */
2639 /* ------------------------------------------------------------------------ */
2640 