Intel® OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  * $Revision: 43457 $
4  * $Date: 2014-09-17 03:57:22 -0500 (Wed, 17 Sep 2014) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2014 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 /*
38  * Dynamic scheduling initialization and dispatch.
39  *
40  * NOTE: __kmp_nth is constant inside any dispatch loop, but it may change
41  * value between parallel regions. __kmp_max_nth is the largest value
42  * __kmp_nth may take; 1 is the smallest.
43  *
44  */
45 
46 /* ------------------------------------------------------------------------ */
47 /* ------------------------------------------------------------------------ */
48 
49 #include "kmp.h"
50 #include "kmp_i18n.h"
51 #include "kmp_itt.h"
52 #include "kmp_str.h"
53 #include "kmp_error.h"
54 #include "kmp_stats.h"
55 #if KMP_OS_WINDOWS && KMP_ARCH_X86
56  #include <float.h>
57 #endif
58 
59 /* ------------------------------------------------------------------------ */
60 /* ------------------------------------------------------------------------ */
61 
62 // template for type limits
63 template< typename T >
64 struct i_maxmin {
65  static const T mx;
66  static const T mn;
67 };
68 template<>
69 struct i_maxmin< int > {
70  static const int mx = 0x7fffffff;
71  static const int mn = 0x80000000;
72 };
73 template<>
74 struct i_maxmin< unsigned int > {
75  static const unsigned int mx = 0xffffffff;
76  static const unsigned int mn = 0x00000000;
77 };
78 template<>
79 struct i_maxmin< long long > {
80  static const long long mx = 0x7fffffffffffffffLL;
81  static const long long mn = 0x8000000000000000LL;
82 };
83 template<>
84 struct i_maxmin< unsigned long long > {
85  static const unsigned long long mx = 0xffffffffffffffffLL;
86  static const unsigned long long mn = 0x0000000000000000LL;
87 };
88 //-------------------------------------------------------------------------
89 
90 #ifdef KMP_STATIC_STEAL_ENABLED
91 
92  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
93  template< typename T >
94  struct dispatch_private_infoXX_template {
95  typedef typename traits_t< T >::unsigned_t UT;
96  typedef typename traits_t< T >::signed_t ST;
97  UT count; // unsigned
98  T ub;
99  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
100  T lb;
101  ST st; // signed
102  UT tc; // unsigned
103  T static_steal_counter; // for static_steal only; maybe better to put after ub
104 
105  /* parm[1-4] are used in different ways by different scheduling algorithms */
106 
107  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
108  // a) parm3 is properly aligned and
109  // b) all parm1-4 are in the same cache line.
110  // Because parm1-4 are used together, performance seems to be better
111  // if they are in the same cache line (not measured, though).
112 
113  struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
114  T parm1;
115  T parm2;
116  T parm3;
117  T parm4;
118  };
119 
120  UT ordered_lower; // unsigned
121  UT ordered_upper; // unsigned
122  #if KMP_OS_WINDOWS
123  T last_upper;
124  #endif /* KMP_OS_WINDOWS */
125  };
126 
127 #else /* KMP_STATIC_STEAL_ENABLED */
128 
129  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
130  template< typename T >
131  struct dispatch_private_infoXX_template {
132  typedef typename traits_t< T >::unsigned_t UT;
133  typedef typename traits_t< T >::signed_t ST;
134  T lb;
135  T ub;
136  ST st; // signed
137  UT tc; // unsigned
138 
139  T parm1;
140  T parm2;
141  T parm3;
142  T parm4;
143 
144  UT count; // unsigned
145 
146  UT ordered_lower; // unsigned
147  UT ordered_upper; // unsigned
148  #if KMP_OS_WINDOWS
149  T last_upper;
150  #endif /* KMP_OS_WINDOWS */
151  };
152 
153 #endif /* KMP_STATIC_STEAL_ENABLED */
154 
155 // replaces dispatch_private_info structure and dispatch_private_info_t type
156 template< typename T >
157 struct KMP_ALIGN_CACHE dispatch_private_info_template {
158  // duplicate alignment here, otherwise size of structure is not correct in our compiler
159  union KMP_ALIGN_CACHE private_info_tmpl {
160  dispatch_private_infoXX_template< T > p;
161  dispatch_private_info64_t p64;
162  } u;
163  enum sched_type schedule; /* scheduling algorithm */
164  kmp_uint32 ordered; /* ordered clause specified */
165  kmp_uint32 ordered_bumped;
166  kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
167  dispatch_private_info * next; /* stack of buffers for nest of serial regions */
168  kmp_uint32 nomerge; /* don't merge iters if serialized */
169  kmp_uint32 type_size;
170  enum cons_type pushed_ws;
171 };
172 
173 
174 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
175 template< typename UT >
176 struct dispatch_shared_infoXX_template {
177  /* chunk index under dynamic, number of idle threads under static-steal;
178  iteration index otherwise */
179  volatile UT iteration;
180  volatile UT num_done;
181  volatile UT ordered_iteration;
182  UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
183 };
184 
185 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
186 template< typename UT >
187 struct dispatch_shared_info_template {
188  // we need union here to keep the structure size
189  union shared_info_tmpl {
190  dispatch_shared_infoXX_template< UT > s;
191  dispatch_shared_info64_t s64;
192  } u;
193  volatile kmp_uint32 buffer_index;
194 };
195 
196 /* ------------------------------------------------------------------------ */
197 /* ------------------------------------------------------------------------ */
198 
199 #undef USE_TEST_LOCKS
200 
201 // test_then_add template (general template should NOT be used)
202 template< typename T >
203 static __forceinline T
204 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
205 
206 template<>
207 __forceinline kmp_int32
208 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
209 {
210  kmp_int32 r;
211  r = KMP_TEST_THEN_ADD32( p, d );
212  return r;
213 }
214 
215 template<>
216 __forceinline kmp_int64
217 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
218 {
219  kmp_int64 r;
220  r = KMP_TEST_THEN_ADD64( p, d );
221  return r;
222 }
223 
224 // test_then_inc_acq template (general template should NOT be used)
225 template< typename T >
226 static __forceinline T
227 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
228 
229 template<>
230 __forceinline kmp_int32
231 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
232 {
233  kmp_int32 r;
234  r = KMP_TEST_THEN_INC_ACQ32( p );
235  return r;
236 }
237 
238 template<>
239 __forceinline kmp_int64
240 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
241 {
242  kmp_int64 r;
243  r = KMP_TEST_THEN_INC_ACQ64( p );
244  return r;
245 }
246 
247 // test_then_inc template (general template should NOT be used)
248 template< typename T >
249 static __forceinline T
250 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
251 
252 template<>
253 __forceinline kmp_int32
254 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
255 {
256  kmp_int32 r;
257  r = KMP_TEST_THEN_INC32( p );
258  return r;
259 }
260 
261 template<>
262 __forceinline kmp_int64
263 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
264 {
265  kmp_int64 r;
266  r = KMP_TEST_THEN_INC64( p );
267  return r;
268 }
269 
270 // compare_and_swap template (general template should NOT be used)
271 template< typename T >
272 static __forceinline kmp_int32
273 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
274 
275 template<>
276 __forceinline kmp_int32
277 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
278 {
279  return KMP_COMPARE_AND_STORE_REL32( p, c, s );
280 }
281 
282 template<>
283 __forceinline kmp_int32
284 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
285 {
286  return KMP_COMPARE_AND_STORE_REL64( p, c, s );
287 }
288 
289 /*
290  Spin wait loop that first does pause, then yield.
291  Waits until function returns non-zero when called with *spinner and check.
292  Does NOT put threads to sleep.
293 #if USE_ITT_BUILD
294  Arguments:
295  obj -- is higher-level synchronization object to report to ittnotify. It is used to report
296  locks consistently. For example, if lock is acquired immediately, its address is
297  reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
298  immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
299  address, not the address of the low-level spinner.
300 #endif // USE_ITT_BUILD
301 */
302 template< typename UT >
303 // ToDo: make inline function (move to header file for icl)
304 static UT // unsigned 4- or 8-byte type
305 __kmp_wait_yield( volatile UT * spinner,
306  UT checker,
307  kmp_uint32 (* pred)( UT, UT )
308  USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
309  )
310 {
311  // note: we may not belong to a team at this point
312  register volatile UT * spin = spinner;
313  register UT check = checker;
314  register kmp_uint32 spins;
315  register kmp_uint32 (*f) ( UT, UT ) = pred;
316  register UT r;
317 
318  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
319  KMP_INIT_YIELD( spins );
320  // main wait spin loop
321  while(!f(r = *spin, check))
322  {
323  KMP_FSYNC_SPIN_PREPARE( obj );
324  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
325  It causes problems with infinite recursion because of exit lock */
326  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
327  __kmp_abort_thread(); */
328 
329  // if we are oversubscribed,
330  // or have waited a bit (and KMP_LIBRARY=throughput), then yield
331  // pause is in the following code
332  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
333  KMP_YIELD_SPIN( spins );
334  }
335  KMP_FSYNC_SPIN_ACQUIRED( obj );
336  return r;
337 }
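/*
   Usage sketch (taken from __kmp_dispatch_deo / __kmp_dispatch_finish below): the caller
   passes the shared counter, the value it is waiting for, and a predicate such as
   __kmp_ge< UT >, e.g.

       __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                               USE_ITT_BUILD_ARG( NULL ) );

   which spins (pause/yield, never sleeps) until ordered_iteration >= lower and returns
   the value observed when the predicate first held.
*/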
338 
339 template< typename UT >
340 static kmp_uint32 __kmp_eq( UT value, UT checker) {
341  return value == checker;
342 }
343 
344 template< typename UT >
345 static kmp_uint32 __kmp_neq( UT value, UT checker) {
346  return value != checker;
347 }
348 
349 template< typename UT >
350 static kmp_uint32 __kmp_lt( UT value, UT checker) {
351  return value < checker;
352 }
353 
354 template< typename UT >
355 static kmp_uint32 __kmp_ge( UT value, UT checker) {
356  return value >= checker;
357 }
358 
359 template< typename UT >
360 static kmp_uint32 __kmp_le( UT value, UT checker) {
361  return value <= checker;
362 }
363 
364 
365 /* ------------------------------------------------------------------------ */
366 /* ------------------------------------------------------------------------ */
367 
368 static void
369 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
370 {
371  kmp_info_t *th;
372 
373  KMP_DEBUG_ASSERT( gtid_ref );
374 
375  if ( __kmp_env_consistency_check ) {
376  th = __kmp_threads[*gtid_ref];
377  if ( th -> th.th_root -> r.r_active
378  && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
379  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
380  }
381  }
382 }
383 
384 template< typename UT >
385 static void
386 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
387 {
388  typedef typename traits_t< UT >::signed_t ST;
389  dispatch_private_info_template< UT > * pr;
390 
391  int gtid = *gtid_ref;
392 // int cid = *cid_ref;
393  kmp_info_t *th = __kmp_threads[ gtid ];
394  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
395 
396  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
397  if ( __kmp_env_consistency_check ) {
398  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
399  ( th -> th.th_dispatch -> th_dispatch_pr_current );
400  if ( pr -> pushed_ws != ct_none ) {
401  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
402  }
403  }
404 
405  if ( ! th -> th.th_team -> t.t_serialized ) {
406  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
407  ( th -> th.th_dispatch -> th_dispatch_sh_current );
408  UT lower;
409 
410  if ( ! __kmp_env_consistency_check ) {
411  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
412  ( th -> th.th_dispatch -> th_dispatch_pr_current );
413  }
414  lower = pr->u.p.ordered_lower;
415 
416  #if ! defined( KMP_GOMP_COMPAT )
417  if ( __kmp_env_consistency_check ) {
418  if ( pr->ordered_bumped ) {
419  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
420  __kmp_error_construct2(
421  kmp_i18n_msg_CnsMultipleNesting,
422  ct_ordered_in_pdo, loc_ref,
423  & p->stack_data[ p->w_top ]
424  );
425  }
426  }
427  #endif /* !defined(KMP_GOMP_COMPAT) */
428 
429  KMP_MB();
430  #ifdef KMP_DEBUG
431  {
432  const char * buff;
433  // create format specifiers before the debug output
434  buff = __kmp_str_format(
435  "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
436  traits_t< UT >::spec, traits_t< UT >::spec );
437  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
438  __kmp_str_free( &buff );
439  }
440  #endif
441 
442  __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
443  USE_ITT_BUILD_ARG( NULL )
444  );
445  KMP_MB(); /* is this necessary? */
446  #ifdef KMP_DEBUG
447  {
448  const char * buff;
449  // create format specifiers before the debug output
450  buff = __kmp_str_format(
451  "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
452  traits_t< UT >::spec, traits_t< UT >::spec );
453  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
454  __kmp_str_free( &buff );
455  }
456  #endif
457  }
458  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
459 }
460 
461 static void
462 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
463 {
464  kmp_info_t *th;
465 
466  if ( __kmp_env_consistency_check ) {
467  th = __kmp_threads[*gtid_ref];
468  if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
469  __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
470  }
471  }
472 }
473 
474 template< typename UT >
475 static void
476 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
477 {
478  typedef typename traits_t< UT >::signed_t ST;
479  dispatch_private_info_template< UT > * pr;
480 
481  int gtid = *gtid_ref;
482 // int cid = *cid_ref;
483  kmp_info_t *th = __kmp_threads[ gtid ];
484  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
485 
486  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
487  if ( __kmp_env_consistency_check ) {
488  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
489  ( th -> th.th_dispatch -> th_dispatch_pr_current );
490  if ( pr -> pushed_ws != ct_none ) {
491  __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
492  }
493  }
494 
495  if ( ! th -> th.th_team -> t.t_serialized ) {
496  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
497  ( th -> th.th_dispatch -> th_dispatch_sh_current );
498 
499  if ( ! __kmp_env_consistency_check ) {
500  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
501  ( th -> th.th_dispatch -> th_dispatch_pr_current );
502  }
503 
504  KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
505  #if ! defined( KMP_GOMP_COMPAT )
506  if ( __kmp_env_consistency_check ) {
507  if ( pr->ordered_bumped != 0 ) {
508  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
509  /* How to test it? - OM */
510  __kmp_error_construct2(
511  kmp_i18n_msg_CnsMultipleNesting,
512  ct_ordered_in_pdo, loc_ref,
513  & p->stack_data[ p->w_top ]
514  );
515  }
516  }
517  #endif /* !defined(KMP_GOMP_COMPAT) */
518 
519  KMP_MB(); /* Flush all pending memory write invalidates. */
520 
521  pr->ordered_bumped += 1;
522 
523  KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
524  gtid, pr->ordered_bumped ) );
525 
526  KMP_MB(); /* Flush all pending memory write invalidates. */
527 
528  /* TODO use general release procedure? */
529  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
530 
531  KMP_MB(); /* Flush all pending memory write invalidates. */
532  }
533  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
534 }
535 
536 /* Computes and returns x to the power of y, where y must be a non-negative integer */
537 template< typename UT >
538 static __forceinline long double
539 __kmp_pow(long double x, UT y) {
540  long double s=1.0L;
541 
542  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
543  //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
544  while(y) {
545  if ( y & 1 )
546  s *= x;
547  x *= x;
548  y >>= 1;
549  }
550  return s;
551 }
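/*
   Worked example (assumed values): __kmp_pow uses binary exponentiation, so for
   x = 0.875 and y = 5 (binary 101) the loop runs three times:
       y=5 (odd):  s = 0.875,             x = 0.875^2 = 0.765625
       y=2 (even): s unchanged,           x = 0.875^4 ~ 0.586182
       y=1 (odd):  s = 0.875 * 0.586182 ~ 0.512909, x squares, y becomes 0
   giving 0.875^5 ~ 0.5129 in O(log y) multiplications. The assertion requires
   0 < x < 1, which matches how the guided schedules call it (x = 1 - 0.5/nproc).
*/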
552 
553 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
554  (the total number of unassigned iterations in chunks with index greater than or equal to idx).
555  __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
556  (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
557 */
558 template< typename T >
559 static __inline typename traits_t< T >::unsigned_t
560 __kmp_dispatch_guided_remaining(
561  T tc,
562  typename traits_t< T >::floating_t base,
563  typename traits_t< T >::unsigned_t idx
564 ) {
565  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
566  least for ICL 8.1, long double arithmetic may not really have
567  long double precision, even with /Qlong_double. Currently, we
568  workaround that in the caller code, by manipulating the FPCW for
569  Windows* OS on IA-32 architecture. The lack of precision is not
570  expected to be a correctness issue, though.
571  */
572  typedef typename traits_t< T >::unsigned_t UT;
573 
574  long double x = tc * __kmp_pow< UT >(base, idx);
575  UT r = (UT) x;
576  if ( x == r )
577  return r;
578  return r + 1;
579 }
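/*
   In effect this returns ceil( tc * base^idx ). Worked example (assumed values):
   with tc = 1000, base = 0.875 (i.e. 1 - 0.5/nproc for nproc = 4) and idx = 5,
   x = 1000 * 0.875^5 ~ 512.91, so 513 iterations are still unassigned after the
   first 5 chunks of the guided-analytical schedule.
*/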
580 
581 // Parameters of the guided-iterative algorithm:
582 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
583 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
584 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
585 // With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
586 static int guided_int_param = 2;
587 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
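// Worked example (assumed values): with the default n = 2, nproc = 4 and chunk = 7,
// kmp_sch_guided_iterative_chunked stores parm2 = 2 * 4 * (7 + 1) = 64 and
// parm3 = 0.5 / 4 = 0.125 (see that case below); each subsequent chunk is roughly the
// remaining iteration count scaled by parm3, and once fewer than 64 iterations remain
// the schedule degenerates to dynamic with chunk size 7.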
588 
589 // UT - unsigned flavor of T, ST - signed flavor of T,
590 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
591 template< typename T >
592 static void
593 __kmp_dispatch_init(
594  ident_t * loc,
595  int gtid,
596  enum sched_type schedule,
597  T lb,
598  T ub,
599  typename traits_t< T >::signed_t st,
600  typename traits_t< T >::signed_t chunk,
601  int push_ws
602 ) {
603  typedef typename traits_t< T >::unsigned_t UT;
604  typedef typename traits_t< T >::signed_t ST;
605  typedef typename traits_t< T >::floating_t DBL;
606  static const int ___kmp_size_type = sizeof( UT );
607 
608  int active;
609  T tc;
610  kmp_info_t * th;
611  kmp_team_t * team;
612  kmp_uint32 my_buffer_index;
613  dispatch_private_info_template< T > * pr;
614  dispatch_shared_info_template< UT > volatile * sh;
615 
616  KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
617  KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
618 
619  if ( ! TCR_4( __kmp_init_parallel ) )
620  __kmp_parallel_initialize();
621 
622 #if INCLUDE_SSC_MARKS
623  SSC_MARK_DISPATCH_INIT();
624 #endif
625  #ifdef KMP_DEBUG
626  {
627  const char * buff;
628  // create format specifiers before the debug output
629  buff = __kmp_str_format(
630  "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
631  traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
632  KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
633  __kmp_str_free( &buff );
634  }
635  #endif
636  /* setup data */
637  th = __kmp_threads[ gtid ];
638  team = th -> th.th_team;
639  active = ! team -> t.t_serialized;
640  th->th.th_ident = loc;
641 
642 #if USE_ITT_BUILD
643  kmp_uint64 cur_chunk = chunk;
644 #endif
645  if ( ! active ) {
646  pr = reinterpret_cast< dispatch_private_info_template< T >* >
647  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
648  } else {
649  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
650  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
651 
652  my_buffer_index = th->th.th_dispatch->th_disp_index ++;
653 
654  /* What happens when number of threads changes, need to resize buffer? */
655  pr = reinterpret_cast< dispatch_private_info_template< T > * >
656  ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657  sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
658  ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
659  }
660 
661  /* Pick up the nomerge/ordered bits from the scheduling type */
662  if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
663  pr->nomerge = TRUE;
664  schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
665  } else {
666  pr->nomerge = FALSE;
667  }
668  pr->type_size = ___kmp_size_type; // remember the size of variables
669  if ( kmp_ord_lower & schedule ) {
670  pr->ordered = TRUE;
671  schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
672  } else {
673  pr->ordered = FALSE;
674  }
675  if ( schedule == kmp_sch_static ) {
676  schedule = __kmp_static;
677  } else {
678  if ( schedule == kmp_sch_runtime ) {
679  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
680  schedule = team -> t.t_sched.r_sched_type;
681  // Detail the schedule if needed (global controls are differentiated appropriately)
682  if ( schedule == kmp_sch_guided_chunked ) {
683  schedule = __kmp_guided;
684  } else if ( schedule == kmp_sch_static ) {
685  schedule = __kmp_static;
686  }
687  // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
688  chunk = team -> t.t_sched.chunk;
689 
690  #ifdef KMP_DEBUG
691  {
692  const char * buff;
693  // create format specifiers before the debug output
694  buff = __kmp_str_format(
695  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
696  traits_t< ST >::spec );
697  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
698  __kmp_str_free( &buff );
699  }
700  #endif
701  } else {
702  if ( schedule == kmp_sch_guided_chunked ) {
703  schedule = __kmp_guided;
704  }
705  if ( chunk <= 0 ) {
706  chunk = KMP_DEFAULT_CHUNK;
707  }
708  }
709 
710  if ( schedule == kmp_sch_auto ) {
711  // mapping and differentiation: in the __kmp_do_serial_initialize()
712  schedule = __kmp_auto;
713  #ifdef KMP_DEBUG
714  {
715  const char * buff;
716  // create format specifiers before the debug output
717  buff = __kmp_str_format(
718  "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
719  traits_t< ST >::spec );
720  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
721  __kmp_str_free( &buff );
722  }
723  #endif
724  }
725 
726  /* guided analytical not safe for too many threads */
727  if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
728  schedule = kmp_sch_guided_iterative_chunked;
729  KMP_WARNING( DispatchManyThreads );
730  }
731  pr->u.p.parm1 = chunk;
732  }
733  KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
734  "unknown scheduling type" );
735 
736  pr->u.p.count = 0;
737 
738  if ( __kmp_env_consistency_check ) {
739  if ( st == 0 ) {
740  __kmp_error_construct(
741  kmp_i18n_msg_CnsLoopIncrZeroProhibited,
742  ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
743  );
744  }
745  }
746 
747  tc = ( ub - lb + st );
748  if ( st != 1 ) {
749  if ( st < 0 ) {
750  if ( lb < ub ) {
751  tc = 0; // zero-trip
752  } else { // lb >= ub
753  tc = (ST)tc / st; // convert to signed division
754  }
755  } else { // st > 0
756  if ( ub < lb ) {
757  tc = 0; // zero-trip
758  } else { // ub >= lb
759  tc /= st;
760  }
761  }
762  } else if ( ub < lb ) { // st == 1
763  tc = 0; // zero-trip
764  }
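 /*
    Worked examples (assumed values) for the trip count computation above:
      lb = 0,  ub = 9, st = 2:  tc = (9 - 0 + 2) = 11, then tc /= 2 -> 5 iterations (0,2,4,6,8)
      lb = 10, ub = 1, st = -3: tc = (1 - 10 - 3) = -12, signed division by -3 -> 4 iterations (10,7,4,1)
      lb = 5,  ub = 4, st = 1:  ub < lb -> tc = 0 (zero-trip loop)
 */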
765 
766  pr->u.p.lb = lb;
767  pr->u.p.ub = ub;
768  pr->u.p.st = st;
769  pr->u.p.tc = tc;
770 
771  #if KMP_OS_WINDOWS
772  pr->u.p.last_upper = ub + st;
773  #endif /* KMP_OS_WINDOWS */
774 
775  /* NOTE: only the active parallel region(s) have active ordered sections */
776 
777  if ( active ) {
778  if ( pr->ordered == 0 ) {
779  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
780  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
781  } else {
782  pr->ordered_bumped = 0;
783 
784  pr->u.p.ordered_lower = 1;
785  pr->u.p.ordered_upper = 0;
786 
787  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
788  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
789  }
790  }
791 
792  if ( __kmp_env_consistency_check ) {
793  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
794  if ( push_ws ) {
795  __kmp_push_workshare( gtid, ws, loc );
796  pr->pushed_ws = ws;
797  } else {
798  __kmp_check_workshare( gtid, ws, loc );
799  pr->pushed_ws = ct_none;
800  }
801  }
802 
803  switch ( schedule ) {
804  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
805  case kmp_sch_static_steal:
806  {
807  T nproc = team->t.t_nproc;
808  T ntc, init;
809 
810  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
811 
812  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
813  if ( nproc > 1 && ntc >= nproc ) {
814  T id = __kmp_tid_from_gtid(gtid);
815  T small_chunk, extras;
816 
817  small_chunk = ntc / nproc;
818  extras = ntc % nproc;
819 
820  init = id * small_chunk + ( id < extras ? id : extras );
821  pr->u.p.count = init;
822  pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
823 
824  pr->u.p.parm2 = lb;
825  //pr->pfields.parm3 = 0; // it's not used in static_steal
826  pr->u.p.parm4 = id;
827  pr->u.p.st = st;
828  break;
829  } else {
830  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
831  gtid ) );
832  schedule = kmp_sch_static_balanced;
833  /* too few iterations: fall-through to kmp_sch_static_balanced */
834  } // if
835  /* FALL-THROUGH to static balanced */
836  } // case
837  #endif
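 // Worked example (assumed values) for the kmp_sch_static_steal setup above: with
 // tc = 103, chunk = 10 and nproc = 4 there are ntc = 11 chunks, so small_chunk = 2
 // and extras = 3. Thread 0 starts with chunks [0,3), thread 1 with [3,6),
 // thread 2 with [6,9) and thread 3 with [9,11); count/ub delimit each thread's
 // private chunk range, and idle threads later steal from these ranges in
 // __kmp_dispatch_next.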
838  case kmp_sch_static_balanced:
839  {
840  T nproc = team->t.t_nproc;
841  T init, limit;
842 
843  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
844  gtid ) );
845 
846  if ( nproc > 1 ) {
847  T id = __kmp_tid_from_gtid(gtid);
848 
849  if ( tc < nproc ) {
850  if ( id < tc ) {
851  init = id;
852  limit = id;
853  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
854  } else {
855  pr->u.p.count = 1; /* means no more chunks to execute */
856  pr->u.p.parm1 = FALSE;
857  break;
858  }
859  } else {
860  T small_chunk = tc / nproc;
861  T extras = tc % nproc;
862  init = id * small_chunk + (id < extras ? id : extras);
863  limit = init + small_chunk - (id < extras ? 0 : 1);
864  pr->u.p.parm1 = (id == nproc - 1);
865  }
866  } else {
867  if ( tc > 0 ) {
868  init = 0;
869  limit = tc - 1;
870  pr->u.p.parm1 = TRUE;
871  } else {
872  // zero trip count
873  pr->u.p.count = 1; /* means no more chunks to execute */
874  pr->u.p.parm1 = FALSE;
875  break;
876  }
877  }
878 #if USE_ITT_BUILD
879  // Calculate chunk for metadata report
880  if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
881  cur_chunk = limit - init + 1;
882  }
883 #endif
884  if ( st == 1 ) {
885  pr->u.p.lb = lb + init;
886  pr->u.p.ub = lb + limit;
887  } else {
888  T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
889  pr->u.p.lb = lb + init * st;
890  // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
891  if ( st > 0 ) {
892  pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
893  } else {
894  pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
895  }
896  }
897  if ( pr->ordered ) {
898  pr->u.p.ordered_lower = init;
899  pr->u.p.ordered_upper = limit;
900  }
901  break;
902  } // case
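 // Worked example (assumed values) for kmp_sch_static_balanced: with tc = 10,
 // lb = 0, st = 1 and nproc = 4, small_chunk = 2 and extras = 2, so thread 0 gets
 // iterations 0-2, thread 1 gets 3-5, thread 2 gets 6-7 and thread 3 gets 8-9;
 // parm1 (the lastprivate flag) is set only for thread 3, which owns the final
 // iteration. Each thread receives its whole range in a single __kmp_dispatch_next call.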
903  case kmp_sch_guided_iterative_chunked :
904  {
905  T nproc = team->t.t_nproc;
906  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
907 
908  if ( nproc > 1 ) {
909  if ( (2L * chunk + 1 ) * nproc >= tc ) {
910  /* chunk size too large, switch to dynamic */
911  schedule = kmp_sch_dynamic_chunked;
912  } else {
913  // when remaining iters become less than parm2 - switch to dynamic
914  pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
915  *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
916  }
917  } else {
918  KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
919  schedule = kmp_sch_static_greedy;
920  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
921  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
922  pr->u.p.parm1 = tc;
923  } // if
924  } // case
925  break;
926  case kmp_sch_guided_analytical_chunked:
927  {
928  T nproc = team->t.t_nproc;
929  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
930 
931  if ( nproc > 1 ) {
932  if ( (2L * chunk + 1 ) * nproc >= tc ) {
933  /* chunk size too large, switch to dynamic */
934  schedule = kmp_sch_dynamic_chunked;
935  } else {
936  /* commonly used term: (2 nproc - 1)/(2 nproc) */
937  DBL x;
938 
939  #if KMP_OS_WINDOWS && KMP_ARCH_X86
940  /* Linux* OS already has 64-bit computation by default for
941  long double, and on Windows* OS on Intel(R) 64,
942  /Qlong_double doesn't work. On Windows* OS
943  on IA-32 architecture, we need to set precision to
944  64-bit instead of the default 53-bit. Even though long
945  double doesn't work on Windows* OS on Intel(R) 64, the
946  resulting lack of precision is not expected to impact
947  the correctness of the algorithm, but this has not been
948  mathematically proven.
949  */
950  // save original FPCW and set precision to 64-bit, as
951  // Windows* OS on IA-32 architecture defaults to 53-bit
952  unsigned int oldFpcw = _control87(0,0);
953  _control87(_PC_64,_MCW_PC); // 0,0x30000
954  #endif
955  /* value used for comparison in solver for cross-over point */
956  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
957 
958  /* crossover point--chunk indexes equal to or greater than
959  this point switch to dynamic-style scheduling */
960  UT cross;
961 
962  /* commonly used term: (2 nproc - 1)/(2 nproc) */
963  x = (long double)1.0 - (long double)0.5 / nproc;
964 
965  #ifdef KMP_DEBUG
966  { // test natural alignment
967  struct _test_a {
968  char a;
969  union {
970  char b;
971  DBL d;
972  };
973  } t;
974  ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
975  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
976  KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
977  }
978  #endif // KMP_DEBUG
979 
980  /* save the term in thread private dispatch structure */
981  *(DBL*)&pr->u.p.parm3 = x;
982 
983  /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
984  {
985  UT left, right, mid;
986  long double p;
987 
988  /* estimate initial upper and lower bound */
989 
990  /* doesn't matter what value right is as long as it is positive, but
991  it affects performance of the solver
992  */
993  right = 229;
994  p = __kmp_pow< UT >(x,right);
995  if ( p > target ) {
996  do{
997  p *= p;
998  right <<= 1;
999  } while(p>target && right < (1<<27));
1000  left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1001  } else {
1002  left = 0;
1003  }
1004 
1005  /* bisection root-finding method */
1006  while ( left + 1 < right ) {
1007  mid = (left + right) / 2;
1008  if ( __kmp_pow< UT >(x,mid) > target ) {
1009  left = mid;
1010  } else {
1011  right = mid;
1012  }
1013  } // while
1014  cross = right;
1015  }
1016  /* assert sanity of computed crossover point */
1017  KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1018 
1019  /* save the crossover point in thread private dispatch structure */
1020  pr->u.p.parm2 = cross;
1021 
1022  // C75803
1023  #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1024  #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1025  #else
1026  #define GUIDED_ANALYTICAL_WORKAROUND (x)
1027  #endif
1028  /* dynamic-style scheduling offset */
1029  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1030  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1031  // restore FPCW
1032  _control87(oldFpcw,_MCW_PC);
1033  #endif
1034  } // if
1035  } else {
1036  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1037  gtid ) );
1038  schedule = kmp_sch_static_greedy;
1039  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1040  pr->u.p.parm1 = tc;
1041  } // if
1042  } // case
1043  break;
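 // Worked example (assumed values) for the crossover computation above: with
 // nproc = 4, chunk = 7 and tc = 1000, x = 1 - 0.5/4 = 0.875 and
 // target = (2*7 + 1) * 4 / 1000 = 0.06. The bisection solver finds the smallest
 // cross with 0.875^cross <= 0.06: 0.875^21 ~ 0.0605 > 0.06 while 0.875^22 ~ 0.0529,
 // so cross = 22 and chunk indexes >= 22 switch to dynamic-style scheduling with
 // chunk size 7.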
1044  case kmp_sch_static_greedy:
1045  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1046  pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1047  ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1048  tc;
1049  break;
1050  case kmp_sch_static_chunked :
1051  case kmp_sch_dynamic_chunked :
1052  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1053  break;
1054  case kmp_sch_trapezoidal :
1055  {
1056  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1057 
1058  T parm1, parm2, parm3, parm4;
1059  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1060 
1061  parm1 = chunk;
1062 
1063  /* F : size of the first cycle */
1064  parm2 = ( tc / (2 * team->t.t_nproc) );
1065 
1066  if ( parm2 < 1 ) {
1067  parm2 = 1;
1068  }
1069 
1070  /* L : size of the last cycle. Make sure the last cycle
1071  * is not larger than the first cycle.
1072  */
1073  if ( parm1 < 1 ) {
1074  parm1 = 1;
1075  } else if ( parm1 > parm2 ) {
1076  parm1 = parm2;
1077  }
1078 
1079  /* N : number of cycles */
1080  parm3 = ( parm2 + parm1 );
1081  parm3 = ( 2 * tc + parm3 - 1) / parm3;
1082 
1083  if ( parm3 < 2 ) {
1084  parm3 = 2;
1085  }
1086 
1087  /* sigma : decreasing incr of the trapezoid */
1088  parm4 = ( parm3 - 1 );
1089  parm4 = ( parm2 - parm1 ) / parm4;
1090 
1091  // pointless check, because parm4 >= 0 always
1092  //if ( parm4 < 0 ) {
1093  // parm4 = 0;
1094  //}
1095 
1096  pr->u.p.parm1 = parm1;
1097  pr->u.p.parm2 = parm2;
1098  pr->u.p.parm3 = parm3;
1099  pr->u.p.parm4 = parm4;
1100  } // case
1101  break;
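 // Worked example (assumed values) for the trapezoid self-scheduling setup above:
 // with tc = 1000, nproc = 4 and chunk = 1, the first cycle is parm2 = 1000/8 = 125,
 // the last cycle is parm1 = 1, the number of cycles is
 // parm3 = (2*1000 + 125) / 126 = 16, and the decrement is parm4 = (125 - 1)/15 = 8,
 // so successive chunks are 125, 117, 109, ... shrinking by 8 each time.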
1102 
1103  default:
1104  {
1105  __kmp_msg(
1106  kmp_ms_fatal, // Severity
1107  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1108  KMP_HNT( GetNewerLibrary ), // Hint
1109  __kmp_msg_null // Variadic argument list terminator
1110  );
1111  }
1112  break;
1113  } // switch
1114  pr->schedule = schedule;
1115  if ( active ) {
1116  /* This buffer is free to use once sh->buffer_index reaches my_buffer_index */
1117 
1118  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1119  gtid, my_buffer_index, sh->buffer_index) );
1120  __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1121  USE_ITT_BUILD_ARG( NULL )
1122  );
1123  // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and my_buffer_index are
1124  // *always* 32-bit integers.
1125  KMP_MB(); /* is this necessary? */
1126  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1127  gtid, my_buffer_index, sh->buffer_index) );
1128 
1129  th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1130  th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1131 #if USE_ITT_BUILD
1132  if ( pr->ordered ) {
1133  __kmp_itt_ordered_init( gtid );
1134  }; // if
1135 #endif /* USE_ITT_BUILD */
1136  }; // if
1137 
1138 #if USE_ITT_BUILD
1139  // Report loop metadata
1140  if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
1141  kmp_uint32 tid = __kmp_tid_from_gtid( gtid );
1142  if (KMP_MASTER_TID(tid)) {
1143  kmp_uint64 schedtype = 0;
1144 
1145  switch ( schedule ) {
1146  case kmp_sch_static_chunked:
1147  case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1148  break;
1149  case kmp_sch_static_greedy:
1150  cur_chunk = pr->u.p.parm1;
1151  break;
1152  case kmp_sch_dynamic_chunked:
1153  schedtype = 1;
1154  break;
1155  case kmp_sch_guided_iterative_chunked:
1156  case kmp_sch_guided_analytical_chunked:
1157  schedtype = 2;
1158  break;
1159  default:
1160 // Should we put this case under "static"?
1161 // case kmp_sch_static_steal:
1162  schedtype = 3;
1163  break;
1164  }
1165  __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1166  }
1167  }
1168 #endif /* USE_ITT_BUILD */
1169 
1170  #ifdef KMP_DEBUG
1171  {
1172  const char * buff;
1173  // create format specifiers before the debug output
1174  buff = __kmp_str_format(
1175  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1176  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1177  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1178  traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1179  traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1180  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1181  traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1182  KD_TRACE(10, ( buff,
1183  gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1184  pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1185  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1186  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1187  __kmp_str_free( &buff );
1188  }
1189  #endif
1190  #if ( KMP_STATIC_STEAL_ENABLED )
1191  if ( ___kmp_size_type < 8 ) {
1192  // It cannot be guaranteed that after execution of a loop with some other schedule kind
1193  // all the parm3 variables will contain the same value.
1194  // Even if all parm3 values were the same, a bad case could still occur, e.g. alternating 0 and 1
1195  // rather than a program-lifetime increment.
1196  // So a dedicated variable is required; 'static_steal_counter' is used.
1197  if( schedule == kmp_sch_static_steal ) {
1198  // Other threads will inspect this variable when searching for a victim.
1199  // This is a flag showing that other threads may steal from this thread since then.
1200  volatile T * p = &pr->u.p.static_steal_counter;
1201  *p = *p + 1;
1202  }
1203  }
1204  #endif // ( KMP_STATIC_STEAL_ENABLED )
1205 }
1206 
1207 /*
1208  * For ordered loops, either __kmp_dispatch_finish() should be called after
1209  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1210  * every chunk of iterations. If the ordered section(s) were not executed
1211  * for this iteration (or every iteration in this chunk), we need to set the
1212  * ordered iteration counters so that the next thread can proceed.
1213  */
1214 template< typename UT >
1215 static void
1216 __kmp_dispatch_finish( int gtid, ident_t *loc )
1217 {
1218  typedef typename traits_t< UT >::signed_t ST;
1219  kmp_info_t *th = __kmp_threads[ gtid ];
1220 
1221  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1222  if ( ! th -> th.th_team -> t.t_serialized ) {
1223 
1224  dispatch_private_info_template< UT > * pr =
1225  reinterpret_cast< dispatch_private_info_template< UT >* >
1226  ( th->th.th_dispatch->th_dispatch_pr_current );
1227  dispatch_shared_info_template< UT > volatile * sh =
1228  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1229  ( th->th.th_dispatch->th_dispatch_sh_current );
1230  KMP_DEBUG_ASSERT( pr );
1231  KMP_DEBUG_ASSERT( sh );
1232  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1233  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1234 
1235  if ( pr->ordered_bumped ) {
1236  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1237  gtid ) );
1238  pr->ordered_bumped = 0;
1239  } else {
1240  UT lower = pr->u.p.ordered_lower;
1241 
1242  #ifdef KMP_DEBUG
1243  {
1244  const char * buff;
1245  // create format specifiers before the debug output
1246  buff = __kmp_str_format(
1247  "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1248  traits_t< UT >::spec, traits_t< UT >::spec );
1249  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1250  __kmp_str_free( &buff );
1251  }
1252  #endif
1253 
1254  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1255  USE_ITT_BUILD_ARG(NULL)
1256  );
1257  KMP_MB(); /* is this necessary? */
1258  #ifdef KMP_DEBUG
1259  {
1260  const char * buff;
1261  // create format specifiers before the debug output
1262  buff = __kmp_str_format(
1263  "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1264  traits_t< UT >::spec, traits_t< UT >::spec );
1265  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1266  __kmp_str_free( &buff );
1267  }
1268  #endif
1269 
1270  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1271  } // if
1272  } // if
1273  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1274 }
1275 
1276 #ifdef KMP_GOMP_COMPAT
1277 
1278 template< typename UT >
1279 static void
1280 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1281 {
1282  typedef typename traits_t< UT >::signed_t ST;
1283  kmp_info_t *th = __kmp_threads[ gtid ];
1284 
1285  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1286  if ( ! th -> th.th_team -> t.t_serialized ) {
1287 // int cid;
1288  dispatch_private_info_template< UT > * pr =
1289  reinterpret_cast< dispatch_private_info_template< UT >* >
1290  ( th->th.th_dispatch->th_dispatch_pr_current );
1291  dispatch_shared_info_template< UT > volatile * sh =
1292  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1293  ( th->th.th_dispatch->th_dispatch_sh_current );
1294  KMP_DEBUG_ASSERT( pr );
1295  KMP_DEBUG_ASSERT( sh );
1296  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1297  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1298 
1299 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1300  UT lower = pr->u.p.ordered_lower;
1301  UT upper = pr->u.p.ordered_upper;
1302  UT inc = upper - lower + 1;
1303 
1304  if ( pr->ordered_bumped == inc ) {
1305  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1306  gtid ) );
1307  pr->ordered_bumped = 0;
1308  } else {
1309  inc -= pr->ordered_bumped;
1310 
1311  #ifdef KMP_DEBUG
1312  {
1313  const char * buff;
1314  // create format specifiers before the debug output
1315  buff = __kmp_str_format(
1316  "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1317  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1318  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1319  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1320  __kmp_str_free( &buff );
1321  }
1322  #endif
1323 
1324  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1325  USE_ITT_BUILD_ARG(NULL)
1326  );
1327 
1328  KMP_MB(); /* is this necessary? */
1329  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1330  gtid ) );
1331  pr->ordered_bumped = 0;
1333  #ifdef KMP_DEBUG
1334  {
1335  const char * buff;
1336  // create format specifiers before the debug output
1337  buff = __kmp_str_format(
1338  "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1339  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1340  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1341  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1342  __kmp_str_free( &buff );
1343  }
1344  #endif
1345 
1346  test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1347  }
1348 // }
1349  }
1350  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1351 }
1352 
1353 #endif /* KMP_GOMP_COMPAT */
1354 
1355 template< typename T >
1356 static int
1357 __kmp_dispatch_next(
1358  ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1359 ) {
1360 
1361  typedef typename traits_t< T >::unsigned_t UT;
1362  typedef typename traits_t< T >::signed_t ST;
1363  typedef typename traits_t< T >::floating_t DBL;
1364  static const int ___kmp_size_type = sizeof( UT );
1365 
1366  int status;
1367  dispatch_private_info_template< T > * pr;
1368  kmp_info_t * th = __kmp_threads[ gtid ];
1369  kmp_team_t * team = th -> th.th_team;
1370 
1371  KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1372  #ifdef KMP_DEBUG
1373  {
1374  const char * buff;
1375  // create format specifiers before the debug output
1376  buff = __kmp_str_format(
1377  "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1378  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1379  KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1380  __kmp_str_free( &buff );
1381  }
1382  #endif
1383 
1384  if ( team -> t.t_serialized ) {
1385  /* NOTE: serialize this dispatch because we are not at the active level */
1386  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1387  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1388  KMP_DEBUG_ASSERT( pr );
1389 
1390  if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1391  *p_lb = 0;
1392  *p_ub = 0;
1393 // if ( p_last != NULL )
1394 // *p_last = 0;
1395  if ( p_st != NULL )
1396  *p_st = 0;
1397  if ( __kmp_env_consistency_check ) {
1398  if ( pr->pushed_ws != ct_none ) {
1399  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1400  }
1401  }
1402  } else if ( pr->nomerge ) {
1403  kmp_int32 last;
1404  T start;
1405  UT limit, trip, init;
1406  ST incr;
1407  T chunk = pr->u.p.parm1;
1408 
1409  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1410 
1411  init = chunk * pr->u.p.count++;
1412  trip = pr->u.p.tc - 1;
1413 
1414  if ( (status = (init <= trip)) == 0 ) {
1415  *p_lb = 0;
1416  *p_ub = 0;
1417 // if ( p_last != NULL )
1418 // *p_last = 0;
1419  if ( p_st != NULL )
1420  *p_st = 0;
1421  if ( __kmp_env_consistency_check ) {
1422  if ( pr->pushed_ws != ct_none ) {
1423  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1424  }
1425  }
1426  } else {
1427  start = pr->u.p.lb;
1428  limit = chunk + init - 1;
1429  incr = pr->u.p.st;
1430 
1431  if ( (last = (limit >= trip)) != 0 ) {
1432  limit = trip;
1433  #if KMP_OS_WINDOWS
1434  pr->u.p.last_upper = pr->u.p.ub;
1435  #endif /* KMP_OS_WINDOWS */
1436  }
1437  if ( p_last != NULL )
1438  *p_last = last;
1439  if ( p_st != NULL )
1440  *p_st = incr;
1441  if ( incr == 1 ) {
1442  *p_lb = start + init;
1443  *p_ub = start + limit;
1444  } else {
1445  *p_lb = start + init * incr;
1446  *p_ub = start + limit * incr;
1447  }
1448 
1449  if ( pr->ordered ) {
1450  pr->u.p.ordered_lower = init;
1451  pr->u.p.ordered_upper = limit;
1452  #ifdef KMP_DEBUG
1453  {
1454  const char * buff;
1455  // create format specifiers before the debug output
1456  buff = __kmp_str_format(
1457  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1458  traits_t< UT >::spec, traits_t< UT >::spec );
1459  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1460  __kmp_str_free( &buff );
1461  }
1462  #endif
1463  } // if
1464  } // if
1465  } else {
1466  pr->u.p.tc = 0;
1467  *p_lb = pr->u.p.lb;
1468  *p_ub = pr->u.p.ub;
1469  #if KMP_OS_WINDOWS
1470  pr->u.p.last_upper = *p_ub;
1471  #endif /* KMP_OS_WINDOWS */
1472  if ( p_last != NULL )
1473  *p_last = TRUE;
1474  if ( p_st != NULL )
1475  *p_st = pr->u.p.st;
1476  } // if
1477  #ifdef KMP_DEBUG
1478  {
1479  const char * buff;
1480  // create format specifiers before the debug output
1481  buff = __kmp_str_format(
1482  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1483  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1484  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1485  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1486  __kmp_str_free( &buff );
1487  }
1488  #endif
1489 #if INCLUDE_SSC_MARKS
1490  SSC_MARK_DISPATCH_NEXT();
1491 #endif
1492  return status;
1493  } else {
1494  kmp_int32 last = 0;
1495  dispatch_shared_info_template< UT > *sh;
1496  T start;
1497  ST incr;
1498  UT limit, trip, init;
1499 
1500  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1501  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1502 
1503  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1504  ( th->th.th_dispatch->th_dispatch_pr_current );
1505  KMP_DEBUG_ASSERT( pr );
1506  sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1507  ( th->th.th_dispatch->th_dispatch_sh_current );
1508  KMP_DEBUG_ASSERT( sh );
1509 
1510  if ( pr->u.p.tc == 0 ) {
1511  // zero trip count
1512  status = 0;
1513  } else {
1514  switch (pr->schedule) {
1515  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1516  case kmp_sch_static_steal:
1517  {
1518  T chunk = pr->u.p.parm1;
1519 
1520  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1521 
1522  trip = pr->u.p.tc - 1;
1523 
1524  if ( ___kmp_size_type > 4 ) {
1525  // Other threads do not look into this thread's data,
1526  // so a volatile cast is not necessary here.
1527  init = ( pr->u.p.count )++;
1528  status = ( init < (UT)pr->u.p.ub );
1529  } else {
1530  typedef union {
1531  struct {
1532  UT count;
1533  T ub;
1534  } p;
1535  kmp_int64 b;
1536  } union_i4;
1537  // All operations on 'count' or 'ub' must be combined atomically together.
1538  // stealing implemented only for 4-byte indexes
1539  {
1540  union_i4 vold, vnew;
1541  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1542  vnew = vold;
1543  vnew.p.count++;
1544  while( ! KMP_COMPARE_AND_STORE_ACQ64(
1545  ( volatile kmp_int64* )&pr->u.p.count,
1546  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1547  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1548  KMP_CPU_PAUSE();
1549  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1550  vnew = vold;
1551  vnew.p.count++;
1552  }
1553  vnew = vold;
1554  init = vnew.p.count;
1555  status = ( init < (UT)vnew.p.ub ) ;
1556  }
1557 
1558  if( !status ) {
1559  kmp_info_t **other_threads = team->t.t_threads;
1560  int while_limit = 10;
1561  int while_index = 0;
1562 
1563  // TODO: algorithm of searching for a victim
1564  // should be cleaned up and measured
1565  while ( ( !status ) && ( while_limit != ++while_index ) ) {
1566  union_i4 vold, vnew;
1567  kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1568  T victimIdx = pr->u.p.parm4;
1569  T oldVictimIdx = victimIdx;
1570  dispatch_private_info_template< T > * victim;
1571 
1572  do {
1573  if( !victimIdx ) {
1574  victimIdx = team->t.t_nproc - 1;
1575  } else {
1576  --victimIdx;
1577  }
1578  victim = reinterpret_cast< dispatch_private_info_template< T >* >
1579  ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1580  } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1581  // TODO: think about a proper place of this test
1582  if ( ( !victim ) ||
1583  ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1584  (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1585  // TODO: delay would be nice
1586  continue;
1587  // the victim is not ready yet to participate in stealing
1588  // because the victim is still in kmp_init_dispatch
1589  }
1590  if ( oldVictimIdx == victimIdx ) {
1591  break;
1592  }
1593  pr->u.p.parm4 = victimIdx;
1594 
1595  while( 1 ) {
1596  vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1597  vnew = vold;
1598 
1599  KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1600  if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1601  break;
1602  }
1603  vnew.p.ub -= (remaining >> 2);
1604  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1605  #pragma warning( push )
1606  // disable warning on pointless comparison of unsigned with 0
1607  #pragma warning( disable: 186 )
1608  KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1609  #pragma warning( pop )
1610  // TODO: Should this be acquire or release?
1611  if ( KMP_COMPARE_AND_STORE_ACQ64(
1612  ( volatile kmp_int64 * )&victim->u.p.count,
1613  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1614  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1615  status = 1;
1616  while_index = 0;
1617  // now update own count and ub
1618  #if KMP_ARCH_X86
1619  // stealing executed on non-KMP_ARCH_X86 only
1620  // Atomic 64-bit write on ia32 is
1621  // unavailable, so we do this in steps.
1622  // This code is not tested.
1623  init = vold.p.count;
1624  pr->u.p.ub = 0;
1625  pr->u.p.count = init + 1;
1626  pr->u.p.ub = vnew.p.count;
1627  #else
1628  init = vnew.p.ub;
1629  vold.p.count = init + 1;
1630  // TODO: is it safe and enough?
1631  *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1632  #endif // KMP_ARCH_X86
1633  break;
1634  } // if
1635  KMP_CPU_PAUSE();
1636  } // while (1)
1637  } // while
1638  } // if
1639  } // if
1640  if ( !status ) {
1641  *p_lb = 0;
1642  *p_ub = 0;
1643  if ( p_st != NULL ) *p_st = 0;
1644  } else {
1645  start = pr->u.p.parm2;
1646  init *= chunk;
1647  limit = chunk + init - 1;
1648  incr = pr->u.p.st;
1649 
1650  KMP_DEBUG_ASSERT(init <= trip);
1651  if ( (last = (limit >= trip)) != 0 )
1652  limit = trip;
1653  if ( p_st != NULL ) *p_st = incr;
1654 
1655  if ( incr == 1 ) {
1656  *p_lb = start + init;
1657  *p_ub = start + limit;
1658  } else {
1659  *p_lb = start + init * incr;
1660  *p_ub = start + limit * incr;
1661  }
1662 
1663  if ( pr->ordered ) {
1664  pr->u.p.ordered_lower = init;
1665  pr->u.p.ordered_upper = limit;
1666  #ifdef KMP_DEBUG
1667  {
1668  const char * buff;
1669  // create format specifiers before the debug output
1670  buff = __kmp_str_format(
1671  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1672  traits_t< UT >::spec, traits_t< UT >::spec );
1673  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1674  __kmp_str_free( &buff );
1675  }
1676  #endif
1677  } // if
1678  } // if
1679  break;
1680  } // case
1681  #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
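 // ------------------------------------------------------------------------
 // Illustrative sketch (not part of the runtime): the steal step above in
 // portable C++. The victim's (count, ub) pair is read as one 64-bit word,
 // a quarter of the remaining chunks is carved off the top, and the result
 // is published with a single compare-and-swap so two thieves cannot claim
 // the same chunks. The packing helpers and the field layout below are
 // assumptions made purely for illustration.
 #if 0
 #include <atomic>
 #include <cstdint>

 static inline uint32_t pk_count( uint64_t w ) { return (uint32_t)w; }           // low  32 bits: next chunk index
 static inline uint32_t pk_ub( uint64_t w )    { return (uint32_t)( w >> 32 ); } // high 32 bits: chunk upper bound
 static inline uint64_t pk_make( uint32_t count, uint32_t ub )
 { return ( (uint64_t)ub << 32 ) | count; }

 // Try to steal about 1/4 of the victim's remaining chunks; returns how many
 // chunks were stolen (0 if the victim has too little work left).
 static uint32_t try_steal( std::atomic< uint64_t > & victim )
 {
     uint64_t vold = victim.load( std::memory_order_acquire );
     for ( ;; ) {
         uint32_t count = pk_count( vold ), ub = pk_ub( vold );
         if ( count >= ub || ub - count < 4 )
             return 0;                        // not worth stealing
         uint32_t take = ( ub - count ) >> 2; // a quarter of the remaining chunks
         uint64_t vnew = pk_make( count, ub - take );
         if ( victim.compare_exchange_weak( vold, vnew, std::memory_order_acq_rel ) )
             return take;                     // stolen chunk indices: [ub-take, ub)
         // CAS failed: vold now holds the current value, retry
     }
 }
 #endif
 // ------------------------------------------------------------------------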
1682  case kmp_sch_static_balanced:
1683  {
1684  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
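 // Each thread was handed exactly one contiguous block at init time;
 // u.p.count acts as a "done" flag so the block is returned only once.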
1685  if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1686  pr->u.p.count = 1;
1687  *p_lb = pr->u.p.lb;
1688  *p_ub = pr->u.p.ub;
1689  last = pr->u.p.parm1;
1690  if ( p_st != NULL )
1691  *p_st = pr->u.p.st;
1692  } else { /* no iterations to do */
1693  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1694  }
1695  if ( pr->ordered ) {
1696  #ifdef KMP_DEBUG
1697  {
1698  const char * buff;
1699  // create format specifiers before the debug output
1700  buff = __kmp_str_format(
1701  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1702  traits_t< UT >::spec, traits_t< UT >::spec );
1703  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1704  __kmp_str_free( &buff );
1705  }
1706  #endif
1707  } // if
1708  } // case
1709  break;
1710  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1711  case kmp_sch_static_chunked:
1712  {
1713  T parm1;
1714 
1715  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1716  gtid ) );
1717  parm1 = pr->u.p.parm1;
1718 
1719  trip = pr->u.p.tc - 1;
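 // Chunks of parm1 iterations are dealt out round-robin: thread tid takes
 // chunks tid, tid+nproc, tid+2*nproc, ... (u.p.count advances by nproc
 // below), so init is the first iteration of this thread's next chunk.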
1720  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1721 
1722  if ( (status = (init <= trip)) != 0 ) {
1723  start = pr->u.p.lb;
1724  incr = pr->u.p.st;
1725  limit = parm1 + init - 1;
1726 
1727  if ( (last = (limit >= trip)) != 0 )
1728  limit = trip;
1729 
1730  if ( p_st != NULL ) *p_st = incr;
1731 
1732  pr->u.p.count += team->t.t_nproc;
1733 
1734  if ( incr == 1 ) {
1735  *p_lb = start + init;
1736  *p_ub = start + limit;
1737  }
1738  else {
1739  *p_lb = start + init * incr;
1740  *p_ub = start + limit * incr;
1741  }
1742 
1743  if ( pr->ordered ) {
1744  pr->u.p.ordered_lower = init;
1745  pr->u.p.ordered_upper = limit;
1746  #ifdef KMP_DEBUG
1747  {
1748  const char * buff;
1749  // create format specifiers before the debug output
1750  buff = __kmp_str_format(
1751  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1752  traits_t< UT >::spec, traits_t< UT >::spec );
1753  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1754  __kmp_str_free( &buff );
1755  }
1756  #endif
1757  } // if
1758  } // if
1759  } // case
1760  break;
1761 
1762  case kmp_sch_dynamic_chunked:
1763  {
1764  T chunk = pr->u.p.parm1;
1765 
1766  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1767  gtid ) );
1768 
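 // Atomically claim the next chunk index from the shared iteration counter;
 // chunk k covers iterations [k*chunk, k*chunk + chunk - 1] of the
 // canonicalized 0..tc-1 iteration space.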
1769  init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1770  trip = pr->u.p.tc - 1;
1771 
1772  if ( (status = (init <= trip)) == 0 ) {
1773  *p_lb = 0;
1774  *p_ub = 0;
1775  if ( p_st != NULL ) *p_st = 0;
1776  } else {
1777  start = pr->u.p.lb;
1778  limit = chunk + init - 1;
1779  incr = pr->u.p.st;
1780 
1781  if ( (last = (limit >= trip)) != 0 )
1782  limit = trip;
1783 
1784  if ( p_st != NULL ) *p_st = incr;
1785 
1786  if ( incr == 1 ) {
1787  *p_lb = start + init;
1788  *p_ub = start + limit;
1789  } else {
1790  *p_lb = start + init * incr;
1791  *p_ub = start + limit * incr;
1792  }
1793 
1794  if ( pr->ordered ) {
1795  pr->u.p.ordered_lower = init;
1796  pr->u.p.ordered_upper = limit;
1797  #ifdef KMP_DEBUG
1798  {
1799  const char * buff;
1800  // create format specifiers before the debug output
1801  buff = __kmp_str_format(
1802  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1803  traits_t< UT >::spec, traits_t< UT >::spec );
1804  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1805  __kmp_str_free( &buff );
1806  }
1807  #endif
1808  } // if
1809  } // if
1810  } // case
1811  break;
1812 
1813  case kmp_sch_guided_iterative_chunked:
1814  {
1815  T chunkspec = pr->u.p.parm1;
1816  KD_TRACE(100,
1817  ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1818  trip = pr->u.p.tc;
1819  // Start atomic part of calculations
1820  while(1) {
1821  ST remaining; // signed, because can be < 0
1822  init = sh->u.s.iteration; // shared value
1823  remaining = trip - init;
1824  if ( remaining <= 0 ) { // AC: need to compare with 0 first
1825  // nothing to do, don't try atomic op
1826  status = 0;
1827  break;
1828  }
1829  if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1830  // use dynamic-style schedule
1831  // atomically increment iterations, get old value
1832  init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1833  remaining = trip - init;
1834  if (remaining <= 0) {
1835  status = 0; // all iterations got by other threads
1836  } else {
1837  // got some iterations to work on
1838  status = 1;
1839  if ( (T)remaining > chunkspec ) {
1840  limit = init + chunkspec - 1;
1841  } else {
1842  last = 1; // the last chunk
1843  limit = init + remaining - 1;
1844  } // if
1845  } // if
1846  break;
1847  } // if
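 // parm3 stores a double (reinterpreted from the integer field) of roughly
 // 1/(K*nproc), so the CAS below tries to claim about remaining/(K*nproc)
 // iterations in one shot.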
1848  limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1849  if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1850  // CAS was successful, chunk obtained
1851  status = 1;
1852  --limit;
1853  break;
1854  } // if
1855  } // while
1856  if ( status != 0 ) {
1857  start = pr->u.p.lb;
1858  incr = pr->u.p.st;
1859  if ( p_st != NULL )
1860  *p_st = incr;
1861  *p_lb = start + init * incr;
1862  *p_ub = start + limit * incr;
1863  if ( pr->ordered ) {
1864  pr->u.p.ordered_lower = init;
1865  pr->u.p.ordered_upper = limit;
1866  #ifdef KMP_DEBUG
1867  {
1868  const char * buff;
1869  // create format specifiers before the debug output
1870  buff = __kmp_str_format(
1871  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1872  traits_t< UT >::spec, traits_t< UT >::spec );
1873  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1874  __kmp_str_free( &buff );
1875  }
1876  #endif
1877  } // if
1878  } else {
1879  *p_lb = 0;
1880  *p_ub = 0;
1881  if ( p_st != NULL )
1882  *p_st = 0;
1883  } // if
1884  } // case
1885  break;
1886 
1887  case kmp_sch_guided_analytical_chunked:
1888  {
1889  T chunkspec = pr->u.p.parm1;
1890  UT chunkIdx;
1891  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1892  /* for storing the original FPCW value for Windows* OS on
1893  IA-32 architecture, 8-byte version */
1894  unsigned int oldFpcw;
1895  unsigned int fpcwSet = 0;
1896  #endif
1897  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1898  gtid ) );
1899 
1900  trip = pr->u.p.tc;
1901 
1902  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1903  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1904 
1905  while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1906  chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1907  if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1908  --trip;
1909  /* use dynamic-style scheduling */
1910  init = chunkIdx * chunkspec + pr->u.p.count;
1911  /* need to verify init > 0 in case of overflow in the above calculation */
1912  if ( (status = (init > 0 && init <= trip)) != 0 ) {
1913  limit = init + chunkspec -1;
1914 
1915  if ( (last = (limit >= trip)) != 0 )
1916  limit = trip;
1917  }
1918  break;
1919  } else {
1920  /* use exponential-style scheduling */
1921  /* The following check is to workaround the lack of long double precision on Windows* OS.
1922  This check works around the possible effect that init != 0 for chunkIdx == 0.
1923  */
1924  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1925  /* If we haven't already done so, save original
1926  FPCW and set precision to 64-bit, as Windows* OS
1927  on IA-32 architecture defaults to 53-bit */
1928  if ( !fpcwSet ) {
1929  oldFpcw = _control87(0,0);
1930  _control87(_PC_64,_MCW_PC);
1931  fpcwSet = 0x30000;
1932  }
1933  #endif
1934  if ( chunkIdx ) {
1935  init = __kmp_dispatch_guided_remaining< T >(
1936  trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1937  KMP_DEBUG_ASSERT(init);
1938  init = trip - init;
1939  } else
1940  init = 0;
1941  limit = trip - __kmp_dispatch_guided_remaining< T >(
1942  trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1943  KMP_ASSERT(init <= limit);
1944  if ( init < limit ) {
1945  KMP_DEBUG_ASSERT(limit <= trip);
1946  --limit;
1947  status = 1;
1948  break;
1949  } // if
1950  } // if
1951  } // while (1)
1952  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1953  /* restore FPCW if necessary
1954  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1955  */
1956  if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1957  _control87(oldFpcw,_MCW_PC);
1958  #endif
1959  if ( status != 0 ) {
1960  start = pr->u.p.lb;
1961  incr = pr->u.p.st;
1962  if ( p_st != NULL )
1963  *p_st = incr;
1964  *p_lb = start + init * incr;
1965  *p_ub = start + limit * incr;
1966  if ( pr->ordered ) {
1967  pr->u.p.ordered_lower = init;
1968  pr->u.p.ordered_upper = limit;
1969  #ifdef KMP_DEBUG
1970  {
1971  const char * buff;
1972  // create format specifiers before the debug output
1973  buff = __kmp_str_format(
1974  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1975  traits_t< UT >::spec, traits_t< UT >::spec );
1976  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1977  __kmp_str_free( &buff );
1978  }
1979  #endif
1980  }
1981  } else {
1982  *p_lb = 0;
1983  *p_ub = 0;
1984  if ( p_st != NULL )
1985  *p_st = 0;
1986  }
1987  } // case
1988  break;
1989 
1990  case kmp_sch_trapezoidal:
1991  {
1992  UT index;
1993  T parm2 = pr->u.p.parm2;
1994  T parm3 = pr->u.p.parm3;
1995  T parm4 = pr->u.p.parm4;
1996  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1997  gtid ) );
1998 
1999  index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2000 
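 // Chunk sizes form a decreasing arithmetic sequence (first chunk parm2,
 // common difference -parm4, parm3 chunks in total), so the partial sum
 // index*(2*parm2 - (index-1)*parm4)/2 is the first iteration of chunk 'index'.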
2001  init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2002  trip = pr->u.p.tc - 1;
2003 
2004  if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2005  *p_lb = 0;
2006  *p_ub = 0;
2007  if ( p_st != NULL ) *p_st = 0;
2008  } else {
2009  start = pr->u.p.lb;
2010  limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2011  incr = pr->u.p.st;
2012 
2013  if ( (last = (limit >= trip)) != 0 )
2014  limit = trip;
2015 
2016  if ( p_st != NULL ) *p_st = incr;
2017 
2018  if ( incr == 1 ) {
2019  *p_lb = start + init;
2020  *p_ub = start + limit;
2021  } else {
2022  *p_lb = start + init * incr;
2023  *p_ub = start + limit * incr;
2024  }
2025 
2026  if ( pr->ordered ) {
2027  pr->u.p.ordered_lower = init;
2028  pr->u.p.ordered_upper = limit;
2029  #ifdef KMP_DEBUG
2030  {
2031  const char * buff;
2032  // create format specifiers before the debug output
2033  buff = __kmp_str_format(
2034  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2035  traits_t< UT >::spec, traits_t< UT >::spec );
2036  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2037  __kmp_str_free( &buff );
2038  }
2039  #endif
2040  } // if
2041  } // if
2042  } // case
2043  break;
2044  default:
2045  {
2046  status = 0; // to avoid complaints on uninitialized variable use
2047  __kmp_msg(
2048  kmp_ms_fatal, // Severity
2049  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2050  KMP_HNT( GetNewerLibrary ), // Hint
2051  __kmp_msg_null // Variadic argument list terminator
2052  );
2053  }
2054  break;
2055  } // switch
2056  } // if tc == 0;
2057 
2058  if ( status == 0 ) {
2059  UT num_done;
2060 
2061  num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2062  #ifdef KMP_DEBUG
2063  {
2064  const char * buff;
2065  // create format specifiers before the debug output
2066  buff = __kmp_str_format(
2067  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2068  traits_t< UT >::spec );
2069  KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2070  __kmp_str_free( &buff );
2071  }
2072  #endif
2073 
2074  if ( (ST)num_done == team->t.t_nproc-1 ) {
2075  /* NOTE: release this buffer to be reused */
2076 
2077  KMP_MB(); /* Flush all pending memory write invalidates. */
2078 
2079  sh->u.s.num_done = 0;
2080  sh->u.s.iteration = 0;
2081 
2082  /* TODO replace with general release procedure? */
2083  if ( pr->ordered ) {
2084  sh->u.s.ordered_iteration = 0;
2085  }
2086 
2087  KMP_MB(); /* Flush all pending memory write invalidates. */
2088 
2089  sh -> buffer_index += KMP_MAX_DISP_BUF;
2090  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2091  gtid, sh->buffer_index) );
2092 
2093  KMP_MB(); /* Flush all pending memory write invalidates. */
2094 
2095  } // if
2096  if ( __kmp_env_consistency_check ) {
2097  if ( pr->pushed_ws != ct_none ) {
2098  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2099  }
2100  }
2101 
2102  th -> th.th_dispatch -> th_deo_fcn = NULL;
2103  th -> th.th_dispatch -> th_dxo_fcn = NULL;
2104  th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2105  th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2106  } // if (status == 0)
2107 #if KMP_OS_WINDOWS
2108  else if ( last ) {
2109  pr->u.p.last_upper = pr->u.p.ub;
2110  }
2111 #endif /* KMP_OS_WINDOWS */
2112  if ( p_last != NULL && status != 0 )
2113  *p_last = last;
2114  } // if
2115 
2116  #ifdef KMP_DEBUG
2117  {
2118  const char * buff;
2119  // create format specifiers before the debug output
2120  buff = __kmp_str_format(
2121  "__kmp_dispatch_next: T#%%d normal case: " \
2122  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2123  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2124  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2125  __kmp_str_free( &buff );
2126  }
2127  #endif
2128 #if INCLUDE_SSC_MARKS
2129  SSC_MARK_DISPATCH_NEXT();
2130 #endif
2131  return status;
2132 }
2133 
2134 template< typename T >
2135 static void
2136 __kmp_dist_get_bounds(
2137  ident_t *loc,
2138  kmp_int32 gtid,
2139  kmp_int32 *plastiter,
2140  T *plower,
2141  T *pupper,
2142  typename traits_t< T >::signed_t incr
2143 ) {
2144  KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2145  typedef typename traits_t< T >::unsigned_t UT;
2146  typedef typename traits_t< T >::signed_t ST;
2147  register kmp_uint32 team_id;
2148  register kmp_uint32 nteams;
2149  register UT trip_count;
2150  register kmp_team_t *team;
2151  kmp_info_t * th;
2152 
2153  KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2154  KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2155  #ifdef KMP_DEBUG
2156  {
2157  const char * buff;
2158  // create format specifiers before the debug output
2159  buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2160  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2161  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2162  traits_t< T >::spec );
2163  KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2164  __kmp_str_free( &buff );
2165  }
2166  #endif
2167 
2168  if( __kmp_env_consistency_check ) {
2169  if( incr == 0 ) {
2170  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2171  }
2172  if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2173  // The loop is illegal.
2174  // Some zero-trip loops are maintained by the compiler, e.g.:
2175  // for(i=10;i<0;++i) // lower >= upper - run-time check
2176  // for(i=0;i>10;--i) // lower <= upper - run-time check
2177  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2178  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2179  // Compiler does not check the following illegal loops:
2180  // for(i=0;i<10;i+=incr) // where incr<0
2181  // for(i=10;i>0;i-=incr) // where incr<0
2182  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2183  }
2184  }
2185  th = __kmp_threads[gtid];
2186  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2187  team = th->th.th_team;
2188  #if OMP_40_ENABLED
2189  nteams = th->th.th_teams_size.nteams;
2190  #endif
2191  team_id = team->t.t_master_tid;
2192  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2193 
2194  // compute global trip count
2195  if( incr == 1 ) {
2196  trip_count = *pupper - *plower + 1;
2197  } else if(incr == -1) {
2198  trip_count = *plower - *pupper + 1;
2199  } else {
2200  trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2201  }
2202  if( trip_count <= nteams ) {
2203  KMP_DEBUG_ASSERT(
2204  __kmp_static == kmp_sch_static_greedy || \
2205  __kmp_static == kmp_sch_static_balanced
2206  ); // Unknown static scheduling type.
2207  // only some teams get a single iteration; the others get nothing
2208  if( team_id < trip_count ) {
2209  *pupper = *plower = *plower + team_id * incr;
2210  } else {
2211  *plower = *pupper + incr; // zero-trip loop
2212  }
2213  if( plastiter != NULL )
2214  *plastiter = ( team_id == trip_count - 1 );
2215  } else {
2216  if( __kmp_static == kmp_sch_static_balanced ) {
2217  register UT chunk = trip_count / nteams;
2218  register UT extras = trip_count % nteams;
2219  *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2220  *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2221  if( plastiter != NULL )
2222  *plastiter = ( team_id == nteams - 1 );
2223  } else {
2224  register T chunk_inc_count =
2225  ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2226  register T upper = *pupper;
2227  KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2228  // Unknown static scheduling type.
2229  *plower += team_id * chunk_inc_count;
2230  *pupper = *plower + chunk_inc_count - incr;
2231  // Check/correct bounds if needed
2232  if( incr > 0 ) {
2233  if( *pupper < *plower )
2234  *pupper = i_maxmin< T >::mx;
2235  if( plastiter != NULL )
2236  *plastiter = *plower <= upper && *pupper > upper - incr;
2237  if( *pupper > upper )
2238  *pupper = upper; // tracker C73258
2239  } else {
2240  if( *pupper > *plower )
2241  *pupper = i_maxmin< T >::mn;
2242  if( plastiter != NULL )
2243  *plastiter = *plower >= upper && *pupper < upper - incr;
2244  if( *pupper < upper )
2245  *pupper = upper; // tracker C73258
2246  }
2247  }
2248  }
2249 }
2250 
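 // ------------------------------------------------------------------------
 // Illustrative sketch (not part of the runtime): the kmp_sch_static_balanced
 // split computed above, in isolation. The first trip_count % nteams teams
 // receive one extra iteration; e.g. 10 iterations over 4 teams yields blocks
 // of 3, 3, 2, 2. The function and parameter names are illustrative only.
 #if 0
 static void example_balanced_bounds( kmp_uint32 team_id, kmp_uint32 nteams,
                                      kmp_int64 lower, kmp_int64 incr,
                                      kmp_uint64 trip_count,
                                      kmp_int64 *lb, kmp_int64 *ub )
 {
     kmp_uint64 chunk  = trip_count / nteams;  // base iterations per team
     kmp_uint64 extras = trip_count % nteams;  // leftover iterations
     *lb = lower + incr * (kmp_int64)( team_id * chunk
                                       + ( team_id < extras ? team_id : extras ) );
     *ub = *lb + (kmp_int64)chunk * incr - ( team_id < extras ? 0 : incr );
 }
 #endif
 // ------------------------------------------------------------------------
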
2251 //-----------------------------------------------------------------------------------------
2252 // Dispatch routines
2253 // Transfer call to template< type T >
2254 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2255 // T lb, T ub, ST st, ST chunk )
2256 extern "C" {
2257 
2273 void
2274 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2275  kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2276 {
2277  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2278  KMP_DEBUG_ASSERT( __kmp_init_serial );
2279  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2280 }
2284 void
2285 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2286  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2287 {
2288  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2289  KMP_DEBUG_ASSERT( __kmp_init_serial );
2290  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2291 }
2292 
2296 void
2297 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2298  kmp_int64 lb, kmp_int64 ub,
2299  kmp_int64 st, kmp_int64 chunk )
2300 {
2301  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2302  KMP_DEBUG_ASSERT( __kmp_init_serial );
2303  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2304 }
2305 
2309 void
2310 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2311  kmp_uint64 lb, kmp_uint64 ub,
2312  kmp_int64 st, kmp_int64 chunk )
2313 {
2314  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2315  KMP_DEBUG_ASSERT( __kmp_init_serial );
2316  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2317 }
2318 
2328 void
2329 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2330  kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2331 {
2332  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2333  KMP_DEBUG_ASSERT( __kmp_init_serial );
2334  __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2335  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2336 }
2337 
2338 void
2339 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2340  kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2341 {
2342  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2343  KMP_DEBUG_ASSERT( __kmp_init_serial );
2344  __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2345  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2346 }
2347 
2348 void
2349 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2350  kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2351 {
2352  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2353  KMP_DEBUG_ASSERT( __kmp_init_serial );
2354  __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2355  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2356 }
2357 
2358 void
2359 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2360  kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2361 {
2362  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2363  KMP_DEBUG_ASSERT( __kmp_init_serial );
2364  __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2365  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2366 }
2367 
2380 int
2381 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2382  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2383 {
2384  return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2385 }
2386 
2390 int
2391 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2392  kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2393 {
2394  return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2395 }
2396 
2400 int
2401 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2402  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2403 {
2404  return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2405 }
2406 
2410 int
2411 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2412  kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2413 {
2414  return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2415 }
2416 
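 // ------------------------------------------------------------------------
 // Illustrative sketch (not part of the runtime): how compiler-generated code
 // typically drives the dispatch entry points above for a loop such as
 //   #pragma omp for schedule(dynamic, 4)
 // The exact lowering is compiler-specific; loc and gtid are assumed to be
 // supplied by the caller, and kmp.h is assumed to be visible.
 #if 0
 static void example_dynamic_loop( ident_t *loc, kmp_int32 gtid,
                                   kmp_int32 n, int *data )
 {
     kmp_int32 lb, ub, st, last;
     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
                             0, n - 1, 1 /* st */, 4 /* chunk */ );
     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
         // last is nonzero for the chunk containing the final iteration
         for ( kmp_int32 i = lb; i <= ub; i += st )
             data[ i ] *= 2;   // loop body; the returned ub is inclusive
     }
 }
 #endif
 // ------------------------------------------------------------------------
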
2423 void
2424 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2425 {
2426  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2427 }
2428 
2432 void
2433 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2434 {
2435  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2436 }
2437 
2441 void
2442 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2443 {
2444  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2445 }
2446 
2450 void
2451 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2452 {
2453  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2454 }
2457 //-----------------------------------------------------------------------------------------
2458 // Non-template routines from kmp_dispatch.cpp used in other sources
2459 
2460 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2461  return value == checker;
2462 }
2463 
2464 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2465  return value != checker;
2466 }
2467 
2468 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2469  return value < checker;
2470 }
2471 
2472 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2473  return value >= checker;
2474 }
2475 
2476 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2477  return value <= checker;
2478 }
2479 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2480  return value == checker;
2481 }
2482 
2483 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2484  return value != checker;
2485 }
2486 
2487 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2488  return value < checker;
2489 }
2490 
2491 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2492  return value >= checker;
2493 }
2494 
2495 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2496  return value <= checker;
2497 }
2498 
2499 kmp_uint32
2500 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2501  kmp_uint32 checker,
2502  kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2503  , void * obj // Higher-level synchronization object, or NULL.
2504  )
2505 {
2506  // note: we may not belong to a team at this point
2507  register volatile kmp_uint32 * spin = spinner;
2508  register kmp_uint32 check = checker;
2509  register kmp_uint32 spins;
2510  register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2511  register kmp_uint32 r;
2512 
2513  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2514  KMP_INIT_YIELD( spins );
2515  // main wait spin loop
2516  while(!f(r = TCR_4(*spin), check)) {
2517  KMP_FSYNC_SPIN_PREPARE( obj );
2518  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2519  It causes problems with infinite recursion because of exit lock */
2520  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2521  __kmp_abort_thread(); */
2522 
2523  /* if we have waited a bit, or are oversubscribed, yield */
2524  /* pause is in the following code */
2525  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2526  KMP_YIELD_SPIN( spins );
2527  }
2528  KMP_FSYNC_SPIN_ACQUIRED( obj );
2529  return r;
2530 }
2531 
2532 kmp_uint64
2533 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2534  kmp_uint64 checker,
2535  kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2536  , void * obj // Higher-level synchronization object, or NULL.
2537  )
2538 {
2539  // note: we may not belong to a team at this point
2540  register volatile kmp_uint64 * spin = spinner;
2541  register kmp_uint64 check = checker;
2542  register kmp_uint32 spins;
2543  register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2544  register kmp_uint64 r;
2545 
2546  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2547  KMP_INIT_YIELD( spins );
2548  // main wait spin loop
2549  while(!f(r = *spin, check))
2550  {
2551  KMP_FSYNC_SPIN_PREPARE( obj );
2552  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2553  It causes problems with infinite recursion because of exit lock */
2554  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2555  __kmp_abort_thread(); */
2556 
2557  // if we are oversubscribed,
2558  // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2559  // pause is in the following code
2560  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2561  KMP_YIELD_SPIN( spins );
2562  }
2563  KMP_FSYNC_SPIN_ACQUIRED( obj );
2564  return r;
2565 }
2566 
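 // ------------------------------------------------------------------------
 // Illustrative sketch (not part of the runtime): typical use of the spin-wait
 // helpers above: block, yielding when oversubscribed, until a shared 32-bit
 // flag reaches an expected value. The wrapper name is illustrative only.
 #if 0
 static void example_wait_for_flag( volatile kmp_uint32 * flag )
 {
     // spins until *flag == 1, using __kmp_eq_4 as the predicate
     __kmp_wait_yield_4( flag, 1, __kmp_eq_4, NULL );
 }
 #endif
 // ------------------------------------------------------------------------
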
2567 } // extern "C"
2568 
2569 #ifdef KMP_GOMP_COMPAT
2570 
2571 void
2572 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2573  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2574  kmp_int32 chunk, int push_ws )
2575 {
2576  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2577  push_ws );
2578 }
2579 
2580 void
2581 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2582  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2583  kmp_int32 chunk, int push_ws )
2584 {
2585  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2586  push_ws );
2587 }
2588 
2589 void
2590 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2591  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2592  kmp_int64 chunk, int push_ws )
2593 {
2594  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2595  push_ws );
2596 }
2597 
2598 void
2599 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2600  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2601  kmp_int64 chunk, int push_ws )
2602 {
2603  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2604  push_ws );
2605 }
2606 
2607 void
2608 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2609 {
2610  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2611 }
2612 
2613 void
2614 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2615 {
2616  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2617 }
2618 
2619 void
2620 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2621 {
2622  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2623 }
2624 
2625 void
2626 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2627 {
2628  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2629 }
2630 
2631 #endif /* KMP_GOMP_COMPAT */
2632 
2633 /* ------------------------------------------------------------------------ */
2634 /* ------------------------------------------------------------------------ */
2635 