Intel® OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  * $Revision: 42624 $
4  * $Date: 2013-08-27 10:53:11 -0500 (Tue, 27 Aug 2013) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2013 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 /*
38  * Dynamic scheduling initialization and dispatch.
39  *
40  * NOTE: __kmp_nth is a constant inside of any dispatch loop; however,
41  * it may change values between parallel regions. __kmp_max_nth
42  * is the largest value __kmp_nth may take, and 1 is the smallest.
43  *
44  */
45 
46 /* ------------------------------------------------------------------------ */
47 /* ------------------------------------------------------------------------ */
48 
49 #include "kmp.h"
50 #include "kmp_i18n.h"
51 #include "kmp_itt.h"
52 #include "kmp_str.h"
53 #include "kmp_error.h"
54 #if KMP_OS_WINDOWS && KMP_ARCH_X86
55  #include <float.h>
56 #endif
57 
58 /* ------------------------------------------------------------------------ */
59 /* ------------------------------------------------------------------------ */
60 
61 #ifdef KMP_STATIC_STEAL_ENABLED
62 
63  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
64  template< typename T >
65  struct dispatch_private_infoXX_template {
66  typedef typename traits_t< T >::unsigned_t UT;
67  typedef typename traits_t< T >::signed_t ST;
68  UT count; // unsigned
69  T ub;
70  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
71  T lb;
72  ST st; // signed
73  UT tc; // unsigned
74  T static_steal_counter; // for static_steal only; maybe better to put after ub
75 
76  /* parm[1-4] are used in different ways by different scheduling algorithms */
77 
78  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
79  // a) parm3 is properly aligned and
80  // b) all parm1-4 are in the same cache line.
81  // Because parm1-4 are used together, performance seems to be better
82  // if they are in the same cache line (not measured, though).
83 
84  struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
85  T parm1;
86  T parm2;
87  T parm3;
88  T parm4;
89  };
90 
91  UT ordered_lower; // unsigned
92  UT ordered_upper; // unsigned
93  #if KMP_OS_WINDOWS
94  T last_upper;
95  #endif /* KMP_OS_WINDOWS */
96  };
97 
98 #else /* KMP_STATIC_STEAL_ENABLED */
99 
100  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
101  template< typename T >
102  struct dispatch_private_infoXX_template {
103  typedef typename traits_t< T >::unsigned_t UT;
104  typedef typename traits_t< T >::signed_t ST;
105  T lb;
106  T ub;
107  ST st; // signed
108  UT tc; // unsigned
109 
110  T parm1;
111  T parm2;
112  T parm3;
113  T parm4;
114 
115  UT count; // unsigned
116 
117  UT ordered_lower; // unsigned
118  UT ordered_upper; // unsigned
119  #if KMP_OS_WINDOWS
120  T last_upper;
121  #endif /* KMP_OS_WINDOWS */
122  };
123 
124 #endif /* KMP_STATIC_STEAL_ENABLED */
125 
126 // replaces dispatch_private_info structure and dispatch_private_info_t type
127 template< typename T >
128 struct KMP_ALIGN_CACHE dispatch_private_info_template {
129  // duplicate alignment here, otherwise size of structure is not correct in our compiler
130  union KMP_ALIGN_CACHE private_info_tmpl {
131  dispatch_private_infoXX_template< T > p;
132  dispatch_private_info64_t p64;
133  } u;
134  enum sched_type schedule; /* scheduling algorithm */
135  kmp_uint32 ordered; /* ordered clause specified */
136  kmp_uint32 ordered_bumped;
137  kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
138  dispatch_private_info * next; /* stack of buffers for nest of serial regions */
139  kmp_uint32 nomerge; /* don't merge iters if serialized */
140  kmp_uint32 type_size;
141  enum cons_type pushed_ws;
142 };
143 
144 
145 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
146 template< typename UT >
147 struct dispatch_shared_infoXX_template {
148  /* chunk index under dynamic, number of idle threads under static-steal;
149  iteration index otherwise */
150  volatile UT iteration;
151  volatile UT num_done;
152  volatile UT ordered_iteration;
153  UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
154 };
155 
156 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
157 template< typename UT >
158 struct dispatch_shared_info_template {
159  // we need union here to keep the structure size
160  union shared_info_tmpl {
161  dispatch_shared_infoXX_template< UT > s;
162  dispatch_shared_info64_t s64;
163  } u;
164  volatile kmp_uint32 buffer_index;
165 };
166 
167 /* ------------------------------------------------------------------------ */
168 /* ------------------------------------------------------------------------ */
169 
170 static void
171 __kmp_static_delay( int arg )
172 {
173  /* Work around weird code-gen bug that causes assert to trip */
174  #if KMP_ARCH_X86_64 && KMP_OS_LINUX
175  #else
176  KMP_ASSERT( arg >= 0 );
177  #endif
178 }
179 
180 static void
181 __kmp_static_yield( int arg )
182 {
183  __kmp_yield( arg );
184 }
185 
186 #undef USE_TEST_LOCKS
187 
188 // test_then_add template (general template should NOT be used)
189 template< typename T >
190 static __forceinline T
191 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
192 
193 template<>
194 __forceinline kmp_int32
195 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
196 {
197  kmp_int32 r;
198  r = KMP_TEST_THEN_ADD32( p, d );
199  return r;
200 }
201 
202 template<>
203 __forceinline kmp_int64
204 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
205 {
206  kmp_int64 r;
207  r = KMP_TEST_THEN_ADD64( p, d );
208  return r;
209 }
210 
211 // test_then_inc_acq template (general template should NOT be used)
212 template< typename T >
213 static __forceinline T
214 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
215 
216 template<>
217 __forceinline kmp_int32
218 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
219 {
220  kmp_int32 r;
221  r = KMP_TEST_THEN_INC_ACQ32( p );
222  return r;
223 }
224 
225 template<>
226 __forceinline kmp_int64
227 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
228 {
229  kmp_int64 r;
230  r = KMP_TEST_THEN_INC_ACQ64( p );
231  return r;
232 }
233 
234 // test_then_inc template (general template should NOT be used)
235 template< typename T >
236 static __forceinline T
237 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
238 
239 template<>
240 __forceinline kmp_int32
241 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
242 {
243  kmp_int32 r;
244  r = KMP_TEST_THEN_INC32( p );
245  return r;
246 }
247 
248 template<>
249 __forceinline kmp_int64
250 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
251 {
252  kmp_int64 r;
253  r = KMP_TEST_THEN_INC64( p );
254  return r;
255 }
256 
257 // compare_and_swap template (general template should NOT be used)
258 template< typename T >
259 static __forceinline kmp_int32
260 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
261 
262 template<>
263 __forceinline kmp_int32
264 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
265 {
266  return KMP_COMPARE_AND_STORE_REL32( p, c, s );
267 }
268 
269 template<>
270 __forceinline kmp_int32
271 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
272 {
273  return KMP_COMPARE_AND_STORE_REL64( p, c, s );
274 }
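
    // Illustrative use of the wrappers above (a minimal sketch; the real call sites
    // appear later in this file, e.g. in __kmp_dispatch_dxo() and __kmp_dispatch_next()):
    //   volatile kmp_int64 counter = 0;
    //   test_then_inc< kmp_int64 >( &counter );          // returns old value 0, counter becomes 1
    //   compare_and_swap< kmp_int64 >( &counter, 1, 5 ); // stores 5 and returns non-zero only if counter == 1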
275 
276 /*
277  Spin wait loop that first does pause, then yield.
278  Waits until function returns non-zero when called with *spinner and check.
279  Does NOT put threads to sleep.
280 #if USE_ITT_BUILD
281  Arguments:
282  obj -- is the higher-level synchronization object to report to ittnotify. It is used to report
283  locks consistently. For example, if a lock is acquired immediately, its address is
284  reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
285  immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
286  address, not the address of the low-level spinner.
287 #endif // USE_ITT_BUILD
288 */
289 template< typename UT >
290 // ToDo: make inline function (move to header file for icl)
291 static UT // unsigned 4- or 8-byte type
292 __kmp_wait_yield( volatile UT * spinner,
293  UT checker,
294  kmp_uint32 (* pred)( UT, UT )
295  USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
296  )
297 {
298  // note: we may not belong to a team at this point
299  register volatile UT * spin = spinner;
300  register UT check = checker;
301  register kmp_uint32 spins;
302  register kmp_uint32 (*f) ( UT, UT ) = pred;
303  register UT r;
304 
305  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
306  KMP_INIT_YIELD( spins );
307  // main wait spin loop
308  while(!f(r = *spin, check))
309  {
310  KMP_FSYNC_SPIN_PREPARE( obj );
311  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
312  It causes problems with infinite recursion because of exit lock */
313  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
314  __kmp_abort_thread(); */
315 
316  __kmp_static_delay(TRUE);
317 
318  // if we are oversubscribed,
319  // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
320  // the pause is in the following code
321  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
322  KMP_YIELD_SPIN( spins );
323  }
324  KMP_FSYNC_SPIN_ACQUIRED( obj );
325  return r;
326 }
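
    // A typical call, as used below in __kmp_dispatch_deo() and __kmp_dispatch_finish():
    // spin until the shared ordered_iteration counter reaches 'lower', i.e. until
    // __kmp_ge< UT >( *spinner, checker ) returns non-zero:
    //   __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
    //                           USE_ITT_BUILD_ARG( NULL ) );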
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_eq( UT value, UT checker) {
330  return value == checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_neq( UT value, UT checker) {
335  return value != checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_lt( UT value, UT checker) {
340  return value < checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_ge( UT value, UT checker) {
345  return value >= checker;
346 }
347 
348 template< typename UT >
349 static kmp_uint32 __kmp_le( UT value, UT checker) {
350  return value <= checker;
351 }
352 
353 
354 /* ------------------------------------------------------------------------ */
355 /* ------------------------------------------------------------------------ */
356 
357 static void
358 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
359 {
360  kmp_info_t *th;
361 
362  KMP_DEBUG_ASSERT( gtid_ref );
363 
364  if ( __kmp_env_consistency_check ) {
365  th = __kmp_threads[*gtid_ref];
366  if ( th -> th.th_root -> r.r_active
367  && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
368  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
369  }
370  }
371 }
372 
373 template< typename UT >
374 static void
375 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
376 {
377  typedef typename traits_t< UT >::signed_t ST;
378  dispatch_private_info_template< UT > * pr;
379 
380  int gtid = *gtid_ref;
381 // int cid = *cid_ref;
382  kmp_info_t *th = __kmp_threads[ gtid ];
383  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
384 
385  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
386  if ( __kmp_env_consistency_check ) {
387  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
388  ( th -> th.th_dispatch -> th_dispatch_pr_current );
389  if ( pr -> pushed_ws != ct_none ) {
390  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
391  }
392  }
393 
394  if ( ! th -> th.th_team -> t.t_serialized ) {
395  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
396  ( th -> th.th_dispatch -> th_dispatch_sh_current );
397  UT lower;
398 
399  if ( ! __kmp_env_consistency_check ) {
400  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
401  ( th -> th.th_dispatch -> th_dispatch_pr_current );
402  }
403  lower = pr->u.p.ordered_lower;
404 
405  #if ! defined( KMP_GOMP_COMPAT )
406  if ( __kmp_env_consistency_check ) {
407  if ( pr->ordered_bumped ) {
408  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
409  __kmp_error_construct2(
410  kmp_i18n_msg_CnsMultipleNesting,
411  ct_ordered_in_pdo, loc_ref,
412  & p->stack_data[ p->w_top ]
413  );
414  }
415  }
416  #endif /* !defined(KMP_GOMP_COMPAT) */
417 
418  KMP_MB();
419  #ifdef KMP_DEBUG
420  {
421  const char * buff;
422  // create format specifiers before the debug output
423  buff = __kmp_str_format(
424  "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
425  traits_t< UT >::spec, traits_t< UT >::spec );
426  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
427  __kmp_str_free( &buff );
428  }
429  #endif
430 
431  __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
432  USE_ITT_BUILD_ARG( NULL )
433  );
434  KMP_MB(); /* is this necessary? */
435  #ifdef KMP_DEBUG
436  {
437  const char * buff;
438  // create format specifiers before the debug output
439  buff = __kmp_str_format(
440  "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
441  traits_t< UT >::spec, traits_t< UT >::spec );
442  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
443  __kmp_str_free( &buff );
444  }
445  #endif
446  }
447  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
448 }
449 
450 static void
451 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
452 {
453  kmp_info_t *th;
454 
455  if ( __kmp_env_consistency_check ) {
456  th = __kmp_threads[*gtid_ref];
457  if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
458  __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
459  }
460  }
461 }
462 
463 template< typename UT >
464 static void
465 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
466 {
467  typedef typename traits_t< UT >::signed_t ST;
468  dispatch_private_info_template< UT > * pr;
469 
470  int gtid = *gtid_ref;
471 // int cid = *cid_ref;
472  kmp_info_t *th = __kmp_threads[ gtid ];
473  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
474 
475  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
476  if ( __kmp_env_consistency_check ) {
477  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
478  ( th -> th.th_dispatch -> th_dispatch_pr_current );
479  if ( pr -> pushed_ws != ct_none ) {
480  __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
481  }
482  }
483 
484  if ( ! th -> th.th_team -> t.t_serialized ) {
485  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
486  ( th -> th.th_dispatch -> th_dispatch_sh_current );
487 
488  if ( ! __kmp_env_consistency_check ) {
489  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
490  ( th -> th.th_dispatch -> th_dispatch_pr_current );
491  }
492 
493  KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
494  #if ! defined( KMP_GOMP_COMPAT )
495  if ( __kmp_env_consistency_check ) {
496  if ( pr->ordered_bumped != 0 ) {
497  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
498  /* How to test it? - OM */
499  __kmp_error_construct2(
500  kmp_i18n_msg_CnsMultipleNesting,
501  ct_ordered_in_pdo, loc_ref,
502  & p->stack_data[ p->w_top ]
503  );
504  }
505  }
506  #endif /* !defined(KMP_GOMP_COMPAT) */
507 
508  KMP_MB(); /* Flush all pending memory write invalidates. */
509 
510  pr->ordered_bumped += 1;
511 
512  KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
513  gtid, pr->ordered_bumped ) );
514 
515  KMP_MB(); /* Flush all pending memory write invalidates. */
516 
517  /* TODO use general release procedure? */
518  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
519 
520  KMP_MB(); /* Flush all pending memory write invalidates. */
521  }
522  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
523 }
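
// For illustration: in an ordered loop each chunk owner first calls __kmp_dispatch_deo(),
// which spins (via __kmp_wait_yield) until sh->u.s.ordered_iteration reaches its own
// ordered_lower, executes the ordered region, and then calls __kmp_dispatch_dxo(), whose
// test_then_inc bumps ordered_iteration and thereby releases the thread that owns the
// next lower bound.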
524 
525 /* Computes and returns x to the power of y, where y must be a non-negative integer */
526 template< typename UT >
527 static __forceinline long double
528 __kmp_pow(long double x, UT y) {
529  long double s=1.0L;
530 
531  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
532  //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
533  while(y) {
534  if ( y & 1 )
535  s *= x;
536  x *= x;
537  y >>= 1;
538  }
539  return s;
540 }
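
// Worked example (illustrative): x = 0.75, y = 5 (binary 101). The loop multiplies s by x
// whenever the low bit of y is set and squares x each pass:
//   y = 5: s = 0.75,         x = 0.5625
//   y = 2: s unchanged,      x = 0.31640625
//   y = 1: s = 0.2373046875, which is 0.75^5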
541 
542 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
543  (the total number of unassigned iterations in chunks with index greater than or equal to idx).
544  __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
545  (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
546 */
547 template< typename T >
548 static __inline typename traits_t< T >::unsigned_t
549 __kmp_dispatch_guided_remaining(
550  T tc,
551  typename traits_t< T >::floating_t base,
552  typename traits_t< T >::unsigned_t idx
553 ) {
554  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
555  least for ICL 8.1, long double arithmetic may not really have
556  long double precision, even with /Qlong_double. Currently, we
557  workaround that in the caller code, by manipulating the FPCW for
558  Windows* OS on IA-32 architecture. The lack of precision is not
559  expected to be a correctness issue, though.
560  */
561  typedef typename traits_t< T >::unsigned_t UT;
562 
563  long double x = tc * __kmp_pow< UT >(base, idx);
564  UT r = (UT) x;
565  if ( x == r )
566  return r;
567  return r + 1;
568 }
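
// Worked example (illustrative): tc = 1000, base = 0.875 (i.e. 1 - 0.5/nproc for nproc = 4),
// idx = 3:
//   x = 1000 * 0.875^3 = 669.921875, so 670 iterations remain unassigned
//   (x is rounded up whenever it is not already integral).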
569 
570 // Parameters of the guided-iterative algorithm:
571 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
572 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
573 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
574 // With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
575 static int guided_int_param = 2;
576 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
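// Worked example (illustrative): with the default n = 2, nproc = 4 and chunk = 7,
// __kmp_dispatch_init() below sets
//   parm2 = 2 * 4 * (7 + 1) = 64     // switch to dynamic-style when fewer iterations remain
//   parm3 = 0.5 / 4         = 0.125  // each chunk takes about 1/8 of the remaining iterations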
577 
578 // UT - unsigned flavor of T, ST - signed flavor of T,
579 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
580 template< typename T >
581 static void
582 __kmp_dispatch_init(
583  ident_t * loc,
584  int gtid,
585  enum sched_type schedule,
586  T lb,
587  T ub,
588  typename traits_t< T >::signed_t st,
589  typename traits_t< T >::signed_t chunk,
590  int push_ws
591 ) {
592  typedef typename traits_t< T >::unsigned_t UT;
593  typedef typename traits_t< T >::signed_t ST;
594  typedef typename traits_t< T >::floating_t DBL;
595  static const int ___kmp_size_type = sizeof( UT );
596 
597  int active;
598  T tc;
599  kmp_info_t * th;
600  kmp_team_t * team;
601  kmp_uint32 my_buffer_index;
602  dispatch_private_info_template< T > * pr;
603  dispatch_shared_info_template< UT > volatile * sh;
604 
605  KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
606  KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
607 
608  if ( ! TCR_4( __kmp_init_parallel ) )
609  __kmp_parallel_initialize();
610 
611  #ifdef KMP_DEBUG
612  {
613  const char * buff;
614  // create format specifiers before the debug output
615  buff = __kmp_str_format(
616  "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
617  traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
618  KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
619  __kmp_str_free( &buff );
620  }
621  #endif
622  /* setup data */
623  th = __kmp_threads[ gtid ];
624  team = th -> th.th_team;
625  active = ! team -> t.t_serialized;
626  th->th.th_ident = loc;
627 
628  if ( ! active ) {
629  pr = reinterpret_cast< dispatch_private_info_template< T >* >
630  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
631  } else {
632  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
633  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
634 
635  my_buffer_index = th->th.th_dispatch->th_disp_index ++;
636 
637  /* What happens when the number of threads changes? Do we need to resize the buffer? */
638  pr = reinterpret_cast< dispatch_private_info_template< T > * >
639  ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
640  sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
641  ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
642  }
643 
644  /* Pick up the nomerge/ordered bits from the scheduling type */
645  if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
646  pr->nomerge = TRUE;
647  schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
648  } else {
649  pr->nomerge = FALSE;
650  }
651  pr->type_size = ___kmp_size_type; // remember the size of variables
652  if ( kmp_ord_lower & schedule ) {
653  pr->ordered = TRUE;
654  schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
655  } else {
656  pr->ordered = FALSE;
657  }
658  if ( schedule == kmp_sch_static ) {
659  schedule = __kmp_static;
660  } else {
661  if ( schedule == kmp_sch_runtime ) {
662  #if OMP_30_ENABLED
663  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
664  schedule = team -> t.t_sched.r_sched_type;
665  // Detail the schedule if needed (global controls are differentiated appropriately)
666  if ( schedule == kmp_sch_guided_chunked ) {
667  schedule = __kmp_guided;
668  } else if ( schedule == kmp_sch_static ) {
669  schedule = __kmp_static;
670  }
671  // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
672  chunk = team -> t.t_sched.chunk;
673  #else
674  kmp_r_sched_t r_sched = __kmp_get_schedule_global();
675  // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default
676  schedule = r_sched.r_sched_type;
677  chunk = r_sched.chunk;
678  #endif
679 
680  #ifdef KMP_DEBUG
681  {
682  const char * buff;
683  // create format specifiers before the debug output
684  buff = __kmp_str_format(
685  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
686  traits_t< ST >::spec );
687  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
688  __kmp_str_free( &buff );
689  }
690  #endif
691  } else {
692  if ( schedule == kmp_sch_guided_chunked ) {
693  schedule = __kmp_guided;
694  }
695  if ( chunk <= 0 ) {
696  chunk = KMP_DEFAULT_CHUNK;
697  }
698  }
699 
700  #if OMP_30_ENABLED
701  if ( schedule == kmp_sch_auto ) {
702  // mapping and differentiation: done in __kmp_do_serial_initialize()
703  schedule = __kmp_auto;
704  #ifdef KMP_DEBUG
705  {
706  const char * buff;
707  // create format specifiers before the debug output
708  buff = __kmp_str_format(
709  "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
710  traits_t< ST >::spec );
711  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
712  __kmp_str_free( &buff );
713  }
714  #endif
715  }
716  #endif // OMP_30_ENABLED
717 
718  /* guided analytical not safe for too many threads */
719  if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
720  schedule = kmp_sch_guided_iterative_chunked;
721  KMP_WARNING( DispatchManyThreads );
722  }
723  pr->u.p.parm1 = chunk;
724  }
725  KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
726  "unknown scheduling type" );
727 
728  pr->u.p.count = 0;
729 
730  if ( __kmp_env_consistency_check ) {
731  if ( st == 0 ) {
732  __kmp_error_construct(
733  kmp_i18n_msg_CnsLoopIncrZeroProhibited,
734  ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
735  );
736  }
737  }
738 
739  tc = ( ub - lb + st );
740  if ( st != 1 ) {
741  if ( st < 0 ) {
742  if ( lb < ub ) {
743  tc = 0; // zero-trip
744  } else { // lb >= ub
745  tc = (ST)tc / st; // convert to signed division
746  }
747  } else { // st > 0
748  if ( ub < lb ) {
749  tc = 0; // zero-trip
750  } else { // ub >= lb
751  tc /= st;
752  }
753  }
754  } else if ( ub < lb ) { // st == 1
755  tc = 0; // zero-trip
756  }
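
    // Worked examples (illustrative, assuming a signed loop variable):
    //   lb = 0,  ub = 9, st =  3  ->  tc = (9 - 0 + 3) / 3   = 4   (iterations 0, 3, 6, 9)
    //   lb = 10, ub = 1, st = -2  ->  tc = (1 - 10 - 2) / -2 = 5   (iterations 10, 8, 6, 4, 2)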
757 
758  pr->u.p.lb = lb;
759  pr->u.p.ub = ub;
760  pr->u.p.st = st;
761  pr->u.p.tc = tc;
762 
763  #if KMP_OS_WINDOWS
764  pr->u.p.last_upper = ub + st;
765  #endif /* KMP_OS_WINDOWS */
766 
767  /* NOTE: only the active parallel region(s) have active ordered sections */
768 
769  if ( active ) {
770  if ( pr->ordered == 0 ) {
771  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
772  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
773  } else {
774  pr->ordered_bumped = 0;
775 
776  pr->u.p.ordered_lower = 1;
777  pr->u.p.ordered_upper = 0;
778 
779  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
780  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
781  }
782  }
783 
784  if ( __kmp_env_consistency_check ) {
785  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
786  if ( push_ws ) {
787  __kmp_push_workshare( gtid, ws, loc );
788  pr->pushed_ws = ws;
789  } else {
790  __kmp_check_workshare( gtid, ws, loc );
791  pr->pushed_ws = ct_none;
792  }
793  }
794 
795  switch ( schedule ) {
796  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
797  case kmp_sch_static_steal:
798  {
799  T nproc = team->t.t_nproc;
800  T ntc, init;
801 
802  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
803 
804  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
805  if ( nproc > 1 && ntc >= nproc ) {
806  T id = __kmp_tid_from_gtid(gtid);
807  T small_chunk, extras;
808 
809  small_chunk = ntc / nproc;
810  extras = ntc % nproc;
811 
812  init = id * small_chunk + ( id < extras ? id : extras );
813  pr->u.p.count = init;
814  pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
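
    // Worked example (illustrative): tc = 25, chunk = 3, nproc = 4 gives ntc = 9 chunks,
    // small_chunk = 2, extras = 1, so the initial [count, ub) ranges of chunk indexes are
    //   id 0: [0,3)   id 1: [3,5)   id 2: [5,7)   id 3: [7,9)
    // threads that exhaust their own range later try to steal from these ranges.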
815 
816  pr->u.p.parm2 = lb;
817  //pr->pfields.parm3 = 0; // it's not used in static_steal
818  pr->u.p.parm4 = id;
819  pr->u.p.st = st;
820  break;
821  } else {
822  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
823  gtid ) );
824  schedule = kmp_sch_static_balanced;
825  /* too few iterations: fall-through to kmp_sch_static_balanced */
826  } // if
827  /* FALL-THROUGH to static balanced */
828  } // case
829  #endif
830  case kmp_sch_static_balanced:
831  {
832  T nproc = team->t.t_nproc;
833  T init, limit;
834 
835  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
836  gtid ) );
837 
838  if ( nproc > 1 ) {
839  T id = __kmp_tid_from_gtid(gtid);
840 
841  if ( tc < nproc ) {
842  if ( id < tc ) {
843  init = id;
844  limit = id;
845  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
846  } else {
847  pr->u.p.count = 1; /* means no more chunks to execute */
848  pr->u.p.parm1 = FALSE;
849  break;
850  }
851  } else {
852  T small_chunk = tc / nproc;
853  T extras = tc % nproc;
854  init = id * small_chunk + (id < extras ? id : extras);
855  limit = init + small_chunk - (id < extras ? 0 : 1);
856  pr->u.p.parm1 = (id == nproc - 1);
857  }
858  } else {
859  if ( tc > 0 ) {
860  init = 0;
861  limit = tc - 1;
862  pr->u.p.parm1 = TRUE;
863  } else {
864  // zero trip count
865  pr->u.p.count = 1; /* means no more chunks to execute */
866  pr->u.p.parm1 = FALSE;
867  break;
868  }
869  }
870  if ( st == 1 ) {
871  pr->u.p.lb = lb + init;
872  pr->u.p.ub = lb + limit;
873  } else {
874  T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
875  pr->u.p.lb = lb + init * st;
876  // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
877  if ( st > 0 ) {
878  pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
879  } else {
880  pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
881  }
882  }
883  if ( pr->ordered ) {
884  pr->u.p.ordered_lower = init;
885  pr->u.p.ordered_upper = limit;
886  }
887  break;
888  } // case
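
    // Worked example (illustrative): tc = 10, nproc = 4, lb = 0, st = 1 gives
    // small_chunk = 2, extras = 2, so each thread receives one contiguous block:
    //   id 0: [0,2]   id 1: [3,5]   id 2: [6,7]   id 3: [8,9]   (parm1 is TRUE for id 3)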
889  case kmp_sch_guided_iterative_chunked :
890  {
891  T nproc = team->t.t_nproc;
892  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
893 
894  if ( nproc > 1 ) {
895  if ( (2L * chunk + 1 ) * nproc >= tc ) {
896  /* chunk size too large, switch to dynamic */
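    // (for illustration: with chunk = 7 and nproc = 4 the threshold is (2*7 + 1)*4 = 60,
    //  so any trip count tc <= 60 takes this branch and switches to dynamic)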
897  schedule = kmp_sch_dynamic_chunked;
898  } else {
899  // when remaining iters become less than parm2 - switch to dynamic
900  pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
901  *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
902  }
903  } else {
904  KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
905  schedule = kmp_sch_static_greedy;
906  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
907  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
908  pr->u.p.parm1 = tc;
909  } // if
910  } // case
911  break;
912  case kmp_sch_guided_analytical_chunked:
913  {
914  T nproc = team->t.t_nproc;
915  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
916 
917  if ( nproc > 1 ) {
918  if ( (2L * chunk + 1 ) * nproc >= tc ) {
919  /* chunk size too large, switch to dynamic */
920  schedule = kmp_sch_dynamic_chunked;
921  } else {
922  /* commonly used term: (2 nproc - 1)/(2 nproc) */
923  DBL x;
924 
925  #if KMP_OS_WINDOWS && KMP_ARCH_X86
926  /* Linux* OS already has 64-bit computation by default for
927  long double, and on Windows* OS on Intel(R) 64,
928  /Qlong_double doesn't work. On Windows* OS
929  on IA-32 architecture, we need to set precision to
930  64-bit instead of the default 53-bit. Even though long
931  double doesn't work on Windows* OS on Intel(R) 64, the
932  resulting lack of precision is not expected to impact
933  the correctness of the algorithm, but this has not been
934  mathematically proven.
935  */
936  // save original FPCW and set precision to 64-bit, as
937  // Windows* OS on IA-32 architecture defaults to 53-bit
938  unsigned int oldFpcw = _control87(0,0x30000);
939  #endif
940  /* value used for comparison in solver for cross-over point */
941  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
942 
943  /* crossover point--chunk indexes equal to or greater than
944  this point switch to dynamic-style scheduling */
945  UT cross;
946 
947  /* commonly used term: (2 nproc - 1)/(2 nproc) */
948  x = (long double)1.0 - (long double)0.5 / nproc;
949 
950  #ifdef KMP_DEBUG
951  { // test natural alignment
952  struct _test_a {
953  char a;
954  union {
955  char b;
956  DBL d;
957  };
958  } t;
959  ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
960  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
961  KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
962  }
963  #endif // KMP_DEBUG
964 
965  /* save the term in thread private dispatch structure */
966  *(DBL*)&pr->u.p.parm3 = x;
967 
968  /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
969  {
970  UT left, right, mid;
971  long double p;
972 
973  /* estimate initial upper and lower bound */
974 
975  /* doesn't matter what value right is as long as it is positive, but
976  it affects performance of the solver
977  */
978  right = 229;
979  p = __kmp_pow< UT >(x,right);
980  if ( p > target ) {
981  do{
982  p *= p;
983  right <<= 1;
984  } while(p>target && right < (1<<27));
985  left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
986  } else {
987  left = 0;
988  }
989 
990  /* bisection root-finding method */
991  while ( left + 1 < right ) {
992  mid = (left + right) / 2;
993  if ( __kmp_pow< UT >(x,mid) > target ) {
994  left = mid;
995  } else {
996  right = mid;
997  }
998  } // while
999  cross = right;
1000  }
1001  /* assert sanity of computed crossover point */
1002  KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
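
    // Worked example (illustrative): chunk = 7, nproc = 4, tc = 1000 give
    // x = 1 - 0.5/4 = 0.875 and target = (2*7 + 1) * 4 / 1000 = 0.06; the bisection
    // finds cross = 22, since 0.875^21 ~ 0.0606 > 0.06 while 0.875^22 ~ 0.0530 <= 0.06.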
1003 
1004  /* save the crossover point in thread private dispatch structure */
1005  pr->u.p.parm2 = cross;
1006 
1007  // C75803
1008  #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1009  #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1010  #else
1011  #define GUIDED_ANALYTICAL_WORKAROUND (x)
1012  #endif
1013  /* dynamic-style scheduling offset */
1014  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1015  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1016  // restore FPCW
1017  _control87(oldFpcw,0x30000);
1018  #endif
1019  } // if
1020  } else {
1021  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1022  gtid ) );
1023  schedule = kmp_sch_static_greedy;
1024  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1025  pr->u.p.parm1 = tc;
1026  } // if
1027  } // case
1028  break;
1029  case kmp_sch_static_greedy:
1030  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1031  pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1032  ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1033  tc;
1034  break;
1035  case kmp_sch_static_chunked :
1036  case kmp_sch_dynamic_chunked :
1037  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1038  break;
1039  case kmp_sch_trapezoidal :
1040  {
1041  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1042 
1043  T parm1, parm2, parm3, parm4;
1044  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1045 
1046  parm1 = chunk;
1047 
1048  /* F : size of the first cycle */
1049  parm2 = ( tc / (2 * team->t.t_nproc) );
1050 
1051  if ( parm2 < 1 ) {
1052  parm2 = 1;
1053  }
1054 
1055  /* L : size of the last cycle. Make sure the last cycle
1056  * is not larger than the first cycle.
1057  */
1058  if ( parm1 < 1 ) {
1059  parm1 = 1;
1060  } else if ( parm1 > parm2 ) {
1061  parm1 = parm2;
1062  }
1063 
1064  /* N : number of cycles */
1065  parm3 = ( parm2 + parm1 );
1066  parm3 = ( 2 * tc + parm3 - 1) / parm3;
1067 
1068  if ( parm3 < 2 ) {
1069  parm3 = 2;
1070  }
1071 
1072  /* sigma : decreasing incr of the trapezoid */
1073  parm4 = ( parm3 - 1 );
1074  parm4 = ( parm2 - parm1 ) / parm4;
1075 
1076  // pointless check, because parm4 >= 0 always
1077  //if ( parm4 < 0 ) {
1078  // parm4 = 0;
1079  //}
1080 
1081  pr->u.p.parm1 = parm1;
1082  pr->u.p.parm2 = parm2;
1083  pr->u.p.parm3 = parm3;
1084  pr->u.p.parm4 = parm4;
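
    // Worked example (illustrative): tc = 1000, nproc = 4, chunk = 1 gives
    //   parm2 = 1000 / 8 = 125 (first chunk), parm1 = 1 (minimum / last chunk),
    //   parm3 = (2000 + 126 - 1) / 126 = 16 cycles, parm4 = (125 - 1) / 15 = 8,
    // so successive chunks shrink 125, 117, 109, ... and together cover at least tc iterations.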
1085  } // case
1086  break;
1087 
1088  default:
1089  {
1090  __kmp_msg(
1091  kmp_ms_fatal, // Severity
1092  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1093  KMP_HNT( GetNewerLibrary ), // Hint
1094  __kmp_msg_null // Variadic argument list terminator
1095  );
1096  }
1097  break;
1098  } // switch
1099  pr->schedule = schedule;
1100  if ( active ) {
1101  /* Wait until sh->buffer_index equals my_buffer_index, i.e. until this buffer is free to use */
1102 
1103  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1104  gtid, my_buffer_index, sh->buffer_index) );
1105  __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1106  USE_ITT_BUILD_ARG( NULL )
1107  );
1108  // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and my_buffer_index are
1109  // *always* 32-bit integers.
1110  KMP_MB(); /* is this necessary? */
1111  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1112  gtid, my_buffer_index, sh->buffer_index) );
1113 
1114  th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1115  th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1116 #if USE_ITT_BUILD
1117  if ( pr->ordered ) {
1118  __kmp_itt_ordered_init( gtid );
1119  }; // if
1120 #endif /* USE_ITT_BUILD */
1121  }; // if
1122  #ifdef KMP_DEBUG
1123  {
1124  const char * buff;
1125  // create format specifiers before the debug output
1126  buff = __kmp_str_format(
1127  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1128  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1129  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1130  traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1131  traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1132  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1133  traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1134  KD_TRACE(10, ( buff,
1135  gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1136  pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1137  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1138  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1139  __kmp_str_free( &buff );
1140  }
1141  #endif
1142  #if ( KMP_STATIC_STEAL_ENABLED )
1143  if ( ___kmp_size_type < 8 ) {
1144  // It cannot be guaranteed that after execution of a loop with some other schedule kind
1145  // all the parm3 variables will contain the same value.
1146  // Even if all parm3 values were the same, there would still be a bad case, such as
1147  // using 0 and 1 rather than a program-lifetime increment.
1148  // So a dedicated variable is required; 'static_steal_counter' is used.
1149  if( schedule == kmp_sch_static_steal ) {
1150  // Other threads will inspect this variable when searching for a victim.
1151  // This is a flag showing that other threads may steal from this thread from now on.
1152  volatile T * p = &pr->u.p.static_steal_counter;
1153  *p = *p + 1;
1154  }
1155  }
1156  #endif // ( KMP_STATIC_STEAL_ENABLED )
1157 }
1158 
1159 /*
1160  * For ordered loops, either __kmp_dispatch_finish() should be called after
1161  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1162  * every chunk of iterations. If the ordered section(s) were not executed
1163  * for this iteration (or every iteration in this chunk), we need to set the
1164  * ordered iteration counters so that the next thread can proceed.
1165  */
1166 template< typename UT >
1167 static void
1168 __kmp_dispatch_finish( int gtid, ident_t *loc )
1169 {
1170  typedef typename traits_t< UT >::signed_t ST;
1171  kmp_info_t *th = __kmp_threads[ gtid ];
1172 
1173  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1174  if ( ! th -> th.th_team -> t.t_serialized ) {
1175 
1176  dispatch_private_info_template< UT > * pr =
1177  reinterpret_cast< dispatch_private_info_template< UT >* >
1178  ( th->th.th_dispatch->th_dispatch_pr_current );
1179  dispatch_shared_info_template< UT > volatile * sh =
1180  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1181  ( th->th.th_dispatch->th_dispatch_sh_current );
1182  KMP_DEBUG_ASSERT( pr );
1183  KMP_DEBUG_ASSERT( sh );
1184  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1185  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1186 
1187  if ( pr->ordered_bumped ) {
1188  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1189  gtid ) );
1190  pr->ordered_bumped = 0;
1191  } else {
1192  UT lower = pr->u.p.ordered_lower;
1193 
1194  #ifdef KMP_DEBUG
1195  {
1196  const char * buff;
1197  // create format specifiers before the debug output
1198  buff = __kmp_str_format(
1199  "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1200  traits_t< UT >::spec, traits_t< UT >::spec );
1201  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1202  __kmp_str_free( &buff );
1203  }
1204  #endif
1205 
1206  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1207  USE_ITT_BUILD_ARG(NULL)
1208  );
1209  KMP_MB(); /* is this necessary? */
1210  #ifdef KMP_DEBUG
1211  {
1212  const char * buff;
1213  // create format specifiers before the debug output
1214  buff = __kmp_str_format(
1215  "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1216  traits_t< UT >::spec, traits_t< UT >::spec );
1217  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1218  __kmp_str_free( &buff );
1219  }
1220  #endif
1221 
1222  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1223  } // if
1224  } // if
1225  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1226 }
1227 
1228 #ifdef KMP_GOMP_COMPAT
1229 
1230 template< typename UT >
1231 static void
1232 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1233 {
1234  typedef typename traits_t< UT >::signed_t ST;
1235  kmp_info_t *th = __kmp_threads[ gtid ];
1236 
1237  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1238  if ( ! th -> th.th_team -> t.t_serialized ) {
1239 // int cid;
1240  dispatch_private_info_template< UT > * pr =
1241  reinterpret_cast< dispatch_private_info_template< UT >* >
1242  ( th->th.th_dispatch->th_dispatch_pr_current );
1243  dispatch_shared_info_template< UT > volatile * sh =
1244  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1245  ( th->th.th_dispatch->th_dispatch_sh_current );
1246  KMP_DEBUG_ASSERT( pr );
1247  KMP_DEBUG_ASSERT( sh );
1248  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1249  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1250 
1251 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1252  UT lower = pr->u.p.ordered_lower;
1253  UT upper = pr->u.p.ordered_upper;
1254  UT inc = upper - lower + 1;
1255 
1256  if ( pr->ordered_bumped == inc ) {
1257  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1258  gtid ) );
1259  pr->ordered_bumped = 0;
1260  } else {
1261  inc -= pr->ordered_bumped;
1262 
1263  #ifdef KMP_DEBUG
1264  {
1265  const char * buff;
1266  // create format specifiers before the debug output
1267  buff = __kmp_str_format(
1268  "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1269  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1270  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1271  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1272  __kmp_str_free( &buff );
1273  }
1274  #endif
1275 
1276  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1277  USE_ITT_BUILD_ARG(NULL)
1278  );
1279 
1280  KMP_MB(); /* is this necessary? */
1281  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1282  gtid ) );
1283  pr->ordered_bumped = 0;
1285  #ifdef KMP_DEBUG
1286  {
1287  const char * buff;
1288  // create format specifiers before the debug output
1289  buff = __kmp_str_format(
1290  "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1291  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1292  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1293  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1294  __kmp_str_free( &buff );
1295  }
1296  #endif
1297 
1298  test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1299  }
1300 // }
1301  }
1302  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1303 }
1304 
1305 #endif /* KMP_GOMP_COMPAT */
1306 
1307 template< typename T >
1308 static int
1309 __kmp_dispatch_next(
1310  ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1311 ) {
1312 
1313  typedef typename traits_t< T >::unsigned_t UT;
1314  typedef typename traits_t< T >::signed_t ST;
1315  typedef typename traits_t< T >::floating_t DBL;
1316  static const int ___kmp_size_type = sizeof( UT );
1317 
1318  int status;
1319  dispatch_private_info_template< T > * pr;
1320  kmp_info_t * th = __kmp_threads[ gtid ];
1321  kmp_team_t * team = th -> th.th_team;
1322 
1323  #ifdef KMP_DEBUG
1324  {
1325  const char * buff;
1326  // create format specifiers before the debug output
1327  buff = __kmp_str_format(
1328  "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1329  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1330  KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1331  __kmp_str_free( &buff );
1332  }
1333  #endif
1334 
1335  if ( team -> t.t_serialized ) {
1336  /* NOTE: serialize this dispatch because we are not at the active level */
1337  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1338  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1339  KMP_DEBUG_ASSERT( pr );
1340 
1341  if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1342  *p_lb = 0;
1343  *p_ub = 0;
1344  if ( p_st != 0 ) {
1345  *p_st = 0;
1346  }
1347  if ( __kmp_env_consistency_check ) {
1348  if ( pr->pushed_ws != ct_none ) {
1349  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1350  }
1351  }
1352  } else if ( pr->nomerge ) {
1353  kmp_int32 last;
1354  T start;
1355  UT limit, trip, init;
1356  ST incr;
1357  T chunk = pr->u.p.parm1;
1358 
1359  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1360 
1361  init = chunk * pr->u.p.count++;
1362  trip = pr->u.p.tc - 1;
1363 
1364  if ( (status = (init <= trip)) == 0 ) {
1365  *p_lb = 0;
1366  *p_ub = 0;
1367  if ( p_st != 0 ) *p_st = 0;
1368  if ( __kmp_env_consistency_check ) {
1369  if ( pr->pushed_ws != ct_none ) {
1370  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1371  }
1372  }
1373  } else {
1374  start = pr->u.p.lb;
1375  limit = chunk + init - 1;
1376  incr = pr->u.p.st;
1377 
1378  if ( (last = (limit >= trip)) != 0 ) {
1379  limit = trip;
1380  #if KMP_OS_WINDOWS
1381  pr->u.p.last_upper = pr->u.p.ub;
1382  #endif /* KMP_OS_WINDOWS */
1383  }
1384  if ( p_last ) {
1385  *p_last = last;
1386  }
1387  if ( p_st != 0 ) {
1388  *p_st = incr;
1389  }
1390  if ( incr == 1 ) {
1391  *p_lb = start + init;
1392  *p_ub = start + limit;
1393  } else {
1394  *p_lb = start + init * incr;
1395  *p_ub = start + limit * incr;
1396  }
1397 
1398  if ( pr->ordered ) {
1399  pr->u.p.ordered_lower = init;
1400  pr->u.p.ordered_upper = limit;
1401  #ifdef KMP_DEBUG
1402  {
1403  const char * buff;
1404  // create format specifiers before the debug output
1405  buff = __kmp_str_format(
1406  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1407  traits_t< UT >::spec, traits_t< UT >::spec );
1408  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1409  __kmp_str_free( &buff );
1410  }
1411  #endif
1412  } // if
1413  } // if
1414  } else {
1415  pr->u.p.tc = 0;
1416 
1417  *p_lb = pr->u.p.lb;
1418  *p_ub = pr->u.p.ub;
1419  #if KMP_OS_WINDOWS
1420  pr->u.p.last_upper = *p_ub;
1421  #endif /* KMP_OS_WINDOWS */
1422 
1423  if ( p_st != 0 ) {
1424  *p_st = pr->u.p.st;
1425  }
1426  if ( p_last ) {
1427  *p_last = TRUE;
1428  }
1429  } // if
1430  #ifdef KMP_DEBUG
1431  {
1432  const char * buff;
1433  // create format specifiers before the debug output
1434  buff = __kmp_str_format(
1435  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1436  "p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
1437  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1438  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) );
1439  __kmp_str_free( &buff );
1440  }
1441  #endif
1442  return status;
1443  } else {
1444  kmp_int32 last = 0;
1445  dispatch_shared_info_template< UT > *sh;
1446  T start;
1447  ST incr;
1448  UT limit, trip, init;
1449 
1450  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1451  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1452 
1453  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1454  ( th->th.th_dispatch->th_dispatch_pr_current );
1455  KMP_DEBUG_ASSERT( pr );
1456  sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1457  ( th->th.th_dispatch->th_dispatch_sh_current );
1458  KMP_DEBUG_ASSERT( sh );
1459 
1460  if ( pr->u.p.tc == 0 ) {
1461  // zero trip count
1462  status = 0;
1463  } else {
1464  switch (pr->schedule) {
1465  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1466  case kmp_sch_static_steal:
1467  {
1468  T chunk = pr->u.p.parm1;
1469 
1470  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1471 
1472  trip = pr->u.p.tc - 1;
1473 
1474  if ( ___kmp_size_type > 4 ) {
1475  // Other threads do not look into the data of this thread,
1476  // so a volatile cast is not necessary.
1477  init = ( pr->u.p.count )++;
1478  status = ( init < (UT)pr->u.p.ub );
1479  } else {
1480  typedef union {
1481  struct {
1482  UT count;
1483  T ub;
1484  } p;
1485  kmp_int64 b;
1486  } union_i4;
1487  // All operations on 'count' or 'ub' must be combined atomically together.
1488  // stealing implemented only for 4-byte indexes
1489  {
1490  union_i4 vold, vnew;
1491  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1492  vnew = vold;
1493  vnew.p.count++;
1494  while( ! KMP_COMPARE_AND_STORE_ACQ64(
1495  ( volatile kmp_int64* )&pr->u.p.count,
1496  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1497  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1498  KMP_CPU_PAUSE();
1499  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1500  vnew = vold;
1501  vnew.p.count++;
1502  }
1503  vnew = vold;
1504  init = vnew.p.count;
1505  status = ( init < (UT)vnew.p.ub ) ;
1506  }
1507 
1508  if( !status ) {
1509  kmp_info_t **other_threads = team->t.t_threads;
1510  int while_limit = 10;
1511  int while_index = 0;
1512 
1513  // TODO: algorithm of searching for a victim
1514  // should be cleaned up and measured
1515  while ( ( !status ) && ( while_limit != ++while_index ) ) {
1516  union_i4 vold, vnew;
1517  kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1518  T victimIdx = pr->u.p.parm4;
1519  T oldVictimIdx = victimIdx;
1520  dispatch_private_info_template< T > * victim;
1521 
1522  do {
1523  if( !victimIdx ) {
1524  victimIdx = team->t.t_nproc - 1;
1525  } else {
1526  --victimIdx;
1527  }
1528  victim = reinterpret_cast< dispatch_private_info_template< T >* >
1529  ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1530  } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1531  // TODO: think about a proper place of this test
1532  if ( ( !victim ) ||
1533  ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1534  (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1535  // TODO: delay would be nice
1536  continue;
1537  // the victim is not ready yet to participate in stealing
1538  // because the victim is still in kmp_init_dispatch
1539  }
1540  if ( oldVictimIdx == victimIdx ) {
1541  break;
1542  }
1543  pr->u.p.parm4 = victimIdx;
1544 
1545  while( 1 ) {
1546  vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1547  vnew = vold;
1548 
1549  KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1550  if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1551  break;
1552  }
1553  vnew.p.ub -= (remaining >> 2);
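    // For illustration: if the victim still has count = 2 and ub = 10, then remaining = 8
    // and the thief lowers the victim's ub by 8 >> 2 = 2, i.e. it tries to claim the top
    // quarter of the victim's remaining chunks (chunk indexes 8 and 9) for itself.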
1554  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1555  #pragma warning( push )
1556  // disable warning on pointless comparison of unsigned with 0
1557  #pragma warning( disable: 186 )
1558  KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1559  #pragma warning( pop )
1560  // TODO: Should this be acquire or release?
1561  if ( KMP_COMPARE_AND_STORE_ACQ64(
1562  ( volatile kmp_int64 * )&victim->u.p.count,
1563  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1564  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1565  status = 1;
1566  while_index = 0;
1567  // now update own count and ub
1568  #if KMP_ARCH_X86
1569  // stealing executed on non-KMP_ARCH_X86 only
1570  // Atomic 64-bit write on ia32 is
1571  // unavailable, so we do this in steps.
1572  // This code is not tested.
1573  init = vold.p.count;
1574  pr->u.p.ub = 0;
1575  pr->u.p.count = init + 1;
1576  pr->u.p.ub = vnew.p.count;
1577  #else
1578  init = vnew.p.ub;
1579  vold.p.count = init + 1;
1580  // TODO: is it safe and enough?
1581  *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1582  #endif // KMP_ARCH_X86
1583  break;
1584  } // if
1585  KMP_CPU_PAUSE();
1586  } // while (1)
1587  } // while
1588  } // if
1589  } // if
1590  if ( !status ) {
1591  *p_lb = 0;
1592  *p_ub = 0;
1593  if ( p_st != 0 ) *p_st = 0;
1594  } else {
1595  start = pr->u.p.parm2;
1596  init *= chunk;
1597  limit = chunk + init - 1;
1598  incr = pr->u.p.st;
1599 
1600  KMP_DEBUG_ASSERT(init <= trip);
1601  if ( (last = (limit >= trip)) != 0 )
1602  limit = trip;
1603  if ( p_last ) {
1604  *p_last = last;
1605  }
1606  if ( p_st != 0 ) *p_st = incr;
1607 
1608  if ( incr == 1 ) {
1609  *p_lb = start + init;
1610  *p_ub = start + limit;
1611  } else {
1612  *p_lb = start + init * incr;
1613  *p_ub = start + limit * incr;
1614  }
1615 
1616  if ( pr->ordered ) {
1617  pr->u.p.ordered_lower = init;
1618  pr->u.p.ordered_upper = limit;
1619  #ifdef KMP_DEBUG
1620  {
1621  const char * buff;
1622  // create format specifiers before the debug output
1623  buff = __kmp_str_format(
1624  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1625  traits_t< UT >::spec, traits_t< UT >::spec );
1626  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1627  __kmp_str_free( &buff );
1628  }
1629  #endif
1630  } // if
1631  } // if
1632  break;
1633  } // case
1634  #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1635  case kmp_sch_static_balanced:
1636  {
1637  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1638  if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1639  pr->u.p.count = 1;
1640  *p_lb = pr->u.p.lb;
1641  *p_ub = pr->u.p.ub;
1642  last = pr->u.p.parm1;
1643  if ( p_last ) {
1644  *p_last = last;
1645  }
1646  if ( p_st )
1647  *p_st = pr->u.p.st;
1648  } else { /* no iterations to do */
1649  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1650  }
1651  if ( pr->ordered ) {
1652  #ifdef KMP_DEBUG
1653  {
1654  const char * buff;
1655  // create format specifiers before the debug output
1656  buff = __kmp_str_format(
1657  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1658  traits_t< UT >::spec, traits_t< UT >::spec );
1659  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1660  __kmp_str_free( &buff );
1661  }
1662  #endif
1663  } // if
1664  } // case
1665  break;
1666  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1667  case kmp_sch_static_chunked:
1668  {
1669  T parm1;
1670 
1671  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1672  gtid ) );
1673  parm1 = pr->u.p.parm1;
1674 
1675  trip = pr->u.p.tc - 1;
1676  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
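    // For illustration: with parm1 = 5 (chunk) and nproc = 4, the thread with tid 1
    // computes init = 5*(0 + 1) = 5 on its first call; count then grows by nproc below,
    // so later calls yield init = 5*(4 + 1) = 25, 5*(8 + 1) = 45, ... (a block-cyclic pattern).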
1677 
1678  if ( (status = (init <= trip)) != 0 ) {
1679  start = pr->u.p.lb;
1680  incr = pr->u.p.st;
1681  limit = parm1 + init - 1;
1682 
1683  if ( (last = (limit >= trip)) != 0 )
1684  limit = trip;
1685 
1686  if ( p_last ) {
1687  *p_last = last;
1688  }
1689  if ( p_st != 0 ) *p_st = incr;
1690 
1691  pr->u.p.count += team->t.t_nproc;
1692 
1693  if ( incr == 1 ) {
1694  *p_lb = start + init;
1695  *p_ub = start + limit;
1696  }
1697  else {
1698  *p_lb = start + init * incr;
1699  *p_ub = start + limit * incr;
1700  }
1701 
1702  if ( pr->ordered ) {
1703  pr->u.p.ordered_lower = init;
1704  pr->u.p.ordered_upper = limit;
1705  #ifdef KMP_DEBUG
1706  {
1707  const char * buff;
1708  // create format specifiers before the debug output
1709  buff = __kmp_str_format(
1710  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1711  traits_t< UT >::spec, traits_t< UT >::spec );
1712  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1713  __kmp_str_free( &buff );
1714  }
1715  #endif
1716  } // if
1717  } // if
1718  } // case
1719  break;
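  // Illustrative sketch (placeholders only, not runtime symbols): for
  // kmp_sch_static_chunked the chunk size is parm1 and chunks are dealt
  // round-robin, so on its k-th call a thread computes its bounds as below.
  #if 0
  // bounds of the k-th chunk taken by thread 'tid' in a team of 'nproc'
  // threads, for a loop starting at 'lb' with stride 'st' and chunk 'chunk'
  static void chunk_bounds( long lb, long st, long chunk,
                            long k, long nproc, long tid,
                            long *first, long *last )
  {
      long init = chunk * ( k * nproc + tid );   // first iteration index
      *first = lb + init * st;
      *last  = lb + ( init + chunk - 1 ) * st;   // caller clips to the trip count
  }
  // e.g. chunk = 10, nproc = 4, tid = 2, lb = 0, st = 1:
  // successive calls yield [20,29], [60,69], [100,109], ...
  #endif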
1720 
1721  case kmp_sch_dynamic_chunked:
1722  {
1723  T chunk = pr->u.p.parm1;
1724 
1725  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1726  gtid ) );
1727 
1728  init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1729  trip = pr->u.p.tc - 1;
1730 
1731  if ( (status = (init <= trip)) == 0 ) {
1732  *p_lb = 0;
1733  *p_ub = 0;
1734  if ( p_st != 0 ) *p_st = 0;
1735  } else {
1736  start = pr->u.p.lb;
1737  limit = chunk + init - 1;
1738  incr = pr->u.p.st;
1739 
1740  if ( (last = (limit >= trip)) != 0 )
1741  limit = trip;
1742  if ( p_last ) {
1743  *p_last = last;
1744  }
1745  if ( p_st != 0 ) *p_st = incr;
1746 
1747  if ( incr == 1 ) {
1748  *p_lb = start + init;
1749  *p_ub = start + limit;
1750  } else {
1751  *p_lb = start + init * incr;
1752  *p_ub = start + limit * incr;
1753  }
1754 
1755  if ( pr->ordered ) {
1756  pr->u.p.ordered_lower = init;
1757  pr->u.p.ordered_upper = limit;
1758  #ifdef KMP_DEBUG
1759  {
1760  const char * buff;
1761  // create format specifiers before the debug output
1762  buff = __kmp_str_format(
1763  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1764  traits_t< UT >::spec, traits_t< UT >::spec );
1765  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1766  __kmp_str_free( &buff );
1767  }
1768  #endif
1769  } // if
1770  } // if
1771  } // case
1772  break;
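  // Illustrative sketch (assumes C11 <stdatomic.h>; names are placeholders):
  // the dynamic schedule above is equivalent in spirit to claiming fixed-size
  // chunks from one shared counter; each test_then_inc_acq on
  // sh->u.s.iteration reserves the next 'chunk' consecutive iterations.
  #if 0
  extern _Atomic long shared_counter;            // starts at 0
  void run_dynamic( long chunk, long trip )      // trip = last iteration index
  {
      for ( long c = atomic_fetch_add( &shared_counter, 1 ); c * chunk <= trip;
            c = atomic_fetch_add( &shared_counter, 1 ) ) {
          long first = c * chunk;
          long last  = first + chunk - 1;
          if ( last > trip ) last = trip;
          // execute iterations [first, last]
      }
  }
  #endif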
1773 
1774  case kmp_sch_guided_iterative_chunked:
1775  {
1776  T chunkspec = pr->u.p.parm1;
1777  KD_TRACE(100,
1778  ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1779  trip = pr->u.p.tc;
1780  // Start atomic part of calculations
1781  while(1) {
1782  ST remaining; // signed, because can be < 0
1783  init = sh->u.s.iteration; // shared value
1784  remaining = trip - init;
1785  if ( remaining <= 0 ) { // AC: need to compare with 0 first
1786  // nothing to do, don't try atomic op
1787  status = 0;
1788  break;
1789  }
1790  if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1791  // use dynamic-style schedule
1792  // atomically increment iterations, get old value
1793  init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1794  remaining = trip - init;
1795  if (remaining <= 0) {
1796  status = 0; // all iterations got by other threads
1797  } else {
1798  // got some iterations to work on
1799  status = 1;
1800  if ( (T)remaining > chunkspec ) {
1801  limit = init + chunkspec - 1;
1802  } else {
1803  last = 1; // the last chunk
1804  limit = init + remaining - 1;
1805  } // if
1806  } // if
1807  break;
1808  } // if
1809  limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1810  if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1811  // CAS was successful, chunk obtained
1812  status = 1;
1813  --limit;
1814  break;
1815  } // if
1816  } // while
1817  if ( status != 0 ) {
1818  start = pr->u.p.lb;
1819  incr = pr->u.p.st;
1820  if ( p_st != NULL )
1821  *p_st = incr;
1822  if ( p_last != NULL )
1823  *p_last = last;
1824  *p_lb = start + init * incr;
1825  *p_ub = start + limit * incr;
1826  if ( pr->ordered ) {
1827  pr->u.p.ordered_lower = init;
1828  pr->u.p.ordered_upper = limit;
1829  #ifdef KMP_DEBUG
1830  {
1831  const char * buff;
1832  // create format specifiers before the debug output
1833  buff = __kmp_str_format(
1834  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1835  traits_t< UT >::spec, traits_t< UT >::spec );
1836  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1837  __kmp_str_free( &buff );
1838  }
1839  #endif
1840  } // if
1841  } else {
1842  *p_lb = 0;
1843  *p_ub = 0;
1844  if ( p_st != NULL )
1845  *p_st = 0;
1846  } // if
1847  } // case
1848  break;
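  // Worked example (illustrative): parm3 stores, as a double, roughly
  // 1/(K*nproc) with K = 2 by default (see the comments above), so each
  // successful CAS claims about remaining/(2*nproc) iterations. With
  // trip = 1000 and nproc = 4 the successive grabs are ~125, ~109, ~95, ...,
  // until fewer than parm2 iterations remain and the code falls back to
  // plain dynamic chunks of size chunkspec.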
1849 
1850  case kmp_sch_guided_analytical_chunked:
1851  {
1852  T chunkspec = pr->u.p.parm1;
1853  UT chunkIdx;
1854  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1855  /* for storing the original FPCW value for Windows* OS on
1856  IA-32 architecture, 8-byte version */
1857  unsigned int oldFpcw;
1858  int fpcwSet = 0;
1859  #endif
1860  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1861  gtid ) );
1862 
1863  trip = pr->u.p.tc;
1864 
1865  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1866  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1867 
1868  while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1869  chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1870  if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1871  --trip;
1872  /* use dynamic-style scheduling */
1873  init = chunkIdx * chunkspec + pr->u.p.count;
1874  /* need to verify init > 0 in case of overflow in the above calculation */
1875  if ( (status = (init > 0 && init <= trip)) != 0 ) {
1876  limit = init + chunkspec -1;
1877 
1878  if ( (last = (limit >= trip)) != 0 )
1879  limit = trip;
1880  }
1881  break;
1882  } else {
1883  /* use exponential-style scheduling */
1884  /* The following check is to workaround the lack of long double precision on Windows* OS.
1885  This check works around the possible effect that init != 0 for chunkIdx == 0.
1886  */
1887  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1888  /* If we haven't already done so, save original
1889  FPCW and set precision to 64-bit, as Windows* OS
1890  on IA-32 architecture defaults to 53-bit */
1891  if ( !fpcwSet ) {
1892  oldFpcw = _control87(0,0x30000);
1893  fpcwSet = 0x30000;
1894  }
1895  #endif
1896  if ( chunkIdx ) {
1897  init = __kmp_dispatch_guided_remaining< T >(
1898  trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1899  KMP_DEBUG_ASSERT(init);
1900  init = trip - init;
1901  } else
1902  init = 0;
1903  limit = trip - __kmp_dispatch_guided_remaining< T >(
1904  trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1905  KMP_ASSERT(init <= limit);
1906  if ( init < limit ) {
1907  KMP_DEBUG_ASSERT(limit <= trip);
1908  --limit;
1909  status = 1;
1910  break;
1911  } // if
1912  } // if
1913  } // while (1)
1914  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1915  /* restore FPCW if necessary; check fpcwSet first, since oldFpcw may be uninitialized */
1916  if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1917  _control87(oldFpcw,0x30000);
1918  #endif
1919  if ( status != 0 ) {
1920  start = pr->u.p.lb;
1921  incr = pr->u.p.st;
1922  if ( p_st != NULL )
1923  *p_st = incr;
1924  if ( p_last != NULL )
1925  *p_last = last;
1926  *p_lb = start + init * incr;
1927  *p_ub = start + limit * incr;
1928  if ( pr->ordered ) {
1929  pr->u.p.ordered_lower = init;
1930  pr->u.p.ordered_upper = limit;
1931  #ifdef KMP_DEBUG
1932  {
1933  const char * buff;
1934  // create format specifiers before the debug output
1935  buff = __kmp_str_format(
1936  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1937  traits_t< UT >::spec, traits_t< UT >::spec );
1938  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1939  __kmp_str_free( &buff );
1940  }
1941  #endif
1942  }
1943  } else {
1944  *p_lb = 0;
1945  *p_ub = 0;
1946  if ( p_st != NULL )
1947  *p_st = 0;
1948  }
1949  } // case
1950  break;
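  // Note (illustrative summary): in the analytical guided schedule the chunk
  // boundaries follow an approximately geometric decay derived from the base
  // stored in parm3; __kmp_dispatch_guided_remaining evaluates how many
  // iterations remain before chunk number chunkIdx, so a thread derives its
  // [init, limit] range directly from the chunk index it obtained, without a
  // CAS retry loop. Once chunkIdx reaches parm2 the remaining iterations are
  // handed out as plain dynamic chunks of size chunkspec. The FPCW
  // save/restore applies only to Windows* OS on IA-32, where the default
  // 53-bit precision could perturb these long double computations.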
1951 
1952  case kmp_sch_trapezoidal:
1953  {
1954  UT index;
1955  T parm2 = pr->u.p.parm2;
1956  T parm3 = pr->u.p.parm3;
1957  T parm4 = pr->u.p.parm4;
1958  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1959  gtid ) );
1960 
1961  index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1962 
1963  init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1964  trip = pr->u.p.tc - 1;
1965 
1966  if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1967  *p_lb = 0;
1968  *p_ub = 0;
1969  if ( p_st != 0 ) *p_st = 0;
1970  } else {
1971  start = pr->u.p.lb;
1972  limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1973  incr = pr->u.p.st;
1974 
1975  if ( (last = (limit >= trip)) != 0 )
1976  limit = trip;
1977 
1978  if ( p_last != 0 ) {
1979  *p_last = last;
1980  }
1981  if ( p_st != 0 ) *p_st = incr;
1982 
1983  if ( incr == 1 ) {
1984  *p_lb = start + init;
1985  *p_ub = start + limit;
1986  } else {
1987  *p_lb = start + init * incr;
1988  *p_ub = start + limit * incr;
1989  }
1990 
1991  if ( pr->ordered ) {
1992  pr->u.p.ordered_lower = init;
1993  pr->u.p.ordered_upper = limit;
1994  #ifdef KMP_DEBUG
1995  {
1996  const char * buff;
1997  // create format specifiers before the debug output
1998  buff = __kmp_str_format(
1999  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2000  traits_t< UT >::spec, traits_t< UT >::spec );
2001  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2002  __kmp_str_free( &buff );
2003  }
2004  #endif
2005  } // if
2006  } // if
2007  } // case
2008  break;
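  // Derivation (illustrative): for the trapezoidal schedule chunk sizes
  // decrease linearly, chunk j having size parm2 - j*parm4 for
  // j = 0 .. parm3-1. The first iteration of chunk 'index' is therefore the
  // sum of the first 'index' chunk sizes:
  //   sum_{j=0}^{index-1} (parm2 - j*parm4)
  //     = index*parm2 - parm4*index*(index-1)/2
  //     = index*(2*parm2 - (index-1)*parm4)/2
  // which is exactly the expression used for 'init' above; 'limit' is the
  // same sum taken one chunk further, minus one.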
2009  } // switch
2010  } // if tc == 0;
2011 
2012  if ( status == 0 ) {
2013  UT num_done;
2014 
2015  num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2016  #ifdef KMP_DEBUG
2017  {
2018  const char * buff;
2019  // create format specifiers before the debug output
2020  buff = __kmp_str_format(
2021  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2022  traits_t< UT >::spec );
2023  KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2024  __kmp_str_free( &buff );
2025  }
2026  #endif
2027 
2028  if ( num_done == team->t.t_nproc-1 ) {
2029  /* NOTE: release this buffer to be reused */
2030 
2031  KMP_MB(); /* Flush all pending memory write invalidates. */
2032 
2033  sh->u.s.num_done = 0;
2034  sh->u.s.iteration = 0;
2035 
2036  /* TODO replace with general release procedure? */
2037  if ( pr->ordered ) {
2038  sh->u.s.ordered_iteration = 0;
2039  }
2040 
2041  KMP_MB(); /* Flush all pending memory write invalidates. */
2042 
2043  sh -> buffer_index += KMP_MAX_DISP_BUF;
2044  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2045  gtid, sh->buffer_index) );
2046 
2047  KMP_MB(); /* Flush all pending memory write invalidates. */
2048 
2049  } // if
2050  if ( __kmp_env_consistency_check ) {
2051  if ( pr->pushed_ws != ct_none ) {
2052  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2053  }
2054  }
2055 
2056  th -> th.th_dispatch -> th_deo_fcn = NULL;
2057  th -> th.th_dispatch -> th_dxo_fcn = NULL;
2058  th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2059  th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2060  } // if (status == 0)
2061 #if KMP_OS_WINDOWS
2062  else if ( last ) {
2063  pr->u.p.last_upper = pr->u.p.ub;
2064  }
2065 #endif /* KMP_OS_WINDOWS */
2066  } // if
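 // Note (illustrative): when the last thread of the team reaches status == 0
 // (num_done == nproc-1), the shared buffer is reset and buffer_index is
 // advanced by KMP_MAX_DISP_BUF so the same slot can serve a later loop;
 // threads select their shared buffer in __kmp_dispatch_init roughly as
 //   &team->t.t_disp_buffer[ buffer_index % KMP_MAX_DISP_BUF ]
 // (the exact expression may differ slightly).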
2067 
2068  #ifdef KMP_DEBUG
2069  {
2070  const char * buff;
2071  // create format specifiers before the debug output
2072  buff = __kmp_str_format(
2073  "__kmp_dispatch_next: T#%%d normal case: " \
2074  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2075  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2076  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2077  __kmp_str_free( &buff );
2078  }
2079  #endif
2080  return status;
2081 }
2082 
2083 //-----------------------------------------------------------------------------------------
2084 // Dispatch routines
2085 // Transfer call to template< type T >
2086 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2087 // T lb, T ub, ST st, ST chunk )
2088 extern "C" {
2089 
2105 void
2106 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2107  kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2108 {
2109  KMP_DEBUG_ASSERT( __kmp_init_serial );
2110  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2111 }
2115 void
2116 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2117  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2118 {
2119  KMP_DEBUG_ASSERT( __kmp_init_serial );
2120  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2121 }
2122 
2126 void
2127 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2128  kmp_int64 lb, kmp_int64 ub,
2129  kmp_int64 st, kmp_int64 chunk )
2130 {
2131  KMP_DEBUG_ASSERT( __kmp_init_serial );
2132  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2133 }
2134 
2138 void
2139 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2140  kmp_uint64 lb, kmp_uint64 ub,
2141  kmp_int64 st, kmp_int64 chunk )
2142 {
2143  KMP_DEBUG_ASSERT( __kmp_init_serial );
2144  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2145 }
2146 
2159 int
2160 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2161  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2162 {
2163  return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2164 }
2165 
2169 int
2170 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2171  kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2172 {
2173  return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2174 }
2175 
2179 int
2180 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2181  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2182 {
2183  return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2184 }
2185 
2189 int
2190 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2191  kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2192 {
2193  return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2194 }
2195 
2202 void
2203 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2204 {
2205  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2206 }
2207 
2211 void
2212 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2213 {
2214  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2215 }
2216 
2220 void
2221 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2222 {
2223  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2224 }
2225 
2229 void
2230 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2231 {
2232  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2233 }
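// Illustrative sketch (not generated by this library): a compiler typically
// lowers a dynamically scheduled loop such as
//     #pragma omp for schedule(dynamic, 4)
//     for ( int i = 0; i < n; ++i ) body( i );
// onto the entry points above roughly as follows; 'loc', 'gtid', 'n' and
// 'body' are placeholders and the real code generation is compiler-specific
// (ordered loops additionally call __kmpc_dispatch_fini_4 after each chunk).
#if 0
    kmp_int32 lb, ub, st, last;
    __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked,
                            0, n - 1, 1, 4 );
    while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st )
            body( i );
    }
#endif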
2236 //-----------------------------------------------------------------------------------------
2237 // Non-template routines from kmp_dispatch.cpp used in other sources
2238 
2239 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2240  return value == checker;
2241 }
2242 
2243 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2244  return value != checker;
2245 }
2246 
2247 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2248  return value < checker;
2249 }
2250 
2251 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2252  return value >= checker;
2253 }
2254 
2255 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2256  return value <= checker;
2257 }
2258 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2259  return value == checker;
2260 }
2261 
2262 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2263  return value != checker;
2264 }
2265 
2266 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2267  return value < checker;
2268 }
2269 
2270 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2271  return value >= checker;
2272 }
2273 
2274 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2275  return value <= checker;
2276 }
2277 
2278 kmp_uint32
2279 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2280  kmp_uint32 checker,
2281  kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2282  , void * obj // Higher-level synchronization object, or NULL.
2283  )
2284 {
2285  // note: we may not belong to a team at this point
2286  register volatile kmp_uint32 * spin = spinner;
2287  register kmp_uint32 check = checker;
2288  register kmp_uint32 spins;
2289  register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2290  register kmp_uint32 r;
2291 
2292  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2293  KMP_INIT_YIELD( spins );
2294  // main wait spin loop
2295  while(!f(r = TCR_4(*spin), check)) {
2296  KMP_FSYNC_SPIN_PREPARE( obj );
2297  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2298  It causes problems with infinite recursion because of exit lock */
2299  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2300  __kmp_abort_thread(); */
2301 
2302  __kmp_static_delay(TRUE);
2303 
2304  /* if we have waited a bit, or are oversubscribed, yield */
2305  /* pause is in the following code */
2306  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2307  KMP_YIELD_SPIN( spins );
2308  }
2309  KMP_FSYNC_SPIN_ACQUIRED( obj );
2310  return r;
2311 }
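// Usage sketch (illustrative; 'flag' is a placeholder): callers spin until
// pred( *spinner, checker ) becomes true, e.g. waiting for a shared 32-bit
// flag to reach a given value using one of the predicates above.
#if 0
    volatile kmp_uint32 flag = 0;
    // ... another thread eventually stores 1 to 'flag' ...
    (void) __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );
#endif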
2312 
2313 kmp_uint64
2314 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2315  kmp_uint64 checker,
2316  kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2317  , void * obj // Higher-level synchronization object, or NULL.
2318  )
2319 {
2320  // note: we may not belong to a team at this point
2321  register volatile kmp_uint64 * spin = spinner;
2322  register kmp_uint64 check = checker;
2323  register kmp_uint32 spins;
2324  register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2325  register kmp_uint64 r;
2326 
2327  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2328  KMP_INIT_YIELD( spins );
2329  // main wait spin loop
2330  while(!f(r = *spin, check))
2331  {
2332  KMP_FSYNC_SPIN_PREPARE( obj );
2333  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2334  It causes problems with infinite recursion because of exit lock */
2335  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2336  __kmp_abort_thread(); */
2337 
2338  __kmp_static_delay(TRUE);
2339 
2340  // if we are oversubscribed,
2341  // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2342  // pause is in the following code
2343  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2344  KMP_YIELD_SPIN( spins );
2345  }
2346  KMP_FSYNC_SPIN_ACQUIRED( obj );
2347  return r;
2348 }
2349 
2350 } // extern "C"
2351 
2352 #ifdef KMP_GOMP_COMPAT
2353 
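// Note (illustrative): these __kmp_aux_dispatch_* wrappers serve the GOMP
// compatibility layer; they differ from the __kmpc_dispatch_* entry points
// above only in that the caller passes push_ws explicitly, which controls
// whether the workshare is pushed for consistency checking.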
2354 void
2355 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2356  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2357  kmp_int32 chunk, int push_ws )
2358 {
2359  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2360  push_ws );
2361 }
2362 
2363 void
2364 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2365  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2366  kmp_int32 chunk, int push_ws )
2367 {
2368  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2369  push_ws );
2370 }
2371 
2372 void
2373 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2374  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2375  kmp_int64 chunk, int push_ws )
2376 {
2377  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2378  push_ws );
2379 }
2380 
2381 void
2382 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2383  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2384  kmp_int64 chunk, int push_ws )
2385 {
2386  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2387  push_ws );
2388 }
2389 
2390 void
2391 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2392 {
2393  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2394 }
2395 
2396 void
2397 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2398 {
2399  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2400 }
2401 
2402 void
2403 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2404 {
2405  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2406 }
2407 
2408 void
2409 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2410 {
2411  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2412 }
2413 
2414 #endif /* KMP_GOMP_COMPAT */
2415 
2416 /* ------------------------------------------------------------------------ */
2417 /* ------------------------------------------------------------------------ */
2418