Intel® OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  * $Revision: 42674 $
4  * $Date: 2013-09-18 11:12:49 -0500 (Wed, 18 Sep 2013) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2013 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 /*
38  * Dynamic scheduling initialization and dispatch.
39  *
40  * NOTE: __kmp_nth is constant inside any dispatch loop, but its value
41  * may change between parallel regions. __kmp_max_nth
42  * is the largest value __kmp_nth may take; 1 is the smallest.
43  *
44  */
45 
46 /* ------------------------------------------------------------------------ */
47 /* ------------------------------------------------------------------------ */
48 
49 #include "kmp.h"
50 #include "kmp_i18n.h"
51 #include "kmp_itt.h"
52 #include "kmp_str.h"
53 #include "kmp_error.h"
54 #if KMP_OS_WINDOWS && KMP_ARCH_X86
55  #include <float.h>
56 #endif
57 
58 /* ------------------------------------------------------------------------ */
59 /* ------------------------------------------------------------------------ */
60 
61 #ifdef KMP_STATIC_STEAL_ENABLED
62 
63  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
64  template< typename T >
65  struct dispatch_private_infoXX_template {
66  typedef typename traits_t< T >::unsigned_t UT;
67  typedef typename traits_t< T >::signed_t ST;
68  UT count; // unsigned
69  T ub;
70  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
71  T lb;
72  ST st; // signed
73  UT tc; // unsigned
74  T static_steal_counter; // for static_steal only; maybe better to put after ub
75 
76  /* parm[1-4] are used in different ways by different scheduling algorithms */
77 
78  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
79  // a) parm3 is properly aligned and
80  // b) all parm1-4 are in the same cache line.
81  // Because parm1-4 are used together, performance seems to be better
82  // if they are in the same cache line (not measured though).
83 
84  struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
85  T parm1;
86  T parm2;
87  T parm3;
88  T parm4;
89  };
90 
91  UT ordered_lower; // unsigned
92  UT ordered_upper; // unsigned
93  #if KMP_OS_WINDOWS
94  T last_upper;
95  #endif /* KMP_OS_WINDOWS */
96  };
97 
98 #else /* KMP_STATIC_STEAL_ENABLED */
99 
100  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
101  template< typename T >
102  struct dispatch_private_infoXX_template {
103  typedef typename traits_t< T >::unsigned_t UT;
104  typedef typename traits_t< T >::signed_t ST;
105  T lb;
106  T ub;
107  ST st; // signed
108  UT tc; // unsigned
109 
110  T parm1;
111  T parm2;
112  T parm3;
113  T parm4;
114 
115  UT count; // unsigned
116 
117  UT ordered_lower; // unsigned
118  UT ordered_upper; // unsigned
119  #if KMP_OS_WINDOWS
120  T last_upper;
121  #endif /* KMP_OS_WINDOWS */
122  };
123 
124 #endif /* KMP_STATIC_STEAL_ENABLED */
125 
126 // replaces dispatch_private_info structure and dispatch_private_info_t type
127 template< typename T >
128 struct KMP_ALIGN_CACHE dispatch_private_info_template {
129  // duplicate alignment here, otherwise size of structure is not correct in our compiler
130  union KMP_ALIGN_CACHE private_info_tmpl {
131  dispatch_private_infoXX_template< T > p;
132  dispatch_private_info64_t p64;
133  } u;
134  enum sched_type schedule; /* scheduling algorithm */
135  kmp_uint32 ordered; /* ordered clause specified */
136  kmp_uint32 ordered_bumped;
137  kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
138  dispatch_private_info * next; /* stack of buffers for nest of serial regions */
139  kmp_uint32 nomerge; /* don't merge iters if serialized */
140  kmp_uint32 type_size;
141  enum cons_type pushed_ws;
142 };
143 
144 
145 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
146 template< typename UT >
147 struct dispatch_shared_infoXX_template {
148  /* chunk index under dynamic, number of idle threads under static-steal;
149  iteration index otherwise */
150  volatile UT iteration;
151  volatile UT num_done;
152  volatile UT ordered_iteration;
153  UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
154 };
155 
156 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
157 template< typename UT >
158 struct dispatch_shared_info_template {
159  // we need union here to keep the structure size
160  union shared_info_tmpl {
161  dispatch_shared_infoXX_template< UT > s;
162  dispatch_shared_info64_t s64;
163  } u;
164  volatile kmp_uint32 buffer_index;
165 };
166 
167 /* ------------------------------------------------------------------------ */
168 /* ------------------------------------------------------------------------ */
169 
170 static void
171 __kmp_static_delay( int arg )
172 {
173  /* Work around weird code-gen bug that causes assert to trip */
174  #if KMP_ARCH_X86_64 && KMP_OS_LINUX
175  #else
176  KMP_ASSERT( arg >= 0 );
177  #endif
178 }
179 
180 static void
181 __kmp_static_yield( int arg )
182 {
183  __kmp_yield( arg );
184 }
185 
186 #undef USE_TEST_LOCKS
187 
188 // test_then_add template (general template should NOT be used)
189 template< typename T >
190 static __forceinline T
191 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
192 
193 template<>
194 __forceinline kmp_int32
195 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
196 {
197  kmp_int32 r;
198  r = KMP_TEST_THEN_ADD32( p, d );
199  return r;
200 }
201 
202 template<>
203 __forceinline kmp_int64
204 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
205 {
206  kmp_int64 r;
207  r = KMP_TEST_THEN_ADD64( p, d );
208  return r;
209 }
210 
211 // test_then_inc_acq template (general template should NOT be used)
212 template< typename T >
213 static __forceinline T
214 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
215 
216 template<>
217 __forceinline kmp_int32
218 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
219 {
220  kmp_int32 r;
221  r = KMP_TEST_THEN_INC_ACQ32( p );
222  return r;
223 }
224 
225 template<>
226 __forceinline kmp_int64
227 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
228 {
229  kmp_int64 r;
230  r = KMP_TEST_THEN_INC_ACQ64( p );
231  return r;
232 }
233 
234 // test_then_inc template (general template should NOT be used)
235 template< typename T >
236 static __forceinline T
237 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
238 
239 template<>
240 __forceinline kmp_int32
241 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
242 {
243  kmp_int32 r;
244  r = KMP_TEST_THEN_INC32( p );
245  return r;
246 }
247 
248 template<>
249 __forceinline kmp_int64
250 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
251 {
252  kmp_int64 r;
253  r = KMP_TEST_THEN_INC64( p );
254  return r;
255 }
256 
257 // compare_and_swap template (general template should NOT be used)
258 template< typename T >
259 static __forceinline kmp_int32
260 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
261 
262 template<>
263 __forceinline kmp_int32
264 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
265 {
266  return KMP_COMPARE_AND_STORE_REL32( p, c, s );
267 }
268 
269 template<>
270 __forceinline kmp_int32
271 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
272 {
273  return KMP_COMPARE_AND_STORE_REL64( p, c, s );
274 }
275 
276 /*
277  Spin wait loop that first does pause, then yield.
278  Waits until function returns non-zero when called with *spinner and check.
279  Does NOT put threads to sleep.
280 #if USE_ITT_BUILD
281  Arguments:
282  obj -- is the higher-level synchronization object to report to ittnotify. It is used to report
283  locks consistently. For example, if a lock is acquired immediately, its address is
284  reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
285  immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
286  address, not the address of the low-level spinner.
287 #endif // USE_ITT_BUILD
288 */
289 template< typename UT >
290 // ToDo: make inline function (move to header file for icl)
291 static UT // unsigned 4- or 8-byte type
292 __kmp_wait_yield( volatile UT * spinner,
293  UT checker,
294  kmp_uint32 (* pred)( UT, UT )
295  USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
296  )
297 {
298  // note: we may not belong to a team at this point
299  register volatile UT * spin = spinner;
300  register UT check = checker;
301  register kmp_uint32 spins;
302  register kmp_uint32 (*f) ( UT, UT ) = pred;
303  register UT r;
304 
305  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
306  KMP_INIT_YIELD( spins );
307  // main wait spin loop
308  while(!f(r = *spin, check))
309  {
310  KMP_FSYNC_SPIN_PREPARE( obj );
311  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
312  It causes problems with infinite recursion because of exit lock */
313  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
314  __kmp_abort_thread(); */
315 
316  __kmp_static_delay(TRUE);
317 
318  // if we are oversubscribed,
319  // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
320  // the pause is in the following code
321  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
322  KMP_YIELD_SPIN( spins );
323  }
324  KMP_FSYNC_SPIN_ACQUIRED( obj );
325  return r;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_eq( UT value, UT checker) {
330  return value == checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_neq( UT value, UT checker) {
335  return value != checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_lt( UT value, UT checker) {
340  return value < checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_ge( UT value, UT checker) {
345  return value >= checker;
346 }
347 
348 template< typename UT >
349 static kmp_uint32 __kmp_le( UT value, UT checker) {
350  return value <= checker;
351 }
352 
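// --- Illustrative sketch (not part of the original source) ----------------
// A minimal usage sketch of __kmp_wait_yield with the predicate templates
// above: block until a shared counter reaches a given value. The function
// name is hypothetical; the runtime uses this same pattern below for
// ordered iterations and for dispatch-buffer reuse.
static void wait_until_reached_example( volatile kmp_uint32 * counter, kmp_uint32 value )
{
    // spin (pause/yield, no sleep) until *counter >= value
    (void)__kmp_wait_yield< kmp_uint32 >( counter, value, __kmp_ge< kmp_uint32 >
                                          USE_ITT_BUILD_ARG( NULL )
                                        );
}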
353 
354 /* ------------------------------------------------------------------------ */
355 /* ------------------------------------------------------------------------ */
356 
357 static void
358 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
359 {
360  kmp_info_t *th;
361 
362  KMP_DEBUG_ASSERT( gtid_ref );
363 
364  if ( __kmp_env_consistency_check ) {
365  th = __kmp_threads[*gtid_ref];
366  if ( th -> th.th_root -> r.r_active
367  && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
368  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
369  }
370  }
371 }
372 
373 template< typename UT >
374 static void
375 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
376 {
377  typedef typename traits_t< UT >::signed_t ST;
378  dispatch_private_info_template< UT > * pr;
379 
380  int gtid = *gtid_ref;
381 // int cid = *cid_ref;
382  kmp_info_t *th = __kmp_threads[ gtid ];
383  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
384 
385  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
386  if ( __kmp_env_consistency_check ) {
387  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
388  ( th -> th.th_dispatch -> th_dispatch_pr_current );
389  if ( pr -> pushed_ws != ct_none ) {
390  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
391  }
392  }
393 
394  if ( ! th -> th.th_team -> t.t_serialized ) {
395  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
396  ( th -> th.th_dispatch -> th_dispatch_sh_current );
397  UT lower;
398 
399  if ( ! __kmp_env_consistency_check ) {
400  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
401  ( th -> th.th_dispatch -> th_dispatch_pr_current );
402  }
403  lower = pr->u.p.ordered_lower;
404 
405  #if ! defined( KMP_GOMP_COMPAT )
406  if ( __kmp_env_consistency_check ) {
407  if ( pr->ordered_bumped ) {
408  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
409  __kmp_error_construct2(
410  kmp_i18n_msg_CnsMultipleNesting,
411  ct_ordered_in_pdo, loc_ref,
412  & p->stack_data[ p->w_top ]
413  );
414  }
415  }
416  #endif /* !defined(KMP_GOMP_COMPAT) */
417 
418  KMP_MB();
419  #ifdef KMP_DEBUG
420  {
421  const char * buff;
422  // create format specifiers before the debug output
423  buff = __kmp_str_format(
424  "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
425  traits_t< UT >::spec, traits_t< UT >::spec );
426  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
427  __kmp_str_free( &buff );
428  }
429  #endif
430 
431  __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
432  USE_ITT_BUILD_ARG( NULL )
433  );
434  KMP_MB(); /* is this necessary? */
435  #ifdef KMP_DEBUG
436  {
437  const char * buff;
438  // create format specifiers before the debug output
439  buff = __kmp_str_format(
440  "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
441  traits_t< UT >::spec, traits_t< UT >::spec );
442  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
443  __kmp_str_free( &buff );
444  }
445  #endif
446  }
447  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
448 }
449 
450 static void
451 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
452 {
453  kmp_info_t *th;
454 
455  if ( __kmp_env_consistency_check ) {
456  th = __kmp_threads[*gtid_ref];
457  if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
458  __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
459  }
460  }
461 }
462 
463 template< typename UT >
464 static void
465 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
466 {
467  typedef typename traits_t< UT >::signed_t ST;
468  dispatch_private_info_template< UT > * pr;
469 
470  int gtid = *gtid_ref;
471 // int cid = *cid_ref;
472  kmp_info_t *th = __kmp_threads[ gtid ];
473  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
474 
475  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
476  if ( __kmp_env_consistency_check ) {
477  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
478  ( th -> th.th_dispatch -> th_dispatch_pr_current );
479  if ( pr -> pushed_ws != ct_none ) {
480  __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
481  }
482  }
483 
484  if ( ! th -> th.th_team -> t.t_serialized ) {
485  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
486  ( th -> th.th_dispatch -> th_dispatch_sh_current );
487 
488  if ( ! __kmp_env_consistency_check ) {
489  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
490  ( th -> th.th_dispatch -> th_dispatch_pr_current );
491  }
492 
493  KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
494  #if ! defined( KMP_GOMP_COMPAT )
495  if ( __kmp_env_consistency_check ) {
496  if ( pr->ordered_bumped != 0 ) {
497  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
498  /* How to test it? - OM */
499  __kmp_error_construct2(
500  kmp_i18n_msg_CnsMultipleNesting,
501  ct_ordered_in_pdo, loc_ref,
502  & p->stack_data[ p->w_top ]
503  );
504  }
505  }
506  #endif /* !defined(KMP_GOMP_COMPAT) */
507 
508  KMP_MB(); /* Flush all pending memory write invalidates. */
509 
510  pr->ordered_bumped += 1;
511 
512  KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
513  gtid, pr->ordered_bumped ) );
514 
515  KMP_MB(); /* Flush all pending memory write invalidates. */
516 
517  /* TODO use general release procedure? */
518  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
519 
520  KMP_MB(); /* Flush all pending memory write invalidates. */
521  }
522  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
523 }
524 
525 /* Computes and returns x to the power of y, where y must be a non-negative integer */
526 template< typename UT >
527 static __forceinline long double
528 __kmp_pow(long double x, UT y) {
529  long double s=1.0L;
530 
531  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
532  //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
533  while(y) {
534  if ( y & 1 )
535  s *= x;
536  x *= x;
537  y >>= 1;
538  }
539  return s;
540 }
541 
542 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
543  (the total number of unassigned iterations in chunks with index greater than or equal to idx).
544  __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
545  (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
546 */
547 template< typename T >
548 static __inline typename traits_t< T >::unsigned_t
549 __kmp_dispatch_guided_remaining(
550  T tc,
551  typename traits_t< T >::floating_t base,
552  typename traits_t< T >::unsigned_t idx
553 ) {
554  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
555  least for ICL 8.1, long double arithmetic may not really have
556  long double precision, even with /Qlong_double. Currently, we
557  workaround that in the caller code, by manipulating the FPCW for
558  Windows* OS on IA-32 architecture. The lack of precision is not
559  expected to be a correctness issue, though.
560  */
561  typedef typename traits_t< T >::unsigned_t UT;
562 
563  long double x = tc * __kmp_pow< UT >(base, idx);
564  UT r = (UT) x;
565  if ( x == r )
566  return r;
567  return r + 1;
568 }
569 
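// --- Illustrative note (not part of the original source) ------------------
// __kmp_dispatch_guided_remaining() returns ceil( tc * base^idx ): the
// iterations still unassigned once idx guided chunks have been handed out.
// Hedged worked example, assuming tc = 1000 and base = 7/8, i.e.
// (2*nproc - 1)/(2*nproc) for nproc = 4:
//   idx = 0  -> ceil( 1000 * 1.0000 ) = 1000   (nothing assigned yet)
//   idx = 4  -> ceil( 1000 * 0.5862 ) =  587
//   idx = 16 -> ceil( 1000 * 0.1181 ) =  119
// Chunk i therefore covers remaining(i) - remaining(i+1) iterations, which
// shrinks geometrically -- the defining property of guided scheduling.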
570 // Parameters of the guided-iterative algorithm:
571 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
572 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
573 // by default n = 2. For example, with n = 3 the chunk distribution will be flatter.
574 // With n = 1 the first chunk is the same as for the static schedule, i.e. trip_count / nproc.
575 static int guided_int_param = 2;
576 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
577 
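// --- Illustrative note (not part of the original source) ------------------
// Hedged worked example of the two parameters above, assuming nproc = 8 and
// chunk = 3 (values chosen only for illustration):
//   p2 = guided_int_param * nproc * ( chunk + 1 ) = 2 * 8 * 4 = 64
//        -> once fewer than 64 iterations remain, __kmp_dispatch_next falls
//           back to plain dynamic scheduling with chunks of size `chunk`.
//   p3 = guided_flt_param / nproc = 0.5 / 8 = 0.0625
//        -> while above that threshold, each grab takes about
//           remaining * 0.0625 iterations.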
578 // UT - unsigned flavor of T, ST - signed flavor of T,
579 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
580 template< typename T >
581 static void
582 __kmp_dispatch_init(
583  ident_t * loc,
584  int gtid,
585  enum sched_type schedule,
586  T lb,
587  T ub,
588  typename traits_t< T >::signed_t st,
589  typename traits_t< T >::signed_t chunk,
590  int push_ws
591 ) {
592  typedef typename traits_t< T >::unsigned_t UT;
593  typedef typename traits_t< T >::signed_t ST;
594  typedef typename traits_t< T >::floating_t DBL;
595  static const int ___kmp_size_type = sizeof( UT );
596 
597  int active;
598  T tc;
599  kmp_info_t * th;
600  kmp_team_t * team;
601  kmp_uint32 my_buffer_index;
602  dispatch_private_info_template< T > * pr;
603  dispatch_shared_info_template< UT > volatile * sh;
604 
605  KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
606  KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
607 
608  if ( ! TCR_4( __kmp_init_parallel ) )
609  __kmp_parallel_initialize();
610 
611  #ifdef KMP_DEBUG
612  {
613  const char * buff;
614  // create format specifiers before the debug output
615  buff = __kmp_str_format(
616  "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
617  traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
618  KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
619  __kmp_str_free( &buff );
620  }
621  #endif
622  /* setup data */
623  th = __kmp_threads[ gtid ];
624  team = th -> th.th_team;
625  active = ! team -> t.t_serialized;
626  th->th.th_ident = loc;
627 
628  if ( ! active ) {
629  pr = reinterpret_cast< dispatch_private_info_template< T >* >
630  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
631  } else {
632  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
633  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
634 
635  my_buffer_index = th->th.th_dispatch->th_disp_index ++;
636 
637  /* What happens when number of threads changes, need to resize buffer? */
638  pr = reinterpret_cast< dispatch_private_info_template< T > * >
639  ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
640  sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
641  ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
642  }
643 
644  /* Pick up the nomerge/ordered bits from the scheduling type */
645  if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
646  pr->nomerge = TRUE;
647  schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
648  } else {
649  pr->nomerge = FALSE;
650  }
651  pr->type_size = ___kmp_size_type; // remember the size of variables
652  if ( kmp_ord_lower & schedule ) {
653  pr->ordered = TRUE;
654  schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
655  } else {
656  pr->ordered = FALSE;
657  }
658  if ( schedule == kmp_sch_static ) {
659  schedule = __kmp_static;
660  } else {
661  if ( schedule == kmp_sch_runtime ) {
662  #if OMP_30_ENABLED
663  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
664  schedule = team -> t.t_sched.r_sched_type;
665  // Detail the schedule if needed (global controls are differentiated appropriately)
666  if ( schedule == kmp_sch_guided_chunked ) {
667  schedule = __kmp_guided;
668  } else if ( schedule == kmp_sch_static ) {
669  schedule = __kmp_static;
670  }
671  // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
672  chunk = team -> t.t_sched.chunk;
673  #else
674  kmp_r_sched_t r_sched = __kmp_get_schedule_global();
675  // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default
676  schedule = r_sched.r_sched_type;
677  chunk = r_sched.chunk;
678  #endif
679 
680  #ifdef KMP_DEBUG
681  {
682  const char * buff;
683  // create format specifiers before the debug output
684  buff = __kmp_str_format(
685  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
686  traits_t< ST >::spec );
687  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
688  __kmp_str_free( &buff );
689  }
690  #endif
691  } else {
692  if ( schedule == kmp_sch_guided_chunked ) {
693  schedule = __kmp_guided;
694  }
695  if ( chunk <= 0 ) {
696  chunk = KMP_DEFAULT_CHUNK;
697  }
698  }
699 
700  #if OMP_30_ENABLED
701  if ( schedule == kmp_sch_auto ) {
702  // mapping and differentiation: in the __kmp_do_serial_initialize()
703  schedule = __kmp_auto;
704  #ifdef KMP_DEBUG
705  {
706  const char * buff;
707  // create format specifiers before the debug output
708  buff = __kmp_str_format(
709  "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
710  traits_t< ST >::spec );
711  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
712  __kmp_str_free( &buff );
713  }
714  #endif
715  }
716  #endif // OMP_30_ENABLED
717 
718  /* guided analytical not safe for too many threads */
719  if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
720  schedule = kmp_sch_guided_iterative_chunked;
721  KMP_WARNING( DispatchManyThreads );
722  }
723  pr->u.p.parm1 = chunk;
724  }
725  KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
726  "unknown scheduling type" );
727 
728  pr->u.p.count = 0;
729 
730  if ( __kmp_env_consistency_check ) {
731  if ( st == 0 ) {
732  __kmp_error_construct(
733  kmp_i18n_msg_CnsLoopIncrZeroProhibited,
734  ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
735  );
736  }
737  }
738 
739  tc = ( ub - lb + st );
740  if ( st != 1 ) {
741  if ( st < 0 ) {
742  if ( lb < ub ) {
743  tc = 0; // zero-trip
744  } else { // lb >= ub
745  tc = (ST)tc / st; // convert to signed division
746  }
747  } else { // st > 0
748  if ( ub < lb ) {
749  tc = 0; // zero-trip
750  } else { // lb >= ub
751  tc /= st;
752  }
753  }
754  } else if ( ub < lb ) { // st == 1
755  tc = 0; // zero-trip
756  }
757 
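// --- Illustrative note (not part of the original source) ------------------
// Hedged worked examples of the trip-count computation above, with bounds
// chosen only for illustration:
//   lb = 0,  ub = 9,  st = 1   ->  tc = 9 - 0 + 1          = 10
//   lb = 0,  ub = 10, st = 3   ->  tc = (10 - 0 + 3) / 3   = 4   (0,3,6,9)
//   lb = 10, ub = 1,  st = -2  ->  tc = (1 - 10 - 2) / -2  = 5   (10,8,6,4,2)
//   lb = 5,  ub = 4,  st = 1   ->  tc = 0  (zero-trip loop)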
758  pr->u.p.lb = lb;
759  pr->u.p.ub = ub;
760  pr->u.p.st = st;
761  pr->u.p.tc = tc;
762 
763  #if KMP_OS_WINDOWS
764  pr->u.p.last_upper = ub + st;
765  #endif /* KMP_OS_WINDOWS */
766 
767  /* NOTE: only the active parallel region(s) have active ordered sections */
768 
769  if ( active ) {
770  if ( pr->ordered == 0 ) {
771  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
772  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
773  } else {
774  pr->ordered_bumped = 0;
775 
776  pr->u.p.ordered_lower = 1;
777  pr->u.p.ordered_upper = 0;
778 
779  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
780  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
781  }
782  }
783 
784  if ( __kmp_env_consistency_check ) {
785  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
786  if ( push_ws ) {
787  __kmp_push_workshare( gtid, ws, loc );
788  pr->pushed_ws = ws;
789  } else {
790  __kmp_check_workshare( gtid, ws, loc );
791  pr->pushed_ws = ct_none;
792  }
793  }
794 
795  switch ( schedule ) {
796  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
797  case kmp_sch_static_steal:
798  {
799  T nproc = team->t.t_nproc;
800  T ntc, init;
801 
802  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
803 
804  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
805  if ( nproc > 1 && ntc >= nproc ) {
806  T id = __kmp_tid_from_gtid(gtid);
807  T small_chunk, extras;
808 
809  small_chunk = ntc / nproc;
810  extras = ntc % nproc;
811 
812  init = id * small_chunk + ( id < extras ? id : extras );
813  pr->u.p.count = init;
814  pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
815 
816  pr->u.p.parm2 = lb;
817  //pr->pfields.parm3 = 0; // it's not used in static_steal
818  pr->u.p.parm4 = id;
819  pr->u.p.st = st;
820  break;
821  } else {
822  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
823  gtid ) );
824  schedule = kmp_sch_static_balanced;
825  /* too few iterations: fall-through to kmp_sch_static_balanced */
826  } // if
827  /* FALL-THROUGH to static balanced */
828  } // case
829  #endif
830  case kmp_sch_static_balanced:
831  {
832  T nproc = team->t.t_nproc;
833  T init, limit;
834 
835  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
836  gtid ) );
837 
838  if ( nproc > 1 ) {
839  T id = __kmp_tid_from_gtid(gtid);
840 
841  if ( tc < nproc ) {
842  if ( id < tc ) {
843  init = id;
844  limit = id;
845  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
846  } else {
847  pr->u.p.count = 1; /* means no more chunks to execute */
848  pr->u.p.parm1 = FALSE;
849  break;
850  }
851  } else {
852  T small_chunk = tc / nproc;
853  T extras = tc % nproc;
854  init = id * small_chunk + (id < extras ? id : extras);
855  limit = init + small_chunk - (id < extras ? 0 : 1);
856  pr->u.p.parm1 = (id == nproc - 1);
857  }
858  } else {
859  if ( tc > 0 ) {
860  init = 0;
861  limit = tc - 1;
862  pr->u.p.parm1 = TRUE;
863  } else {
864  // zero trip count
865  pr->u.p.count = 1; /* means no more chunks to execute */
866  pr->u.p.parm1 = FALSE;
867  break;
868  }
869  }
870  if ( st == 1 ) {
871  pr->u.p.lb = lb + init;
872  pr->u.p.ub = lb + limit;
873  } else {
874  T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
875  pr->u.p.lb = lb + init * st;
876  // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
877  if ( st > 0 ) {
878  pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
879  } else {
880  pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
881  }
882  }
883  if ( pr->ordered ) {
884  pr->u.p.ordered_lower = init;
885  pr->u.p.ordered_upper = limit;
886  }
887  break;
888  } // case
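 // --- Illustrative note (not part of the original source) -----------------
 // Hedged worked example of the small_chunk/extras split used above (the
 // static_steal setup applies the same arithmetic to chunk counts), assuming
 // tc = 10 iterations and nproc = 4:
 //   small_chunk = 10 / 4 = 2,  extras = 10 % 4 = 2
 //   id 0: init = 0*2 + 0 = 0,  limit = 0 + 2 - 0 = 2  -> iterations 0..2
 //   id 1: init = 1*2 + 1 = 3,  limit = 3 + 2 - 0 = 5  -> iterations 3..5
 //   id 2: init = 2*2 + 2 = 6,  limit = 6 + 2 - 1 = 7  -> iterations 6..7
 //   id 3: init = 3*2 + 2 = 8,  limit = 8 + 2 - 1 = 9  -> iterations 8..9
 // The first `extras` threads get one extra iteration each.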
889  case kmp_sch_guided_iterative_chunked :
890  {
891  T nproc = team->t.t_nproc;
892  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
893 
894  if ( nproc > 1 ) {
895  if ( (2L * chunk + 1 ) * nproc >= tc ) {
896  /* chunk size too large, switch to dynamic */
897  schedule = kmp_sch_dynamic_chunked;
898  } else {
899  // when remaining iters become less than parm2 - switch to dynamic
900  pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
901  *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
902  }
903  } else {
904  KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
905  schedule = kmp_sch_static_greedy;
906  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
907  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
908  pr->u.p.parm1 = tc;
909  } // if
910  } // case
911  break;
912  case kmp_sch_guided_analytical_chunked:
913  {
914  T nproc = team->t.t_nproc;
915  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
916 
917  if ( nproc > 1 ) {
918  if ( (2L * chunk + 1 ) * nproc >= tc ) {
919  /* chunk size too large, switch to dynamic */
920  schedule = kmp_sch_dynamic_chunked;
921  } else {
922  /* commonly used term: (2 nproc - 1)/(2 nproc) */
923  DBL x;
924 
925  #if KMP_OS_WINDOWS && KMP_ARCH_X86
926  /* Linux* OS already has 64-bit computation by default for
927  long double, and on Windows* OS on Intel(R) 64,
928  /Qlong_double doesn't work. On Windows* OS
929  on IA-32 architecture, we need to set precision to
930  64-bit instead of the default 53-bit. Even though long
931  double doesn't work on Windows* OS on Intel(R) 64, the
932  resulting lack of precision is not expected to impact
933  the correctness of the algorithm, but this has not been
934  mathematically proven.
935  */
936  // save original FPCW and set precision to 64-bit, as
937  // Windows* OS on IA-32 architecture defaults to 53-bit
938  unsigned int oldFpcw = _control87(0,0);
939  _control87(_PC_64,_MCW_PC); // 0,0x30000
940  #endif
941  /* value used for comparison in solver for cross-over point */
942  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
943 
944  /* crossover point--chunk indexes equal to or greater than
945  this point switch to dynamic-style scheduling */
946  UT cross;
947 
948  /* commonly used term: (2 nproc - 1)/(2 nproc) */
949  x = (long double)1.0 - (long double)0.5 / nproc;
950 
951  #ifdef KMP_DEBUG
952  { // test natural alignment
953  struct _test_a {
954  char a;
955  union {
956  char b;
957  DBL d;
958  };
959  } t;
960  ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
961  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
962  KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
963  }
964  #endif // KMP_DEBUG
965 
966  /* save the term in thread private dispatch structure */
967  *(DBL*)&pr->u.p.parm3 = x;
968 
969  /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
970  {
971  UT left, right, mid;
972  long double p;
973 
974  /* estimate initial upper and lower bound */
975 
976  /* doesn't matter what value right is as long as it is positive, but
977  it affects performance of the solver
978  */
979  right = 229;
980  p = __kmp_pow< UT >(x,right);
981  if ( p > target ) {
982  do{
983  p *= p;
984  right <<= 1;
985  } while(p>target && right < (1<<27));
986  left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
987  } else {
988  left = 0;
989  }
990 
991  /* bisection root-finding method */
992  while ( left + 1 < right ) {
993  mid = (left + right) / 2;
994  if ( __kmp_pow< UT >(x,mid) > target ) {
995  left = mid;
996  } else {
997  right = mid;
998  }
999  } // while
1000  cross = right;
1001  }
1002  /* assert sanity of computed crossover point */
1003  KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1004 
1005  /* save the crossover point in thread private dispatch structure */
1006  pr->u.p.parm2 = cross;
1007 
1008  // C75803
1009  #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1010  #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1011  #else
1012  #define GUIDED_ANALYTICAL_WORKAROUND (x)
1013  #endif
1014  /* dynamic-style scheduling offset */
1015  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1016  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1017  // restore FPCW
1018  _control87(oldFpcw,_MCW_PC);
1019  #endif
1020  } // if
1021  } else {
1022  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1023  gtid ) );
1024  schedule = kmp_sch_static_greedy;
1025  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1026  pr->u.p.parm1 = tc;
1027  } // if
1028  } // case
1029  break;
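 // --- Illustrative sketch (not part of the original source) ---------------
 // Self-contained model of the crossover search above: find the smallest
 // chunk index with x^index <= target by doubling an upper bound and then
 // bisecting. Plain unsigned long long / long double stand in for the
 // templated UT / DBL types, and the helper name is hypothetical.
 static unsigned long long find_crossover_example( long double x, long double target )
 {
     // binary exponentiation, same idea as __kmp_pow above
     auto powld = []( long double b, unsigned long long e ) -> long double {
         long double s = 1.0L;
         while ( e ) { if ( e & 1 ) s *= b; b *= b; e >>= 1; }
         return s;
     };
     unsigned long long left = 0, right = 229;   // initial guess, as in the code above
     long double p = powld( x, right );
     if ( p > target ) {
         do {                                    // square p while doubling right,
             p *= p;                             // so p stays equal to x^right
             right <<= 1;
         } while ( p > target && right < (1u << 27) );
         left = right >> 1;                      // previous (failed) upper bound
     }
     while ( left + 1 < right ) {                // bisection between the two bounds
         unsigned long long mid = ( left + right ) / 2;
         if ( powld( x, mid ) > target ) left = mid; else right = mid;
     }
     return right;                               // first index with x^index <= target
 }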
1030  case kmp_sch_static_greedy:
1031  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1032  pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1033  ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1034  tc;
1035  break;
1036  case kmp_sch_static_chunked :
1037  case kmp_sch_dynamic_chunked :
1038  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1039  break;
1040  case kmp_sch_trapezoidal :
1041  {
1042  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1043 
1044  T parm1, parm2, parm3, parm4;
1045  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1046 
1047  parm1 = chunk;
1048 
1049  /* F : size of the first cycle */
1050  parm2 = ( tc / (2 * team->t.t_nproc) );
1051 
1052  if ( parm2 < 1 ) {
1053  parm2 = 1;
1054  }
1055 
1056  /* L : size of the last cycle. Make sure the last cycle
1057  * is not larger than the first cycle.
1058  */
1059  if ( parm1 < 1 ) {
1060  parm1 = 1;
1061  } else if ( parm1 > parm2 ) {
1062  parm1 = parm2;
1063  }
1064 
1065  /* N : number of cycles */
1066  parm3 = ( parm2 + parm1 );
1067  parm3 = ( 2 * tc + parm3 - 1) / parm3;
1068 
1069  if ( parm3 < 2 ) {
1070  parm3 = 2;
1071  }
1072 
1073  /* sigma : decreasing incr of the trapezoid */
1074  parm4 = ( parm3 - 1 );
1075  parm4 = ( parm2 - parm1 ) / parm4;
1076 
1077  // pointless check, because parm4 >= 0 always
1078  //if ( parm4 < 0 ) {
1079  // parm4 = 0;
1080  //}
1081 
1082  pr->u.p.parm1 = parm1;
1083  pr->u.p.parm2 = parm2;
1084  pr->u.p.parm3 = parm3;
1085  pr->u.p.parm4 = parm4;
1086  } // case
1087  break;
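 // --- Illustrative note (not part of the original source) -----------------
 // Hedged worked example of the trapezoid self-scheduling parameters above,
 // assuming tc = 1000 iterations, nproc = 4 and chunk = 1:
 //   parm2 (first chunk F)    = 1000 / (2*4)               = 125
 //   parm1 (last chunk L)     = chunk, clamped to [1, F]   = 1
 //   parm3 (number of cycles) = ceil( 2*1000 / (125 + 1) ) = 16
 //   parm4 (decrement sigma)  = (125 - 1) / (16 - 1)       = 8
 // Successive chunks shrink from about 125 by 8 per cycle toward 1, so the
 // total they cover (~1040) is at least the 1000 iterations.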
1088 
1089  default:
1090  {
1091  __kmp_msg(
1092  kmp_ms_fatal, // Severity
1093  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1094  KMP_HNT( GetNewerLibrary ), // Hint
1095  __kmp_msg_null // Variadic argument list terminator
1096  );
1097  }
1098  break;
1099  } // switch
1100  pr->schedule = schedule;
1101  if ( active ) {
1102  /* The name of this buffer should be my_buffer_index when it's free to use it */
1103 
1104  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1105  gtid, my_buffer_index, sh->buffer_index) );
1106  __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1107  USE_ITT_BUILD_ARG( NULL )
1108  );
1109  // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1110  // *always* 32-bit integers.
1111  KMP_MB(); /* is this necessary? */
1112  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1113  gtid, my_buffer_index, sh->buffer_index) );
1114 
1115  th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1116  th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1117 #if USE_ITT_BUILD
1118  if ( pr->ordered ) {
1119  __kmp_itt_ordered_init( gtid );
1120  }; // if
1121 #endif /* USE_ITT_BUILD */
1122  }; // if
1123  #ifdef KMP_DEBUG
1124  {
1125  const char * buff;
1126  // create format specifiers before the debug output
1127  buff = __kmp_str_format(
1128  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1129  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1130  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1131  traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1132  traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1133  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1134  traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1135  KD_TRACE(10, ( buff,
1136  gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1137  pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1138  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1139  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1140  __kmp_str_free( &buff );
1141  }
1142  #endif
1143  #if ( KMP_STATIC_STEAL_ENABLED )
1144  if ( ___kmp_size_type < 8 ) {
1145  // It cannot be guaranteed that after execution of a loop with some other schedule kind
1146  // all the parm3 variables will contain the same value.
1147  // Even if all parm3 values were the same, there would still be a bad case, such as
1148  // using 0 and 1 rather than a program-lifetime increment.
1149  // So a dedicated variable is required; the 'static_steal_counter' is used.
1150  if( schedule == kmp_sch_static_steal ) {
1151  // Other threads will inspect this variable when searching for a victim.
1152  // This is a flag showing that other threads may steal from this thread since then.
1153  volatile T * p = &pr->u.p.static_steal_counter;
1154  *p = *p + 1;
1155  }
1156  }
1157  #endif // ( KMP_STATIC_STEAL_ENABLED )
1158 }
1159 
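// --- Illustrative sketch (not part of the original source) ----------------
// How the templates fit together: compiler-generated code for a dynamically
// scheduled loop conceptually calls __kmp_dispatch_init<T> once, then pulls
// chunks with __kmp_dispatch_next<T> (defined further below) until it
// returns 0. The function name is hypothetical and a positive stride is
// assumed; the real entry points are __kmpc_* wrappers outside this excerpt.
static void run_dynamic_loop_example( ident_t * loc, int gtid, kmp_int32 lb,
                                      kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, kmp_sch_dynamic_chunked,
                                      lb, ub, st, chunk, TRUE );
    kmp_int32 lower, upper, stride, last;
    while ( __kmp_dispatch_next< kmp_int32 >( loc, gtid, &last, &lower, &upper, &stride ) ) {
        for ( kmp_int32 i = lower; i <= upper; i += stride ) {
            ; /* loop body for iteration i */
        }
    }
}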
1160 /*
1161  * For ordered loops, either __kmp_dispatch_finish() should be called after
1162  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1163  * every chunk of iterations. If the ordered section(s) were not executed
1164  * for this iteration (or every iteration in this chunk), we need to set the
1165  * ordered iteration counters so that the next thread can proceed.
1166  */
1167 template< typename UT >
1168 static void
1169 __kmp_dispatch_finish( int gtid, ident_t *loc )
1170 {
1171  typedef typename traits_t< UT >::signed_t ST;
1172  kmp_info_t *th = __kmp_threads[ gtid ];
1173 
1174  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1175  if ( ! th -> th.th_team -> t.t_serialized ) {
1176 
1177  dispatch_private_info_template< UT > * pr =
1178  reinterpret_cast< dispatch_private_info_template< UT >* >
1179  ( th->th.th_dispatch->th_dispatch_pr_current );
1180  dispatch_shared_info_template< UT > volatile * sh =
1181  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1182  ( th->th.th_dispatch->th_dispatch_sh_current );
1183  KMP_DEBUG_ASSERT( pr );
1184  KMP_DEBUG_ASSERT( sh );
1185  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1186  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1187 
1188  if ( pr->ordered_bumped ) {
1189  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1190  gtid ) );
1191  pr->ordered_bumped = 0;
1192  } else {
1193  UT lower = pr->u.p.ordered_lower;
1194 
1195  #ifdef KMP_DEBUG
1196  {
1197  const char * buff;
1198  // create format specifiers before the debug output
1199  buff = __kmp_str_format(
1200  "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1201  traits_t< UT >::spec, traits_t< UT >::spec );
1202  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1203  __kmp_str_free( &buff );
1204  }
1205  #endif
1206 
1207  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1208  USE_ITT_BUILD_ARG(NULL)
1209  );
1210  KMP_MB(); /* is this necessary? */
1211  #ifdef KMP_DEBUG
1212  {
1213  const char * buff;
1214  // create format specifiers before the debug output
1215  buff = __kmp_str_format(
1216  "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1217  traits_t< UT >::spec, traits_t< UT >::spec );
1218  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1219  __kmp_str_free( &buff );
1220  }
1221  #endif
1222 
1223  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1224  } // if
1225  } // if
1226  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1227 }
1228 
1229 #ifdef KMP_GOMP_COMPAT
1230 
1231 template< typename UT >
1232 static void
1233 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1234 {
1235  typedef typename traits_t< UT >::signed_t ST;
1236  kmp_info_t *th = __kmp_threads[ gtid ];
1237 
1238  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1239  if ( ! th -> th.th_team -> t.t_serialized ) {
1240 // int cid;
1241  dispatch_private_info_template< UT > * pr =
1242  reinterpret_cast< dispatch_private_info_template< UT >* >
1243  ( th->th.th_dispatch->th_dispatch_pr_current );
1244  dispatch_shared_info_template< UT > volatile * sh =
1245  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1246  ( th->th.th_dispatch->th_dispatch_sh_current );
1247  KMP_DEBUG_ASSERT( pr );
1248  KMP_DEBUG_ASSERT( sh );
1249  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1250  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1251 
1252 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1253  UT lower = pr->u.p.ordered_lower;
1254  UT upper = pr->u.p.ordered_upper;
1255  UT inc = upper - lower + 1;
1256 
1257  if ( pr->ordered_bumped == inc ) {
1258  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1259  gtid ) );
1260  pr->ordered_bumped = 0;
1261  } else {
1262  inc -= pr->ordered_bumped;
1263 
1264  #ifdef KMP_DEBUG
1265  {
1266  const char * buff;
1267  // create format specifiers before the debug output
1268  buff = __kmp_str_format(
1269  "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1270  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1271  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1272  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1273  __kmp_str_free( &buff );
1274  }
1275  #endif
1276 
1277  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1278  USE_ITT_BUILD_ARG(NULL)
1279  );
1280 
1281  KMP_MB(); /* is this necessary? */
1282  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1283  gtid ) );
1284  pr->ordered_bumped = 0;
1286  #ifdef KMP_DEBUG
1287  {
1288  const char * buff;
1289  // create format specifiers before the debug output
1290  buff = __kmp_str_format(
1291  "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1292  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1293  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1294  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1295  __kmp_str_free( &buff );
1296  }
1297  #endif
1298 
1299  test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1300  }
1301 // }
1302  }
1303  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1304 }
1305 
1306 #endif /* KMP_GOMP_COMPAT */
1307 
1308 template< typename T >
1309 static int
1310 __kmp_dispatch_next(
1311  ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1312 ) {
1313 
1314  typedef typename traits_t< T >::unsigned_t UT;
1315  typedef typename traits_t< T >::signed_t ST;
1316  typedef typename traits_t< T >::floating_t DBL;
1317  static const int ___kmp_size_type = sizeof( UT );
1318 
1319  int status;
1320  dispatch_private_info_template< T > * pr;
1321  kmp_info_t * th = __kmp_threads[ gtid ];
1322  kmp_team_t * team = th -> th.th_team;
1323 
1324  #ifdef KMP_DEBUG
1325  {
1326  const char * buff;
1327  // create format specifiers before the debug output
1328  buff = __kmp_str_format(
1329  "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1330  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1331  KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1332  __kmp_str_free( &buff );
1333  }
1334  #endif
1335 
1336  if ( team -> t.t_serialized ) {
1337  /* NOTE: serialize this dispatch because we are not at the active level */
1338  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1339  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1340  KMP_DEBUG_ASSERT( pr );
1341 
1342  if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1343  *p_lb = 0;
1344  *p_ub = 0;
1345  if ( p_st != 0 ) {
1346  *p_st = 0;
1347  }
1348  if ( __kmp_env_consistency_check ) {
1349  if ( pr->pushed_ws != ct_none ) {
1350  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1351  }
1352  }
1353  } else if ( pr->nomerge ) {
1354  kmp_int32 last;
1355  T start;
1356  UT limit, trip, init;
1357  ST incr;
1358  T chunk = pr->u.p.parm1;
1359 
1360  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1361 
1362  init = chunk * pr->u.p.count++;
1363  trip = pr->u.p.tc - 1;
1364 
1365  if ( (status = (init <= trip)) == 0 ) {
1366  *p_lb = 0;
1367  *p_ub = 0;
1368  if ( p_st != 0 ) *p_st = 0;
1369  if ( __kmp_env_consistency_check ) {
1370  if ( pr->pushed_ws != ct_none ) {
1371  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1372  }
1373  }
1374  } else {
1375  start = pr->u.p.lb;
1376  limit = chunk + init - 1;
1377  incr = pr->u.p.st;
1378 
1379  if ( (last = (limit >= trip)) != 0 ) {
1380  limit = trip;
1381  #if KMP_OS_WINDOWS
1382  pr->u.p.last_upper = pr->u.p.ub;
1383  #endif /* KMP_OS_WINDOWS */
1384  }
1385  if ( p_last ) {
1386  *p_last = last;
1387  }
1388  if ( p_st != 0 ) {
1389  *p_st = incr;
1390  }
1391  if ( incr == 1 ) {
1392  *p_lb = start + init;
1393  *p_ub = start + limit;
1394  } else {
1395  *p_lb = start + init * incr;
1396  *p_ub = start + limit * incr;
1397  }
1398 
1399  if ( pr->ordered ) {
1400  pr->u.p.ordered_lower = init;
1401  pr->u.p.ordered_upper = limit;
1402  #ifdef KMP_DEBUG
1403  {
1404  const char * buff;
1405  // create format specifiers before the debug output
1406  buff = __kmp_str_format(
1407  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1408  traits_t< UT >::spec, traits_t< UT >::spec );
1409  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1410  __kmp_str_free( &buff );
1411  }
1412  #endif
1413  } // if
1414  } // if
1415  } else {
1416  pr->u.p.tc = 0;
1417 
1418  *p_lb = pr->u.p.lb;
1419  *p_ub = pr->u.p.ub;
1420  #if KMP_OS_WINDOWS
1421  pr->u.p.last_upper = *p_ub;
1422  #endif /* KMP_OS_WINDOWS */
1423 
1424  if ( p_st != 0 ) {
1425  *p_st = pr->u.p.st;
1426  }
1427  if ( p_last ) {
1428  *p_last = TRUE;
1429  }
1430  } // if
1431  #ifdef KMP_DEBUG
1432  {
1433  const char * buff;
1434  // create format specifiers before the debug output
1435  buff = __kmp_str_format(
1436  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1437  "p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
1438  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1439  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) );
1440  __kmp_str_free( &buff );
1441  }
1442  #endif
1443  return status;
1444  } else {
1445  kmp_int32 last = 0;
1446  dispatch_shared_info_template< UT > *sh;
1447  T start;
1448  ST incr;
1449  UT limit, trip, init;
1450 
1451  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1452  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1453 
1454  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1455  ( th->th.th_dispatch->th_dispatch_pr_current );
1456  KMP_DEBUG_ASSERT( pr );
1457  sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1458  ( th->th.th_dispatch->th_dispatch_sh_current );
1459  KMP_DEBUG_ASSERT( sh );
1460 
1461  if ( pr->u.p.tc == 0 ) {
1462  // zero trip count
1463  status = 0;
1464  } else {
1465  switch (pr->schedule) {
1466  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1467  case kmp_sch_static_steal:
1468  {
1469  T chunk = pr->u.p.parm1;
1470 
1471  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1472 
1473  trip = pr->u.p.tc - 1;
1474 
1475  if ( ___kmp_size_type > 4 ) {
1476  // Other threads do not look into the data of this thread,
1477  // so no volatile cast is necessary.
1478  init = ( pr->u.p.count )++;
1479  status = ( init < (UT)pr->u.p.ub );
1480  } else {
1481  typedef union {
1482  struct {
1483  UT count;
1484  T ub;
1485  } p;
1486  kmp_int64 b;
1487  } union_i4;
1488  // All operations on 'count' or 'ub' must be combined atomically together.
1489  // stealing implemented only for 4-byte indexes
1490  {
1491  union_i4 vold, vnew;
1492  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1493  vnew = vold;
1494  vnew.p.count++;
1495  while( ! KMP_COMPARE_AND_STORE_ACQ64(
1496  ( volatile kmp_int64* )&pr->u.p.count,
1497  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1498  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1499  KMP_CPU_PAUSE();
1500  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1501  vnew = vold;
1502  vnew.p.count++;
1503  }
1504  vnew = vold;
1505  init = vnew.p.count;
1506  status = ( init < (UT)vnew.p.ub ) ;
1507  }
1508 
1509  if( !status ) {
1510  kmp_info_t **other_threads = team->t.t_threads;
1511  int while_limit = 10;
1512  int while_index = 0;
1513 
1514  // TODO: algorithm of searching for a victim
1515  // should be cleaned up and measured
1516  while ( ( !status ) && ( while_limit != ++while_index ) ) {
1517  union_i4 vold, vnew;
1518  kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1519  T victimIdx = pr->u.p.parm4;
1520  T oldVictimIdx = victimIdx;
1521  dispatch_private_info_template< T > * victim;
1522 
1523  do {
1524  if( !victimIdx ) {
1525  victimIdx = team->t.t_nproc - 1;
1526  } else {
1527  --victimIdx;
1528  }
1529  victim = reinterpret_cast< dispatch_private_info_template< T >* >
1530  ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1531  } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1532  // TODO: think about a proper place of this test
1533  if ( ( !victim ) ||
1534  ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1535  (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1536  // TODO: delay would be nice
1537  continue;
1538  // the victim is not ready yet to participate in stealing
1539  // because the victim is still in kmp_init_dispatch
1540  }
1541  if ( oldVictimIdx == victimIdx ) {
1542  break;
1543  }
1544  pr->u.p.parm4 = victimIdx;
1545 
1546  while( 1 ) {
1547  vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1548  vnew = vold;
1549 
1550  KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1551  if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1552  break;
1553  }
1554  vnew.p.ub -= (remaining >> 2);
1555  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1556  #pragma warning( push )
1557  // disable warning on pointless comparison of unsigned with 0
1558  #pragma warning( disable: 186 )
1559  KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1560  #pragma warning( pop )
1561  // TODO: Should this be acquire or release?
1562  if ( KMP_COMPARE_AND_STORE_ACQ64(
1563  ( volatile kmp_int64 * )&victim->u.p.count,
1564  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1565  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1566  status = 1;
1567  while_index = 0;
1568  // now update own count and ub
1569  #if KMP_ARCH_X86
1570  // stealing executed on non-KMP_ARCH_X86 only
1571  // Atomic 64-bit write on ia32 is
1572  // unavailable, so we do this in steps.
1573  // This code is not tested.
1574  init = vold.p.count;
1575  pr->u.p.ub = 0;
1576  pr->u.p.count = init + 1;
1577  pr->u.p.ub = vnew.p.count;
1578  #else
1579  init = vnew.p.ub;
1580  vold.p.count = init + 1;
1581  // TODO: is it safe and enough?
1582  *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1583  #endif // KMP_ARCH_X86
1584  break;
1585  } // if
1586  KMP_CPU_PAUSE();
1587  } // while (1)
1588  } // while
1589  } // if
1590  } // if
1591  if ( !status ) {
1592  *p_lb = 0;
1593  *p_ub = 0;
1594  if ( p_st != 0 ) *p_st = 0;
1595  } else {
1596  start = pr->u.p.parm2;
1597  init *= chunk;
1598  limit = chunk + init - 1;
1599  incr = pr->u.p.st;
1600 
1601  KMP_DEBUG_ASSERT(init <= trip);
1602  if ( (last = (limit >= trip)) != 0 )
1603  limit = trip;
1604  if ( p_last ) {
1605  *p_last = last;
1606  }
1607  if ( p_st != 0 ) *p_st = incr;
1608 
1609  if ( incr == 1 ) {
1610  *p_lb = start + init;
1611  *p_ub = start + limit;
1612  } else {
1613  *p_lb = start + init * incr;
1614  *p_ub = start + limit * incr;
1615  }
1616 
1617  if ( pr->ordered ) {
1618  pr->u.p.ordered_lower = init;
1619  pr->u.p.ordered_upper = limit;
1620  #ifdef KMP_DEBUG
1621  {
1622  const char * buff;
1623  // create format specifiers before the debug output
1624  buff = __kmp_str_format(
1625  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1626  traits_t< UT >::spec, traits_t< UT >::spec );
1627  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1628  __kmp_str_free( &buff );
1629  }
1630  #endif
1631  } // if
1632  } // if
1633  break;
1634  } // case
1635  #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1636  case kmp_sch_static_balanced:
1637  {
1638  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1639  if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1640  pr->u.p.count = 1;
1641  *p_lb = pr->u.p.lb;
1642  *p_ub = pr->u.p.ub;
1643  last = pr->u.p.parm1;
1644  if ( p_last ) {
1645  *p_last = last;
1646  }
1647  if ( p_st )
1648  *p_st = pr->u.p.st;
1649  } else { /* no iterations to do */
1650  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1651  }
1652  if ( pr->ordered ) {
1653  #ifdef KMP_DEBUG
1654  {
1655  const char * buff;
1656  // create format specifiers before the debug output
1657  buff = __kmp_str_format(
1658  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1659  traits_t< UT >::spec, traits_t< UT >::spec );
1660  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1661  __kmp_str_free( &buff );
1662  }
1663  #endif
1664  } // if
1665  } // case
1666  break;
1667  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1668  case kmp_sch_static_chunked:
1669  {
1670  T parm1;
1671 
1672  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1673  gtid ) );
1674  parm1 = pr->u.p.parm1;
1675 
1676  trip = pr->u.p.tc - 1;
1677  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1678 
1679  if ( (status = (init <= trip)) != 0 ) {
1680  start = pr->u.p.lb;
1681  incr = pr->u.p.st;
1682  limit = parm1 + init - 1;
1683 
1684  if ( (last = (limit >= trip)) != 0 )
1685  limit = trip;
1686 
1687  if ( p_last ) {
1688  *p_last = last;
1689  }
1690  if ( p_st != 0 ) *p_st = incr;
1691 
1692  pr->u.p.count += team->t.t_nproc;
1693 
1694  if ( incr == 1 ) {
1695  *p_lb = start + init;
1696  *p_ub = start + limit;
1697  }
1698  else {
1699  *p_lb = start + init * incr;
1700  *p_ub = start + limit * incr;
1701  }
1702 
1703  if ( pr->ordered ) {
1704  pr->u.p.ordered_lower = init;
1705  pr->u.p.ordered_upper = limit;
1706  #ifdef KMP_DEBUG
1707  {
1708  const char * buff;
1709  // create format specifiers before the debug output
1710  buff = __kmp_str_format(
1711  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1712  traits_t< UT >::spec, traits_t< UT >::spec );
1713  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1714  __kmp_str_free( &buff );
1715  }
1716  #endif
1717  } // if
1718  } // if
1719  } // case
1720  break;
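  // Editor's note (illustrative, not part of the original source): for
  // kmp_sch_static_chunked the lower bound claimed above, in the canonical
  // [0, tc) iteration space, is
  //     init = parm1 * (count + tid)
  // with count advancing by nproc on every call, i.e. chunks are handed out
  // round-robin by thread id.  A hypothetical worked example, assuming count
  // starts at 0 after __kmp_dispatch_init, nproc == 4, chunk size parm1 == 5
  // and tid == 2:
  //   1st call: init = 5 * (0 + 2) = 10  -> iterations 10..14
  //   2nd call: init = 5 * (4 + 2) = 30  -> iterations 30..34
  // and so on until init exceeds trip = tc - 1.  For the merged
  // static_greedy case parm1 is presumably chosen at init time as the
  // per-thread block size, so each thread takes a single chunk.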
1721 
1722  case kmp_sch_dynamic_chunked:
1723  {
1724  T chunk = pr->u.p.parm1;
1725 
1726  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1727  gtid ) );
1728 
1729  init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1730  trip = pr->u.p.tc - 1;
1731 
1732  if ( (status = (init <= trip)) == 0 ) {
1733  *p_lb = 0;
1734  *p_ub = 0;
1735  if ( p_st != 0 ) *p_st = 0;
1736  } else {
1737  start = pr->u.p.lb;
1738  limit = chunk + init - 1;
1739  incr = pr->u.p.st;
1740 
1741  if ( (last = (limit >= trip)) != 0 )
1742  limit = trip;
1743  if ( p_last ) {
1744  *p_last = last;
1745  }
1746  if ( p_st != 0 ) *p_st = incr;
1747 
1748  if ( incr == 1 ) {
1749  *p_lb = start + init;
1750  *p_ub = start + limit;
1751  } else {
1752  *p_lb = start + init * incr;
1753  *p_ub = start + limit * incr;
1754  }
1755 
1756  if ( pr->ordered ) {
1757  pr->u.p.ordered_lower = init;
1758  pr->u.p.ordered_upper = limit;
1759  #ifdef KMP_DEBUG
1760  {
1761  const char * buff;
1762  // create format specifiers before the debug output
1763  buff = __kmp_str_format(
1764  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1765  traits_t< UT >::spec, traits_t< UT >::spec );
1766  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1767  __kmp_str_free( &buff );
1768  }
1769  #endif
1770  } // if
1771  } // if
1772  } // case
1773  break;
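  // Editor's note (illustrative, not part of the original source): for
  // kmp_sch_dynamic_chunked each call atomically claims the next chunk index
  // from the shared counter sh->u.s.iteration, so the lower bound in the
  // canonical [0, tc) space is init = chunk * fetch_and_inc(iteration).
  // A hypothetical worked example, assuming chunk == 8 and tc == 30
  // (trip == 29): successive claims return chunk indices 0, 1, 2, 3 and
  // yield the windows [0,7], [8,15], [16,23], [24,29] (the last one clamped
  // to trip); a further claim gives init = 32 > trip, so status == 0.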
1774 
1775  case kmp_sch_guided_iterative_chunked:
1776  {
1777  T chunkspec = pr->u.p.parm1;
1778  KD_TRACE(100,
1779  ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1780  trip = pr->u.p.tc;
1781  // Start atomic part of calculations
1782  while(1) {
1783  ST remaining; // signed, because it can be < 0
1784  init = sh->u.s.iteration; // shared value
1785  remaining = trip - init;
1786  if ( remaining <= 0 ) { // AC: need to compare with 0 first
1787  // nothing to do, don't try atomic op
1788  status = 0;
1789  break;
1790  }
1791  if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1792  // use dynamic-style schedule
1793  // atomically increment iterations, get old value
1794  init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1795  remaining = trip - init;
1796  if (remaining <= 0) {
1797  status = 0; // all iterations got by other threads
1798  } else {
1799  // got some iterations to work on
1800  status = 1;
1801  if ( (T)remaining > chunkspec ) {
1802  limit = init + chunkspec - 1;
1803  } else {
1804  last = 1; // the last chunk
1805  limit = init + remaining - 1;
1806  } // if
1807  } // if
1808  break;
1809  } // if
1810  limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1811  if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1812  // CAS was successful, chunk obtained
1813  status = 1;
1814  --limit;
1815  break;
1816  } // if
1817  } // while
1818  if ( status != 0 ) {
1819  start = pr->u.p.lb;
1820  incr = pr->u.p.st;
1821  if ( p_st != NULL )
1822  *p_st = incr;
1823  if ( p_last != NULL )
1824  *p_last = last;
1825  *p_lb = start + init * incr;
1826  *p_ub = start + limit * incr;
1827  if ( pr->ordered ) {
1828  pr->u.p.ordered_lower = init;
1829  pr->u.p.ordered_upper = limit;
1830  #ifdef KMP_DEBUG
1831  {
1832  const char * buff;
1833  // create format specifiers before the debug output
1834  buff = __kmp_str_format(
1835  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1836  traits_t< UT >::spec, traits_t< UT >::spec );
1837  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1838  __kmp_str_free( &buff );
1839  }
1840  #endif
1841  } // if
1842  } else {
1843  *p_lb = 0;
1844  *p_ub = 0;
1845  if ( p_st != NULL )
1846  *p_st = 0;
1847  } // if
1848  } // case
1849  break;
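  // Editor's note (illustrative, not part of the original source): in the
  // iterative guided branch each successful CAS advances the shared
  // iteration counter by a fraction of the remaining work,
  //     limit = init + (UT)(remaining * parm3),
  // where parm3 holds (as raw double bits) roughly 1/(K*nproc), with K == 2
  // by default (see the parm2 comparison above).  A hypothetical worked
  // example, assuming nproc == 4 (so remaining is scaled by ~1/8),
  // trip == 10000 and iteration == 0: the first claim sets iteration to 1250
  // and, after the --limit, hands out the chunk [0, 1249]; the next claim
  // hands out [1250, 2342], and so on, until the remaining count drops
  // below parm2 and the code falls back to plain dynamic chunks of size
  // chunkspec.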
1850 
1851  case kmp_sch_guided_analytical_chunked:
1852  {
1853  T chunkspec = pr->u.p.parm1;
1854  UT chunkIdx;
1855  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1856  /* for storing the original FPCW value for Windows* OS on
1857  IA-32 architecture (8-byte version) */
1858  unsigned int oldFpcw;
1859  unsigned int fpcwSet = 0;
1860  #endif
1861  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1862  gtid ) );
1863 
1864  trip = pr->u.p.tc;
1865 
1866  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1867  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1868 
1869  while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1870  chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1871  if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1872  --trip;
1873  /* use dynamic-style scheduling */
1874  init = chunkIdx * chunkspec + pr->u.p.count;
1875  /* need to verify init > 0 in case of overflow in the above calculation */
1876  if ( (status = (init > 0 && init <= trip)) != 0 ) {
1877  limit = init + chunkspec -1;
1878 
1879  if ( (last = (limit >= trip)) != 0 )
1880  limit = trip;
1881  }
1882  break;
1883  } else {
1884  /* use exponential-style scheduling */
1885  /* The following check works around the lack of long double precision on Windows* OS,
1886  which can otherwise cause init != 0 for chunkIdx == 0.
1887  */
1888  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1889  /* If we haven't already done so, save original
1890  FPCW and set precision to 64-bit, as Windows* OS
1891  on IA-32 architecture defaults to 53-bit */
1892  if ( !fpcwSet ) {
1893  oldFpcw = _control87(0,0);
1894  _control87(_PC_64,_MCW_PC);
1895  fpcwSet = 0x30000;
1896  }
1897  #endif
1898  if ( chunkIdx ) {
1899  init = __kmp_dispatch_guided_remaining< T >(
1900  trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1901  KMP_DEBUG_ASSERT(init);
1902  init = trip - init;
1903  } else
1904  init = 0;
1905  limit = trip - __kmp_dispatch_guided_remaining< T >(
1906  trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1907  KMP_ASSERT(init <= limit);
1908  if ( init < limit ) {
1909  KMP_DEBUG_ASSERT(limit <= trip);
1910  --limit;
1911  status = 1;
1912  break;
1913  } // if
1914  } // if
1915  } // while (1)
1916  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1917  /* restore FPCW if necessary
1918  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1919  */
1920  if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1921  _control87(oldFpcw,_MCW_PC);
1922  #endif
1923  if ( status != 0 ) {
1924  start = pr->u.p.lb;
1925  incr = pr->u.p.st;
1926  if ( p_st != NULL )
1927  *p_st = incr;
1928  if ( p_last != NULL )
1929  *p_last = last;
1930  *p_lb = start + init * incr;
1931  *p_ub = start + limit * incr;
1932  if ( pr->ordered ) {
1933  pr->u.p.ordered_lower = init;
1934  pr->u.p.ordered_upper = limit;
1935  #ifdef KMP_DEBUG
1936  {
1937  const char * buff;
1938  // create format specifiers before the debug output
1939  buff = __kmp_str_format(
1940  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1941  traits_t< UT >::spec, traits_t< UT >::spec );
1942  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1943  __kmp_str_free( &buff );
1944  }
1945  #endif
1946  }
1947  } else {
1948  *p_lb = 0;
1949  *p_ub = 0;
1950  if ( p_st != NULL )
1951  *p_st = 0;
1952  }
1953  } // case
1954  break;
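  // Editor's note (not part of the original source): in the analytical
  // branch above, __kmp_dispatch_guided_remaining(trip, base, k) — defined
  // earlier in this file — gives the number of iterations still unassigned
  // after the first k chunks, so chunk k covers the half-open range
  //     [ trip - remaining(k), trip - remaining(k + 1) )
  // (the --limit converts the exclusive bound into the inclusive limit used
  // here).  Once chunkIdx reaches parm2, the precomputed guided sequence is
  // exhausted and the dynamic-style path at the top of the while loop hands
  // out fixed chunks of chunkspec iterations instead.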
1955 
1956  case kmp_sch_trapezoidal:
1957  {
1958  UT index;
1959  T parm2 = pr->u.p.parm2;
1960  T parm3 = pr->u.p.parm3;
1961  T parm4 = pr->u.p.parm4;
1962  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1963  gtid ) );
1964 
1965  index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1966 
1967  init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1968  trip = pr->u.p.tc - 1;
1969 
1970  if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1971  *p_lb = 0;
1972  *p_ub = 0;
1973  if ( p_st != 0 ) *p_st = 0;
1974  } else {
1975  start = pr->u.p.lb;
1976  limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1977  incr = pr->u.p.st;
1978 
1979  if ( (last = (limit >= trip)) != 0 )
1980  limit = trip;
1981 
1982  if ( p_last != 0 ) {
1983  *p_last = last;
1984  }
1985  if ( p_st != 0 ) *p_st = incr;
1986 
1987  if ( incr == 1 ) {
1988  *p_lb = start + init;
1989  *p_ub = start + limit;
1990  } else {
1991  *p_lb = start + init * incr;
1992  *p_ub = start + limit * incr;
1993  }
1994 
1995  if ( pr->ordered ) {
1996  pr->u.p.ordered_lower = init;
1997  pr->u.p.ordered_upper = limit;
1998  #ifdef KMP_DEBUG
1999  {
2000  const char * buff;
2001  // create format specifiers before the debug output
2002  buff = __kmp_str_format(
2003  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2004  traits_t< UT >::spec, traits_t< UT >::spec );
2005  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2006  __kmp_str_free( &buff );
2007  }
2008  #endif
2009  } // if
2010  } // if
2011  } // case
2012  break;
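  // Editor's note (illustrative, not part of the original source): with the
  // trapezoidal schedule the k-th claimed index yields
  //     init(k) = k * (2*parm2 - (k-1)*parm4) / 2
  // so, ignoring integer truncation, chunk k spans
  //     init(k+1) - init(k) = parm2 - k*parm4
  // iterations: chunk sizes start at parm2 and shrink by parm4 each time,
  // while parm3 bounds how many chunks exist.  A hypothetical worked
  // example, assuming parm2 == 10 and parm4 == 2: successive chunks cover
  // 10, 8, 6, 4, 2 iterations, i.e. [0,9], [10,17], [18,23], [24,27],
  // [28,29].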
2013  } // switch
2014  } // if tc == 0;
2015 
2016  if ( status == 0 ) {
2017  UT num_done;
2018 
2019  num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2020  #ifdef KMP_DEBUG
2021  {
2022  const char * buff;
2023  // create format specifiers before the debug output
2024  buff = __kmp_str_format(
2025  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2026  traits_t< UT >::spec );
2027  KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2028  __kmp_str_free( &buff );
2029  }
2030  #endif
2031 
2032  if ( num_done == team->t.t_nproc-1 ) {
2033  /* NOTE: release this buffer to be reused */
2034 
2035  KMP_MB(); /* Flush all pending memory write invalidates. */
2036 
2037  sh->u.s.num_done = 0;
2038  sh->u.s.iteration = 0;
2039 
2040  /* TODO replace with general release procedure? */
2041  if ( pr->ordered ) {
2042  sh->u.s.ordered_iteration = 0;
2043  }
2044 
2045  KMP_MB(); /* Flush all pending memory write invalidates. */
2046 
2047  sh -> buffer_index += KMP_MAX_DISP_BUF;
2048  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2049  gtid, sh->buffer_index) );
2050 
2051  KMP_MB(); /* Flush all pending memory write invalidates. */
2052 
2053  } // if
2054  if ( __kmp_env_consistency_check ) {
2055  if ( pr->pushed_ws != ct_none ) {
2056  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2057  }
2058  }
2059 
2060  th -> th.th_dispatch -> th_deo_fcn = NULL;
2061  th -> th.th_dispatch -> th_dxo_fcn = NULL;
2062  th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2063  th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2064  } // if (status == 0)
2065 #if KMP_OS_WINDOWS
2066  else if ( last ) {
2067  pr->u.p.last_upper = pr->u.p.ub;
2068  }
2069 #endif /* KMP_OS_WINDOWS */
2070  } // if
2071 
2072  #ifdef KMP_DEBUG
2073  {
2074  const char * buff;
2075  // create format specifiers before the debug output
2076  buff = __kmp_str_format(
2077  "__kmp_dispatch_next: T#%%d normal case: " \
2078  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2079  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2080  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2081  __kmp_str_free( &buff );
2082  }
2083  #endif
2084  return status;
2085 }
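// Editor's note: as the code above shows, __kmp_dispatch_next() returns 1 and
// fills *p_lb and *p_ub (and *p_st / *p_last when those pointers are
// non-NULL) while chunks remain; it returns 0 once this thread is done, at
// which point the shared dispatch buffer is recycled (num_done reset,
// buffer_index advanced) and the thread's th_dispatch pointers are cleared.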
2086 
2087 //-----------------------------------------------------------------------------------------
2088 // Dispatch routines
2089 // Transfer call to template< type T >
2090 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2091 // T lb, T ub, ST st, ST chunk )
2092 extern "C" {
2093 
2109 void
2110 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2111  kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2112 {
2113  KMP_DEBUG_ASSERT( __kmp_init_serial );
2114  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2115 }
2119 void
2120 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2121  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2122 {
2123  KMP_DEBUG_ASSERT( __kmp_init_serial );
2124  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2125 }
2126 
2130 void
2131 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2132  kmp_int64 lb, kmp_int64 ub,
2133  kmp_int64 st, kmp_int64 chunk )
2134 {
2135  KMP_DEBUG_ASSERT( __kmp_init_serial );
2136  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2137 }
2138 
2142 void
2143 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2144  kmp_uint64 lb, kmp_uint64 ub,
2145  kmp_int64 st, kmp_int64 chunk )
2146 {
2147  KMP_DEBUG_ASSERT( __kmp_init_serial );
2148  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2149 }
2150 
2163 int
2164 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2165  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2166 {
2167  return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2168 }
2169 
2173 int
2174 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2175  kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2176 {
2177  return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2178 }
2179 
2183 int
2184 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2185  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2186 {
2187  return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2188 }
2189 
2193 int
2194 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2195  kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2196 {
2197  return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2198 }
2199 
2206 void
2207 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2208 {
2209  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2210 }
2211 
2215 void
2216 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2217 {
2218  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2219 }
2220 
2224 void
2225 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2226 {
2227  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2228 }
2229 
2233 void
2234 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2235 {
2236  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2237 }
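// Editor's note: the following guarded-out sketch is not part of the library;
// it only illustrates how compiler-generated code might drive the dispatch
// entry points above for a loop like "#pragma omp for schedule(dynamic, 4)"
// over i = 0..99.  The loc and gtid arguments are assumed to be supplied by
// the usual compiler-emitted bookkeeping; the function name is hypothetical.
#if 0
static void __kmp_example_dynamic_loop( ident_t *loc, kmp_int32 gtid )
{
    kmp_int32 lb, ub, st, last;

    // One init per thread, then keep asking for chunks until none remain.
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
                            0, 99, 1, 4 );
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st ) {
            /* ... loop body for iteration i ... */
        }
    }
    // For an ordered loop the compiler would additionally emit the matching
    // __kmpc_dispatch_fini_4() calls; a plain dynamic schedule needs none.
}
#endif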
2240 //-----------------------------------------------------------------------------------------
2241 // Non-template routines from kmp_dispatch.cpp used in other sources
2242 
2243 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2244  return value == checker;
2245 }
2246 
2247 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2248  return value != checker;
2249 }
2250 
2251 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2252  return value < checker;
2253 }
2254 
2255 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2256  return value >= checker;
2257 }
2258 
2259 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2260  return value <= checker;
2261 }
2262 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2263  return value == checker;
2264 }
2265 
2266 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2267  return value != checker;
2268 }
2269 
2270 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2271  return value < checker;
2272 }
2273 
2274 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2275  return value >= checker;
2276 }
2277 
2278 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2279  return value <= checker;
2280 }
2281 
2282 kmp_uint32
2283 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2284  kmp_uint32 checker,
2285  kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2286  , void * obj // Higher-level synchronization object, or NULL.
2287  )
2288 {
2289  // note: we may not belong to a team at this point
2290  register volatile kmp_uint32 * spin = spinner;
2291  register kmp_uint32 check = checker;
2292  register kmp_uint32 spins;
2293  register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2294  register kmp_uint32 r;
2295 
2296  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2297  KMP_INIT_YIELD( spins );
2298  // main wait spin loop
2299  while(!f(r = TCR_4(*spin), check)) {
2300  KMP_FSYNC_SPIN_PREPARE( obj );
2301  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2302  It causes problems with infinite recursion because of exit lock */
2303  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2304  __kmp_abort_thread(); */
2305 
2306  __kmp_static_delay(TRUE);
2307 
2308  /* if we have waited a bit, or are oversubscribed, yield */
2309  /* pause is in the following code */
2310  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2311  KMP_YIELD_SPIN( spins );
2312  }
2313  KMP_FSYNC_SPIN_ACQUIRED( obj );
2314  return r;
2315 }
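// Editor's note: a guarded-out illustration (not part of the library) of how
// __kmp_wait_yield_4() is typically used together with one of the predicates
// above; the flag variable is hypothetical.
#if 0
static void __kmp_example_wait_for_flag( volatile kmp_uint32 *flag )
{
    // Spin, with the usual pause/yield backoff, until *flag == 1.
    __kmp_wait_yield_4( flag, 1, __kmp_eq_4, NULL );
}
#endif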
2316 
2317 kmp_uint64
2318 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2319  kmp_uint64 checker,
2320  kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2321  , void * obj // Higher-level synchronization object, or NULL.
2322  )
2323 {
2324  // note: we may not belong to a team at this point
2325  register volatile kmp_uint64 * spin = spinner;
2326  register kmp_uint64 check = checker;
2327  register kmp_uint32 spins;
2328  register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2329  register kmp_uint64 r;
2330 
2331  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2332  KMP_INIT_YIELD( spins );
2333  // main wait spin loop
2334  while(!f(r = *spin, check))
2335  {
2336  KMP_FSYNC_SPIN_PREPARE( obj );
2337  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2338  It causes problems with infinite recursion because of exit lock */
2339  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2340  __kmp_abort_thread(); */
2341 
2342  __kmp_static_delay(TRUE);
2343 
2344  // if we are oversubscribed,
2345  // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2346  // pause is in the following code
2347  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2348  KMP_YIELD_SPIN( spins );
2349  }
2350  KMP_FSYNC_SPIN_ACQUIRED( obj );
2351  return r;
2352 }
2353 
2354 } // extern "C"
2355 
2356 #ifdef KMP_GOMP_COMPAT
2357 
2358 void
2359 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2360  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2361  kmp_int32 chunk, int push_ws )
2362 {
2363  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2364  push_ws );
2365 }
2366 
2367 void
2368 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2369  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2370  kmp_int32 chunk, int push_ws )
2371 {
2372  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2373  push_ws );
2374 }
2375 
2376 void
2377 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2378  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2379  kmp_int64 chunk, int push_ws )
2380 {
2381  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2382  push_ws );
2383 }
2384 
2385 void
2386 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2387  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2388  kmp_int64 chunk, int push_ws )
2389 {
2390  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2391  push_ws );
2392 }
2393 
2394 void
2395 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2396 {
2397  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2398 }
2399 
2400 void
2401 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2402 {
2403  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2404 }
2405 
2406 void
2407 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2408 {
2409  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2410 }
2411 
2412 void
2413 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2414 {
2415  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2416 }
2417 
2418 #endif /* KMP_GOMP_COMPAT */
2419 
2420 /* ------------------------------------------------------------------------ */
2421 /* ------------------------------------------------------------------------ */
2422 