LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /* Dynamic scheduling initialization and dispatch.
17  *
18  * NOTE: __kmp_nth is constant inside any dispatch loop; however,
19  * it may change between parallel regions. __kmp_max_nth
20  * is the largest value __kmp_nth may take; 1 is the smallest.
21  */
22 
23 // Need to raise Win version from XP to Vista here for support of
24 // InterlockedExchange64
25 #if defined(_WIN32_WINNT) && defined(_M_IX86)
26 #undef _WIN32_WINNT
27 #define _WIN32_WINNT 0x0502
28 #endif
29 
30 #include "kmp.h"
31 #include "kmp_error.h"
32 #include "kmp_i18n.h"
33 #include "kmp_itt.h"
34 #include "kmp_stats.h"
35 #include "kmp_str.h"
36 #if KMP_OS_WINDOWS && KMP_ARCH_X86
37 #include <float.h>
38 #endif
39 
40 #if OMPT_SUPPORT
41 #include "ompt-internal.h"
42 #include "ompt-specific.h"
43 #endif
44 
45 /* ------------------------------------------------------------------------ */
46 
47 #if KMP_STATIC_STEAL_ENABLED
48 
49 // replaces dispatch_private_info{32,64} structures and
50 // dispatch_private_info{32,64}_t types
51 template <typename T> struct dispatch_private_infoXX_template {
52  typedef typename traits_t<T>::unsigned_t UT;
53  typedef typename traits_t<T>::signed_t ST;
54  UT count; // unsigned
55  T ub;
56  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
57  T lb;
58  ST st; // signed
59  UT tc; // unsigned
60  T static_steal_counter; // for static_steal only; maybe better to put after ub
61 
62  /* parm[1-4] are used in different ways by different scheduling algorithms */
63 
64  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
65  // a) parm3 is properly aligned and
66  // b) all parm1-4 are in the same cache line.
67  // Because parm1-4 are used together, performance seems to be better
68  // if they are in the same cache line (not measured, though).
69 
70  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
71  T parm1;
72  T parm2;
73  T parm3;
74  T parm4;
75  };
76 
77  UT ordered_lower; // unsigned
78  UT ordered_upper; // unsigned
79 #if KMP_OS_WINDOWS
80  T last_upper;
81 #endif /* KMP_OS_WINDOWS */
82 };
83 
84 #else /* KMP_STATIC_STEAL_ENABLED */
85 
86 // replaces dispatch_private_info{32,64} structures and
87 // dispatch_private_info{32,64}_t types
88 template <typename T> struct dispatch_private_infoXX_template {
89  typedef typename traits_t<T>::unsigned_t UT;
90  typedef typename traits_t<T>::signed_t ST;
91  T lb;
92  T ub;
93  ST st; // signed
94  UT tc; // unsigned
95 
96  T parm1;
97  T parm2;
98  T parm3;
99  T parm4;
100 
101  UT count; // unsigned
102 
103  UT ordered_lower; // unsigned
104  UT ordered_upper; // unsigned
105 #if KMP_OS_WINDOWS
106  T last_upper;
107 #endif /* KMP_OS_WINDOWS */
108 };
109 
110 #endif /* KMP_STATIC_STEAL_ENABLED */
111 
112 // replaces dispatch_private_info structure and dispatch_private_info_t type
113 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
114  // duplicate alignment here, otherwise size of structure is not correct in our
115  // compiler
116  union KMP_ALIGN_CACHE private_info_tmpl {
117  dispatch_private_infoXX_template<T> p;
118  dispatch_private_info64_t p64;
119  } u;
120  enum sched_type schedule; /* scheduling algorithm */
121  kmp_uint32 ordered; /* ordered clause specified */
122  kmp_uint32 ordered_bumped;
123  // To retain the structure size after making ordered_iteration scalar
124  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
125  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
126  kmp_uint32 nomerge; /* don't merge iters if serialized */
127  kmp_uint32 type_size;
128  enum cons_type pushed_ws;
129 };
130 
131 // replaces dispatch_shared_info{32,64} structures and
132 // dispatch_shared_info{32,64}_t types
133 template <typename UT> struct dispatch_shared_infoXX_template {
134  /* chunk index under dynamic, number of idle threads under static-steal;
135  iteration index otherwise */
136  volatile UT iteration;
137  volatile UT num_done;
138  volatile UT ordered_iteration;
139  // to retain the structure size after making ordered_iteration scalar
140  UT ordered_dummy[KMP_MAX_ORDERED - 3];
141 };
142 
143 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
144 template <typename UT> struct dispatch_shared_info_template {
145  // we need union here to keep the structure size
146  union shared_info_tmpl {
147  dispatch_shared_infoXX_template<UT> s;
148  dispatch_shared_info64_t s64;
149  } u;
150  volatile kmp_uint32 buffer_index;
151 #if OMP_45_ENABLED
152  volatile kmp_int32 doacross_buf_idx; // teamwise index
153  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
154  kmp_int32 doacross_num_done; // count finished threads
155 #endif
156 #if KMP_USE_HWLOC
157  // When linking with libhwloc, the ORDERED EPCC test slows down on big
158  // machines (> 48 cores). Performance analysis showed that a cache thrash
159  // was occurring and this padding helps alleviate the problem.
160  char padding[64];
161 #endif
162 };
163 
164 /* ------------------------------------------------------------------------ */
165 
166 #undef USE_TEST_LOCKS
167 
168 // test_then_add template (general template should NOT be used)
169 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
170 
171 template <>
172 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
173  kmp_int32 d) {
174  kmp_int32 r;
175  r = KMP_TEST_THEN_ADD32(p, d);
176  return r;
177 }
178 
179 template <>
180 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
181  kmp_int64 d) {
182  kmp_int64 r;
183  r = KMP_TEST_THEN_ADD64(p, d);
184  return r;
185 }
186 
187 // test_then_inc_acq template (general template should NOT be used)
188 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
189 
190 template <>
191 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
192  kmp_int32 r;
193  r = KMP_TEST_THEN_INC_ACQ32(p);
194  return r;
195 }
196 
197 template <>
198 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
199  kmp_int64 r;
200  r = KMP_TEST_THEN_INC_ACQ64(p);
201  return r;
202 }
203 
204 // test_then_inc template (general template should NOT be used)
205 template <typename T> static __forceinline T test_then_inc(volatile T *p);
206 
207 template <>
208 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
209  kmp_int32 r;
210  r = KMP_TEST_THEN_INC32(p);
211  return r;
212 }
213 
214 template <>
215 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
216  kmp_int64 r;
217  r = KMP_TEST_THEN_INC64(p);
218  return r;
219 }
220 
221 // compare_and_swap template (general template should NOT be used)
222 template <typename T>
223 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
224 
225 template <>
226 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
227  kmp_int32 c, kmp_int32 s) {
228  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
229 }
230 
231 template <>
232 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
233  kmp_int64 c, kmp_int64 s) {
234  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
235 }
236 
237 /* Spin wait loop that first does pause, then yield.
238  Waits until the predicate returns non-zero when called with *spinner and check.
239  Does NOT put threads to sleep.
240 #if USE_ITT_BUILD
241  Arguments:
242  obj -- the higher-level synchronization object to report to ittnotify.
243  It is used to report locks consistently. For example, if a lock is
244  acquired immediately, its address is reported to ittnotify via
245  KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
246  and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
247  same address, not the address of the low-level spinner.
248 #endif // USE_ITT_BUILD
249 */
250 template <typename UT>
251 // ToDo: make inline function (move to header file for icl)
252 static UT // unsigned 4- or 8-byte type
253  __kmp_wait_yield(
254  volatile UT *spinner, UT checker,
255  kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
256  void *obj) // Higher-level synchronization object, or NULL.
257  ) {
258  // note: we may not belong to a team at this point
259  volatile UT *spin = spinner;
260  UT check = checker;
261  kmp_uint32 spins;
262  kmp_uint32 (*f)(UT, UT) = pred;
263  UT r;
264 
265  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
266  KMP_INIT_YIELD(spins);
267  // main wait spin loop
268  while (!f(r = *spin, check)) {
269  KMP_FSYNC_SPIN_PREPARE(obj);
270  /* GEH - remove this since it was accidentally introduced when kmp_wait was
271  split. It causes problems with infinite recursion because of exit lock */
272  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
273  __kmp_abort_thread(); */
274 
275  // if we are oversubscribed, or have waited a bit (and
276  // KMP_LIBRARY=throughput), then yield. Pause is in the following code.
277  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
278  KMP_YIELD_SPIN(spins);
279  }
280  KMP_FSYNC_SPIN_ACQUIRED(obj);
281  return r;
282 }
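// Editor's note (not in the original source): an illustrative sketch of how the
// spin-wait above is typically invoked, using the predicate templates defined
// just below; this mirrors the calls made later in this file. For example,
// waiting for a shared counter to reach 'lower':
//
//   __kmp_wait_yield<kmp_uint32>(&sh->u.s.ordered_iteration, lower,
//                                __kmp_ge<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
//
// The loop re-reads *spinner on every pass and returns the last value read once
// pred(value, checker) is non-zero; it pauses/yields but does not put the
// thread to sleep.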
283 
284 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
285  return value == checker;
286 }
287 
288 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
289  return value != checker;
290 }
291 
292 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
293  return value < checker;
294 }
295 
296 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
297  return value >= checker;
298 }
299 
300 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
301  return value <= checker;
302 }
303 
304 /* ------------------------------------------------------------------------ */
305 
306 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
307  ident_t *loc_ref) {
308  kmp_info_t *th;
309 
310  KMP_DEBUG_ASSERT(gtid_ref);
311 
312  if (__kmp_env_consistency_check) {
313  th = __kmp_threads[*gtid_ref];
314  if (th->th.th_root->r.r_active &&
315  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
316 #if KMP_USE_DYNAMIC_LOCK
317  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
318 #else
319  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
320 #endif
321  }
322  }
323 }
324 
325 template <typename UT>
326 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
327  typedef typename traits_t<UT>::signed_t ST;
328  dispatch_private_info_template<UT> *pr;
329 
330  int gtid = *gtid_ref;
331  // int cid = *cid_ref;
332  kmp_info_t *th = __kmp_threads[gtid];
333  KMP_DEBUG_ASSERT(th->th.th_dispatch);
334 
335  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
336  if (__kmp_env_consistency_check) {
337  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
338  th->th.th_dispatch->th_dispatch_pr_current);
339  if (pr->pushed_ws != ct_none) {
340 #if KMP_USE_DYNAMIC_LOCK
341  __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
342 #else
343  __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
344 #endif
345  }
346  }
347 
348  if (!th->th.th_team->t.t_serialized) {
349  dispatch_shared_info_template<UT> *sh =
350  reinterpret_cast<dispatch_shared_info_template<UT> *>(
351  th->th.th_dispatch->th_dispatch_sh_current);
352  UT lower;
353 
354  if (!__kmp_env_consistency_check) {
355  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
356  th->th.th_dispatch->th_dispatch_pr_current);
357  }
358  lower = pr->u.p.ordered_lower;
359 
360 #if !defined(KMP_GOMP_COMPAT)
361  if (__kmp_env_consistency_check) {
362  if (pr->ordered_bumped) {
363  struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
364  __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
365  ct_ordered_in_pdo, loc_ref,
366  &p->stack_data[p->w_top]);
367  }
368  }
369 #endif /* !defined(KMP_GOMP_COMPAT) */
370 
371  KMP_MB();
372 #ifdef KMP_DEBUG
373  {
374  const char *buff;
375  // create format specifiers before the debug output
376  buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
377  "ordered_iter:%%%s lower:%%%s\n",
378  traits_t<UT>::spec, traits_t<UT>::spec);
379  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
380  __kmp_str_free(&buff);
381  }
382 #endif
383 
384  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
385  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
386  KMP_MB(); /* is this necessary? */
387 #ifdef KMP_DEBUG
388  {
389  const char *buff;
390  // create format specifiers before the debug output
391  buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
392  "ordered_iter:%%%s lower:%%%s\n",
393  traits_t<UT>::spec, traits_t<UT>::spec);
394  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
395  __kmp_str_free(&buff);
396  }
397 #endif
398  }
399  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
400 }
401 
402 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
403  ident_t *loc_ref) {
404  kmp_info_t *th;
405 
406  if (__kmp_env_consistency_check) {
407  th = __kmp_threads[*gtid_ref];
408  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
409  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
410  }
411  }
412 }
413 
414 template <typename UT>
415 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
416  typedef typename traits_t<UT>::signed_t ST;
417  dispatch_private_info_template<UT> *pr;
418 
419  int gtid = *gtid_ref;
420  // int cid = *cid_ref;
421  kmp_info_t *th = __kmp_threads[gtid];
422  KMP_DEBUG_ASSERT(th->th.th_dispatch);
423 
424  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
425  if (__kmp_env_consistency_check) {
426  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
427  th->th.th_dispatch->th_dispatch_pr_current);
428  if (pr->pushed_ws != ct_none) {
429  __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
430  }
431  }
432 
433  if (!th->th.th_team->t.t_serialized) {
434  dispatch_shared_info_template<UT> *sh =
435  reinterpret_cast<dispatch_shared_info_template<UT> *>(
436  th->th.th_dispatch->th_dispatch_sh_current);
437 
438  if (!__kmp_env_consistency_check) {
439  pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
440  th->th.th_dispatch->th_dispatch_pr_current);
441  }
442 
443  KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
444 #if !defined(KMP_GOMP_COMPAT)
445  if (__kmp_env_consistency_check) {
446  if (pr->ordered_bumped != 0) {
447  struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
448  /* How to test it? - OM */
449  __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
450  ct_ordered_in_pdo, loc_ref,
451  &p->stack_data[p->w_top]);
452  }
453  }
454 #endif /* !defined(KMP_GOMP_COMPAT) */
455 
456  KMP_MB(); /* Flush all pending memory write invalidates. */
457 
458  pr->ordered_bumped += 1;
459 
460  KD_TRACE(1000,
461  ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
462  gtid, pr->ordered_bumped));
463 
464  KMP_MB(); /* Flush all pending memory write invalidates. */
465 
466  /* TODO use general release procedure? */
467  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
468 
469  KMP_MB(); /* Flush all pending memory write invalidates. */
470  }
471  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
472 }
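// Editor's note (summary, not in the original source): __kmp_dispatch_deo and
// __kmp_dispatch_dxo together implement the ordered-clause handshake: deo spins
// (via __kmp_wait_yield with __kmp_ge) until the shared ordered_iteration
// reaches this thread's ordered_lower, and dxo then bumps ordered_iteration so
// the thread owning the next iteration can enter its ordered region.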
473 
474 // Computes and returns x to the power of y, where y must be a non-negative integer
475 template <typename UT>
476 static __forceinline long double __kmp_pow(long double x, UT y) {
477  long double s = 1.0L;
478 
479  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
480  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
481  while (y) {
482  if (y & 1)
483  s *= x;
484  x *= x;
485  y >>= 1;
486  }
487  return s;
488 }
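// Editor's note: __kmp_pow above is binary exponentiation, so it needs only
// O(log2(y)) multiplications. A worked example with x = 0.5, y = 5 (binary 101):
//   y = 5: low bit set   -> s = 0.5,     x = 0.25
//   y = 2: low bit clear ->              x = 0.0625
//   y = 1: low bit set   -> s = 0.03125, x = 0.00390625
// giving s = 0.5^5 = 0.03125.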
489 
490 /* Computes and returns the number of unassigned iterations after idx chunks
491  have been assigned (the total number of unassigned iterations in chunks with
492  index greater than or equal to idx). __forceinline seems to be broken here:
493  if we __forceinline this function, the behavior is wrong
494  (one of the unit tests, sch_guided_analytical_basic.cpp, fails). */
495 template <typename T>
496 static __inline typename traits_t<T>::unsigned_t
497 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
498  typename traits_t<T>::unsigned_t idx) {
499  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
500  ICL 8.1, long double arithmetic may not really have long double precision,
501  even with /Qlong_double. Currently, we workaround that in the caller code,
502  by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
503  of precision is not expected to be a correctness issue, though. */
504  typedef typename traits_t<T>::unsigned_t UT;
505 
506  long double x = tc * __kmp_pow<UT>(base, idx);
507  UT r = (UT)x;
508  if (x == r)
509  return r;
510  return r + 1;
511 }
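// Editor's note (worked example with assumed values): with tc = 1000
// iterations, base = 0.875 (i.e. 1 - 0.5/nproc for nproc = 4) and idx = 4,
// remaining = ceil(1000 * 0.875^4) = ceil(586.18...) = 587, i.e. 587
// iterations are still unassigned after the first 4 guided chunks.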
512 
513 // Parameters of the guided-iterative algorithm:
514 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
515 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
516 // By default n = 2. For example, with n = 3 the chunk distribution will be
517 // flatter.
518 // With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
519 static int guided_int_param = 2;
520 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
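// Editor's note (worked example with assumed values): with the defaults above
// (n = 2), a team of nproc = 4 and chunk = 10, the guided-iterative scheme
// switches to plain dynamic chunks once fewer than
//   p2 = 2 * 4 * (10 + 1) = 88
// iterations remain; before that, each chunk is roughly
//   remaining * p3 = remaining * 0.5 / 4 = remaining / 8
// iterations.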
521 
522 // UT - unsigned flavor of T, ST - signed flavor of T,
523 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
524 template <typename T>
525 static void
526 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
527  T ub, typename traits_t<T>::signed_t st,
528  typename traits_t<T>::signed_t chunk, int push_ws) {
529  typedef typename traits_t<T>::unsigned_t UT;
530  typedef typename traits_t<T>::signed_t ST;
531  typedef typename traits_t<T>::floating_t DBL;
532 
533  int active;
534  T tc;
535  kmp_info_t *th;
536  kmp_team_t *team;
537  kmp_uint32 my_buffer_index;
538  dispatch_private_info_template<T> *pr;
539  dispatch_shared_info_template<UT> volatile *sh;
540 
541  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
542  sizeof(dispatch_private_info));
543  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
544  sizeof(dispatch_shared_info));
545 
546  if (!TCR_4(__kmp_init_parallel))
547  __kmp_parallel_initialize();
548 
549 #if INCLUDE_SSC_MARKS
550  SSC_MARK_DISPATCH_INIT();
551 #endif
552 #ifdef KMP_DEBUG
553  {
554  const char *buff;
555  // create format specifiers before the debug output
556  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
557  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
558  traits_t<ST>::spec, traits_t<T>::spec,
559  traits_t<T>::spec, traits_t<ST>::spec);
560  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
561  __kmp_str_free(&buff);
562  }
563 #endif
564  /* setup data */
565  th = __kmp_threads[gtid];
566  team = th->th.th_team;
567  active = !team->t.t_serialized;
568  th->th.th_ident = loc;
569 
570 #if USE_ITT_BUILD
571  kmp_uint64 cur_chunk = chunk;
572  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
573  __kmp_forkjoin_frames_mode == 3 &&
574  KMP_MASTER_GTID(gtid) &&
575 #if OMP_40_ENABLED
576  th->th.th_teams_microtask == NULL &&
577 #endif
578  team->t.t_active_level == 1;
579 #endif
580  if (!active) {
581  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
582  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
583  } else {
584  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
585  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
586 
587  my_buffer_index = th->th.th_dispatch->th_disp_index++;
588 
589  /* What happens when the number of threads changes? Do we need to resize the buffer? */
590  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
591  &th->th.th_dispatch
592  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
593  sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
594  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
595  }
596 
597 #if (KMP_STATIC_STEAL_ENABLED)
598  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
599  // AC: we now have only one implementation of stealing, so use it
600  schedule = kmp_sch_static_steal;
601  else
602 #endif
603  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
604 
605  /* Pick up the nomerge/ordered bits from the scheduling type */
606  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
607  pr->nomerge = TRUE;
608  schedule =
609  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
610  } else {
611  pr->nomerge = FALSE;
612  }
613  pr->type_size = traits_t<T>::type_size; // remember the size of variables
614  if (kmp_ord_lower & schedule) {
615  pr->ordered = TRUE;
616  schedule =
617  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
618  } else {
619  pr->ordered = FALSE;
620  }
621 
622  if (schedule == kmp_sch_static) {
623  schedule = __kmp_static;
624  } else {
625  if (schedule == kmp_sch_runtime) {
626  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
627  // not specified)
628  schedule = team->t.t_sched.r_sched_type;
629  // Detail the schedule if needed (global controls are differentiated
630  // appropriately)
631  if (schedule == kmp_sch_guided_chunked) {
632  schedule = __kmp_guided;
633  } else if (schedule == kmp_sch_static) {
634  schedule = __kmp_static;
635  }
636  // Use the chunk size specified by OMP_SCHEDULE (or default if not
637  // specified)
638  chunk = team->t.t_sched.chunk;
639 #if USE_ITT_BUILD
640  cur_chunk = chunk;
641 #endif
642 #ifdef KMP_DEBUG
643  {
644  const char *buff;
645  // create format specifiers before the debug output
646  buff = __kmp_str_format(
647  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
648  traits_t<ST>::spec);
649  KD_TRACE(10, (buff, gtid, schedule, chunk));
650  __kmp_str_free(&buff);
651  }
652 #endif
653  } else {
654  if (schedule == kmp_sch_guided_chunked) {
655  schedule = __kmp_guided;
656  }
657  if (chunk <= 0) {
658  chunk = KMP_DEFAULT_CHUNK;
659  }
660  }
661 
662  if (schedule == kmp_sch_auto) {
663  // mapping and differentiation are done in __kmp_do_serial_initialize()
664  schedule = __kmp_auto;
665 #ifdef KMP_DEBUG
666  {
667  const char *buff;
668  // create format specifiers before the debug output
669  buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
670  "schedule:%%d chunk:%%%s\n",
671  traits_t<ST>::spec);
672  KD_TRACE(10, (buff, gtid, schedule, chunk));
673  __kmp_str_free(&buff);
674  }
675 #endif
676  }
677 
678  /* guided analytical is not safe for too many threads */
679  if (schedule == kmp_sch_guided_analytical_chunked &&
680  th->th.th_team_nproc > 1 << 20) {
681  schedule = kmp_sch_guided_iterative_chunked;
682  KMP_WARNING(DispatchManyThreads);
683  }
684  if (schedule == kmp_sch_runtime_simd) {
685  // compiler provides simd_width in the chunk parameter
686  schedule = team->t.t_sched.r_sched_type;
687  // Detail the schedule if needed (global controls are differentiated
688  // appropriately)
689  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
690  schedule == __kmp_static) {
691  schedule = kmp_sch_static_balanced_chunked;
692  } else {
693  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
694  schedule = kmp_sch_guided_simd;
695  }
696  chunk = team->t.t_sched.chunk * chunk;
697  }
698 #if USE_ITT_BUILD
699  cur_chunk = chunk;
700 #endif
701 #ifdef KMP_DEBUG
702  {
703  const char *buff;
704  // create format specifiers before the debug output
705  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
706  " chunk:%%%s\n",
707  traits_t<ST>::spec);
708  KD_TRACE(10, (buff, gtid, schedule, chunk));
709  __kmp_str_free(&buff);
710  }
711 #endif
712  }
713  pr->u.p.parm1 = chunk;
714  }
715  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
716  "unknown scheduling type");
717 
718  pr->u.p.count = 0;
719 
720  if (__kmp_env_consistency_check) {
721  if (st == 0) {
722  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
723  (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
724  }
725  }
726  // compute trip count
727  if (st == 1) { // most common case
728  if (ub >= lb) {
729  tc = ub - lb + 1;
730  } else { // ub < lb
731  tc = 0; // zero-trip
732  }
733  } else if (st < 0) {
734  if (lb >= ub) {
735  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
736  // where the division needs to be unsigned regardless of the result type
737  tc = (UT)(lb - ub) / (-st) + 1;
738  } else { // lb < ub
739  tc = 0; // zero-trip
740  }
741  } else { // st > 0
742  if (ub >= lb) {
743  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
744  // where the division needs to be unsigned regardless of the result type
745  tc = (UT)(ub - lb) / st + 1;
746  } else { // ub < lb
747  tc = 0; // zero-trip
748  }
749  }
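// Editor's note: trip-count examples for the three cases above (assumed values):
//   lb = 0,  ub = 9,  st = 1  -> tc = 9 - 0 + 1        = 10
//   lb = 0,  ub = 99, st = 3  -> tc = (99 - 0) / 3 + 1 = 34
//   lb = 10, ub = 1,  st = -2 -> tc = (10 - 1) / 2 + 1 = 5
// The casts to UT matter when ub - lb does not fit in the signed type, e.g. the
// (i = -2B; i < 2B; i += 1B) style loops mentioned in the comments above.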
750 
751  // Any half-decent optimizer will remove this test when the blocks are empty
752  // since the macros expand to nothing when statistics are disabled.
753  if (schedule == __kmp_static) {
754  KMP_COUNT_BLOCK(OMP_FOR_static);
755  KMP_COUNT_VALUE(FOR_static_iterations, tc);
756  } else {
757  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
758  KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
759  }
760 
761  pr->u.p.lb = lb;
762  pr->u.p.ub = ub;
763  pr->u.p.st = st;
764  pr->u.p.tc = tc;
765 
766 #if KMP_OS_WINDOWS
767  pr->u.p.last_upper = ub + st;
768 #endif /* KMP_OS_WINDOWS */
769 
770  /* NOTE: only the active parallel region(s) have active ordered sections */
771 
772  if (active) {
773  if (pr->ordered == 0) {
774  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
775  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
776  } else {
777  pr->ordered_bumped = 0;
778 
779  pr->u.p.ordered_lower = 1;
780  pr->u.p.ordered_upper = 0;
781 
782  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
783  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
784  }
785  }
786 
787  if (__kmp_env_consistency_check) {
788  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
789  if (push_ws) {
790  __kmp_push_workshare(gtid, ws, loc);
791  pr->pushed_ws = ws;
792  } else {
793  __kmp_check_workshare(gtid, ws, loc);
794  pr->pushed_ws = ct_none;
795  }
796  }
797 
798  switch (schedule) {
799 #if (KMP_STATIC_STEAL_ENABLED)
800  case kmp_sch_static_steal: {
801  T nproc = th->th.th_team_nproc;
802  T ntc, init;
803 
804  KD_TRACE(100,
805  ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
806 
807  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
808  if (nproc > 1 && ntc >= nproc) {
809  KMP_COUNT_BLOCK(OMP_FOR_static_steal);
810  T id = __kmp_tid_from_gtid(gtid);
811  T small_chunk, extras;
812 
813  small_chunk = ntc / nproc;
814  extras = ntc % nproc;
815 
816  init = id * small_chunk + (id < extras ? id : extras);
817  pr->u.p.count = init;
818  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
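// Editor's note (worked example with assumed values): with tc = 103,
// chunk = 10 and nproc = 4 there are ntc = 11 chunks, so small_chunk = 2 and
// extras = 3. Threads 0..3 then own the chunk ranges [0,3), [3,6), [6,9) and
// [9,11): count is the first chunk this thread owns, ub is one past its last,
// and the first 'extras' threads each own one extra chunk.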
819 
820  pr->u.p.parm2 = lb;
821  // pr->pfields.parm3 = 0; // it's not used in static_steal
822  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
823  pr->u.p.st = st;
824  if (traits_t<T>::type_size > 4) {
825  // AC: TODO: check if 16-byte CAS available and use it to
826  // improve performance (probably wait for explicit request
827  // before spending time on this).
828  // For now use dynamically allocated per-thread lock,
829  // free memory in __kmp_dispatch_next when status==0.
830  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
831  th->th.th_dispatch->th_steal_lock =
832  (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
833  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
834  }
835  break;
836  } else {
837  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
838  "kmp_sch_static_balanced\n",
839  gtid));
840  schedule = kmp_sch_static_balanced;
841  /* too few iterations: fall-through to kmp_sch_static_balanced */
842  } // if
843  /* FALL-THROUGH to static balanced */
844  } // case
845 #endif
846  case kmp_sch_static_balanced: {
847  T nproc = th->th.th_team_nproc;
848  T init, limit;
849 
850  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
851  gtid));
852 
853  if (nproc > 1) {
854  T id = __kmp_tid_from_gtid(gtid);
855 
856  if (tc < nproc) {
857  if (id < tc) {
858  init = id;
859  limit = id;
860  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
861  } else {
862  pr->u.p.count = 1; /* means no more chunks to execute */
863  pr->u.p.parm1 = FALSE;
864  break;
865  }
866  } else {
867  T small_chunk = tc / nproc;
868  T extras = tc % nproc;
869  init = id * small_chunk + (id < extras ? id : extras);
870  limit = init + small_chunk - (id < extras ? 0 : 1);
871  pr->u.p.parm1 = (id == nproc - 1);
872  }
873  } else {
874  if (tc > 0) {
875  init = 0;
876  limit = tc - 1;
877  pr->u.p.parm1 = TRUE;
878  } else { // zero trip count
879  pr->u.p.count = 1; /* means no more chunks to execute */
880  pr->u.p.parm1 = FALSE;
881  break;
882  }
883  }
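// Editor's note (worked example with assumed values): with tc = 10 and
// nproc = 4 the balanced split above gives small_chunk = 2 and extras = 2, so
// the per-thread [init, limit] ranges are [0,2], [3,5], [6,7] and [8,9]; the
// first 'extras' threads get one extra iteration, and parm1 (which stores
// *plastiter) is set only for the thread owning the last range, here id = 3.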
884 #if USE_ITT_BUILD
885  // Calculate chunk for metadata report
886  if (itt_need_metadata_reporting)
887  cur_chunk = limit - init + 1;
888 #endif
889  if (st == 1) {
890  pr->u.p.lb = lb + init;
891  pr->u.p.ub = lb + limit;
892  } else {
893  // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
894  T ub_tmp = lb + limit * st;
895  pr->u.p.lb = lb + init * st;
896  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
897  // it exactly
898  if (st > 0) {
899  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
900  } else {
901  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
902  }
903  }
904  if (pr->ordered) {
905  pr->u.p.ordered_lower = init;
906  pr->u.p.ordered_upper = limit;
907  }
908  break;
909  } // case
910  case kmp_sch_static_balanced_chunked: {
911  // similar to balanced, but chunk adjusted to multiple of simd width
912  T nth = th->th.th_team_nproc;
913  KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
914  " -> falling-through to static_greedy\n",
915  gtid));
916  schedule = kmp_sch_static_greedy;
917  if (nth > 1)
918  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
919  else
920  pr->u.p.parm1 = tc;
921  break;
922  } // case
923  case kmp_sch_guided_iterative_chunked:
924  case kmp_sch_guided_simd: {
925  T nproc = th->th.th_team_nproc;
926  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
927  " case\n",
928  gtid));
929 
930  if (nproc > 1) {
931  if ((2L * chunk + 1) * nproc >= tc) {
932  /* chunk size too large, switch to dynamic */
933  schedule = kmp_sch_dynamic_chunked;
934  } else {
935  // when remaining iters become less than parm2 - switch to dynamic
936  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
937  *(double *)&pr->u.p.parm3 =
938  guided_flt_param / nproc; // may occupy parm3 and parm4
939  }
940  } else {
941  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
942  "kmp_sch_static_greedy\n",
943  gtid));
944  schedule = kmp_sch_static_greedy;
945  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
946  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
947  gtid));
948  pr->u.p.parm1 = tc;
949  } // if
950  } // case
951  break;
952  case kmp_sch_guided_analytical_chunked: {
953  T nproc = th->th.th_team_nproc;
954  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
955  " case\n",
956  gtid));
957  if (nproc > 1) {
958  if ((2L * chunk + 1) * nproc >= tc) {
959  /* chunk size too large, switch to dynamic */
960  schedule = kmp_sch_dynamic_chunked;
961  } else {
962  /* commonly used term: (2 nproc - 1)/(2 nproc) */
963  DBL x;
964 
965 #if KMP_OS_WINDOWS && KMP_ARCH_X86
966  /* Linux* OS already has 64-bit computation by default for long double,
967  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
968  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
969  instead of the default 53-bit. Even though long double doesn't work
970  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
971  expected to impact the correctness of the algorithm, but this has not
972  been mathematically proven. */
973  // save original FPCW and set precision to 64-bit, as
974  // Windows* OS on IA-32 architecture defaults to 53-bit
975  unsigned int oldFpcw = _control87(0, 0);
976  _control87(_PC_64, _MCW_PC); // 0,0x30000
977 #endif
978  /* value used for comparison in solver for cross-over point */
979  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
980 
981  /* crossover point--chunk indexes equal to or greater than
982  this point switch to dynamic-style scheduling */
983  UT cross;
984 
985  /* commonly used term: (2 nproc - 1)/(2 nproc) */
986  x = (long double)1.0 - (long double)0.5 / nproc;
987 
988 #ifdef KMP_DEBUG
989  { // test natural alignment
990  struct _test_a {
991  char a;
992  union {
993  char b;
994  DBL d;
995  };
996  } t;
997  ptrdiff_t natural_alignment =
998  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
999  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
1000  // long)natural_alignment );
1001  KMP_DEBUG_ASSERT(
1002  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
1003  }
1004 #endif // KMP_DEBUG
1005 
1006  /* save the term in thread private dispatch structure */
1007  *(DBL *)&pr->u.p.parm3 = x;
1008 
1009  /* solve for the crossover point to the nearest integer i for which C_i
1010  <= chunk */
1011  {
1012  UT left, right, mid;
1013  long double p;
1014 
1015  /* estimate initial upper and lower bound */
1016 
1017  /* doesn't matter what value right is as long as it is positive, but
1018  it affects performance of the solver */
1019  right = 229;
1020  p = __kmp_pow<UT>(x, right);
1021  if (p > target) {
1022  do {
1023  p *= p;
1024  right <<= 1;
1025  } while (p > target && right < (1 << 27));
1026  /* lower bound is previous (failed) estimate of upper bound */
1027  left = right >> 1;
1028  } else {
1029  left = 0;
1030  }
1031 
1032  /* bisection root-finding method */
1033  while (left + 1 < right) {
1034  mid = (left + right) / 2;
1035  if (__kmp_pow<UT>(x, mid) > target) {
1036  left = mid;
1037  } else {
1038  right = mid;
1039  }
1040  } // while
1041  cross = right;
1042  }
1043  /* assert sanity of computed crossover point */
1044  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1045  __kmp_pow<UT>(x, cross) <= target);
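// Editor's note (worked example with assumed values): for nproc = 4,
// chunk = 10 and tc = 10000, x = 1 - 0.5/4 = 0.875 and
// target = (2*10 + 1) * 4 / 10000 = 0.0084. The bisection then finds the
// smallest cross with 0.875^cross <= target, here cross = 36
// (0.875^35 ~= 0.0093 > 0.0084 >= 0.875^36 ~= 0.0082).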
1046 
1047  /* save the crossover point in thread private dispatch structure */
1048  pr->u.p.parm2 = cross;
1049 
1050 // C75803
1051 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
1052 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
1053 #else
1054 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1055 #endif
1056  /* dynamic-style scheduling offset */
1057  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
1058  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
1059  cross * chunk;
1060 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1061  // restore FPCW
1062  _control87(oldFpcw, _MCW_PC);
1063 #endif
1064  } // if
1065  } else {
1066  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
1067  "kmp_sch_static_greedy\n",
1068  gtid));
1069  schedule = kmp_sch_static_greedy;
1070  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1071  pr->u.p.parm1 = tc;
1072  } // if
1073  } // case
1074  break;
1075  case kmp_sch_static_greedy:
1076  KD_TRACE(100,
1077  ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
1078  pr->u.p.parm1 = (th->th.th_team_nproc > 1)
1079  ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
1080  : tc;
1081  break;
1082  case kmp_sch_static_chunked:
1083  case kmp_sch_dynamic_chunked:
1084  if (pr->u.p.parm1 <= 0) {
1085  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1086  }
1087  KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
1088  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
1089  gtid));
1090  break;
1091  case kmp_sch_trapezoidal: {
1092  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1093 
1094  T parm1, parm2, parm3, parm4;
1095  KD_TRACE(100,
1096  ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
1097 
1098  parm1 = chunk;
1099 
1100  /* F : size of the first cycle */
1101  parm2 = (tc / (2 * th->th.th_team_nproc));
1102 
1103  if (parm2 < 1) {
1104  parm2 = 1;
1105  }
1106 
1107  /* L : size of the last cycle. Make sure the last cycle is not larger
1108  than the first cycle. */
1109  if (parm1 < 1) {
1110  parm1 = 1;
1111  } else if (parm1 > parm2) {
1112  parm1 = parm2;
1113  }
1114 
1115  /* N : number of cycles */
1116  parm3 = (parm2 + parm1);
1117  parm3 = (2 * tc + parm3 - 1) / parm3;
1118 
1119  if (parm3 < 2) {
1120  parm3 = 2;
1121  }
1122 
1123  /* sigma : decreasing incr of the trapezoid */
1124  parm4 = (parm3 - 1);
1125  parm4 = (parm2 - parm1) / parm4;
1126 
1127  // pointless check, because parm4 >= 0 always
1128  // if ( parm4 < 0 ) {
1129  // parm4 = 0;
1130  //}
1131 
1132  pr->u.p.parm1 = parm1;
1133  pr->u.p.parm2 = parm2;
1134  pr->u.p.parm3 = parm3;
1135  pr->u.p.parm4 = parm4;
1136  } // case
1137  break;
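// Editor's note (worked example with assumed values): for the trapezoid case
// above, with tc = 1000, nproc = 4 and chunk = 10:
//   parm2 (first chunk size) = 1000 / 8         = 125
//   parm1 (last chunk size)  = 10
//   parm3 (cycle count)      = ceil(2000 / 135) = 15
//   parm4 (decrement)        = (125 - 10) / 14  = 8
// so successive chunks shrink roughly 125, 117, 109, ... toward the minimum.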
1138 
1139  default: {
1140  __kmp_msg(kmp_ms_fatal, // Severity
1141  KMP_MSG(UnknownSchedTypeDetected), // Primary message
1142  KMP_HNT(GetNewerLibrary), // Hint
1143  __kmp_msg_null // Variadic argument list terminator
1144  );
1145  } break;
1146  } // switch
1147  pr->schedule = schedule;
1148  if (active) {
1149  /* This buffer's index should equal my_buffer_index when it is free for this
1150  * thread to use */
1151 
1152  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
1153  "sh->buffer_index:%d\n",
1154  gtid, my_buffer_index, sh->buffer_index));
1155  __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1156  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1157  // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
1158  // my_buffer_index are *always* 32-bit integers.
1159  KMP_MB(); /* is this necessary? */
1160  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1161  "sh->buffer_index:%d\n",
1162  gtid, my_buffer_index, sh->buffer_index));
1163 
1164  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1165  th->th.th_dispatch->th_dispatch_sh_current =
1166  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1167 #if USE_ITT_BUILD
1168  if (pr->ordered) {
1169  __kmp_itt_ordered_init(gtid);
1170  }; // if
1171  // Report loop metadata
1172  if (itt_need_metadata_reporting) {
1173  // Only report metadata by master of active team at level 1
1174  kmp_uint64 schedtype = 0;
1175  switch (schedule) {
1176  case kmp_sch_static_chunked:
1177  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1178  break;
1179  case kmp_sch_static_greedy:
1180  cur_chunk = pr->u.p.parm1;
1181  break;
1182  case kmp_sch_dynamic_chunked:
1183  schedtype = 1;
1184  break;
1185  case kmp_sch_guided_iterative_chunked:
1186  case kmp_sch_guided_analytical_chunked:
1187  case kmp_sch_guided_simd:
1188  schedtype = 2;
1189  break;
1190  default:
1191  // Should we put this case under "static"?
1192  // case kmp_sch_static_steal:
1193  schedtype = 3;
1194  break;
1195  }
1196  __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1197  }
1198 #endif /* USE_ITT_BUILD */
1199  }; // if
1200 
1201 #ifdef KMP_DEBUG
1202  {
1203  const char *buff;
1204  // create format specifiers before the debug output
1205  buff = __kmp_str_format(
1206  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1207  "lb:%%%s ub:%%%s"
1208  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1209  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1210  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1211  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1212  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1213  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1214  KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1215  pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
1216  pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
1217  pr->u.p.parm3, pr->u.p.parm4));
1218  __kmp_str_free(&buff);
1219  }
1220 #endif
1221 #if (KMP_STATIC_STEAL_ENABLED)
1222 // It cannot be guaranteed that after execution of a loop with some other
1223 // schedule kind all the parm3 variables will contain the same value. Even if
1224 // all parm3 values were the same, there would still be a bad case, such as
1225 // using 0 and 1 rather than a program-lifetime increment. So a dedicated
1226 // variable is required; 'static_steal_counter' is used for this.
1227  if (schedule == kmp_sch_static_steal) {
1228  // Other threads will inspect this variable when searching for a victim.
1229  // This is a flag showing that other threads may steal from this thread
1230  // since then.
1231  volatile T *p = &pr->u.p.static_steal_counter;
1232  *p = *p + 1;
1233  }
1234 #endif // ( KMP_STATIC_STEAL_ENABLED )
1235 
1236 #if OMPT_SUPPORT && OMPT_TRACE
1237  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1238  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1239  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1240  ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1241  team_info->parallel_id, task_info->task_id, team_info->microtask);
1242  }
1243 #endif
1244 }
1245 
1246 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1247  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1248  * every chunk of iterations. If the ordered section(s) were not executed
1249  * for this iteration (or every iteration in this chunk), we need to set the
1250  * ordered iteration counters so that the next thread can proceed. */
1251 template <typename UT>
1252 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1253  typedef typename traits_t<UT>::signed_t ST;
1254  kmp_info_t *th = __kmp_threads[gtid];
1255 
1256  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1257  if (!th->th.th_team->t.t_serialized) {
1258 
1259  dispatch_private_info_template<UT> *pr =
1260  reinterpret_cast<dispatch_private_info_template<UT> *>(
1261  th->th.th_dispatch->th_dispatch_pr_current);
1262  dispatch_shared_info_template<UT> volatile *sh =
1263  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1264  th->th.th_dispatch->th_dispatch_sh_current);
1265  KMP_DEBUG_ASSERT(pr);
1266  KMP_DEBUG_ASSERT(sh);
1267  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1268  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1269 
1270  if (pr->ordered_bumped) {
1271  KD_TRACE(
1272  1000,
1273  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1274  gtid));
1275  pr->ordered_bumped = 0;
1276  } else {
1277  UT lower = pr->u.p.ordered_lower;
1278 
1279 #ifdef KMP_DEBUG
1280  {
1281  const char *buff;
1282  // create format specifiers before the debug output
1283  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1284  "ordered_iteration:%%%s lower:%%%s\n",
1285  traits_t<UT>::spec, traits_t<UT>::spec);
1286  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1287  __kmp_str_free(&buff);
1288  }
1289 #endif
1290 
1291  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1292  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1293  KMP_MB(); /* is this necessary? */
1294 #ifdef KMP_DEBUG
1295  {
1296  const char *buff;
1297  // create format specifiers before the debug output
1298  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1299  "ordered_iteration:%%%s lower:%%%s\n",
1300  traits_t<UT>::spec, traits_t<UT>::spec);
1301  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1302  __kmp_str_free(&buff);
1303  }
1304 #endif
1305 
1306  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1307  } // if
1308  } // if
1309  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1310 }
1311 
1312 #ifdef KMP_GOMP_COMPAT
1313 
1314 template <typename UT>
1315 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1316  typedef typename traits_t<UT>::signed_t ST;
1317  kmp_info_t *th = __kmp_threads[gtid];
1318 
1319  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1320  if (!th->th.th_team->t.t_serialized) {
1321  // int cid;
1322  dispatch_private_info_template<UT> *pr =
1323  reinterpret_cast<dispatch_private_info_template<UT> *>(
1324  th->th.th_dispatch->th_dispatch_pr_current);
1325  dispatch_shared_info_template<UT> volatile *sh =
1326  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1327  th->th.th_dispatch->th_dispatch_sh_current);
1328  KMP_DEBUG_ASSERT(pr);
1329  KMP_DEBUG_ASSERT(sh);
1330  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1331  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1332 
1333  // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1334  UT lower = pr->u.p.ordered_lower;
1335  UT upper = pr->u.p.ordered_upper;
1336  UT inc = upper - lower + 1;
1337 
1338  if (pr->ordered_bumped == inc) {
1339  KD_TRACE(
1340  1000,
1341  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1342  gtid));
1343  pr->ordered_bumped = 0;
1344  } else {
1345  inc -= pr->ordered_bumped;
1346 
1347 #ifdef KMP_DEBUG
1348  {
1349  const char *buff;
1350  // create format specifiers before the debug output
1351  buff = __kmp_str_format(
1352  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1353  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1354  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1355  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1356  __kmp_str_free(&buff);
1357  }
1358 #endif
1359 
1360  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1361  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1362 
1363  KMP_MB(); /* is this necessary? */
1364  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1365  "ordered_bumped to zero\n",
1366  gtid));
1367  pr->ordered_bumped = 0;
1369 #ifdef KMP_DEBUG
1370  {
1371  const char *buff;
1372  // create format specifiers before the debug output
1373  buff = __kmp_str_format(
1374  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1375  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1376  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1377  traits_t<UT>::spec);
1378  KD_TRACE(1000,
1379  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1380  __kmp_str_free(&buff);
1381  }
1382 #endif
1383 
1384  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1385  }
1386  // }
1387  }
1388  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1389 }
1390 
1391 #endif /* KMP_GOMP_COMPAT */
1392 
1393 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1394  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1395  is not called. */
1396 #if OMPT_SUPPORT && OMPT_TRACE
1397 #define OMPT_LOOP_END \
1398  if (status == 0) { \
1399  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1400  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1401  ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1402  ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1403  team_info->parallel_id, task_info->task_id); \
1404  } \
1405  }
1406 #else
1407 #define OMPT_LOOP_END // no-op
1408 #endif
1409 
1410 template <typename T>
1411 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1412  T *p_lb, T *p_ub,
1413  typename traits_t<T>::signed_t *p_st) {
1414 
1415  typedef typename traits_t<T>::unsigned_t UT;
1416  typedef typename traits_t<T>::signed_t ST;
1417  typedef typename traits_t<T>::floating_t DBL;
1418 
1419  // This is potentially slightly misleading: schedule(runtime) will appear here
1420  // even if the actual runtime schedule is static. (Which points out a
1421  // disadvantage of schedule(runtime): even when static scheduling is used it
1422  // costs more than a compile-time choice to use static scheduling would.)
1423  KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1424 
1425  int status;
1426  dispatch_private_info_template<T> *pr;
1427  kmp_info_t *th = __kmp_threads[gtid];
1428  kmp_team_t *team = th->th.th_team;
1429 
1430  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1431 #ifdef KMP_DEBUG
1432  {
1433  const char *buff;
1434  // create format specifiers before the debug output
1435  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1436  "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1437  traits_t<T>::spec, traits_t<T>::spec,
1438  traits_t<ST>::spec);
1439  KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1440  __kmp_str_free(&buff);
1441  }
1442 #endif
1443 
1444  if (team->t.t_serialized) {
1445  /* NOTE: serialize this dispatch because we are not at the active level */
1446  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1447  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1448  KMP_DEBUG_ASSERT(pr);
1449 
1450  if ((status = (pr->u.p.tc != 0)) == 0) {
1451  *p_lb = 0;
1452  *p_ub = 0;
1453  // if ( p_last != NULL )
1454  // *p_last = 0;
1455  if (p_st != NULL)
1456  *p_st = 0;
1457  if (__kmp_env_consistency_check) {
1458  if (pr->pushed_ws != ct_none) {
1459  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1460  }
1461  }
1462  } else if (pr->nomerge) {
1463  kmp_int32 last;
1464  T start;
1465  UT limit, trip, init;
1466  ST incr;
1467  T chunk = pr->u.p.parm1;
1468 
1469  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1470  gtid));
1471 
1472  init = chunk * pr->u.p.count++;
1473  trip = pr->u.p.tc - 1;
1474 
1475  if ((status = (init <= trip)) == 0) {
1476  *p_lb = 0;
1477  *p_ub = 0;
1478  // if ( p_last != NULL )
1479  // *p_last = 0;
1480  if (p_st != NULL)
1481  *p_st = 0;
1482  if (__kmp_env_consistency_check) {
1483  if (pr->pushed_ws != ct_none) {
1484  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1485  }
1486  }
1487  } else {
1488  start = pr->u.p.lb;
1489  limit = chunk + init - 1;
1490  incr = pr->u.p.st;
1491 
1492  if ((last = (limit >= trip)) != 0) {
1493  limit = trip;
1494 #if KMP_OS_WINDOWS
1495  pr->u.p.last_upper = pr->u.p.ub;
1496 #endif /* KMP_OS_WINDOWS */
1497  }
1498  if (p_last != NULL)
1499  *p_last = last;
1500  if (p_st != NULL)
1501  *p_st = incr;
1502  if (incr == 1) {
1503  *p_lb = start + init;
1504  *p_ub = start + limit;
1505  } else {
1506  *p_lb = start + init * incr;
1507  *p_ub = start + limit * incr;
1508  }
1509 
1510  if (pr->ordered) {
1511  pr->u.p.ordered_lower = init;
1512  pr->u.p.ordered_upper = limit;
1513 #ifdef KMP_DEBUG
1514  {
1515  const char *buff;
1516  // create format specifiers before the debug output
1517  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1518  "ordered_lower:%%%s ordered_upper:%%%s\n",
1519  traits_t<UT>::spec, traits_t<UT>::spec);
1520  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1521  pr->u.p.ordered_upper));
1522  __kmp_str_free(&buff);
1523  }
1524 #endif
1525  } // if
1526  } // if
1527  } else {
1528  pr->u.p.tc = 0;
1529  *p_lb = pr->u.p.lb;
1530  *p_ub = pr->u.p.ub;
1531 #if KMP_OS_WINDOWS
1532  pr->u.p.last_upper = *p_ub;
1533 #endif /* KMP_OS_WINDOWS */
1534  if (p_last != NULL)
1535  *p_last = TRUE;
1536  if (p_st != NULL)
1537  *p_st = pr->u.p.st;
1538  } // if
1539 #ifdef KMP_DEBUG
1540  {
1541  const char *buff;
1542  // create format specifiers before the debug output
1543  buff = __kmp_str_format(
1544  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1545  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1546  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1547  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1548  __kmp_str_free(&buff);
1549  }
1550 #endif
1551 #if INCLUDE_SSC_MARKS
1552  SSC_MARK_DISPATCH_NEXT();
1553 #endif
1554  OMPT_LOOP_END;
1555  return status;
1556  } else {
1557  kmp_int32 last = 0;
1558  dispatch_shared_info_template<UT> *sh;
1559  T start;
1560  ST incr;
1561  UT limit, trip, init;
1562 
1563  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1564  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1565 
1566  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1567  th->th.th_dispatch->th_dispatch_pr_current);
1568  KMP_DEBUG_ASSERT(pr);
1569  sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1570  th->th.th_dispatch->th_dispatch_sh_current);
1571  KMP_DEBUG_ASSERT(sh);
1572 
1573  if (pr->u.p.tc == 0) {
1574  // zero trip count
1575  status = 0;
1576  } else {
1577  switch (pr->schedule) {
1578 #if (KMP_STATIC_STEAL_ENABLED)
1579  case kmp_sch_static_steal: {
1580  T chunk = pr->u.p.parm1;
1581  int nproc = th->th.th_team_nproc;
1582 
1583  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1584  gtid));
1585 
1586  trip = pr->u.p.tc - 1;
1587 
1588  if (traits_t<T>::type_size > 4) {
1589  // use lock for 8-byte and CAS for 4-byte induction
1590  // variable. TODO (optional): check and use 16-byte CAS
1591  kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1592  KMP_DEBUG_ASSERT(lck != NULL);
1593  if (pr->u.p.count < (UT)pr->u.p.ub) {
1594  __kmp_acquire_lock(lck, gtid);
1595  // try to get own chunk of iterations
1596  init = (pr->u.p.count)++;
1597  status = (init < (UT)pr->u.p.ub);
1598  __kmp_release_lock(lck, gtid);
1599  } else {
1600  status = 0; // no own chunks
1601  }
1602  if (!status) { // try to steal
1603  kmp_info_t **other_threads = team->t.t_threads;
1604  int while_limit = nproc; // nproc attempts to find a victim
1605  int while_index = 0;
1606  // TODO: algorithm of searching for a victim
1607  // should be cleaned up and measured
1608  while ((!status) && (while_limit != ++while_index)) {
1609  T remaining;
1610  T victimIdx = pr->u.p.parm4;
1611  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1612  dispatch_private_info_template<T> *victim =
1613  reinterpret_cast<dispatch_private_info_template<T> *>(
1614  other_threads[victimIdx]
1615  ->th.th_dispatch->th_dispatch_pr_current);
1616  while ((victim == NULL || victim == pr ||
1617  (*(volatile T *)&victim->u.p.static_steal_counter !=
1618  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1619  oldVictimIdx != victimIdx) {
1620  victimIdx = (victimIdx + 1) % nproc;
1621  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1622  other_threads[victimIdx]
1623  ->th.th_dispatch->th_dispatch_pr_current);
1624  };
1625  if (!victim ||
1626  (*(volatile T *)&victim->u.p.static_steal_counter !=
1627  *(volatile T *)&pr->u.p.static_steal_counter)) {
1628  continue; // try once more (nproc attempts in total)
1629  // no victim is ready yet to participate in stealing
1630  // because all victims are still in kmp_init_dispatch
1631  }
1632  if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1633  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1634  continue; // not enough chunks to steal, goto next victim
1635  }
1636 
1637  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1638  KMP_ASSERT(lck != NULL);
1639  __kmp_acquire_lock(lck, gtid);
1640  limit = victim->u.p.ub; // keep initial ub
1641  if (victim->u.p.count >= limit ||
1642  (remaining = limit - victim->u.p.count) < 2) {
1643  __kmp_release_lock(lck, gtid);
1644  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1645  continue; // not enough chunks to steal
1646  }
1647  // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1648  // or by 1
1649  if (remaining > 3) {
1650  KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1651  init = (victim->u.p.ub -=
1652  (remaining >> 2)); // steal 1/4 of remaining
1653  } else {
1654  KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1655  init =
1656  (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1657  }
1658  __kmp_release_lock(lck, gtid);
1659 
1660  KMP_DEBUG_ASSERT(init + 1 <= limit);
1661  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1662  status = 1;
1663  while_index = 0;
1664  // now update own count and ub with the stolen range, minus the init chunk just claimed
1665  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1666  pr->u.p.count = init + 1;
1667  pr->u.p.ub = limit;
1668  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1669  } // while (search for victim)
1670  } // if (try to find victim and steal)
1671  } else {
1672  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1673  typedef union {
1674  struct {
1675  UT count;
1676  T ub;
1677  } p;
1678  kmp_int64 b;
1679  } union_i4;
1680  // All operations on 'count' or 'ub' must be combined atomically
1681  // together.
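  // Editor's note: packing (count, ub) into one 64-bit word means a single
  // CAS both claims the next chunk and detects a concurrent thief shrinking
  // ub; whichever side loses the race simply reloads the pair and retries.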
1682  {
1683  union_i4 vold, vnew;
1684  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1685  vnew = vold;
1686  vnew.p.count++;
1687  while (!KMP_COMPARE_AND_STORE_ACQ64(
1688  (volatile kmp_int64 *)&pr->u.p.count,
1689  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1690  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1691  KMP_CPU_PAUSE();
1692  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1693  vnew = vold;
1694  vnew.p.count++;
1695  }
1696  vnew = vold;
1697  init = vnew.p.count;
1698  status = (init < (UT)vnew.p.ub);
1699  }
1700 
1701  if (!status) {
1702  kmp_info_t **other_threads = team->t.t_threads;
1703  int while_limit = nproc; // nproc attempts to find a victim
1704  int while_index = 0;
1705 
1706  // TODO: the algorithm for searching for a victim
1707  // should be cleaned up and measured
1708  while ((!status) && (while_limit != ++while_index)) {
1709  union_i4 vold, vnew;
1710  kmp_int32 remaining;
1711  T victimIdx = pr->u.p.parm4;
1712  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1713  dispatch_private_info_template<T> *victim =
1714  reinterpret_cast<dispatch_private_info_template<T> *>(
1715  other_threads[victimIdx]
1716  ->th.th_dispatch->th_dispatch_pr_current);
1717  while ((victim == NULL || victim == pr ||
1718  (*(volatile T *)&victim->u.p.static_steal_counter !=
1719  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1720  oldVictimIdx != victimIdx) {
1721  victimIdx = (victimIdx + 1) % nproc;
1722  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1723  other_threads[victimIdx]
1724  ->th.th_dispatch->th_dispatch_pr_current);
1725  }
1726  if (!victim ||
1727  (*(volatile T *)&victim->u.p.static_steal_counter !=
1728  *(volatile T *)&pr->u.p.static_steal_counter)) {
1729  continue; // try once more (nproc attempts in total)
1730  // no victim is ready yet to participate in stealing
1731  // because all victims are still in kmp_init_dispatch
1732  }
1733  pr->u.p.parm4 = victimIdx; // new victim found
1734  while (1) { // CAS loop if victim has enough chunks to steal
1735  vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1736  vnew = vold;
1737 
1738  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1739  if (vnew.p.count >= (UT)vnew.p.ub ||
1740  (remaining = vnew.p.ub - vnew.p.count) < 2) {
1741  pr->u.p.parm4 =
1742  (victimIdx + 1) % nproc; // shift start victim id
1743  break; // not enough chunks to steal, goto next victim
1744  }
1745  if (remaining > 3) {
1746  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1747  } else {
1748  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1749  }
1750  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1751  // TODO: Should this be acquire or release?
1752  if (KMP_COMPARE_AND_STORE_ACQ64(
1753  (volatile kmp_int64 *)&victim->u.p.count,
1754  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1755  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1756  // stealing succeeded
1757  KMP_COUNT_VALUE(FOR_static_steal_stolen,
1758  vold.p.ub - vnew.p.ub);
1759  status = 1;
1760  while_index = 0;
1761  // now update own count and ub
1762  init = vnew.p.ub;
1763  vold.p.count = init + 1;
1764 #if KMP_ARCH_X86
1765  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1766  vold.b);
1767 #else
1768  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1769 #endif
1770  break;
1771  } // if (check CAS result)
1772  KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1773  } // while (try to steal from particular victim)
1774  } // while (search for victim)
1775  } // if (try to find victim and steal)
1776  } // if (4-byte induction variable)
1777  if (!status) {
1778  *p_lb = 0;
1779  *p_ub = 0;
1780  if (p_st != NULL)
1781  *p_st = 0;
1782  } else {
1783  start = pr->u.p.parm2;
1784  init *= chunk;
1785  limit = chunk + init - 1;
1786  incr = pr->u.p.st;
1787  KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1788 
1789  KMP_DEBUG_ASSERT(init <= trip);
1790  if ((last = (limit >= trip)) != 0)
1791  limit = trip;
1792  if (p_st != NULL)
1793  *p_st = incr;
1794 
1795  if (incr == 1) {
1796  *p_lb = start + init;
1797  *p_ub = start + limit;
1798  } else {
1799  *p_lb = start + init * incr;
1800  *p_ub = start + limit * incr;
1801  }
1802 
1803  if (pr->ordered) {
1804  pr->u.p.ordered_lower = init;
1805  pr->u.p.ordered_upper = limit;
1806 #ifdef KMP_DEBUG
1807  {
1808  const char *buff;
1809  // create format specifiers before the debug output
1810  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1811  "ordered_lower:%%%s ordered_upper:%%%s\n",
1812  traits_t<UT>::spec, traits_t<UT>::spec);
1813  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1814  pr->u.p.ordered_upper));
1815  __kmp_str_free(&buff);
1816  }
1817 #endif
1818  } // if
1819  } // if
1820  break;
1821  } // case
1822 #endif // ( KMP_STATIC_STEAL_ENABLED )
1823  case kmp_sch_static_balanced: {
1824  KD_TRACE(
1825  100,
1826  ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1827  if ((status = !pr->u.p.count) !=
1828  0) { /* check if thread has any iteration to do */
1829  pr->u.p.count = 1;
1830  *p_lb = pr->u.p.lb;
1831  *p_ub = pr->u.p.ub;
1832  last = pr->u.p.parm1;
1833  if (p_st != NULL)
1834  *p_st = pr->u.p.st;
1835  } else { /* no iterations to do */
1836  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1837  }
1838  if (pr->ordered) {
1839 #ifdef KMP_DEBUG
1840  {
1841  const char *buff;
1842  // create format specifiers before the debug output
1843  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1844  "ordered_lower:%%%s ordered_upper:%%%s\n",
1845  traits_t<UT>::spec, traits_t<UT>::spec);
1846  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1847  pr->u.p.ordered_upper));
1848  __kmp_str_free(&buff);
1849  }
1850 #endif
1851  } // if
1852  } // case
1853  break;
1854  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1855  merged here */
1856  case kmp_sch_static_chunked: {
1857  T parm1;
1858 
1859  KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1860  "kmp_sch_static_[affinity|chunked] case\n",
1861  gtid));
1862  parm1 = pr->u.p.parm1;
1863 
1864  trip = pr->u.p.tc - 1;
1865  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
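  // Editor's note (worked example, values hypothetical): chunk indices are
  // dealt round-robin, so with nproc=4 and parm1=10 the thread with tid=2
  // takes iteration offsets 20..29 (from pr->u.p.lb, unit stride) on its
  // first pass here and 60..69 on the next, after count is bumped by nproc
  // below.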
1866 
1867  if ((status = (init <= trip)) != 0) {
1868  start = pr->u.p.lb;
1869  incr = pr->u.p.st;
1870  limit = parm1 + init - 1;
1871 
1872  if ((last = (limit >= trip)) != 0)
1873  limit = trip;
1874 
1875  if (p_st != NULL)
1876  *p_st = incr;
1877 
1878  pr->u.p.count += th->th.th_team_nproc;
1879 
1880  if (incr == 1) {
1881  *p_lb = start + init;
1882  *p_ub = start + limit;
1883  } else {
1884  *p_lb = start + init * incr;
1885  *p_ub = start + limit * incr;
1886  }
1887 
1888  if (pr->ordered) {
1889  pr->u.p.ordered_lower = init;
1890  pr->u.p.ordered_upper = limit;
1891 #ifdef KMP_DEBUG
1892  {
1893  const char *buff;
1894  // create format specifiers before the debug output
1895  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1896  "ordered_lower:%%%s ordered_upper:%%%s\n",
1897  traits_t<UT>::spec, traits_t<UT>::spec);
1898  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1899  pr->u.p.ordered_upper));
1900  __kmp_str_free(&buff);
1901  }
1902 #endif
1903  } // if
1904  } // if
1905  } // case
1906  break;
1907 
1908  case kmp_sch_dynamic_chunked: {
1909  T chunk = pr->u.p.parm1;
1910 
1911  KD_TRACE(
1912  100,
1913  ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1914 
1915  init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
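  // Editor's note: sh->u.s.iteration is a shared chunk counter handed out
  // first-come first-served; a thread that draws index k gets the k-th block
  // of `chunk` iterations (offset from pr->u.p.lb), clipped to the trip
  // count below.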
1916  trip = pr->u.p.tc - 1;
1917 
1918  if ((status = (init <= trip)) == 0) {
1919  *p_lb = 0;
1920  *p_ub = 0;
1921  if (p_st != NULL)
1922  *p_st = 0;
1923  } else {
1924  start = pr->u.p.lb;
1925  limit = chunk + init - 1;
1926  incr = pr->u.p.st;
1927 
1928  if ((last = (limit >= trip)) != 0)
1929  limit = trip;
1930 
1931  if (p_st != NULL)
1932  *p_st = incr;
1933 
1934  if (incr == 1) {
1935  *p_lb = start + init;
1936  *p_ub = start + limit;
1937  } else {
1938  *p_lb = start + init * incr;
1939  *p_ub = start + limit * incr;
1940  }
1941 
1942  if (pr->ordered) {
1943  pr->u.p.ordered_lower = init;
1944  pr->u.p.ordered_upper = limit;
1945 #ifdef KMP_DEBUG
1946  {
1947  const char *buff;
1948  // create format specifiers before the debug output
1949  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1950  "ordered_lower:%%%s ordered_upper:%%%s\n",
1951  traits_t<UT>::spec, traits_t<UT>::spec);
1952  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1953  pr->u.p.ordered_upper));
1954  __kmp_str_free(&buff);
1955  }
1956 #endif
1957  } // if
1958  } // if
1959  } // case
1960  break;
1961 
1962  case kmp_sch_guided_iterative_chunked: {
1963  T chunkspec = pr->u.p.parm1;
1964  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1965  "iterative case\n",
1966  gtid));
1967  trip = pr->u.p.tc;
1968  // Start atomic part of calculations
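  // Editor's note (illustrative): each successful pass through this loop
  // claims roughly remaining/(K*nproc) iterations (K=2 by default, see the
  // parm2/parm3 comments below), so chunk sizes shrink geometrically as the
  // loop drains; e.g. with nproc=4 and 1000 iterations left, one thread
  // would claim about 125, the next about 109, and so on, until fewer than
  // parm2 remain and plain chunks of `chunkspec` are handed out instead.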
1969  while (1) {
1970  ST remaining; // signed, because can be < 0
1971  init = sh->u.s.iteration; // shared value
1972  remaining = trip - init;
1973  if (remaining <= 0) { // AC: need to compare with 0 first
1974  // nothing to do, don't try atomic op
1975  status = 0;
1976  break;
1977  }
1978  if ((T)remaining <
1979  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1980  // use dynamic-style schedule
1981  // atomically increment iterations, get old value
1982  init = test_then_add<ST>(
1983  RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunkspec);
1984  remaining = trip - init;
1985  if (remaining <= 0) {
1986  status = 0; // all iterations got by other threads
1987  } else { // got some iterations to work on
1988  status = 1;
1989  if ((T)remaining > chunkspec) {
1990  limit = init + chunkspec - 1;
1991  } else {
1992  last = 1; // the last chunk
1993  limit = init + remaining - 1;
1994  } // if
1995  } // if
1996  break;
1997  } // if
1998  limit = init + (UT)(remaining *
1999  *(double *)&pr->u.p.parm3); // divide by K*nproc
2000  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2001  (ST)init, (ST)limit)) {
2002  // CAS was successful, chunk obtained
2003  status = 1;
2004  --limit;
2005  break;
2006  } // if
2007  } // while
2008  if (status != 0) {
2009  start = pr->u.p.lb;
2010  incr = pr->u.p.st;
2011  if (p_st != NULL)
2012  *p_st = incr;
2013  *p_lb = start + init * incr;
2014  *p_ub = start + limit * incr;
2015  if (pr->ordered) {
2016  pr->u.p.ordered_lower = init;
2017  pr->u.p.ordered_upper = limit;
2018 #ifdef KMP_DEBUG
2019  {
2020  const char *buff;
2021  // create format specifiers before the debug output
2022  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2023  "ordered_lower:%%%s ordered_upper:%%%s\n",
2024  traits_t<UT>::spec, traits_t<UT>::spec);
2025  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2026  pr->u.p.ordered_upper));
2027  __kmp_str_free(&buff);
2028  }
2029 #endif
2030  } // if
2031  } else {
2032  *p_lb = 0;
2033  *p_ub = 0;
2034  if (p_st != NULL)
2035  *p_st = 0;
2036  } // if
2037  } // case
2038  break;
2039 
2040  case kmp_sch_guided_simd: {
2041  // same as iterative but curr-chunk adjusted to be multiple of given
2042  // chunk
2043  T chunk = pr->u.p.parm1;
2044  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2045  gtid));
2046  trip = pr->u.p.tc;
2047  // Start atomic part of calculations
2048  while (1) {
2049  ST remaining; // signed, because can be < 0
2050  init = sh->u.s.iteration; // shared value
2051  remaining = trip - init;
2052  if (remaining <= 0) { // AC: need to compare with 0 first
2053  status = 0; // nothing to do, don't try atomic op
2054  break;
2055  }
2056  KMP_DEBUG_ASSERT(init % chunk == 0);
2057  // compare with K*nproc*(chunk+1), K=2 by default
2058  if ((T)remaining < pr->u.p.parm2) {
2059  // use dynamic-style schedule
2060  // atomically increment iterations, get old value
2061  init = test_then_add<ST>(
2062  RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunk);
2063  remaining = trip - init;
2064  if (remaining <= 0) {
2065  status = 0; // all iterations got by other threads
2066  } else {
2067  // got some iterations to work on
2068  status = 1;
2069  if ((T)remaining > chunk) {
2070  limit = init + chunk - 1;
2071  } else {
2072  last = 1; // the last chunk
2073  limit = init + remaining - 1;
2074  } // if
2075  } // if
2076  break;
2077  } // if
2078  // divide by K*nproc
2079  UT span = remaining * (*(double *)&pr->u.p.parm3);
2080  UT rem = span % chunk;
2081  if (rem) // adjust so that span%chunk == 0
2082  span += chunk - rem;
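  // Editor's note: e.g. with chunk=8 a raw span of 13 is rounded up to 16,
  // so every claim stays a whole number of SIMD-sized chunks.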
2083  limit = init + span;
2084  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2085  (ST)init, (ST)limit)) {
2086  // CAS was successful, chunk obtained
2087  status = 1;
2088  --limit;
2089  break;
2090  } // if
2091  } // while
2092  if (status != 0) {
2093  start = pr->u.p.lb;
2094  incr = pr->u.p.st;
2095  if (p_st != NULL)
2096  *p_st = incr;
2097  *p_lb = start + init * incr;
2098  *p_ub = start + limit * incr;
2099  if (pr->ordered) {
2100  pr->u.p.ordered_lower = init;
2101  pr->u.p.ordered_upper = limit;
2102 #ifdef KMP_DEBUG
2103  {
2104  const char *buff;
2105  // create format specifiers before the debug output
2106  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2107  "ordered_lower:%%%s ordered_upper:%%%s\n",
2108  traits_t<UT>::spec, traits_t<UT>::spec);
2109  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2110  pr->u.p.ordered_upper));
2111  __kmp_str_free(&buff);
2112  }
2113 #endif
2114  } // if
2115  } else {
2116  *p_lb = 0;
2117  *p_ub = 0;
2118  if (p_st != NULL)
2119  *p_st = 0;
2120  } // if
2121  } // case
2122  break;
2123 
2124  case kmp_sch_guided_analytical_chunked: {
2125  T chunkspec = pr->u.p.parm1;
2126  UT chunkIdx;
2127 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2128  /* for storing original FPCW value for Windows* OS on
2129  IA-32 architecture 8-byte version */
2130  unsigned int oldFpcw;
2131  unsigned int fpcwSet = 0;
2132 #endif
2133  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2134  "analytical case\n",
2135  gtid));
2136 
2137  trip = pr->u.p.tc;
2138 
2139  KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2140  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2141  trip);
2142 
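  // Editor's note (illustrative): the shared counter hands out chunk
  // *indices*; the bounds of chunk k are then derived from the closed-form
  // __kmp_dispatch_guided_remaining() model rather than by atomically
  // decrementing a remaining-iterations count. Once the index reaches parm2,
  // the code below falls back to plain dynamic chunks of size chunkspec.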
2143  while (1) { /* this while loop is a safeguard against unexpected zero
2144  chunk sizes */
2145  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2146  if (chunkIdx >= (UT)pr->u.p.parm2) {
2147  --trip;
2148  /* use dynamic-style scheduling */
2149  init = chunkIdx * chunkspec + pr->u.p.count;
2150  /* need to verify init > 0 in case of overflow in the above
2151  * calculation */
2152  if ((status = (init > 0 && init <= trip)) != 0) {
2153  limit = init + chunkspec - 1;
2154 
2155  if ((last = (limit >= trip)) != 0)
2156  limit = trip;
2157  }
2158  break;
2159  } else {
2160 /* use exponential-style scheduling */
2161 /* The following check is to workaround the lack of long double precision on
2162  Windows* OS.
2163  This check works around the possible effect that init != 0 for chunkIdx == 0.
2164  */
2165 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2166  /* If we haven't already done so, save original FPCW and set
2167  precision to 64-bit, as Windows* OS on IA-32 architecture
2168  defaults to 53-bit */
2169  if (!fpcwSet) {
2170  oldFpcw = _control87(0, 0);
2171  _control87(_PC_64, _MCW_PC);
2172  fpcwSet = 0x30000;
2173  }
2174 #endif
2175  if (chunkIdx) {
2176  init = __kmp_dispatch_guided_remaining<T>(
2177  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2178  KMP_DEBUG_ASSERT(init);
2179  init = trip - init;
2180  } else
2181  init = 0;
2182  limit = trip - __kmp_dispatch_guided_remaining<T>(
2183  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
2184  KMP_ASSERT(init <= limit);
2185  if (init < limit) {
2186  KMP_DEBUG_ASSERT(limit <= trip);
2187  --limit;
2188  status = 1;
2189  break;
2190  } // if
2191  } // if
2192  } // while (1)
2193 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2194  /* restore FPCW if necessary
2195  AC: check fpcwSet flag first because oldFpcw can be uninitialized
2196  here */
2197  if (fpcwSet && (oldFpcw & fpcwSet))
2198  _control87(oldFpcw, _MCW_PC);
2199 #endif
2200  if (status != 0) {
2201  start = pr->u.p.lb;
2202  incr = pr->u.p.st;
2203  if (p_st != NULL)
2204  *p_st = incr;
2205  *p_lb = start + init * incr;
2206  *p_ub = start + limit * incr;
2207  if (pr->ordered) {
2208  pr->u.p.ordered_lower = init;
2209  pr->u.p.ordered_upper = limit;
2210 #ifdef KMP_DEBUG
2211  {
2212  const char *buff;
2213  // create format specifiers before the debug output
2214  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2215  "ordered_lower:%%%s ordered_upper:%%%s\n",
2216  traits_t<UT>::spec, traits_t<UT>::spec);
2217  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2218  pr->u.p.ordered_upper));
2219  __kmp_str_free(&buff);
2220  }
2221 #endif
2222  }
2223  } else {
2224  *p_lb = 0;
2225  *p_ub = 0;
2226  if (p_st != NULL)
2227  *p_st = 0;
2228  }
2229  } // case
2230  break;
2231 
2232  case kmp_sch_trapezoidal: {
2233  UT index;
2234  T parm2 = pr->u.p.parm2;
2235  T parm3 = pr->u.p.parm3;
2236  T parm4 = pr->u.p.parm4;
2237  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2238  gtid));
2239 
2240  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2241 
2242  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
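  // Editor's note (worked example, values hypothetical): init and limit are
  // partial sums of chunk sizes that start at parm2 and shrink by parm4 per
  // chunk. With parm2=10 and parm4=2 the chunk sizes are 10, 8, 6, 4, ..., so
  // chunk index=3 starts at init = (3*(20 - 2*2))/2 = 24 (= 10+8+6) and, per
  // the limit formula below, covers iterations 24..27.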
2243  trip = pr->u.p.tc - 1;
2244 
2245  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2246  *p_lb = 0;
2247  *p_ub = 0;
2248  if (p_st != NULL)
2249  *p_st = 0;
2250  } else {
2251  start = pr->u.p.lb;
2252  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2253  incr = pr->u.p.st;
2254 
2255  if ((last = (limit >= trip)) != 0)
2256  limit = trip;
2257 
2258  if (p_st != NULL)
2259  *p_st = incr;
2260 
2261  if (incr == 1) {
2262  *p_lb = start + init;
2263  *p_ub = start + limit;
2264  } else {
2265  *p_lb = start + init * incr;
2266  *p_ub = start + limit * incr;
2267  }
2268 
2269  if (pr->ordered) {
2270  pr->u.p.ordered_lower = init;
2271  pr->u.p.ordered_upper = limit;
2272 #ifdef KMP_DEBUG
2273  {
2274  const char *buff;
2275  // create format specifiers before the debug output
2276  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2277  "ordered_lower:%%%s ordered_upper:%%%s\n",
2278  traits_t<UT>::spec, traits_t<UT>::spec);
2279  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2280  pr->u.p.ordered_upper));
2281  __kmp_str_free(&buff);
2282  }
2283 #endif
2284  } // if
2285  } // if
2286  } // case
2287  break;
2288  default: {
2289  status = 0; // to avoid complaints on uninitialized variable use
2290  __kmp_msg(kmp_ms_fatal, // Severity
2291  KMP_MSG(UnknownSchedTypeDetected), // Primary message
2292  KMP_HNT(GetNewerLibrary), // Hint
2293  __kmp_msg_null // Variadic argument list terminator
2294  );
2295  } break;
2296  } // switch
2297  } // if tc == 0;
2298 
2299  if (status == 0) {
2300  UT num_done;
2301 
2302  num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2303 #ifdef KMP_DEBUG
2304  {
2305  const char *buff;
2306  // create format specifiers before the debug output
2307  buff = __kmp_str_format(
2308  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2309  traits_t<UT>::spec);
2310  KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2311  __kmp_str_free(&buff);
2312  }
2313 #endif
2314 
2315  if ((ST)num_done == th->th.th_team_nproc - 1) {
2316 #if (KMP_STATIC_STEAL_ENABLED)
2317  if (pr->schedule == kmp_sch_static_steal &&
2318  traits_t<T>::type_size > 4) {
2319  int i;
2320  kmp_info_t **other_threads = team->t.t_threads;
2321  // loop complete, safe to destroy locks used for stealing
2322  for (i = 0; i < th->th.th_team_nproc; ++i) {
2323  kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2324  KMP_ASSERT(lck != NULL);
2325  __kmp_destroy_lock(lck);
2326  __kmp_free(lck);
2327  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2328  }
2329  }
2330 #endif
2331  /* NOTE: release this buffer to be reused */
2332 
2333  KMP_MB(); /* Flush all pending memory write invalidates. */
2334 
2335  sh->u.s.num_done = 0;
2336  sh->u.s.iteration = 0;
2337 
2338  /* TODO replace with general release procedure? */
2339  if (pr->ordered) {
2340  sh->u.s.ordered_iteration = 0;
2341  }
2342 
2343  KMP_MB(); /* Flush all pending memory write invalidates. */
2344 
2345  sh->buffer_index += __kmp_dispatch_num_buffers;
2346  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2347  gtid, sh->buffer_index));
2348 
2349  KMP_MB(); /* Flush all pending memory write invalidates. */
2350 
2351  } // if
2352  if (__kmp_env_consistency_check) {
2353  if (pr->pushed_ws != ct_none) {
2354  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2355  }
2356  }
2357 
2358  th->th.th_dispatch->th_deo_fcn = NULL;
2359  th->th.th_dispatch->th_dxo_fcn = NULL;
2360  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2361  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2362  } // if (status == 0)
2363 #if KMP_OS_WINDOWS
2364  else if (last) {
2365  pr->u.p.last_upper = pr->u.p.ub;
2366  }
2367 #endif /* KMP_OS_WINDOWS */
2368  if (p_last != NULL && status != 0)
2369  *p_last = last;
2370  } // if
2371 
2372 #ifdef KMP_DEBUG
2373  {
2374  const char *buff;
2375  // create format specifiers before the debug output
2376  buff = __kmp_str_format(
2377  "__kmp_dispatch_next: T#%%d normal case: "
2378  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2379  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2380  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2381  __kmp_str_free(&buff);
2382  }
2383 #endif
2384 #if INCLUDE_SSC_MARKS
2385  SSC_MARK_DISPATCH_NEXT();
2386 #endif
2387  OMPT_LOOP_END;
2388  return status;
2389 }
2390 
2391 template <typename T>
2392 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2393  kmp_int32 *plastiter, T *plower, T *pupper,
2394  typename traits_t<T>::signed_t incr) {
2395  typedef typename traits_t<T>::unsigned_t UT;
2396  typedef typename traits_t<T>::signed_t ST;
2397  kmp_uint32 team_id;
2398  kmp_uint32 nteams;
2399  UT trip_count;
2400  kmp_team_t *team;
2401  kmp_info_t *th;
2402 
2403  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2404  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2405 #ifdef KMP_DEBUG
2406  {
2407  const char *buff;
2408  // create format specifiers before the debug output
2409  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2410  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2411  traits_t<T>::spec, traits_t<T>::spec,
2412  traits_t<ST>::spec, traits_t<T>::spec);
2413  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2414  __kmp_str_free(&buff);
2415  }
2416 #endif
2417 
2418  if (__kmp_env_consistency_check) {
2419  if (incr == 0) {
2420  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2421  loc);
2422  }
2423  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2424  // The loop is illegal.
2425  // Some zero-trip loops are maintained by the compiler, e.g.:
2426  // for(i=10;i<0;++i) // lower >= upper - run-time check
2427  // for(i=0;i>10;--i) // lower <= upper - run-time check
2428  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2429  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2430  // Compiler does not check the following illegal loops:
2431  // for(i=0;i<10;i+=incr) // where incr<0
2432  // for(i=10;i>0;i-=incr) // where incr<0
2433  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2434  }
2435  }
2436  th = __kmp_threads[gtid];
2437  team = th->th.th_team;
2438 #if OMP_40_ENABLED
2439  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2440  nteams = th->th.th_teams_size.nteams;
2441 #endif
2442  team_id = team->t.t_master_tid;
2443  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2444 
2445  // compute global trip count
2446  if (incr == 1) {
2447  trip_count = *pupper - *plower + 1;
2448  } else if (incr == -1) {
2449  trip_count = *plower - *pupper + 1;
2450  } else if (incr > 0) {
2451  // upper-lower can exceed the limit of signed type
2452  trip_count = (UT)(*pupper - *plower) / incr + 1;
2453  } else {
2454  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2455  }
2456 
2457  if (trip_count <= nteams) {
2458  KMP_DEBUG_ASSERT(
2459  __kmp_static == kmp_sch_static_greedy ||
2460  __kmp_static ==
2461  kmp_sch_static_balanced); // Unknown static scheduling type.
2462  // only some teams get a single iteration, the others get nothing
2463  if (team_id < trip_count) {
2464  *pupper = *plower = *plower + team_id * incr;
2465  } else {
2466  *plower = *pupper + incr; // zero-trip loop
2467  }
2468  if (plastiter != NULL)
2469  *plastiter = (team_id == trip_count - 1);
2470  } else {
2471  if (__kmp_static == kmp_sch_static_balanced) {
2472  UT chunk = trip_count / nteams;
2473  UT extras = trip_count % nteams;
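  // Editor's note (worked example, values hypothetical): with trip_count=10,
  // nteams=3 and incr=1, chunk=3 and extras=1, so the formulas below give
  // team 0 four iterations (lower..lower+3) and teams 1 and 2 three each
  // (lower+4..lower+6 and lower+7..lower+9).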
2474  *plower +=
2475  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2476  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2477  if (plastiter != NULL)
2478  *plastiter = (team_id == nteams - 1);
2479  } else {
2480  T chunk_inc_count =
2481  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2482  T upper = *pupper;
2483  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2484  // Unknown static scheduling type.
2485  *plower += team_id * chunk_inc_count;
2486  *pupper = *plower + chunk_inc_count - incr;
2487  // Check/correct bounds if needed
2488  if (incr > 0) {
2489  if (*pupper < *plower)
2490  *pupper = traits_t<T>::max_value;
2491  if (plastiter != NULL)
2492  *plastiter = *plower <= upper && *pupper > upper - incr;
2493  if (*pupper > upper)
2494  *pupper = upper; // tracker C73258
2495  } else {
2496  if (*pupper > *plower)
2497  *pupper = traits_t<T>::min_value;
2498  if (plastiter != NULL)
2499  *plastiter = *plower >= upper && *pupper < upper - incr;
2500  if (*pupper < upper)
2501  *pupper = upper; // tracker C73258
2502  }
2503  }
2504  }
2505 }
2506 
2507 //-----------------------------------------------------------------------------
2508 // Dispatch routines
2509 // Transfer call to template< type T >
2510 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2511 // T lb, T ub, ST st, ST chunk )
2512 extern "C" {
2513 
2530 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2531  enum sched_type schedule, kmp_int32 lb,
2532  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2533  KMP_DEBUG_ASSERT(__kmp_init_serial);
2534  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2535 }
2539 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2540  enum sched_type schedule, kmp_uint32 lb,
2541  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2542  KMP_DEBUG_ASSERT(__kmp_init_serial);
2543  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2544 }
2545 
2549 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2550  enum sched_type schedule, kmp_int64 lb,
2551  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2552  KMP_DEBUG_ASSERT(__kmp_init_serial);
2553  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2554 }
2555 
2559 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2560  enum sched_type schedule, kmp_uint64 lb,
2561  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2562  KMP_DEBUG_ASSERT(__kmp_init_serial);
2563  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2564 }
2565 
2575 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2576  enum sched_type schedule, kmp_int32 *p_last,
2577  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2578  kmp_int32 chunk) {
2579  KMP_DEBUG_ASSERT(__kmp_init_serial);
2580  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2581  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2582 }
2583 
2584 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2585  enum sched_type schedule, kmp_int32 *p_last,
2586  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2587  kmp_int32 chunk) {
2588  KMP_DEBUG_ASSERT(__kmp_init_serial);
2589  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2590  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2591 }
2592 
2593 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2594  enum sched_type schedule, kmp_int32 *p_last,
2595  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2596  kmp_int64 chunk) {
2597  KMP_DEBUG_ASSERT(__kmp_init_serial);
2598  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2599  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2600 }
2601 
2602 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2603  enum sched_type schedule, kmp_int32 *p_last,
2604  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2605  kmp_int64 chunk) {
2606  KMP_DEBUG_ASSERT(__kmp_init_serial);
2607  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2608  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2609 }
2610 
2624 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2625  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2626  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2627 }
2628 
2632 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2633  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2634  kmp_int32 *p_st) {
2635  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2636 }
2637 
2641 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2642  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2643  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2644 }
2645 
2649 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2650  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2651  kmp_int64 *p_st) {
2652  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2653 }
2654 
2661 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2662  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2663 }
2664 
2668 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2669  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2670 }
2671 
2675 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2676  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2677 }
2678 
2682 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2683  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2684 }
2687 //-----------------------------------------------------------------------------
2688 // Non-template routines from kmp_dispatch.cpp used in other sources
2689 
2690 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2691  return value == checker;
2692 }
2693 
2694 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2695  return value != checker;
2696 }
2697 
2698 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2699  return value < checker;
2700 }
2701 
2702 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2703  return value >= checker;
2704 }
2705 
2706 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2707  return value <= checker;
2708 }
2709 
2710 kmp_uint32
2711 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2712  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2713  void *obj // Higher-level synchronization object, or NULL.
2714  ) {
2715  // note: we may not belong to a team at this point
2716  volatile kmp_uint32 *spin = spinner;
2717  kmp_uint32 check = checker;
2718  kmp_uint32 spins;
2719  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2720  kmp_uint32 r;
2721 
2722  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2723  KMP_INIT_YIELD(spins);
2724  // main wait spin loop
2725  while (!f(r = TCR_4(*spin), check)) {
2726  KMP_FSYNC_SPIN_PREPARE(obj);
2727  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2728  split. It causes problems with infinite recursion because of exit lock */
2729  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2730  __kmp_abort_thread(); */
2731 
2732  /* if we have waited a bit, or are oversubscribed, yield */
2733  /* pause is in the following code */
2734  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2735  KMP_YIELD_SPIN(spins);
2736  }
2737  KMP_FSYNC_SPIN_ACQUIRED(obj);
2738  return r;
2739 }
2740 
2741 void __kmp_wait_yield_4_ptr(
2742  void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2743  void *obj // Higher-level synchronization object, or NULL.
2744  ) {
2745  // note: we may not belong to a team at this point
2746  void *spin = spinner;
2747  kmp_uint32 check = checker;
2748  kmp_uint32 spins;
2749  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2750 
2751  KMP_FSYNC_SPIN_INIT(obj, spin);
2752  KMP_INIT_YIELD(spins);
2753  // main wait spin loop
2754  while (!f(spin, check)) {
2755  KMP_FSYNC_SPIN_PREPARE(obj);
2756  /* if we have waited a bit, or are oversubscribed, yield */
2757  /* pause is in the following code */
2758  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2759  KMP_YIELD_SPIN(spins);
2760  }
2761  KMP_FSYNC_SPIN_ACQUIRED(obj);
2762 }
2763 
2764 } // extern "C"
2765 
2766 #ifdef KMP_GOMP_COMPAT
2767 
2768 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2769  enum sched_type schedule, kmp_int32 lb,
2770  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2771  int push_ws) {
2772  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2773  push_ws);
2774 }
2775 
2776 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2777  enum sched_type schedule, kmp_uint32 lb,
2778  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2779  int push_ws) {
2780  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2781  push_ws);
2782 }
2783 
2784 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2785  enum sched_type schedule, kmp_int64 lb,
2786  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2787  int push_ws) {
2788  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2789  push_ws);
2790 }
2791 
2792 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2793  enum sched_type schedule, kmp_uint64 lb,
2794  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2795  int push_ws) {
2796  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2797  push_ws);
2798 }
2799 
2800 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2801  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2802 }
2803 
2804 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2805  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2806 }
2807 
2808 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2809  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2810 }
2811 
2812 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2813  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2814 }
2815 
2816 #endif /* KMP_GOMP_COMPAT */
2817 
2818 /* ------------------------------------------------------------------------ */
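/* Editor's note: illustrative (non-normative) sketch of how a compiler might
   drive the dispatch entry points above for a dynamically scheduled loop;
   the variable names here are hypothetical:

     kmp_int32 lb, ub, st, last;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                            0, n - 1, 1, chunk);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }

   For ordered loops the generated code is also expected to call
   __kmpc_dispatch_fini_4() at the end of each chunk. */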