Intel® OpenMP* Runtime Library
kmp_csupport.c
1 /*
2  * kmp_csupport.c -- kfront linkage support for OpenMP.
3  * $Revision: 43473 $
4  * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2014 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "omp.h" /* extern "C" declarations of user-visible routines */
38 #include "kmp.h"
39 #include "kmp_i18n.h"
40 #include "kmp_itt.h"
41 #include "kmp_error.h"
42 #include "kmp_stats.h"
43 
44 #define MAX_MESSAGE 512
45 
46 /* ------------------------------------------------------------------------ */
47 /* ------------------------------------------------------------------------ */
48 
49 /* flags will be used in future, e.g., to implement */
50 /* openmp_strict library restrictions */
51 
61 void
62 __kmpc_begin(ident_t *loc, kmp_int32 flags)
63 {
64  // By default __kmp_ignore_mppbeg() returns TRUE.
65  if (__kmp_ignore_mppbeg() == FALSE) {
66  __kmp_internal_begin();
67 
68  KC_TRACE( 10, ("__kmpc_begin: called\n" ) );
69  }
70 }
71 
79 void
80 __kmpc_end(ident_t *loc)
81 {
82  // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end() call a no-op.
83  // However, this can be overridden with the KMP_IGNORE_MPPEND environment variable.
84  // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end()
85  // will unregister this root (which can cause library shutdown).
86  if (__kmp_ignore_mppend() == FALSE) {
87  KC_TRACE( 10, ("__kmpc_end: called\n" ) );
88  KA_TRACE( 30, ("__kmpc_end\n" ));
89 
90  __kmp_internal_end_thread( -1 );
91  }
92 }
93 
113 kmp_int32
114 __kmpc_global_thread_num(ident_t *loc)
115 {
116  kmp_int32 gtid = __kmp_entry_gtid();
117 
118  KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) );
119 
120  return gtid;
121 }
122 
136 kmp_int32
137 __kmpc_global_num_threads(ident_t *loc)
138 {
139  KC_TRACE( 10, ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_nth ) );
140 
141  return TCR_4(__kmp_nth);
142 }
143 
150 kmp_int32
151 __kmpc_bound_thread_num(ident_t *loc)
152 {
153  KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) );
154  return __kmp_tid_from_gtid( __kmp_entry_gtid() );
155 }
156 
162 kmp_int32
163 __kmpc_bound_num_threads(ident_t *loc)
164 {
165  KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) );
166 
167  return __kmp_entry_thread() -> th.th_team -> t.t_nproc;
168 }
169 
176 kmp_int32
177 __kmpc_ok_to_fork(ident_t *loc)
178 {
179 #ifndef KMP_DEBUG
180 
181  return TRUE;
182 
183 #else
184 
185  const char *semi2;
186  const char *semi3;
187  int line_no;
188 
189  if (__kmp_par_range == 0) {
190  return TRUE;
191  }
192  semi2 = loc->psource;
193  if (semi2 == NULL) {
194  return TRUE;
195  }
196  semi2 = strchr(semi2, ';');
197  if (semi2 == NULL) {
198  return TRUE;
199  }
200  semi2 = strchr(semi2 + 1, ';');
201  if (semi2 == NULL) {
202  return TRUE;
203  }
204  if (__kmp_par_range_filename[0]) {
205  const char *name = semi2 - 1;
206  while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
207  name--;
208  }
209  if ((*name == '/') || (*name == ';')) {
210  name++;
211  }
212  if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
213  return __kmp_par_range < 0;
214  }
215  }
216  semi3 = strchr(semi2 + 1, ';');
217  if (__kmp_par_range_routine[0]) {
218  if ((semi3 != NULL) && (semi3 > semi2)
219  && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
220  return __kmp_par_range < 0;
221  }
222  }
223  if (sscanf(semi3 + 1, "%d", &line_no) == 1) {
224  if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
225  return __kmp_par_range > 0;
226  }
227  return __kmp_par_range < 0;
228  }
229  return TRUE;
230 
231 #endif /* KMP_DEBUG */
232 
233 }
234 
240 kmp_int32
241 __kmpc_in_parallel(ident_t *loc)
242 {
243  return __kmp_entry_thread() -> th.th_root -> r.r_active;
244 }
245 
255 void
256 __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads )
257 {
258  KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
259  global_tid, num_threads ) );
260 
261  __kmp_push_num_threads( loc, global_tid, num_threads );
262 }
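/*
 * Illustrative sketch (assumed, typical lowering; not taken verbatim from any compiler):
 * a num_threads() clause is commonly handled by pushing the request just before the fork:
 *
 *     // #pragma omp parallel num_threads(n)
 *     kmp_int32 gtid = __kmpc_global_thread_num(&loc);
 *     __kmpc_push_num_threads(&loc, gtid, n);
 *     __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
 *
 * Here "loc" is a static ident_t describing the construct and "outlined_body" is the
 * hypothetical compiler-outlined parallel region; exact code generation varies by compiler.
 */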
263 
264 void
265 __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid )
266 {
267  KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) );
268 
269  /* the num_threads are automatically popped */
270 }
271 
272 
273 #if OMP_40_ENABLED
274 
275 void
276 __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind )
277 {
278  KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n",
279  global_tid, proc_bind ) );
280 
281  __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind );
282 }
283 
284 #endif /* OMP_40_ENABLED */
285 
286 
296 void
297 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
298 {
299  KMP_STOP_EXPLICIT_TIMER(OMP_serial);
300  KMP_COUNT_BLOCK(OMP_PARALLEL);
301  int gtid = __kmp_entry_gtid();
302  // maybe saving thr_state is enough here
303  {
304  va_list ap;
305  va_start( ap, microtask );
306 
307 #if INCLUDE_SSC_MARKS
308  SSC_MARK_FORKING();
309 #endif
310  __kmp_fork_call( loc, gtid, fork_context_intel,
311  argc,
312  VOLATILE_CAST(microtask_t) microtask,
313  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
314 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
315 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX
316  &ap
317 #else
318  ap
319 #endif
320  );
321 #if INCLUDE_SSC_MARKS
322  SSC_MARK_JOINING();
323 #endif
324  __kmp_join_call( loc, gtid );
325 
326  va_end( ap );
327  }
328  KMP_START_EXPLICIT_TIMER(OMP_serial);
329 }
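/*
 * Illustrative sketch of how a compiler might target __kmpc_fork_call() (names such as
 * "outlined" and "loc" are hypothetical; exact code generation differs between compilers):
 *
 *     // #pragma omp parallel shared(x)
 *     //     x += 1;
 *     static void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *x) { *x += 1; }
 *     ...
 *     __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
 *
 * argc counts the shared arguments that follow the microtask; the runtime passes them
 * back to "outlined" after the global and bound thread id pointers.
 */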
330 
331 #if OMP_40_ENABLED
332 
342 void
343 __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads )
344 {
345  KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
346  global_tid, num_teams, num_threads ) );
347 
348  __kmp_push_num_teams( loc, global_tid, num_teams, num_threads );
349 }
350 
360 void
361 __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
362 {
363  int gtid = __kmp_entry_gtid();
364  kmp_info_t *this_thr = __kmp_threads[ gtid ];
365  va_list ap;
366  va_start( ap, microtask );
367 
368  // remember teams entry point and nesting level
369  this_thr->th.th_teams_microtask = microtask;
370  this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
371 
372  // check if __kmpc_push_num_teams called, set default number of teams otherwise
373  if ( this_thr->th.th_teams_size.nteams == 0 ) {
374  __kmp_push_num_teams( loc, gtid, 0, 0 );
375  }
376  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
377  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
378  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
379 
380  __kmp_fork_call( loc, gtid, fork_context_intel,
381  argc,
382  VOLATILE_CAST(microtask_t) __kmp_teams_master,
383  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
384 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX
385  &ap
386 #else
387  ap
388 #endif
389  );
390  __kmp_join_call( loc, gtid );
391  this_thr->th.th_teams_microtask = NULL;
392  this_thr->th.th_teams_level = 0;
393  *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L;
394  va_end( ap );
395 }
396 #endif /* OMP_40_ENABLED */
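/*
 * Illustrative sketch of a teams construct lowering (assumed typical pattern; names are
 * hypothetical):
 *
 *     // #pragma omp teams num_teams(t) thread_limit(l)
 *     kmp_int32 gtid = __kmpc_global_thread_num(&loc);
 *     __kmpc_push_num_teams(&loc, gtid, t, l);
 *     __kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams_body);
 *
 * If __kmpc_push_num_teams() is not called first, __kmpc_fork_teams() falls back to the
 * default team sizing seen above (__kmp_push_num_teams(loc, gtid, 0, 0)).
 */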
397 
398 
399 //
400 // I don't think this function should ever have been exported.
401 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
402 // openmp code ever called it, but it's been exported from the RTL for so
403 // long that I'm afraid to remove the definition.
404 //
405 int
406 __kmpc_invoke_task_func( int gtid )
407 {
408  return __kmp_invoke_task_func( gtid );
409 }
410 
423 void
424 __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
425 {
426  __kmp_serialized_parallel(loc, global_tid); /* The implementation is now in kmp_runtime.c so that it can share static functions with
427  * kmp_fork_call since the tasks to be done are similar in each case.
428  */
429 }
430 
438 void
439 __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
440 {
441  kmp_internal_control_t *top;
442  kmp_info_t *this_thr;
443  kmp_team_t *serial_team;
444 
445  KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) );
446 
447  /* skip all this code for autopar serialized loops since it results in
448  unacceptable overhead */
449  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
450  return;
451 
452  // Not autopar code
453  if( ! TCR_4( __kmp_init_parallel ) )
454  __kmp_parallel_initialize();
455 
456  this_thr = __kmp_threads[ global_tid ];
457  serial_team = this_thr->th.th_serial_team;
458 
459  KMP_MB();
460  KMP_DEBUG_ASSERT( serial_team );
461  KMP_ASSERT( serial_team -> t.t_serialized );
462  KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team );
463  KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team );
464  KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
465  KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
466 
467  /* If necessary, pop the internal control stack values and replace the team values */
468  top = serial_team -> t.t_control_stack_top;
469  if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
470  copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top );
471  serial_team -> t.t_control_stack_top = top -> next;
472  __kmp_free(top);
473  }
474 
475  //if( serial_team -> t.t_serialized > 1 )
476  serial_team -> t.t_level--;
477 
478  /* pop dispatch buffers stack */
479  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
480  {
481  dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer;
482  serial_team->t.t_dispatch->th_disp_buffer =
483  serial_team->t.t_dispatch->th_disp_buffer->next;
484  __kmp_free( disp_buffer );
485  }
486 
487  -- serial_team -> t.t_serialized;
488  if ( serial_team -> t.t_serialized == 0 ) {
489 
490  /* return to the parallel section */
491 
492 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
493  if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) {
494  __kmp_clear_x87_fpu_status_word();
495  __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
496  __kmp_load_mxcsr( &serial_team->t.t_mxcsr );
497  }
498 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
499 
500  this_thr -> th.th_team = serial_team -> t.t_parent;
501  this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid;
502 
503  /* restore values cached in the thread */
504  this_thr -> th.th_team_nproc = serial_team -> t.t_parent -> t.t_nproc; /* JPH */
505  this_thr -> th.th_team_master = serial_team -> t.t_parent -> t.t_threads[0]; /* JPH */
506  this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized;
507 
508  /* TODO the below shouldn't need to be adjusted for serialized teams */
509  this_thr -> th.th_dispatch = & this_thr -> th.th_team ->
510  t.t_dispatch[ serial_team -> t.t_master_tid ];
511 
512  __kmp_pop_current_task_from_thread( this_thr );
513 
514  KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
515  this_thr -> th.th_current_task -> td_flags.executing = 1;
516 
517  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
518  //
519  // Copy the task team from the new child / old parent team
520  // to the thread. If non-NULL, copy the state flag also.
521  //
522  if ( ( this_thr -> th.th_task_team = this_thr -> th.th_team -> t.t_task_team ) != NULL ) {
523  this_thr -> th.th_task_state = this_thr -> th.th_task_team -> tt.tt_state;
524  }
525  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
526  global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
527  }
528  } else {
529  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
530  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
531  global_tid, serial_team, serial_team -> t.t_serialized ) );
532  }
533  }
534 
535 #if USE_ITT_BUILD
536  kmp_uint64 cur_time = 0;
537 #if USE_ITT_NOTIFY
538  if( __itt_get_timestamp_ptr ) {
539  cur_time = __itt_get_timestamp();
540  }
541 #endif /* USE_ITT_NOTIFY */
542  // Report the barrier
543  if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr ) {
544  if( this_thr->th.th_team->t.t_level == 0 ) {
545  __kmp_itt_frame_submit( global_tid, this_thr->th.th_frame_time_serialized, cur_time, 0, loc, this_thr->th.th_team_nproc, 0 );
546  }
547  }
548  // Mark the end of the "parallel" region for VTune. Only use one of the frame notification schemes at the moment.
549  if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
550  {
551  this_thr->th.th_ident = loc;
552  __kmp_itt_region_joined( global_tid, 1 );
553  }
554  if ( ( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode == 3 ) || KMP_ITT_DEBUG )
555  {
556  this_thr->th.th_ident = loc;
557  // Since the barrier frame for a serialized region is equal to the region itself, we use the same begin timestamp as for the barrier.
558  __kmp_itt_frame_submit( global_tid, serial_team->t.t_region_time, cur_time, 0, loc, this_thr->th.th_team_nproc, 2 );
559  }
560 #endif /* USE_ITT_BUILD */
561 
562  if ( __kmp_env_consistency_check )
563  __kmp_pop_parallel( global_tid, NULL );
564 }
565 
578 void
579 __kmpc_flush(ident_t *loc, ...)
580 {
581  KC_TRACE( 10, ("__kmpc_flush: called\n" ) );
582 
583  /* need explicit __mf() here since use volatile instead in library */
584  KMP_MB(); /* Flush all pending memory write invalidates. */
585 
586  #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
587  #if KMP_MIC
588  // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
589  // We shouldn't need it, though, since the ABI rules require that
590  // * If the compiler generates NGO stores it also generates the fence
591  // * If users hand-code NGO stores they should insert the fence
592  // therefore no incomplete unordered stores should be visible.
593  #else
594  // C74404
595  // This is to address non-temporal store instructions (sfence needed).
596  // The clflush instruction is also addressed (mfence needed).
597  // The non-temporal load instruction movntdqa should probably also be addressed.
598  // mfence is an SSE2 instruction, so do not execute it if the CPU does not support SSE2.
599  if ( ! __kmp_cpuinfo.initialized ) {
600  __kmp_query_cpuid( & __kmp_cpuinfo );
601  }; // if
602  if ( ! __kmp_cpuinfo.sse2 ) {
603  // CPU cannot execute SSE2 instructions.
604  } else {
605  #if KMP_COMPILER_ICC || KMP_COMPILER_MSVC
606  _mm_mfence();
607  #else
608  __sync_synchronize();
609  #endif // KMP_COMPILER_ICC
610  }; // if
611  #endif // KMP_MIC
612  #elif KMP_ARCH_ARM
613  // Nothing yet
614  #elif KMP_ARCH_PPC64
615  // Nothing needed here (we have a real MB above).
616  #if KMP_OS_CNK
617  // The flushing thread needs to yield here; this prevents a
618  // busy-waiting thread from saturating the pipeline. flush is
619  // often used in loops like this:
620  // while (!flag) {
621  // #pragma omp flush(flag)
622  // }
623  // and adding the yield here is good for at least a 10x speedup
624  // when running >2 threads per core (on the NAS LU benchmark).
625  __kmp_yield(TRUE);
626  #endif
627  #else
628  #error Unknown or unsupported architecture
629  #endif
630 
631 }
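/*
 * Illustrative sketch (assumed typical lowering): a flush directive becomes a single
 * runtime call; any flushed variable list need not be inspected by this implementation,
 * since a full memory fence is issued regardless.
 *
 *     // #pragma omp flush(flag)
 *     __kmpc_flush(&loc);
 */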
632 
633 /* -------------------------------------------------------------------------- */
634 
635 /* -------------------------------------------------------------------------- */
636 
644 void
645 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
646 {
647  KMP_COUNT_BLOCK(OMP_BARRIER);
648  KMP_TIME_BLOCK(OMP_barrier);
649  int explicit_barrier_flag;
650  KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
651 
652  if (! TCR_4(__kmp_init_parallel))
653  __kmp_parallel_initialize();
654 
655  if ( __kmp_env_consistency_check ) {
656  if ( loc == 0 ) {
657  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
658  }; // if
659 
660  __kmp_check_barrier( global_tid, ct_barrier, loc );
661  }
662 
663  __kmp_threads[ global_tid ]->th.th_ident = loc;
664  // TODO: explicit barrier_wait_id:
665  // this function is called when 'barrier' directive is present or
666  // implicit barrier at the end of a worksharing construct.
667  // 1) better to add a per-thread barrier counter to a thread data structure
668  // 2) set to 0 when a new team is created
669  // 3) no sync is required
670 
671  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
672 }
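/*
 * Illustrative sketch (assumed typical lowering):
 *
 *     // #pragma omp barrier
 *     __kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));
 *
 * The same entry point also implements the implicit barrier at the end of worksharing
 * constructs when no "nowait" clause is present, as noted in the TODO above.
 */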
673 
674 /* The BARRIER for a MASTER section is always explicit */
681 kmp_int32
682 __kmpc_master(ident_t *loc, kmp_int32 global_tid)
683 {
684  KMP_COUNT_BLOCK(OMP_MASTER);
685  int status = 0;
686 
687  KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
688 
689  if( ! TCR_4( __kmp_init_parallel ) )
690  __kmp_parallel_initialize();
691 
692  if( KMP_MASTER_GTID( global_tid ))
693  status = 1;
694 
695  if ( __kmp_env_consistency_check ) {
696  if (status)
697  __kmp_push_sync( global_tid, ct_master, loc, NULL );
698  else
699  __kmp_check_sync( global_tid, ct_master, loc, NULL );
700  }
701 
702  return status;
703 }
704 
713 void
714 __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
715 {
716  KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
717 
718  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
719 
720  if ( __kmp_env_consistency_check ) {
721  if( global_tid < 0 )
722  KMP_WARNING( ThreadIdentInvalid );
723 
724  if( KMP_MASTER_GTID( global_tid ))
725  __kmp_pop_sync( global_tid, ct_master, loc );
726  }
727 }
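/*
 * Illustrative sketch (assumed typical lowering; only the thread for which
 * __kmpc_master() returns 1 executes the body and the matching end call):
 *
 *     // #pragma omp master
 *     //     body();
 *     if (__kmpc_master(&loc, gtid)) {
 *         body();
 *         __kmpc_end_master(&loc, gtid);
 *     }
 */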
728 
736 void
737 __kmpc_ordered( ident_t * loc, kmp_int32 gtid )
738 {
739  int cid = 0;
740  kmp_info_t *th;
741  KMP_DEBUG_ASSERT( __kmp_init_serial );
742 
743  KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid ));
744 
745  if (! TCR_4(__kmp_init_parallel))
746  __kmp_parallel_initialize();
747 
748 #if USE_ITT_BUILD
749  __kmp_itt_ordered_prep( gtid );
750  // TODO: ordered_wait_id
751 #endif /* USE_ITT_BUILD */
752 
753  th = __kmp_threads[ gtid ];
754 
755  if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
756  (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
757  else
758  __kmp_parallel_deo( & gtid, & cid, loc );
759 
760 #if USE_ITT_BUILD
761  __kmp_itt_ordered_start( gtid );
762 #endif /* USE_ITT_BUILD */
763 }
764 
772 void
773 __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
774 {
775  int cid = 0;
776  kmp_info_t *th;
777 
778  KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) );
779 
780 #if USE_ITT_BUILD
781  __kmp_itt_ordered_end( gtid );
782  // TODO: ordered_wait_id
783 #endif /* USE_ITT_BUILD */
784 
785  th = __kmp_threads[ gtid ];
786 
787  if ( th -> th.th_dispatch -> th_dxo_fcn != 0 )
788  (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
789  else
790  __kmp_parallel_dxo( & gtid, & cid, loc );
791 }
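/*
 * Illustrative sketch (assumed typical lowering inside an ordered loop):
 *
 *     // #pragma omp ordered
 *     //     body();
 *     __kmpc_ordered(&loc, gtid);
 *     body();
 *     __kmpc_end_ordered(&loc, gtid);
 */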
792 
793 static kmp_user_lock_p
794 __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
795 {
796  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
797 
798  //
799  // Because of the double-check, the following load
800  // doesn't need to be volatile.
801  //
802  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
803 
804  if ( lck == NULL ) {
805  void * idx;
806 
807  // Allocate & initialize the lock.
808  // Remember allocated locks in table in order to free them in __kmp_cleanup()
809  lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section );
810  __kmp_init_user_lock_with_checks( lck );
811  __kmp_set_user_lock_location( lck, loc );
812 #if USE_ITT_BUILD
813  __kmp_itt_critical_creating( lck );
814  // __kmp_itt_critical_creating() should be called *before* the first use of the underlying
815  // lock, and this is the only place where we can guarantee it. There is a chance the lock will
816  // be destroyed without ever being used, but that is not a problem, because this is not a real
817  // event seen by the user but rather sets a name for the object (lock). See more details in kmp_itt.h.
818 #endif /* USE_ITT_BUILD */
819 
820  //
821  // Use a cmpxchg instruction to slam the start of the critical
822  // section with the lock pointer. If another thread beat us
823  // to it, deallocate the lock, and use the lock that the other
824  // thread allocated.
825  //
826  int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck );
827 
828  if ( status == 0 ) {
829  // Deallocate the lock and reload the value.
830 #if USE_ITT_BUILD
831  __kmp_itt_critical_destroyed( lck );
832  // Let ITT know the lock is destroyed and the same memory location may be reused for
833  // another purpose.
834 #endif /* USE_ITT_BUILD */
835  __kmp_destroy_user_lock_with_checks( lck );
836  __kmp_user_lock_free( &idx, gtid, lck );
837  lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
838  KMP_DEBUG_ASSERT( lck != NULL );
839  }
840  }
841  return lck;
842 }
843 
854 void
855 __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
856  KMP_COUNT_BLOCK(OMP_CRITICAL);
857 
858  kmp_user_lock_p lck;
859 
860  KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
861 
862  //TODO: add THR_OVHD_STATE
863 
864  KMP_CHECK_USER_LOCK_INIT();
865 
866  if ( ( __kmp_user_lock_kind == lk_tas )
867  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
868  lck = (kmp_user_lock_p)crit;
869  }
870 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
871  else if ( ( __kmp_user_lock_kind == lk_futex )
872  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
873  lck = (kmp_user_lock_p)crit;
874  }
875 #endif
876  else { // ticket, queuing or drdpa
877  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
878  }
879 
880  if ( __kmp_env_consistency_check )
881  __kmp_push_sync( global_tid, ct_critical, loc, lck );
882 
883  /* since the critical directive binds to all threads, not just
884  * the current team we have to check this even if we are in a
885  * serialized team */
886  /* also, even if we are the uber thread, we still have to conduct the lock,
887  * as we have to contend with sibling threads */
888 
889 #if USE_ITT_BUILD
890  __kmp_itt_critical_acquiring( lck );
891 #endif /* USE_ITT_BUILD */
892  // Value of 'crit' should be good for using as a critical_id of the critical section directive.
893 
894  __kmp_acquire_user_lock_with_checks( lck, global_tid );
895 
896 #if USE_ITT_BUILD
897  __kmp_itt_critical_acquired( lck );
898 #endif /* USE_ITT_BUILD */
899 
900  KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
901 } // __kmpc_critical
902 
912 void
913 __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
914 {
915  kmp_user_lock_p lck;
916 
917  KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid ));
918 
919  if ( ( __kmp_user_lock_kind == lk_tas )
920  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
921  lck = (kmp_user_lock_p)crit;
922  }
923 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
924  else if ( ( __kmp_user_lock_kind == lk_futex )
925  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
926  lck = (kmp_user_lock_p)crit;
927  }
928 #endif
929  else { // ticket, queuing or drdpa
930  lck = (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit));
931  }
932 
933  KMP_ASSERT(lck != NULL);
934 
935  if ( __kmp_env_consistency_check )
936  __kmp_pop_sync( global_tid, ct_critical, loc );
937 
938 #if USE_ITT_BUILD
939  __kmp_itt_critical_releasing( lck );
940 #endif /* USE_ITT_BUILD */
941  // Value of 'crit' should be good for using as a critical_id of the critical section directive.
942 
943  __kmp_release_user_lock_with_checks( lck, global_tid );
944 
945  KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
946 }
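/*
 * Illustrative sketch (assumed typical lowering; the zero-initialized kmp_critical_name
 * variable is shared by every occurrence of the same named critical section, and the
 * names used here are hypothetical):
 *
 *     // #pragma omp critical(foo)
 *     //     body();
 *     static kmp_critical_name crit_foo = { 0 };
 *     ...
 *     __kmpc_critical(&loc, gtid, &crit_foo);
 *     body();
 *     __kmpc_end_critical(&loc, gtid, &crit_foo);
 */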
947 
956 kmp_int32
957 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
958 {
959  int status;
960 
961  KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) );
962 
963  if (! TCR_4(__kmp_init_parallel))
964  __kmp_parallel_initialize();
965 
966  if ( __kmp_env_consistency_check )
967  __kmp_check_barrier( global_tid, ct_barrier, loc );
968 
969 #if USE_ITT_NOTIFY
970  __kmp_threads[global_tid]->th.th_ident = loc;
971 #endif
972  status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
973 
974  return (status != 0) ? 0 : 1;
975 }
976 
986 void
987 __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid)
988 {
989  KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid ));
990 
991  __kmp_end_split_barrier ( bs_plain_barrier, global_tid );
992 }
993 
1004 kmp_int32
1005 __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
1006 {
1007  kmp_int32 ret;
1008 
1009  KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid ));
1010 
1011  if (! TCR_4(__kmp_init_parallel))
1012  __kmp_parallel_initialize();
1013 
1014  if ( __kmp_env_consistency_check ) {
1015  if ( loc == 0 ) {
1016  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
1017  }
1018  __kmp_check_barrier( global_tid, ct_barrier, loc );
1019  }
1020 
1021 #if USE_ITT_NOTIFY
1022  __kmp_threads[global_tid]->th.th_ident = loc;
1023 #endif
1024  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
1025 
1026  ret = __kmpc_master (loc, global_tid);
1027 
1028  if ( __kmp_env_consistency_check ) {
1029  /* there's no __kmpc_end_master called; so the (stats) */
1030  /* actions of __kmpc_end_master are done here */
1031 
1032  if ( global_tid < 0 ) {
1033  KMP_WARNING( ThreadIdentInvalid );
1034  }
1035  if (ret) {
1036  /* only one thread should do the pop since only */
1037  /* one did the push (see __kmpc_master()) */
1038 
1039  __kmp_pop_sync( global_tid, ct_master, loc );
1040  }
1041  }
1042 
1043  return (ret);
1044 }
1045 
1046 /* The BARRIER for a SINGLE process section is always explicit */
1058 kmp_int32
1059 __kmpc_single(ident_t *loc, kmp_int32 global_tid)
1060 {
1061  KMP_COUNT_BLOCK(OMP_SINGLE);
1062  kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
1063  return rc;
1064 }
1065 
1075 void
1076 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
1077 {
1078  __kmp_exit_single( global_tid );
1079 }
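/*
 * Illustrative sketch (assumed typical lowering; the thread for which __kmpc_single()
 * returns 1 executes the body, and the implicit barrier is emitted separately unless
 * "nowait" is specified):
 *
 *     // #pragma omp single
 *     //     body();
 *     if (__kmpc_single(&loc, gtid)) {
 *         body();
 *         __kmpc_end_single(&loc, gtid);
 *     }
 *     __kmpc_barrier(&loc, gtid);   // omitted for "single nowait"
 */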
1080 
1088 void
1089 __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
1090 {
1091  KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1092 
1093  if ( __kmp_env_consistency_check )
1094  __kmp_pop_workshare( global_tid, ct_pdo, loc );
1095 }
1096 
1097 /*
1098  * User routines which take C-style arguments (call by value)
1099  * different from the Fortran equivalent routines
1100  */
1101 
1102 void
1103 ompc_set_num_threads( int arg )
1104 {
1105 // !!!!! TODO: check the per-task binding
1106  __kmp_set_num_threads( arg, __kmp_entry_gtid() );
1107 }
1108 
1109 void
1110 ompc_set_dynamic( int flag )
1111 {
1112  kmp_info_t *thread;
1113 
1114  /* For the thread-private implementation of the internal controls */
1115  thread = __kmp_entry_thread();
1116 
1117  __kmp_save_internal_controls( thread );
1118 
1119  set__dynamic( thread, flag ? TRUE : FALSE );
1120 }
1121 
1122 void
1123 ompc_set_nested( int flag )
1124 {
1125  kmp_info_t *thread;
1126 
1127  /* For the thread-private internal controls implementation */
1128  thread = __kmp_entry_thread();
1129 
1130  __kmp_save_internal_controls( thread );
1131 
1132  set__nested( thread, flag ? TRUE : FALSE );
1133 }
1134 
1135 void
1136 ompc_set_max_active_levels( int max_active_levels )
1137 {
1138  /* TO DO */
1139  /* we want per-task implementation of this internal control */
1140 
1141  /* For the per-thread internal controls implementation */
1142  __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels );
1143 }
1144 
1145 void
1146 ompc_set_schedule( omp_sched_t kind, int modifier )
1147 {
1148 // !!!!! TODO: check the per-task binding
1149  __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier );
1150 }
1151 
1152 int
1153 ompc_get_ancestor_thread_num( int level )
1154 {
1155  return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level );
1156 }
1157 
1158 int
1159 ompc_get_team_size( int level )
1160 {
1161  return __kmp_get_team_size( __kmp_entry_gtid(), level );
1162 }
1163 
1164 void
1165 kmpc_set_stacksize( int arg )
1166 {
1167  // __kmp_aux_set_stacksize initializes the library if needed
1168  __kmp_aux_set_stacksize( arg );
1169 }
1170 
1171 void
1172 kmpc_set_stacksize_s( size_t arg )
1173 {
1174  // __kmp_aux_set_stacksize initializes the library if needed
1175  __kmp_aux_set_stacksize( arg );
1176 }
1177 
1178 void
1179 kmpc_set_blocktime( int arg )
1180 {
1181  int gtid, tid;
1182  kmp_info_t *thread;
1183 
1184  gtid = __kmp_entry_gtid();
1185  tid = __kmp_tid_from_gtid(gtid);
1186  thread = __kmp_thread_from_gtid(gtid);
1187 
1188  __kmp_aux_set_blocktime( arg, thread, tid );
1189 }
1190 
1191 void
1192 kmpc_set_library( int arg )
1193 {
1194  // __kmp_user_set_library initializes the library if needed
1195  __kmp_user_set_library( (enum library_type)arg );
1196 }
1197 
1198 void
1199 kmpc_set_defaults( char const * str )
1200 {
1201  // __kmp_aux_set_defaults initializes the library if needed
1202  __kmp_aux_set_defaults( str, strlen( str ) );
1203 }
1204 
1205 int
1206 kmpc_set_affinity_mask_proc( int proc, void **mask )
1207 {
1208 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1209  return -1;
1210 #else
1211  if ( ! TCR_4(__kmp_init_middle) ) {
1212  __kmp_middle_initialize();
1213  }
1214  return __kmp_aux_set_affinity_mask_proc( proc, mask );
1215 #endif
1216 }
1217 
1218 int
1219 kmpc_unset_affinity_mask_proc( int proc, void **mask )
1220 {
1221 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1222  return -1;
1223 #else
1224  if ( ! TCR_4(__kmp_init_middle) ) {
1225  __kmp_middle_initialize();
1226  }
1227  return __kmp_aux_unset_affinity_mask_proc( proc, mask );
1228 #endif
1229 }
1230 
1231 int
1232 kmpc_get_affinity_mask_proc( int proc, void **mask )
1233 {
1234 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1235  return -1;
1236 #else
1237  if ( ! TCR_4(__kmp_init_middle) ) {
1238  __kmp_middle_initialize();
1239  }
1240  return __kmp_aux_get_affinity_mask_proc( proc, mask );
1241 #endif
1242 }
1243 
1244 
1245 /* -------------------------------------------------------------------------- */
1286 void
1287 __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit )
1288 {
1289  void **data_ptr;
1290 
1291  KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid ));
1292 
1293  KMP_MB();
1294 
1295  data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data;
1296 
1297  if ( __kmp_env_consistency_check ) {
1298  if ( loc == 0 ) {
1299  KMP_WARNING( ConstructIdentInvalid );
1300  }
1301  }
1302 
1303  /* ToDo: Optimize the following two barriers into some kind of split barrier */
1304 
1305  if (didit) *data_ptr = cpy_data;
1306 
1307  /* This barrier is not a barrier region boundary */
1308 #if USE_ITT_NOTIFY
1309  __kmp_threads[gtid]->th.th_ident = loc;
1310 #endif
1311  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1312 
1313  if (! didit) (*cpy_func)( cpy_data, *data_ptr );
1314 
1315  /* Consider next barrier the user-visible barrier for barrier region boundaries */
1316  /* Nesting checks are already handled by the single construct checks */
1317 
1318 #if USE_ITT_NOTIFY
1319  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location)
1320 #endif
1321  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1322 }
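/*
 * Illustrative sketch of a copyprivate lowering (assumed typical pattern; "copy_x" is a
 * hypothetical compiler-generated helper that copies the broadcast value into the
 * receiving thread's private copy):
 *
 *     // #pragma omp single copyprivate(x)
 *     //     x = compute();
 *     static void copy_x(void *dst, void *src) { *(int *)dst = *(int *)src; }
 *     ...
 *     kmp_int32 did_it = __kmpc_single(&loc, gtid);
 *     if (did_it) {
 *         x = compute();
 *         __kmpc_end_single(&loc, gtid);
 *     }
 *     __kmpc_copyprivate(&loc, gtid, sizeof(x), &x, copy_x, did_it);
 */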
1323 
1324 /* -------------------------------------------------------------------------- */
1325 
1326 #define INIT_LOCK __kmp_init_user_lock_with_checks
1327 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
1328 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
1329 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
1330 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
1331 #define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed
1332 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
1333 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
1334 #define TEST_LOCK __kmp_test_user_lock_with_checks
1335 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
1336 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
1337 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
1338 
1339 
1340 /*
1341  * TODO: Make check abort messages use location info & pass it
1342  * into with_checks routines
1343  */
1344 
1345 /* initialize the lock */
1346 void
1347 __kmpc_init_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1348  static char const * const func = "omp_init_lock";
1349  kmp_user_lock_p lck;
1350  KMP_DEBUG_ASSERT( __kmp_init_serial );
1351 
1352  if ( __kmp_env_consistency_check ) {
1353  if ( user_lock == NULL ) {
1354  KMP_FATAL( LockIsUninitialized, func );
1355  }
1356  }
1357 
1358  KMP_CHECK_USER_LOCK_INIT();
1359 
1360  if ( ( __kmp_user_lock_kind == lk_tas )
1361  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1362  lck = (kmp_user_lock_p)user_lock;
1363  }
1364 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1365  else if ( ( __kmp_user_lock_kind == lk_futex )
1366  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1367  lck = (kmp_user_lock_p)user_lock;
1368  }
1369 #endif
1370  else {
1371  lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1372  }
1373  INIT_LOCK( lck );
1374  __kmp_set_user_lock_location( lck, loc );
1375 
1376 #if USE_ITT_BUILD
1377  __kmp_itt_lock_creating( lck );
1378 #endif /* USE_ITT_BUILD */
1379 } // __kmpc_init_lock
1380 
1381 /* initialize the lock */
1382 void
1383 __kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1384  static char const * const func = "omp_init_nest_lock";
1385  kmp_user_lock_p lck;
1386  KMP_DEBUG_ASSERT( __kmp_init_serial );
1387 
1388  if ( __kmp_env_consistency_check ) {
1389  if ( user_lock == NULL ) {
1390  KMP_FATAL( LockIsUninitialized, func );
1391  }
1392  }
1393 
1394  KMP_CHECK_USER_LOCK_INIT();
1395 
1396  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1397  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1398  lck = (kmp_user_lock_p)user_lock;
1399  }
1400 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1401  else if ( ( __kmp_user_lock_kind == lk_futex )
1402  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1403  <= OMP_NEST_LOCK_T_SIZE ) ) {
1404  lck = (kmp_user_lock_p)user_lock;
1405  }
1406 #endif
1407  else {
1408  lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1409  }
1410 
1411  INIT_NESTED_LOCK( lck );
1412  __kmp_set_user_lock_location( lck, loc );
1413 
1414 #if USE_ITT_BUILD
1415  __kmp_itt_lock_creating( lck );
1416 #endif /* USE_ITT_BUILD */
1417 } // __kmpc_init_nest_lock
1418 
1419 void
1420 __kmpc_destroy_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1421 
1422  kmp_user_lock_p lck;
1423 
1424  if ( ( __kmp_user_lock_kind == lk_tas )
1425  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1426  lck = (kmp_user_lock_p)user_lock;
1427  }
1428 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1429  else if ( ( __kmp_user_lock_kind == lk_futex )
1430  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1431  lck = (kmp_user_lock_p)user_lock;
1432  }
1433 #endif
1434  else {
1435  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" );
1436  }
1437 
1438 #if USE_ITT_BUILD
1439  __kmp_itt_lock_destroyed( lck );
1440 #endif /* USE_ITT_BUILD */
1441  DESTROY_LOCK( lck );
1442 
1443  if ( ( __kmp_user_lock_kind == lk_tas )
1444  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1445  ;
1446  }
1447 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1448  else if ( ( __kmp_user_lock_kind == lk_futex )
1449  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1450  ;
1451  }
1452 #endif
1453  else {
1454  __kmp_user_lock_free( user_lock, gtid, lck );
1455  }
1456 } // __kmpc_destroy_lock
1457 
1458 /* destroy the lock */
1459 void
1460 __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1461 
1462  kmp_user_lock_p lck;
1463 
1464  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1465  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1466  lck = (kmp_user_lock_p)user_lock;
1467  }
1468 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1469  else if ( ( __kmp_user_lock_kind == lk_futex )
1470  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1471  <= OMP_NEST_LOCK_T_SIZE ) ) {
1472  lck = (kmp_user_lock_p)user_lock;
1473  }
1474 #endif
1475  else {
1476  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" );
1477  }
1478 
1479 #if USE_ITT_BUILD
1480  __kmp_itt_lock_destroyed( lck );
1481 #endif /* USE_ITT_BUILD */
1482 
1483  DESTROY_NESTED_LOCK( lck );
1484 
1485  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1486  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1487  ;
1488  }
1489 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1490  else if ( ( __kmp_user_lock_kind == lk_futex )
1491  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1492  <= OMP_NEST_LOCK_T_SIZE ) ) {
1493  ;
1494  }
1495 #endif
1496  else {
1497  __kmp_user_lock_free( user_lock, gtid, lck );
1498  }
1499 } // __kmpc_destroy_nest_lock
1500 
1501 void
1502 __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1503  KMP_COUNT_BLOCK(OMP_set_lock);
1504  kmp_user_lock_p lck;
1505 
1506  if ( ( __kmp_user_lock_kind == lk_tas )
1507  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1508  lck = (kmp_user_lock_p)user_lock;
1509  }
1510 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1511  else if ( ( __kmp_user_lock_kind == lk_futex )
1512  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1513  lck = (kmp_user_lock_p)user_lock;
1514  }
1515 #endif
1516  else {
1517  lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" );
1518  }
1519 
1520 #if USE_ITT_BUILD
1521  __kmp_itt_lock_acquiring( lck );
1522 #endif /* USE_ITT_BUILD */
1523 
1524  ACQUIRE_LOCK( lck, gtid );
1525 
1526 #if USE_ITT_BUILD
1527  __kmp_itt_lock_acquired( lck );
1528 #endif /* USE_ITT_BUILD */
1529 }
1530 
1531 
1532 void
1533 __kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1534  kmp_user_lock_p lck;
1535 
1536  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1537  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1538  lck = (kmp_user_lock_p)user_lock;
1539  }
1540 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1541  else if ( ( __kmp_user_lock_kind == lk_futex )
1542  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1543  <= OMP_NEST_LOCK_T_SIZE ) ) {
1544  lck = (kmp_user_lock_p)user_lock;
1545  }
1546 #endif
1547  else {
1548  lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" );
1549  }
1550 
1551 #if USE_ITT_BUILD
1552  __kmp_itt_lock_acquiring( lck );
1553 #endif /* USE_ITT_BUILD */
1554 
1555  ACQUIRE_NESTED_LOCK( lck, gtid );
1556 
1557 #if USE_ITT_BUILD
1558  __kmp_itt_lock_acquired( lck );
1559 #endif /* USE_ITT_BUILD */
1560 }
1561 
1562 void
1563 __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
1564 {
1565  kmp_user_lock_p lck;
1566 
1567  /* Can't use serial interval since not block structured */
1568  /* release the lock */
1569 
1570  if ( ( __kmp_user_lock_kind == lk_tas )
1571  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1572 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1573  // "fast" path implemented to fix customer performance issue
1574 #if USE_ITT_BUILD
1575  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
1576 #endif /* USE_ITT_BUILD */
1577  TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
1578  KMP_MB();
1579  return;
1580 #else
1581  lck = (kmp_user_lock_p)user_lock;
1582 #endif
1583  }
1584 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1585  else if ( ( __kmp_user_lock_kind == lk_futex )
1586  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1587  lck = (kmp_user_lock_p)user_lock;
1588  }
1589 #endif
1590  else {
1591  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" );
1592  }
1593 
1594 #if USE_ITT_BUILD
1595  __kmp_itt_lock_releasing( lck );
1596 #endif /* USE_ITT_BUILD */
1597 
1598  RELEASE_LOCK( lck, gtid );
1599 }
1600 
1601 /* release the lock */
1602 void
1603 __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
1604 {
1605  kmp_user_lock_p lck;
1606 
1607  /* Can't use serial interval since not block structured */
1608 
1609  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1610  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1611 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1612  // "fast" path implemented to fix customer performance issue
1613  kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock;
1614 #if USE_ITT_BUILD
1615  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
1616 #endif /* USE_ITT_BUILD */
1617  if ( --(tl->lk.depth_locked) == 0 ) {
1618  TCW_4(tl->lk.poll, 0);
1619  }
1620  KMP_MB();
1621  return;
1622 #else
1623  lck = (kmp_user_lock_p)user_lock;
1624 #endif
1625  }
1626 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1627  else if ( ( __kmp_user_lock_kind == lk_futex )
1628  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1629  <= OMP_NEST_LOCK_T_SIZE ) ) {
1630  lck = (kmp_user_lock_p)user_lock;
1631  }
1632 #endif
1633  else {
1634  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" );
1635  }
1636 
1637 #if USE_ITT_BUILD
1638  __kmp_itt_lock_releasing( lck );
1639 #endif /* USE_ITT_BUILD */
1640 
1641  RELEASE_NESTED_LOCK( lck, gtid );
1642 }
1643 
1644 /* try to acquire the lock */
1645 int
1646 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
1647 {
1648  KMP_COUNT_BLOCK(OMP_test_lock);
1649  KMP_TIME_BLOCK(OMP_test_lock);
1650  kmp_user_lock_p lck;
1651  int rc;
1652 
1653  if ( ( __kmp_user_lock_kind == lk_tas )
1654  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1655  lck = (kmp_user_lock_p)user_lock;
1656  }
1657 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1658  else if ( ( __kmp_user_lock_kind == lk_futex )
1659  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1660  lck = (kmp_user_lock_p)user_lock;
1661  }
1662 #endif
1663  else {
1664  lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" );
1665  }
1666 
1667 #if USE_ITT_BUILD
1668  __kmp_itt_lock_acquiring( lck );
1669 #endif /* USE_ITT_BUILD */
1670 
1671  rc = TEST_LOCK( lck, gtid );
1672 #if USE_ITT_BUILD
1673  if ( rc ) {
1674  __kmp_itt_lock_acquired( lck );
1675  } else {
1676  __kmp_itt_lock_cancelled( lck );
1677  }
1678 #endif /* USE_ITT_BUILD */
1679  return ( rc ? FTN_TRUE : FTN_FALSE );
1680 
1681  /* Can't use serial interval since not block structured */
1682 }
1683 
1684 /* try to acquire the lock */
1685 int
1686 __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
1687 {
1688  kmp_user_lock_p lck;
1689  int rc;
1690 
1691  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1692  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1693  lck = (kmp_user_lock_p)user_lock;
1694  }
1695 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
1696  else if ( ( __kmp_user_lock_kind == lk_futex )
1697  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1698  <= OMP_NEST_LOCK_T_SIZE ) ) {
1699  lck = (kmp_user_lock_p)user_lock;
1700  }
1701 #endif
1702  else {
1703  lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" );
1704  }
1705 
1706 #if USE_ITT_BUILD
1707  __kmp_itt_lock_acquiring( lck );
1708 #endif /* USE_ITT_BUILD */
1709 
1710  rc = TEST_NESTED_LOCK( lck, gtid );
1711 #if USE_ITT_BUILD
1712  if ( rc ) {
1713  __kmp_itt_lock_acquired( lck );
1714  } else {
1715  __kmp_itt_lock_cancelled( lck );
1716  }
1717 #endif /* USE_ITT_BUILD */
1718  return rc;
1719 
1720  /* Can't use serial interval since not block structured */
1721 }
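/*
 * Illustrative sketch (assumed typical use; some compilers instead call the omp_*_lock
 * entry points directly, which are implemented on top of the same lock machinery):
 *
 *     omp_lock_t lk;                              // opaque user lock storage
 *     __kmpc_init_lock(&loc, gtid, (void **)&lk);
 *     __kmpc_set_lock(&loc, gtid, (void **)&lk);
 *     ...                                         // exclusive work
 *     __kmpc_unset_lock(&loc, gtid, (void **)&lk);
 *     __kmpc_destroy_lock(&loc, gtid, (void **)&lk);
 */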
1722 
1723 
1724 /*--------------------------------------------------------------------------------------------------------------------*/
1725 
1726 /*
1727  * Interface to fast scalable reduce methods routines
1728  */
1729 
1730 // keep the selected method in a thread local structure for cross-function usage: will be used in __kmpc_end_reduce* functions;
1731 // another solution: to re-determine the method one more time in __kmpc_end_reduce* functions (new prototype required then)
1732 // AT: which solution is better?
1733 #define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \
1734  ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) )
1735 
1736 #define __KMP_GET_REDUCTION_METHOD(gtid) \
1737  ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method )
1738 
1739 // description of the packed_reduction_method variable: look at the macros in kmp.h
1740 
1741 
1742 // used in a critical section reduce block
1743 static __forceinline void
1744 __kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
1745 
1746  // this lock was visible to a customer and to the thread profiler as a serial overhead span
1747  // (although it's used for an internal purpose only)
1748  // why was it visible in previous implementation?
1749  // should we keep it visible in new reduce block?
1750  kmp_user_lock_p lck;
1751 
1752  // We know that the fast reduction code is only emitted by Intel compilers
1753  // with 32 byte critical sections. If there isn't enough space, then we
1754  // have to use a pointer.
1755  if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) {
1756  lck = (kmp_user_lock_p)crit;
1757  }
1758  else {
1759  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
1760  }
1761  KMP_DEBUG_ASSERT( lck != NULL );
1762 
1763  if ( __kmp_env_consistency_check )
1764  __kmp_push_sync( global_tid, ct_critical, loc, lck );
1765 
1766  __kmp_acquire_user_lock_with_checks( lck, global_tid );
1767 }
1768 
1769 // used in a critical section reduce block
1770 static __forceinline void
1771 __kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
1772 
1773  kmp_user_lock_p lck;
1774 
1775  // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical
1776  // sections. If there isn't enough space, then we have to use a pointer.
1777  if ( __kmp_base_user_lock_size > 32 ) {
1778  lck = *( (kmp_user_lock_p *) crit );
1779  KMP_ASSERT( lck != NULL );
1780  } else {
1781  lck = (kmp_user_lock_p) crit;
1782  }
1783 
1784  if ( __kmp_env_consistency_check )
1785  __kmp_pop_sync( global_tid, ct_critical, loc );
1786 
1787  __kmp_release_user_lock_with_checks( lck, global_tid );
1788 
1789 } // __kmp_end_critical_section_reduce_block
1790 
1791 
1792 /* 2.a.i. Reduce Block without a terminating barrier */
1806 kmp_int32
1807 __kmpc_reduce_nowait(
1808  ident_t *loc, kmp_int32 global_tid,
1809  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
1810  kmp_critical_name *lck ) {
1811 
1812  KMP_COUNT_BLOCK(REDUCE_nowait);
1813  int retval;
1814  PACKED_REDUCTION_METHOD_T packed_reduction_method;
1815 #if OMP_40_ENABLED
1816  kmp_team_t *team;
1817  kmp_info_t *th;
1818  int teams_swapped = 0, task_state;
1819 #endif
1820  KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
1821 
1822  // why do we need this initialization here at all?
1823  // Reduction clause can not be used as a stand-alone directive.
1824 
1825  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
1826  // possible detection of false-positive race by the threadchecker ???
1827  if( ! TCR_4( __kmp_init_parallel ) )
1828  __kmp_parallel_initialize();
1829 
1830  // check correctness of reduce block nesting
1831  if ( __kmp_env_consistency_check )
1832  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
1833 
1834 #if OMP_40_ENABLED
1835  th = __kmp_thread_from_gtid(global_tid);
1836  if( th->th.th_teams_microtask ) { // AC: check if we are inside the teams construct?
1837  team = th->th.th_team;
1838  if( team->t.t_level == th->th.th_teams_level ) {
1839  // this is reduction at teams construct
1840  KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
1841  // Let's swap teams temporarily for the reduction barrier
1842  teams_swapped = 1;
1843  th->th.th_info.ds.ds_tid = team->t.t_master_tid;
1844  th->th.th_team = team->t.t_parent;
1845  th->th.th_task_team = th->th.th_team->t.t_task_team;
1846  th->th.th_team_nproc = th->th.th_team->t.t_nproc;
1847  task_state = th->th.th_task_state;
1848  if( th->th.th_task_team )
1849  th->th.th_task_state = th->th.th_task_team->tt.tt_state;
1850  }
1851  }
1852 #endif // OMP_40_ENABLED
1853 
1854  // packed_reduction_method value will be reused by __kmp_end_reduce* function, the value should be kept in a variable
1855  // the variable should be either a construct-specific or thread-specific property, not a team specific property
1856  // (a thread can reach the next reduce block on the next construct, reduce method may differ on the next construct)
1857  // an ident_t "loc" parameter could be used as a construct-specific property (what if loc == 0?)
1858  // (if both construct-specific and team-specific variables were shared, then unnecessary extra syncs would be needed)
1859  // a thread-specific variable is better regarding two issues above (next construct and extra syncs)
1860  // a thread-specific "th_local.reduction_method" variable is used currently
1861  // each thread executes the 'determine' and 'set' lines (no need to restrict this to one thread, which avoids unnecessary extra syncs)
1862 
1863  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
1864  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
1865 
1866  if( packed_reduction_method == critical_reduce_block ) {
1867 
1868  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
1869  retval = 1;
1870 
1871  } else if( packed_reduction_method == empty_reduce_block ) {
1872 
1873  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
1874  retval = 1;
1875 
1876  } else if( packed_reduction_method == atomic_reduce_block ) {
1877 
1878  retval = 2;
1879 
1880  // all threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen)
1881  // (it's not quite good, because the checking block has been closed by this 'pop',
1882  // but atomic operation has not been executed yet, will be executed slightly later, literally on next instruction)
1883  if ( __kmp_env_consistency_check )
1884  __kmp_pop_sync( global_tid, ct_reduce, loc );
1885 
1886  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
1887 
1888  //AT: performance issue: a real barrier here
1889  //AT: (if master goes slow, other threads are blocked here waiting for the master to come and release them)
1890  //AT: (it's not what a customer might expect specifying NOWAIT clause)
1891  //AT: (specifying NOWAIT won't result in improvement of performance, it'll be confusing to a customer)
1892  //AT: another implementation of *barrier_gather*nowait() (or some other design) might go faster
1893  // and be more in line with sense of NOWAIT
1894  //AT: TO DO: do epcc test and compare times
1895 
1896  // this barrier should be invisible to a customer and to the thread profiler
1897  // (it's neither a terminating barrier nor customer's code, it's used for an internal purpose)
1898 #if USE_ITT_NOTIFY
1899  __kmp_threads[global_tid]->th.th_ident = loc;
1900 #endif
1901  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
1902  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
1903 
1904  // all other workers except master should do this pop here
1905  // ( none of other workers will get to __kmpc_end_reduce_nowait() )
1906  if ( __kmp_env_consistency_check ) {
1907  if( retval == 0 ) {
1908  __kmp_pop_sync( global_tid, ct_reduce, loc );
1909  }
1910  }
1911 
1912  } else {
1913 
1914  // should never reach this block
1915  KMP_ASSERT( 0 ); // "unexpected method"
1916 
1917  }
1918 #if OMP_40_ENABLED
1919  if( teams_swapped ) {
1920  // Restore thread structure
1921  th->th.th_info.ds.ds_tid = 0;
1922  th->th.th_team = team;
1923  th->th.th_task_team = team->t.t_task_team;
1924  th->th.th_team_nproc = team->t.t_nproc;
1925  th->th.th_task_state = task_state;
1926  }
1927 #endif
1928  KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
1929 
1930  return retval;
1931 }
1932 
1941 void
1942 __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
1943 
1944  PACKED_REDUCTION_METHOD_T packed_reduction_method;
1945 
1946  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) );
1947 
1948  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
1949 
1950  if( packed_reduction_method == critical_reduce_block ) {
1951 
1952  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
1953 
1954  } else if( packed_reduction_method == empty_reduce_block ) {
1955 
1956  // usage: if team size == 1, no synchronization is required ( on Intel platforms only )
1957 
1958  } else if( packed_reduction_method == atomic_reduce_block ) {
1959 
1960  // neither master nor other workers should get here
1961  // (code gen does not generate this call in case 2: atomic reduce block)
1962  // actually it's better to remove this elseif at all;
1963  // after removal this value will checked by the 'else' and will assert
1964 
1965  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
1966 
1967  // only master gets here
1968 
1969  } else {
1970 
1971  // should never reach this block
1972  KMP_ASSERT( 0 ); // "unexpected method"
1973 
1974  }
1975 
1976  if ( __kmp_env_consistency_check )
1977  __kmp_pop_sync( global_tid, ct_reduce, loc );
1978 
1979  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
1980 
1981  return;
1982 }
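/*
 * Illustrative sketch of a reduction lowering (assumed typical pattern; "s" is the
 * reduction variable, "s_priv" a thread-private copy, and "add_fn" a hypothetical
 * compiler-generated combiner matching the reduce_func prototype):
 *
 *     // #pragma omp for reduction(+:s) nowait
 *     static kmp_critical_name red_lck = { 0 };
 *     ...
 *     switch (__kmpc_reduce_nowait(&loc, gtid, 1, sizeof(s_priv), &s_priv, add_fn, &red_lck)) {
 *     case 1:                                  // critical / tree path: combine, then end
 *         s += s_priv;
 *         __kmpc_end_reduce_nowait(&loc, gtid, &red_lck);
 *         break;
 *     case 2:                                  // atomic path: combine with atomics
 *         s += s_priv;
 *         break;
 *     default:                                 // 0: nothing left to do on this thread
 *         break;
 *     }
 */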
1983 
1984 /* 2.a.ii. Reduce Block with a terminating barrier */
1985 
1999 kmp_int32
2000 __kmpc_reduce(
2001  ident_t *loc, kmp_int32 global_tid,
2002  kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
2003  void (*reduce_func)(void *lhs_data, void *rhs_data),
2004  kmp_critical_name *lck )
2005 {
2006  KMP_COUNT_BLOCK(REDUCE_wait);
2007  int retval;
2008  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2009 
2010  KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) );
2011 
2012  // why do we need this initialization here at all?
2013  // A reduction clause cannot be a stand-alone directive.
2014 
2015  // do not call __kmp_serial_initialize(); it will be called by __kmp_parallel_initialize() if needed
2016  // possible false-positive race detection by the thread checker ???
2017  if( ! TCR_4( __kmp_init_parallel ) )
2018  __kmp_parallel_initialize();
2019 
2020  // check correctness of reduce block nesting
2021  if ( __kmp_env_consistency_check )
2022  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2023 
2024  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2025  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2026 
2027  if( packed_reduction_method == critical_reduce_block ) {
2028 
2029  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2030  retval = 1;
2031 
2032  } else if( packed_reduction_method == empty_reduce_block ) {
2033 
2034  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2035  retval = 1;
2036 
2037  } else if( packed_reduction_method == atomic_reduce_block ) {
2038 
2039  retval = 2;
2040 
2041  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2042 
2043  //case tree_reduce_block:
2044  // this barrier should be visible to a customer and to the thread profiler
2045  // (it's a terminating barrier on constructs if NOWAIT is not specified)
2046 #if USE_ITT_NOTIFY
2047  __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames
2048 #endif
2049  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
2050  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2051 
2052  // all workers except the master should do this pop here
2053  // ( no worker other than the master will enter __kmpc_end_reduce() )
2054  if ( __kmp_env_consistency_check ) {
2055  if( retval == 0 ) { // 0: all other workers; 1: master
2056  __kmp_pop_sync( global_tid, ct_reduce, loc );
2057  }
2058  }
2059 
2060  } else {
2061 
2062  // should never reach this block
2063  KMP_ASSERT( 0 ); // "unexpected method"
2064 
2065  }
2066 
2067  KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2068 
2069  return retval;
2070 }
2071 
2081 void
2082 __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2083 
2084  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2085 
2086  KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) );
2087 
2088  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2089 
2090  // this barrier should be visible to a customer and to the thread profiler
2091  // (it's a terminating barrier on constructs if NOWAIT is not specified)
2092 
2093  if( packed_reduction_method == critical_reduce_block ) {
2094 
2095  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2096 
2097  // TODO: implicit barrier: should be exposed
2098 #if USE_ITT_NOTIFY
2099  __kmp_threads[global_tid]->th.th_ident = loc;
2100 #endif
2101  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2102 
2103  } else if( packed_reduction_method == empty_reduce_block ) {
2104 
2105  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2106 
2107  // TODO: implicit barrier: should be exposed
2108 #if USE_ITT_NOTIFY
2109  __kmp_threads[global_tid]->th.th_ident = loc;
2110 #endif
2111  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2112 
2113  } else if( packed_reduction_method == atomic_reduce_block ) {
2114 
2115  // TODO: implicit barrier: should be exposed
2116 #if USE_ITT_NOTIFY
2117  __kmp_threads[global_tid]->th.th_ident = loc;
2118 #endif
2119  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2120 
2121  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2122 
2123  // only master executes here (master releases all other workers)
2124  __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid );
2125 
2126  } else {
2127 
2128  // should never reach this block
2129  KMP_ASSERT( 0 ); // "unexpected method"
2130 
2131  }
2132 
2133  if ( __kmp_env_consistency_check )
2134  __kmp_pop_sync( global_tid, ct_reduce, loc );
2135 
2136  KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2137 
2138  return;
2139 }
2140 
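A matching sketch for the blocking pair above (again editor-added and hypothetical, reusing my_reduce_func from the earlier sketch). The notable differences from the nowait pattern are that the tree method's barrier is terminating, and that __kmpc_end_reduce() is also expected on the atomic path so that every thread reaches the plain barrier executed in that branch.

/* Editor's sketch (hypothetical, not part of kmp_csupport.c): compiler-style
 * translation of "reduction(+:sum)" without nowait. */
static void
reduce_example( ident_t *loc, kmp_critical_name *crit, double *sum, double my_sum )
{
    kmp_int32 gtid = __kmpc_global_thread_num( loc );

    switch ( __kmpc_reduce( loc, gtid, 1 /* num_vars */, sizeof( double ),
                            &my_sum, my_reduce_func, crit ) ) {
        case 1:  /* combined result available; finish and release the other threads */
            *sum += my_sum;
            __kmpc_end_reduce( loc, gtid, crit );
            break;
        case 2: { /* atomic path: combine, then join the terminating barrier */
            #pragma omp atomic
            *sum += my_sum;
            __kmpc_end_reduce( loc, gtid, crit );
            break;
        }
        default: /* 0: a tree-reduce worker; its wait already happened inside __kmpc_reduce() */
            break;
    }
}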
2141 #undef __KMP_GET_REDUCTION_METHOD
2142 #undef __KMP_SET_REDUCTION_METHOD
2143 
2144 /*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/
2145 
2146 kmp_uint64
2147 __kmpc_get_taskid() {
2148 
2149  kmp_int32 gtid;
2150  kmp_info_t * thread;
2151 
2152  gtid = __kmp_get_gtid();
2153  if ( gtid < 0 ) {
2154  return 0;
2155  }; // if
2156  thread = __kmp_thread_from_gtid( gtid );
2157  return thread->th.th_current_task->td_task_id;
2158 
2159 } // __kmpc_get_taskid
2160 
2161 
2162 kmp_uint64
2163 __kmpc_get_parent_taskid() {
2164 
2165  kmp_int32 gtid;
2166  kmp_info_t * thread;
2167  kmp_taskdata_t * parent_task;
2168 
2169  gtid = __kmp_get_gtid();
2170  if ( gtid < 0 ) {
2171  return 0;
2172  }; // if
2173  thread = __kmp_thread_from_gtid( gtid );
2174  parent_task = thread->th.th_current_task->td_parent;
2175  return ( parent_task == NULL ? 0 : parent_task->td_task_id );
2176 
2177 } // __kmpc_get_parent_taskid
2178 
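The two task-id queries above can be exercised with a small debugging helper such as the following sketch (editor-added, hypothetical, not part of this file); both queries return 0 when the caller is not registered with the runtime, so the helper is also safe on non-OpenMP threads.

/* Editor's sketch (hypothetical, not part of kmp_csupport.c): debug helper that
 * logs the current and parent task ids using only the two queries above. */
#include <stdio.h>

static void
print_task_ids( const char *where )
{
    kmp_uint64 tid  = __kmpc_get_taskid();        /* 0 if gtid < 0 */
    kmp_uint64 ptid = __kmpc_get_parent_taskid(); /* 0 if no parent or gtid < 0 */
    printf( "%s: task %llu (parent %llu)\n", where,
            (unsigned long long) tid, (unsigned long long) ptid );
}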
2179 void __kmpc_place_threads(int nC, int nT, int nO)
2180 {
2181 #if KMP_MIC
2182  if ( ! __kmp_init_serial ) {
2183  __kmp_serial_initialize();
2184  }
2185  __kmp_place_num_cores = nC;
2186  __kmp_place_num_threads_per_core = nT;
2187  __kmp_place_core_offset = nO;
2188 #endif
2189 }
2190 
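A short usage sketch for the placement hook above (editor-added, hypothetical, not part of this file); the values are arbitrary examples, the call compiles to a no-op on builds where KMP_MIC is not defined, and it is intended to run before the first parallel region so the placement settings can take effect.

/* Editor's sketch (hypothetical, not part of kmp_csupport.c): restrict an
 * Intel Xeon Phi run to 60 cores with 4 threads per core, starting at core 0. */
static void
place_threads_example( void )
{
    __kmpc_place_threads( 60 /* nC: cores */, 4 /* nT: threads per core */, 0 /* nO: core offset */ );
}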
2191 // end of file //
2192 