Intel® OpenMP* Runtime Library
kmp_taskq.c
1 /*
2  * kmp_taskq.c -- TASKQ support for OpenMP.
3  * $Revision: 42582 $
4  * $Date: 2013-08-09 06:30:22 -0500 (Fri, 09 Aug 2013) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2013 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "kmp.h"
38 #include "kmp_i18n.h"
39 #include "kmp_io.h"
40 #include "kmp_error.h"
41 
42 #define MAX_MESSAGE 512
43 
44 /* ------------------------------------------------------------------------ */
45 /* ------------------------------------------------------------------------ */
46 
47 /*
48  * Taskq routines and global variables
49  */
50 
51 #define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x);
52 
53 #define THREAD_ALLOC_FOR_TASKQ
54 
55 static void
56 __kmp_static_delay( int arg )
57 {
58 /* Work around weird code-gen bug that causes assert to trip */
59 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
60  KMP_ASSERT( arg != 0 );
61 #else
62  KMP_ASSERT( arg >= 0 );
63 #endif
64 }
65 
66 static void
67 __kmp_static_yield( int arg )
68 {
69  __kmp_yield( arg );
70 }
71 
72 static int
73 in_parallel_context( kmp_team_t *team )
74 {
75  return ! team -> t.t_serialized;
76 }
77 
78 static void
79 __kmp_taskq_eo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
80 {
81  int gtid = *gtid_ref;
82  int tid = __kmp_tid_from_gtid( gtid );
83  kmp_uint32 spins;
84  kmp_uint32 my_token;
85  kmpc_task_queue_t *taskq;
86  kmp_taskq_t *tq = & __kmp_threads[gtid] -> th.th_team -> t.t_taskq;
87 
88  if ( __kmp_env_consistency_check )
89  __kmp_push_sync( gtid, ct_ordered_in_taskq, loc_ref, NULL );
90 
91  if ( ! __kmp_threads[ gtid ]-> th.th_team -> t.t_serialized ) {
92  KMP_MB(); /* Flush all pending memory write invalidates. */
93 
94  /* GEH - need check here under stats to make sure */
95  /* inside task (curr_thunk[*tid_ref] != NULL) */
96 
97  my_token = tq->tq_curr_thunk[ tid ]-> th_tasknum;
98 
99  taskq = tq->tq_curr_thunk[ tid ]-> th.th_shareds -> sv_queue;
100 
101  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
102  KMP_MB();
103  }
104 }
105 
106 static void
107 __kmp_taskq_xo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
108 {
109  int gtid = *gtid_ref;
110  int tid = __kmp_tid_from_gtid( gtid );
111  kmp_uint32 my_token;
112  kmp_taskq_t *tq = & __kmp_threads[gtid] -> th.th_team -> t.t_taskq;
113 
114  if ( __kmp_env_consistency_check )
115  __kmp_pop_sync( gtid, ct_ordered_in_taskq, loc_ref );
116 
117  if ( ! __kmp_threads[ gtid ]-> th.th_team -> t.t_serialized ) {
118  KMP_MB(); /* Flush all pending memory write invalidates. */
119 
120  /* GEH - need check here under stats to make sure */
121  /* inside task (curr_thunk[tid] != NULL) */
122 
123  my_token = tq->tq_curr_thunk[ tid ]->th_tasknum;
124 
125  KMP_MB(); /* Flush all pending memory write invalidates. */
126 
127  tq->tq_curr_thunk[ tid ]-> th.th_shareds -> sv_queue -> tq_tasknum_serving = my_token + 1;
128 
129  KMP_MB(); /* Flush all pending memory write invalidates. */
130  }
131 }
132 
133 static void
134 __kmp_taskq_check_ordered( kmp_int32 gtid, kmpc_thunk_t *thunk )
135 {
136  kmp_uint32 spins;
137  kmp_uint32 my_token;
138  kmpc_task_queue_t *taskq;
139 
140  /* assume we are always called from an active parallel context */
141 
142  KMP_MB(); /* Flush all pending memory write invalidates. */
143 
144  my_token = thunk -> th_tasknum;
145 
146  taskq = thunk -> th.th_shareds -> sv_queue;
147 
148  if(taskq->tq_tasknum_serving <= my_token) {
149  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
150  KMP_MB();
151  taskq->tq_tasknum_serving = my_token + 1;
152  KMP_MB();
153  }
154 }
155 
156 static void
157 __kmp_dump_TQF(kmp_int32 flags)
158 {
159  if (flags & TQF_IS_ORDERED)
160  __kmp_printf("ORDERED ");
161  if (flags & TQF_IS_LASTPRIVATE)
162  __kmp_printf("LAST_PRIV ");
163  if (flags & TQF_IS_NOWAIT)
164  __kmp_printf("NOWAIT ");
165  if (flags & TQF_HEURISTICS)
166  __kmp_printf("HEURIST ");
167  if (flags & TQF_INTERFACE_RESERVED1)
168  __kmp_printf("RESERV1 ");
169  if (flags & TQF_INTERFACE_RESERVED2)
170  __kmp_printf("RESERV2 ");
171  if (flags & TQF_INTERFACE_RESERVED3)
172  __kmp_printf("RESERV3 ");
173  if (flags & TQF_INTERFACE_RESERVED4)
174  __kmp_printf("RESERV4 ");
175  if (flags & TQF_IS_LAST_TASK)
176  __kmp_printf("LAST_TASK ");
177  if (flags & TQF_TASKQ_TASK)
178  __kmp_printf("TASKQ_TASK ");
179  if (flags & TQF_RELEASE_WORKERS)
180  __kmp_printf("RELEASE ");
181  if (flags & TQF_ALL_TASKS_QUEUED)
182  __kmp_printf("ALL_QUEUED ");
183  if (flags & TQF_PARALLEL_CONTEXT)
184  __kmp_printf("PARALLEL ");
185  if (flags & TQF_DEALLOCATED)
186  __kmp_printf("DEALLOC ");
187  if (!(flags & (TQF_INTERNAL_FLAGS|TQF_INTERFACE_FLAGS)))
188  __kmp_printf("(NONE)");
189 }
190 
191 static void
192 __kmp_dump_thunk( kmp_taskq_t *tq, kmpc_thunk_t *thunk, kmp_int32 global_tid )
193 {
194  int i;
195  int nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
196 
197  __kmp_printf("\tThunk at %p on (%d): ", thunk, global_tid);
198 
199  if (thunk != NULL) {
200  for (i = 0; i < nproc; i++) {
201  if( tq->tq_curr_thunk[i] == thunk ) {
202  __kmp_printf("[%i] ", i);
203  }
204  }
205  __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds);
206  __kmp_printf("th_task=%p, ", thunk->th_task);
207  __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk);
208  __kmp_printf("th_status=%d, ", thunk->th_status);
209  __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum);
210  __kmp_printf("th_flags="); __kmp_dump_TQF(thunk->th_flags);
211  }
212 
213  __kmp_printf("\n");
214 }
215 
216 static void
217 __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num)
218 {
219  kmpc_thunk_t *th;
220 
221  __kmp_printf(" Thunk stack for T#%d: ", thread_num);
222 
223  for (th = thunk; th != NULL; th = th->th_encl_thunk )
224  __kmp_printf("%p ", th);
225 
226  __kmp_printf("\n");
227 }
228 
229 static void
230 __kmp_dump_task_queue( kmp_taskq_t *tq, kmpc_task_queue_t *queue, kmp_int32 global_tid )
231 {
232  int qs, count, i;
233  kmpc_thunk_t *thunk;
234  kmpc_task_queue_t *taskq;
235 
236  __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid);
237 
238  if (queue != NULL) {
239  int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT;
240 
241  if ( __kmp_env_consistency_check ) {
242  __kmp_printf(" tq_loc : ");
243  }
244  if (in_parallel) {
245 
246  //if (queue->tq.tq_parent != 0)
247  //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
248 
249  //__kmp_acquire_lock(& queue->tq_link_lck, global_tid);
250 
251  KMP_MB(); /* make sure data structures are in consistent state before querying them */
252  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
253 
254  __kmp_printf(" tq_parent : %p\n", queue->tq.tq_parent);
255  __kmp_printf(" tq_first_child : %p\n", queue->tq_first_child);
256  __kmp_printf(" tq_next_child : %p\n", queue->tq_next_child);
257  __kmp_printf(" tq_prev_child : %p\n", queue->tq_prev_child);
258  __kmp_printf(" tq_ref_count : %d\n", queue->tq_ref_count);
259 
260  //__kmp_release_lock(& queue->tq_link_lck, global_tid);
261 
262  //if (queue->tq.tq_parent != 0)
263  //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
264 
265  //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
266  //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
267 
268  KMP_MB(); /* make sure data structures are in consistent state before querying them */
269  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
270  }
271 
272  __kmp_printf(" tq_shareds : ");
273  for (i=0; i<((queue == tq->tq_root) ? queue->tq_nproc : 1); i++)
274  __kmp_printf("%p ", queue->tq_shareds[i].ai_data);
275  __kmp_printf("\n");
276 
277  if (in_parallel) {
278  __kmp_printf(" tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing);
279  __kmp_printf(" tq_tasknum_serving : %u\n", queue->tq_tasknum_serving);
280  }
281 
282  __kmp_printf(" tq_queue : %p\n", queue->tq_queue);
283  __kmp_printf(" tq_thunk_space : %p\n", queue->tq_thunk_space);
284  __kmp_printf(" tq_taskq_slot : %p\n", queue->tq_taskq_slot);
285 
286  __kmp_printf(" tq_free_thunks : ");
287  for (thunk = queue->tq_free_thunks; thunk != NULL; thunk = thunk->th.th_next_free )
288  __kmp_printf("%p ", thunk);
289  __kmp_printf("\n");
290 
291  __kmp_printf(" tq_nslots : %d\n", queue->tq_nslots);
292  __kmp_printf(" tq_head : %d\n", queue->tq_head);
293  __kmp_printf(" tq_tail : %d\n", queue->tq_tail);
294  __kmp_printf(" tq_nfull : %d\n", queue->tq_nfull);
295  __kmp_printf(" tq_hiwat : %d\n", queue->tq_hiwat);
296  __kmp_printf(" tq_flags : "); __kmp_dump_TQF(queue->tq_flags);
297  __kmp_printf("\n");
298 
299  if (in_parallel) {
300  __kmp_printf(" tq_th_thunks : ");
301  for (i = 0; i < queue->tq_nproc; i++) {
302  __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data);
303  }
304  __kmp_printf("\n");
305  }
306 
307  __kmp_printf("\n");
308  __kmp_printf(" Queue slots:\n");
309 
310 
311  qs = queue->tq_tail;
312  for ( count = 0; count < queue->tq_nfull; ++count ) {
313  __kmp_printf("(%d)", qs);
314  __kmp_dump_thunk( tq, queue->tq_queue[qs].qs_thunk, global_tid );
315  qs = (qs+1) % queue->tq_nslots;
316  }
317 
318  __kmp_printf("\n");
319 
320  if (in_parallel) {
321  if (queue->tq_taskq_slot != NULL) {
322  __kmp_printf(" TaskQ slot:\n");
323  __kmp_dump_thunk( tq, (kmpc_thunk_t *) queue->tq_taskq_slot, global_tid );
324  __kmp_printf("\n");
325  }
326  //__kmp_release_lock(& queue->tq_queue_lck, global_tid);
327  //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
328  }
329  }
330 
331  __kmp_printf(" Taskq freelist: ");
332 
333  //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
334 
335  KMP_MB(); /* make sure data structures are in consistent state before querying them */
336  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
337 
338  for( taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free )
339  __kmp_printf("%p ", taskq);
340 
341  //__kmp_release_lock( & tq->tq_freelist_lck, global_tid );
342 
343  __kmp_printf("\n\n");
344 }
345 
346 static void
347 __kmp_aux_dump_task_queue_tree( kmp_taskq_t *tq, kmpc_task_queue_t *curr_queue, kmp_int32 level, kmp_int32 global_tid )
348 {
349  int i, count, qs;
350  int nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
351  kmpc_task_queue_t *queue = curr_queue;
352 
353  if (curr_queue == NULL)
354  return;
355 
356  __kmp_printf(" ");
357 
358  for (i=0; i<level; i++)
359  __kmp_printf(" ");
360 
361  __kmp_printf("%p", curr_queue);
362 
363  for (i = 0; i < nproc; i++) {
364  if( tq->tq_curr_thunk[i] && tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue ) {
365  __kmp_printf(" [%i]", i);
366  }
367  }
368 
369  __kmp_printf(":");
370 
371  //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid);
372 
373  KMP_MB(); /* make sure data structures are in consistent state before querying them */
374  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
375 
376  qs = curr_queue->tq_tail;
377 
378  for ( count = 0; count < curr_queue->tq_nfull; ++count ) {
379  __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk);
380  qs = (qs+1) % curr_queue->tq_nslots;
381  }
382 
383  //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid);
384 
385  __kmp_printf("\n");
386 
387  if (curr_queue->tq_first_child) {
388  //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
389 
390  KMP_MB(); /* make sure data structures are in consistent state before querying them */
391  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
392 
393  if (curr_queue->tq_first_child) {
394  for(queue = (kmpc_task_queue_t *)curr_queue->tq_first_child;
395  queue != NULL;
396  queue = queue->tq_next_child) {
397  __kmp_aux_dump_task_queue_tree( tq, queue, level+1, global_tid );
398  }
399  }
400 
401  //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
402  }
403 }
404 
405 static void
406 __kmp_dump_task_queue_tree( kmp_taskq_t *tq, kmpc_task_queue_t *tqroot, kmp_int32 global_tid)
407 {
408  __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid);
409 
410  __kmp_aux_dump_task_queue_tree( tq, tqroot, 0, global_tid );
411 
412  __kmp_printf("\n");
413 }
414 
415 /* --------------------------------------------------------------------------- */
416 
417 /*
418  New taskq storage routines that try to minimize overhead of mallocs but
419  still provide cache line alignment.
420 */
421 
422 
423 static void *
424 __kmp_taskq_allocate(size_t size, kmp_int32 global_tid)
425 {
426  void *addr, *orig_addr;
427  size_t bytes;
428 
429  KB_TRACE( 5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int) size, global_tid ) );
430 
431  bytes = sizeof(void *) + CACHE_LINE + size;
432 
433 #ifdef THREAD_ALLOC_FOR_TASKQ
434  orig_addr = (void *) __kmp_thread_malloc( __kmp_thread_from_gtid(global_tid), bytes );
435 #else
436  KE_TRACE( 10, ("%%%%%% MALLOC( %d )\n", bytes ) );
437  orig_addr = (void *) KMP_INTERNAL_MALLOC( bytes );
438 #endif /* THREAD_ALLOC_FOR_TASKQ */
439 
440  if (orig_addr == 0)
441  KMP_FATAL( OutOfHeapMemory );
442 
443  addr = orig_addr;
444 
445  if (((kmp_uintptr_t) addr & ( CACHE_LINE - 1 )) != 0) {
446  KB_TRACE( 50, ("__kmp_taskq_allocate: adjust for cache alignment\n" ) );
447  addr = (void *) (((kmp_uintptr_t) addr + CACHE_LINE) & ~( CACHE_LINE - 1 ));
448  }
449 
450  (* (void **) addr) = orig_addr;
451 
452  KB_TRACE( 10, ("__kmp_taskq_allocate: allocate: %p, use: %p - %p, size: %d, gtid: %d\n",
453  orig_addr, ((void **) addr) + 1, ((char *)(((void **) addr) + 1)) + size-1,
454  (int) size, global_tid ));
455 
456  return ( ((void **) addr) + 1 );
457 }
458 
459 static void
460 __kmpc_taskq_free(void *p, kmp_int32 global_tid)
461 {
462  KB_TRACE( 5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid ) );
463 
464  KB_TRACE(10, ("__kmpc_taskq_free: freeing: %p, gtid: %d\n", (*( ((void **) p)-1)), global_tid ));
465 
466 #ifdef THREAD_ALLOC_FOR_TASKQ
467  __kmp_thread_free( __kmp_thread_from_gtid(global_tid), *( ((void **) p)-1) );
468 #else
469  KMP_INTERNAL_FREE( *( ((void **) p)-1) );
470 #endif /* THREAD_ALLOC_FOR_TASKQ */
471 }
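/*
 * Illustrative sketch (kept under #if 0, not compiled into the runtime):
 * the same stash-the-original-pointer alignment scheme used by
 * __kmp_taskq_allocate / __kmpc_taskq_free above, written against plain
 * malloc/free with an assumed 64-byte cache line so it can be read in
 * isolation.  example_aligned_alloc/example_aligned_free are hypothetical
 * names that exist only for this sketch.
 */
#if 0
#include <stdint.h>
#include <stdlib.h>

#define EXAMPLE_CACHE_LINE 64       /* assumed cache line size for the sketch */

static void *
example_aligned_alloc( size_t size )
{
    /* one extra pointer slot to remember the block malloc returned,
       plus a full cache line of slack so the address can be rounded up */
    void *orig = malloc( sizeof(void *) + EXAMPLE_CACHE_LINE + size );
    if (orig == NULL)
        return NULL;

    uintptr_t p = (uintptr_t) orig;
    if (p & (EXAMPLE_CACHE_LINE - 1))
        p = (p + EXAMPLE_CACHE_LINE) & ~(uintptr_t)(EXAMPLE_CACHE_LINE - 1);

    /* the word just below the returned address holds the original pointer */
    ((void **) p)[0] = orig;
    return (void *) (((void **) p) + 1);
}

static void
example_aligned_free( void *p )
{
    /* read back the original pointer stored one slot before p */
    free( ((void **) p)[-1] );
}
#endif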
472 
473 /* --------------------------------------------------------------------------- */
474 
475 /*
476  * Keep freed kmpc_task_queue_t on an internal freelist and recycle since
477  * they're of constant size.
478  */
479 
480 static kmpc_task_queue_t *
481 __kmp_alloc_taskq ( kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots, kmp_int32 nthunks,
482  kmp_int32 nshareds, kmp_int32 nproc, size_t sizeof_thunk,
483  size_t sizeof_shareds, kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid )
484 {
485  kmp_int32 i;
486  size_t bytes;
487  kmpc_task_queue_t *new_queue;
488  kmpc_aligned_shared_vars_t *shared_var_array;
489  char *shared_var_storage;
490  char *pt; /* for doing byte-adjusted address computations */
491 
492  __kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
493 
494  KMP_MB(); /* make sure data structures are in consistent state before querying them */
495  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
496 
497  if( tq->tq_freelist ) {
498  new_queue = tq -> tq_freelist;
499  tq -> tq_freelist = tq -> tq_freelist -> tq.tq_next_free;
500 
501  KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED);
502 
503  new_queue->tq_flags = 0;
504 
505  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
506  }
507  else {
508  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
509 
510  new_queue = (kmpc_task_queue_t *) __kmp_taskq_allocate (sizeof (kmpc_task_queue_t), global_tid);
511  new_queue->tq_flags = 0;
512  }
513 
514  /* space in the task queue for queue slots (allocate as one big chunk */
515  /* of storage including new_taskq_task space) */
516 
517  sizeof_thunk += (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */
518  pt = (char *) __kmp_taskq_allocate (nthunks * sizeof_thunk, global_tid);
519  new_queue->tq_thunk_space = (kmpc_thunk_t *)pt;
520  *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk);
521 
522  /* chain the allocated thunks into a freelist for this queue */
523 
524  new_queue->tq_free_thunks = (kmpc_thunk_t *)pt;
525 
526  for (i = 0; i < (nthunks - 2); i++) {
527  ((kmpc_thunk_t *)(pt+i*sizeof_thunk))->th.th_next_free = (kmpc_thunk_t *)(pt + (i+1)*sizeof_thunk);
528 #ifdef KMP_DEBUG
529  ((kmpc_thunk_t *)(pt+i*sizeof_thunk))->th_flags = TQF_DEALLOCATED;
530 #endif
531  }
532 
533  ((kmpc_thunk_t *)(pt+(nthunks-2)*sizeof_thunk))->th.th_next_free = NULL;
534 #ifdef KMP_DEBUG
535  ((kmpc_thunk_t *)(pt+(nthunks-2)*sizeof_thunk))->th_flags = TQF_DEALLOCATED;
536 #endif
537 
538  /* initialize the locks */
539 
540  if (in_parallel) {
541  __kmp_init_lock( & new_queue->tq_link_lck );
542  __kmp_init_lock( & new_queue->tq_free_thunks_lck );
543  __kmp_init_lock( & new_queue->tq_queue_lck );
544  }
545 
546  /* now allocate the slots */
547 
548  bytes = nslots * sizeof (kmpc_aligned_queue_slot_t);
549  new_queue->tq_queue = (kmpc_aligned_queue_slot_t *) __kmp_taskq_allocate( bytes, global_tid );
550 
551  /* space for array of pointers to shared variable structures */
552  sizeof_shareds += sizeof(kmpc_task_queue_t *);
553  sizeof_shareds += (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */
554 
555  bytes = nshareds * sizeof (kmpc_aligned_shared_vars_t);
556  shared_var_array = (kmpc_aligned_shared_vars_t *) __kmp_taskq_allocate ( bytes, global_tid);
557 
558  bytes = nshareds * sizeof_shareds;
559  shared_var_storage = (char *) __kmp_taskq_allocate ( bytes, global_tid);
560 
561  for (i=0; i<nshareds; i++) {
562  shared_var_array[i].ai_data = (kmpc_shared_vars_t *) (shared_var_storage + i*sizeof_shareds);
563  shared_var_array[i].ai_data->sv_queue = new_queue;
564  }
565  new_queue->tq_shareds = shared_var_array;
566 
567 
568  /* array for number of outstanding thunks per thread */
569 
570  if (in_parallel) {
571  bytes = nproc * sizeof(kmpc_aligned_int32_t);
572  new_queue->tq_th_thunks = (kmpc_aligned_int32_t *) __kmp_taskq_allocate ( bytes, global_tid);
573  new_queue->tq_nproc = nproc;
574 
575  for (i=0; i<nproc; i++)
576  new_queue->tq_th_thunks[i].ai_data = 0;
577  }
578 
579  return new_queue;
580 }
581 
582 static void
583 __kmp_free_taskq (kmp_taskq_t *tq, kmpc_task_queue_t *p, int in_parallel, kmp_int32 global_tid)
584 {
585  __kmpc_taskq_free(p->tq_thunk_space, global_tid);
586  __kmpc_taskq_free(p->tq_queue, global_tid);
587 
588  /* free shared var structure storage */
589  __kmpc_taskq_free((void *) p->tq_shareds[0].ai_data, global_tid);
590 
591  /* free array of pointers to shared vars storage */
592  __kmpc_taskq_free(p->tq_shareds, global_tid);
593 
594 #ifdef KMP_DEBUG
595  p->tq_first_child = NULL;
596  p->tq_next_child = NULL;
597  p->tq_prev_child = NULL;
598  p->tq_ref_count = -10;
599  p->tq_shareds = NULL;
600  p->tq_tasknum_queuing = 0;
601  p->tq_tasknum_serving = 0;
602  p->tq_queue = NULL;
603  p->tq_thunk_space = NULL;
604  p->tq_taskq_slot = NULL;
605  p->tq_free_thunks = NULL;
606  p->tq_nslots = 0;
607  p->tq_head = 0;
608  p->tq_tail = 0;
609  p->tq_nfull = 0;
610  p->tq_hiwat = 0;
611 
612  if (in_parallel) {
613  int i;
614 
615  for (i=0; i<p->tq_nproc; i++)
616  p->tq_th_thunks[i].ai_data = 0;
617  }
618  if ( __kmp_env_consistency_check )
619  p->tq_loc = NULL;
620  KMP_DEBUG_ASSERT( p->tq_flags & TQF_DEALLOCATED );
621  p->tq_flags = TQF_DEALLOCATED;
622 #endif /* KMP_DEBUG */
623 
624  if (in_parallel) {
625  __kmpc_taskq_free(p->tq_th_thunks, global_tid);
626  __kmp_destroy_lock(& p->tq_link_lck);
627  __kmp_destroy_lock(& p->tq_queue_lck);
628  __kmp_destroy_lock(& p->tq_free_thunks_lck);
629  }
630 #ifdef KMP_DEBUG
631  p->tq_th_thunks = NULL;
632 #endif /* KMP_DEBUG */
633 
634  KMP_MB(); /* make sure data structures are in consistent state before querying them */
635  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
636 
637  __kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
638  p->tq.tq_next_free = tq->tq_freelist;
639 
640  tq->tq_freelist = p;
641  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
642 }
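/*
 * Illustrative sketch (kept under #if 0, not compiled): the recycle-freelist
 * pattern that __kmp_alloc_taskq / __kmp_free_taskq apply to the
 * constant-size kmpc_task_queue_t above, reduced to a generic node type.
 * node_t, node_alloc and node_release are hypothetical names for this
 * sketch only, and a plain pthread mutex stands in for the kmp lock.
 */
#if 0
#include <pthread.h>
#include <stdlib.h>

typedef struct node {
    struct node *next_free;             /* links the node on the freelist when idle */
    /* ... payload of constant size ... */
} node_t;

static node_t          *freelist      = NULL;
static pthread_mutex_t  freelist_lock = PTHREAD_MUTEX_INITIALIZER;

static node_t *
node_alloc( void )
{
    node_t *n;

    pthread_mutex_lock( &freelist_lock );
    n = freelist;
    if (n != NULL)
        freelist = n->next_free;        /* recycle a previously freed node */
    pthread_mutex_unlock( &freelist_lock );

    if (n == NULL)                      /* freelist empty: allocate fresh */
        n = (node_t *) malloc( sizeof(node_t) );
    return n;
}

static void
node_release( node_t *n )
{
    /* nodes are never returned to malloc; they are parked for reuse */
    pthread_mutex_lock( &freelist_lock );
    n->next_free = freelist;
    freelist     = n;
    pthread_mutex_unlock( &freelist_lock );
}
#endif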
643 
644 /*
645  * Once a group of thunks has been allocated for use in a particular queue,
646  * these are managed via a per-queue freelist.
647  * We force a check that there's always a thunk free if we need one.
648  */
649 
650 static kmpc_thunk_t *
651 __kmp_alloc_thunk (kmpc_task_queue_t *queue, int in_parallel, kmp_int32 global_tid)
652 {
653  kmpc_thunk_t *fl;
654 
655  if (in_parallel) {
656  __kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
657 
658  KMP_MB(); /* make sure data structures are in consistent state before querying them */
659  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
660  }
661 
662  fl = queue->tq_free_thunks;
663 
664  KMP_DEBUG_ASSERT (fl != NULL);
665 
666  queue->tq_free_thunks = fl->th.th_next_free;
667  fl->th_flags = 0;
668 
669  if (in_parallel)
670  __kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
671 
672  return fl;
673 }
674 
675 static void
676 __kmp_free_thunk (kmpc_task_queue_t *queue, kmpc_thunk_t *p, int in_parallel, kmp_int32 global_tid)
677 {
678 #ifdef KMP_DEBUG
679  p->th_task = 0;
680  p->th_encl_thunk = 0;
681  p->th_status = 0;
682  p->th_tasknum = 0;
683  /* Also could zero pointers to private vars */
684 #endif
685 
686  if (in_parallel) {
687  __kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
688 
689  KMP_MB(); /* make sure data structures are in consistent state before querying them */
690  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
691  }
692 
693  p->th.th_next_free = queue->tq_free_thunks;
694  queue->tq_free_thunks = p;
695 
696 #ifdef KMP_DEBUG
697  p->th_flags = TQF_DEALLOCATED;
698 #endif
699 
700  if (in_parallel)
701  __kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
702 }
703 
704 /* --------------------------------------------------------------------------- */
705 
706 /* returns nonzero if the queue just became full after the enqueue */
707 
708 static kmp_int32
709 __kmp_enqueue_task ( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue, kmpc_thunk_t *thunk, int in_parallel )
710 {
711  kmp_int32 ret;
712 
713  /* dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the master is executing then) */
714  if (in_parallel) {
715  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
716 
717  KMP_MB(); /* make sure data structures are in consistent state before querying them */
718  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
719  }
720 
721  KMP_DEBUG_ASSERT (queue->tq_nfull < queue->tq_nslots); /* check queue not full */
722 
723  queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk;
724 
725  if (queue->tq_head >= queue->tq_nslots)
726  queue->tq_head = 0;
727 
728  (queue->tq_nfull)++;
729 
730  KMP_MB(); /* to assure that nfull is seen to increase before TQF_ALL_TASKS_QUEUED is set */
731 
732  ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE;
733 
734  if (in_parallel) {
735  /* don't need to wait until workers are released before unlocking */
736  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
737 
738  if( tq->tq_global_flags & TQF_RELEASE_WORKERS ) {
739  /* If just creating the root queue, the worker threads are waiting at */
740  /* a join barrier until now, when there's something in the queue for */
741  /* them to do; release them now to do work. */
742  /* This should only be done when this is the first task enqueued, */
743  /* so reset the flag here also. */
744 
745  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers are still in spin mode */
746 
747  KMP_MB(); /* avoid releasing barrier twice if taskq_task switches threads */
748 
749  __kmpc_end_barrier_master( NULL, global_tid);
750  }
751  }
752 
753  return ret;
754 }
755 
756 static kmpc_thunk_t *
757 __kmp_dequeue_task (kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_parallel)
758 {
759  kmpc_thunk_t *pt;
760  int tid = __kmp_tid_from_gtid( global_tid );
761 
762  KMP_DEBUG_ASSERT (queue->tq_nfull > 0); /* check queue not empty */
763 
764  if (queue->tq.tq_parent != NULL && in_parallel) {
765  int ct;
766  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
767  ct = ++(queue->tq_ref_count);
768  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
769  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
770  __LINE__, global_tid, queue, ct));
771  }
772 
773  pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk;
774 
775  if (queue->tq_tail >= queue->tq_nslots)
776  queue->tq_tail = 0;
777 
778  if (in_parallel) {
779  queue->tq_th_thunks[tid].ai_data++;
780 
781  KMP_MB(); /* necessary so ai_data increment is propagated to other threads immediately (digital) */
782 
783  KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding thunks from queue %p\n",
784  global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue));
785  }
786 
787  (queue->tq_nfull)--;
788 
789 #ifdef KMP_DEBUG
790  KMP_MB();
791 
792  /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is decremented */
793 
794  KMP_DEBUG_ASSERT(queue->tq_nfull >= 0);
795 
796  if (in_parallel) {
797  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <= __KMP_TASKQ_THUNKS_PER_TH);
798  }
799 #endif
800 
801  return pt;
802 }
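/*
 * Illustrative sketch (kept under #if 0, not compiled): the head/tail/nfull
 * bookkeeping used by __kmp_enqueue_task / __kmp_dequeue_task above, shown
 * on a plain single-threaded ring buffer so the index arithmetic is easy to
 * follow.  ring_t, ring_put and ring_get are hypothetical names for this
 * sketch.
 */
#if 0
typedef struct ring {
    void **slots;       /* nslots entries */
    int    nslots;
    int    head;        /* next slot to write, like tq_head */
    int    tail;        /* next slot to read,  like tq_tail */
    int    nfull;       /* number of occupied slots, like tq_nfull */
} ring_t;

static int
ring_put( ring_t *r, void *item )
{
    if (r->nfull == r->nslots)
        return 0;                       /* full: caller must not enqueue */
    r->slots[r->head++] = item;
    if (r->head >= r->nslots)           /* wrap, as tq_head does */
        r->head = 0;
    r->nfull++;
    return 1;
}

static void *
ring_get( ring_t *r )
{
    void *item;
    if (r->nfull == 0)
        return NULL;                    /* empty: nothing to dequeue */
    item = r->slots[r->tail++];
    if (r->tail >= r->nslots)           /* wrap, as tq_tail does */
        r->tail = 0;
    r->nfull--;
    return item;
}
#endif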
803 
804 /*
805  * Find the next (non-null) task to dequeue and return it.
806  * This is never called unless in_parallel=TRUE
807  *
808  * Here are the rules for deciding which queue to take the task from:
809  * 1. Walk up the task queue tree from the current queue's parent and look
810  * on the way up (for loop, below).
811  * 2. Do a depth-first search back down the tree from the root and
812  * look (__kmp_find_task_in_descendant_queue()).
813  *
814  * Here are the rules for deciding which task to take from a queue
815  * (__kmp_find_task_in_queue ()):
816  * 1. Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task
817  * must be staged to make sure we execute the last one with
818  * TQF_IS_LAST_TASK at the end of task queue execution.
819  * 2. If the queue length is below some high water mark and the taskq task
820  * is enqueued, prefer running the taskq task.
821  * 3. Otherwise, take a (normal) task from the queue.
822  *
823  * If we do all this and return pt == NULL at the bottom of this routine,
824  * this means there are no more tasks to execute (except possibly for
825  * TQF_IS_LASTPRIVATE).
826  */
827 
828 static kmpc_thunk_t *
829 __kmp_find_task_in_queue (kmp_int32 global_tid, kmpc_task_queue_t *queue)
830 {
831  kmpc_thunk_t *pt = NULL;
832  int tid = __kmp_tid_from_gtid( global_tid );
833 
834  /* To prevent deadlock from tq_queue_lck if queue already deallocated */
835  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
836 
837  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
838 
839  /* Check again to avoid race in __kmpc_end_taskq() */
840  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
841 
842  KMP_MB(); /* make sure data structures are in consistent state before querying them */
843  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
844 
845  if ((queue->tq_taskq_slot != NULL) && (queue->tq_nfull <= queue->tq_hiwat)) {
846  /* if there's enough room in the queue and the dispatcher */
847  /* (taskq task) is available, schedule more tasks */
848  pt = (kmpc_thunk_t *) queue->tq_taskq_slot;
849  queue->tq_taskq_slot = NULL;
850  }
851  else if (queue->tq_nfull == 0 ||
852  queue->tq_th_thunks[tid].ai_data >= __KMP_TASKQ_THUNKS_PER_TH) {
853  /* do nothing if no thunks available or this thread can't */
854  /* run any because it already is executing too many */
855 
856  pt = NULL;
857  }
858  else if (queue->tq_nfull > 1) {
859  /* always safe to schedule a task even if TQF_IS_LASTPRIVATE */
860 
861  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
862  }
863  else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) {
864  /* one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE */
865 
866  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
867  }
868  else if (queue->tq_flags & TQF_IS_LAST_TASK) {
869  /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task() */
870  /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
871  /* instrumentation does copy-out. */
872 
873  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
874  pt->th_flags |= TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */
875  }
876  }
877 
878  /* GEH - What happens here if it is lastprivate, but not the last task? */
879  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
880  }
881 
882  return pt;
883 }
884 
885 /*
886  * Walk a tree of queues starting at queue's first child
887  * and return a non-NULL thunk if one can be scheduled.
888  * Must only be called when in_parallel=TRUE
889  */
890 
891 static kmpc_thunk_t *
892 __kmp_find_task_in_descendant_queue (kmp_int32 global_tid, kmpc_task_queue_t *curr_queue)
893 {
894  kmpc_thunk_t *pt = NULL;
895  kmpc_task_queue_t *queue = curr_queue;
896 
897  if (curr_queue->tq_first_child != NULL) {
898  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
899 
900  KMP_MB(); /* make sure data structures are in consistent state before querying them */
901  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
902 
903  queue = (kmpc_task_queue_t *) curr_queue->tq_first_child;
904  if (queue == NULL) {
905  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
906  return NULL;
907  }
908 
909  while (queue != NULL) {
910  int ct;
911  kmpc_task_queue_t *next;
912 
913  ct= ++(queue->tq_ref_count);
914  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
915  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
916  __LINE__, global_tid, queue, ct));
917 
918  pt = __kmp_find_task_in_queue (global_tid, queue);
919 
920  if (pt != NULL) {
921  int ct;
922 
923  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
924 
925  KMP_MB(); /* make sure data structures are in consistent state before querying them */
926  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
927 
928  ct = --(queue->tq_ref_count);
929  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
930  __LINE__, global_tid, queue, ct));
931  KMP_DEBUG_ASSERT( queue->tq_ref_count >= 0 );
932 
933  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
934 
935  return pt;
936  }
937 
938  /* although reference count stays active during descendant walk, shouldn't matter */
939  /* since if children still exist, reference counts aren't being monitored anyway */
940 
941  pt = __kmp_find_task_in_descendant_queue (global_tid, queue);
942 
943  if (pt != NULL) {
944  int ct;
945 
946  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
947 
948  KMP_MB(); /* make sure data structures are in consistent state before querying them */
949  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
950 
951  ct = --(queue->tq_ref_count);
952  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
953  __LINE__, global_tid, queue, ct));
954  KMP_DEBUG_ASSERT( ct >= 0 );
955 
956  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
957 
958  return pt;
959  }
960 
961  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
962 
963  KMP_MB(); /* make sure data structures are in consistent state before querying them */
964  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
965 
966  next = queue->tq_next_child;
967 
968  ct = --(queue->tq_ref_count);
969  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
970  __LINE__, global_tid, queue, ct));
971  KMP_DEBUG_ASSERT( ct >= 0 );
972 
973  queue = next;
974  }
975 
976  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
977  }
978 
979  return pt;
980 }
981 
982 /*
983  * Walk up the taskq tree looking for a task to execute.
984  * If we get to the root, search the tree for a descendent queue task.
985  * Must only be called when in_parallel=TRUE
986  */
987 
988 static kmpc_thunk_t *
989 __kmp_find_task_in_ancestor_queue (kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue)
990 {
991  kmpc_task_queue_t *queue;
992  kmpc_thunk_t *pt;
993 
994  pt = NULL;
995 
996  if (curr_queue->tq.tq_parent != NULL) {
997  queue = curr_queue->tq.tq_parent;
998 
999  while (queue != NULL) {
1000  if (queue->tq.tq_parent != NULL) {
1001  int ct;
1002  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1003 
1004  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1005  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1006 
1007  ct = ++(queue->tq_ref_count);
1008  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1009  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
1010  __LINE__, global_tid, queue, ct));
1011  }
1012 
1013  pt = __kmp_find_task_in_queue (global_tid, queue);
1014  if (pt != NULL) {
1015  if (queue->tq.tq_parent != NULL) {
1016  int ct;
1017  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1018 
1019  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1020  /* Seems to work without this call for digital/alpha, needed for IBM/RS6000 */
1021 
1022  ct = --(queue->tq_ref_count);
1023  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1024  __LINE__, global_tid, queue, ct));
1025  KMP_DEBUG_ASSERT( ct >= 0 );
1026 
1027  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1028  }
1029 
1030  return pt;
1031  }
1032 
1033  if (queue->tq.tq_parent != NULL) {
1034  int ct;
1035  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1036 
1037  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1038  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1039 
1040  ct = --(queue->tq_ref_count);
1041  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1042  __LINE__, global_tid, queue, ct));
1043  KMP_DEBUG_ASSERT( ct >= 0 );
1044  }
1045  queue = queue->tq.tq_parent;
1046 
1047  if (queue != NULL)
1048  __kmp_release_lock(& queue->tq_link_lck, global_tid);
1049  }
1050 
1051  }
1052 
1053  pt = __kmp_find_task_in_descendant_queue( global_tid, tq->tq_root );
1054 
1055  return pt;
1056 }
1057 
1058 static int
1059 __kmp_taskq_tasks_finished (kmpc_task_queue_t *queue)
1060 {
1061  int i;
1062 
1063  /* KMP_MB(); */ /* is this really necessary? */
1064 
1065  for (i=0; i<queue->tq_nproc; i++) {
1066  if (queue->tq_th_thunks[i].ai_data != 0)
1067  return FALSE;
1068  }
1069 
1070  return TRUE;
1071 }
1072 
1073 static int
1074 __kmp_taskq_has_any_children (kmpc_task_queue_t *queue)
1075 {
1076  return (queue->tq_first_child != NULL);
1077 }
1078 
1079 static void
1080 __kmp_remove_queue_from_tree( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_parallel )
1081 {
1082 #ifdef KMP_DEBUG
1083  kmp_int32 i;
1084  kmpc_thunk_t *thunk;
1085 #endif
1086 
1087  KF_TRACE(50, ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1088  KF_DUMP(50, __kmp_dump_task_queue( tq, queue, global_tid ));
1089 
1090  /* sub-queue in a recursion, not the root task queue */
1091  KMP_DEBUG_ASSERT (queue->tq.tq_parent != NULL);
1092 
1093  if (in_parallel) {
1094  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1095 
1096  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1097  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1098  }
1099 
1100  KMP_DEBUG_ASSERT (queue->tq_first_child == NULL);
1101 
1102  /* unlink queue from its siblings if any at this level */
1103  if (queue->tq_prev_child != NULL)
1104  queue->tq_prev_child->tq_next_child = queue->tq_next_child;
1105  if (queue->tq_next_child != NULL)
1106  queue->tq_next_child->tq_prev_child = queue->tq_prev_child;
1107  if (queue->tq.tq_parent->tq_first_child == queue)
1108  queue->tq.tq_parent->tq_first_child = queue->tq_next_child;
1109 
1110  queue->tq_prev_child = NULL;
1111  queue->tq_next_child = NULL;
1112 
1113  if (in_parallel) {
1114  kmp_uint32 spins;
1115 
1116  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n",
1117  __LINE__, global_tid, queue, queue->tq_ref_count));
1118 
1119  /* wait until all other threads have stopped accessing this queue */
1120  while (queue->tq_ref_count > 1) {
1121  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1122 
1123  KMP_WAIT_YIELD((volatile kmp_uint32*)&queue->tq_ref_count, 1, KMP_LE, NULL);
1124 
1125  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1126 
1127  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1128  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1129  }
1130 
1131  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1132  }
1133 
1134  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p freeing queue\n",
1135  __LINE__, global_tid, queue));
1136 
1137 #ifdef KMP_DEBUG
1138  KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED);
1139  KMP_DEBUG_ASSERT(queue->tq_nfull == 0);
1140 
1141  for (i=0; i<queue->tq_nproc; i++) {
1142  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1143  }
1144 
1145  i = 0;
1146  for (thunk=queue->tq_free_thunks; thunk != NULL; thunk=thunk->th.th_next_free)
1147  ++i;
1148 
1149  KMP_ASSERT (i == queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH));
1150 #endif
1151 
1152  /* release storage for queue entry */
1153  __kmp_free_taskq ( tq, queue, TRUE, global_tid );
1154 
1155  KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1156  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1157 }
1158 
1159 /*
1160  * Starting from indicated queue, proceed downward through tree and
1161  * remove all taskqs which are finished, but only go down to taskqs
1162  * which have the "nowait" clause present. Assume this is only called
1163  * when in_parallel=TRUE.
1164  */
1165 
1166 static void
1167 __kmp_find_and_remove_finished_child_taskq( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue )
1168 {
1169  kmpc_task_queue_t *queue = curr_queue;
1170 
1171  if (curr_queue->tq_first_child != NULL) {
1172  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1173 
1174  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1175  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1176 
1177  queue = (kmpc_task_queue_t *) curr_queue->tq_first_child;
1178  if (queue == NULL) {
1179  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1180  return;
1181  }
1182 
1183  while (queue != NULL) {
1184  kmpc_task_queue_t *next;
1185  int ct = ++(queue->tq_ref_count);
1186  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
1187  __LINE__, global_tid, queue, ct));
1188 
1189 
1190  /* although reference count stays active during descendant walk, */
1191  /* shouldn't matter since if children still exist, reference */
1192  /* counts aren't being monitored anyway */
1193 
1194  if (queue->tq_flags & TQF_IS_NOWAIT) {
1195  __kmp_find_and_remove_finished_child_taskq ( tq, global_tid, queue );
1196 
1197  if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) && (queue->tq_nfull == 0) &&
1198  __kmp_taskq_tasks_finished(queue) && ! __kmp_taskq_has_any_children(queue)) {
1199 
1200  /*
1201  Only remove this if we have not already marked it for deallocation.
1202  This should prevent multiple threads from trying to free this.
1203  */
1204 
1205  if ( __kmp_test_lock(& queue->tq_queue_lck, global_tid) ) {
1206  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
1207  queue->tq_flags |= TQF_DEALLOCATED;
1208  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1209 
1210  __kmp_remove_queue_from_tree( tq, global_tid, queue, TRUE );
1211 
1212  /* Can't do any more here since we can't be sure where the sibling queue is, so just exit this level */
1213  return;
1214  }
1215  else {
1216  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1217  }
1218  }
1219  /* otherwise, just fall through and decrement reference count */
1220  }
1221  }
1222 
1223  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1224 
1225  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1226  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1227 
1228  next = queue->tq_next_child;
1229 
1230  ct = --(queue->tq_ref_count);
1231  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1232  __LINE__, global_tid, queue, ct));
1233  KMP_DEBUG_ASSERT( ct >= 0 );
1234 
1235  queue = next;
1236  }
1237 
1238  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1239  }
1240 }
1241 
1242 /*
1243  * Starting from indicated queue, proceed downward through tree and
1244  * remove all taskq's assuming all are finished and
1245  * assuming NO other threads are executing at this point.
1246  */
1247 
1248 static void
1249 __kmp_remove_all_child_taskq( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue )
1250 {
1251  kmpc_task_queue_t *next_child;
1252 
1253  queue = (kmpc_task_queue_t *) queue->tq_first_child;
1254 
1255  while (queue != NULL) {
1256  __kmp_remove_all_child_taskq ( tq, global_tid, queue );
1257 
1258  next_child = queue->tq_next_child;
1259  queue->tq_flags |= TQF_DEALLOCATED;
1260  __kmp_remove_queue_from_tree ( tq, global_tid, queue, FALSE );
1261  queue = next_child;
1262  }
1263 }
1264 
1265 static void
1266 __kmp_execute_task_from_queue( kmp_taskq_t *tq, ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, int in_parallel )
1267 {
1268  kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue;
1269  kmp_int32 tid = __kmp_tid_from_gtid( global_tid );
1270 
1271  KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid));
1272  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1273  KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid));
1274  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1275 
1276  /*
1277  * For the taskq task, the curr_thunk pushes and pop pairs are set up as follows:
1278  *
1279  * happens exactly once:
1280  * 1) __kmpc_taskq : push (if returning thunk only)
1281  * 4) __kmpc_end_taskq_task : pop
1282  *
1283  * optionally happens *each* time taskq task is dequeued/enqueued:
1284  * 2) __kmpc_taskq_task : pop
1285  * 3) __kmp_execute_task_from_queue : push
1286  *
1287  * execution ordering: 1,(2,3)*,4
1288  */
1289 
1290  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1291  kmp_int32 index = (queue == tq->tq_root) ? tid : 0;
1292  thunk->th.th_shareds = (kmpc_shared_vars_t *) queue->tq_shareds[index].ai_data;
1293 
1294  if ( __kmp_env_consistency_check ) {
1295  __kmp_push_workshare( global_tid,
1296  (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered : ct_task,
1297  queue->tq_loc );
1298  }
1299  }
1300  else {
1301  if ( __kmp_env_consistency_check )
1302  __kmp_push_workshare( global_tid, ct_taskq, queue->tq_loc );
1303  }
1304 
1305  if (in_parallel) {
1306  thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1307  tq->tq_curr_thunk[tid] = thunk;
1308 
1309  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1310  }
1311 
1312  KF_TRACE( 50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk, queue, global_tid));
1313  thunk->th_task (global_tid, thunk);
1314  KF_TRACE( 50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue, global_tid));
1315 
1316  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1317  if ( __kmp_env_consistency_check )
1318  __kmp_pop_workshare( global_tid, (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered : ct_task,
1319  queue->tq_loc );
1320 
1321  if (in_parallel) {
1322  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1323  thunk->th_encl_thunk = NULL;
1324  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1325  }
1326 
1327  if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) {
1328  __kmp_taskq_check_ordered(global_tid, thunk);
1329  }
1330 
1331  __kmp_free_thunk (queue, thunk, in_parallel, global_tid);
1332 
1333  KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n", global_tid, thunk));
1334  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1335 
1336  if (in_parallel) {
1337  KMP_MB(); /* needed so thunk put on free list before outstanding thunk count is decremented */
1338 
1339  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1);
1340 
1341  KF_TRACE( 200, ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n",
1342  global_tid, queue->tq_th_thunks[tid].ai_data-1, queue));
1343 
1344  queue->tq_th_thunks[tid].ai_data--;
1345 
1346  /* KMP_MB(); */ /* is MB really necessary ? */
1347  }
1348 
1349  if (queue->tq.tq_parent != NULL && in_parallel) {
1350  int ct;
1351  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1352  ct = --(queue->tq_ref_count);
1353  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1354  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1355  __LINE__, global_tid, queue, ct));
1356  KMP_DEBUG_ASSERT( ct >= 0 );
1357  }
1358  }
1359 }
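/*
 * Illustrative sketch (kept under #if 0, not compiled): the per-thread
 * "current thunk" stack that __kmp_execute_task_from_queue manipulates above
 * via th_encl_thunk and tq_curr_thunk[tid].  Each running frame records the
 * frame it interrupted, so a push on dispatch and a pop on completion restore
 * the enclosing context.  frame_t, MAX_THREADS and the helpers are
 * hypothetical names for this sketch.
 */
#if 0
#define MAX_THREADS 256                 /* assumed bound for the sketch */

typedef struct frame {
    struct frame *enclosing;            /* plays the role of th_encl_thunk */
    /* ... task state ... */
} frame_t;

static frame_t *current[MAX_THREADS];   /* plays the role of tq_curr_thunk[] */

static void
push_frame( int tid, frame_t *f )
{
    f->enclosing  = current[tid];       /* remember what we interrupted */
    current[tid]  = f;
}

static void
pop_frame( int tid )
{
    frame_t *f   = current[tid];
    current[tid] = f->enclosing;        /* resume the enclosing frame */
    f->enclosing = NULL;
}
#endif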
1360 
1361 /* --------------------------------------------------------------------------- */
1362 
1363 /* starts a taskq; creates and returns a thunk for the taskq_task */
1364 /* also, returns pointer to shared vars for this thread in "shareds" arg */
1365 
1366 kmpc_thunk_t *
1367 __kmpc_taskq( ident_t *loc, kmp_int32 global_tid, kmpc_task_t taskq_task,
1368  size_t sizeof_thunk, size_t sizeof_shareds,
1369  kmp_int32 flags, kmpc_shared_vars_t **shareds )
1370 {
1371  int in_parallel;
1372  kmp_int32 nslots, nthunks, nshareds, nproc;
1373  kmpc_task_queue_t *new_queue, *curr_queue;
1374  kmpc_thunk_t *new_taskq_thunk;
1375  kmp_info_t *th;
1376  kmp_team_t *team;
1377  kmp_taskq_t *tq;
1378  kmp_int32 tid;
1379 
1380  KE_TRACE( 10, ("__kmpc_taskq called (%d)\n", global_tid));
1381 
1382  th = __kmp_threads[ global_tid ];
1383  team = th -> th.th_team;
1384  tq = & team -> t.t_taskq;
1385  nproc = team -> t.t_nproc;
1386  tid = __kmp_tid_from_gtid( global_tid );
1387 
1388  /* find out whether this is a parallel taskq or serialized one. */
1389  in_parallel = in_parallel_context( team );
1390 
1391  if( ! tq->tq_root ) {
1392  if (in_parallel) {
1393  /* Vector ORDERED SECTION to taskq version */
1394  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1395 
1396  /* Vector ORDERED SECTION to taskq version */
1397  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1398  }
1399 
1400  if (in_parallel) {
1401  /* This shouldn't be a barrier region boundary; it will confuse the user. */
1402  /* The boundary needs to be at the end of the taskq instead. */
1403  if ( __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL )) {
1404  /* Creating the active root queue, and we are not the master thread. */
1405  /* The master thread below created the queue and tasks have been */
1406  /* enqueued, and the master thread released this barrier. This */
1407  /* worker thread can now proceed and execute tasks. See also the */
1408  /* TQF_RELEASE_WORKERS which is used to handle this case. */
1409 
1410  *shareds = (kmpc_shared_vars_t *) tq->tq_root->tq_shareds[tid].ai_data;
1411 
1412  KE_TRACE( 10, ("__kmpc_taskq return (%d)\n", global_tid));
1413 
1414  return NULL;
1415  }
1416  }
1417 
1418  /* master thread only executes this code */
1419 
1420  if( tq->tq_curr_thunk_capacity < nproc ) {
1421  int i;
1422 
1423  if(tq->tq_curr_thunk)
1424  __kmp_free(tq->tq_curr_thunk);
1425  else {
1426  /* only need to do this once at outer level, i.e. when tq_curr_thunk is still NULL */
1427  __kmp_init_lock( & tq->tq_freelist_lck );
1428  }
1429 
1430  tq->tq_curr_thunk = (kmpc_thunk_t **) __kmp_allocate( nproc * sizeof(kmpc_thunk_t *) );
1431  tq -> tq_curr_thunk_capacity = nproc;
1432  }
1433 
1434  if (in_parallel)
1435  tq->tq_global_flags = TQF_RELEASE_WORKERS;
1436  }
1437 
1438  /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */
1439  /* on some heuristics (e.g., depth of queue nesting?). */
1440 
1441  nslots = (in_parallel) ? (2 * nproc) : 1;
1442 
1443  /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */
1444  /* jobs being executed by other threads, and one extra for the taskq slot */
1445 
1446  nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1) : nslots + 2;
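 /* Worked sizing example (illustrative): with nproc == 4 in a parallel
    region, nslots == 2 * 4 == 8; if __KMP_TASKQ_THUNKS_PER_TH were 1,
    nthunks == 8 + 4 * 1 + 1 == 13 (8 queue slots, 4 in-flight thunks,
    1 for the taskq task itself).  In the serialized case nslots == 1
    and nthunks == 3. */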
1447 
1448  /* Only the root taskq gets a per-thread array of shareds. */
1449  /* The rest of the taskq's only get one copy of the shared vars. */
1450 
1451  nshareds = ( !tq->tq_root && in_parallel) ? nproc : 1;
1452 
1453  /* create overall queue data structure and its components that require allocation */
1454 
1455  new_queue = __kmp_alloc_taskq ( tq, in_parallel, nslots, nthunks, nshareds, nproc,
1456  sizeof_thunk, sizeof_shareds, &new_taskq_thunk, global_tid );
1457 
1458  /* rest of new_queue initializations */
1459 
1460  new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS;
1461 
1462  if (in_parallel) {
1463  new_queue->tq_tasknum_queuing = 0;
1464  new_queue->tq_tasknum_serving = 0;
1465  new_queue->tq_flags |= TQF_PARALLEL_CONTEXT;
1466  }
1467 
1468  new_queue->tq_taskq_slot = NULL;
1469  new_queue->tq_nslots = nslots;
1470  new_queue->tq_hiwat = HIGH_WATER_MARK (nslots);
1471  new_queue->tq_nfull = 0;
1472  new_queue->tq_head = 0;
1473  new_queue->tq_tail = 0;
1474  new_queue->tq_loc = loc;
1475 
1476  if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
1477  /* prepare to serve the first-queued task's ORDERED directive */
1478  new_queue->tq_tasknum_serving = 1;
1479 
1480  /* Vector ORDERED SECTION to taskq version */
1481  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1482 
1483  /* Vector ORDERED SECTION to taskq version */
1484  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1485  }
1486 
1487  /* create a new thunk for the taskq_task in the new_queue */
1488  *shareds = (kmpc_shared_vars_t *) new_queue->tq_shareds[0].ai_data;
1489 
1490  new_taskq_thunk->th.th_shareds = *shareds;
1491  new_taskq_thunk->th_task = taskq_task;
1492  new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK;
1493  new_taskq_thunk->th_status = 0;
1494 
1495  KMP_DEBUG_ASSERT (new_taskq_thunk->th_flags & TQF_TASKQ_TASK);
1496 
1497  /* KMP_MB(); */ /* make sure these inits complete before threads start using this queue (necessary?) */
1498 
1499  /* insert the new task queue into the tree, but only after all fields initialized */
1500 
1501  if (in_parallel) {
1502  if( ! tq->tq_root ) {
1503  new_queue->tq.tq_parent = NULL;
1504  new_queue->tq_first_child = NULL;
1505  new_queue->tq_next_child = NULL;
1506  new_queue->tq_prev_child = NULL;
1507  new_queue->tq_ref_count = 1;
1508  tq->tq_root = new_queue;
1509  }
1510  else {
1511  curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
1512  new_queue->tq.tq_parent = curr_queue;
1513  new_queue->tq_first_child = NULL;
1514  new_queue->tq_prev_child = NULL;
1515  new_queue->tq_ref_count = 1; /* for the thread that built the queue */
1516 
1517  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n",
1518  __LINE__, global_tid, new_queue, new_queue->tq_ref_count));
1519 
1520  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1521 
1522  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1523  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1524 
1525  new_queue->tq_next_child = (struct kmpc_task_queue_t *) curr_queue->tq_first_child;
1526 
1527  if (curr_queue->tq_first_child != NULL)
1528  curr_queue->tq_first_child->tq_prev_child = new_queue;
1529 
1530  curr_queue->tq_first_child = new_queue;
1531 
1532  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1533  }
1534 
1535  /* set up thunk stack only after code that determines curr_queue above */
1536  new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1537  tq->tq_curr_thunk[tid] = new_taskq_thunk;
1538 
1539  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1540  }
1541  else {
1542  new_taskq_thunk->th_encl_thunk = 0;
1543  new_queue->tq.tq_parent = NULL;
1544  new_queue->tq_first_child = NULL;
1545  new_queue->tq_next_child = NULL;
1546  new_queue->tq_prev_child = NULL;
1547  new_queue->tq_ref_count = 1;
1548  }
1549 
1550 #ifdef KMP_DEBUG
1551  KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
1552  KF_DUMP(150, __kmp_dump_thunk( tq, new_taskq_thunk, global_tid ));
1553 
1554  if (in_parallel) {
1555  KF_TRACE(25, ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1556  } else {
1557  KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1558  }
1559 
1560  KF_DUMP(25, __kmp_dump_task_queue( tq, new_queue, global_tid ));
1561 
1562  if (in_parallel) {
1563  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1564  }
1565 #endif /* KMP_DEBUG */
1566 
1567  if ( __kmp_env_consistency_check )
1568  __kmp_push_workshare( global_tid, ct_taskq, new_queue->tq_loc );
1569 
1570  KE_TRACE( 10, ("__kmpc_taskq return (%d)\n", global_tid));
1571 
1572  return new_taskq_thunk;
1573 }
1574 
1575 
1576 /* ends a taskq; last thread out destroys the queue */
1577 
1578 void
1579 __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk)
1580 {
1581 #ifdef KMP_DEBUG
1582  kmp_int32 i;
1583 #endif
1584  kmp_taskq_t *tq;
1585  int in_parallel;
1586  kmp_info_t *th;
1587  kmp_int32 is_outermost;
1588  kmpc_task_queue_t *queue;
1589  kmpc_thunk_t *thunk;
1590  int nproc;
1591 
1592  KE_TRACE( 10, ("__kmpc_end_taskq called (%d)\n", global_tid));
1593 
1594  tq = & __kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1595  nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
1596 
1597  /* For the outermost taskq only, all but one thread will have taskq_thunk == NULL */
1598  queue = (taskq_thunk == NULL) ? tq->tq_root : taskq_thunk->th.th_shareds->sv_queue;
1599 
1600  KE_TRACE( 50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid));
1601  is_outermost = (queue == tq->tq_root);
1602  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1603 
1604  if (in_parallel) {
1605  kmp_uint32 spins;
1606 
1607  /* this is just a safeguard to release the waiting threads if */
1608  /* the outermost taskq never queues a task */
1609 
1610  if (is_outermost && (KMP_MASTER_GTID( global_tid ))) {
1611  if( tq->tq_global_flags & TQF_RELEASE_WORKERS ) {
1612  /* no lock needed, workers are still in spin mode */
1613  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;
1614 
1615  __kmp_end_split_barrier( bs_plain_barrier, global_tid );
1616  }
1617  }
1618 
1619  /* keep dequeueing work until all tasks are queued and dequeued */
1620 
1621  do {
1622  /* wait until something is available to dequeue */
1623  KMP_INIT_YIELD(spins);
1624 
1625  while ( (queue->tq_nfull == 0)
1626  && (queue->tq_taskq_slot == NULL)
1627  && (! __kmp_taskq_has_any_children(queue) )
1628  && (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED) )
1629  ) {
1630  __kmp_static_delay( 1 );
1631  KMP_YIELD_WHEN( TRUE, spins );
1632  }
1633 
1634  /* check to see if we can execute tasks in the queue */
1635  while ( ( (queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL) )
1636  && (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL
1637  ) {
1638  KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk, queue, global_tid));
1639  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1640  }
1641 
1642  /* see if work can be found in a descendant queue */
1643  if ( (__kmp_taskq_has_any_children(queue))
1644  && (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != NULL
1645  ) {
1646 
1647  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1648  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid ));
1649 
1650  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1651  }
1652 
1653  } while ( (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED))
1654  || (queue->tq_nfull != 0)
1655  );
1656 
1657  KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue, global_tid));
1658 
1659  /* while not all tasks are finished, keep executing any work
1660  found in descendant queues */
1661 
1662  while ( (!__kmp_taskq_tasks_finished(queue))
1663  && (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != NULL
1664  ) {
1665 
1666  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1667  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1668 
1669  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1670  }
1671 
1672  KF_TRACE(50, ("No work found in descendant queues or all work finished in queue: %p (%d)\n", queue, global_tid));
1673 
1674  if (!is_outermost) {
1675  /* need to return if NOWAIT present and not outermost taskq */
1676 
1677  if (queue->tq_flags & TQF_IS_NOWAIT) {
1678  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1679  queue->tq_ref_count--;
1680  KMP_DEBUG_ASSERT( queue->tq_ref_count >= 0 );
1681  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1682 
1683  KE_TRACE( 10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));
1684 
1685  return;
1686  }
1687 
1688  __kmp_find_and_remove_finished_child_taskq( tq, global_tid, queue );
1689 
1690  /* WAIT until all tasks are finished and no child queues exist before proceeding */
1691  KMP_INIT_YIELD(spins);
1692 
1693  while (!__kmp_taskq_tasks_finished(queue) || __kmp_taskq_has_any_children(queue)) {
1694  thunk = __kmp_find_task_in_ancestor_queue( tq, global_tid, queue );
1695 
1696  if (thunk != NULL) {
1697  KF_TRACE(50, ("Stole thunk: %p in ancestor queue: %p while waiting in queue: %p (%d)\n",
1698  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1699  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1700  }
1701 
1702  KMP_YIELD_WHEN( thunk == NULL, spins );
1703 
1704  __kmp_find_and_remove_finished_child_taskq( tq, global_tid, queue );
1705  }
1706 
1707  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1708  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
1709  queue->tq_flags |= TQF_DEALLOCATED;
1710  }
1711  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1712 
1713  /* only the allocating thread can deallocate the queue */
1714  if (taskq_thunk != NULL) {
1715  __kmp_remove_queue_from_tree( tq, global_tid, queue, TRUE );
1716  }
1717 
1718  KE_TRACE( 10, ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n", global_tid));
1719 
1720  return;
1721  }
1722 
1723  /* Outermost Queue: steal work from descendants until all tasks are finished */
1724 
1725  KMP_INIT_YIELD(spins);
1726 
1727  while (!__kmp_taskq_tasks_finished(queue)) {
1728  thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
1729 
1730  if (thunk != NULL) {
1731  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1732  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1733 
1734  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1735  }
1736 
1737  KMP_YIELD_WHEN( thunk == NULL, spins );
1738  }
1739 
1740  /* Need this barrier to prevent destruction of the queue before all threads have executed the code above */
1741  /* This may need to be done earlier when NOWAIT is implemented for the outermost level */
1742 
1743  if ( !__kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL )) {
1744  /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here; */
1745  /* for right now, everybody waits, and the master thread destroys the */
1746  /* remaining queues. */
1747 
1748  __kmp_remove_all_child_taskq( tq, global_tid, queue );
1749 
1750  /* Now destroy the root queue */
1751  KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n", global_tid, queue ));
1752  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1753 
1754 #ifdef KMP_DEBUG
1755  /* the root queue entry */
1756  KMP_DEBUG_ASSERT ((queue->tq.tq_parent == NULL) && (queue->tq_next_child == NULL));
1757 
1758  /* children must all be gone by now because of barrier above */
1759  KMP_DEBUG_ASSERT (queue->tq_first_child == NULL);
1760 
1761  for (i=0; i<nproc; i++) {
1762  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1763  }
1764 
1765  for (i=0, thunk=queue->tq_free_thunks; thunk != NULL; i++, thunk=thunk->th.th_next_free);
1766 
1767  KMP_DEBUG_ASSERT (i == queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));
1768 
1769  for (i = 0; i < nproc; i++) {
1770  KMP_DEBUG_ASSERT( ! tq->tq_curr_thunk[i] );
1771  }
1772 #endif
1773  /* unlink the root queue entry */
1774  tq -> tq_root = NULL;
1775 
1776  /* release storage for root queue entry */
1777  KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue, global_tid));
1778 
1779  queue->tq_flags |= TQF_DEALLOCATED;
1780  __kmp_free_taskq ( tq, queue, in_parallel, global_tid );
1781 
1782  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1783 
1784  /* release the workers now that the data structures are up to date */
1785  __kmp_end_split_barrier( bs_plain_barrier, global_tid );
1786  }
1787 
1788  th = __kmp_threads[ global_tid ];
1789 
1790  /* Reset ORDERED SECTION to parallel version */
1791  th->th.th_dispatch->th_deo_fcn = 0;
1792 
1793  /* Reset ORDERED SECTION to parallel version */
1794  th->th.th_dispatch->th_dxo_fcn = 0;
1795  }
1796  else {
1797  /* in a serial execution context, dequeue the last task */
1798  /* and execute it, if any tasks were encountered */
1799 
1800  if (queue->tq_nfull > 0) {
1801  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1802 
1803  thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1804 
1805  if (queue->tq_flags & TQF_IS_LAST_TASK) {
1806  /* TQF_IS_LASTPRIVATE is set and only one task remains in the queue; */
1807  /* __kmpc_end_taskq_task() has already run, so this is the last task. */
1808  /* Run it with TQF_IS_LAST_TASK so the instrumentation does the copy-out. */
1809 
1810  /* no need for test_then_or call since already locked */
1811  thunk->th_flags |= TQF_IS_LAST_TASK;
1812  }
1813 
1814  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, thunk, queue));
1815 
1816  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1817  }
1818 
1819  /* destroy the unattached serial queue now that there is no more work to do */
1820  KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n", queue, global_tid));
1821  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1822 
1823 #ifdef KMP_DEBUG
1824  i = 0;
1825  for (thunk=queue->tq_free_thunks; thunk != NULL; thunk=thunk->th.th_next_free)
1826  ++i;
1827  KMP_DEBUG_ASSERT (i == queue->tq_nslots + 1);
1828 #endif
1829  /* release storage for unattached serial queue */
1830  KF_TRACE(50, ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));
1831 
1832  queue->tq_flags |= TQF_DEALLOCATED;
1833  __kmp_free_taskq ( tq, queue, in_parallel, global_tid );
1834  }
1835 
1836  KE_TRACE( 10, ("__kmpc_end_taskq return (%d)\n", global_tid));
1837 }
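/* Summary of the exit paths above: a nested taskq with TQF_IS_NOWAIT drops its
   reference and returns immediately; a nested waiting taskq drains its children,
   is marked TQF_DEALLOCATED, and is unlinked by the thread that allocated it; the
   outermost taskq steals from descendants until all tasks finish, joins a barrier,
   and one thread frees the root queue and releases the workers; a serialized taskq
   runs its one remaining task (flagged TQF_IS_LAST_TASK if lastprivate) and frees
   the unattached queue. */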
1838 
1839 /* Enqueues a task for thunk previously created by __kmpc_task_buffer. */
1840  /* Returns nonzero if it just filled up the queue */
1841 
1842 kmp_int32
1843 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk)
1844 {
1845  kmp_int32 ret;
1846  kmpc_task_queue_t *queue;
1847  int in_parallel;
1848  kmp_taskq_t *tq;
1849 
1850  KE_TRACE( 10, ("__kmpc_task called (%d)\n", global_tid));
1851 
1852  KMP_DEBUG_ASSERT (!(thunk->th_flags & TQF_TASKQ_TASK)); /* thunk->th_task is a regular task */
1853 
1854  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1855  queue = thunk->th.th_shareds->sv_queue;
1856  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1857 
1858  if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
1859  thunk->th_tasknum = ++queue->tq_tasknum_queuing;
1860 
1861  /* For serial execution dequeue the preceding task and execute it, if one exists */
1862  /* This cannot be the last task. That one is handled in __kmpc_end_taskq */
1863 
1864  if (!in_parallel && queue->tq_nfull > 0) {
1865  kmpc_thunk_t *prev_thunk;
1866 
1867  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1868 
1869  prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1870 
1871  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, prev_thunk, queue));
1872 
1873  __kmp_execute_task_from_queue( tq, loc, global_tid, prev_thunk, in_parallel );
1874  }
1875 
1876  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private */
1877  /* variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the */
1878  /* task queue is not full and allocates a thunk (which is then passed to */
1879  /* __kmpc_task()). So, the enqueue below should never fail due to a full queue. */
1880 
1881  KF_TRACE(100, ("After enqueueing this Task on (%d):\n", global_tid));
1882  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1883 
1884  ret = __kmp_enqueue_task ( tq, global_tid, queue, thunk, in_parallel );
1885 
1886  KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
1887  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1888 
1889  KE_TRACE( 10, ("__kmpc_task return (%d)\n", global_tid));
1890 
1891  return ret;
1892 }
1893 
1894 /* enqueues a taskq_task for thunk previously created by __kmpc_taskq */
1895 /* this should never be called unless in a parallel context */
1896 
1897 void
1898 __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, kmp_int32 status)
1899 {
1900  kmpc_task_queue_t *queue;
1901  kmp_taskq_t *tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1902  int tid = __kmp_tid_from_gtid( global_tid );
1903 
1904  KE_TRACE( 10, ("__kmpc_taskq_task called (%d)\n", global_tid));
1905  KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
1906  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1907 
1908  queue = thunk->th.th_shareds->sv_queue;
1909 
1910  if ( __kmp_env_consistency_check )
1911  __kmp_pop_workshare( global_tid, ct_taskq, loc );
1912 
1913  /* thunk->th_task is the taskq_task */
1914  KMP_DEBUG_ASSERT (thunk->th_flags & TQF_TASKQ_TASK);
1915 
1916  /* not supposed to call __kmpc_taskq_task if it's already enqueued */
1917  KMP_DEBUG_ASSERT (queue->tq_taskq_slot == NULL);
1918 
1919  /* dequeue taskq thunk from curr_thunk stack */
1920  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1921  thunk->th_encl_thunk = NULL;
1922 
1923  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1924 
1925  thunk->th_status = status;
1926 
1927  KMP_MB(); /* flush thunk->th_status before taskq_task enqueued to avoid race condition */
1928 
1929  /* enqueue taskq_task in thunk into special slot in queue */
1930  /* GEH - probably don't need to lock the taskq slot since only one */
1931  /* thread enqueues and a lock is already set at the dequeue point */
1932 
1933  queue->tq_taskq_slot = thunk;
1934 
1935  KE_TRACE( 10, ("__kmpc_taskq_task return (%d)\n", global_tid));
1936 }
1937 
1938 /* ends a taskq_task; done generating tasks */
1939 
1940 void
1941 __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk)
1942 {
1943  kmp_taskq_t *tq;
1944  kmpc_task_queue_t *queue;
1945  int in_parallel;
1946  int tid;
1947 
1948  KE_TRACE( 10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));
1949 
1950  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1951  queue = thunk->th.th_shareds->sv_queue;
1952  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1953  tid = __kmp_tid_from_gtid( global_tid );
1954 
1955  if ( __kmp_env_consistency_check )
1956  __kmp_pop_workshare( global_tid, ct_taskq, loc );
1957 
1958  if (in_parallel) {
1959 #if KMP_ARCH_X86 || \
1960  KMP_ARCH_X86_64
1961 
1962  KMP_TEST_THEN_OR32( &queue->tq_flags, (kmp_int32) TQF_ALL_TASKS_QUEUED );
1963 #else
1964  {
1965  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1966 
1967  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1968  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1969 
1970  queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
1971 
1972  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1973  }
1974 #endif
1975  }
1976 
1977  if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
1978  /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in the */
1979  /* queue if TQF_IS_LASTPRIVATE so we can positively identify that last task */
1980  /* and run it with its TQF_IS_LAST_TASK bit turned on in th_flags. When */
1981  /* __kmpc_end_taskq_task() is called we are done generating all the tasks, so */
1982  /* we know the last one in the queue is the lastprivate task. Mark the queue */
1983  /* as having gotten to this state via tq_flags & TQF_IS_LAST_TASK; when that */
1984  /* task actually executes mark it via th_flags & TQF_IS_LAST_TASK (this th_flags */
1985  /* bit signals the instrumented code to do copy-outs after execution). */
1986 
1987  if (! in_parallel) {
1988  /* No synchronization needed for serial context */
1989  queue->tq_flags |= TQF_IS_LAST_TASK;
1990  }
1991  else {
1992 #if KMP_ARCH_X86 || \
1993  KMP_ARCH_X86_64
1994 
1995  KMP_TEST_THEN_OR32( &queue->tq_flags, (kmp_int32) TQF_IS_LAST_TASK );
1996 #else
1997  {
1998  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1999 
2000  KMP_MB(); /* make sure data structures are in consistent state before querying them */
2001  /* Seems to work without this call for digital/alpha, needed for IBM/RS6000 */
2002 
2003  queue->tq_flags |= TQF_IS_LAST_TASK;
2004 
2005  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
2006  }
2007 #endif
2008  /* to prevent race condition where last task is dequeued but */
2009  /* flag isn't visible yet (not sure about this) */
2010  KMP_MB();
2011  }
2012  }
2013 
2014  /* dequeue taskq thunk from curr_thunk stack */
2015  if (in_parallel) {
2016  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
2017  thunk->th_encl_thunk = NULL;
2018 
2019  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
2020  }
2021 
2022  KE_TRACE( 10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
2023 }
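/* Note: the TQF_IS_LAST_TASK bit set above lives in the queue's tq_flags; the
   corresponding thunk-level bit is only set later, when that final task is
   actually dispatched (see the serialized case in __kmpc_end_taskq and the
   lastprivate comment above), so the instrumented code knows to do the copy-out. */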
2024 
2025 /* returns thunk for a regular task based on taskq_thunk */
2026 /* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK) */
2027 
2028 kmpc_thunk_t *
2029 __kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk, kmpc_task_t task)
2030 {
2031  kmp_taskq_t *tq;
2032  kmpc_task_queue_t *queue;
2033  kmpc_thunk_t *new_thunk;
2034  int in_parallel;
2035 
2036  KE_TRACE( 10, ("__kmpc_task_buffer called (%d)\n", global_tid));
2037 
2038  KMP_DEBUG_ASSERT (taskq_thunk->th_flags & TQF_TASKQ_TASK); /* taskq_thunk->th_task is the taskq_task */
2039 
2040  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
2041  queue = taskq_thunk->th.th_shareds->sv_queue;
2042  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
2043 
2044  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private */
2045  /* variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the */
2046  /* task queue is not full and allocates a thunk (which is then passed to */
2047  /* __kmpc_task()). So, we can pre-allocate a thunk here assuming it will be */
2048  /* the next to be enqueued in __kmpc_task(). */
2049 
2050  new_thunk = __kmp_alloc_thunk (queue, in_parallel, global_tid);
2051  new_thunk->th.th_shareds = (kmpc_shared_vars_t *) queue->tq_shareds[0].ai_data;
2052  new_thunk->th_encl_thunk = NULL;
2053  new_thunk->th_task = task;
2054 
2055  /* GEH - shouldn't need to lock the read of tq_flags here */
2056  new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS;
2057 
2058  new_thunk->th_status = 0;
2059 
2060  KMP_DEBUG_ASSERT (!(new_thunk->th_flags & TQF_TASKQ_TASK));
2061 
2062  KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
2063  KF_DUMP(100, __kmp_dump_thunk( tq, new_thunk, global_tid ));
2064 
2065  KE_TRACE( 10, ("__kmpc_task_buffer return (%d)\n", global_tid));
2066 
2067  return new_thunk;
2068 }
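/* A minimal illustrative sketch (never compiled) of the per-task instrumentation
   sequence described in the comments above: __kmpc_task_buffer(), initialize the
   private variables, then __kmpc_task().  The names example_generate_one_task and
   example_loc are hypothetical placeholders, not part of the runtime or of any
   particular compiler's output. */
#if 0
static void
example_generate_one_task( ident_t *example_loc, kmp_int32 gtid,
                           kmpc_thunk_t *taskq_thunk, kmpc_task_t task )
{
    /* reserve a thunk; per the comments in __kmpc_task() and above, the queue is
       not full at this point, so the enqueue below should not fail for that reason */
    kmpc_thunk_t *t = __kmpc_task_buffer( example_loc, gtid, taskq_thunk, task );

    /* the generated code would initialize the task's private variables in *t here */

    /* enqueue the task; a nonzero return means the queue just became full */
    (void) __kmpc_task( example_loc, gtid, t );
}
#endif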
2069 
2070 /* --------------------------------------------------------------------------- */