Intel® OpenMP* Runtime Library
kmp_taskq.c
1 /*
2  * kmp_taskq.c -- TASKQ support for OpenMP.
3  * $Revision: 43389 $
4  * $Date: 2014-08-11 10:54:01 -0500 (Mon, 11 Aug 2014) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2014 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "kmp.h"
38 #include "kmp_i18n.h"
39 #include "kmp_io.h"
40 #include "kmp_error.h"
41 
42 #define MAX_MESSAGE 512
43 
44 /* ------------------------------------------------------------------------ */
45 /* ------------------------------------------------------------------------ */
46 
47 /*
48  * Taskq routines and global variables
49  */
50 
51 #define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x);
52 
53 #define THREAD_ALLOC_FOR_TASKQ
54 
55 static int
56 in_parallel_context( kmp_team_t *team )
57 {
58  return ! team -> t.t_serialized;
59 }
60 
61 static void
62 __kmp_taskq_eo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
63 {
64  int gtid = *gtid_ref;
65  int tid = __kmp_tid_from_gtid( gtid );
66  kmp_uint32 spins;
67  kmp_uint32 my_token;
68  kmpc_task_queue_t *taskq;
69  kmp_taskq_t *tq = & __kmp_threads[gtid] -> th.th_team -> t.t_taskq;
70 
71  if ( __kmp_env_consistency_check )
72  __kmp_push_sync( gtid, ct_ordered_in_taskq, loc_ref, NULL );
73 
74  if ( ! __kmp_threads[ gtid ]-> th.th_team -> t.t_serialized ) {
75  KMP_MB(); /* Flush all pending memory write invalidates. */
76 
77  /* GEH - need a check here under stats to make sure that */
78  /* we are inside a task (curr_thunk[tid] != NULL) */
79 
80  my_token = tq->tq_curr_thunk[ tid ]-> th_tasknum;
81 
82  taskq = tq->tq_curr_thunk[ tid ]-> th.th_shareds -> sv_queue;
83 
84  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
85  KMP_MB();
86  }
87 }
88 
89 static void
90 __kmp_taskq_xo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
91 {
92  int gtid = *gtid_ref;
93  int tid = __kmp_tid_from_gtid( gtid );
94  kmp_uint32 my_token;
95  kmp_taskq_t *tq = & __kmp_threads[gtid] -> th.th_team -> t.t_taskq;
96 
97  if ( __kmp_env_consistency_check )
98  __kmp_pop_sync( gtid, ct_ordered_in_taskq, loc_ref );
99 
100  if ( ! __kmp_threads[ gtid ]-> th.th_team -> t.t_serialized ) {
101  KMP_MB(); /* Flush all pending memory write invalidates. */
102 
103  /* GEH - need a check here under stats to make sure that */
104  /* we are inside a task (curr_thunk[tid] != NULL) */
105 
106  my_token = tq->tq_curr_thunk[ tid ]->th_tasknum;
107 
108  KMP_MB(); /* Flush all pending memory write invalidates. */
109 
110  tq->tq_curr_thunk[ tid ]-> th.th_shareds -> sv_queue -> tq_tasknum_serving = my_token + 1;
111 
112  KMP_MB(); /* Flush all pending memory write invalidates. */
113  }
114 }
115 
116 static void
117 __kmp_taskq_check_ordered( kmp_int32 gtid, kmpc_thunk_t *thunk )
118 {
119  kmp_uint32 spins;
120  kmp_uint32 my_token;
121  kmpc_task_queue_t *taskq;
122 
123  /* assume we are always called from an active parallel context */
124 
125  KMP_MB(); /* Flush all pending memory write invalidates. */
126 
127  my_token = thunk -> th_tasknum;
128 
129  taskq = thunk -> th.th_shareds -> sv_queue;
130 
131  if(taskq->tq_tasknum_serving <= my_token) {
132  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
133  KMP_MB();
134  taskq->tq_tasknum_serving = my_token +1;
135  KMP_MB();
136  }
137 }
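
/*
 * Illustrative-only sketch (not part of the runtime) of the ticket scheme the
 * three routines above implement: tq_tasknum_queuing hands each queued task a
 * ticket (stored in th_tasknum) and tq_tasknum_serving gates which task may
 * enter its ORDERED section.  The names and the C11 atomics below are
 * assumptions for the sketch; the runtime uses volatile counters plus
 * KMP_WAIT_YIELD instead of a hard spin.
 */
#include <stdatomic.h>

typedef struct {
    atomic_uint queuing; /* plays the role of tq_tasknum_queuing */
    atomic_uint serving; /* plays the role of tq_tasknum_serving */
} ordered_gate_example_t;

/* on enqueue: take the next ticket (tickets start at 1, serving starts at 1) */
static unsigned gate_take_ticket_example(ordered_gate_example_t *g) {
    return atomic_fetch_add(&g->queuing, 1u) + 1u;
}

/* on entry to the ORDERED section: wait until it is our turn */
static void gate_enter_example(ordered_gate_example_t *g, unsigned ticket) {
    while (atomic_load(&g->serving) < ticket)
        ; /* __kmp_taskq_eo() yields here via KMP_WAIT_YIELD */
}

/* on exit from the ORDERED section: admit the next ticket */
static void gate_exit_example(ordered_gate_example_t *g, unsigned ticket) {
    atomic_store(&g->serving, ticket + 1u);
}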
138 
139 static void
140 __kmp_dump_TQF(kmp_int32 flags)
141 {
142  if (flags & TQF_IS_ORDERED)
143  __kmp_printf("ORDERED ");
144  if (flags & TQF_IS_LASTPRIVATE)
145  __kmp_printf("LAST_PRIV ");
146  if (flags & TQF_IS_NOWAIT)
147  __kmp_printf("NOWAIT ");
148  if (flags & TQF_HEURISTICS)
149  __kmp_printf("HEURIST ");
150  if (flags & TQF_INTERFACE_RESERVED1)
151  __kmp_printf("RESERV1 ");
152  if (flags & TQF_INTERFACE_RESERVED2)
153  __kmp_printf("RESERV2 ");
154  if (flags & TQF_INTERFACE_RESERVED3)
155  __kmp_printf("RESERV3 ");
156  if (flags & TQF_INTERFACE_RESERVED4)
157  __kmp_printf("RESERV4 ");
158  if (flags & TQF_IS_LAST_TASK)
159  __kmp_printf("LAST_TASK ");
160  if (flags & TQF_TASKQ_TASK)
161  __kmp_printf("TASKQ_TASK ");
162  if (flags & TQF_RELEASE_WORKERS)
163  __kmp_printf("RELEASE ");
164  if (flags & TQF_ALL_TASKS_QUEUED)
165  __kmp_printf("ALL_QUEUED ");
166  if (flags & TQF_PARALLEL_CONTEXT)
167  __kmp_printf("PARALLEL ");
168  if (flags & TQF_DEALLOCATED)
169  __kmp_printf("DEALLOC ");
170  if (!(flags & (TQF_INTERNAL_FLAGS|TQF_INTERFACE_FLAGS)))
171  __kmp_printf("(NONE)");
172 }
173 
174 static void
175 __kmp_dump_thunk( kmp_taskq_t *tq, kmpc_thunk_t *thunk, kmp_int32 global_tid )
176 {
177  int i;
178  int nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
179 
180  __kmp_printf("\tThunk at %p on (%d): ", thunk, global_tid);
181 
182  if (thunk != NULL) {
183  for (i = 0; i < nproc; i++) {
184  if( tq->tq_curr_thunk[i] == thunk ) {
185  __kmp_printf("[%i] ", i);
186  }
187  }
188  __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds);
189  __kmp_printf("th_task=%p, ", thunk->th_task);
190  __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk);
191  __kmp_printf("th_status=%d, ", thunk->th_status);
192  __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum);
193  __kmp_printf("th_flags="); __kmp_dump_TQF(thunk->th_flags);
194  }
195 
196  __kmp_printf("\n");
197 }
198 
199 static void
200 __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num)
201 {
202  kmpc_thunk_t *th;
203 
204  __kmp_printf(" Thunk stack for T#%d: ", thread_num);
205 
206  for (th = thunk; th != NULL; th = th->th_encl_thunk )
207  __kmp_printf("%p ", th);
208 
209  __kmp_printf("\n");
210 }
211 
212 static void
213 __kmp_dump_task_queue( kmp_taskq_t *tq, kmpc_task_queue_t *queue, kmp_int32 global_tid )
214 {
215  int qs, count, i;
216  kmpc_thunk_t *thunk;
217  kmpc_task_queue_t *taskq;
218 
219  __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid);
220 
221  if (queue != NULL) {
222  int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT;
223 
224  if ( __kmp_env_consistency_check ) {
225  __kmp_printf(" tq_loc : ");
226  }
227  if (in_parallel) {
228 
229  //if (queue->tq.tq_parent != 0)
230  //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
231 
232  //__kmp_acquire_lock(& queue->tq_link_lck, global_tid);
233 
234  KMP_MB(); /* make sure data structures are in consistent state before querying them */
235  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
236 
237  __kmp_printf(" tq_parent : %p\n", queue->tq.tq_parent);
238  __kmp_printf(" tq_first_child : %p\n", queue->tq_first_child);
239  __kmp_printf(" tq_next_child : %p\n", queue->tq_next_child);
240  __kmp_printf(" tq_prev_child : %p\n", queue->tq_prev_child);
241  __kmp_printf(" tq_ref_count : %d\n", queue->tq_ref_count);
242 
243  //__kmp_release_lock(& queue->tq_link_lck, global_tid);
244 
245  //if (queue->tq.tq_parent != 0)
246  //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
247 
248  //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
249  //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
250 
251  KMP_MB(); /* make sure data structures are in consistent state before querying them */
252  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
253  }
254 
255  __kmp_printf(" tq_shareds : ");
256  for (i=0; i<((queue == tq->tq_root) ? queue->tq_nproc : 1); i++)
257  __kmp_printf("%p ", queue->tq_shareds[i].ai_data);
258  __kmp_printf("\n");
259 
260  if (in_parallel) {
261  __kmp_printf(" tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing);
262  __kmp_printf(" tq_tasknum_serving : %u\n", queue->tq_tasknum_serving);
263  }
264 
265  __kmp_printf(" tq_queue : %p\n", queue->tq_queue);
266  __kmp_printf(" tq_thunk_space : %p\n", queue->tq_thunk_space);
267  __kmp_printf(" tq_taskq_slot : %p\n", queue->tq_taskq_slot);
268 
269  __kmp_printf(" tq_free_thunks : ");
270  for (thunk = queue->tq_free_thunks; thunk != NULL; thunk = thunk->th.th_next_free )
271  __kmp_printf("%p ", thunk);
272  __kmp_printf("\n");
273 
274  __kmp_printf(" tq_nslots : %d\n", queue->tq_nslots);
275  __kmp_printf(" tq_head : %d\n", queue->tq_head);
276  __kmp_printf(" tq_tail : %d\n", queue->tq_tail);
277  __kmp_printf(" tq_nfull : %d\n", queue->tq_nfull);
278  __kmp_printf(" tq_hiwat : %d\n", queue->tq_hiwat);
279  __kmp_printf(" tq_flags : "); __kmp_dump_TQF(queue->tq_flags);
280  __kmp_printf("\n");
281 
282  if (in_parallel) {
283  __kmp_printf(" tq_th_thunks : ");
284  for (i = 0; i < queue->tq_nproc; i++) {
285  __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data);
286  }
287  __kmp_printf("\n");
288  }
289 
290  __kmp_printf("\n");
291  __kmp_printf(" Queue slots:\n");
292 
293 
294  qs = queue->tq_tail;
295  for ( count = 0; count < queue->tq_nfull; ++count ) {
296  __kmp_printf("(%d)", qs);
297  __kmp_dump_thunk( tq, queue->tq_queue[qs].qs_thunk, global_tid );
298  qs = (qs+1) % queue->tq_nslots;
299  }
300 
301  __kmp_printf("\n");
302 
303  if (in_parallel) {
304  if (queue->tq_taskq_slot != NULL) {
305  __kmp_printf(" TaskQ slot:\n");
306  __kmp_dump_thunk( tq, (kmpc_thunk_t *) queue->tq_taskq_slot, global_tid );
307  __kmp_printf("\n");
308  }
309  //__kmp_release_lock(& queue->tq_queue_lck, global_tid);
310  //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
311  }
312  }
313 
314  __kmp_printf(" Taskq freelist: ");
315 
316  //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
317 
318  KMP_MB(); /* make sure data structures are in consistent state before querying them */
319  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
320 
321  for( taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free )
322  __kmp_printf("%p ", taskq);
323 
324  //__kmp_release_lock( & tq->tq_freelist_lck, global_tid );
325 
326  __kmp_printf("\n\n");
327 }
328 
329 static void
330 __kmp_aux_dump_task_queue_tree( kmp_taskq_t *tq, kmpc_task_queue_t *curr_queue, kmp_int32 level, kmp_int32 global_tid )
331 {
332  int i, count, qs;
333  int nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
334  kmpc_task_queue_t *queue = curr_queue;
335 
336  if (curr_queue == NULL)
337  return;
338 
339  __kmp_printf(" ");
340 
341  for (i=0; i<level; i++)
342  __kmp_printf(" ");
343 
344  __kmp_printf("%p", curr_queue);
345 
346  for (i = 0; i < nproc; i++) {
347  if( tq->tq_curr_thunk[i] && tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue ) {
348  __kmp_printf(" [%i]", i);
349  }
350  }
351 
352  __kmp_printf(":");
353 
354  //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid);
355 
356  KMP_MB(); /* make sure data structures are in consistent state before querying them */
357  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
358 
359  qs = curr_queue->tq_tail;
360 
361  for ( count = 0; count < curr_queue->tq_nfull; ++count ) {
362  __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk);
363  qs = (qs+1) % curr_queue->tq_nslots;
364  }
365 
366  //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid);
367 
368  __kmp_printf("\n");
369 
370  if (curr_queue->tq_first_child) {
371  //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
372 
373  KMP_MB(); /* make sure data structures are in consistent state before querying them */
374  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
375 
376  if (curr_queue->tq_first_child) {
377  for(queue = (kmpc_task_queue_t *)curr_queue->tq_first_child;
378  queue != NULL;
379  queue = queue->tq_next_child) {
380  __kmp_aux_dump_task_queue_tree( tq, queue, level+1, global_tid );
381  }
382  }
383 
384  //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
385  }
386 }
387 
388 static void
389 __kmp_dump_task_queue_tree( kmp_taskq_t *tq, kmpc_task_queue_t *tqroot, kmp_int32 global_tid)
390 {
391  __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid);
392 
393  __kmp_aux_dump_task_queue_tree( tq, tqroot, 0, global_tid );
394 
395  __kmp_printf("\n");
396 }
397 
398 /* --------------------------------------------------------------------------- */
399 
400 /*
401  New taskq storage routines that try to minimize overhead of mallocs but
402  still provide cache line alignment.
403 */
404 
405 
406 static void *
407 __kmp_taskq_allocate(size_t size, kmp_int32 global_tid)
408 {
409  void *addr, *orig_addr;
410  size_t bytes;
411 
412  KB_TRACE( 5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int) size, global_tid ) );
413 
414  bytes = sizeof(void *) + CACHE_LINE + size;
415 
416 #ifdef THREAD_ALLOC_FOR_TASKQ
417  orig_addr = (void *) __kmp_thread_malloc( __kmp_thread_from_gtid(global_tid), bytes );
418 #else
419  KE_TRACE( 10, ("%%%%%% MALLOC( %d )\n", bytes ) );
420  orig_addr = (void *) KMP_INTERNAL_MALLOC( bytes );
421 #endif /* THREAD_ALLOC_FOR_TASKQ */
422 
423  if (orig_addr == 0)
424  KMP_FATAL( OutOfHeapMemory );
425 
426  addr = orig_addr;
427 
428  if (((kmp_uintptr_t) addr & ( CACHE_LINE - 1 )) != 0) {
429  KB_TRACE( 50, ("__kmp_taskq_allocate: adjust for cache alignment\n" ) );
430  addr = (void *) (((kmp_uintptr_t) addr + CACHE_LINE) & ~( CACHE_LINE - 1 ));
431  }
432 
433  (* (void **) addr) = orig_addr;
434 
435  KB_TRACE( 10, ("__kmp_taskq_allocate: allocate: %p, use: %p - %p, size: %d, gtid: %d\n",
436  orig_addr, ((void **) addr) + 1, ((char *)(((void **) addr) + 1)) + size-1,
437  (int) size, global_tid ));
438 
439  return ( ((void **) addr) + 1 );
440 }
441 
442 static void
443 __kmpc_taskq_free(void *p, kmp_int32 global_tid)
444 {
445  KB_TRACE( 5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid ) );
446 
447  KB_TRACE(10, ("__kmpc_taskq_free: freeing: %p, gtid: %d\n", (*( ((void **) p)-1)), global_tid ));
448 
449 #ifdef THREAD_ALLOC_FOR_TASKQ
450  __kmp_thread_free( __kmp_thread_from_gtid(global_tid), *( ((void **) p)-1) );
451 #else
452  KMP_INTERNAL_FREE( *( ((void **) p)-1) );
453 #endif /* THREAD_ALLOC_FOR_TASKQ */
454 }
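
/*
 * Standalone sketch (illustrative only) of the alignment trick used by the
 * pair above: over-allocate, round the pointer up to a cache-line boundary,
 * and stash the original pointer one word below the address handed back so
 * the free routine can recover it.  The 64-byte line size and the use of
 * plain malloc/free (instead of the per-thread allocator) are assumptions
 * for the sketch.
 */
#include <stdlib.h>
#include <stdint.h>

#define EXAMPLE_CACHE_LINE 64u

static void *aligned_alloc_example(size_t size) {
    void *orig = malloc(sizeof(void *) + EXAMPLE_CACHE_LINE + size);
    if (orig == NULL)
        return NULL;

    uintptr_t p = (uintptr_t) orig;
    if (p & (EXAMPLE_CACHE_LINE - 1))                 /* not aligned yet? */
        p = (p + EXAMPLE_CACHE_LINE) & ~(uintptr_t)(EXAMPLE_CACHE_LINE - 1);

    *(void **) p = orig;     /* remember the block that was really allocated */
    return (void **) p + 1;  /* caller's space begins one pointer later      */
}

static void aligned_free_example(void *user) {
    free(*((void **) user - 1));  /* the original pointer sits just below */
}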
455 
456 /* --------------------------------------------------------------------------- */
457 
458 /*
459  * Keep freed kmpc_task_queue_t on an internal freelist and recycle since
460  * they're of constant size.
461  */
462 
463 static kmpc_task_queue_t *
464 __kmp_alloc_taskq ( kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots, kmp_int32 nthunks,
465  kmp_int32 nshareds, kmp_int32 nproc, size_t sizeof_thunk,
466  size_t sizeof_shareds, kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid )
467 {
468  kmp_int32 i;
469  size_t bytes;
470  kmpc_task_queue_t *new_queue;
471  kmpc_aligned_shared_vars_t *shared_var_array;
472  char *shared_var_storage;
473  char *pt; /* for doing byte-adjusted address computations */
474 
475  __kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
476 
477  KMP_MB(); /* make sure data structures are in consistent state before querying them */
478  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
479 
480  if( tq->tq_freelist ) {
481  new_queue = tq -> tq_freelist;
482  tq -> tq_freelist = tq -> tq_freelist -> tq.tq_next_free;
483 
484  KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED);
485 
486  new_queue->tq_flags = 0;
487 
488  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
489  }
490  else {
491  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
492 
493  new_queue = (kmpc_task_queue_t *) __kmp_taskq_allocate (sizeof (kmpc_task_queue_t), global_tid);
494  new_queue->tq_flags = 0;
495  }
496 
497  /* space in the task queue for queue slots (allocate as one big chunk */
498  /* of storage including new_taskq_task space) */
499 
500  sizeof_thunk += (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */
501  pt = (char *) __kmp_taskq_allocate (nthunks * sizeof_thunk, global_tid);
502  new_queue->tq_thunk_space = (kmpc_thunk_t *)pt;
503  *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk);
504 
505  /* chain the allocated thunks into a freelist for this queue */
506 
507  new_queue->tq_free_thunks = (kmpc_thunk_t *)pt;
508 
509  for (i = 0; i < (nthunks - 2); i++) {
510  ((kmpc_thunk_t *)(pt+i*sizeof_thunk))->th.th_next_free = (kmpc_thunk_t *)(pt + (i+1)*sizeof_thunk);
511 #ifdef KMP_DEBUG
512  ((kmpc_thunk_t *)(pt+i*sizeof_thunk))->th_flags = TQF_DEALLOCATED;
513 #endif
514  }
515 
516  ((kmpc_thunk_t *)(pt+(nthunks-2)*sizeof_thunk))->th.th_next_free = NULL;
517 #ifdef KMP_DEBUG
518  ((kmpc_thunk_t *)(pt+(nthunks-2)*sizeof_thunk))->th_flags = TQF_DEALLOCATED;
519 #endif
520 
521  /* initialize the locks */
522 
523  if (in_parallel) {
524  __kmp_init_lock( & new_queue->tq_link_lck );
525  __kmp_init_lock( & new_queue->tq_free_thunks_lck );
526  __kmp_init_lock( & new_queue->tq_queue_lck );
527  }
528 
529  /* now allocate the slots */
530 
531  bytes = nslots * sizeof (kmpc_aligned_queue_slot_t);
532  new_queue->tq_queue = (kmpc_aligned_queue_slot_t *) __kmp_taskq_allocate( bytes, global_tid );
533 
534  /* space for array of pointers to shared variable structures */
535  sizeof_shareds += sizeof(kmpc_task_queue_t *);
536  sizeof_shareds += (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */
537 
538  bytes = nshareds * sizeof (kmpc_aligned_shared_vars_t);
539  shared_var_array = (kmpc_aligned_shared_vars_t *) __kmp_taskq_allocate ( bytes, global_tid);
540 
541  bytes = nshareds * sizeof_shareds;
542  shared_var_storage = (char *) __kmp_taskq_allocate ( bytes, global_tid);
543 
544  for (i=0; i<nshareds; i++) {
545  shared_var_array[i].ai_data = (kmpc_shared_vars_t *) (shared_var_storage + i*sizeof_shareds);
546  shared_var_array[i].ai_data->sv_queue = new_queue;
547  }
548  new_queue->tq_shareds = shared_var_array;
549 
550 
551  /* array for number of outstanding thunks per thread */
552 
553  if (in_parallel) {
554  bytes = nproc * sizeof(kmpc_aligned_int32_t);
555  new_queue->tq_th_thunks = (kmpc_aligned_int32_t *) __kmp_taskq_allocate ( bytes, global_tid);
556  new_queue->tq_nproc = nproc;
557 
558  for (i=0; i<nproc; i++)
559  new_queue->tq_th_thunks[i].ai_data = 0;
560  }
561 
562  return new_queue;
563 }
564 
565 static void
566 __kmp_free_taskq (kmp_taskq_t *tq, kmpc_task_queue_t *p, int in_parallel, kmp_int32 global_tid)
567 {
568  __kmpc_taskq_free(p->tq_thunk_space, global_tid);
569  __kmpc_taskq_free(p->tq_queue, global_tid);
570 
571  /* free shared var structure storage */
572  __kmpc_taskq_free((void *) p->tq_shareds[0].ai_data, global_tid);
573 
574  /* free array of pointers to shared vars storage */
575  __kmpc_taskq_free(p->tq_shareds, global_tid);
576 
577 #ifdef KMP_DEBUG
578  p->tq_first_child = NULL;
579  p->tq_next_child = NULL;
580  p->tq_prev_child = NULL;
581  p->tq_ref_count = -10;
582  p->tq_shareds = NULL;
583  p->tq_tasknum_queuing = 0;
584  p->tq_tasknum_serving = 0;
585  p->tq_queue = NULL;
586  p->tq_thunk_space = NULL;
587  p->tq_taskq_slot = NULL;
588  p->tq_free_thunks = NULL;
589  p->tq_nslots = 0;
590  p->tq_head = 0;
591  p->tq_tail = 0;
592  p->tq_nfull = 0;
593  p->tq_hiwat = 0;
594 
595  if (in_parallel) {
596  int i;
597 
598  for (i=0; i<p->tq_nproc; i++)
599  p->tq_th_thunks[i].ai_data = 0;
600  }
601  if ( __kmp_env_consistency_check )
602  p->tq_loc = NULL;
603  KMP_DEBUG_ASSERT( p->tq_flags & TQF_DEALLOCATED );
604  p->tq_flags = TQF_DEALLOCATED;
605 #endif /* KMP_DEBUG */
606 
607  if (in_parallel) {
608  __kmpc_taskq_free(p->tq_th_thunks, global_tid);
609  __kmp_destroy_lock(& p->tq_link_lck);
610  __kmp_destroy_lock(& p->tq_queue_lck);
611  __kmp_destroy_lock(& p->tq_free_thunks_lck);
612  }
613 #ifdef KMP_DEBUG
614  p->tq_th_thunks = NULL;
615 #endif /* KMP_DEBUG */
616 
617  KMP_MB(); /* make sure data structures are in consistent state before querying them */
618  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
619 
620  __kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
621  p->tq.tq_next_free = tq->tq_freelist;
622 
623  tq->tq_freelist = p;
624  __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
625 }
626 
627 /*
628  * Once a group of thunks has been allocated for use in a particular queue,
629  * these are managed via a per-queue freelist.
630  * We force a check that there's always a thunk free if we need one.
631  */
632 
633 static kmpc_thunk_t *
634 __kmp_alloc_thunk (kmpc_task_queue_t *queue, int in_parallel, kmp_int32 global_tid)
635 {
636  kmpc_thunk_t *fl;
637 
638  if (in_parallel) {
639  __kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
640 
641  KMP_MB(); /* make sure data structures are in consistent state before querying them */
642  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
643  }
644 
645  fl = queue->tq_free_thunks;
646 
647  KMP_DEBUG_ASSERT (fl != NULL);
648 
649  queue->tq_free_thunks = fl->th.th_next_free;
650  fl->th_flags = 0;
651 
652  if (in_parallel)
653  __kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
654 
655  return fl;
656 }
657 
658 static void
659 __kmp_free_thunk (kmpc_task_queue_t *queue, kmpc_thunk_t *p, int in_parallel, kmp_int32 global_tid)
660 {
661 #ifdef KMP_DEBUG
662  p->th_task = 0;
663  p->th_encl_thunk = 0;
664  p->th_status = 0;
665  p->th_tasknum = 0;
666  /* Also could zero pointers to private vars */
667 #endif
668 
669  if (in_parallel) {
670  __kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
671 
672  KMP_MB(); /* make sure data structures are in consistent state before querying them */
673  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
674  }
675 
676  p->th.th_next_free = queue->tq_free_thunks;
677  queue->tq_free_thunks = p;
678 
679 #ifdef KMP_DEBUG
680  p->th_flags = TQF_DEALLOCATED;
681 #endif
682 
683  if (in_parallel)
684  __kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
685 }
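
/*
 * Illustrative-only sketch of the intrusive LIFO freelist pattern used both
 * for recycled queues (tq_freelist / tq.tq_next_free) and for per-queue
 * thunks (tq_free_thunks / th.th_next_free): freed objects are threaded
 * through a "next free" field and handed back out without touching the
 * allocator.  The type and function names are assumptions for the sketch,
 * and the locking the runtime does (tq_freelist_lck, tq_free_thunks_lck)
 * is omitted here.
 */
typedef struct free_item_example {
    struct free_item_example *next_free;
    /* ...payload is reused as-is when the item is recycled... */
} free_item_example_t;

static free_item_example_t *freelist_pop_example(free_item_example_t **head) {
    free_item_example_t *item = *head;  /* caller guarantees non-empty, as the */
    *head = item->next_free;            /* runtime does with KMP_DEBUG_ASSERT  */
    return item;
}

static void freelist_push_example(free_item_example_t **head,
                                  free_item_example_t *item) {
    item->next_free = *head;
    *head = item;
}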
686 
687 /* --------------------------------------------------------------------------- */
688 
689 /* returns nonzero if the queue just became full after the enqueue */
690 
691 static kmp_int32
692 __kmp_enqueue_task ( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue, kmpc_thunk_t *thunk, int in_parallel )
693 {
694  kmp_int32 ret;
695 
696  /* dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the master is executing then) */
697  if (in_parallel) {
698  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
699 
700  KMP_MB(); /* make sure data structures are in consistent state before querying them */
701  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
702  }
703 
704  KMP_DEBUG_ASSERT (queue->tq_nfull < queue->tq_nslots); /* check queue not full */
705 
706  queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk;
707 
708  if (queue->tq_head >= queue->tq_nslots)
709  queue->tq_head = 0;
710 
711  (queue->tq_nfull)++;
712 
713  KMP_MB(); /* to assure that nfull is seen to increase before TQF_ALL_TASKS_QUEUED is set */
714 
715  ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE;
716 
717  if (in_parallel) {
718  /* don't need to wait until workers are released before unlocking */
719  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
720 
721  if( tq->tq_global_flags & TQF_RELEASE_WORKERS ) {
722  /* If just creating the root queue, the worker threads are waiting at */
723  /* a join barrier until now, when there's something in the queue for */
724  /* them to do; release them now to do work. */
725  /* This should only be done when this is the first task enqueued, */
726  /* so reset the flag here also. */
727 
728  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers are still in spin mode */
729 
730  KMP_MB(); /* avoid releasing barrier twice if taskq_task switches threads */
731 
732  __kmpc_end_barrier_master( NULL, global_tid);
733  }
734  }
735 
736  return ret;
737 }
738 
739 static kmpc_thunk_t *
740 __kmp_dequeue_task (kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_parallel)
741 {
742  kmpc_thunk_t *pt;
743  int tid = __kmp_tid_from_gtid( global_tid );
744 
745  KMP_DEBUG_ASSERT (queue->tq_nfull > 0); /* check queue not empty */
746 
747  if (queue->tq.tq_parent != NULL && in_parallel) {
748  int ct;
749  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
750  ct = ++(queue->tq_ref_count);
751  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
752  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
753  __LINE__, global_tid, queue, ct));
754  }
755 
756  pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk;
757 
758  if (queue->tq_tail >= queue->tq_nslots)
759  queue->tq_tail = 0;
760 
761  if (in_parallel) {
762  queue->tq_th_thunks[tid].ai_data++;
763 
764  KMP_MB(); /* necessary so ai_data increment is propagated to other threads immediately (digital) */
765 
766  KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding thunks from queue %p\n",
767  global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue));
768  }
769 
770  (queue->tq_nfull)--;
771 
772 #ifdef KMP_DEBUG
773  KMP_MB();
774 
775  /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is decremented */
776 
777  KMP_DEBUG_ASSERT(queue->tq_nfull >= 0);
778 
779  if (in_parallel) {
780  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <= __KMP_TASKQ_THUNKS_PER_TH);
781  }
782 #endif
783 
784  return pt;
785 }
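
/*
 * Illustrative-only sketch of the circular-buffer bookkeeping used by
 * __kmp_enqueue_task/__kmp_dequeue_task above: tq_head is the next free slot,
 * tq_tail the next occupied slot, tq_nfull the occupancy count, and both
 * indices wrap at tq_nslots.  The type and field names are assumptions for
 * the sketch, and the tq_queue_lck locking and per-thread accounting the
 * runtime does are omitted.
 */
typedef struct {
    void **slots;
    int    nslots;
    int    head;   /* next enqueue goes here  (tq_head)  */
    int    tail;   /* next dequeue reads here (tq_tail)  */
    int    nfull;  /* occupied slot count     (tq_nfull) */
} ring_example_t;

/* returns nonzero if the queue just became full, mirroring __kmp_enqueue_task */
static int ring_enqueue_example(ring_example_t *q, void *item) {
    q->slots[q->head++] = item;
    if (q->head >= q->nslots)
        q->head = 0;
    q->nfull++;
    return q->nfull == q->nslots;
}

static void *ring_dequeue_example(ring_example_t *q) {
    void *item = q->slots[q->tail++];
    if (q->tail >= q->nslots)
        q->tail = 0;
    q->nfull--;
    return item;
}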
786 
787 /*
788  * Find the next (non-null) task to dequeue and return it.
789  * This is never called unless in_parallel=TRUE
790  *
791  * Here are the rules for deciding which queue to take the task from:
792  * 1. Walk up the task queue tree from the current queue's parent and look
793  * on the way up (for loop, below).
794  * 2. Do a depth-first search back down the tree from the root and
795  * look (find_task_in_descendant_queue()).
796  *
797  * Here are the rules for deciding which task to take from a queue
798  * (__kmp_find_task_in_queue ()):
799  * 1. Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task
800  * must be staged to make sure we execute the last one with
801  * TQF_IS_LAST_TASK at the end of task queue execution.
802  * 2. If the queue length is below some high water mark and the taskq task
803  * is enqueued, prefer running the taskq task.
804  * 3. Otherwise, take a (normal) task from the queue.
805  *
806  * If we do all this and return pt == NULL at the bottom of this routine,
807  * this means there are no more tasks to execute (except possibly for
808  * TQF_IS_LASTPRIVATE).
809  */
810 
811 static kmpc_thunk_t *
812 __kmp_find_task_in_queue (kmp_int32 global_tid, kmpc_task_queue_t *queue)
813 {
814  kmpc_thunk_t *pt = NULL;
815  int tid = __kmp_tid_from_gtid( global_tid );
816 
817  /* To prevent deadlock from tq_queue_lck if queue already deallocated */
818  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
819 
820  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
821 
822  /* Check again to avoid race in __kmpc_end_taskq() */
823  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
824 
825  KMP_MB(); /* make sure data structures are in consistent state before querying them */
826  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
827 
828  if ((queue->tq_taskq_slot != NULL) && (queue->tq_nfull <= queue->tq_hiwat)) {
829  /* if there's enough room in the queue and the dispatcher */
830  /* (taskq task) is available, schedule more tasks */
831  pt = (kmpc_thunk_t *) queue->tq_taskq_slot;
832  queue->tq_taskq_slot = NULL;
833  }
834  else if (queue->tq_nfull == 0 ||
835  queue->tq_th_thunks[tid].ai_data >= __KMP_TASKQ_THUNKS_PER_TH) {
836  /* do nothing if no thunks available or this thread can't */
837  /* run any because it already is executing too many */
838 
839  pt = NULL;
840  }
841  else if (queue->tq_nfull > 1) {
842  /* always safe to schedule a task even if TQF_IS_LASTPRIVATE */
843 
844  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
845  }
846  else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) {
847  /* one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE */
848 
849  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
850  }
851  else if (queue->tq_flags & TQF_IS_LAST_TASK) {
852  /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task() */
853  /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
854  /* instrumentation does copy-out. */
855 
856  pt = __kmp_dequeue_task (global_tid, queue, TRUE);
857  pt->th_flags |= TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */
858  }
859  }
860 
861  /* GEH - What happens here if it is lastprivate, but not the last task? */
862  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
863  }
864 
865  return pt;
866 }
867 
868 /*
869  * Walk a tree of queues starting at queue's first child
870  * and return a non-NULL thunk if one can be scheduled.
871  * Must only be called when in_parallel=TRUE
872  */
873 
874 static kmpc_thunk_t *
875 __kmp_find_task_in_descendant_queue (kmp_int32 global_tid, kmpc_task_queue_t *curr_queue)
876 {
877  kmpc_thunk_t *pt = NULL;
878  kmpc_task_queue_t *queue = curr_queue;
879 
880  if (curr_queue->tq_first_child != NULL) {
881  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
882 
883  KMP_MB(); /* make sure data structures are in consistent state before querying them */
884  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
885 
886  queue = (kmpc_task_queue_t *) curr_queue->tq_first_child;
887  if (queue == NULL) {
888  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
889  return NULL;
890  }
891 
892  while (queue != NULL) {
893  int ct;
894  kmpc_task_queue_t *next;
895 
896  ct= ++(queue->tq_ref_count);
897  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
898  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
899  __LINE__, global_tid, queue, ct));
900 
901  pt = __kmp_find_task_in_queue (global_tid, queue);
902 
903  if (pt != NULL) {
904  int ct;
905 
906  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
907 
908  KMP_MB(); /* make sure data structures are in consistent state before querying them */
909  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
910 
911  ct = --(queue->tq_ref_count);
912  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
913  __LINE__, global_tid, queue, ct));
914  KMP_DEBUG_ASSERT( queue->tq_ref_count >= 0 );
915 
916  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
917 
918  return pt;
919  }
920 
921  /* although reference count stays active during descendant walk, shouldn't matter */
922  /* since if children still exist, reference counts aren't being monitored anyway */
923 
924  pt = __kmp_find_task_in_descendant_queue (global_tid, queue);
925 
926  if (pt != NULL) {
927  int ct;
928 
929  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
930 
931  KMP_MB(); /* make sure data structures are in consistent state before querying them */
932  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
933 
934  ct = --(queue->tq_ref_count);
935  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
936  __LINE__, global_tid, queue, ct));
937  KMP_DEBUG_ASSERT( ct >= 0 );
938 
939  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
940 
941  return pt;
942  }
943 
944  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
945 
946  KMP_MB(); /* make sure data structures are in consistent state before querying them */
947  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
948 
949  next = queue->tq_next_child;
950 
951  ct = --(queue->tq_ref_count);
952  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
953  __LINE__, global_tid, queue, ct));
954  KMP_DEBUG_ASSERT( ct >= 0 );
955 
956  queue = next;
957  }
958 
959  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
960  }
961 
962  return pt;
963 }
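
/*
 * Illustrative-only sketch (pthreads) of the hand-over-hand reference counting
 * the walk above relies on: a child's tq_ref_count is bumped while the
 * parent's link lock is held, the lock is dropped while the child is examined,
 * and the lock is retaken to decrement the count and step to the next sibling.
 * A thread deleting a queue waits until the count drains back to 1.  The type,
 * field, and function names here are assumptions for the sketch.
 */
#include <pthread.h>

typedef struct node_example {
    struct node_example *first_child;
    struct node_example *next_sibling;
    int                  ref_count;
    pthread_mutex_t      link_lck;   /* protects the child list and the counts */
} node_example_t;

static void visit_children_example(node_example_t *parent,
                                   void (*work)(node_example_t *)) {
    pthread_mutex_lock(&parent->link_lck);

    node_example_t *child = parent->first_child;
    while (child != NULL) {
        child->ref_count++;                    /* pin the child while unlocked  */
        pthread_mutex_unlock(&parent->link_lck);

        work(child);                           /* e.g. look for a task to steal */

        pthread_mutex_lock(&parent->link_lck);
        node_example_t *next = child->next_sibling;
        child->ref_count--;                    /* unpin; a deleter waits for 1  */
        child = next;
    }

    pthread_mutex_unlock(&parent->link_lck);
}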
964 
965 /*
966  * Walk up the taskq tree looking for a task to execute.
967  * If we get to the root, search the tree for a descendant queue task.
968  * Must only be called when in_parallel=TRUE
969  */
970 
971 static kmpc_thunk_t *
972 __kmp_find_task_in_ancestor_queue (kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue)
973 {
974  kmpc_task_queue_t *queue;
975  kmpc_thunk_t *pt;
976 
977  pt = NULL;
978 
979  if (curr_queue->tq.tq_parent != NULL) {
980  queue = curr_queue->tq.tq_parent;
981 
982  while (queue != NULL) {
983  if (queue->tq.tq_parent != NULL) {
984  int ct;
985  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
986 
987  KMP_MB(); /* make sure data structures are in consistent state before querying them */
988  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
989 
990  ct = ++(queue->tq_ref_count);
991  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
992  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
993  __LINE__, global_tid, queue, ct));
994  }
995 
996  pt = __kmp_find_task_in_queue (global_tid, queue);
997  if (pt != NULL) {
998  if (queue->tq.tq_parent != NULL) {
999  int ct;
1000  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1001 
1002  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1003  /* Seems to work without this call for digital/alpha, needed for IBM/RS6000 */
1004 
1005  ct = --(queue->tq_ref_count);
1006  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1007  __LINE__, global_tid, queue, ct));
1008  KMP_DEBUG_ASSERT( ct >= 0 );
1009 
1010  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1011  }
1012 
1013  return pt;
1014  }
1015 
1016  if (queue->tq.tq_parent != NULL) {
1017  int ct;
1018  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1019 
1020  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1021  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1022 
1023  ct = --(queue->tq_ref_count);
1024  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1025  __LINE__, global_tid, queue, ct));
1026  KMP_DEBUG_ASSERT( ct >= 0 );
1027  }
1028  queue = queue->tq.tq_parent;
1029 
1030  if (queue != NULL)
1031  __kmp_release_lock(& queue->tq_link_lck, global_tid);
1032  }
1033 
1034  }
1035 
1036  pt = __kmp_find_task_in_descendant_queue( global_tid, tq->tq_root );
1037 
1038  return pt;
1039 }
1040 
1041 static int
1042 __kmp_taskq_tasks_finished (kmpc_task_queue_t *queue)
1043 {
1044  int i;
1045 
1046  /* KMP_MB(); */ /* is this really necessary? */
1047 
1048  for (i=0; i<queue->tq_nproc; i++) {
1049  if (queue->tq_th_thunks[i].ai_data != 0)
1050  return FALSE;
1051  }
1052 
1053  return TRUE;
1054 }
1055 
1056 static int
1057 __kmp_taskq_has_any_children (kmpc_task_queue_t *queue)
1058 {
1059  return (queue->tq_first_child != NULL);
1060 }
1061 
1062 static void
1063 __kmp_remove_queue_from_tree( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_parallel )
1064 {
1065 #ifdef KMP_DEBUG
1066  kmp_int32 i;
1067  kmpc_thunk_t *thunk;
1068 #endif
1069 
1070  KF_TRACE(50, ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1071  KF_DUMP(50, __kmp_dump_task_queue( tq, queue, global_tid ));
1072 
1073  /* sub-queue in a recursion, not the root task queue */
1074  KMP_DEBUG_ASSERT (queue->tq.tq_parent != NULL);
1075 
1076  if (in_parallel) {
1077  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1078 
1079  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1080  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1081  }
1082 
1083  KMP_DEBUG_ASSERT (queue->tq_first_child == NULL);
1084 
1085  /* unlink queue from its siblings if any at this level */
1086  if (queue->tq_prev_child != NULL)
1087  queue->tq_prev_child->tq_next_child = queue->tq_next_child;
1088  if (queue->tq_next_child != NULL)
1089  queue->tq_next_child->tq_prev_child = queue->tq_prev_child;
1090  if (queue->tq.tq_parent->tq_first_child == queue)
1091  queue->tq.tq_parent->tq_first_child = queue->tq_next_child;
1092 
1093  queue->tq_prev_child = NULL;
1094  queue->tq_next_child = NULL;
1095 
1096  if (in_parallel) {
1097  kmp_uint32 spins;
1098 
1099  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n",
1100  __LINE__, global_tid, queue, queue->tq_ref_count));
1101 
1102  /* wait until all other threads have stopped accessing this queue */
1103  while (queue->tq_ref_count > 1) {
1104  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1105 
1106  KMP_WAIT_YIELD((volatile kmp_uint32*)&queue->tq_ref_count, 1, KMP_LE, NULL);
1107 
1108  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1109 
1110  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1111  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1112  }
1113 
1114  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1115  }
1116 
1117  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p freeing queue\n",
1118  __LINE__, global_tid, queue));
1119 
1120 #ifdef KMP_DEBUG
1121  KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED);
1122  KMP_DEBUG_ASSERT(queue->tq_nfull == 0);
1123 
1124  for (i=0; i<queue->tq_nproc; i++) {
1125  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1126  }
1127 
1128  i = 0;
1129  for (thunk=queue->tq_free_thunks; thunk != NULL; thunk=thunk->th.th_next_free)
1130  ++i;
1131 
1132  KMP_ASSERT (i == queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH));
1133 #endif
1134 
1135  /* release storage for queue entry */
1136  __kmp_free_taskq ( tq, queue, TRUE, global_tid );
1137 
1138  KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1139  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1140 }
1141 
1142 /*
1143  * Starting from indicated queue, proceed downward through tree and
1144  * remove all taskqs which are finished, but only go down to taskqs
1145  * which have the "nowait" clause present. Assume this is only called
1146  * when in_parallel=TRUE.
1147  */
1148 
1149 static void
1150 __kmp_find_and_remove_finished_child_taskq( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue )
1151 {
1152  kmpc_task_queue_t *queue = curr_queue;
1153 
1154  if (curr_queue->tq_first_child != NULL) {
1155  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1156 
1157  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1158  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1159 
1160  queue = (kmpc_task_queue_t *) curr_queue->tq_first_child;
1161  if (queue == NULL) {
1162  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1163  return;
1164  }
1165 
1166  while (queue != NULL) {
1167  kmpc_task_queue_t *next;
1168  int ct = ++(queue->tq_ref_count);
1169  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
1170  __LINE__, global_tid, queue, ct));
1171 
1172 
1173  /* although reference count stays active during descendant walk, */
1174  /* shouldn't matter since if children still exist, reference */
1175  /* counts aren't being monitored anyway */
1176 
1177  if (queue->tq_flags & TQF_IS_NOWAIT) {
1178  __kmp_find_and_remove_finished_child_taskq ( tq, global_tid, queue );
1179 
1180  if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) && (queue->tq_nfull == 0) &&
1181  __kmp_taskq_tasks_finished(queue) && ! __kmp_taskq_has_any_children(queue)) {
1182 
1183  /*
1184  Only remove this if we have not already marked it for deallocation.
1185  This should prevent multiple threads from trying to free this.
1186  */
1187 
1188  if ( __kmp_test_lock(& queue->tq_queue_lck, global_tid) ) {
1189  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
1190  queue->tq_flags |= TQF_DEALLOCATED;
1191  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1192 
1193  __kmp_remove_queue_from_tree( tq, global_tid, queue, TRUE );
1194 
1195  /* Can't do any more here since we can't be sure where the sibling queue is, so just exit this level */
1196  return;
1197  }
1198  else {
1199  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1200  }
1201  }
1202  /* otherwise, just fall through and decrement reference count */
1203  }
1204  }
1205 
1206  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1207 
1208  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1209  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1210 
1211  next = queue->tq_next_child;
1212 
1213  ct = --(queue->tq_ref_count);
1214  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1215  __LINE__, global_tid, queue, ct));
1216  KMP_DEBUG_ASSERT( ct >= 0 );
1217 
1218  queue = next;
1219  }
1220 
1221  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1222  }
1223 }
1224 
1225 /*
1226  * Starting from indicated queue, proceed downward through tree and
1227  * remove all taskqs, assuming all are finished and
1228  * assuming NO other threads are executing at this point.
1229  */
1230 
1231 static void
1232 __kmp_remove_all_child_taskq( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue )
1233 {
1234  kmpc_task_queue_t *next_child;
1235 
1236  queue = (kmpc_task_queue_t *) queue->tq_first_child;
1237 
1238  while (queue != NULL) {
1239  __kmp_remove_all_child_taskq ( tq, global_tid, queue );
1240 
1241  next_child = queue->tq_next_child;
1242  queue->tq_flags |= TQF_DEALLOCATED;
1243  __kmp_remove_queue_from_tree ( tq, global_tid, queue, FALSE );
1244  queue = next_child;
1245  }
1246 }
1247 
1248 static void
1249 __kmp_execute_task_from_queue( kmp_taskq_t *tq, ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, int in_parallel )
1250 {
1251  kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue;
1252  kmp_int32 tid = __kmp_tid_from_gtid( global_tid );
1253 
1254  KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid));
1255  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1256  KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid));
1257  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1258 
1259  /*
1260  * For the taskq task, the curr_thunk pushes and pop pairs are set up as follows:
1261  *
1262  * happens exactly once:
1263  * 1) __kmpc_taskq : push (if returning thunk only)
1264  * 4) __kmpc_end_taskq_task : pop
1265  *
1266  * optionally happens *each* time taskq task is dequeued/enqueued:
1267  * 2) __kmpc_taskq_task : pop
1268  * 3) __kmp_execute_task_from_queue : push
1269  *
1270  * execution ordering: 1,(2,3)*,4
1271  */
1272 
1273  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1274  kmp_int32 index = (queue == tq->tq_root) ? tid : 0;
1275  thunk->th.th_shareds = (kmpc_shared_vars_t *) queue->tq_shareds[index].ai_data;
1276 
1277  if ( __kmp_env_consistency_check ) {
1278  __kmp_push_workshare( global_tid,
1279  (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered : ct_task,
1280  queue->tq_loc );
1281  }
1282  }
1283  else {
1284  if ( __kmp_env_consistency_check )
1285  __kmp_push_workshare( global_tid, ct_taskq, queue->tq_loc );
1286  }
1287 
1288  if (in_parallel) {
1289  thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1290  tq->tq_curr_thunk[tid] = thunk;
1291 
1292  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1293  }
1294 
1295  KF_TRACE( 50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk, queue, global_tid));
1296  thunk->th_task (global_tid, thunk);
1297  KF_TRACE( 50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue, global_tid));
1298 
1299  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1300  if ( __kmp_env_consistency_check )
1301  __kmp_pop_workshare( global_tid, (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered : ct_task,
1302  queue->tq_loc );
1303 
1304  if (in_parallel) {
1305  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1306  thunk->th_encl_thunk = NULL;
1307  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1308  }
1309 
1310  if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) {
1311  __kmp_taskq_check_ordered(global_tid, thunk);
1312  }
1313 
1314  __kmp_free_thunk (queue, thunk, in_parallel, global_tid);
1315 
1316  KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n", global_tid, thunk));
1317  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1318 
1319  if (in_parallel) {
1320  KMP_MB(); /* needed so thunk put on free list before outstanding thunk count is decremented */
1321 
1322  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1);
1323 
1324  KF_TRACE( 200, ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n",
1325  global_tid, queue->tq_th_thunks[tid].ai_data-1, queue));
1326 
1327  queue->tq_th_thunks[tid].ai_data--;
1328 
1329  /* KMP_MB(); */ /* is MB really necessary ? */
1330  }
1331 
1332  if (queue->tq.tq_parent != NULL && in_parallel) {
1333  int ct;
1334  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1335  ct = --(queue->tq_ref_count);
1336  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1337  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
1338  __LINE__, global_tid, queue, ct));
1339  KMP_DEBUG_ASSERT( ct >= 0 );
1340  }
1341  }
1342 }
1343 
1344 /* --------------------------------------------------------------------------- */
1345 
1346 /* starts a taskq; creates and returns a thunk for the taskq_task */
1347 /* also, returns pointer to shared vars for this thread in "shareds" arg */
1348 
1349 kmpc_thunk_t *
1350 __kmpc_taskq( ident_t *loc, kmp_int32 global_tid, kmpc_task_t taskq_task,
1351  size_t sizeof_thunk, size_t sizeof_shareds,
1352  kmp_int32 flags, kmpc_shared_vars_t **shareds )
1353 {
1354  int in_parallel;
1355  kmp_int32 nslots, nthunks, nshareds, nproc;
1356  kmpc_task_queue_t *new_queue, *curr_queue;
1357  kmpc_thunk_t *new_taskq_thunk;
1358  kmp_info_t *th;
1359  kmp_team_t *team;
1360  kmp_taskq_t *tq;
1361  kmp_int32 tid;
1362 
1363  KE_TRACE( 10, ("__kmpc_taskq called (%d)\n", global_tid));
1364 
1365  th = __kmp_threads[ global_tid ];
1366  team = th -> th.th_team;
1367  tq = & team -> t.t_taskq;
1368  nproc = team -> t.t_nproc;
1369  tid = __kmp_tid_from_gtid( global_tid );
1370 
1371  /* find out whether this is a parallel taskq or serialized one. */
1372  in_parallel = in_parallel_context( team );
1373 
1374  if( ! tq->tq_root ) {
1375  if (in_parallel) {
1376  /* Vector ORDERED SECTION to taskq version */
1377  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1378 
1379  /* Vector ORDERED SECTION to taskq version */
1380  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1381  }
1382 
1383  if (in_parallel) {
1384  /* This shouldn't be a barrier region boundary; it will confuse the user. */
1385  /* The boundary needs to be at the end of the taskq instead. */
1386  if ( __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL )) {
1387  /* Creating the active root queue, and we are not the master thread. */
1388  /* The master thread below created the queue and tasks have been */
1389  /* enqueued, and the master thread released this barrier. This */
1390  /* worker thread can now proceed and execute tasks. See also the */
1391  /* TQF_RELEASE_WORKERS flag, which is used to handle this case. */
1392 
1393  *shareds = (kmpc_shared_vars_t *) tq->tq_root->tq_shareds[tid].ai_data;
1394 
1395  KE_TRACE( 10, ("__kmpc_taskq return (%d)\n", global_tid));
1396 
1397  return NULL;
1398  }
1399  }
1400 
1401  /* master thread only executes this code */
1402 
1403  if( tq->tq_curr_thunk_capacity < nproc ) {
1404  int i;
1405 
1406  if(tq->tq_curr_thunk)
1407  __kmp_free(tq->tq_curr_thunk);
1408  else {
1409  /* only need to do this once at outer level, i.e. when tq_curr_thunk is still NULL */
1410  __kmp_init_lock( & tq->tq_freelist_lck );
1411  }
1412 
1413  tq->tq_curr_thunk = (kmpc_thunk_t **) __kmp_allocate( nproc * sizeof(kmpc_thunk_t *) );
1414  tq -> tq_curr_thunk_capacity = nproc;
1415  }
1416 
1417  if (in_parallel)
1418  tq->tq_global_flags = TQF_RELEASE_WORKERS;
1419  }
1420 
1421  /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */
1422  /* on some heuristics (e.g., depth of queue nesting?). */
1423 
1424  nslots = (in_parallel) ? (2 * nproc) : 1;
1425 
1426  /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */
1427  /* jobs being executed by other threads, and one extra for taskq slot */
1428 
1429  nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1) : nslots + 2;
1430 
1431  /* Only the root taskq gets a per-thread array of shareds. */
1432  /* The rest of the taskq's only get one copy of the shared vars. */
1433 
1434  nshareds = ( !tq->tq_root && in_parallel) ? nproc : 1;
1435 
1436  /* create overall queue data structure and its components that require allocation */
1437 
1438  new_queue = __kmp_alloc_taskq ( tq, in_parallel, nslots, nthunks, nshareds, nproc,
1439  sizeof_thunk, sizeof_shareds, &new_taskq_thunk, global_tid );
1440 
1441  /* rest of new_queue initializations */
1442 
1443  new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS;
1444 
1445  if (in_parallel) {
1446  new_queue->tq_tasknum_queuing = 0;
1447  new_queue->tq_tasknum_serving = 0;
1448  new_queue->tq_flags |= TQF_PARALLEL_CONTEXT;
1449  }
1450 
1451  new_queue->tq_taskq_slot = NULL;
1452  new_queue->tq_nslots = nslots;
1453  new_queue->tq_hiwat = HIGH_WATER_MARK (nslots);
1454  new_queue->tq_nfull = 0;
1455  new_queue->tq_head = 0;
1456  new_queue->tq_tail = 0;
1457  new_queue->tq_loc = loc;
1458 
1459  if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
1460  /* prepare to serve the first-queued task's ORDERED directive */
1461  new_queue->tq_tasknum_serving = 1;
1462 
1463  /* Vector ORDERED SECTION to taskq version */
1464  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1465 
1466  /* Vector ORDERED SECTION to taskq version */
1467  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1468  }
1469 
1470  /* create a new thunk for the taskq_task in the new_queue */
1471  *shareds = (kmpc_shared_vars_t *) new_queue->tq_shareds[0].ai_data;
1472 
1473  new_taskq_thunk->th.th_shareds = *shareds;
1474  new_taskq_thunk->th_task = taskq_task;
1475  new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK;
1476  new_taskq_thunk->th_status = 0;
1477 
1478  KMP_DEBUG_ASSERT (new_taskq_thunk->th_flags & TQF_TASKQ_TASK);
1479 
1480  /* KMP_MB(); */ /* make sure these inits complete before threads start using this queue (necessary?) */
1481 
1482  /* insert the new task queue into the tree, but only after all fields initialized */
1483 
1484  if (in_parallel) {
1485  if( ! tq->tq_root ) {
1486  new_queue->tq.tq_parent = NULL;
1487  new_queue->tq_first_child = NULL;
1488  new_queue->tq_next_child = NULL;
1489  new_queue->tq_prev_child = NULL;
1490  new_queue->tq_ref_count = 1;
1491  tq->tq_root = new_queue;
1492  }
1493  else {
1494  curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
1495  new_queue->tq.tq_parent = curr_queue;
1496  new_queue->tq_first_child = NULL;
1497  new_queue->tq_prev_child = NULL;
1498  new_queue->tq_ref_count = 1; /* for the thread that built the queue */
1499 
1500  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n",
1501  __LINE__, global_tid, new_queue, new_queue->tq_ref_count));
1502 
1503  __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
1504 
1505  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1506  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1507 
1508  new_queue->tq_next_child = (struct kmpc_task_queue_t *) curr_queue->tq_first_child;
1509 
1510  if (curr_queue->tq_first_child != NULL)
1511  curr_queue->tq_first_child->tq_prev_child = new_queue;
1512 
1513  curr_queue->tq_first_child = new_queue;
1514 
1515  __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
1516  }
1517 
1518  /* set up thunk stack only after code that determines curr_queue above */
1519  new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1520  tq->tq_curr_thunk[tid] = new_taskq_thunk;
1521 
1522  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1523  }
1524  else {
1525  new_taskq_thunk->th_encl_thunk = 0;
1526  new_queue->tq.tq_parent = NULL;
1527  new_queue->tq_first_child = NULL;
1528  new_queue->tq_next_child = NULL;
1529  new_queue->tq_prev_child = NULL;
1530  new_queue->tq_ref_count = 1;
1531  }
1532 
1533 #ifdef KMP_DEBUG
1534  KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
1535  KF_DUMP(150, __kmp_dump_thunk( tq, new_taskq_thunk, global_tid ));
1536 
1537  if (in_parallel) {
1538  KF_TRACE(25, ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1539  } else {
1540  KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1541  }
1542 
1543  KF_DUMP(25, __kmp_dump_task_queue( tq, new_queue, global_tid ));
1544 
1545  if (in_parallel) {
1546  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1547  }
1548 #endif /* KMP_DEBUG */
1549 
1550  if ( __kmp_env_consistency_check )
1551  __kmp_push_workshare( global_tid, ct_taskq, new_queue->tq_loc );
1552 
1553  KE_TRACE( 10, ("__kmpc_taskq return (%d)\n", global_tid));
1554 
1555  return new_taskq_thunk;
1556 }
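
/*
 * Hedged sketch (illustrative only, not real compiler output) of the calling
 * sequence implied by the two public entry points in this file: every thread
 * of the team calls __kmpc_taskq; the single thread that gets a non-NULL
 * thunk back owns the taskq task, and all threads then call __kmpc_end_taskq,
 * where workers dequeue and execute tasks until the queue drains.  The task
 * body, its signature (inferred from the call site
 * "thunk->th_task (global_tid, thunk)" above), and the sizeof arguments are
 * assumptions; real compiler-generated code passes the sizes of its own
 * generated thunk and shared-variable structures, and the intervening calls
 * that actually run the taskq task and enqueue work are not shown.
 */
static void example_taskq_task( kmp_int32 global_tid, kmpc_thunk_t *thunk )
{
    /* compiler-generated body: would enqueue the individual tasks here */
}

static void example_taskq_region( ident_t *loc, kmp_int32 global_tid )
{
    kmpc_shared_vars_t *shareds;
    kmpc_thunk_t *taskq_thunk =
        __kmpc_taskq( loc, global_tid, example_taskq_task,
                      sizeof(kmpc_thunk_t),        /* assumed thunk size   */
                      sizeof(kmpc_shared_vars_t),  /* assumed shareds size */
                      0 /* flags */, &shareds );

    (void) shareds;  /* points at this thread's copy of the shared variables */

    /* taskq_thunk is NULL for the worker threads of the outermost taskq */
    __kmpc_end_taskq( loc, global_tid, taskq_thunk );
}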
1557 
1558 
1559 /* ends a taskq; last thread out destroys the queue */
1560 
1561 void
1562 __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk)
1563 {
1564 #ifdef KMP_DEBUG
1565  kmp_int32 i;
1566 #endif
1567  kmp_taskq_t *tq;
1568  int in_parallel;
1569  kmp_info_t *th;
1570  kmp_int32 is_outermost;
1571  kmpc_task_queue_t *queue;
1572  kmpc_thunk_t *thunk;
1573  int nproc;
1574 
1575  KE_TRACE( 10, ("__kmpc_end_taskq called (%d)\n", global_tid));
1576 
1577  tq = & __kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1578  nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
1579 
1580  /* For the outermost taskq only, all but one thread will have taskq_thunk == NULL */
1581  queue = (taskq_thunk == NULL) ? tq->tq_root : taskq_thunk->th.th_shareds->sv_queue;
1582 
1583  KE_TRACE( 50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid));
1584  is_outermost = (queue == tq->tq_root);
1585  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1586 
1587  if (in_parallel) {
1588  kmp_uint32 spins;
1589 
1590  /* this is just a safeguard to release the waiting threads if */
1591  /* the outermost taskq never queues a task */
1592 
1593  if (is_outermost && (KMP_MASTER_GTID( global_tid ))) {
1594  if( tq->tq_global_flags & TQF_RELEASE_WORKERS ) {
1595  /* no lock needed, workers are still in spin mode */
1596  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;
1597 
1598  __kmp_end_split_barrier( bs_plain_barrier, global_tid );
1599  }
1600  }
1601 
1602  /* keep dequeueing work until all tasks are queued and dequeued */
1603 
1604  do {
1605  /* wait until something is available to dequeue */
1606  KMP_INIT_YIELD(spins);
1607 
1608  while ( (queue->tq_nfull == 0)
1609  && (queue->tq_taskq_slot == NULL)
1610  && (! __kmp_taskq_has_any_children(queue) )
1611  && (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED) )
1612  ) {
1613  KMP_YIELD_WHEN( TRUE, spins );
1614  }
1615 
1616  /* check to see if we can execute tasks in the queue */
1617  while ( ( (queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL) )
1618  && (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL
1619  ) {
1620  KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk, queue, global_tid));
1621  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1622  }
1623 
1624  /* see if work can be found in a descendant queue */
1625  if ( (__kmp_taskq_has_any_children(queue))
1626  && (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != NULL
1627  ) {
1628 
1629  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1630  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid ));
1631 
1632  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1633  }
1634 
1635  } while ( (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED))
1636  || (queue->tq_nfull != 0)
1637  );
1638 
1639  KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue, global_tid));
1640 
1641  /* while not all tasks are finished and more work can be found
1642  in descendant queues, keep executing it */
1643 
1644  while ( (!__kmp_taskq_tasks_finished(queue))
1645  && (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != NULL
1646  ) {
1647 
1648  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1649  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1650 
1651  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1652  }
1653 
1654  KF_TRACE(50, ("No work found in descendent queues or all work finished in queue: %p (%d)\n", queue, global_tid));
1655 
1656  if (!is_outermost) {
1657  /* need to return if NOWAIT present and not outermost taskq */
1658 
1659  if (queue->tq_flags & TQF_IS_NOWAIT) {
1660  __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1661  queue->tq_ref_count--;
1662  KMP_DEBUG_ASSERT( queue->tq_ref_count >= 0 );
1663  __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
1664 
1665  KE_TRACE( 10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));
1666 
1667  return;
1668  }
1669 
1670  __kmp_find_and_remove_finished_child_taskq( tq, global_tid, queue );
1671 
1672  /* WAIT until all tasks are finished and no child queues exist before proceeding */
1673  KMP_INIT_YIELD(spins);
1674 
1675  while (!__kmp_taskq_tasks_finished(queue) || __kmp_taskq_has_any_children(queue)) {
1676  thunk = __kmp_find_task_in_ancestor_queue( tq, global_tid, queue );
1677 
1678  if (thunk != NULL) {
1679  KF_TRACE(50, ("Stole thunk: %p in ancestor queue: %p while waiting in queue: %p (%d)\n",
1680  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1681  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1682  }
1683 
1684  KMP_YIELD_WHEN( thunk == NULL, spins );
1685 
1686  __kmp_find_and_remove_finished_child_taskq( tq, global_tid, queue );
1687  }
1688 
1689  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1690  if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
1691  queue->tq_flags |= TQF_DEALLOCATED;
1692  }
1693  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1694 
1695  /* only the allocating thread can deallocate the queue */
1696  if (taskq_thunk != NULL) {
1697  __kmp_remove_queue_from_tree( tq, global_tid, queue, TRUE );
1698  }
1699 
1700  KE_TRACE( 10, ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n", global_tid));
1701 
1702  return;
1703  }
1704 
1705  /* Outermost Queue: steal work from descendants until all tasks are finished */
1706 
1707  KMP_INIT_YIELD(spins);
1708 
1709  while (!__kmp_taskq_tasks_finished(queue)) {
1710  thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
1711 
1712  if (thunk != NULL) {
1713  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
1714  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1715 
1716  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1717  }
1718 
1719  KMP_YIELD_WHEN( thunk == NULL, spins );
1720  }
1721 
1722  /* Need this barrier to prevent destruction of queue before threads have all executed above code */
1723  /* This may need to be done earlier when NOWAIT is implemented for the outermost level */
1724 
1725  if ( !__kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL )) {
1726  /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here; */
1727  /* for right now, everybody waits, and the master thread destroys the */
1728  /* remaining queues. */
1729 
1730  __kmp_remove_all_child_taskq( tq, global_tid, queue );
1731 
1732  /* Now destroy the root queue */
1733  KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n", global_tid, queue ));
1734  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1735 
1736 #ifdef KMP_DEBUG
1737  /* the root queue entry must have no parent and no siblings */
1738  KMP_DEBUG_ASSERT ((queue->tq.tq_parent == NULL) && (queue->tq_next_child == NULL));
1739 
1740  /* children must all be gone by now because of barrier above */
1741  KMP_DEBUG_ASSERT (queue->tq_first_child == NULL);
1742 
1743  for (i=0; i<nproc; i++) {
1744  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1745  }
1746 
1747  for (i=0, thunk=queue->tq_free_thunks; thunk != NULL; i++, thunk=thunk->th.th_next_free); /* count the free thunks; the empty loop body is intentional */
1748 
1749  KMP_DEBUG_ASSERT (i == queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));
1750 
1751  for (i = 0; i < nproc; i++) {
1752  KMP_DEBUG_ASSERT( ! tq->tq_curr_thunk[i] );
1753  }
1754 #endif
1755  /* unlink the root queue entry */
1756  tq -> tq_root = NULL;
1757 
1758  /* release storage for root queue entry */
1759  KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue, global_tid));
1760 
1761  queue->tq_flags |= TQF_DEALLOCATED;
1762  __kmp_free_taskq ( tq, queue, in_parallel, global_tid );
1763 
1764  KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
1765 
1766  /* release the workers now that the data structures are up to date */
1767  __kmp_end_split_barrier( bs_plain_barrier, global_tid );
1768  }
1769 
1770  th = __kmp_threads[ global_tid ];
1771 
1772  /* Reset ORDERED SECTION entry handling to the parallel version */
1773  th->th.th_dispatch->th_deo_fcn = 0;
1774 
1775  /* Reset ORDERED SECTION exit handling to the parallel version */
1776  th->th.th_dispatch->th_dxo_fcn = 0;
1777  }
1778  else {
1779  /* in serial execution context, dequeue the last task */
1780  /* and execute it, if there were any tasks encountered */
1781 
1782  if (queue->tq_nfull > 0) {
1783  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1784 
1785  thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1786 
1787  if (queue->tq_flags & TQF_IS_LAST_TASK) {
1788  /* TQF_IS_LASTPRIVATE, one thing in queue, __kmpc_end_taskq_task() */
1789  /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
1790  /* instrumentation does copy-out. */
1791 
1792  /* no need for test_then_or call since already locked */
1793  thunk->th_flags |= TQF_IS_LAST_TASK;
1794  }
1795 
1796  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, thunk, queue));
1797 
1798  __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
1799  }
1800 
1801  /* destroy the unattached serial queue now that there is no more work to do */
1802  KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n", queue, global_tid));
1803  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1804 
1805 #ifdef KMP_DEBUG
1806  i = 0;
1807  for (thunk=queue->tq_free_thunks; thunk != NULL; thunk=thunk->th.th_next_free)
1808  ++i;
1809  KMP_DEBUG_ASSERT (i == queue->tq_nslots + 1);
1810 #endif
1811  /* release storage for unattached serial queue */
1812  KF_TRACE(50, ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));
1813 
1814  queue->tq_flags |= TQF_DEALLOCATED;
1815  __kmp_free_taskq ( tq, queue, in_parallel, global_tid );
1816  }
1817 
1818  KE_TRACE( 10, ("__kmpc_end_taskq return (%d)\n", global_tid));
1819 }
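
/* Editor's note -- illustrative sketch, not compiled: the caller-side shape of
 * the shutdown protocol described above.  Every thread of the team reaches
 * __kmpc_end_taskq(); for the outermost taskq all but one thread pass NULL and
 * only the thread holding the taskq thunk passes it in.  The names "loc",
 * "gtid", "my_taskq_thunk" and "i_own_the_taskq_thunk" are assumed to come
 * from the compiler-generated context, not from this file.                   */
#if 0
    if (i_own_the_taskq_thunk)
        __kmpc_end_taskq( loc, gtid, my_taskq_thunk );  /* owner passes its thunk         */
    else
        __kmpc_end_taskq( loc, gtid, NULL );            /* other threads (outermost case) */
#endif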
1820 
1821 /* Enqueues a task for thunk previously created by __kmpc_task_buffer. */
1822 /* Returns nonzero if just filled up queue */
1823 
1824 kmp_int32
1825 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk)
1826 {
1827  kmp_int32 ret;
1828  kmpc_task_queue_t *queue;
1829  int in_parallel;
1830  kmp_taskq_t *tq;
1831 
1832  KE_TRACE( 10, ("__kmpc_task called (%d)\n", global_tid));
1833 
1834  KMP_DEBUG_ASSERT (!(thunk->th_flags & TQF_TASKQ_TASK)); /* thunk->th_task is a regular task */
1835 
1836  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1837  queue = thunk->th.th_shareds->sv_queue;
1838  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1839 
1840  if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
1841  thunk->th_tasknum = ++queue->tq_tasknum_queuing;
1842 
1843  /* For serial execution dequeue the preceding task and execute it, if one exists */
1844  /* This cannot be the last task. That one is handled in __kmpc_end_taskq */
1845 
1846  if (!in_parallel && queue->tq_nfull > 0) {
1847  kmpc_thunk_t *prev_thunk;
1848 
1849  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1850 
1851  prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1852 
1853  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, prev_thunk, queue));
1854 
1855  __kmp_execute_task_from_queue( tq, loc, global_tid, prev_thunk, in_parallel );
1856  }
1857 
1858  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private */
1859  /* variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the */
1860  /* task queue is not full and allocates a thunk (which is then passed to */
1861  /* __kmpc_task()). So, the enqueue below should never fail due to a full queue. */
1862 
1863  KF_TRACE(100, ("After enqueueing this Task on (%d):\n", global_tid));
1864  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1865 
1866  ret = __kmp_enqueue_task ( tq, global_tid, queue, thunk, in_parallel );
1867 
1868  KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
1869  KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
1870 
1871  KE_TRACE( 10, ("__kmpc_task return (%d)\n", global_tid));
1872 
1873  return ret;
1874 }
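
/* Editor's note -- illustrative sketch, not compiled: the instrumentation
 * sequence described in the comment above, roughly as it might appear in the
 * compiler-generated taskq_task body.  "loc", "gtid", "taskq_thunk", the
 * outlined routine "my_task" and the loop condition are assumed names from
 * that generated context, not definitions in this file.                      */
#if 0
    while (more_tasks_to_generate) {
        /* allocate a thunk for the next task */
        kmpc_thunk_t *t = __kmpc_task_buffer( loc, gtid, taskq_thunk, my_task );

        /* ... compiler-generated code initializes t's private variables ... */

        /* enqueue the task; nonzero return means this enqueue just filled the queue */
        (void) __kmpc_task( loc, gtid, t );
    }

    /* done generating tasks for this queue */
    __kmpc_end_taskq_task( loc, gtid, taskq_thunk );
#endif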
1875 
1876 /* enqueues a taskq_task for thunk previously created by __kmpc_taskq */
1877 /* this should never be called unless in a parallel context */
1878 
1879 void
1880 __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, kmp_int32 status)
1881 {
1882  kmpc_task_queue_t *queue;
1883  kmp_taskq_t *tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1884  int tid = __kmp_tid_from_gtid( global_tid );
1885 
1886  KE_TRACE( 10, ("__kmpc_taskq_task called (%d)\n", global_tid));
1887  KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
1888  KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
1889 
1890  queue = thunk->th.th_shareds->sv_queue;
1891 
1892  if ( __kmp_env_consistency_check )
1893  __kmp_pop_workshare( global_tid, ct_taskq, loc );
1894 
1895  /* thunk->th_task is the taskq_task */
1896  KMP_DEBUG_ASSERT (thunk->th_flags & TQF_TASKQ_TASK);
1897 
1898  /* not supposed to call __kmpc_taskq_task if it's already enqueued */
1899  KMP_DEBUG_ASSERT (queue->tq_taskq_slot == NULL);
1900 
1901  /* dequeue taskq thunk from curr_thunk stack */
1902  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1903  thunk->th_encl_thunk = NULL;
1904 
1905  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
1906 
1907  thunk->th_status = status;
1908 
1909  KMP_MB(); /* flush thunk->th_status before taskq_task enqueued to avoid race condition */
1910 
1911  /* enqueue taskq_task in thunk into special slot in queue */
1912  /* GEH - probably don't need to lock taskq slot since only one */
1913  /* thread enqueues & already a lock set at dequeue point */
1914 
1915  queue->tq_taskq_slot = thunk;
1916 
1917  KE_TRACE( 10, ("__kmpc_taskq_task return (%d)\n", global_tid));
1918 }
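
/* Editor's note -- illustrative sketch, not part of the runtime source: the
 * KMP_MB() above is a publication barrier -- th_status must be globally
 * visible before the thunk pointer is stored into tq_taskq_slot, otherwise a
 * consumer could dequeue the thunk and read a stale status.  The
 * self-contained C11 analogue below expresses the same ordering with
 * release/acquire atomics (all names here are illustration only).            */
#include <stdatomic.h>
#include <stddef.h>

typedef struct example_task { int status; } example_task_t;

static _Atomic(example_task_t *) example_slot = NULL;

/* producer: fill in the payload, then publish the pointer with release ordering */
static void
example_publish( example_task_t *t, int status )
{
    t->status = status;                                              /* plain store     */
    atomic_store_explicit( &example_slot, t, memory_order_release ); /* publish pointer */
}

/* consumer: an acquire load that returns non-NULL guarantees t->status is visible */
static example_task_t *
example_consume( void )
{
    return atomic_load_explicit( &example_slot, memory_order_acquire );
}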
1919 
1920 /* ends a taskq_task; done generating tasks */
1921 
1922 void
1923 __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk)
1924 {
1925  kmp_taskq_t *tq;
1926  kmpc_task_queue_t *queue;
1927  int in_parallel;
1928  int tid;
1929 
1930  KE_TRACE( 10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));
1931 
1932  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
1933  queue = thunk->th.th_shareds->sv_queue;
1934  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1935  tid = __kmp_tid_from_gtid( global_tid );
1936 
1937  if ( __kmp_env_consistency_check )
1938  __kmp_pop_workshare( global_tid, ct_taskq, loc );
1939 
1940  if (in_parallel) {
1941 #if KMP_ARCH_X86 || \
1942  KMP_ARCH_X86_64
1943 
1944  KMP_TEST_THEN_OR32( &queue->tq_flags, (kmp_int32) TQF_ALL_TASKS_QUEUED );
1945 #else
1946  {
1947  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1948 
1949  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1950  /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
1951 
1952  queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
1953 
1954  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1955  }
1956 #endif
1957  }
1958 
1959  if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
1960  /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in the */
1961  /* queue if TQF_IS_LASTPRIVATE so we can positively identify that last task */
1962  /* and run it with its TQF_IS_LAST_TASK bit turned on in th_flags. When */
1963  /* __kmpc_end_taskq_task() is called we are done generating all the tasks, so */
1964  /* we know the last one in the queue is the lastprivate task. Mark the queue */
1965  /* as having gotten to this state via tq_flags & TQF_IS_LAST_TASK; when that */
1966  /* task actually executes mark it via th_flags & TQF_IS_LAST_TASK (this th_flags */
1967  /* bit signals the instrumented code to do copy-outs after execution). */
1968 
1969  if (! in_parallel) {
1970  /* No synchronization needed for serial context */
1971  queue->tq_flags |= TQF_IS_LAST_TASK;
1972  }
1973  else {
1974 #if KMP_ARCH_X86 || \
1975  KMP_ARCH_X86_64
1976 
1977  KMP_TEST_THEN_OR32( &queue->tq_flags, (kmp_int32) TQF_IS_LAST_TASK );
1978 #else
1979  {
1980  __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
1981 
1982  KMP_MB(); /* make sure data structures are in consistent state before querying them */
1983  /* Seems to work without this call for digital/alpha, needed for IBM/RS6000 */
1984 
1985  queue->tq_flags |= TQF_IS_LAST_TASK;
1986 
1987  __kmp_release_lock(& queue->tq_queue_lck, global_tid);
1988  }
1989 #endif
1990  /* to prevent race condition where last task is dequeued but */
1991  /* flag isn't visible yet (not sure about this) */
1992  KMP_MB();
1993  }
1994  }
1995 
1996  /* dequeue taskq thunk from curr_thunk stack */
1997  if (in_parallel) {
1998  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1999  thunk->th_encl_thunk = NULL;
2000 
2001  KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
2002  }
2003 
2004  KE_TRACE( 10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
2005 }
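
/* Editor's note -- illustrative sketch, not part of the runtime source: on
 * x86/x86_64 the flag updates above use KMP_TEST_THEN_OR32 (an atomic OR);
 * on other architectures they fall back to a plain OR under tq_queue_lck.
 * A portable C11 analogue of the atomic path is a fetch-or on an atomic
 * flags word, as below (illustration-only names; the real TQF_* values and
 * queue layout live in the runtime headers).                                 */
#include <stdatomic.h>
#include <stdint.h>

#define EXAMPLE_ALL_TASKS_QUEUED  (1u << 0)    /* stand-in for TQF_ALL_TASKS_QUEUED */
#define EXAMPLE_IS_LAST_TASK      (1u << 1)    /* stand-in for TQF_IS_LAST_TASK     */

/* atomically set flag bits; returns the flag word as it was before the OR */
static uint32_t
example_set_flags( _Atomic uint32_t *flags, uint32_t bits )
{
    return atomic_fetch_or_explicit( flags, bits, memory_order_acq_rel );
}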
2006 
2007 /* returns thunk for a regular task based on taskq_thunk */
2008 /* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK) */
2009 
2010 kmpc_thunk_t *
2011 __kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk, kmpc_task_t task)
2012 {
2013  kmp_taskq_t *tq;
2014  kmpc_task_queue_t *queue;
2015  kmpc_thunk_t *new_thunk;
2016  int in_parallel;
2017 
2018  KE_TRACE( 10, ("__kmpc_task_buffer called (%d)\n", global_tid));
2019 
2020  KMP_DEBUG_ASSERT (taskq_thunk->th_flags & TQF_TASKQ_TASK); /* taskq_thunk->th_task is the taskq_task */
2021 
2022  tq = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
2023  queue = taskq_thunk->th.th_shareds->sv_queue;
2024  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
2025 
2026  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private */
2027  /* variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the */
2028  /* task queue is not full and allocates a thunk (which is then passed to */
2029  /* __kmpc_task()). So, we can pre-allocate a thunk here assuming it will be */
2030  /* the next to be enqueued in __kmpc_task(). */
2031 
2032  new_thunk = __kmp_alloc_thunk (queue, in_parallel, global_tid);
2033  new_thunk->th.th_shareds = (kmpc_shared_vars_t *) queue->tq_shareds[0].ai_data;
2034  new_thunk->th_encl_thunk = NULL;
2035  new_thunk->th_task = task;
2036 
2037  /* GEH - shouldn't need to lock the read of tq_flags here */
2038  new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS;
2039 
2040  new_thunk->th_status = 0;
2041 
2042  KMP_DEBUG_ASSERT (!(new_thunk->th_flags & TQF_TASKQ_TASK));
2043 
2044  KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
2045  KF_DUMP(100, __kmp_dump_thunk( tq, new_thunk, global_tid ));
2046 
2047  KE_TRACE( 10, ("__kmpc_task_buffer return (%d)\n", global_tid));
2048 
2049  return new_thunk;
2050 }
2051 
2052 /* --------------------------------------------------------------------------- */