Intel® OpenMP* Runtime Library
kmp_taskdeps.cpp
/*
 * kmp_taskdeps.cpp
 * $Revision: 42539 $
 * $Date: 2013-07-17 11:20:01 -0500 (Wed, 17 Jul 2013) $
 */

/* <copyright>
    Copyright (c) 1997-2013 Intel Corporation.  All Rights Reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

      * Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.
      * Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
      * Neither the name of Intel Corporation nor the names of its
        contributors may be used to endorse or promote products derived
        from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

</copyright> */

//#define KMP_SUPPORT_GRAPH_OUTPUT 1

#include "kmp.h"
#include "kmp_io.h"

#if OMP_40_ENABLED

//TODO: Improve memory allocation? keep a list of pre-allocated structures? allocate in blocks? re-use finished list entries?
//TODO: don't use atomic ref counters for stack-allocated nodes.
//TODO: find an alternative to atomic refs for heap-allocated nodes?
//TODO: Finish graph output support
//TODO: kmp_lock_t seems a tad too big (and heavyweight) for this. Check other runtime locks
//TODO: Any ITT support needed?

#ifdef KMP_SUPPORT_GRAPH_OUTPUT
static kmp_int32 kmp_node_id_seed = 0;
#endif

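// Dependence nodes (kmp_depnode_t) are reference counted: __kmp_init_node()
// creates the first reference, and every list that points at a node (a hash
// entry's last_out, a last_ins list, a successors list) takes an additional
// reference via __kmp_node_ref(). __kmp_node_deref() drops a reference and
// frees the node once the count reaches zero.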
static void
__kmp_init_node ( kmp_depnode_t *node )
{
    node->dn.task = NULL; // set to null initially, it will point to the right task once dependences have been processed
    node->dn.successors = NULL;
    __kmp_init_lock(&node->dn.lock);
    node->dn.nrefs = 1; // init creates the first reference to the node
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
    node->dn.id = KMP_TEST_THEN_INC32(&kmp_node_id_seed);
#endif
}

static inline kmp_depnode_t *
__kmp_node_ref ( kmp_depnode_t *node )
{
    KMP_TEST_THEN_INC32(&node->dn.nrefs);
    return node;
}

static inline void
__kmp_node_deref ( kmp_info_t *thread, kmp_depnode_t *node )
{
    if (!node) return;

    kmp_int32 n = KMP_TEST_THEN_DEC32(&node->dn.nrefs) - 1;
    if ( n == 0 ) {
        KMP_ASSERT(node->dn.nrefs == 0);
#if USE_FAST_MEMORY
        __kmp_fast_free(thread,node);
#else
        __kmp_thread_free(thread,node);
#endif
    }
}

#define KMP_ACQUIRE_DEPNODE(gtid,n) __kmp_acquire_lock(&(n)->dn.lock,(gtid))
#define KMP_RELEASE_DEPNODE(gtid,n) __kmp_release_lock(&(n)->dn.lock,(gtid))

static void
__kmp_depnode_list_free ( kmp_info_t *thread, kmp_depnode_list *list );

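// Each task that spawns dependent children owns a small hash table
// (kmp_dephash_t, stored in td_dephash) that maps a dependence base address to
// the last "out" node registered for it (last_out) and the list of "in" nodes
// seen since that output (last_ins). The table has a fixed number of buckets
// (kmp_dephash_size) with chaining on collisions.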
static const kmp_int32 kmp_dephash_log2 = 6;
static const kmp_int32 kmp_dephash_size = (1 << kmp_dephash_log2);

static inline kmp_int32
__kmp_dephash_hash ( kmp_intptr_t addr )
{
    //TODO alternate to try: set = (((Addr64)(addrUsefulBits * 9.618)) % m_num_sets );
    return ((addr >> kmp_dephash_log2) ^ addr) % kmp_dephash_size;
}

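// Allocate the hash table and its bucket array in a single block: the
// kmp_dephash_t header is followed immediately by the bucket pointers, so
// h->buckets points just past the header. Note that the size computation below
// reserves sizeof(kmp_dephash_entry_t) per bucket, which is more space than the
// array of pointers strictly needs.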
static kmp_dephash_t *
__kmp_dephash_create ( kmp_info_t *thread )
{
    kmp_dephash_t *h;

    kmp_int32 size = kmp_dephash_size * sizeof(kmp_dephash_entry_t) + sizeof(kmp_dephash_t);

#if USE_FAST_MEMORY
    h = (kmp_dephash_t *) __kmp_fast_allocate( thread, size );
#else
    h = (kmp_dephash_t *) __kmp_thread_malloc( thread, size );
#endif

#ifdef KMP_DEBUG
    h->nelements = 0;
#endif
    h->buckets = (kmp_dephash_entry **)(h+1);

    for ( kmp_int32 i = 0; i < kmp_dephash_size; i++ )
        h->buckets[i] = 0;

    return h;
}

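// Release a task's dependence hash: each entry frees its list of input nodes,
// drops its reference to the last output node, and is then returned to the
// allocator, followed by the table itself.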
static void
__kmp_dephash_free ( kmp_info_t *thread, kmp_dephash_t *h )
{
    for ( kmp_int32 i=0; i < kmp_dephash_size; i++ ) {
        if ( h->buckets[i] ) {
            kmp_dephash_entry_t *next;
            for ( kmp_dephash_entry_t *entry = h->buckets[i]; entry; entry = next ) {
                next = entry->next_in_bucket;
                __kmp_depnode_list_free(thread,entry->last_ins);
                __kmp_node_deref(thread,entry->last_out);
#if USE_FAST_MEMORY
                __kmp_fast_free(thread,entry);
#else
                __kmp_thread_free(thread,entry);
#endif
            }
        }
    }
#if USE_FAST_MEMORY
    __kmp_fast_free(thread,h);
#else
    __kmp_thread_free(thread,h);
#endif
}

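// Look up the entry for a dependence base address, creating it lazily on first
// use. The hash is only manipulated by the thread executing the parent task,
// so no locking is needed here.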
static kmp_dephash_entry *
__kmp_dephash_find ( kmp_info_t *thread, kmp_dephash_t *h, kmp_intptr_t addr )
{
    kmp_int32 bucket = __kmp_dephash_hash(addr);

    kmp_dephash_entry_t *entry;
    for ( entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket )
        if ( entry->addr == addr ) break;

    if ( entry == NULL ) {
        // create entry. This is only done by one thread so no locking required
#if USE_FAST_MEMORY
        entry = (kmp_dephash_entry_t *) __kmp_fast_allocate( thread, sizeof(kmp_dephash_entry_t) );
#else
        entry = (kmp_dephash_entry_t *) __kmp_thread_malloc( thread, sizeof(kmp_dephash_entry_t) );
#endif
        entry->addr = addr;
        entry->last_out = NULL;
        entry->last_ins = NULL;
        entry->next_in_bucket = h->buckets[bucket];
        h->buckets[bucket] = entry;
#ifdef KMP_DEBUG
        h->nelements++;
        if ( entry->next_in_bucket ) h->nconflicts++;
#endif
    }
    return entry;
}

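// Prepend a node to a dependence-node list, taking a new reference on the node
// so that it stays alive for as long as the list points at it.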
static kmp_depnode_list_t *
__kmp_add_node ( kmp_info_t *thread, kmp_depnode_list_t *list, kmp_depnode_t *node )
{
    kmp_depnode_list_t *new_head;

#if USE_FAST_MEMORY
    new_head = (kmp_depnode_list_t *) __kmp_fast_allocate(thread,sizeof(kmp_depnode_list_t));
#else
    new_head = (kmp_depnode_list_t *) __kmp_thread_malloc(thread,sizeof(kmp_depnode_list_t));
#endif

    new_head->node = __kmp_node_ref(node);
    new_head->next = list;

    return new_head;
}

static void
__kmp_depnode_list_free ( kmp_info_t *thread, kmp_depnode_list *list )
{
    kmp_depnode_list *next;

    for ( ; list ; list = next ) {
        next = list->next;

        __kmp_node_deref(thread,list->node);
#if USE_FAST_MEMORY
        __kmp_fast_free(thread,list);
#else
        __kmp_thread_free(thread,list);
#endif
    }
}

static inline void
__kmp_track_dependence ( kmp_depnode_t *source, kmp_depnode_t *sink )
{
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
    kmp_taskdata_t * task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
    kmp_taskdata_t * task_sink = KMP_TASK_TO_TASKDATA(sink->dn.task); // this can be NULL when if(0) ...

    __kmp_printf("%d(%s) -> %d(%s)\n", source->dn.id, task_source->td_ident->psource, sink->dn.id, task_sink->td_ident->psource);
#endif
}

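// Dependence processing core. For each depend item, the hash entry for its base
// address records the last "out" node (last_out) and the "in" nodes registered
// since then (last_ins). The new node is linked as a successor of:
//   - every node in last_ins (which is then cleared) when this item is an
//     out/inout dependence, or
//   - last_out otherwise (or when there are no pending ins),
// but only while the predecessor's dn.task is still non-NULL, i.e. the
// predecessor has not finished; that check is repeated under the predecessor's
// lock. The return value is the number of predecessors actually linked. The
// <filter> parameter makes the first pass skip entries whose base_addr was
// zeroed by the duplicate filtering in __kmp_check_deps(). The assert below
// suggests the compiler encodes every dependence with flags.in set, adding
// flags.out for out/inout items.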
template< bool filter >
static inline kmp_int32
__kmp_process_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash,
                     bool dep_barrier, kmp_int32 ndeps, kmp_depend_info_t *dep_list)
{
    kmp_info_t *thread = __kmp_threads[ gtid ];
    kmp_int32 npredecessors=0;
    for ( kmp_int32 i = 0; i < ndeps ; i++ ) {
        const kmp_depend_info_t * dep = &dep_list[i];

        KMP_DEBUG_ASSERT(dep->flags.in);

        if ( filter && dep->base_addr == 0 ) continue; // skip filtered entries

        kmp_dephash_entry_t *info = __kmp_dephash_find(thread,hash,dep->base_addr);
        kmp_depnode_t *last_out = info->last_out;

        if ( dep->flags.out && info->last_ins ) {
            for ( kmp_depnode_list_t * p = info->last_ins; p; p = p->next ) {
                kmp_depnode_t * indep = p->node;
                if ( indep->dn.task ) {
                    KMP_ACQUIRE_DEPNODE(gtid,indep);
                    if ( indep->dn.task ) {
                        __kmp_track_dependence(indep,node);
                        indep->dn.successors = __kmp_add_node(thread, indep->dn.successors, node);
                        npredecessors++;
                    }
                    KMP_RELEASE_DEPNODE(gtid,indep);
                }
            }

            __kmp_depnode_list_free(thread,info->last_ins);
            info->last_ins = NULL;

        } else if ( last_out && last_out->dn.task ) {
            KMP_ACQUIRE_DEPNODE(gtid,last_out);
            if ( last_out->dn.task ) {
                __kmp_track_dependence(last_out,node);
                last_out->dn.successors = __kmp_add_node(thread, last_out->dn.successors, node);
                npredecessors++;
            }
            KMP_RELEASE_DEPNODE(gtid,last_out);
        }

        if ( dep_barrier ) {
            // if this is a sync point in the serial sequence, the previous outputs are guaranteed
            // to be completed once this point is passed, so the previous output nodes can be cleared.
            __kmp_node_deref(thread,last_out);
            info->last_out = NULL;
        } else {
            if ( dep->flags.out ) {
                __kmp_node_deref(thread,last_out);
                info->last_out = __kmp_node_ref(node);
            } else
                info->last_ins = __kmp_add_node(thread, info->last_ins, node);
        }

    }
    return npredecessors;
}

#define NO_DEP_BARRIER (false)
#define DEP_BARRIER (true)

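// Register <node> (the node for <task>) against the given dephash. Duplicate
// base addresses inside dep_list are first merged into a single entry (the
// duplicates get base_addr = 0 and are skipped by the filtered pass); the
// noalias list is processed without filtering since, presumably, its entries are
// already guaranteed not to repeat. A fake predecessor keeps the count above
// zero while both lists are processed so that a completing predecessor cannot
// queue the task too early.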
// returns true if the task has any outstanding dependence
static bool
__kmp_check_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_task_t *task, kmp_dephash_t *hash, bool dep_barrier,
                   kmp_int32 ndeps, kmp_depend_info_t *dep_list,
                   kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list )
{
    int i;

    // Filter deps in dep_list
    // TODO: Different algorithm for large dep_list ( > 10 ? )
    for ( i = 0; i < ndeps; i ++ ) {
        if ( dep_list[i].base_addr != 0 )
            for ( int j = i+1; j < ndeps; j++ )
                if ( dep_list[i].base_addr == dep_list[j].base_addr ) {
                    dep_list[i].flags.in |= dep_list[j].flags.in;
                    dep_list[i].flags.out |= dep_list[j].flags.out;
                    dep_list[j].base_addr = 0; // Mark j element as void
                }
    }

    // doesn't need to be atomic as no other thread is going to be accessing this node just yet
    // npredecessors is set to 1 to ensure that none of the releasing tasks queues this task before we have finished processing all the dependences
    node->dn.npredecessors = 1;

    // used to pack all npredecessors additions into a single atomic operation at the end
    int npredecessors;

    npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, ndeps, dep_list);
    npredecessors += __kmp_process_deps<false>(gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list);

    KMP_TEST_THEN_ADD32(&node->dn.npredecessors, npredecessors);

    // Remove the fake predecessor and find out if there's any outstanding dependence (some tasks may have finished while we processed the dependences)
    node->dn.task = task;
    KMP_MB();
    npredecessors = KMP_TEST_THEN_DEC32(&node->dn.npredecessors) - 1;

    // beyond this point the task could be queued (and executed) by a releasing task...
    return npredecessors > 0 ? true : false;
}

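// Called when a task with dependences finishes. The task's dephash (which
// tracked its children's dependences) is freed, its own node is marked as
// finished by clearing dn.task under the node lock, and every successor has its
// predecessor count decremented; successors that reach zero are handed to
// __kmpc_omp_task() for scheduling.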
void
__kmp_release_deps ( kmp_int32 gtid, kmp_taskdata_t *task )
{
    kmp_info_t *thread = __kmp_threads[ gtid ];
    kmp_depnode_t *node = task->td_depnode;

    if ( task->td_dephash )
        __kmp_dephash_free(thread,task->td_dephash);

    if ( !node ) return;

    KMP_ACQUIRE_DEPNODE(gtid,node);
    node->dn.task = NULL; // mark this task as finished, so no new dependencies are generated
    KMP_RELEASE_DEPNODE(gtid,node);

    kmp_depnode_list_t *next;
    for ( kmp_depnode_list_t *p = node->dn.successors; p; p = next ) {
        kmp_depnode_t *successor = p->node;
        kmp_int32 npredecessors = KMP_TEST_THEN_DEC32(&successor->dn.npredecessors) - 1;

        // successor task can be NULL for wait_depends or because deps are still being processed
        if ( npredecessors == 0 ) {
            KMP_MB();
            if ( successor->dn.task )
                // loc_ref was already stored in successor's task_data
                __kmpc_omp_task(NULL,gtid,successor->dn.task);
        }

        next = p->next;
        __kmp_node_deref(thread,p->node);
#if USE_FAST_MEMORY
        __kmp_fast_free(thread,p);
#else
        __kmp_thread_free(thread,p);
#endif
    }

    __kmp_node_deref(thread,node);
}

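// Entry point for tasks with depend clauses. In a non-serialized, non-final
// context with at least one depend item, a heap-allocated depnode is attached
// to the new task and checked against the parent's dephash; if any predecessor
// is still outstanding, the task is left deferred and TASK_CURRENT_NOT_QUEUED
// is returned. Otherwise the call falls through to the plain __kmpc_omp_task()
// path.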
kmp_int32
__kmpc_omp_task_with_deps( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
                           kmp_int32 ndeps, kmp_depend_info_t *dep_list,
                           kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list )
{
    kmp_info_t *thread = __kmp_threads[ gtid ];
    kmp_taskdata_t * current_task = thread->th.th_current_task;

    bool serial = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final;

    if ( !serial && ( ndeps > 0 || ndeps_noalias > 0 )) {
        /* if no dependencies have been tracked yet, create the dependence hash */
        if ( current_task->td_dephash == NULL )
            current_task->td_dephash = __kmp_dephash_create(thread);

#if USE_FAST_MEMORY
        kmp_depnode_t *node = (kmp_depnode_t *) __kmp_fast_allocate(thread,sizeof(kmp_depnode_t));
#else
        kmp_depnode_t *node = (kmp_depnode_t *) __kmp_thread_malloc(thread,sizeof(kmp_depnode_t));
#endif

        __kmp_init_node(node);
        KMP_TASK_TO_TASKDATA(new_task)->td_depnode = node;

        if ( __kmp_check_deps( gtid, node, new_task, current_task->td_dephash, NO_DEP_BARRIER,
                               ndeps, dep_list, ndeps_noalias,noalias_dep_list ) )
            return TASK_CURRENT_NOT_QUEUED;
    }

    return __kmpc_omp_task(loc_ref,gtid,new_task);
}

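// Entry point used to wait on a set of dependences without creating a task
// (the NULL-task / wait_depends case mentioned in __kmp_release_deps). A
// stack-allocated depnode is registered against the current dephash with
// DEP_BARRIER set, and the thread then executes other tasks until all of the
// node's predecessors have completed.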
void
__kmpc_omp_wait_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
                       kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list )
{
    if ( ndeps == 0 && ndeps_noalias == 0 ) return;

    kmp_info_t *thread = __kmp_threads[ gtid ];
    kmp_taskdata_t * current_task = thread->th.th_current_task;

    // dependences are not computed in serial teams
    if ( current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final)
        return;

    // if the dephash is not yet created it means we have nothing to wait for
    if ( current_task->td_dephash == NULL ) return;

    kmp_depnode_t node;
    __kmp_init_node(&node);

    if (!__kmp_check_deps( gtid, &node, NULL, current_task->td_dephash, DEP_BARRIER,
                           ndeps, dep_list, ndeps_noalias, noalias_dep_list ))
        return;

    int thread_finished = FALSE;
    while ( node.dn.npredecessors > 0 ) {
        __kmp_execute_tasks( thread, gtid, (volatile kmp_uint32 *)&(node.dn.npredecessors),
                             0, FALSE, &thread_finished,
#if USE_ITT_BUILD
                             NULL,
#endif
                             __kmp_task_stealing_constraint );
    }

}

#endif /* OMP_40_ENABLED */