Intel® OpenMP* Runtime Library
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 /* <copyright>
6  Copyright (c) 1997-2015 Intel Corporation. All Rights Reserved.
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11 
12  * Redistributions of source code must retain the above copyright
13  notice, this list of conditions and the following disclaimer.
14  * Redistributions in binary form must reproduce the above copyright
15  notice, this list of conditions and the following disclaimer in the
16  documentation and/or other materials provided with the distribution.
17  * Neither the name of Intel Corporation nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 </copyright> */
34 
35 #include "kmp.h"
36 #include "kmp_i18n.h"
37 #include "kmp_io.h"
38 #include "kmp_str.h"
39 #include "kmp_wrapper_getpid.h"
40 
41 #if KMP_AFFINITY_SUPPORTED
42 
43 //
44 // Print the affinity mask to the character array in a pretty format.
45 //
46 char *
47 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
48 {
49  KMP_ASSERT(buf_len >= 40);
50  char *scan = buf;
51  char *end = buf + buf_len - 1;
52 
53  //
54  // Find first element / check for empty set.
55  //
56  size_t i;
57  for (i = 0; i < KMP_CPU_SETSIZE; i++) {
58  if (KMP_CPU_ISSET(i, mask)) {
59  break;
60  }
61  }
62  if (i == KMP_CPU_SETSIZE) {
63  KMP_SNPRINTF(scan, buf_len, "{<empty>}");
64  while (*scan != '\0') scan++;
65  KMP_ASSERT(scan <= end);
66  return buf;
67  }
68 
69  KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
70  while (*scan != '\0') scan++;
71  i++;
72  for (; i < KMP_CPU_SETSIZE; i++) {
73  if (! KMP_CPU_ISSET(i, mask)) {
74  continue;
75  }
76 
77  //
78  // Check for buffer overflow. A string of the form ",<n>" will have
79  // at most 10 characters, plus we want to leave room to print ",...}"
80 // if the set is too large to print, for a total of 15 characters.
81  // We already left room for '\0' in setting end.
82  //
83  if (end - scan < 15) {
84  break;
85  }
86  KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i);
87  while (*scan != '\0') scan++;
88  }
89  if (i < KMP_CPU_SETSIZE) {
90  KMP_SNPRINTF(scan, buf_len, ",...");
91  while (*scan != '\0') scan++;
92  }
93  KMP_SNPRINTF(scan, buf_len, "}");
94  while (*scan != '\0') scan++;
95  KMP_ASSERT(scan <= end);
96  return buf;
97 }
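//
// Editorial sketch (not part of the library): the same pretty-printing
// technique in a self-contained form, using a plain byte array in place of
// kmp_affin_mask_t and snprintf in place of KMP_SNPRINTF. The example_*
// helper below is illustrative only. The key idea is to advance a scan
// pointer past each formatted item and to stop early, appending ",...",
// once fewer than 15 bytes remain, so ",<n>" plus the closing ",...}"
// always fit. A buffer of at least 40 bytes is assumed, matching the
// KMP_ASSERT above.
//
#include <stdio.h>

static const char *
example_print_set(const unsigned char *set, size_t n, char *buf, size_t buf_len)
{
    char *scan = buf;
    char *end = buf + buf_len - 1;                   // reserve room for '\0'
    size_t i;
    for (i = 0; i < n; i++)                          // find first element
        if (set[i]) break;
    if (i == n) { snprintf(buf, buf_len, "{<empty>}"); return buf; }
    scan += snprintf(scan, end - scan + 1, "{%u", (unsigned)i);
    for (i++; i < n; i++) {
        if (!set[i]) continue;
        if (end - scan < 15) break;                  // keep room for ",<n>" + ",...}"
        scan += snprintf(scan, end - scan + 1, ",%u", (unsigned)i);
    }
    if (i < n) scan += snprintf(scan, end - scan + 1, ",...");
    snprintf(scan, end - scan + 1, "}");
    return buf;
}
// With set = {1,1,0,1}, n = 4 and a 64-byte buffer this produces "{0,1,3}".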
98 
99 
100 void
101 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
102 {
103  KMP_CPU_ZERO(mask);
104 
105 # if KMP_GROUP_AFFINITY
106 
107  if (__kmp_num_proc_groups > 1) {
108  int group;
109  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
110  for (group = 0; group < __kmp_num_proc_groups; group++) {
111  int i;
112  int num = __kmp_GetActiveProcessorCount(group);
113  for (i = 0; i < num; i++) {
114  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
115  }
116  }
117  }
118  else
119 
120 # endif /* KMP_GROUP_AFFINITY */
121 
122  {
123  int proc;
124  for (proc = 0; proc < __kmp_xproc; proc++) {
125  KMP_CPU_SET(proc, mask);
126  }
127  }
128 }
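//
// Editorial sketch (assumption, not library code): with Windows* OS processor
// groups, each group owns a fixed block of CHAR_BIT * sizeof(DWORD_PTR) bit
// positions in the mask (64 on a 64-bit OS), so the loop above maps a
// (group, proc-within-group) pair to a flat bit index. The example_* helper
// below is illustrative only.
//
static int
example_flat_proc_index(int group, int proc_within_group,
                        int bits_per_group /* 64 on a 64-bit OS */)
{
    return proc_within_group + group * bits_per_group;
}
// example_flat_proc_index(1, 3, 64) == 67, i.e. proc 3 of group 1 sets bit 67.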
129 
130 
131 //
132 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
133 // functions.
134 //
135 // The icc codegen emits sections with extremely long names, of the form
136 // ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
137 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
138 // some sort of memory corruption or table overflow that is triggered by
139 // these long strings. I checked the latest version of the linker -
140 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
141 // fixed.
142 //
143 // Unfortunately, my attempts to reproduce it in a smaller example have
144 // failed - I'm not sure what the prospects are of getting it fixed
145 // properly - but we need a reproducer smaller than all of libiomp.
146 //
147 // Work around the problem by avoiding inline constructors in such builds.
148 // We do this for all platforms, not just Linux* OS - non-inline functions are
149 // more debuggable and provide better coverage than inline functions.
150 // Use inline functions in shipping libs, for performance.
151 //
152 
153 # if !defined(KMP_DEBUG) && !defined(COVER)
154 
155 class Address {
156 public:
157  static const unsigned maxDepth = 32;
158  unsigned labels[maxDepth];
159  unsigned childNums[maxDepth];
160  unsigned depth;
161  unsigned leader;
162  Address(unsigned _depth)
163  : depth(_depth), leader(FALSE) {
164  }
165  Address &operator=(const Address &b) {
166  depth = b.depth;
167  for (unsigned i = 0; i < depth; i++) {
168  labels[i] = b.labels[i];
169  childNums[i] = b.childNums[i];
170  }
171  leader = FALSE;
172  return *this;
173  }
174  bool operator==(const Address &b) const {
175  if (depth != b.depth)
176  return false;
177  for (unsigned i = 0; i < depth; i++)
178  if(labels[i] != b.labels[i])
179  return false;
180  return true;
181  }
182  bool isClose(const Address &b, int level) const {
183  if (depth != b.depth)
184  return false;
185  if ((unsigned)level >= depth)
186  return true;
187  for (unsigned i = 0; i < (depth - level); i++)
188  if(labels[i] != b.labels[i])
189  return false;
190  return true;
191  }
192  bool operator!=(const Address &b) const {
193  return !operator==(b);
194  }
195 };
196 
197 class AddrUnsPair {
198 public:
199  Address first;
200  unsigned second;
201  AddrUnsPair(Address _first, unsigned _second)
202  : first(_first), second(_second) {
203  }
204  AddrUnsPair &operator=(const AddrUnsPair &b)
205  {
206  first = b.first;
207  second = b.second;
208  return *this;
209  }
210 };
211 
212 # else
213 
214 class Address {
215 public:
216  static const unsigned maxDepth = 32;
217  unsigned labels[maxDepth];
218  unsigned childNums[maxDepth];
219  unsigned depth;
220  unsigned leader;
221  Address(unsigned _depth);
222  Address &operator=(const Address &b);
223  bool operator==(const Address &b) const;
224  bool isClose(const Address &b, int level) const;
225  bool operator!=(const Address &b) const;
226 };
227 
228 Address::Address(unsigned _depth)
229 {
230  depth = _depth;
231  leader = FALSE;
232 }
233 
234 Address &Address::operator=(const Address &b) {
235  depth = b.depth;
236  for (unsigned i = 0; i < depth; i++) {
237  labels[i] = b.labels[i];
238  childNums[i] = b.childNums[i];
239  }
240  leader = FALSE;
241  return *this;
242 }
243 
244 bool Address::operator==(const Address &b) const {
245  if (depth != b.depth)
246  return false;
247  for (unsigned i = 0; i < depth; i++)
248  if(labels[i] != b.labels[i])
249  return false;
250  return true;
251 }
252 
253 bool Address::isClose(const Address &b, int level) const {
254  if (depth != b.depth)
255  return false;
256  if ((unsigned)level >= depth)
257  return true;
258  for (unsigned i = 0; i < (depth - level); i++)
259  if(labels[i] != b.labels[i])
260  return false;
261  return true;
262 }
263 
264 bool Address::operator!=(const Address &b) const {
265  return !operator==(b);
266 }
267 
268 class AddrUnsPair {
269 public:
270  Address first;
271  unsigned second;
272  AddrUnsPair(Address _first, unsigned _second);
273  AddrUnsPair &operator=(const AddrUnsPair &b);
274 };
275 
276 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
277  : first(_first), second(_second)
278 {
279 }
280 
281 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
282 {
283  first = b.first;
284  second = b.second;
285  return *this;
286 }
287 
288 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
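//
// Editorial sketch (not part of the library): an Address is a topology
// coordinate, with labels[0] the coarsest level (package) and
// labels[depth-1] the finest (thread). isClose(b, level) reports whether
// two coordinates still share a common ancestor once the bottom 'level'
// levels are ignored. The example_* helper below is illustrative only.
//
static bool
example_addresses_are_close()
{
    Address a(3), b(3);                     // depth 3: package / core / thread
    a.labels[0] = 0; a.labels[1] = 2; a.labels[2] = 0;
    b.labels[0] = 0; b.labels[1] = 2; b.labels[2] = 1;
    return a.isClose(b, 1);                 // true: same package and core;
                                            // they differ only at the thread level
}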
289 
290 
291 static int
292 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
293 {
294  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
295  ->first);
296  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
297  ->first);
298  unsigned depth = aa->depth;
299  unsigned i;
300  KMP_DEBUG_ASSERT(depth == bb->depth);
301  for (i = 0; i < depth; i++) {
302  if (aa->labels[i] < bb->labels[i]) return -1;
303  if (aa->labels[i] > bb->labels[i]) return 1;
304  }
305  return 0;
306 }
307 
308 
309 static int
310 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
311 {
312  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
313  ->first);
314  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
315  ->first);
316  unsigned depth = aa->depth;
317  unsigned i;
318  KMP_DEBUG_ASSERT(depth == bb->depth);
319  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
320  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
321  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
322  int j = depth - i - 1;
323  if (aa->childNums[j] < bb->childNums[j]) return -1;
324  if (aa->childNums[j] > bb->childNums[j]) return 1;
325  }
326  for (; i < depth; i++) {
327  int j = i - __kmp_affinity_compact;
328  if (aa->childNums[j] < bb->childNums[j]) return -1;
329  if (aa->childNums[j] > bb->childNums[j]) return 1;
330  }
331  return 0;
332 }
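//
// Editorial sketch (assumption, not library code): the comparator above
// permutes the comparison keys according to __kmp_affinity_compact. With
// depth D and compact value C, the C finest levels are compared first
// (finest level most significant), then the remaining coarser levels in
// their natural order. The example_* helper below only computes that key
// order for illustration.
//
static void
example_compact_key_order(int depth, int compact, int *order)
{
    int i, k = 0;
    for (i = 0; i < compact; i++)
        order[k++] = depth - i - 1;          // finest 'compact' levels first
    for (i = compact; i < depth; i++)
        order[k++] = i - compact;            // then the coarser levels in order
}
// For depth 3 (package/core/thread):
//   compact=0 -> {0, 1, 2}   compare package, then core, then thread
//   compact=1 -> {2, 0, 1}   compare thread first, then package, then core
//   compact=3 -> {2, 1, 0}   compare thread, then core, then package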
333 
335 class hierarchy_info {
336 public:
339  kmp_uint32 maxLevels;
340 
344  kmp_uint32 depth;
345  kmp_uint32 base_num_threads;
346  volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress
347  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
348 
352  kmp_uint32 *numPerLevel;
353  kmp_uint32 *skipPerLevel;
354 
355  void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
356  int hier_depth = adr2os[0].first.depth;
357  int level = 0;
358  for (int i=hier_depth-1; i>=0; --i) {
359  int max = -1;
360  for (int j=0; j<num_addrs; ++j) {
361  int next = adr2os[j].first.childNums[i];
362  if (next > max) max = next;
363  }
364  numPerLevel[level] = max+1;
365  ++level;
366  }
367  }
368 
369  hierarchy_info() : maxLevels(7), depth(1), uninitialized(1), resizing(0) {}
370 
371  // TO FIX: This destructor causes a segfault in the library at shutdown.
372  //~hierarchy_info() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }
373 
374  void init(AddrUnsPair *adr2os, int num_addrs)
375  {
376  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
377  if (bool_result == 0) { // Wait for initialization
378  while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
379  return;
380  }
381  KMP_DEBUG_ASSERT(bool_result==1);
382 
383  /* Added explicit initialization of the data fields here to prevent usage of dirty value
384  observed when static library is re-initialized multiple times (e.g. when
385  non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
386  depth = 1;
387  resizing = 0;
388  maxLevels = 7;
389  numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
390  skipPerLevel = &(numPerLevel[maxLevels]);
391  for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
392  numPerLevel[i] = 1;
393  skipPerLevel[i] = 1;
394  }
395 
396  // Sort table by physical ID
397  if (adr2os) {
398  qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
399  deriveLevels(adr2os, num_addrs);
400  }
401  else {
402  numPerLevel[0] = 4;
403  numPerLevel[1] = num_addrs/4;
404  if (num_addrs%4) numPerLevel[1]++;
405  }
406 
407  base_num_threads = num_addrs;
408  for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
409  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
410  depth++;
411 
412  kmp_uint32 branch = 4;
413  if (numPerLevel[0] == 1) branch = num_addrs/4;
414  if (branch<4) branch=4;
415  for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
416  while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
417  if (numPerLevel[d] & 1) numPerLevel[d]++;
418  numPerLevel[d] = numPerLevel[d] >> 1;
419  if (numPerLevel[d+1] == 1) depth++;
420  numPerLevel[d+1] = numPerLevel[d+1] << 1;
421  }
422  if(numPerLevel[0] == 1) {
423  branch = branch >> 1;
424  if (branch<4) branch = 4;
425  }
426  }
427 
428  for (kmp_uint32 i=1; i<depth; ++i)
429  skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
430  // Fill in hierarchy in the case of oversubscription
431  for (kmp_uint32 i=depth; i<maxLevels; ++i)
432  skipPerLevel[i] = 2*skipPerLevel[i-1];
433 
434  uninitialized = 0; // One writer
435 
436  }
437 
438  void resize(kmp_uint32 nproc)
439  {
440  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
441  if (bool_result == 0) { // Someone else is resizing
442  while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
443  return;
444  }
445  KMP_DEBUG_ASSERT(bool_result!=0);
446  KMP_DEBUG_ASSERT(nproc > base_num_threads);
447 
448  // Calculate new max_levels
449  kmp_uint32 old_sz = skipPerLevel[depth-1];
450  kmp_uint32 incs = 0, old_maxLevels= maxLevels;
451  while (nproc > old_sz) {
452  old_sz *=2;
453  incs++;
454  }
455  maxLevels += incs;
456 
457  // Resize arrays
458  kmp_uint32 *old_numPerLevel = numPerLevel;
459  kmp_uint32 *old_skipPerLevel = skipPerLevel;
460  numPerLevel = skipPerLevel = NULL;
461  numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
462  skipPerLevel = &(numPerLevel[maxLevels]);
463 
464  // Copy old elements from old arrays
465  for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
466  numPerLevel[i] = old_numPerLevel[i];
467  skipPerLevel[i] = old_skipPerLevel[i];
468  }
469 
470  // Init new elements in arrays to 1
471  for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
472  numPerLevel[i] = 1;
473  skipPerLevel[i] = 1;
474  }
475 
476  // Free old arrays
477  __kmp_free(old_numPerLevel);
478 
479  // Fill in oversubscription levels of hierarchy
480  for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
481  skipPerLevel[i] = 2*skipPerLevel[i-1];
482 
483  base_num_threads = nproc;
484  resizing = 0; // One writer
485 
486  }
487 };
488 
489 static hierarchy_info machine_hierarchy;
490 
491 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
492  kmp_uint32 depth;
493  // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
494  if (TCR_1(machine_hierarchy.uninitialized))
495  machine_hierarchy.init(NULL, nproc);
496  // Adjust the hierarchy in case num threads exceeds original
497  if (nproc > machine_hierarchy.base_num_threads)
498  machine_hierarchy.resize(nproc);
499 
500  depth = machine_hierarchy.depth;
501  KMP_DEBUG_ASSERT(depth > 0);
502  // The loop below adjusts the depth in the case of a resize
503  while (nproc > machine_hierarchy.skipPerLevel[depth-1])
504  depth++;
505 
506  thr_bar->depth = depth;
507  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
508  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
509 }
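//
// Editorial sketch (assumption, not library code): numPerLevel[i] is the
// fan-out at level i, counted from the leaves up, and skipPerLevel[i] is the
// number of leaves spanned by one subtree rooted at level i, i.e. the running
// product of the fan-outs below it, as computed in init() above. The
// example_* helper below is illustrative only.
//
static void
example_skip_per_level(const kmp_uint32 *numPerLevel, kmp_uint32 *skipPerLevel,
                       kmp_uint32 depth)
{
    skipPerLevel[0] = 1;
    for (kmp_uint32 i = 1; i < depth; ++i)
        skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
}
// For 2 threads/core, 4 cores/package, 2 packages:
//   numPerLevel  = {2, 4, 2}
//   skipPerLevel = {1, 2, 8}
// so a core subtree spans 2 leaves and a package subtree spans 8, which is
// what __kmp_get_hierarchy() compares nproc against when choosing a depth.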
510 
511 //
512 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
513 // called to renumber the labels from [0..n] and place them into the child_num
514 // vector of the address object. This is done in case the labels used for
515 // the children at one node of the hierarchy differ from those used for
516 // another node at the same level. Example: suppose the machine has 2 nodes
517 // with 2 packages each. The first node contains packages 601 and 602, and
518 // the second node contains packages 603 and 604. If we try to sort the table
519 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
520 // because we are paying attention to the labels themselves, not the ordinal
521 // child numbers. By using the child numbers in the sort, the result is
522 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
523 //
524 static void
525 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
526  int numAddrs)
527 {
528  KMP_DEBUG_ASSERT(numAddrs > 0);
529  int depth = address2os->first.depth;
530  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
531  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
532  * sizeof(unsigned));
533  int labCt;
534  for (labCt = 0; labCt < depth; labCt++) {
535  address2os[0].first.childNums[labCt] = counts[labCt] = 0;
536  lastLabel[labCt] = address2os[0].first.labels[labCt];
537  }
538  int i;
539  for (i = 1; i < numAddrs; i++) {
540  for (labCt = 0; labCt < depth; labCt++) {
541  if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
542  int labCt2;
543  for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
544  counts[labCt2] = 0;
545  lastLabel[labCt2] = address2os[i].first.labels[labCt2];
546  }
547  counts[labCt]++;
548  lastLabel[labCt] = address2os[i].first.labels[labCt];
549  break;
550  }
551  }
552  for (labCt = 0; labCt < depth; labCt++) {
553  address2os[i].first.childNums[labCt] = counts[labCt];
554  }
555  for (; labCt < (int)Address::maxDepth; labCt++) {
556  address2os[i].first.childNums[labCt] = 0;
557  }
558  }
559 }
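//
// Editorial sketch (assumption, not library code): the same renumbering on a
// plain table of labels (numAddrs rows of 'depth' labels each, stored row by
// row and already sorted by label). Each label is replaced by its ordinal
// position among its siblings, so the node/package example in the comment
// above, labels {0,601},{0,602},{1,603},{1,604}, renumbers to
// {0,0},{0,1},{1,0},{1,1}. The example_* helper below is illustrative only.
//
static void
example_assign_child_nums(const unsigned *labels, unsigned *childNums,
                          int numAddrs, int depth)
{
    unsigned counts[32], lastLabel[32];      // depth <= 32, as in Address::maxDepth
    int i, lev, l2;
    for (lev = 0; lev < depth; lev++) {      // first row gets all-zero child numbers
        childNums[lev] = counts[lev] = 0;
        lastLabel[lev] = labels[lev];
    }
    for (i = 1; i < numAddrs; i++) {
        const unsigned *row = labels + i * depth;
        for (lev = 0; lev < depth; lev++) {
            if (row[lev] != lastLabel[lev]) {
                for (l2 = lev + 1; l2 < depth; l2++) {  // new parent at 'lev':
                    counts[l2] = 0;                     // reset deeper ordinals
                    lastLabel[l2] = row[l2];
                }
                counts[lev]++;
                lastLabel[lev] = row[lev];
                break;
            }
        }
        for (lev = 0; lev < depth; lev++)
            childNums[i * depth + lev] = counts[lev];
    }
}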
560 
561 
562 //
563 // All of the __kmp_affinity_create_*_map() routines should set
564 // __kmp_affinity_masks to a vector of affinity mask objects of length
565 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
566 // return the number of levels in the machine topology tree (zero if
567 // __kmp_affinity_type == affinity_none).
568 //
569 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
570 // to the affinity mask for the initialization thread. They need to save and
571 // restore the mask, and it could be needed later, so saving it is just an
572 // optimization to avoid calling __kmp_get_system_affinity() again.
573 //
574 static kmp_affin_mask_t *fullMask = NULL;
575 
576 kmp_affin_mask_t *
577 __kmp_affinity_get_fullMask() { return fullMask; }
578 
579 
580 static int nCoresPerPkg, nPackages;
581 static int __kmp_nThreadsPerCore;
582 #ifndef KMP_DFLT_NTH_CORES
583 static int __kmp_ncores;
584 #endif
585 
586 //
587 // __kmp_affinity_uniform_topology() doesn't work when called from
588 // places which support arbitrarily many levels in the machine topology
589 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
590 // or __kmp_affinity_create_x2apicid_map().
591 //
592 inline static bool
593 __kmp_affinity_uniform_topology()
594 {
595  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
596 }
597 
598 
599 //
600 // Print out the detailed machine topology map, i.e. the physical locations
601 // of each OS proc.
602 //
603 static void
604 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
605  int pkgLevel, int coreLevel, int threadLevel)
606 {
607  int proc;
608 
609  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
610  for (proc = 0; proc < len; proc++) {
611  int level;
612  kmp_str_buf_t buf;
613  __kmp_str_buf_init(&buf);
614  for (level = 0; level < depth; level++) {
615  if (level == threadLevel) {
616  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
617  }
618  else if (level == coreLevel) {
619  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
620  }
621  else if (level == pkgLevel) {
622  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
623  }
624  else if (level > pkgLevel) {
625  __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
626  level - pkgLevel - 1);
627  }
628  else {
629  __kmp_str_buf_print(&buf, "L%d ", level);
630  }
631  __kmp_str_buf_print(&buf, "%d ",
632  address2os[proc].first.labels[level]);
633  }
634  KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
635  buf.str);
636  __kmp_str_buf_free(&buf);
637  }
638 }
639 
640 
641 //
642 // If we don't know how to retrieve the machine's processor topology, or
643 // encounter an error in doing so, this routine is called to form a "flat"
644 // mapping of os thread id's <-> processor id's.
645 //
646 static int
647 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
648  kmp_i18n_id_t *const msg_id)
649 {
650  *address2os = NULL;
651  *msg_id = kmp_i18n_null;
652 
653  //
654  // Even if __kmp_affinity_type == affinity_none, this routine might still
655 // be called to set __kmp_ncores, as well as
656  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
657  //
658  if (! KMP_AFFINITY_CAPABLE()) {
659  KMP_ASSERT(__kmp_affinity_type == affinity_none);
660  __kmp_ncores = nPackages = __kmp_xproc;
661  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
662  if (__kmp_affinity_verbose) {
663  KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
664  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
665  KMP_INFORM(Uniform, "KMP_AFFINITY");
666  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
667  __kmp_nThreadsPerCore, __kmp_ncores);
668  }
669  return 0;
670  }
671 
672  //
673  // When affinity is off, this routine will still be called to set
674  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
675  // nCoresPerPkg, & nPackages. Make sure all these vars are set
676  // correctly, and return now if affinity is not enabled.
677  //
678  __kmp_ncores = nPackages = __kmp_avail_proc;
679  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
680  if (__kmp_affinity_verbose) {
681  char buf[KMP_AFFIN_MASK_PRINT_LEN];
682  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
683 
684  KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
685  if (__kmp_affinity_respect_mask) {
686  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
687  } else {
688  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
689  }
690  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
691  KMP_INFORM(Uniform, "KMP_AFFINITY");
692  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
693  __kmp_nThreadsPerCore, __kmp_ncores);
694  }
695  if (__kmp_affinity_type == affinity_none) {
696  return 0;
697  }
698 
699  //
700 // Construct the data structure to be returned.
701  //
702  *address2os = (AddrUnsPair*)
703  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
704  int avail_ct = 0;
705  unsigned int i;
706  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
707  //
708  // Skip this proc if it is not included in the machine model.
709  //
710  if (! KMP_CPU_ISSET(i, fullMask)) {
711  continue;
712  }
713 
714  Address addr(1);
715  addr.labels[0] = i;
716  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
717  }
718  if (__kmp_affinity_verbose) {
719  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
720  }
721 
722  if (__kmp_affinity_gran_levels < 0) {
723  //
724  // Only the package level is modeled in the machine topology map,
725  // so the #levels of granularity is either 0 or 1.
726  //
727  if (__kmp_affinity_gran > affinity_gran_package) {
728  __kmp_affinity_gran_levels = 1;
729  }
730  else {
731  __kmp_affinity_gran_levels = 0;
732  }
733  }
734  return 1;
735 }
736 
737 
738 # if KMP_GROUP_AFFINITY
739 
740 //
741 // If multiple Windows* OS processor groups exist, we can create a 2-level
742 // topology map with the groups at level 0 and the individual procs at
743 // level 1.
744 //
745 // This facilitates letting the threads float among all procs in a group,
746 // if granularity=group (the default when there are multiple groups).
747 //
748 static int
749 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
750  kmp_i18n_id_t *const msg_id)
751 {
752  *address2os = NULL;
753  *msg_id = kmp_i18n_null;
754 
755  //
756  // If we don't have multiple processor groups, return now.
757  // The flat mapping will be used.
758  //
759  if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
760  // FIXME set *msg_id
761  return -1;
762  }
763 
764  //
765 // Construct the data structure to be returned.
766  //
767  *address2os = (AddrUnsPair*)
768  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
769  int avail_ct = 0;
770  int i;
771  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
772  //
773  // Skip this proc if it is not included in the machine model.
774  //
775  if (! KMP_CPU_ISSET(i, fullMask)) {
776  continue;
777  }
778 
779  Address addr(2);
780  addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
781  addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
782  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
783 
784  if (__kmp_affinity_verbose) {
785  KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
786  addr.labels[1]);
787  }
788  }
789 
790  if (__kmp_affinity_gran_levels < 0) {
791  if (__kmp_affinity_gran == affinity_gran_group) {
792  __kmp_affinity_gran_levels = 1;
793  }
794  else if ((__kmp_affinity_gran == affinity_gran_fine)
795  || (__kmp_affinity_gran == affinity_gran_thread)) {
796  __kmp_affinity_gran_levels = 0;
797  }
798  else {
799  const char *gran_str = NULL;
800  if (__kmp_affinity_gran == affinity_gran_core) {
801  gran_str = "core";
802  }
803  else if (__kmp_affinity_gran == affinity_gran_package) {
804  gran_str = "package";
805  }
806  else if (__kmp_affinity_gran == affinity_gran_node) {
807  gran_str = "node";
808  }
809  else {
810  KMP_ASSERT(0);
811  }
812 
813 // Warning: can't use affinity granularity "gran" with group topology method, using "thread"
814  __kmp_affinity_gran_levels = 0;
815  }
816  }
817  return 2;
818 }
819 
820 # endif /* KMP_GROUP_AFFINITY */
821 
822 
823 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
824 
825 static int
826 __kmp_cpuid_mask_width(int count) {
827  int r = 0;
828 
829  while((1<<r) < count)
830  ++r;
831  return r;
832 }
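//
// Editorial sketch (assumption, not library code): how the mask width
// computed above is used further below (see __kmp_affinity_create_apicid_map)
// to split a legacy leaf-1/leaf-4 APIC ID into package, core and thread
// fields. The ApicFields struct and example_* helper are illustrative only.
//
struct ApicFields {
    unsigned pkg, core, thread;
};

static ApicFields
example_split_apic_id(unsigned apicId, unsigned maxThreadsPerPkg,
                      unsigned maxCoresPerPkg)
{
    int widthCT = __kmp_cpuid_mask_width(maxThreadsPerPkg); // core# + thread# bits
    int widthC  = __kmp_cpuid_mask_width(maxCoresPerPkg);   // core# bits
    int widthT  = widthCT - widthC;                         // thread# bits
    ApicFields f;
    f.pkg    = apicId >> widthCT;
    f.core   = (apicId >> widthT) & ((1u << widthC) - 1);
    f.thread = apicId & ((1u << widthT) - 1);
    return f;
}
// e.g. maxThreadsPerPkg = 16, maxCoresPerPkg = 8 gives widths 4/3/1, so
// APIC ID 0x2B (binary 101011) decodes as pkg 2, core 5, thread 1.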
833 
834 
835 class apicThreadInfo {
836 public:
837  unsigned osId; // param to __kmp_affinity_bind_thread
838  unsigned apicId; // from cpuid after binding
839  unsigned maxCoresPerPkg; // ""
840  unsigned maxThreadsPerPkg; // ""
841  unsigned pkgId; // inferred from above values
842  unsigned coreId; // ""
843  unsigned threadId; // ""
844 };
845 
846 
847 static int
848 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
849 {
850  const apicThreadInfo *aa = (const apicThreadInfo *)a;
851  const apicThreadInfo *bb = (const apicThreadInfo *)b;
852  if (aa->osId < bb->osId) return -1;
853  if (aa->osId > bb->osId) return 1;
854  return 0;
855 }
856 
857 
858 static int
859 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
860 {
861  const apicThreadInfo *aa = (const apicThreadInfo *)a;
862  const apicThreadInfo *bb = (const apicThreadInfo *)b;
863  if (aa->pkgId < bb->pkgId) return -1;
864  if (aa->pkgId > bb->pkgId) return 1;
865  if (aa->coreId < bb->coreId) return -1;
866  if (aa->coreId > bb->coreId) return 1;
867  if (aa->threadId < bb->threadId) return -1;
868  if (aa->threadId > bb->threadId) return 1;
869  return 0;
870 }
871 
872 
873 //
874 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
875 // an algorithm which cycles through the available os threads, setting
876 // the current thread's affinity mask to that thread, and then retrieves
877 // the Apic Id for each thread context using the cpuid instruction.
878 //
879 static int
880 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
881  kmp_i18n_id_t *const msg_id)
882 {
883  kmp_cpuid buf;
884  int rc;
885  *address2os = NULL;
886  *msg_id = kmp_i18n_null;
887 
888  //
889  // Check if cpuid leaf 4 is supported.
890  //
891  __kmp_x86_cpuid(0, 0, &buf);
892  if (buf.eax < 4) {
893  *msg_id = kmp_i18n_str_NoLeaf4Support;
894  return -1;
895  }
896 
897  //
898  // The algorithm used starts by setting the affinity to each available
899  // thread and retrieving info from the cpuid instruction, so if we are
900  // not capable of calling __kmp_get_system_affinity() and
901 // __kmp_set_system_affinity(), then we need to do something else - use
902  // the defaults that we calculated from issuing cpuid without binding
903  // to each proc.
904  //
905  if (! KMP_AFFINITY_CAPABLE()) {
906  //
907  // Hack to try and infer the machine topology using only the data
908  // available from cpuid on the current thread, and __kmp_xproc.
909  //
910  KMP_ASSERT(__kmp_affinity_type == affinity_none);
911 
912  //
913  // Get an upper bound on the number of threads per package using
914  // cpuid(1).
915  //
916 // On some OS/chip combinations where HT is supported by the chip
917  // but is disabled, this value will be 2 on a single core chip.
918  // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
919  //
920  __kmp_x86_cpuid(1, 0, &buf);
921  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
922  if (maxThreadsPerPkg == 0) {
923  maxThreadsPerPkg = 1;
924  }
925 
926  //
927  // The num cores per pkg comes from cpuid(4).
928  // 1 must be added to the encoded value.
929  //
930 // The author of cpu_count.cpp treated this as only an upper bound
931  // on the number of cores, but I haven't seen any cases where it
932  // was greater than the actual number of cores, so we will treat
933  // it as exact in this block of code.
934  //
935  // First, we need to check if cpuid(4) is supported on this chip.
936  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
937  // has the value n or greater.
938  //
939  __kmp_x86_cpuid(0, 0, &buf);
940  if (buf.eax >= 4) {
941  __kmp_x86_cpuid(4, 0, &buf);
942  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
943  }
944  else {
945  nCoresPerPkg = 1;
946  }
947 
948  //
949  // There is no way to reliably tell if HT is enabled without issuing
950 // the cpuid instruction from every thread, and correlating the cpuid
951  // info, so if the machine is not affinity capable, we assume that HT
952  // is off. We have seen quite a few machines where maxThreadsPerPkg
953  // is 2, yet the machine does not support HT.
954  //
955  // - Older OSes are usually found on machines with older chips, which
956  // do not support HT.
957  //
958  // - The performance penalty for mistakenly identifying a machine as
959 // HT when it isn't (which results in blocktime being incorrectly set
960 // to 0) is greater than the penalty for mistakenly identifying
961  // a machine as being 1 thread/core when it is really HT enabled
962  // (which results in blocktime being incorrectly set to a positive
963  // value).
964  //
965  __kmp_ncores = __kmp_xproc;
966  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
967  __kmp_nThreadsPerCore = 1;
968  if (__kmp_affinity_verbose) {
969  KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
970  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
971  if (__kmp_affinity_uniform_topology()) {
972  KMP_INFORM(Uniform, "KMP_AFFINITY");
973  } else {
974  KMP_INFORM(NonUniform, "KMP_AFFINITY");
975  }
976  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
977  __kmp_nThreadsPerCore, __kmp_ncores);
978  }
979  return 0;
980  }
981 
982  //
983  //
984  // From here on, we can assume that it is safe to call
985  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
986  // even if __kmp_affinity_type = affinity_none.
987  //
988 
989  //
990  // Save the affinity mask for the current thread.
991  //
992  kmp_affin_mask_t *oldMask;
993  KMP_CPU_ALLOC(oldMask);
994  KMP_ASSERT(oldMask != NULL);
995  __kmp_get_system_affinity(oldMask, TRUE);
996 
997  //
998  // Run through each of the available contexts, binding the current thread
999  // to it, and obtaining the pertinent information using the cpuid instr.
1000  //
1001  // The relevant information is:
1002  //
1003  // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
1004 // has a unique Apic Id, which is of the form pkg# : core# : thread#.
1005  //
1006  // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
1007  // value of this field determines the width of the core# + thread#
1008  // fields in the Apic Id. It is also an upper bound on the number
1009  // of threads per package, but it has been verified that situations
1010 // happen where it is not exact. In particular, on certain OS/chip
1011  // combinations where Intel(R) Hyper-Threading Technology is supported
1012  // by the chip but has
1013  // been disabled, the value of this field will be 2 (for a single core
1014  // chip). On other OS/chip combinations supporting
1015  // Intel(R) Hyper-Threading Technology, the value of
1016  // this field will be 1 when Intel(R) Hyper-Threading Technology is
1017  // disabled and 2 when it is enabled.
1018  //
1019  // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
1020  // value of this field (+1) determines the width of the core# field in
1021  // the Apic Id. The comments in "cpucount.cpp" say that this value is
1022  // an upper bound, but the IA-32 architecture manual says that it is
1023  // exactly the number of cores per package, and I haven't seen any
1024  // case where it wasn't.
1025  //
1026  // From this information, deduce the package Id, core Id, and thread Id,
1027  // and set the corresponding fields in the apicThreadInfo struct.
1028  //
1029  unsigned i;
1030  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1031  __kmp_avail_proc * sizeof(apicThreadInfo));
1032  unsigned nApics = 0;
1033  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
1034  //
1035  // Skip this proc if it is not included in the machine model.
1036  //
1037  if (! KMP_CPU_ISSET(i, fullMask)) {
1038  continue;
1039  }
1040  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1041 
1042  __kmp_affinity_bind_thread(i);
1043  threadInfo[nApics].osId = i;
1044 
1045  //
1046  // The apic id and max threads per pkg come from cpuid(1).
1047  //
1048  __kmp_x86_cpuid(1, 0, &buf);
1049  if (!((buf.edx >> 9) & 1)) {
1050  __kmp_set_system_affinity(oldMask, TRUE);
1051  __kmp_free(threadInfo);
1052  KMP_CPU_FREE(oldMask);
1053  *msg_id = kmp_i18n_str_ApicNotPresent;
1054  return -1;
1055  }
1056  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1057  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1058  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1059  threadInfo[nApics].maxThreadsPerPkg = 1;
1060  }
1061 
1062  //
1063  // Max cores per pkg comes from cpuid(4).
1064  // 1 must be added to the encoded value.
1065  //
1066  // First, we need to check if cpuid(4) is supported on this chip.
1067  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1068  // has the value n or greater.
1069  //
1070  __kmp_x86_cpuid(0, 0, &buf);
1071  if (buf.eax >= 4) {
1072  __kmp_x86_cpuid(4, 0, &buf);
1073  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1074  }
1075  else {
1076  threadInfo[nApics].maxCoresPerPkg = 1;
1077  }
1078 
1079  //
1080  // Infer the pkgId / coreId / threadId using only the info
1081  // obtained locally.
1082  //
1083  int widthCT = __kmp_cpuid_mask_width(
1084  threadInfo[nApics].maxThreadsPerPkg);
1085  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1086 
1087  int widthC = __kmp_cpuid_mask_width(
1088  threadInfo[nApics].maxCoresPerPkg);
1089  int widthT = widthCT - widthC;
1090  if (widthT < 0) {
1091  //
1092  // I've never seen this one happen, but I suppose it could, if
1093  // the cpuid instruction on a chip was really screwed up.
1094  // Make sure to restore the affinity mask before the tail call.
1095  //
1096  __kmp_set_system_affinity(oldMask, TRUE);
1097  __kmp_free(threadInfo);
1098  KMP_CPU_FREE(oldMask);
1099  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1100  return -1;
1101  }
1102 
1103  int maskC = (1 << widthC) - 1;
1104  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1105  &maskC;
1106 
1107  int maskT = (1 << widthT) - 1;
1108  threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1109 
1110  nApics++;
1111  }
1112 
1113  //
1114  // We've collected all the info we need.
1115  // Restore the old affinity mask for this thread.
1116  //
1117  __kmp_set_system_affinity(oldMask, TRUE);
1118 
1119  //
1120  // If there's only one thread context to bind to, form an Address object
1121  // with depth 1 and return immediately (or, if affinity is off, set
1122  // address2os to NULL and return).
1123  //
1124  // If it is configured to omit the package level when there is only a
1125  // single package, the logic at the end of this routine won't work if
1126  // there is only a single thread - it would try to form an Address
1127  // object with depth 0.
1128  //
1129  KMP_ASSERT(nApics > 0);
1130  if (nApics == 1) {
1131  __kmp_ncores = nPackages = 1;
1132  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1133  if (__kmp_affinity_verbose) {
1134  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1135  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1136 
1137  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1138  if (__kmp_affinity_respect_mask) {
1139  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1140  } else {
1141  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1142  }
1143  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1144  KMP_INFORM(Uniform, "KMP_AFFINITY");
1145  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1146  __kmp_nThreadsPerCore, __kmp_ncores);
1147  }
1148 
1149  if (__kmp_affinity_type == affinity_none) {
1150  __kmp_free(threadInfo);
1151  KMP_CPU_FREE(oldMask);
1152  return 0;
1153  }
1154 
1155  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1156  Address addr(1);
1157  addr.labels[0] = threadInfo[0].pkgId;
1158  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1159 
1160  if (__kmp_affinity_gran_levels < 0) {
1161  __kmp_affinity_gran_levels = 0;
1162  }
1163 
1164  if (__kmp_affinity_verbose) {
1165  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1166  }
1167 
1168  __kmp_free(threadInfo);
1169  KMP_CPU_FREE(oldMask);
1170  return 1;
1171  }
1172 
1173  //
1174  // Sort the threadInfo table by physical Id.
1175  //
1176  qsort(threadInfo, nApics, sizeof(*threadInfo),
1177  __kmp_affinity_cmp_apicThreadInfo_phys_id);
1178 
1179  //
1180  // The table is now sorted by pkgId / coreId / threadId, but we really
1181  // don't know the radix of any of the fields. pkgId's may be sparsely
1182  // assigned among the chips on a system. Although coreId's are usually
1183  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1184  // [0..threadsPerCore-1], we don't want to make any such assumptions.
1185  //
1186  // For that matter, we don't know what coresPerPkg and threadsPerCore
1187  // (or the total # packages) are at this point - we want to determine
1188  // that now. We only have an upper bound on the first two figures.
1189  //
1190  // We also perform a consistency check at this point: the values returned
1191  // by the cpuid instruction for any thread bound to a given package had
1192  // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1193  //
1194  nPackages = 1;
1195  nCoresPerPkg = 1;
1196  __kmp_nThreadsPerCore = 1;
1197  unsigned nCores = 1;
1198 
1199  unsigned pkgCt = 1; // to determine radii
1200  unsigned lastPkgId = threadInfo[0].pkgId;
1201  unsigned coreCt = 1;
1202  unsigned lastCoreId = threadInfo[0].coreId;
1203  unsigned threadCt = 1;
1204  unsigned lastThreadId = threadInfo[0].threadId;
1205 
1206 // intra-pkg consistency checks
1207  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1208  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1209 
1210  for (i = 1; i < nApics; i++) {
1211  if (threadInfo[i].pkgId != lastPkgId) {
1212  nCores++;
1213  pkgCt++;
1214  lastPkgId = threadInfo[i].pkgId;
1215  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1216  coreCt = 1;
1217  lastCoreId = threadInfo[i].coreId;
1218  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1219  threadCt = 1;
1220  lastThreadId = threadInfo[i].threadId;
1221 
1222  //
1223  // This is a different package, so go on to the next iteration
1224  // without doing any consistency checks. Reset the consistency
1225  // check vars, though.
1226  //
1227  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1228  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1229  continue;
1230  }
1231 
1232  if (threadInfo[i].coreId != lastCoreId) {
1233  nCores++;
1234  coreCt++;
1235  lastCoreId = threadInfo[i].coreId;
1236  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1237  threadCt = 1;
1238  lastThreadId = threadInfo[i].threadId;
1239  }
1240  else if (threadInfo[i].threadId != lastThreadId) {
1241  threadCt++;
1242  lastThreadId = threadInfo[i].threadId;
1243  }
1244  else {
1245  __kmp_free(threadInfo);
1246  KMP_CPU_FREE(oldMask);
1247  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1248  return -1;
1249  }
1250 
1251  //
1252  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1253 // fields agree between all the threads bound to a given package.
1254  //
1255  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1256  || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1257  __kmp_free(threadInfo);
1258  KMP_CPU_FREE(oldMask);
1259  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1260  return -1;
1261  }
1262  }
1263  nPackages = pkgCt;
1264  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1265  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1266 
1267  //
1268  // When affinity is off, this routine will still be called to set
1269  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1270  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1271  // correctly, and return now if affinity is not enabled.
1272  //
1273  __kmp_ncores = nCores;
1274  if (__kmp_affinity_verbose) {
1275  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1276  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1277 
1278  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1279  if (__kmp_affinity_respect_mask) {
1280  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1281  } else {
1282  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1283  }
1284  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1285  if (__kmp_affinity_uniform_topology()) {
1286  KMP_INFORM(Uniform, "KMP_AFFINITY");
1287  } else {
1288  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1289  }
1290  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1291  __kmp_nThreadsPerCore, __kmp_ncores);
1292 
1293  }
1294 
1295  if (__kmp_affinity_type == affinity_none) {
1296  __kmp_free(threadInfo);
1297  KMP_CPU_FREE(oldMask);
1298  return 0;
1299  }
1300 
1301  //
1302  // Now that we've determined the number of packages, the number of cores
1303  // per package, and the number of threads per core, we can construct the
1304  // data structure that is to be returned.
1305  //
1306  int pkgLevel = 0;
1307  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1308  int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1309  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1310 
1311  KMP_ASSERT(depth > 0);
1312  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1313 
1314  for (i = 0; i < nApics; ++i) {
1315  Address addr(depth);
1316  unsigned os = threadInfo[i].osId;
1317  int d = 0;
1318 
1319  if (pkgLevel >= 0) {
1320  addr.labels[d++] = threadInfo[i].pkgId;
1321  }
1322  if (coreLevel >= 0) {
1323  addr.labels[d++] = threadInfo[i].coreId;
1324  }
1325  if (threadLevel >= 0) {
1326  addr.labels[d++] = threadInfo[i].threadId;
1327  }
1328  (*address2os)[i] = AddrUnsPair(addr, os);
1329  }
1330 
1331  if (__kmp_affinity_gran_levels < 0) {
1332  //
1333  // Set the granularity level based on what levels are modeled
1334  // in the machine topology map.
1335  //
1336  __kmp_affinity_gran_levels = 0;
1337  if ((threadLevel >= 0)
1338  && (__kmp_affinity_gran > affinity_gran_thread)) {
1339  __kmp_affinity_gran_levels++;
1340  }
1341  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1342  __kmp_affinity_gran_levels++;
1343  }
1344  if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1345  __kmp_affinity_gran_levels++;
1346  }
1347  }
1348 
1349  if (__kmp_affinity_verbose) {
1350  __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1351  coreLevel, threadLevel);
1352  }
1353 
1354  __kmp_free(threadInfo);
1355  KMP_CPU_FREE(oldMask);
1356  return depth;
1357 }
1358 
1359 
1360 //
1361 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1362 // architectures support a newer interface for specifying the x2APIC Ids,
1363 // based on cpuid leaf 11.
1364 //
1365 static int
1366 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1367  kmp_i18n_id_t *const msg_id)
1368 {
1369  kmp_cpuid buf;
1370  *address2os = NULL;
1371  *msg_id = kmp_i18n_null;
1372 
1373  //
1374  // Check to see if cpuid leaf 11 is supported.
1375  //
1376  __kmp_x86_cpuid(0, 0, &buf);
1377  if (buf.eax < 11) {
1378  *msg_id = kmp_i18n_str_NoLeaf11Support;
1379  return -1;
1380  }
1381  __kmp_x86_cpuid(11, 0, &buf);
1382  if (buf.ebx == 0) {
1383  *msg_id = kmp_i18n_str_NoLeaf11Support;
1384  return -1;
1385  }
1386 
1387  //
1388  // Find the number of levels in the machine topology. While we're at it,
1389  // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1390  // try to get more accurate values later by explicitly counting them,
1391  // but get reasonable defaults now, in case we return early.
1392  //
1393  int level;
1394  int threadLevel = -1;
1395  int coreLevel = -1;
1396  int pkgLevel = -1;
1397  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1398 
1399  for (level = 0;; level++) {
1400  if (level > 31) {
1401  //
1402  // FIXME: Hack for DPD200163180
1403  //
1404  // If level is big then something went wrong -> exiting
1405  //
1406  // There could actually be 32 valid levels in the machine topology,
1407  // but so far, the only machine we have seen which does not exit
1408  // this loop before iteration 32 has fubar x2APIC settings.
1409  //
1410  // For now, just reject this case based upon loop trip count.
1411  //
1412  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1413  return -1;
1414  }
1415  __kmp_x86_cpuid(11, level, &buf);
1416  if (buf.ebx == 0) {
1417  if (pkgLevel < 0) {
1418  //
1419  // Will infer nPackages from __kmp_xproc
1420  //
1421  pkgLevel = level;
1422  level++;
1423  }
1424  break;
1425  }
1426  int kind = (buf.ecx >> 8) & 0xff;
1427  if (kind == 1) {
1428  //
1429  // SMT level
1430  //
1431  threadLevel = level;
1432  coreLevel = -1;
1433  pkgLevel = -1;
1434  __kmp_nThreadsPerCore = buf.ebx & 0xff;
1435  if (__kmp_nThreadsPerCore == 0) {
1436  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1437  return -1;
1438  }
1439  }
1440  else if (kind == 2) {
1441  //
1442  // core level
1443  //
1444  coreLevel = level;
1445  pkgLevel = -1;
1446  nCoresPerPkg = buf.ebx & 0xff;
1447  if (nCoresPerPkg == 0) {
1448  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1449  return -1;
1450  }
1451  }
1452  else {
1453  if (level <= 0) {
1454  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1455  return -1;
1456  }
1457  if (pkgLevel >= 0) {
1458  continue;
1459  }
1460  pkgLevel = level;
1461  nPackages = buf.ebx & 0xff;
1462  if (nPackages == 0) {
1463  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1464  return -1;
1465  }
1466  }
1467  }
1468  int depth = level;
1469 
1470  //
1471  // In the above loop, "level" was counted from the finest level (usually
1472  // thread) to the coarsest. The caller expects that we will place the
1473  // labels in (*address2os)[].first.labels[] in the inverse order, so
1474  // we need to invert the vars saying which level means what.
1475  //
1476  if (threadLevel >= 0) {
1477  threadLevel = depth - threadLevel - 1;
1478  }
1479  if (coreLevel >= 0) {
1480  coreLevel = depth - coreLevel - 1;
1481  }
1482  KMP_DEBUG_ASSERT(pkgLevel >= 0);
1483  pkgLevel = depth - pkgLevel - 1;
1484 
1485  //
1486  // The algorithm used starts by setting the affinity to each available
1487  // thread and retrieving info from the cpuid instruction, so if we are
1488  // not capable of calling __kmp_get_system_affinity() and
1489 // __kmp_set_system_affinity(), then we need to do something else - use
1490  // the defaults that we calculated from issuing cpuid without binding
1491  // to each proc.
1492  //
1493  if (! KMP_AFFINITY_CAPABLE())
1494  {
1495  //
1496  // Hack to try and infer the machine topology using only the data
1497  // available from cpuid on the current thread, and __kmp_xproc.
1498  //
1499  KMP_ASSERT(__kmp_affinity_type == affinity_none);
1500 
1501  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1502  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1503  if (__kmp_affinity_verbose) {
1504  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1505  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1506  if (__kmp_affinity_uniform_topology()) {
1507  KMP_INFORM(Uniform, "KMP_AFFINITY");
1508  } else {
1509  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1510  }
1511  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1512  __kmp_nThreadsPerCore, __kmp_ncores);
1513  }
1514  return 0;
1515  }
1516 
1517  //
1518  //
1519  // From here on, we can assume that it is safe to call
1520  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1521  // even if __kmp_affinity_type = affinity_none.
1522  //
1523 
1524  //
1525  // Save the affinity mask for the current thread.
1526  //
1527  kmp_affin_mask_t *oldMask;
1528  KMP_CPU_ALLOC(oldMask);
1529  __kmp_get_system_affinity(oldMask, TRUE);
1530 
1531  //
1532  // Allocate the data structure to be returned.
1533  //
1534  AddrUnsPair *retval = (AddrUnsPair *)
1535  __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1536 
1537  //
1538  // Run through each of the available contexts, binding the current thread
1539  // to it, and obtaining the pertinent information using the cpuid instr.
1540  //
1541  unsigned int proc;
1542  int nApics = 0;
1543  for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1544  //
1545  // Skip this proc if it is not included in the machine model.
1546  //
1547  if (! KMP_CPU_ISSET(proc, fullMask)) {
1548  continue;
1549  }
1550  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1551 
1552  __kmp_affinity_bind_thread(proc);
1553 
1554  //
1555 // Extract the labels for each level in the machine topology map
1556  // from the Apic ID.
1557  //
1558  Address addr(depth);
1559  int prev_shift = 0;
1560 
1561  for (level = 0; level < depth; level++) {
1562  __kmp_x86_cpuid(11, level, &buf);
1563  unsigned apicId = buf.edx;
1564  if (buf.ebx == 0) {
1565  if (level != depth - 1) {
1566  KMP_CPU_FREE(oldMask);
1567  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1568  return -1;
1569  }
1570  addr.labels[depth - level - 1] = apicId >> prev_shift;
1571  level++;
1572  break;
1573  }
1574  int shift = buf.eax & 0x1f;
1575  int mask = (1 << shift) - 1;
1576  addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1577  prev_shift = shift;
1578  }
1579  if (level != depth) {
1580  KMP_CPU_FREE(oldMask);
1581  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1582  return -1;
1583  }
1584 
1585  retval[nApics] = AddrUnsPair(addr, proc);
1586  nApics++;
1587  }
1588 
1589  //
1590  // We've collected all the info we need.
1591  // Restore the old affinity mask for this thread.
1592  //
1593  __kmp_set_system_affinity(oldMask, TRUE);
1594 
1595  //
1596  // If there's only one thread context to bind to, return now.
1597  //
1598  KMP_ASSERT(nApics > 0);
1599  if (nApics == 1) {
1600  __kmp_ncores = nPackages = 1;
1601  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1602  if (__kmp_affinity_verbose) {
1603  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1604  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1605 
1606  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1607  if (__kmp_affinity_respect_mask) {
1608  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1609  } else {
1610  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1611  }
1612  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1613  KMP_INFORM(Uniform, "KMP_AFFINITY");
1614  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1615  __kmp_nThreadsPerCore, __kmp_ncores);
1616  }
1617 
1618  if (__kmp_affinity_type == affinity_none) {
1619  __kmp_free(retval);
1620  KMP_CPU_FREE(oldMask);
1621  return 0;
1622  }
1623 
1624  //
1625  // Form an Address object which only includes the package level.
1626  //
1627  Address addr(1);
1628  addr.labels[0] = retval[0].first.labels[pkgLevel];
1629  retval[0].first = addr;
1630 
1631  if (__kmp_affinity_gran_levels < 0) {
1632  __kmp_affinity_gran_levels = 0;
1633  }
1634 
1635  if (__kmp_affinity_verbose) {
1636  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1637  }
1638 
1639  *address2os = retval;
1640  KMP_CPU_FREE(oldMask);
1641  return 1;
1642  }
1643 
1644  //
1645  // Sort the table by physical Id.
1646  //
1647  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1648 
1649  //
1650  // Find the radix at each of the levels.
1651  //
1652  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1653  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1654  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1655  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1656  for (level = 0; level < depth; level++) {
1657  totals[level] = 1;
1658  maxCt[level] = 1;
1659  counts[level] = 1;
1660  last[level] = retval[0].first.labels[level];
1661  }
1662 
1663  //
1664  // From here on, the iteration variable "level" runs from the finest
1665  // level to the coarsest, i.e. we iterate forward through
1666  // (*address2os)[].first.labels[] - in the previous loops, we iterated
1667  // backwards.
1668  //
1669  for (proc = 1; (int)proc < nApics; proc++) {
1670  int level;
1671  for (level = 0; level < depth; level++) {
1672  if (retval[proc].first.labels[level] != last[level]) {
1673  int j;
1674  for (j = level + 1; j < depth; j++) {
1675  totals[j]++;
1676  counts[j] = 1;
1677 // The line below caused incorrect topology information to be printed
1678 // when the maximum value for some level (maxCt[level]) was encountered
1679 // before a smaller value while going through the array.
1680 // For example, if pkg0 has 4 cores and pkg1 has 2 cores, then maxCt[1] == 2
1681 // whereas it should be 4.
1682 // TODO!!! Check whether it can safely stay commented out.
1683  //maxCt[j] = 1;
1684  last[j] = retval[proc].first.labels[j];
1685  }
1686  totals[level]++;
1687  counts[level]++;
1688  if (counts[level] > maxCt[level]) {
1689  maxCt[level] = counts[level];
1690  }
1691  last[level] = retval[proc].first.labels[level];
1692  break;
1693  }
1694  else if (level == depth - 1) {
1695  __kmp_free(last);
1696  __kmp_free(maxCt);
1697  __kmp_free(counts);
1698  __kmp_free(totals);
1699  __kmp_free(retval);
1700  KMP_CPU_FREE(oldMask);
1701  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1702  return -1;
1703  }
1704  }
1705  }
1706 
1707  //
1708  // When affinity is off, this routine will still be called to set
1709  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1710  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1711  // correctly, and return if affinity is not enabled.
1712  //
1713  if (threadLevel >= 0) {
1714  __kmp_nThreadsPerCore = maxCt[threadLevel];
1715  }
1716  else {
1717  __kmp_nThreadsPerCore = 1;
1718  }
1719  nPackages = totals[pkgLevel];
1720 
1721  if (coreLevel >= 0) {
1722  __kmp_ncores = totals[coreLevel];
1723  nCoresPerPkg = maxCt[coreLevel];
1724  }
1725  else {
1726  __kmp_ncores = nPackages;
1727  nCoresPerPkg = 1;
1728  }
1729 
1730  //
1731  // Check to see if the machine topology is uniform
1732  //
1733  unsigned prod = maxCt[0];
1734  for (level = 1; level < depth; level++) {
1735  prod *= maxCt[level];
1736  }
1737  bool uniform = (prod == totals[level - 1]);
1738 
1739  //
1740  // Print the machine topology summary.
1741  //
1742  if (__kmp_affinity_verbose) {
1743  char mask[KMP_AFFIN_MASK_PRINT_LEN];
1744  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1745 
1746  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1747  if (__kmp_affinity_respect_mask) {
1748  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1749  } else {
1750  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1751  }
1752  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1753  if (uniform) {
1754  KMP_INFORM(Uniform, "KMP_AFFINITY");
1755  } else {
1756  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1757  }
1758 
1759  kmp_str_buf_t buf;
1760  __kmp_str_buf_init(&buf);
1761 
1762  __kmp_str_buf_print(&buf, "%d", totals[0]);
1763  for (level = 1; level <= pkgLevel; level++) {
1764  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1765  }
1766  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1767  __kmp_nThreadsPerCore, __kmp_ncores);
1768 
1769  __kmp_str_buf_free(&buf);
1770  }
1771 
1772  if (__kmp_affinity_type == affinity_none) {
1773  __kmp_free(last);
1774  __kmp_free(maxCt);
1775  __kmp_free(counts);
1776  __kmp_free(totals);
1777  __kmp_free(retval);
1778  KMP_CPU_FREE(oldMask);
1779  return 0;
1780  }
1781 
1782  //
1783  // Find any levels with radix 1, and remove them from the map
1784  // (except for the package level).
1785  //
1786  int new_depth = 0;
1787  for (level = 0; level < depth; level++) {
1788  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1789  continue;
1790  }
1791  new_depth++;
1792  }
1793 
1794  //
1795  // If we are removing any levels, allocate a new vector to return,
1796  // and copy the relevant information to it.
1797  //
1798  if (new_depth != depth) {
1799  AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1800  sizeof(AddrUnsPair) * nApics);
1801  for (proc = 0; (int)proc < nApics; proc++) {
1802  Address addr(new_depth);
1803  new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1804  }
1805  int new_level = 0;
1806  for (level = 0; level < depth; level++) {
1807  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1808  if (level == threadLevel) {
1809  threadLevel = -1;
1810  }
1811  else if ((threadLevel >= 0) && (level < threadLevel)) {
1812  threadLevel--;
1813  }
1814  if (level == coreLevel) {
1815  coreLevel = -1;
1816  }
1817  else if ((coreLevel >= 0) && (level < coreLevel)) {
1818  coreLevel--;
1819  }
1820  if (level < pkgLevel) {
1821  pkgLevel--;
1822  }
1823  continue;
1824  }
1825  for (proc = 0; (int)proc < nApics; proc++) {
1826  new_retval[proc].first.labels[new_level]
1827  = retval[proc].first.labels[level];
1828  }
1829  new_level++;
1830  }
1831 
1832  __kmp_free(retval);
1833  retval = new_retval;
1834  depth = new_depth;
1835  }
1836 
1837  if (__kmp_affinity_gran_levels < 0) {
1838  //
1839  // Set the granularity level based on what levels are modeled
1840  // in the machine topology map.
1841  //
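 // For example, with granularity=core on a pkg/core/thread map, only the
 // thread level lies below the requested granularity, so
 // __kmp_affinity_gran_levels becomes 1 and thread-level distinctions are
 // ignored when the affinity masks are later formed.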
1842  __kmp_affinity_gran_levels = 0;
1843  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1844  __kmp_affinity_gran_levels++;
1845  }
1846  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1847  __kmp_affinity_gran_levels++;
1848  }
1849  if (__kmp_affinity_gran > affinity_gran_package) {
1850  __kmp_affinity_gran_levels++;
1851  }
1852  }
1853 
1854  if (__kmp_affinity_verbose) {
1855  __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1856  coreLevel, threadLevel);
1857  }
1858 
1859  __kmp_free(last);
1860  __kmp_free(maxCt);
1861  __kmp_free(counts);
1862  __kmp_free(totals);
1863  KMP_CPU_FREE(oldMask);
1864  *address2os = retval;
1865  return depth;
1866 }
1867 
1868 
1869 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1870 
1871 
1872 #define osIdIndex 0
1873 #define threadIdIndex 1
1874 #define coreIdIndex 2
1875 #define pkgIdIndex 3
1876 #define nodeIdIndex 4
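 
// Each per-proc record parsed from cpuinfo is stored as an array of
// unsigned values; the indices above name its fields. maxIndex starts at
// pkgIdIndex and is raised if node_<n> levels are found in the file.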
1877 
1878 typedef unsigned *ProcCpuInfo;
1879 static unsigned maxIndex = pkgIdIndex;
1880 
1881 
1882 static int
1883 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1884 {
1885  const unsigned *aa = (const unsigned *)a;
1886  const unsigned *bb = (const unsigned *)b;
1887  if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1888  if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1889  return 0;
1890 }
1891 
1892 
1893 static int
1894 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1895 {
1896  unsigned i;
1897  const unsigned *aa = *((const unsigned **)a);
1898  const unsigned *bb = *((const unsigned **)b);
1899  for (i = maxIndex; ; i--) {
1900  if (aa[i] < bb[i]) return -1;
1901  if (aa[i] > bb[i]) return 1;
1902  if (i == osIdIndex) break;
1903  }
1904  return 0;
1905 }
1906 
1907 
1908 //
1909 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1910 // affinity map.
1911 //
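// A processor record in /proc/cpuinfo typically looks like the fragment
// below (illustrative; fields other than those parsed here are ignored),
// and a blank line terminates each record:
//
//     processor       : 0
//     physical id     : 0
//     core id         : 0
//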
1912 static int
1913 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1914  kmp_i18n_id_t *const msg_id, FILE *f)
1915 {
1916  *address2os = NULL;
1917  *msg_id = kmp_i18n_null;
1918 
1919  //
1920  // Scan the file, and count the number of "processor" (osId) fields,
1921  // and find the highest value of <n> for a node_<n> field.
1922  //
1923  char buf[256];
1924  unsigned num_records = 0;
1925  while (! feof(f)) {
1926  buf[sizeof(buf) - 1] = 1;
1927  if (! fgets(buf, sizeof(buf), f)) {
1928  //
1929  // Read errors presumably because of EOF
1930  //
1931  break;
1932  }
1933 
1934  char s1[] = "processor";
1935  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1936  num_records++;
1937  continue;
1938  }
1939 
1940  //
1941  // FIXME - this will match "node_<n> <garbage>"
1942  //
1943  unsigned level;
1944  if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1945  if (nodeIdIndex + level >= maxIndex) {
1946  maxIndex = nodeIdIndex + level;
1947  }
1948  continue;
1949  }
1950  }
1951 
1952  //
1953  // Check for empty file / no valid processor records, or too many.
1954  // The number of records can't exceed the number of valid bits in the
1955  // affinity mask.
1956  //
1957  if (num_records == 0) {
1958  *line = 0;
1959  *msg_id = kmp_i18n_str_NoProcRecords;
1960  return -1;
1961  }
1962  if (num_records > (unsigned)__kmp_xproc) {
1963  *line = 0;
1964  *msg_id = kmp_i18n_str_TooManyProcRecords;
1965  return -1;
1966  }
1967 
1968  //
1969  // Set the file pointer back to the beginning, so that we can scan the
1970  // file again, this time performing a full parse of the data.
1971  // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1972  // Adding an extra element at the end allows us to remove a lot of extra
1973  // checks for termination conditions.
1974  //
1975  if (fseek(f, 0, SEEK_SET) != 0) {
1976  *line = 0;
1977  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1978  return -1;
1979  }
1980 
1981  //
1982  // Allocate the array of records to store the proc info in. The dummy
1983  // element at the end makes the logic in filling them out easier to code.
1984  //
1985  unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1986  * sizeof(unsigned *));
1987  unsigned i;
1988  for (i = 0; i <= num_records; i++) {
1989  threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1990  * sizeof(unsigned));
1991  }
1992 
1993 #define CLEANUP_THREAD_INFO \
1994  for (i = 0; i <= num_records; i++) { \
1995  __kmp_free(threadInfo[i]); \
1996  } \
1997  __kmp_free(threadInfo);
1998 
1999  //
2000  // A value of UINT_MAX means that we didn't find the field
2001  //
2002  unsigned __index;
2003 
2004 #define INIT_PROC_INFO(p) \
2005  for (__index = 0; __index <= maxIndex; __index++) { \
2006  (p)[__index] = UINT_MAX; \
2007  }
2008 
2009  for (i = 0; i <= num_records; i++) {
2010  INIT_PROC_INFO(threadInfo[i]);
2011  }
2012 
2013  unsigned num_avail = 0;
2014  *line = 0;
2015  while (! feof(f)) {
2016  //
2017  // Create an inner scoping level, so that all the goto targets at the
2018  // end of the loop appear in an outer scoping level. This avoids
2019  // warnings about jumping past an initialization to a target in the
2020  // same block.
2021  //
2022  {
2023  buf[sizeof(buf) - 1] = 1;
2024  bool long_line = false;
2025  if (! fgets(buf, sizeof(buf), f)) {
2026  //
2027  // Read errors presumably because of EOF
2028  //
2029  // If there is valid data in threadInfo[num_avail], then fake
2030  // a blank line to ensure that the last address gets parsed.
2031  //
2032  bool valid = false;
2033  for (i = 0; i <= maxIndex; i++) {
2034  if (threadInfo[num_avail][i] != UINT_MAX) {
2035  valid = true;
2036  }
2037  }
2038  if (! valid) {
2039  break;
2040  }
2041  buf[0] = 0;
2042  } else if (!buf[sizeof(buf) - 1]) {
2043  //
2044  // The line is longer than the buffer. Set a flag and don't
2045  // emit an error if we were going to ignore the line, anyway.
2046  //
2047  long_line = true;
2048 
2049 #define CHECK_LINE \
2050  if (long_line) { \
2051  CLEANUP_THREAD_INFO; \
2052  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
2053  return -1; \
2054  }
2055  }
2056  (*line)++;
2057 
2058  char s1[] = "processor";
2059  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2060  CHECK_LINE;
2061  char *p = strchr(buf + sizeof(s1) - 1, ':');
2062  unsigned val;
2063  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2064  if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2065  threadInfo[num_avail][osIdIndex] = val;
2066 #if KMP_OS_LINUX && USE_SYSFS_INFO
2067  char path[256];
2068  KMP_SNPRINTF(path, sizeof(path),
2069  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2070  threadInfo[num_avail][osIdIndex]);
2071  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2072 
2073  KMP_SNPRINTF(path, sizeof(path),
2074  "/sys/devices/system/cpu/cpu%u/topology/core_id",
2075  threadInfo[num_avail][osIdIndex]);
2076  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2077  continue;
2078 #else
2079  }
2080  char s2[] = "physical id";
2081  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2082  CHECK_LINE;
2083  char *p = strchr(buf + sizeof(s2) - 1, ':');
2084  unsigned val;
2085  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2086  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2087  threadInfo[num_avail][pkgIdIndex] = val;
2088  continue;
2089  }
2090  char s3[] = "core id";
2091  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2092  CHECK_LINE;
2093  char *p = strchr(buf + sizeof(s3) - 1, ':');
2094  unsigned val;
2095  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2096  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2097  threadInfo[num_avail][coreIdIndex] = val;
2098  continue;
2099 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2100  }
2101  char s4[] = "thread id";
2102  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2103  CHECK_LINE;
2104  char *p = strchr(buf + sizeof(s4) - 1, ':');
2105  unsigned val;
2106  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2107  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2108  threadInfo[num_avail][threadIdIndex] = val;
2109  continue;
2110  }
2111  unsigned level;
2112  if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
2113  CHECK_LINE;
2114  char *p = strchr(buf + sizeof(s4) - 1, ':');
2115  unsigned val;
2116  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2117  KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2118  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2119  threadInfo[num_avail][nodeIdIndex + level] = val;
2120  continue;
2121  }
2122 
2123  //
2124  // We didn't recognize the leading token on the line.
2125  // There are lots of leading tokens that we don't recognize -
2126  // if the line isn't empty, go on to the next line.
2127  //
2128  if ((*buf != 0) && (*buf != '\n')) {
2129  //
2130  // If the line is longer than the buffer, read characters
2131  // until we find a newline.
2132  //
2133  if (long_line) {
2134  int ch;
2135  while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2136  }
2137  continue;
2138  }
2139 
2140  //
2141  // A newline has signalled the end of the processor record.
2142  // Check that there aren't too many procs specified.
2143  //
2144  if ((int)num_avail == __kmp_xproc) {
2145  CLEANUP_THREAD_INFO;
2146  *msg_id = kmp_i18n_str_TooManyEntries;
2147  return -1;
2148  }
2149 
2150  //
2151  // Check for missing fields. The osId field must be there, and we
2152  // currently require that the physical id field is specified, also.
2153  //
2154  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2155  CLEANUP_THREAD_INFO;
2156  *msg_id = kmp_i18n_str_MissingProcField;
2157  return -1;
2158  }
2159  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2160  CLEANUP_THREAD_INFO;
2161  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2162  return -1;
2163  }
2164 
2165  //
2166  // Skip this proc if it is not included in the machine model.
2167  //
2168  if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2169  INIT_PROC_INFO(threadInfo[num_avail]);
2170  continue;
2171  }
2172 
2173  //
2174  // We have a successful parse of this proc's info.
2175  // Increment the counter, and prepare for the next proc.
2176  //
2177  num_avail++;
2178  KMP_ASSERT(num_avail <= num_records);
2179  INIT_PROC_INFO(threadInfo[num_avail]);
2180  }
2181  continue;
2182 
2183  no_val:
2184  CLEANUP_THREAD_INFO;
2185  *msg_id = kmp_i18n_str_MissingValCpuinfo;
2186  return -1;
2187 
2188  dup_field:
2189  CLEANUP_THREAD_INFO;
2190  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2191  return -1;
2192  }
2193  *line = 0;
2194 
2195 # if KMP_MIC && REDUCE_TEAM_SIZE
2196  unsigned teamSize = 0;
2197 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2198 
2199  // check for num_records == __kmp_xproc ???
2200 
2201  //
2202  // If there's only one thread context to bind to, form an Address object
2203  // with depth 1 and return immediately (or, if affinity is off, set
2204  // address2os to NULL and return).
2205  //
2206  // If it is configured to omit the package level when there is only a
2207  // single package, the logic at the end of this routine won't work if
2208  // there is only a single thread - it would try to form an Address
2209  // object with depth 0.
2210  //
2211  KMP_ASSERT(num_avail > 0);
2212  KMP_ASSERT(num_avail <= num_records);
2213  if (num_avail == 1) {
2214  __kmp_ncores = 1;
2215  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2216  if (__kmp_affinity_verbose) {
2217  if (! KMP_AFFINITY_CAPABLE()) {
2218  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2219  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2220  KMP_INFORM(Uniform, "KMP_AFFINITY");
2221  }
2222  else {
2223  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2224  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2225  fullMask);
2226  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2227  if (__kmp_affinity_respect_mask) {
2228  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2229  } else {
2230  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2231  }
2232  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2233  KMP_INFORM(Uniform, "KMP_AFFINITY");
2234  }
2235  int index;
2236  kmp_str_buf_t buf;
2237  __kmp_str_buf_init(&buf);
2238  __kmp_str_buf_print(&buf, "1");
2239  for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2240  __kmp_str_buf_print(&buf, " x 1");
2241  }
2242  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2243  __kmp_str_buf_free(&buf);
2244  }
2245 
2246  if (__kmp_affinity_type == affinity_none) {
2247  CLEANUP_THREAD_INFO;
2248  return 0;
2249  }
2250 
2251  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2252  Address addr(1);
2253  addr.labels[0] = threadInfo[0][pkgIdIndex];
2254  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2255 
2256  if (__kmp_affinity_gran_levels < 0) {
2257  __kmp_affinity_gran_levels = 0;
2258  }
2259 
2260  if (__kmp_affinity_verbose) {
2261  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2262  }
2263 
2264  CLEANUP_THREAD_INFO;
2265  return 1;
2266  }
2267 
2268  //
2269  // Sort the threadInfo table by physical Id.
2270  //
2271  qsort(threadInfo, num_avail, sizeof(*threadInfo),
2272  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2273 
2274  //
2275  // The table is now sorted by pkgId / coreId / threadId, but we really
2276  // don't know the radix of any of the fields. pkgId's may be sparsely
2277  // assigned among the chips on a system. Although coreId's are usually
2278  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2279  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2280  //
2281  // For that matter, we don't know what coresPerPkg and threadsPerCore
2282  // (or the total # packages) are at this point - we want to determine
2283  // that now. We only have an upper bound on the first two figures.
2284  //
2285  unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2286  * sizeof(unsigned));
2287  unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2288  * sizeof(unsigned));
2289  unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2290  * sizeof(unsigned));
2291  unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2292  * sizeof(unsigned));
2293 
2294  bool assign_thread_ids = false;
2295  unsigned threadIdCt;
2296  unsigned index;
2297 
2298  restart_radix_check:
2299  threadIdCt = 0;
2300 
2301  //
2302  // Initialize the counter arrays with data from threadInfo[0].
2303  //
2304  if (assign_thread_ids) {
2305  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2306  threadInfo[0][threadIdIndex] = threadIdCt++;
2307  }
2308  else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2309  threadIdCt = threadInfo[0][threadIdIndex] + 1;
2310  }
2311  }
2312  for (index = 0; index <= maxIndex; index++) {
2313  counts[index] = 1;
2314  maxCt[index] = 1;
2315  totals[index] = 1;
2316  lastId[index] = threadInfo[0][index];
2317  }
2318 
2319  //
2320  // Run through the rest of the OS procs.
2321  //
2322  for (i = 1; i < num_avail; i++) {
2323  //
2324  // Find the most significant index whose id differs
2325  // from the id for the previous OS proc.
2326  //
2327  for (index = maxIndex; index >= threadIdIndex; index--) {
2328  if (assign_thread_ids && (index == threadIdIndex)) {
2329  //
2330  // Auto-assign the thread id field if it wasn't specified.
2331  //
2332  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2333  threadInfo[i][threadIdIndex] = threadIdCt++;
2334  }
2335 
2336  //
2337  // Apparently the thread id field was specified for some
2338  // entries and not others. Start the thread id counter
2339  // off at the next higher thread id.
2340  //
2341  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2342  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2343  }
2344  }
2345  if (threadInfo[i][index] != lastId[index]) {
2346  //
2347  // Run through all indices which are less significant,
2348  // and reset the counts to 1.
2349  //
2350  // At all levels up to and including index, we need to
2351  // increment the totals and record the last id.
2352  //
2353  unsigned index2;
2354  for (index2 = threadIdIndex; index2 < index; index2++) {
2355  totals[index2]++;
2356  if (counts[index2] > maxCt[index2]) {
2357  maxCt[index2] = counts[index2];
2358  }
2359  counts[index2] = 1;
2360  lastId[index2] = threadInfo[i][index2];
2361  }
2362  counts[index]++;
2363  totals[index]++;
2364  lastId[index] = threadInfo[i][index];
2365 
2366  if (assign_thread_ids && (index > threadIdIndex)) {
2367 
2368 # if KMP_MIC && REDUCE_TEAM_SIZE
2369  //
2370  // The default team size is the total #threads in the machine
2371  // minus 1 thread for every core that has 3 or more threads.
2372  //
2373  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2374 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2375 
2376  //
2377  // Restart the thread counter, as we are on a new core.
2378  //
2379  threadIdCt = 0;
2380 
2381  //
2382  // Auto-assign the thread id field if it wasn't specified.
2383  //
2384  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2385  threadInfo[i][threadIdIndex] = threadIdCt++;
2386  }
2387 
2388  //
2389  // Apparently the thread id field was specified for some
2390  // entries and not others. Start the thread id counter
2391  // off at the next higher thread id.
2392  //
2393  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2394  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2395  }
2396  }
2397  break;
2398  }
2399  }
2400  if (index < threadIdIndex) {
2401  //
2402  // If thread ids were specified, it is an error if they are not
2403  // unique. Also, check that we haven't already restarted the
2404  // loop (to be safe - shouldn't need to).
2405  //
2406  if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2407  || assign_thread_ids) {
2408  __kmp_free(lastId);
2409  __kmp_free(totals);
2410  __kmp_free(maxCt);
2411  __kmp_free(counts);
2412  CLEANUP_THREAD_INFO;
2413  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2414  return -1;
2415  }
2416 
2417  //
2418  // If the thread ids were not specified and we see entries
2419  // that are duplicates, start the loop over and
2420  // assign the thread ids manually.
2421  //
2422  assign_thread_ids = true;
2423  goto restart_radix_check;
2424  }
2425  }
2426 
2427 # if KMP_MIC && REDUCE_TEAM_SIZE
2428  //
2429  // The default team size is the total #threads in the machine
2430  // minus 1 thread for every core that has 3 or more threads.
2431  //
2432  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2433 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2434 
2435  for (index = threadIdIndex; index <= maxIndex; index++) {
2436  if (counts[index] > maxCt[index]) {
2437  maxCt[index] = counts[index];
2438  }
2439  }
2440 
2441  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2442  nCoresPerPkg = maxCt[coreIdIndex];
2443  nPackages = totals[pkgIdIndex];
2444 
2445  //
2446  // Check to see if the machine topology is uniform
2447  //
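 // For example, 2 packages x 4 cores/pkg x 2 threads/core with every
 // context present gives prod == 16 == totals[threadIdIndex], so the
 // topology is uniform; a machine where one package has fewer cores
 // populated would be reported as non-uniform.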
2448  unsigned prod = totals[maxIndex];
2449  for (index = threadIdIndex; index < maxIndex; index++) {
2450  prod *= maxCt[index];
2451  }
2452  bool uniform = (prod == totals[threadIdIndex]);
2453 
2454  //
2455  // When affinity is off, this routine will still be called to set
2456  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2457  // nCoresPerPkg, & nPackages. Make sure all these vars are set
2458  // correctly, and return now if affinity is not enabled.
2459  //
2460  __kmp_ncores = totals[coreIdIndex];
2461 
2462  if (__kmp_affinity_verbose) {
2463  if (! KMP_AFFINITY_CAPABLE()) {
2464  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2465  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2466  if (uniform) {
2467  KMP_INFORM(Uniform, "KMP_AFFINITY");
2468  } else {
2469  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2470  }
2471  }
2472  else {
2473  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2474  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2475  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2476  if (__kmp_affinity_respect_mask) {
2477  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2478  } else {
2479  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2480  }
2481  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2482  if (uniform) {
2483  KMP_INFORM(Uniform, "KMP_AFFINITY");
2484  } else {
2485  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2486  }
2487  }
2488  kmp_str_buf_t buf;
2489  __kmp_str_buf_init(&buf);
2490 
2491  __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2492  for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2493  __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2494  }
2495  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2496  maxCt[threadIdIndex], __kmp_ncores);
2497 
2498  __kmp_str_buf_free(&buf);
2499  }
2500 
2501 # if KMP_MIC && REDUCE_TEAM_SIZE
2502  //
2503  // Set the default team size.
2504  //
2505  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2506  __kmp_dflt_team_nth = teamSize;
2507  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2508  __kmp_dflt_team_nth));
2509  }
2510 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2511 
2512  if (__kmp_affinity_type == affinity_none) {
2513  __kmp_free(lastId);
2514  __kmp_free(totals);
2515  __kmp_free(maxCt);
2516  __kmp_free(counts);
2517  CLEANUP_THREAD_INFO;
2518  return 0;
2519  }
2520 
2521  //
2522  // Count the number of levels which have more nodes at that level than
2523  // at the parent's level (with there being an implicit root node of
2524  // the top level). This is equivalent to saying that there is at least
2525  // one node at this level which has a sibling. These levels are in the
2526  // map, and the package level is always in the map.
2527  //
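 // For example, on 2 packages x 4 cores with 1 thread per core, totals[]
 // is 8 at the thread level, 8 at the core level, and 2 at the package
 // level, so the thread level collapses into the core level and is left
 // out of the map.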
2528  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2529  int level = 0;
2530  for (index = threadIdIndex; index < maxIndex; index++) {
2531  KMP_ASSERT(totals[index] >= totals[index + 1]);
2532  inMap[index] = (totals[index] > totals[index + 1]);
2533  }
2534  inMap[maxIndex] = (totals[maxIndex] > 1);
2535  inMap[pkgIdIndex] = true;
2536 
2537  int depth = 0;
2538  for (index = threadIdIndex; index <= maxIndex; index++) {
2539  if (inMap[index]) {
2540  depth++;
2541  }
2542  }
2543  KMP_ASSERT(depth > 0);
2544 
2545  //
2546  // Construct the data structure that is to be returned.
2547  //
2548  *address2os = (AddrUnsPair*)
2549  __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2550  int pkgLevel = -1;
2551  int coreLevel = -1;
2552  int threadLevel = -1;
2553 
2554  for (i = 0; i < num_avail; ++i) {
2555  Address addr(depth);
2556  unsigned os = threadInfo[i][osIdIndex];
2557  int src_index;
2558  int dst_index = 0;
2559 
2560  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2561  if (! inMap[src_index]) {
2562  continue;
2563  }
2564  addr.labels[dst_index] = threadInfo[i][src_index];
2565  if (src_index == pkgIdIndex) {
2566  pkgLevel = dst_index;
2567  }
2568  else if (src_index == coreIdIndex) {
2569  coreLevel = dst_index;
2570  }
2571  else if (src_index == threadIdIndex) {
2572  threadLevel = dst_index;
2573  }
2574  dst_index++;
2575  }
2576  (*address2os)[i] = AddrUnsPair(addr, os);
2577  }
2578 
2579  if (__kmp_affinity_gran_levels < 0) {
2580  //
2581  // Set the granularity level based on what levels are modeled
2582  // in the machine topology map.
2583  //
2584  unsigned src_index;
2585  __kmp_affinity_gran_levels = 0;
2586  for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2587  if (! inMap[src_index]) {
2588  continue;
2589  }
2590  switch (src_index) {
2591  case threadIdIndex:
2592  if (__kmp_affinity_gran > affinity_gran_thread) {
2593  __kmp_affinity_gran_levels++;
2594  }
2595 
2596  break;
2597  case coreIdIndex:
2598  if (__kmp_affinity_gran > affinity_gran_core) {
2599  __kmp_affinity_gran_levels++;
2600  }
2601  break;
2602 
2603  case pkgIdIndex:
2604  if (__kmp_affinity_gran > affinity_gran_package) {
2605  __kmp_affinity_gran_levels++;
2606  }
2607  break;
2608  }
2609  }
2610  }
2611 
2612  if (__kmp_affinity_verbose) {
2613  __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2614  coreLevel, threadLevel);
2615  }
2616 
2617  __kmp_free(inMap);
2618  __kmp_free(lastId);
2619  __kmp_free(totals);
2620  __kmp_free(maxCt);
2621  __kmp_free(counts);
2622  CLEANUP_THREAD_INFO;
2623  return depth;
2624 }
2625 
2626 
2627 //
2628 // Create and return a table of affinity masks, indexed by OS thread ID.
2629 // This routine handles OR'ing together all the affinity masks of threads
2630 // that are sufficiently close, if granularity > fine.
2631 //
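// For example, with granularity=core on a machine with 2 hardware threads
// per core, the two OS procs sharing a core are "close", and both of their
// entries in osId2Mask end up holding the same two-bit mask.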
2632 static kmp_affin_mask_t *
2633 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2634  AddrUnsPair *address2os, unsigned numAddrs)
2635 {
2636  //
2637  // First form a table of affinity masks in order of OS thread id.
2638  //
2639  unsigned depth;
2640  unsigned maxOsId;
2641  unsigned i;
2642 
2643  KMP_ASSERT(numAddrs > 0);
2644  depth = address2os[0].first.depth;
2645 
2646  maxOsId = 0;
2647  for (i = 0; i < numAddrs; i++) {
2648  unsigned osId = address2os[i].second;
2649  if (osId > maxOsId) {
2650  maxOsId = osId;
2651  }
2652  }
2653  kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2654  (maxOsId + 1) * __kmp_affin_mask_size);
2655 
2656  //
2657  // Sort the address2os table according to physical order. Doing so
2658  // will put all threads on the same core/package/node in consecutive
2659  // locations.
2660  //
2661  qsort(address2os, numAddrs, sizeof(*address2os),
2662  __kmp_affinity_cmp_Address_labels);
2663 
2664  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2665  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2666  KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2667  }
2668  if (__kmp_affinity_gran_levels >= (int)depth) {
2669  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2670  && (__kmp_affinity_type != affinity_none))) {
2671  KMP_WARNING(AffThreadsMayMigrate);
2672  }
2673  }
2674 
2675  //
2676  // Run through the table, forming the masks for all threads on each
2677  // core. Threads on the same core will have identical "Address"
2678  // objects, not considering the last level, which must be the thread
2679  // id. All threads on a core will appear consecutively.
2680  //
2681  unsigned unique = 0;
2682  unsigned j = 0; // index of 1st thread on core
2683  unsigned leader = 0;
2684  Address *leaderAddr = &(address2os[0].first);
2685  kmp_affin_mask_t *sum
2686  = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
2687  KMP_CPU_ZERO(sum);
2688  KMP_CPU_SET(address2os[0].second, sum);
2689  for (i = 1; i < numAddrs; i++) {
2690  //
2691  // If this thread is sufficiently close to the leader (within the
2692  // granularity setting), then set the bit for this os thread in the
2693  // affinity mask for this group, and go on to the next thread.
2694  //
2695  if (leaderAddr->isClose(address2os[i].first,
2696  __kmp_affinity_gran_levels)) {
2697  KMP_CPU_SET(address2os[i].second, sum);
2698  continue;
2699  }
2700 
2701  //
2702  // For every thread in this group, copy the mask to the thread's
2703  // entry in the osId2Mask table. Mark the first address as a
2704  // leader.
2705  //
2706  for (; j < i; j++) {
2707  unsigned osId = address2os[j].second;
2708  KMP_DEBUG_ASSERT(osId <= maxOsId);
2709  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2710  KMP_CPU_COPY(mask, sum);
2711  address2os[j].first.leader = (j == leader);
2712  }
2713  unique++;
2714 
2715  //
2716  // Start a new mask.
2717  //
2718  leader = i;
2719  leaderAddr = &(address2os[i].first);
2720  KMP_CPU_ZERO(sum);
2721  KMP_CPU_SET(address2os[i].second, sum);
2722  }
2723 
2724  //
2725  // For every thread in last group, copy the mask to the thread's
2726  // entry in the osId2Mask table.
2727  //
2728  for (; j < i; j++) {
2729  unsigned osId = address2os[j].second;
2730  KMP_DEBUG_ASSERT(osId <= maxOsId);
2731  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2732  KMP_CPU_COPY(mask, sum);
2733  address2os[j].first.leader = (j == leader);
2734  }
2735  unique++;
2736 
2737  *maxIndex = maxOsId;
2738  *numUnique = unique;
2739  return osId2Mask;
2740 }
2741 
2742 
2743 //
2744 // Stuff for the affinity proclist parsers. It's easier to declare these vars
2745 // as file-static than to try to pass them through the calling sequence of
2746 // the recursive-descent OMP_PLACES parser.
2747 //
2748 static kmp_affin_mask_t *newMasks;
2749 static int numNewMasks;
2750 static int nextNewMask;
2751 
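// ADD_MASK appends a copy of a mask to the newMasks vector, doubling the
// vector's capacity whenever it fills up. ADD_MASK_OSID first validates an
// OS proc id against the osId2Mask table, warning about (and skipping) ids
// that are out of range or not present in the machine model.
 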
2752 #define ADD_MASK(_mask) \
2753  { \
2754  if (nextNewMask >= numNewMasks) { \
2755  numNewMasks *= 2; \
2756  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2757  numNewMasks * __kmp_affin_mask_size); \
2758  } \
2759  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2760  nextNewMask++; \
2761  }
2762 
2763 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2764  { \
2765  if (((_osId) > _maxOsId) || \
2766  (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2767  if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2768  && (__kmp_affinity_type != affinity_none))) { \
2769  KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2770  } \
2771  } \
2772  else { \
2773  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2774  } \
2775  }
2776 
2777 
2778 //
2779 // Re-parse the proclist (for the explicit affinity type), and form the list
2780 // of affinity newMasks indexed by gtid.
2781 //
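// An illustrative proclist (assuming OS procs 0-7 exist):
//
//     KMP_AFFINITY="proclist=[0,{2,3},4-7:2],explicit"
//
// yields one mask per entry: {0}, {2,3}, {4}, and {6}.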
2782 static void
2783 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2784  unsigned int *out_numMasks, const char *proclist,
2785  kmp_affin_mask_t *osId2Mask, int maxOsId)
2786 {
2787  const char *scan = proclist;
2788  const char *next = proclist;
2789 
2790  //
2791  // We use malloc() for the temporary mask vector,
2792  // so that we can use realloc() to extend it.
2793  //
2794  numNewMasks = 2;
2795  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2796  * __kmp_affin_mask_size);
2797  nextNewMask = 0;
2798  kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2799  __kmp_affin_mask_size);
2800  int setSize = 0;
2801 
2802  for (;;) {
2803  int start, end, stride;
2804 
2805  SKIP_WS(scan);
2806  next = scan;
2807  if (*next == '\0') {
2808  break;
2809  }
2810 
2811  if (*next == '{') {
2812  int num;
2813  setSize = 0;
2814  next++; // skip '{'
2815  SKIP_WS(next);
2816  scan = next;
2817 
2818  //
2819  // Read the first integer in the set.
2820  //
2821  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2822  "bad proclist");
2823  SKIP_DIGITS(next);
2824  num = __kmp_str_to_int(scan, *next);
2825  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2826 
2827  //
2828  // Copy the mask for that osId to the sum (union) mask.
2829  //
2830  if ((num > maxOsId) ||
2831  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2832  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2833  && (__kmp_affinity_type != affinity_none))) {
2834  KMP_WARNING(AffIgnoreInvalidProcID, num);
2835  }
2836  KMP_CPU_ZERO(sumMask);
2837  }
2838  else {
2839  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2840  setSize = 1;
2841  }
2842 
2843  for (;;) {
2844  //
2845  // Check for end of set.
2846  //
2847  SKIP_WS(next);
2848  if (*next == '}') {
2849  next++; // skip '}'
2850  break;
2851  }
2852 
2853  //
2854  // Skip optional comma.
2855  //
2856  if (*next == ',') {
2857  next++;
2858  }
2859  SKIP_WS(next);
2860 
2861  //
2862  // Read the next integer in the set.
2863  //
2864  scan = next;
2865  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2866  "bad explicit proc list");
2867 
2868  SKIP_DIGITS(next);
2869  num = __kmp_str_to_int(scan, *next);
2870  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2871 
2872  //
2873  // Add the mask for that osId to the sum mask.
2874  //
2875  if ((num > maxOsId) ||
2876  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2877  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2878  && (__kmp_affinity_type != affinity_none))) {
2879  KMP_WARNING(AffIgnoreInvalidProcID, num);
2880  }
2881  }
2882  else {
2883  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2884  setSize++;
2885  }
2886  }
2887  if (setSize > 0) {
2888  ADD_MASK(sumMask);
2889  }
2890 
2891  SKIP_WS(next);
2892  if (*next == ',') {
2893  next++;
2894  }
2895  scan = next;
2896  continue;
2897  }
2898 
2899  //
2900  // Read the first integer.
2901  //
2902  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2903  SKIP_DIGITS(next);
2904  start = __kmp_str_to_int(scan, *next);
2905  KMP_ASSERT2(start >= 0, "bad explicit proc list");
2906  SKIP_WS(next);
2907 
2908  //
2909  // If this isn't a range, then add a mask to the list and go on.
2910  //
2911  if (*next != '-') {
2912  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2913 
2914  //
2915  // Skip optional comma.
2916  //
2917  if (*next == ',') {
2918  next++;
2919  }
2920  scan = next;
2921  continue;
2922  }
2923 
2924  //
2925  // This is a range. Skip over the '-' and read in the 2nd int.
2926  //
2927  next++; // skip '-'
2928  SKIP_WS(next);
2929  scan = next;
2930  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2931  SKIP_DIGITS(next);
2932  end = __kmp_str_to_int(scan, *next);
2933  KMP_ASSERT2(end >= 0, "bad explicit proc list");
2934 
2935  //
2936  // Check for a stride parameter
2937  //
2938  stride = 1;
2939  SKIP_WS(next);
2940  if (*next == ':') {
2941  //
2942  // A stride is specified. Skip over the ':' and read the 3rd int.
2943  //
2944  int sign = +1;
2945  next++; // skip ':'
2946  SKIP_WS(next);
2947  scan = next;
2948  if (*next == '-') {
2949  sign = -1;
2950  next++;
2951  SKIP_WS(next);
2952  scan = next;
2953  }
2954  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2955  "bad explicit proc list");
2956  SKIP_DIGITS(next);
2957  stride = __kmp_str_to_int(scan, *next);
2958  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2959  stride *= sign;
2960  }
2961 
2962  //
2963  // Do some range checks.
2964  //
2965  KMP_ASSERT2(stride != 0, "bad explicit proc list");
2966  if (stride > 0) {
2967  KMP_ASSERT2(start <= end, "bad explicit proc list");
2968  }
2969  else {
2970  KMP_ASSERT2(start >= end, "bad explicit proc list");
2971  }
2972  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2973 
2974  //
2975  // Add the mask for each OS proc # to the list.
2976  //
2977  if (stride > 0) {
2978  do {
2979  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2980  start += stride;
2981  } while (start <= end);
2982  }
2983  else {
2984  do {
2985  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2986  start += stride;
2987  } while (start >= end);
2988  }
2989 
2990  //
2991  // Skip optional comma.
2992  //
2993  SKIP_WS(next);
2994  if (*next == ',') {
2995  next++;
2996  }
2997  scan = next;
2998  }
2999 
3000  *out_numMasks = nextNewMask;
3001  if (nextNewMask == 0) {
3002  *out_masks = NULL;
3003  KMP_INTERNAL_FREE(newMasks);
3004  return;
3005  }
3006  *out_masks
3007  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3008  KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3009  __kmp_free(sumMask);
3010  KMP_INTERNAL_FREE(newMasks);
3011 }
3012 
3013 
3014 # if OMP_40_ENABLED
3015 
3016 /*-----------------------------------------------------------------------------
3017 
3018 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3019 places. Again, here is the grammar:
3020 
3021 place_list := place
3022 place_list := place , place_list
3023 place := num
3024 place := place : num
3025 place := place : num : signed
3026 place := { subplace_list }
3027 place := ! place // (lowest priority)
3028 subplace_list := subplace
3029 subplace_list := subplace , subplace_list
3030 subplace := num
3031 subplace := num : num
3032 subplace := num : num : signed
3033 signed := num
3034 signed := + signed
3035 signed := - signed
3036 
3037 -----------------------------------------------------------------------------*/
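 
// An illustrative place list (assuming OS procs 0-7 exist):
//
//     OMP_PLACES="{0,1},{2,3}:3:2"
//
// names the place {0,1}, then the place {2,3} replicated for a total of
// three places at a stride of 2 OS procs: {2,3}, {4,5}, {6,7}.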
3038 
3039 static void
3040 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3041  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3042 {
3043  const char *next;
3044 
3045  for (;;) {
3046  int start, count, stride, i;
3047 
3048  //
3049  // Read in the starting proc id
3050  //
3051  SKIP_WS(*scan);
3052  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3053  "bad explicit places list");
3054  next = *scan;
3055  SKIP_DIGITS(next);
3056  start = __kmp_str_to_int(*scan, *next);
3057  KMP_ASSERT(start >= 0);
3058  *scan = next;
3059 
3060  //
3061  // valid follow sets are ',' ':' and '}'
3062  //
3063  SKIP_WS(*scan);
3064  if (**scan == '}' || **scan == ',') {
3065  if ((start > maxOsId) ||
3066  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3067  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3068  && (__kmp_affinity_type != affinity_none))) {
3069  KMP_WARNING(AffIgnoreInvalidProcID, start);
3070  }
3071  }
3072  else {
3073  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3074  (*setSize)++;
3075  }
3076  if (**scan == '}') {
3077  break;
3078  }
3079  (*scan)++; // skip ','
3080  continue;
3081  }
3082  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3083  (*scan)++; // skip ':'
3084 
3085  //
3086  // Read count parameter
3087  //
3088  SKIP_WS(*scan);
3089  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3090  "bad explicit places list");
3091  next = *scan;
3092  SKIP_DIGITS(next);
3093  count = __kmp_str_to_int(*scan, *next);
3094  KMP_ASSERT(count >= 0);
3095  *scan = next;
3096 
3097  //
3098  // valid follow sets are ',' ':' and '}'
3099  //
3100  SKIP_WS(*scan);
3101  if (**scan == '}' || **scan == ',') {
3102  for (i = 0; i < count; i++) {
3103  if ((start > maxOsId) ||
3104  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3105  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3106  && (__kmp_affinity_type != affinity_none))) {
3107  KMP_WARNING(AffIgnoreInvalidProcID, start);
3108  }
3109  break; // don't proliferate warnings for large count
3110  }
3111  else {
3112  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3113  start++;
3114  (*setSize)++;
3115  }
3116  }
3117  if (**scan == '}') {
3118  break;
3119  }
3120  (*scan)++; // skip ','
3121  continue;
3122  }
3123  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3124  (*scan)++; // skip ':'
3125 
3126  //
3127  // Read stride parameter
3128  //
3129  int sign = +1;
3130  for (;;) {
3131  SKIP_WS(*scan);
3132  if (**scan == '+') {
3133  (*scan)++; // skip '+'
3134  continue;
3135  }
3136  if (**scan == '-') {
3137  sign *= -1;
3138  (*scan)++; // skip '-'
3139  continue;
3140  }
3141  break;
3142  }
3143  SKIP_WS(*scan);
3144  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3145  "bad explicit places list");
3146  next = *scan;
3147  SKIP_DIGITS(next);
3148  stride = __kmp_str_to_int(*scan, *next);
3149  KMP_ASSERT(stride >= 0);
3150  *scan = next;
3151  stride *= sign;
3152 
3153  //
3154  // valid follow sets are ',' and '}'
3155  //
3156  SKIP_WS(*scan);
3157  if (**scan == '}' || **scan == ',') {
3158  for (i = 0; i < count; i++) {
3159  if ((start > maxOsId) ||
3160  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3161  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3162  && (__kmp_affinity_type != affinity_none))) {
3163  KMP_WARNING(AffIgnoreInvalidProcID, start);
3164  }
3165  break; // don't proliferate warnings for large count
3166  }
3167  else {
3168  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3169  start += stride;
3170  (*setSize)++;
3171  }
3172  }
3173  if (**scan == '}') {
3174  break;
3175  }
3176  (*scan)++; // skip ','
3177  continue;
3178  }
3179 
3180  KMP_ASSERT2(0, "bad explicit places list");
3181  }
3182 }
3183 
3184 
3185 static void
3186 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3187  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3188 {
3189  const char *next;
3190 
3191  //
3192  // valid follow sets are '{' '!' and num
3193  //
3194  SKIP_WS(*scan);
3195  if (**scan == '{') {
3196  (*scan)++; // skip '{'
3197  __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3198  setSize);
3199  KMP_ASSERT2(**scan == '}', "bad explicit places list");
3200  (*scan)++; // skip '}'
3201  }
3202  else if (**scan == '!') {
3203  __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3204  KMP_CPU_COMPLEMENT(tempMask);
3205  (*scan)++; // skip '!'
3206  }
3207  else if ((**scan >= '0') && (**scan <= '9')) {
3208  next = *scan;
3209  SKIP_DIGITS(next);
3210  int num = __kmp_str_to_int(*scan, *next);
3211  KMP_ASSERT(num >= 0);
3212  if ((num > maxOsId) ||
3213  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3214  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3215  && (__kmp_affinity_type != affinity_none))) {
3216  KMP_WARNING(AffIgnoreInvalidProcID, num);
3217  }
3218  }
3219  else {
3220  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3221  (*setSize)++;
3222  }
3223  *scan = next; // skip num
3224  }
3225  else {
3226  KMP_ASSERT2(0, "bad explicit places list");
3227  }
3228 }
3229 
3230 
3231 //static void
3232 void
3233 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3234  unsigned int *out_numMasks, const char *placelist,
3235  kmp_affin_mask_t *osId2Mask, int maxOsId)
3236 {
3237  const char *scan = placelist;
3238  const char *next = placelist;
3239 
3240  numNewMasks = 2;
3241  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3242  * __kmp_affin_mask_size);
3243  nextNewMask = 0;
3244 
3245  kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3246  __kmp_affin_mask_size);
3247  KMP_CPU_ZERO(tempMask);
3248  int setSize = 0;
3249 
3250  for (;;) {
3251  __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3252 
3253  //
3254  // valid follow sets are ',' ':' and EOL
3255  //
3256  SKIP_WS(scan);
3257  if (*scan == '\0' || *scan == ',') {
3258  if (setSize > 0) {
3259  ADD_MASK(tempMask);
3260  }
3261  KMP_CPU_ZERO(tempMask);
3262  setSize = 0;
3263  if (*scan == '\0') {
3264  break;
3265  }
3266  scan++; // skip ','
3267  continue;
3268  }
3269 
3270  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3271  scan++; // skip ':'
3272 
3273  //
3274  // Read count parameter
3275  //
3276  SKIP_WS(scan);
3277  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3278  "bad explicit places list");
3279  next = scan;
3280  SKIP_DIGITS(next);
3281  int count = __kmp_str_to_int(scan, *next);
3282  KMP_ASSERT(count >= 0);
3283  scan = next;
3284 
3285  //
3286  // valid follow sets are ',' ':' and EOL
3287  //
3288  SKIP_WS(scan);
3289  int stride;
3290  if (*scan == '\0' || *scan == ',') {
3291  stride = +1;
3292  }
3293  else {
3294  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3295  scan++; // skip ':'
3296 
3297  //
3298  // Read stride parameter
3299  //
3300  int sign = +1;
3301  for (;;) {
3302  SKIP_WS(scan);
3303  if (*scan == '+') {
3304  scan++; // skip '+'
3305  continue;
3306  }
3307  if (*scan == '-') {
3308  sign *= -1;
3309  scan++; // skip '-'
3310  continue;
3311  }
3312  break;
3313  }
3314  SKIP_WS(scan);
3315  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3316  "bad explicit places list");
3317  next = scan;
3318  SKIP_DIGITS(next);
3319  stride = __kmp_str_to_int(scan, *next);
3320  KMP_DEBUG_ASSERT(stride >= 0);
3321  scan = next;
3322  stride *= sign;
3323  }
3324 
3325  if (stride > 0) {
3326  int i;
3327  for (i = 0; i < count; i++) {
3328  int j;
3329  if (setSize == 0) {
3330  break;
3331  }
3332  ADD_MASK(tempMask);
3333  setSize = 0;
3334  for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3335  if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3336  KMP_CPU_CLR(j, tempMask);
3337  }
3338  else if ((j > maxOsId) ||
3339  (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3340  if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3341  && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3342  KMP_WARNING(AffIgnoreInvalidProcID, j);
3343  }
3344  KMP_CPU_CLR(j, tempMask);
3345  }
3346  else {
3347  KMP_CPU_SET(j, tempMask);
3348  setSize++;
3349  }
3350  }
3351  for (; j >= 0; j--) {
3352  KMP_CPU_CLR(j, tempMask);
3353  }
3354  }
3355  }
3356  else {
3357  int i;
3358  for (i = 0; i < count; i++) {
3359  int j;
3360  if (setSize == 0) {
3361  break;
3362  }
3363  ADD_MASK(tempMask);
3364  setSize = 0;
3365  for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
3366  j++) {
3367  if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3368  KMP_CPU_CLR(j, tempMask);
3369  }
3370  else if ((j > maxOsId) ||
3371  (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3372  if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3373  && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3374  KMP_WARNING(AffIgnoreInvalidProcID, j);
3375  }
3376  KMP_CPU_CLR(j, tempMask);
3377  }
3378  else {
3379  KMP_CPU_SET(j, tempMask);
3380  setSize++;
3381  }
3382  }
3383  for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3384  KMP_CPU_CLR(j, tempMask);
3385  }
3386  }
3387  }
3388  KMP_CPU_ZERO(tempMask);
3389  setSize = 0;
3390 
3391  //
3392  // valid follow sets are ',' and EOL
3393  //
3394  SKIP_WS(scan);
3395  if (*scan == '\0') {
3396  break;
3397  }
3398  if (*scan == ',') {
3399  scan++; // skip ','
3400  continue;
3401  }
3402 
3403  KMP_ASSERT2(0, "bad explicit places list");
3404  }
3405 
3406  *out_numMasks = nextNewMask;
3407  if (nextNewMask == 0) {
3408  *out_masks = NULL;
3409  KMP_INTERNAL_FREE(newMasks);
3410  return;
3411  }
3412  *out_masks
3413  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3414  KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3415  __kmp_free(tempMask);
3416  KMP_INTERNAL_FREE(newMasks);
3417 }
3418 
3419 # endif /* OMP_40_ENABLED */
3420 
3421 #undef ADD_MASK
3422 #undef ADD_MASK_OSID
3423 
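//
// Trim the topology map according to the thread-placement settings
// (__kmp_place_num_cores, __kmp_place_num_threads_per_core, and
// __kmp_place_core_offset), keeping only the requested cores per package
// and hardware contexts per core, then fix up the global topology counts.
//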
3424 static void
3425 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3426 {
3427  if ( __kmp_place_num_cores == 0 ) {
3428  if ( __kmp_place_num_threads_per_core == 0 ) {
3429  return; // no cores limiting actions requested, exit
3430  }
3431  __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3432  }
3433  if ( !__kmp_affinity_uniform_topology() ) {
3434  KMP_WARNING( AffThrPlaceNonUniform );
3435  return; // don't support non-uniform topology
3436  }
3437  if ( depth != 3 ) {
3438  KMP_WARNING( AffThrPlaceNonThreeLevel );
3439  return; // don't support non-3-level topology
3440  }
3441  if ( __kmp_place_num_threads_per_core == 0 ) {
3442  __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3443  }
3444  if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3445  KMP_WARNING( AffThrPlaceManyCores );
3446  return;
3447  }
3448 
3449  AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3450  nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3451  int i, j, k, n_old = 0, n_new = 0;
3452  for ( i = 0; i < nPackages; ++i ) {
3453  for ( j = 0; j < nCoresPerPkg; ++j ) {
3454  if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3455  n_old += __kmp_nThreadsPerCore; // skip not-requested core
3456  } else {
3457  for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3458  if ( k < __kmp_place_num_threads_per_core ) {
3459  newAddr[n_new] = (*pAddr)[n_old]; // copy requested core's data to new location
3460  n_new++;
3461  }
3462  n_old++;
3463  }
3464  }
3465  }
3466  }
3467  nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3468  __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3469  __kmp_avail_proc = n_new; // correct avail_proc
3470  __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3471 
3472  __kmp_free( *pAddr );
3473  *pAddr = newAddr; // replace old topology with new one
3474 }
3475 
3476 
3477 static AddrUnsPair *address2os = NULL;
3478 static int * procarr = NULL;
3479 static int __kmp_aff_depth = 0;
3480 
3481 static void
3482 __kmp_aux_affinity_initialize(void)
3483 {
3484  if (__kmp_affinity_masks != NULL) {
3485  KMP_ASSERT(fullMask != NULL);
3486  return;
3487  }
3488 
3489  //
3490  // Create the "full" mask - this defines all of the processors that we
3491  // consider to be in the machine model. If respect is set, then it is
3492  // the initialization thread's affinity mask. Otherwise, it is all
3493  // processors that we know about on the machine.
3494  //
3495  if (fullMask == NULL) {
3496  fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3497  }
3498  if (KMP_AFFINITY_CAPABLE()) {
3499  if (__kmp_affinity_respect_mask) {
3500  __kmp_get_system_affinity(fullMask, TRUE);
3501 
3502  //
3503  // Count the number of available processors.
3504  //
3505  unsigned i;
3506  __kmp_avail_proc = 0;
3507  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3508  if (! KMP_CPU_ISSET(i, fullMask)) {
3509  continue;
3510  }
3511  __kmp_avail_proc++;
3512  }
3513  if (__kmp_avail_proc > __kmp_xproc) {
3514  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3515  && (__kmp_affinity_type != affinity_none))) {
3516  KMP_WARNING(ErrorInitializeAffinity);
3517  }
3518  __kmp_affinity_type = affinity_none;
3519  KMP_AFFINITY_DISABLE();
3520  return;
3521  }
3522  }
3523  else {
3524  __kmp_affinity_entire_machine_mask(fullMask);
3525  __kmp_avail_proc = __kmp_xproc;
3526  }
3527  }
3528 
3529  int depth = -1;
3530  kmp_i18n_id_t msg_id = kmp_i18n_null;
3531 
3532  //
3533  // For backward compatibility, setting KMP_CPUINFO_FILE =>
3534  // KMP_TOPOLOGY_METHOD=cpuinfo
3535  //
3536  if ((__kmp_cpuinfo_file != NULL) &&
3537  (__kmp_affinity_top_method == affinity_top_method_all)) {
3538  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3539  }
3540 
3541  if (__kmp_affinity_top_method == affinity_top_method_all) {
3542  //
3543  // In the default code path, errors are not fatal - we just try using
3544  // another method. We only emit a warning message if affinity is on,
3545  // or the verbose flag is set, and the nowarnings flag was not set.
3546  //
3547  const char *file_name = NULL;
3548  int line = 0;
3549 
3550 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3551 
3552  if (__kmp_affinity_verbose) {
3553  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3554  }
3555 
3556  file_name = NULL;
3557  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3558  if (depth == 0) {
3559  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3560  KMP_ASSERT(address2os == NULL);
3561  return;
3562  }
3563 
3564  if (depth < 0) {
3565  if (__kmp_affinity_verbose) {
3566  if (msg_id != kmp_i18n_null) {
3567  KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3568  KMP_I18N_STR(DecodingLegacyAPIC));
3569  }
3570  else {
3571  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3572  }
3573  }
3574 
3575  file_name = NULL;
3576  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3577  if (depth == 0) {
3578  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3579  KMP_ASSERT(address2os == NULL);
3580  return;
3581  }
3582  }
3583 
3584 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3585 
3586 # if KMP_OS_LINUX
3587 
3588  if (depth < 0) {
3589  if (__kmp_affinity_verbose) {
3590  if (msg_id != kmp_i18n_null) {
3591  KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3592  }
3593  else {
3594  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3595  }
3596  }
3597 
3598  FILE *f = fopen("/proc/cpuinfo", "r");
3599  if (f == NULL) {
3600  msg_id = kmp_i18n_str_CantOpenCpuinfo;
3601  }
3602  else {
3603  file_name = "/proc/cpuinfo";
3604  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3605  fclose(f);
3606  if (depth == 0) {
3607  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3608  KMP_ASSERT(address2os == NULL);
3609  return;
3610  }
3611  }
3612  }
3613 
3614 # endif /* KMP_OS_LINUX */
3615 
3616 # if KMP_GROUP_AFFINITY
3617 
3618  if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3619  if (__kmp_affinity_verbose) {
3620  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3621  }
3622 
3623  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3624  KMP_ASSERT(depth != 0);
3625  }
3626 
3627 # endif /* KMP_GROUP_AFFINITY */
3628 
3629  if (depth < 0) {
3630  if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3631  if (file_name == NULL) {
3632  KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3633  }
3634  else if (line == 0) {
3635  KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3636  }
3637  else {
3638  KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3639  }
3640  }
3641  // FIXME - print msg if msg_id = kmp_i18n_null ???
3642 
3643  file_name = "";
3644  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3645  if (depth == 0) {
3646  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3647  KMP_ASSERT(address2os == NULL);
3648  return;
3649  }
3650  KMP_ASSERT(depth > 0);
3651  KMP_ASSERT(address2os != NULL);
3652  }
3653  }
3654 
3655  //
3656  // If the user has specified that a particular topology discovery method
3657  // is to be used, then we abort if that method fails. The exception is
3658  // group affinity, which might have been implicitly set.
3659  //
3660 
3661 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3662 
3663  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3664  if (__kmp_affinity_verbose) {
3665  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3666  KMP_I18N_STR(Decodingx2APIC));
3667  }
3668 
3669  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3670  if (depth == 0) {
3671  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3672  KMP_ASSERT(address2os == NULL);
3673  return;
3674  }
3675  if (depth < 0) {
3676  KMP_ASSERT(msg_id != kmp_i18n_null);
3677  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3678  }
3679  }
3680  else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3681  if (__kmp_affinity_verbose) {
3682  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3683  KMP_I18N_STR(DecodingLegacyAPIC));
3684  }
3685 
3686  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3687  if (depth == 0) {
3688  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3689  KMP_ASSERT(address2os == NULL);
3690  return;
3691  }
3692  if (depth < 0) {
3693  KMP_ASSERT(msg_id != kmp_i18n_null);
3694  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3695  }
3696  }
3697 
3698 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3699 
3700  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3701  const char *filename;
3702  if (__kmp_cpuinfo_file != NULL) {
3703  filename = __kmp_cpuinfo_file;
3704  }
3705  else {
3706  filename = "/proc/cpuinfo";
3707  }
3708 
3709  if (__kmp_affinity_verbose) {
3710  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3711  }
3712 
3713  FILE *f = fopen(filename, "r");
3714  if (f == NULL) {
3715  int code = errno;
3716  if (__kmp_cpuinfo_file != NULL) {
3717  __kmp_msg(
3718  kmp_ms_fatal,
3719  KMP_MSG(CantOpenFileForReading, filename),
3720  KMP_ERR(code),
3721  KMP_HNT(NameComesFrom_CPUINFO_FILE),
3722  __kmp_msg_null
3723  );
3724  }
3725  else {
3726  __kmp_msg(
3727  kmp_ms_fatal,
3728  KMP_MSG(CantOpenFileForReading, filename),
3729  KMP_ERR(code),
3730  __kmp_msg_null
3731  );
3732  }
3733  }
3734  int line = 0;
3735  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3736  fclose(f);
3737  if (depth < 0) {
3738  KMP_ASSERT(msg_id != kmp_i18n_null);
3739  if (line > 0) {
3740  KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3741  }
3742  else {
3743  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3744  }
3745  }
3746  if (__kmp_affinity_type == affinity_none) {
3747  KMP_ASSERT(depth == 0);
3748  KMP_ASSERT(address2os == NULL);
3749  return;
3750  }
3751  }
3752 
3753 # if KMP_GROUP_AFFINITY
3754 
3755  else if (__kmp_affinity_top_method == affinity_top_method_group) {
3756  if (__kmp_affinity_verbose) {
3757  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3758  }
3759 
3760  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3761  KMP_ASSERT(depth != 0);
3762  if (depth < 0) {
3763  KMP_ASSERT(msg_id != kmp_i18n_null);
3764  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3765  }
3766  }
3767 
3768 # endif /* KMP_GROUP_AFFINITY */
3769 
3770  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3771  if (__kmp_affinity_verbose) {
3772  KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3773  }
3774 
3775  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3776  if (depth == 0) {
3777  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3778  KMP_ASSERT(address2os == NULL);
3779  return;
3780  }
3781  // should not fail
3782  KMP_ASSERT(depth > 0);
3783  KMP_ASSERT(address2os != NULL);
3784  }
3785 
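 //
 // If no topology map could be built at all, warn (when appropriate) and
 // run with affinity disabled.
 //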
3786  if (address2os == NULL) {
3787  if (KMP_AFFINITY_CAPABLE()
3788  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3789  && (__kmp_affinity_type != affinity_none)))) {
3790  KMP_WARNING(ErrorInitializeAffinity);
3791  }
3792  __kmp_affinity_type = affinity_none;
3793  KMP_AFFINITY_DISABLE();
3794  return;
3795  }
3796 
3797  __kmp_apply_thread_places(&address2os, depth);
3798 
3799  //
3800  // Create the table of masks, indexed by thread Id.
3801  //
3802  unsigned maxIndex;
3803  unsigned numUnique;
3804  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3805  address2os, __kmp_avail_proc);
3806  if (__kmp_affinity_gran_levels == 0) {
3807  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3808  }
3809 
3810  //
3811  // Set the childNums vector in all Address objects. This must be done
3812  // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3813  // which takes into account the setting of __kmp_affinity_compact.
3814  //
3815  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3816 
3817  switch (__kmp_affinity_type) {
3818 
3819  case affinity_explicit:
3820  KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3821 # if OMP_40_ENABLED
3822  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3823 # endif
3824  {
3825  __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3826  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3827  maxIndex);
3828  }
3829 # if OMP_40_ENABLED
3830  else {
3831  __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3832  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3833  maxIndex);
3834  }
3835 # endif
3836  if (__kmp_affinity_num_masks == 0) {
3837  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3838  && (__kmp_affinity_type != affinity_none))) {
3839  KMP_WARNING(AffNoValidProcID);
3840  }
3841  __kmp_affinity_type = affinity_none;
3842  return;
3843  }
3844  break;
3845 
3846  //
3847  // The other affinity types rely on sorting the Addresses according
3848  // to some permutation of the machine topology tree. Set
3849  // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3850  // then jump to a common code fragment to do the sort and create
3851  // the array of affinity masks.
3852  //
3853 
3854  case affinity_logical:
3855  __kmp_affinity_compact = 0;
3856  if (__kmp_affinity_offset) {
3857  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3858  % __kmp_avail_proc;
3859  }
3860  goto sortAddresses;
3861 
3862  case affinity_physical:
3863  if (__kmp_nThreadsPerCore > 1) {
3864  __kmp_affinity_compact = 1;
3865  if (__kmp_affinity_compact >= depth) {
3866  __kmp_affinity_compact = 0;
3867  }
3868  } else {
3869  __kmp_affinity_compact = 0;
3870  }
3871  if (__kmp_affinity_offset) {
3872  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3873  % __kmp_avail_proc;
3874  }
3875  goto sortAddresses;
3876 
3877  case affinity_scatter:
3878  if (__kmp_affinity_compact >= depth) {
3879  __kmp_affinity_compact = 0;
3880  }
3881  else {
3882  __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3883  }
3884  goto sortAddresses;
3885 
3886  case affinity_compact:
3887  if (__kmp_affinity_compact >= depth) {
3888  __kmp_affinity_compact = depth - 1;
3889  }
3890  goto sortAddresses;
3891 
3892  case affinity_balanced:
3893  // Balanced works only for the case of a single package
3894  if( nPackages > 1 ) {
3895  if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3896  KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3897  }
3898  __kmp_affinity_type = affinity_none;
3899  return;
3900  } else if( __kmp_affinity_uniform_topology() ) {
3901  break;
3902  } else { // Non-uniform topology
3903 
3904  // Save the depth for further usage
3905  __kmp_aff_depth = depth;
3906 
3907  // Number of hyper threads per core in HT machine
3908  int nth_per_core = __kmp_nThreadsPerCore;
3909 
3910  int core_level;
3911  if( nth_per_core > 1 ) {
3912  core_level = depth - 2;
3913  } else {
3914  core_level = depth - 1;
3915  }
3916  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3917  int nproc = nth_per_core * ncores;
3918 
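 // procarr[] is an (ncores x nth_per_core) table indexed by
 // [core * nth_per_core + thread]; each slot holds the OS proc id bound to
 // that thread context, or -1 if the slot is unused (possible on
 // non-uniform topologies).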
3919  procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3920  for( int i = 0; i < nproc; i++ ) {
3921  procarr[ i ] = -1;
3922  }
3923 
3924  for( int i = 0; i < __kmp_avail_proc; i++ ) {
3925  int proc = address2os[ i ].second;
3926  // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3927  // If there is only one thread per core then depth == 2: level 0 - package,
3928  // level 1 - core.
3929  int level = depth - 1;
3930 
3931  // Defaults for the single-thread-per-core case (nth_per_core == 1)
3932  int thread = 0;
3933  int core = address2os[ i ].first.labels[ level ];
3934  // If the thread level exists, that is, we have more than one thread context per core
3935  if( nth_per_core > 1 ) {
3936  thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3937  core = address2os[ i ].first.labels[ level - 1 ];
3938  }
3939  procarr[ core * nth_per_core + thread ] = proc;
3940  }
3941 
3942  break;
3943  }
3944 
3945  sortAddresses:
3946  //
3947  // Allocate the gtid->affinity mask table.
3948  //
3949  if (__kmp_affinity_dups) {
3950  __kmp_affinity_num_masks = __kmp_avail_proc;
3951  }
3952  else {
3953  __kmp_affinity_num_masks = numUnique;
3954  }
3955 
3956 # if OMP_40_ENABLED
3957  if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3958  && ( __kmp_affinity_num_places > 0 )
3959  && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3960  __kmp_affinity_num_masks = __kmp_affinity_num_places;
3961  }
3962 # endif
3963 
3964  __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3965  __kmp_affinity_num_masks * __kmp_affin_mask_size);
3966 
3967  //
3968  // Sort the address2os table according to the current setting of
3969  // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3970  //
3971  qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3972  __kmp_affinity_cmp_Address_child_num);
3973  {
3974  int i;
3975  unsigned j;
3976  for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3977  if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3978  continue;
3979  }
3980  unsigned osId = address2os[i].second;
3981  kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3982  kmp_affin_mask_t *dest
3983  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3984  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3985  KMP_CPU_COPY(dest, src);
3986  if (++j >= __kmp_affinity_num_masks) {
3987  break;
3988  }
3989  }
3990  KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3991  }
3992  break;
3993 
3994  default:
3995  KMP_ASSERT2(0, "Unexpected affinity setting");
3996  }
3997 
3998  __kmp_free(osId2Mask);
3999  machine_hierarchy.init(address2os, __kmp_avail_proc);
4000 }
4001 
4002 
4003 void
4004 __kmp_affinity_initialize(void)
4005 {
4006  //
4007  // Much of the code above was written assuming that if a machine was not
4008  // affinity capable, then __kmp_affinity_type == affinity_none. We now
4009  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4010  //
4011  // There are too many checks for __kmp_affinity_type == affinity_none
4012  // in this code. Instead of trying to change them all, check if
4013  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4014  // affinity_none, call the real initialization routine, then restore
4015  // __kmp_affinity_type to affinity_disabled.
4016  //
4017  int disabled = (__kmp_affinity_type == affinity_disabled);
4018  if (! KMP_AFFINITY_CAPABLE()) {
4019  KMP_ASSERT(disabled);
4020  }
4021  if (disabled) {
4022  __kmp_affinity_type = affinity_none;
4023  }
4024  __kmp_aux_affinity_initialize();
4025  if (disabled) {
4026  __kmp_affinity_type = affinity_disabled;
4027  }
4028 }
4029 
4030 
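//
// Release all affinity state allocated during initialization: the mask
// table, the full machine mask, the proclist, the address2os map, and the
// balanced-affinity procarr table.
//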
4031 void
4032 __kmp_affinity_uninitialize(void)
4033 {
4034  if (__kmp_affinity_masks != NULL) {
4035  __kmp_free(__kmp_affinity_masks);
4036  __kmp_affinity_masks = NULL;
4037  }
4038  if (fullMask != NULL) {
4039  KMP_CPU_FREE(fullMask);
4040  fullMask = NULL;
4041  }
4042  __kmp_affinity_num_masks = 0;
4043 # if OMP_40_ENABLED
4044  __kmp_affinity_num_places = 0;
4045 # endif
4046  if (__kmp_affinity_proclist != NULL) {
4047  __kmp_free(__kmp_affinity_proclist);
4048  __kmp_affinity_proclist = NULL;
4049  }
4050  if( address2os != NULL ) {
4051  __kmp_free( address2os );
4052  address2os = NULL;
4053  }
4054  if( procarr != NULL ) {
4055  __kmp_free( procarr );
4056  procarr = NULL;
4057  }
4058 }
4059 
4060 
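//
// Compute and install the initial affinity mask for thread gtid: either the
// full machine mask, or the entry of __kmp_affinity_masks selected by
// (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks.
//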
4061 void
4062 __kmp_affinity_set_init_mask(int gtid, int isa_root)
4063 {
4064  if (! KMP_AFFINITY_CAPABLE()) {
4065  return;
4066  }
4067 
4068  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4069  if (th->th.th_affin_mask == NULL) {
4070  KMP_CPU_ALLOC(th->th.th_affin_mask);
4071  }
4072  else {
4073  KMP_CPU_ZERO(th->th.th_affin_mask);
4074  }
4075 
4076  //
4077  // Copy the thread mask to the kmp_info_t structure.
4078  // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
4079  // that has all of the OS proc ids set, or, if __kmp_affinity_respect_mask
4080  // is set, the full mask is the same as the mask of the initialization
4081  // thread.
4082  //
4083  kmp_affin_mask_t *mask;
4084  int i;
4085 
4086 # if OMP_40_ENABLED
4087  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4088 # endif
4089  {
4090  if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
4091  ) {
4092 # if KMP_GROUP_AFFINITY
4093  if (__kmp_num_proc_groups > 1) {
4094  return;
4095  }
4096 # endif
4097  KMP_ASSERT(fullMask != NULL);
4098  i = KMP_PLACE_ALL;
4099  mask = fullMask;
4100  }
4101  else {
4102  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4103  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4104  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4105  }
4106  }
4107 # if OMP_40_ENABLED
4108  else {
4109  if ((! isa_root)
4110  || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4111 # if KMP_GROUP_AFFINITY
4112  if (__kmp_num_proc_groups > 1) {
4113  return;
4114  }
4115 # endif
4116  KMP_ASSERT(fullMask != NULL);
4117  i = KMP_PLACE_ALL;
4118  mask = fullMask;
4119  }
4120  else {
4121  //
4122  // int i = some hash function or just a counter that doesn't
4123  // always start at 0. Use gtid for now.
4124  //
4125  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4126  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4127  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4128  }
4129  }
4130 # endif
4131 
4132 # if OMP_40_ENABLED
4133  th->th.th_current_place = i;
4134  if (isa_root) {
4135  th->th.th_new_place = i;
4136  th->th.th_first_place = 0;
4137  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4138  }
4139 
4140  if (i == KMP_PLACE_ALL) {
4141  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4142  gtid));
4143  }
4144  else {
4145  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4146  gtid, i));
4147  }
4148 # else
4149  if (i == -1) {
4150  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4151  gtid));
4152  }
4153  else {
4154  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4155  gtid, i));
4156  }
4157 # endif /* OMP_40_ENABLED */
4158 
4159  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4160 
4161  if (__kmp_affinity_verbose) {
4162  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4163  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4164  th->th.th_affin_mask);
4165  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4166  buf);
4167  }
4168 
4169 # if KMP_OS_WINDOWS
4170  //
4171  // On Windows* OS, the process affinity mask might have changed.
4172  // If the user didn't request affinity and this call fails,
4173  // just continue silently. See CQ171393.
4174  //
4175  if ( __kmp_affinity_type == affinity_none ) {
4176  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4177  }
4178  else
4179 # endif
4180  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4181 }
4182 
4183 
4184 # if OMP_40_ENABLED
4185 
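//
// Bind thread gtid to its new place (th_new_place), which must lie within
// the thread's place partition, and record it as the current place.
//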
4186 void
4187 __kmp_affinity_set_place(int gtid)
4188 {
4189  int retval;
4190 
4191  if (! KMP_AFFINITY_CAPABLE()) {
4192  return;
4193  }
4194 
4195  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4196 
4197  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4198  gtid, th->th.th_new_place, th->th.th_current_place));
4199 
4200  //
4201  // Check that the new place is within this thread's partition.
4202  //
4203  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4204  KMP_ASSERT(th->th.th_new_place >= 0);
4205  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4206  if (th->th.th_first_place <= th->th.th_last_place) {
4207  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4208  && (th->th.th_new_place <= th->th.th_last_place));
4209  }
4210  else {
4211  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4212  || (th->th.th_new_place >= th->th.th_last_place));
4213  }
4214 
4215  //
4216  // Copy the thread mask to the kmp_info_t structure,
4217  // and set this thread's affinity.
4218  //
4219  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4220  th->th.th_new_place);
4221  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4222  th->th.th_current_place = th->th.th_new_place;
4223 
4224  if (__kmp_affinity_verbose) {
4225  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4226  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4227  th->th.th_affin_mask);
4228  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4229  gtid, buf);
4230  }
4231  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4232 }
4233 
4234 # endif /* OMP_40_ENABLED */
4235 
4236 
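//
// Worker for the kmp_set_affinity() API: validate the user-supplied mask
// (when consistency checking is enabled), install it as the OS affinity of
// the calling thread, and reset the thread's OMP 4.0 place bookkeeping.
//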
4237 int
4238 __kmp_aux_set_affinity(void **mask)
4239 {
4240  int gtid;
4241  kmp_info_t *th;
4242  int retval;
4243 
4244  if (! KMP_AFFINITY_CAPABLE()) {
4245  return -1;
4246  }
4247 
4248  gtid = __kmp_entry_gtid();
4249  KA_TRACE(1000, ;{
4250  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4251  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4252  (kmp_affin_mask_t *)(*mask));
4253  __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4254  gtid, buf);
4255  });
4256 
4257  if (__kmp_env_consistency_check) {
4258  if ((mask == NULL) || (*mask == NULL)) {
4259  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4260  }
4261  else {
4262  unsigned proc;
4263  int num_procs = 0;
4264 
4265  for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4266  if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4267  continue;
4268  }
4269  num_procs++;
4270  if (! KMP_CPU_ISSET(proc, fullMask)) {
4271  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4272  break;
4273  }
4274  }
4275  if (num_procs == 0) {
4276  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4277  }
4278 
4279 # if KMP_GROUP_AFFINITY
4280  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4281  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4282  }
4283 # endif /* KMP_GROUP_AFFINITY */
4284 
4285  }
4286  }
4287 
4288  th = __kmp_threads[gtid];
4289  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4290  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4291  if (retval == 0) {
4292  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4293  }
4294 
4295 # if OMP_40_ENABLED
4296  th->th.th_current_place = KMP_PLACE_UNDEFINED;
4297  th->th.th_new_place = KMP_PLACE_UNDEFINED;
4298  th->th.th_first_place = 0;
4299  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4300 
4301  //
4302  // Turn off 4.0 affinity for the current thread at this parallel level.
4303  //
4304  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4305 # endif
4306 
4307  return retval;
4308 }
4309 
4310 
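//
// Worker for the kmp_get_affinity() API: return the calling thread's
// affinity mask - queried from the OS on non-Windows systems, or copied
// from the stored th_affin_mask on Windows.
//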
4311 int
4312 __kmp_aux_get_affinity(void **mask)
4313 {
4314  int gtid;
4315  int retval;
4316  kmp_info_t *th;
4317 
4318  if (! KMP_AFFINITY_CAPABLE()) {
4319  return -1;
4320  }
4321 
4322  gtid = __kmp_entry_gtid();
4323  th = __kmp_threads[gtid];
4324  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4325 
4326  KA_TRACE(1000, ;{
4327  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4328  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4329  th->th.th_affin_mask);
4330  __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4331  });
4332 
4333  if (__kmp_env_consistency_check) {
4334  if ((mask == NULL) || (*mask == NULL)) {
4335  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4336  }
4337  }
4338 
4339 # if !KMP_OS_WINDOWS
4340 
4341  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4342  KA_TRACE(1000, ;{
4343  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4344  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4345  (kmp_affin_mask_t *)(*mask));
4346  __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4347  });
4348  return retval;
4349 
4350 # else
4351 
4352  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4353  return 0;
4354 
4355 # endif /* KMP_OS_WINDOWS */
4356 
4357 }
4358 
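//
// Worker for kmp_set_affinity_mask_proc(): set bit 'proc' in the supplied
// mask. Returns -1 if proc is out of range and -2 if proc is not in the
// full machine mask.
//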
4359 int
4360 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4361 {
4362  int retval;
4363 
4364  if (! KMP_AFFINITY_CAPABLE()) {
4365  return -1;
4366  }
4367 
4368  KA_TRACE(1000, ;{
4369  int gtid = __kmp_entry_gtid();
4370  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4371  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4372  (kmp_affin_mask_t *)(*mask));
4373  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4374  proc, gtid, buf);
4375  });
4376 
4377  if (__kmp_env_consistency_check) {
4378  if ((mask == NULL) || (*mask == NULL)) {
4379  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4380  }
4381  }
4382 
4383  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4384  return -1;
4385  }
4386  if (! KMP_CPU_ISSET(proc, fullMask)) {
4387  return -2;
4388  }
4389 
4390  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4391  return 0;
4392 }
4393 
4394 
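//
// Worker for kmp_unset_affinity_mask_proc(): clear bit 'proc' in the
// supplied mask, with the same range checks and return codes as the set
// variant above.
//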
4395 int
4396 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4397 {
4398  int retval;
4399 
4400  if (! KMP_AFFINITY_CAPABLE()) {
4401  return -1;
4402  }
4403 
4404  KA_TRACE(1000, ;{
4405  int gtid = __kmp_entry_gtid();
4406  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4407  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4408  (kmp_affin_mask_t *)(*mask));
4409  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4410  proc, gtid, buf);
4411  });
4412 
4413  if (__kmp_env_consistency_check) {
4414  if ((mask == NULL) || (*mask == NULL)) {
4415  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4416  }
4417  }
4418 
4419  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4420  return -1;
4421  }
4422  if (! KMP_CPU_ISSET(proc, fullMask)) {
4423  return -2;
4424  }
4425 
4426  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4427  return 0;
4428 }
4429 
4430 
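//
// Worker for kmp_get_affinity_mask_proc(): report whether bit 'proc' is set
// in the supplied mask. Out-of-range or unavailable procs simply report 0
// rather than an error code.
//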
4431 int
4432 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4433 {
4434  int retval;
4435 
4436  if (! KMP_AFFINITY_CAPABLE()) {
4437  return -1;
4438  }
4439 
4440  KA_TRACE(1000, ;{
4441  int gtid = __kmp_entry_gtid();
4442  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4443  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4444  (kmp_affin_mask_t *)(*mask));
4445  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4446  proc, gtid, buf);
4447  });
4448 
4449  if (__kmp_env_consistency_check) {
4450  if ((mask == NULL) || (*mask == NULL)) {
4451  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4452  }
4453  }
4454 
4455  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4456  return 0;
4457  }
4458  if (! KMP_CPU_ISSET(proc, fullMask)) {
4459  return 0;
4460  }
4461 
4462  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4463 }
4464 
4465 
4466 // Dynamic affinity settings - Affinity balanced
4467 void __kmp_balanced_affinity( int tid, int nthreads )
4468 {
4469  if( __kmp_affinity_uniform_topology() ) {
4470  int coreID;
4471  int threadID;
4472  // Number of hyper threads per core in HT machine
4473  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4474  // Number of cores
4475  int ncores = __kmp_ncores;
4476  // How many threads will be bound to each core
4477  int chunk = nthreads / ncores;
4478  // How many cores will have an additional thread bound to it - "big cores"
4479  int big_cores = nthreads % ncores;
4480  // Number of threads on the big cores
4481  int big_nth = ( chunk + 1 ) * big_cores;
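 // The first big_cores cores each receive chunk + 1 threads (thread ids
 // 0 .. big_nth-1); the remaining cores receive chunk threads each.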
4482  if( tid < big_nth ) {
4483  coreID = tid / (chunk + 1 );
4484  threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4485  } else { //tid >= big_nth
4486  coreID = ( tid - big_cores ) / chunk;
4487  threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4488  }
4489 
4490  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4491  "Illegal set affinity operation when not capable");
4492 
4493  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4494  KMP_CPU_ZERO(mask);
4495 
4496  // Granularity == thread
4497  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4498  int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4499  KMP_CPU_SET( osID, mask);
4500  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4501  for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4502  int osID;
4503  osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4504  KMP_CPU_SET( osID, mask);
4505  }
4506  }
4507  if (__kmp_affinity_verbose) {
4508  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4509  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4510  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4511  tid, buf);
4512  }
4513  __kmp_set_system_affinity( mask, TRUE );
4514  } else { // Non-uniform topology
4515 
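 // Non-uniform topologies use the procarr[] table built during
 // initialization, which maps [core * nth_per_core + thread] slots to OS
 // proc ids (-1 marks an empty slot).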
4516  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4517  KMP_CPU_ZERO(mask);
4518 
4519  // Number of hyper threads per core in HT machine
4520  int nth_per_core = __kmp_nThreadsPerCore;
4521  int core_level;
4522  if( nth_per_core > 1 ) {
4523  core_level = __kmp_aff_depth - 2;
4524  } else {
4525  core_level = __kmp_aff_depth - 1;
4526  }
4527 
4528  // Number of cores - maximum value; it does not count trailing cores with 0 processors
4529  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4530 
4531  // For performance, consider the special case nthreads == __kmp_avail_proc
4532  if( nthreads == __kmp_avail_proc ) {
4533  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4534  int osID = address2os[ tid ].second;
4535  KMP_CPU_SET( osID, mask);
4536  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4537  int coreID = address2os[ tid ].first.labels[ core_level ];
4538  // Count the osIDs found for the current core; there can be at most nth_per_core of them.
4539  // Since address2os is sorted, we can break as soon as cnt == nth_per_core.
4540  int cnt = 0;
4541  for( int i = 0; i < __kmp_avail_proc; i++ ) {
4542  int osID = address2os[ i ].second;
4543  int core = address2os[ i ].first.labels[ core_level ];
4544  if( core == coreID ) {
4545  KMP_CPU_SET( osID, mask);
4546  cnt++;
4547  if( cnt == nth_per_core ) {
4548  break;
4549  }
4550  }
4551  }
4552  }
4553  } else if( nthreads <= __kmp_ncores ) {
4554 
4555  int core = 0;
4556  for( int i = 0; i < ncores; i++ ) {
4557  // Check whether this core has at least one available proc in procarr[]
4558  int in_mask = 0;
4559  for( int j = 0; j < nth_per_core; j++ ) {
4560  if( procarr[ i * nth_per_core + j ] != - 1 ) {
4561  in_mask = 1;
4562  break;
4563  }
4564  }
4565  if( in_mask ) {
4566  if( tid == core ) {
4567  for( int j = 0; j < nth_per_core; j++ ) {
4568  int osID = procarr[ i * nth_per_core + j ];
4569  if( osID != -1 ) {
4570  KMP_CPU_SET( osID, mask );
4571  // For granularity=thread it is enough to set the first available osID for this core
4572  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4573  break;
4574  }
4575  }
4576  }
4577  break;
4578  } else {
4579  core++;
4580  }
4581  }
4582  }
4583 
4584  } else { // nthreads > __kmp_ncores
4585 
4586  // Array to save the number of processors at each core
4587  int nproc_at_core[ ncores ];
4588  // Array to save the number of cores with "x" available processors;
4589  int ncores_with_x_procs[ nth_per_core + 1 ];
4590  // Array to save the number of cores with # procs from x to nth_per_core
4591  int ncores_with_x_to_max_procs[ nth_per_core + 1 ];
4592 
4593  for( int i = 0; i <= nth_per_core; i++ ) {
4594  ncores_with_x_procs[ i ] = 0;
4595  ncores_with_x_to_max_procs[ i ] = 0;
4596  }
4597 
4598  for( int i = 0; i < ncores; i++ ) {
4599  int cnt = 0;
4600  for( int j = 0; j < nth_per_core; j++ ) {
4601  if( procarr[ i * nth_per_core + j ] != -1 ) {
4602  cnt++;
4603  }
4604  }
4605  nproc_at_core[ i ] = cnt;
4606  ncores_with_x_procs[ cnt ]++;
4607  }
4608 
4609  for( int i = 0; i <= nth_per_core; i++ ) {
4610  for( int j = i; j <= nth_per_core; j++ ) {
4611  ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4612  }
4613  }
4614 
4615  // Max number of processors
4616  int nproc = nth_per_core * ncores;
4617  // Array to keep the number of threads assigned to each thread context
4618  int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4619  for( int i = 0; i < nproc; i++ ) {
4620  newarr[ i ] = 0;
4621  }
4622 
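 // Distribute the nthreads threads over the available thread contexts,
 // filling cores breadth-first: the first sweep of the while loop places at
 // most one thread per context, taking one context per core per j-pass; once
 // every available context holds a thread (flag != 0), later sweeps keep
 // cycling over the cores and stack additional threads.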
4623  int nth = nthreads;
4624  int flag = 0;
4625  while( nth > 0 ) {
4626  for( int j = 1; j <= nth_per_core; j++ ) {
4627  int cnt = ncores_with_x_to_max_procs[ j ];
4628  for( int i = 0; i < ncores; i++ ) {
4629  // Skip the core with 0 processors
4630  if( nproc_at_core[ i ] == 0 ) {
4631  continue;
4632  }
4633  for( int k = 0; k < nth_per_core; k++ ) {
4634  if( procarr[ i * nth_per_core + k ] != -1 ) {
4635  if( newarr[ i * nth_per_core + k ] == 0 ) {
4636  newarr[ i * nth_per_core + k ] = 1;
4637  cnt--;
4638  nth--;
4639  break;
4640  } else {
4641  if( flag != 0 ) {
4642  newarr[ i * nth_per_core + k ] ++;
4643  cnt--;
4644  nth--;
4645  break;
4646  }
4647  }
4648  }
4649  }
4650  if( cnt == 0 || nth == 0 ) {
4651  break;
4652  }
4653  }
4654  if( nth == 0 ) {
4655  break;
4656  }
4657  }
4658  flag = 1;
4659  }
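 // Pick the slot for this tid: walk the per-context counts in newarr[]; the
 // calling thread lands in the context where the running sum first exceeds
 // tid, and the mask is built at thread or core granularity from that slot.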
4660  int sum = 0;
4661  for( int i = 0; i < nproc; i++ ) {
4662  sum += newarr[ i ];
4663  if( sum > tid ) {
4664  // Granularity == thread
4665  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4666  int osID = procarr[ i ];
4667  KMP_CPU_SET( osID, mask);
4668  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4669  int coreID = i / nth_per_core;
4670  for( int ii = 0; ii < nth_per_core; ii++ ) {
4671  int osID = procarr[ coreID * nth_per_core + ii ];
4672  if( osID != -1 ) {
4673  KMP_CPU_SET( osID, mask);
4674  }
4675  }
4676  }
4677  break;
4678  }
4679  }
4680  __kmp_free( newarr );
4681  }
4682 
4683  if (__kmp_affinity_verbose) {
4684  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4685  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4686  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4687  tid, buf);
4688  }
4689  __kmp_set_system_affinity( mask, TRUE );
4690  }
4691 }
4692 
4693 #else
4694  // affinity not supported
4695 
4696 static const kmp_uint32 noaff_maxLevels=7;
4697 kmp_uint32 noaff_skipPerLevel[noaff_maxLevels];
4698 kmp_uint32 noaff_depth;
4699 kmp_uint8 noaff_leaf_kids;
4700 kmp_int8 noaff_uninitialized=1;
4701 
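//
// Build a small synthetic hierarchy (used by the hierarchical barrier) when
// no real topology information is available. Initialization is guarded by a
// compare-and-swap so that only one thread constructs it.
//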
4702 void noaff_init(int nprocs)
4703 {
4704  kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2);
4705  if (result == 0) return; // Already initialized
4706  else if (result == 2) { // Someone else is initializing
4707  while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE();
4708  return;
4709  }
4710  KMP_DEBUG_ASSERT(result==1);
4711 
4712  kmp_uint32 numPerLevel[noaff_maxLevels];
4713  noaff_depth = 1;
4714  for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4715  numPerLevel[i] = 1;
4716  noaff_skipPerLevel[i] = 1;
4717  }
4718 
4719  numPerLevel[0] = 4;
4720  numPerLevel[1] = nprocs/4;
4721  if (nprocs%4) numPerLevel[1]++;
4722 
4723  for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth
4724  if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1'
4725  noaff_depth++;
4726 
4727  kmp_uint32 branch = 4;
4728  if (numPerLevel[0] == 1) branch = nprocs/4;
4729  if (branch<4) branch=4;
4730  for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width
4731  while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4732  if (numPerLevel[d] & 1) numPerLevel[d]++;
4733  numPerLevel[d] = numPerLevel[d] >> 1;
4734  if (numPerLevel[d+1] == 1) noaff_depth++;
4735  numPerLevel[d+1] = numPerLevel[d+1] << 1;
4736  }
4737  if(numPerLevel[0] == 1) {
4738  branch = branch >> 1;
4739  if (branch<4) branch = 4;
4740  }
4741  }
4742 
4743  for (kmp_uint32 i=1; i<noaff_depth; ++i)
4744  noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1];
4745  // Fill in hierarchy in the case of oversubscription
4746  for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i)
4747  noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1];
4748  noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
4749  noaff_uninitialized = 0; // One writer
4750 
4751 }
4752 
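//
// Export the synthetic hierarchy into a thread's barrier state: make sure it
// is initialized, then hand out the depth, leaf fan-out, and per-level skip
// counts.
//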
4753 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4754  if (noaff_uninitialized)
4755  noaff_init(nproc);
4756 
4757  thr_bar->depth = noaff_depth;
4758  thr_bar->base_leaf_kids = noaff_leaf_kids;
4759  thr_bar->skip_per_level = noaff_skipPerLevel;
4760 }
4761 
4762 #endif // KMP_AFFINITY_SUPPORTED