Intel® OpenMP* Runtime Library
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  * $Revision: 43473 $
4  * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2014 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "kmp.h"
38 #include "kmp_i18n.h"
39 #include "kmp_io.h"
40 #include "kmp_str.h"
41 #include "kmp_wrapper_getpid.h"
42 
43 #if KMP_AFFINITY_SUPPORTED
44 
45 //
46 // Print the affinity mask to the character array in a pretty format.
47 //
48 char *
49 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
50 {
51  KMP_ASSERT(buf_len >= 40);
52  char *scan = buf;
53  char *end = buf + buf_len - 1;
54 
55  //
56  // Find first element / check for empty set.
57  //
58  size_t i;
59  for (i = 0; i < KMP_CPU_SETSIZE; i++) {
60  if (KMP_CPU_ISSET(i, mask)) {
61  break;
62  }
63  }
64  if (i == KMP_CPU_SETSIZE) {
65  sprintf(scan, "{<empty>}");
66  while (*scan != '\0') scan++;
67  KMP_ASSERT(scan <= end);
68  return buf;
69  }
70 
71  sprintf(scan, "{%ld", (long)i);
72  while (*scan != '\0') scan++;
73  i++;
74  for (; i < KMP_CPU_SETSIZE; i++) {
75  if (! KMP_CPU_ISSET(i, mask)) {
76  continue;
77  }
78 
79  //
80  // Check for buffer overflow. A string of the form ",<n>" will have
81  // at most 10 characters, plus we want to leave room to print ",...}"
 82  // if the set is too large to print, for a total of 15 characters.
83  // We already left room for '\0' in setting end.
84  //
85  if (end - scan < 15) {
86  break;
87  }
88  sprintf(scan, ",%-ld", (long)i);
89  while (*scan != '\0') scan++;
90  }
91  if (i < KMP_CPU_SETSIZE) {
92  sprintf(scan, ",...");
93  while (*scan != '\0') scan++;
94  }
95  sprintf(scan, "}");
96  while (*scan != '\0') scan++;
97  KMP_ASSERT(scan <= end);
98  return buf;
99 }
100 
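//
// Illustration (not part of the original source): with the format produced
// above, a mask containing OS procs 0-3 prints as "{0,1,2,3}", an empty mask
// prints as "{<empty>}", and a mask too large for the buffer is truncated to
// end in ",...}".
//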
101 
102 void
103 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
104 {
105  KMP_CPU_ZERO(mask);
106 
107 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
108 
109  if (__kmp_num_proc_groups > 1) {
110  int group;
111  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
112  for (group = 0; group < __kmp_num_proc_groups; group++) {
113  int i;
114  int num = __kmp_GetActiveProcessorCount(group);
115  for (i = 0; i < num; i++) {
116  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
117  }
118  }
119  }
120  else
121 
122 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
123 
124  {
125  int proc;
126  for (proc = 0; proc < __kmp_xproc; proc++) {
127  KMP_CPU_SET(proc, mask);
128  }
129  }
130 }
131 
132 
133 //
134 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
135 // functions.
136 //
137 // The icc codegen emits sections with extremely long names, of the form
138 // ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
139 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
140 // some sort of memory corruption or table overflow that is triggered by
141 // these long strings. I checked the latest version of the linker -
142 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
143 // fixed.
144 //
145 // Unfortunately, my attempts to reproduce it in a smaller example have
146 // failed - I'm not sure what the prospects are of getting it fixed
147 // properly - but we need a reproducer smaller than all of libiomp.
148 //
149 // Work around the problem by avoiding inline constructors in such builds.
150 // We do this for all platforms, not just Linux* OS - non-inline functions are
151 // more debuggable and provide better coverage than inline functions.
152 // Use inline functions in shipping libs, for performance.
153 //
154 
155 # if !defined(KMP_DEBUG) && !defined(COVER)
156 
157 class Address {
158 public:
159  static const unsigned maxDepth = 32;
160  unsigned labels[maxDepth];
161  unsigned childNums[maxDepth];
162  unsigned depth;
163  unsigned leader;
164  Address(unsigned _depth)
165  : depth(_depth), leader(FALSE) {
166  }
167  Address &operator=(const Address &b) {
168  depth = b.depth;
169  for (unsigned i = 0; i < depth; i++) {
170  labels[i] = b.labels[i];
171  childNums[i] = b.childNums[i];
172  }
173  leader = FALSE;
174  return *this;
175  }
176  bool operator==(const Address &b) const {
177  if (depth != b.depth)
178  return false;
179  for (unsigned i = 0; i < depth; i++)
180  if(labels[i] != b.labels[i])
181  return false;
182  return true;
183  }
184  bool isClose(const Address &b, int level) const {
185  if (depth != b.depth)
186  return false;
187  if ((unsigned)level >= depth)
188  return true;
189  for (unsigned i = 0; i < (depth - level); i++)
190  if(labels[i] != b.labels[i])
191  return false;
192  return true;
193  }
194  bool operator!=(const Address &b) const {
195  return !operator==(b);
196  }
197 };
198 
199 class AddrUnsPair {
200 public:
201  Address first;
202  unsigned second;
203  AddrUnsPair(Address _first, unsigned _second)
204  : first(_first), second(_second) {
205  }
206  AddrUnsPair &operator=(const AddrUnsPair &b)
207  {
208  first = b.first;
209  second = b.second;
210  return *this;
211  }
212 };
213 
214 # else
215 
216 class Address {
217 public:
218  static const unsigned maxDepth = 32;
219  unsigned labels[maxDepth];
220  unsigned childNums[maxDepth];
221  unsigned depth;
222  unsigned leader;
223  Address(unsigned _depth);
224  Address &operator=(const Address &b);
225  bool operator==(const Address &b) const;
226  bool isClose(const Address &b, int level) const;
227  bool operator!=(const Address &b) const;
228 };
229 
230 Address::Address(unsigned _depth)
231 {
232  depth = _depth;
233  leader = FALSE;
234 }
235 
236 Address &Address::operator=(const Address &b) {
237  depth = b.depth;
238  for (unsigned i = 0; i < depth; i++) {
239  labels[i] = b.labels[i];
240  childNums[i] = b.childNums[i];
241  }
242  leader = FALSE;
243  return *this;
244 }
245 
246 bool Address::operator==(const Address &b) const {
247  if (depth != b.depth)
248  return false;
249  for (unsigned i = 0; i < depth; i++)
250  if(labels[i] != b.labels[i])
251  return false;
252  return true;
253 }
254 
255 bool Address::isClose(const Address &b, int level) const {
256  if (depth != b.depth)
257  return false;
258  if ((unsigned)level >= depth)
259  return true;
260  for (unsigned i = 0; i < (depth - level); i++)
261  if(labels[i] != b.labels[i])
262  return false;
263  return true;
264 }
265 
266 bool Address::operator!=(const Address &b) const {
267  return !operator==(b);
268 }
269 
270 class AddrUnsPair {
271 public:
272  Address first;
273  unsigned second;
274  AddrUnsPair(Address _first, unsigned _second);
275  AddrUnsPair &operator=(const AddrUnsPair &b);
276 };
277 
278 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
279  : first(_first), second(_second)
280 {
281 }
282 
283 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
284 {
285  first = b.first;
286  second = b.second;
287  return *this;
288 }
289 
290 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
291 
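//
// Illustration (not part of the original source): on a machine modeled with
// package / core / thread levels, each OS proc gets an Address of depth 3,
// e.g. labels[0] = package id, labels[1] = core id, labels[2] = thread id,
// and an AddrUnsPair binds that Address to the OS proc id in "second".
//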
292 
293 static int
294 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
295 {
296  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
297  ->first);
298  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
299  ->first);
300  unsigned depth = aa->depth;
301  unsigned i;
302  KMP_DEBUG_ASSERT(depth == bb->depth);
303  for (i = 0; i < depth; i++) {
304  if (aa->labels[i] < bb->labels[i]) return -1;
305  if (aa->labels[i] > bb->labels[i]) return 1;
306  }
307  return 0;
308 }
309 
310 
311 static int
312 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
313 {
314  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
315  ->first);
316  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
317  ->first);
318  unsigned depth = aa->depth;
319  unsigned i;
320  KMP_DEBUG_ASSERT(depth == bb->depth);
321  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
322  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
323  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
324  int j = depth - i - 1;
325  if (aa->childNums[j] < bb->childNums[j]) return -1;
326  if (aa->childNums[j] > bb->childNums[j]) return 1;
327  }
328  for (; i < depth; i++) {
329  int j = i - __kmp_affinity_compact;
330  if (aa->childNums[j] < bb->childNums[j]) return -1;
331  if (aa->childNums[j] > bb->childNums[j]) return 1;
332  }
333  return 0;
334 }
335 
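//
// Illustration (not part of the original source): with depth == 3
// (package, core, thread) and __kmp_affinity_compact == 1, the loops above
// compare childNums[2] (thread) first, then childNums[0] (package), then
// childNums[1] (core); with __kmp_affinity_compact == 0 the natural
// package / core / thread key order is used.
//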
337 class hierarchy_info {
338 public:
341  static const kmp_uint32 maxLevels=7;
342 
346  kmp_uint32 depth;
347  kmp_uint32 base_depth;
348  kmp_uint32 base_num_threads;
349  bool uninitialized;
350 
354  kmp_uint32 numPerLevel[maxLevels];
355  kmp_uint32 skipPerLevel[maxLevels];
356 
357  void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
358  int hier_depth = adr2os[0].first.depth;
359  int level = 0;
360  for (int i=hier_depth-1; i>=0; --i) {
361  int max = -1;
362  for (int j=0; j<num_addrs; ++j) {
363  int next = adr2os[j].first.childNums[i];
364  if (next > max) max = next;
365  }
366  numPerLevel[level] = max+1;
367  ++level;
368  }
369  }
370 
371  hierarchy_info() : depth(1), uninitialized(true) {}
372  void init(AddrUnsPair *adr2os, int num_addrs)
373  {
374  uninitialized = false;
375  for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
376  numPerLevel[i] = 1;
377  skipPerLevel[i] = 1;
378  }
379 
380  // Sort table by physical ID
381  if (adr2os) {
382  qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
383  deriveLevels(adr2os, num_addrs);
384  }
385  else {
386  numPerLevel[0] = 4;
387  numPerLevel[1] = num_addrs/4;
388  if (num_addrs%4) numPerLevel[1]++;
389  }
390 
391  base_num_threads = num_addrs;
392  for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
393  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
394  depth++;
395 
396  kmp_uint32 branch = 4;
397  if (numPerLevel[0] == 1) branch = num_addrs/4;
398  if (branch<4) branch=4;
399  for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
400  while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
401  if (numPerLevel[d] & 1) numPerLevel[d]++;
402  numPerLevel[d] = numPerLevel[d] >> 1;
403  if (numPerLevel[d+1] == 1) depth++;
404  numPerLevel[d+1] = numPerLevel[d+1] << 1;
405  }
406  if(numPerLevel[0] == 1) {
407  branch = branch >> 1;
408  if (branch<4) branch = 4;
409  }
410  }
411 
412  for (kmp_uint32 i=1; i<depth; ++i)
413  skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
414 
415  base_depth = depth;
416  }
417 };
418 
419 static hierarchy_info machine_hierarchy;
420 
421 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
422  if (machine_hierarchy.uninitialized)
423  machine_hierarchy.init(NULL, nproc);
424 
425  if (nproc <= machine_hierarchy.base_num_threads)
426  machine_hierarchy.depth = machine_hierarchy.base_depth;
427  KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0);
428  while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) {
429  machine_hierarchy.depth++;
430  machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] = 2*machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2];
431  }
432  thr_bar->depth = machine_hierarchy.depth;
433  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
434  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
435 }
436 
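//
// Illustration (not part of the original source): numPerLevel[i] is the
// fan-out at level i of the hierarchy and skipPerLevel[i] is the number of
// leaves spanned by one subtree rooted at level i (the running product of the
// lower levels), e.g. numPerLevel = {4, 2, ...} yields
// skipPerLevel = {1, 4, 8, ...}.
//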
437 //
438 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
439 // called to renumber the labels from [0..n] and place them into the child_num
440 // vector of the address object. This is done in case the labels used for
441 // the children at one node of the hierarchy differ from those used for
442 // another node at the same level. Example: suppose the machine has 2 nodes
443 // with 2 packages each. The first node contains packages 601 and 602, and
444 // the second node contains packages 603 and 604. If we try to sort the table
445 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
446 // because we are paying attention to the labels themselves, not the ordinal
447 // child numbers. By using the child numbers in the sort, the result is
448 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
449 //
450 static void
451 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
452  int numAddrs)
453 {
454  KMP_DEBUG_ASSERT(numAddrs > 0);
455  int depth = address2os->first.depth;
456  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
457  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
458  * sizeof(unsigned));
459  int labCt;
460  for (labCt = 0; labCt < depth; labCt++) {
461  address2os[0].first.childNums[labCt] = counts[labCt] = 0;
462  lastLabel[labCt] = address2os[0].first.labels[labCt];
463  }
464  int i;
465  for (i = 1; i < numAddrs; i++) {
466  for (labCt = 0; labCt < depth; labCt++) {
467  if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
468  int labCt2;
469  for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
470  counts[labCt2] = 0;
471  lastLabel[labCt2] = address2os[i].first.labels[labCt2];
472  }
473  counts[labCt]++;
474  lastLabel[labCt] = address2os[i].first.labels[labCt];
475  break;
476  }
477  }
478  for (labCt = 0; labCt < depth; labCt++) {
479  address2os[i].first.childNums[labCt] = counts[labCt];
480  }
481  for (; labCt < (int)Address::maxDepth; labCt++) {
482  address2os[i].first.childNums[labCt] = 0;
483  }
484  }
485 }
486 
487 
488 //
489 // All of the __kmp_affinity_create_*_map() routines should set
490 // __kmp_affinity_masks to a vector of affinity mask objects of length
491 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
492 // return the number of levels in the machine topology tree (zero if
493 // __kmp_affinity_type == affinity_none).
494 //
495 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
496 // to the affinity mask for the initialization thread. They need to save and
497 // restore the mask, and it could be needed later, so saving it is just an
498 // optimization to avoid calling kmp_get_system_affinity() again.
499 //
500 static kmp_affin_mask_t *fullMask = NULL;
501 
502 kmp_affin_mask_t *
503 __kmp_affinity_get_fullMask() { return fullMask; }
504 
505 
506 static int nCoresPerPkg, nPackages;
507 int __kmp_nThreadsPerCore;
508 
509 //
510 // __kmp_affinity_uniform_topology() doesn't work when called from
511 // places which support arbitrarily many levels in the machine topology
512 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
513 // and __kmp_affinity_create_x2apicid_map().
514 //
515 inline static bool
516 __kmp_affinity_uniform_topology()
517 {
518  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
519 }
520 
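//
// Illustration (not part of the original source): a machine detected as
// 2 packages x 4 cores x 2 threads is reported as uniform exactly when
// __kmp_avail_proc == 2 * 4 * 2 == 16.
//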
521 
522 //
523 // Print out the detailed machine topology map, i.e. the physical locations
524 // of each OS proc.
525 //
526 static void
527 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
528  int pkgLevel, int coreLevel, int threadLevel)
529 {
530  int proc;
531 
532  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
533  for (proc = 0; proc < len; proc++) {
534  int level;
535  kmp_str_buf_t buf;
536  __kmp_str_buf_init(&buf);
537  for (level = 0; level < depth; level++) {
538  if (level == threadLevel) {
539  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
540  }
541  else if (level == coreLevel) {
542  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
543  }
544  else if (level == pkgLevel) {
545  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
546  }
547  else if (level > pkgLevel) {
548  __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
549  level - pkgLevel - 1);
550  }
551  else {
552  __kmp_str_buf_print(&buf, "L%d ", level);
553  }
554  __kmp_str_buf_print(&buf, "%d ",
555  address2os[proc].first.labels[level]);
556  }
557  KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
558  buf.str);
559  __kmp_str_buf_free(&buf);
560  }
561 }
562 
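//
// Illustration (not part of the original source): for a depth-3 map with
// pkgLevel == 0, coreLevel == 1 and threadLevel == 2, each line printed above
// reads roughly "OS proc <n> maps to Package <p> Core <c> Thread <t>".
//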
563 
564 //
565 // If we don't know how to retrieve the machine's processor topology, or
566 // encounter an error in doing so, this routine is called to form a "flat"
567 // mapping of os thread id's <-> processor id's.
568 //
569 static int
570 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
571  kmp_i18n_id_t *const msg_id)
572 {
573  *address2os = NULL;
574  *msg_id = kmp_i18n_null;
575 
576  //
577  // Even if __kmp_affinity_type == affinity_none, this routine might still
578  // be called to set __kmp_ht_enabled, & __kmp_ncores, as well as
579  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
580  //
581  if (! KMP_AFFINITY_CAPABLE()) {
582  KMP_ASSERT(__kmp_affinity_type == affinity_none);
583  __kmp_ncores = nPackages = __kmp_xproc;
584  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
585  __kmp_ht_enabled = FALSE;
586  if (__kmp_affinity_verbose) {
587  KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
588  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
589  KMP_INFORM(Uniform, "KMP_AFFINITY");
590  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
591  __kmp_nThreadsPerCore, __kmp_ncores);
592  }
593  return 0;
594  }
595 
596  //
597  // When affinity is off, this routine will still be called to set
598  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
599  // nCoresPerPkg, & nPackages. Make sure all these vars are set
600  // correctly, and return now if affinity is not enabled.
601  //
602  __kmp_ncores = nPackages = __kmp_avail_proc;
603  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
604  __kmp_ht_enabled = FALSE;
605  if (__kmp_affinity_verbose) {
606  char buf[KMP_AFFIN_MASK_PRINT_LEN];
607  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
608 
609  KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
610  if (__kmp_affinity_respect_mask) {
611  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
612  } else {
613  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
614  }
615  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
616  KMP_INFORM(Uniform, "KMP_AFFINITY");
617  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
618  __kmp_nThreadsPerCore, __kmp_ncores);
619  }
620  if (__kmp_affinity_type == affinity_none) {
621  return 0;
622  }
623 
624  //
625  // Construct the data structure to be returned.
626  //
627  *address2os = (AddrUnsPair*)
628  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
629  int avail_ct = 0;
630  unsigned int i;
631  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
632  //
633  // Skip this proc if it is not included in the machine model.
634  //
635  if (! KMP_CPU_ISSET(i, fullMask)) {
636  continue;
637  }
638 
639  Address addr(1);
640  addr.labels[0] = i;
641  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
642  }
643  if (__kmp_affinity_verbose) {
644  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
645  }
646 
647  if (__kmp_affinity_gran_levels < 0) {
648  //
649  // Only the package level is modeled in the machine topology map,
650  // so the #levels of granularity is either 0 or 1.
651  //
652  if (__kmp_affinity_gran > affinity_gran_package) {
653  __kmp_affinity_gran_levels = 1;
654  }
655  else {
656  __kmp_affinity_gran_levels = 0;
657  }
658  }
659  return 1;
660 }
661 
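//
// Illustration (not part of the original source): the flat map gives every
// available OS proc a depth-1 Address whose single label is the proc id
// itself, e.g. OS proc 5 becomes labels[0] == 5 paired with osId 5.
//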
662 
663 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
664 
665 //
666 // If multiple Windows* OS processor groups exist, we can create a 2-level
667 // topology map with the groups at level 0 and the individual procs at
668 // level 1.
669 //
670 // This facilitates letting the threads float among all procs in a group,
671 // if granularity=group (the default when there are multiple groups).
672 //
673 static int
674 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
675  kmp_i18n_id_t *const msg_id)
676 {
677  *address2os = NULL;
678  *msg_id = kmp_i18n_null;
679 
680  //
681  // If we don't have multiple processor groups, return now.
682  // The flat mapping will be used.
683  //
684  if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
685  // FIXME set *msg_id
686  return -1;
687  }
688 
689  //
690  // Construct the data structure to be returned.
691  //
692  *address2os = (AddrUnsPair*)
693  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
694  int avail_ct = 0;
695  int i;
696  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
697  //
698  // Skip this proc if it is not included in the machine model.
699  //
700  if (! KMP_CPU_ISSET(i, fullMask)) {
701  continue;
702  }
703 
704  Address addr(2);
705  addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
706  addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
707  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
708 
709  if (__kmp_affinity_verbose) {
710  KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
711  addr.labels[1]);
712  }
713  }
714 
715  if (__kmp_affinity_gran_levels < 0) {
716  if (__kmp_affinity_gran == affinity_gran_group) {
717  __kmp_affinity_gran_levels = 1;
718  }
719  else if ((__kmp_affinity_gran == affinity_gran_fine)
720  || (__kmp_affinity_gran == affinity_gran_thread)) {
721  __kmp_affinity_gran_levels = 0;
722  }
723  else {
724  const char *gran_str = NULL;
725  if (__kmp_affinity_gran == affinity_gran_core) {
726  gran_str = "core";
727  }
728  else if (__kmp_affinity_gran == affinity_gran_package) {
729  gran_str = "package";
730  }
731  else if (__kmp_affinity_gran == affinity_gran_node) {
732  gran_str = "node";
733  }
734  else {
735  KMP_ASSERT(0);
736  }
737 
738  // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
739  __kmp_affinity_gran_levels = 0;
740  }
741  }
742  return 2;
743 }
744 
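//
// Illustration (not part of the original source): each 64-bit Windows
// processor group holds up to CHAR_BIT * sizeof(DWORD_PTR) == 64 procs, so
// the division / modulus above labels OS proc 70 as group 1, proc 6 within
// the group.
//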
745 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
746 
747 
748 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
749 
750 static int
751 __kmp_cpuid_mask_width(int count) {
752  int r = 0;
753 
754  while((1<<r) < count)
755  ++r;
756  return r;
757 }
758 
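//
// Illustration (not part of the original source): __kmp_cpuid_mask_width(n)
// returns the smallest r with (1 << r) >= n, e.g. a count of 6 needs a 3-bit
// field and a count of 16 needs a 4-bit field.
//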
759 
760 class apicThreadInfo {
761 public:
762  unsigned osId; // param to __kmp_affinity_bind_thread
763  unsigned apicId; // from cpuid after binding
764  unsigned maxCoresPerPkg; // ""
765  unsigned maxThreadsPerPkg; // ""
766  unsigned pkgId; // inferred from above values
767  unsigned coreId; // ""
768  unsigned threadId; // ""
769 };
770 
771 
772 static int
773 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
774 {
775  const apicThreadInfo *aa = (const apicThreadInfo *)a;
776  const apicThreadInfo *bb = (const apicThreadInfo *)b;
777  if (aa->osId < bb->osId) return -1;
778  if (aa->osId > bb->osId) return 1;
779  return 0;
780 }
781 
782 
783 static int
784 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
785 {
786  const apicThreadInfo *aa = (const apicThreadInfo *)a;
787  const apicThreadInfo *bb = (const apicThreadInfo *)b;
788  if (aa->pkgId < bb->pkgId) return -1;
789  if (aa->pkgId > bb->pkgId) return 1;
790  if (aa->coreId < bb->coreId) return -1;
791  if (aa->coreId > bb->coreId) return 1;
792  if (aa->threadId < bb->threadId) return -1;
793  if (aa->threadId > bb->threadId) return 1;
794  return 0;
795 }
796 
797 
798 //
799 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
800 // an algorithm which cycles through the available os threads, setting
801 // the current thread's affinity mask to that thread, and then retrieves
802 // the Apic Id for each thread context using the cpuid instruction.
803 //
804 static int
805 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
806  kmp_i18n_id_t *const msg_id)
807 {
808  int rc;
809  *address2os = NULL;
810  *msg_id = kmp_i18n_null;
811 
812 # if KMP_MIC
813  {
814  // The code below will use cpuid(4).
815  // Check if cpuid(4) is supported.
816  // FIXME? - this really doesn't need to be specific to MIC.
817  kmp_cpuid buf;
818  __kmp_x86_cpuid(0, 0, &buf);
819  if (buf.eax < 4) {
820  *msg_id = kmp_i18n_str_NoLeaf4Support;
821  return -1;
822  }
823  }
824 # endif // KMP_MIC
825 
826  //
827  // Even if __kmp_affinity_type == affinity_none, this routine is still
828  // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
829  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
830  //
831  // The algorithm used starts by setting the affinity to each available
832  // thread and retrieving info from the cpuid instruction, so if we are not
833  // capable of calling __kmp_get_system_affinity()/__kmp_set_system_affinity(),
834  // then we need to do something else.
835  //
836  if (! KMP_AFFINITY_CAPABLE()) {
837  //
838  // Hack to try and infer the machine topology using only the data
839  // available from cpuid on the current thread, and __kmp_xproc.
840  //
841  KMP_ASSERT(__kmp_affinity_type == affinity_none);
842 
843  //
844  // Get an upper bound on the number of threads per package using
845  // cpuid(1).
846  //
847  // On some OS/chip combinations where HT is supported by the chip
848  // but is disabled, this value will be 2 on a single core chip.
849  // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
850  //
851  kmp_cpuid buf;
852  __kmp_x86_cpuid(1, 0, &buf);
853  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
854  if (maxThreadsPerPkg == 0) {
855  maxThreadsPerPkg = 1;
856  }
857 
858  //
859  // The num cores per pkg comes from cpuid(4).
860  // 1 must be added to the encoded value.
861  //
862  // The author of cpu_count.cpp treated this as only an upper bound
863  // on the number of cores, but I haven't seen any cases where it
864  // was greater than the actual number of cores, so we will treat
865  // it as exact in this block of code.
866  //
867  // First, we need to check if cpuid(4) is supported on this chip.
868  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
869  // has the value n or greater.
870  //
871  __kmp_x86_cpuid(0, 0, &buf);
872  if (buf.eax >= 4) {
873  __kmp_x86_cpuid(4, 0, &buf);
874  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
875  }
876  else {
877  nCoresPerPkg = 1;
878  }
879 
880  //
881  // There is no way to reliably tell if HT is enabled without issuing
882  // the cpuid instruction from every thread, and correlating the cpuid
883  // info, so if the machine is not affinity capable, we assume that HT
884  // is off. We have seen quite a few machines where maxThreadsPerPkg
885  // is 2, yet the machine does not support HT.
886  //
887  // - Older OSes are usually found on machines with older chips, which
888  // do not support HT.
889  //
890  // - The performance penalty for mistakenly identifying a machine as
891  // HT when it isn't (which results in blocktime being incorrectly set
892  // to 0) is greater than the penalty for mistakenly identifying
893  // a machine as being 1 thread/core when it is really HT enabled
894  // (which results in blocktime being incorrectly set to a positive
895  // value).
896  //
897  __kmp_ncores = __kmp_xproc;
898  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
899  __kmp_nThreadsPerCore = 1;
900  __kmp_ht_enabled = FALSE;
901  if (__kmp_affinity_verbose) {
902  KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
903  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
904  if (__kmp_affinity_uniform_topology()) {
905  KMP_INFORM(Uniform, "KMP_AFFINITY");
906  } else {
907  KMP_INFORM(NonUniform, "KMP_AFFINITY");
908  }
909  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
910  __kmp_nThreadsPerCore, __kmp_ncores);
911  }
912  return 0;
913  }
914 
915  //
916  //
917  // From here on, we can assume that it is safe to call
918  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
919  // even if __kmp_affinity_type = affinity_none.
920  //
921 
922  //
923  // Save the affinity mask for the current thread.
924  //
925  kmp_affin_mask_t *oldMask;
926  KMP_CPU_ALLOC(oldMask);
927  KMP_ASSERT(oldMask != NULL);
928  __kmp_get_system_affinity(oldMask, TRUE);
929 
930  //
931  // Run through each of the available contexts, binding the current thread
932  // to it, and obtaining the pertinent information using the cpuid instr.
933  //
934  // The relevant information is:
935  //
936  // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
937  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
938  //
939  // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
940  // value of this field determines the width of the core# + thread#
941  // fields in the Apic Id. It is also an upper bound on the number
942  // of threads per package, but it has been verified that situations
943  // happen where it is not exact. In particular, on certain OS/chip
944  // combinations where Intel(R) Hyper-Threading Technology is supported
945  // by the chip but has
946  // been disabled, the value of this field will be 2 (for a single core
947  // chip). On other OS/chip combinations supporting
948  // Intel(R) Hyper-Threading Technology, the value of
949  // this field will be 1 when Intel(R) Hyper-Threading Technology is
950  // disabled and 2 when it is enabled.
951  //
952  // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
953  // value of this field (+1) determines the width of the core# field in
954  // the Apic Id. The comments in "cpucount.cpp" say that this value is
955  // an upper bound, but the IA-32 architecture manual says that it is
956  // exactly the number of cores per package, and I haven't seen any
957  // case where it wasn't.
958  //
959  // From this information, deduce the package Id, core Id, and thread Id,
960  // and set the corresponding fields in the apicThreadInfo struct.
961  //
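 //
 // Illustration with hypothetical values (not part of the original source):
 // if maxThreadsPerPkg == 16 and maxCoresPerPkg == 8, then widthCT == 4,
 // widthC == 3 and widthT == 1, so an Apic Id of 0x1b decodes below as
 // pkgId == 1, coreId == 5, threadId == 1.
 //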
962  unsigned i;
963  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
964  __kmp_avail_proc * sizeof(apicThreadInfo));
965  unsigned nApics = 0;
966  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
967  //
968  // Skip this proc if it is not included in the machine model.
969  //
970  if (! KMP_CPU_ISSET(i, fullMask)) {
971  continue;
972  }
973  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
974 
975  __kmp_affinity_bind_thread(i);
976  threadInfo[nApics].osId = i;
977 
978  //
979  // The apic id and max threads per pkg come from cpuid(1).
980  //
981  kmp_cpuid buf;
982  __kmp_x86_cpuid(1, 0, &buf);
983  if (! ((buf.edx >> 9) & 1)) {
984  __kmp_set_system_affinity(oldMask, TRUE);
985  __kmp_free(threadInfo);
986  KMP_CPU_FREE(oldMask);
987  *msg_id = kmp_i18n_str_ApicNotPresent;
988  return -1;
989  }
990  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
991  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
992  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
993  threadInfo[nApics].maxThreadsPerPkg = 1;
994  }
995 
996  //
997  // Max cores per pkg comes from cpuid(4).
998  // 1 must be added to the encoded value.
999  //
1000  // First, we need to check if cpuid(4) is supported on this chip.
1001  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1002  // has the value n or greater.
1003  //
1004  __kmp_x86_cpuid(0, 0, &buf);
1005  if (buf.eax >= 4) {
1006  __kmp_x86_cpuid(4, 0, &buf);
1007  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1008  }
1009  else {
1010  threadInfo[nApics].maxCoresPerPkg = 1;
1011  }
1012 
1013  //
1014  // Infer the pkgId / coreId / threadId using only the info
1015  // obtained locally.
1016  //
1017  int widthCT = __kmp_cpuid_mask_width(
1018  threadInfo[nApics].maxThreadsPerPkg);
1019  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1020 
1021  int widthC = __kmp_cpuid_mask_width(
1022  threadInfo[nApics].maxCoresPerPkg);
1023  int widthT = widthCT - widthC;
1024  if (widthT < 0) {
1025  //
1026  // I've never seen this one happen, but I suppose it could, if
1027  // the cpuid instruction on a chip was really screwed up.
1028  // Make sure to restore the affinity mask before the tail call.
1029  //
1030  __kmp_set_system_affinity(oldMask, TRUE);
1031  __kmp_free(threadInfo);
1032  KMP_CPU_FREE(oldMask);
1033  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1034  return -1;
1035  }
1036 
1037  int maskC = (1 << widthC) - 1;
1038  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1039  &maskC;
1040 
1041  int maskT = (1 << widthT) - 1;
1042  threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1043 
1044  nApics++;
1045  }
1046 
1047  //
1048  // We've collected all the info we need.
1049  // Restore the old affinity mask for this thread.
1050  //
1051  __kmp_set_system_affinity(oldMask, TRUE);
1052 
1053  //
1054  // If there's only one thread context to bind to, form an Address object
1055  // with depth 1 and return immediately (or, if affinity is off, set
1056  // address2os to NULL and return).
1057  //
1058  // If it is configured to omit the package level when there is only a
1059  // single package, the logic at the end of this routine won't work if
1060  // there is only a single thread - it would try to form an Address
1061  // object with depth 0.
1062  //
1063  KMP_ASSERT(nApics > 0);
1064  if (nApics == 1) {
1065  __kmp_ncores = nPackages = 1;
1066  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1067  __kmp_ht_enabled = FALSE;
1068  if (__kmp_affinity_verbose) {
1069  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1070  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1071 
1072  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1073  if (__kmp_affinity_respect_mask) {
1074  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1075  } else {
1076  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1077  }
1078  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1079  KMP_INFORM(Uniform, "KMP_AFFINITY");
1080  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1081  __kmp_nThreadsPerCore, __kmp_ncores);
1082  }
1083 
1084  if (__kmp_affinity_type == affinity_none) {
1085  __kmp_free(threadInfo);
1086  KMP_CPU_FREE(oldMask);
1087  return 0;
1088  }
1089 
1090  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1091  Address addr(1);
1092  addr.labels[0] = threadInfo[0].pkgId;
1093  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1094 
1095  if (__kmp_affinity_gran_levels < 0) {
1096  __kmp_affinity_gran_levels = 0;
1097  }
1098 
1099  if (__kmp_affinity_verbose) {
1100  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1101  }
1102 
1103  __kmp_free(threadInfo);
1104  KMP_CPU_FREE(oldMask);
1105  return 1;
1106  }
1107 
1108  //
1109  // Sort the threadInfo table by physical Id.
1110  //
1111  qsort(threadInfo, nApics, sizeof(*threadInfo),
1112  __kmp_affinity_cmp_apicThreadInfo_phys_id);
1113 
1114  //
1115  // The table is now sorted by pkgId / coreId / threadId, but we really
1116  // don't know the radix of any of the fields. pkgId's may be sparsely
1117  // assigned among the chips on a system. Although coreId's are usually
1118  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1119  // [0..threadsPerCore-1], we don't want to make any such assumptions.
1120  //
1121  // For that matter, we don't know what coresPerPkg and threadsPerCore
1122  // (or the total # packages) are at this point - we want to determine
1123  // that now. We only have an upper bound on the first two figures.
1124  //
1125  // We also perform a consistency check at this point: the values returned
1126  // by the cpuid instruction for any thread bound to a given package had
1127  // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1128  //
1129  nPackages = 1;
1130  nCoresPerPkg = 1;
1131  __kmp_nThreadsPerCore = 1;
1132  unsigned nCores = 1;
1133 
1134  unsigned pkgCt = 1; // to determine radii
1135  unsigned lastPkgId = threadInfo[0].pkgId;
1136  unsigned coreCt = 1;
1137  unsigned lastCoreId = threadInfo[0].coreId;
1138  unsigned threadCt = 1;
1139  unsigned lastThreadId = threadInfo[0].threadId;
1140 
1141  // intra-pkg consistency checks
1142  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1143  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1144 
1145  for (i = 1; i < nApics; i++) {
1146  if (threadInfo[i].pkgId != lastPkgId) {
1147  nCores++;
1148  pkgCt++;
1149  lastPkgId = threadInfo[i].pkgId;
1150  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1151  coreCt = 1;
1152  lastCoreId = threadInfo[i].coreId;
1153  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1154  threadCt = 1;
1155  lastThreadId = threadInfo[i].threadId;
1156 
1157  //
1158  // This is a different package, so go on to the next iteration
1159  // without doing any consistency checks. Reset the consistency
1160  // check vars, though.
1161  //
1162  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1163  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1164  continue;
1165  }
1166 
1167  if (threadInfo[i].coreId != lastCoreId) {
1168  nCores++;
1169  coreCt++;
1170  lastCoreId = threadInfo[i].coreId;
1171  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1172  threadCt = 1;
1173  lastThreadId = threadInfo[i].threadId;
1174  }
1175  else if (threadInfo[i].threadId != lastThreadId) {
1176  threadCt++;
1177  lastThreadId = threadInfo[i].threadId;
1178  }
1179  else {
1180  __kmp_free(threadInfo);
1181  KMP_CPU_FREE(oldMask);
1182  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1183  return -1;
1184  }
1185 
1186  //
1187  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1188  // fields agree between all the threads bounds to a given package.
1189  //
1190  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1191  || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1192  __kmp_free(threadInfo);
1193  KMP_CPU_FREE(oldMask);
1194  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1195  return -1;
1196  }
1197  }
1198  nPackages = pkgCt;
1199  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1200  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1201 
1202  //
1203  // When affinity is off, this routine will still be called to set
1204  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1205  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1206  // correctly, and return now if affinity is not enabled.
1207  //
1208  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1209  __kmp_ncores = nCores;
1210  if (__kmp_affinity_verbose) {
1211  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1212  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1213 
1214  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1215  if (__kmp_affinity_respect_mask) {
1216  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1217  } else {
1218  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1219  }
1220  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1221  if (__kmp_affinity_uniform_topology()) {
1222  KMP_INFORM(Uniform, "KMP_AFFINITY");
1223  } else {
1224  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1225  }
1226  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1227  __kmp_nThreadsPerCore, __kmp_ncores);
1228 
1229  }
1230 
1231  if (__kmp_affinity_type == affinity_none) {
1232  __kmp_free(threadInfo);
1233  KMP_CPU_FREE(oldMask);
1234  return 0;
1235  }
1236 
1237  //
1238  // Now that we've determined the number of packages, the number of cores
1239  // per package, and the number of threads per core, we can construct the
1240  // data structure that is to be returned.
1241  //
1242  int pkgLevel = 0;
1243  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1244  int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1245  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1246 
1247  KMP_ASSERT(depth > 0);
1248  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1249 
1250  for (i = 0; i < nApics; ++i) {
1251  Address addr(depth);
1252  unsigned os = threadInfo[i].osId;
1253  int d = 0;
1254 
1255  if (pkgLevel >= 0) {
1256  addr.labels[d++] = threadInfo[i].pkgId;
1257  }
1258  if (coreLevel >= 0) {
1259  addr.labels[d++] = threadInfo[i].coreId;
1260  }
1261  if (threadLevel >= 0) {
1262  addr.labels[d++] = threadInfo[i].threadId;
1263  }
1264  (*address2os)[i] = AddrUnsPair(addr, os);
1265  }
1266 
1267  if (__kmp_affinity_gran_levels < 0) {
1268  //
1269  // Set the granularity level based on what levels are modeled
1270  // in the machine topology map.
1271  //
1272  __kmp_affinity_gran_levels = 0;
1273  if ((threadLevel >= 0)
1274  && (__kmp_affinity_gran > affinity_gran_thread)) {
1275  __kmp_affinity_gran_levels++;
1276  }
1277  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1278  __kmp_affinity_gran_levels++;
1279  }
1280  if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1281  __kmp_affinity_gran_levels++;
1282  }
1283  }
1284 
1285  if (__kmp_affinity_verbose) {
1286  __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1287  coreLevel, threadLevel);
1288  }
1289 
1290  __kmp_free(threadInfo);
1291  KMP_CPU_FREE(oldMask);
1292  return depth;
1293 }
1294 
1295 
1296 //
1297 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1298 // architectures support a newer interface for specifying the x2APIC Ids,
1299 // based on cpuid leaf 11.
1300 //
1301 static int
1302 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1303  kmp_i18n_id_t *const msg_id)
1304 {
1305  kmp_cpuid buf;
1306 
1307  *address2os = NULL;
1308  *msg_id = kmp_i18n_null;
1309 
1310  //
1311  // Check to see if cpuid leaf 11 is supported.
1312  //
1313  __kmp_x86_cpuid(0, 0, &buf);
1314  if (buf.eax < 11) {
1315  *msg_id = kmp_i18n_str_NoLeaf11Support;
1316  return -1;
1317  }
1318  __kmp_x86_cpuid(11, 0, &buf);
1319  if (buf.ebx == 0) {
1320  *msg_id = kmp_i18n_str_NoLeaf11Support;
1321  return -1;
1322  }
1323 
1324  //
1325  // Find the number of levels in the machine topology. While we're at it,
1326  // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1327  // try to get more accurate values later by explicitly counting them,
1328  // but get reasonable defaults now, in case we return early.
1329  //
1330  int level;
1331  int threadLevel = -1;
1332  int coreLevel = -1;
1333  int pkgLevel = -1;
1334  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1335 
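 //
 // For reference (summarized here, not in the original source): cpuid leaf 11
 // reports, per sub-leaf, eax[4:0] = bit shift of the next topology level in
 // the x2APIC id, ebx[15:0] = number of logical processors at this level,
 // ecx[15:8] = level type (1 = SMT, 2 = core), edx = the thread's x2APIC id;
 // the loop below consumes exactly these fields.
 //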
1336  for (level = 0;; level++) {
1337  if (level > 31) {
1338  //
1339  // FIXME: Hack for DPD200163180
1340  //
1341  // If level is big then something went wrong -> exiting
1342  //
1343  // There could actually be 32 valid levels in the machine topology,
1344  // but so far, the only machine we have seen which does not exit
1345  // this loop before iteration 32 has fubar x2APIC settings.
1346  //
1347  // For now, just reject this case based upon loop trip count.
1348  //
1349  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1350  return -1;
1351  }
1352  __kmp_x86_cpuid(11, level, &buf);
1353  if (buf.ebx == 0) {
1354  if (pkgLevel < 0) {
1355  //
1356  // Will infer nPackages from __kmp_xproc
1357  //
1358  pkgLevel = level;
1359  level++;
1360  }
1361  break;
1362  }
1363  int kind = (buf.ecx >> 8) & 0xff;
1364  if (kind == 1) {
1365  //
1366  // SMT level
1367  //
1368  threadLevel = level;
1369  coreLevel = -1;
1370  pkgLevel = -1;
1371  __kmp_nThreadsPerCore = buf.ebx & 0xff;
1372  if (__kmp_nThreadsPerCore == 0) {
1373  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1374  return -1;
1375  }
1376  }
1377  else if (kind == 2) {
1378  //
1379  // core level
1380  //
1381  coreLevel = level;
1382  pkgLevel = -1;
1383  nCoresPerPkg = buf.ebx & 0xff;
1384  if (nCoresPerPkg == 0) {
1385  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1386  return -1;
1387  }
1388  }
1389  else {
1390  if (level <= 0) {
1391  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1392  return -1;
1393  }
1394  if (pkgLevel >= 0) {
1395  continue;
1396  }
1397  pkgLevel = level;
1398  nPackages = buf.ebx & 0xff;
1399  if (nPackages == 0) {
1400  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1401  return -1;
1402  }
1403  }
1404  }
1405  int depth = level;
1406 
1407  //
1408  // In the above loop, "level" was counted from the finest level (usually
1409  // thread) to the coarsest. The caller expects that we will place the
1410  // labels in (*address2os)[].first.labels[] in the inverse order, so
1411  // we need to invert the vars saying which level means what.
1412  //
1413  if (threadLevel >= 0) {
1414  threadLevel = depth - threadLevel - 1;
1415  }
1416  if (coreLevel >= 0) {
1417  coreLevel = depth - coreLevel - 1;
1418  }
1419  KMP_DEBUG_ASSERT(pkgLevel >= 0);
1420  pkgLevel = depth - pkgLevel - 1;
1421 
1422  //
1423  // The algorithm used starts by setting the affinity to each available
1424  // thread and retrieving info from the cpuid instruction, so if we are not
1425  // capable of calling __kmp_get_system_affinity()/__kmp_set_system_affinity(),
1426  // then we need to do something else - use the defaults that we calculated
1427  // from issuing cpuid without binding to each proc.
1428  //
1429  if (! KMP_AFFINITY_CAPABLE())
1430  {
1431  //
1432  // Hack to try and infer the machine topology using only the data
1433  // available from cpuid on the current thread, and __kmp_xproc.
1434  //
1435  KMP_ASSERT(__kmp_affinity_type == affinity_none);
1436 
1437  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1438  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1439  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1440  if (__kmp_affinity_verbose) {
1441  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1442  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1443  if (__kmp_affinity_uniform_topology()) {
1444  KMP_INFORM(Uniform, "KMP_AFFINITY");
1445  } else {
1446  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1447  }
1448  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1449  __kmp_nThreadsPerCore, __kmp_ncores);
1450  }
1451  return 0;
1452  }
1453 
1454  //
1455  //
1456  // From here on, we can assume that it is safe to call
1457  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1458  // even if __kmp_affinity_type = affinity_none.
1459  //
1460 
1461  //
1462  // Save the affinity mask for the current thread.
1463  //
1464  kmp_affin_mask_t *oldMask;
1465  KMP_CPU_ALLOC(oldMask);
1466  __kmp_get_system_affinity(oldMask, TRUE);
1467 
1468  //
1469  // Allocate the data structure to be returned.
1470  //
1471  AddrUnsPair *retval = (AddrUnsPair *)
1472  __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1473 
1474  //
1475  // Run through each of the available contexts, binding the current thread
1476  // to it, and obtaining the pertinent information using the cpuid instr.
1477  //
1478  unsigned int proc;
1479  int nApics = 0;
1480  for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1481  //
1482  // Skip this proc if it is not included in the machine model.
1483  //
1484  if (! KMP_CPU_ISSET(proc, fullMask)) {
1485  continue;
1486  }
1487  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1488 
1489  __kmp_affinity_bind_thread(proc);
1490 
1491  //
1492  // Extract the labels for each level in the machine topology map
1493  // from the Apic ID.
1494  //
1495  Address addr(depth);
1496  int prev_shift = 0;
1497 
1498  for (level = 0; level < depth; level++) {
1499  __kmp_x86_cpuid(11, level, &buf);
1500  unsigned apicId = buf.edx;
1501  if (buf.ebx == 0) {
1502  if (level != depth - 1) {
1503  KMP_CPU_FREE(oldMask);
1504  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1505  return -1;
1506  }
1507  addr.labels[depth - level - 1] = apicId >> prev_shift;
1508  level++;
1509  break;
1510  }
1511  int shift = buf.eax & 0x1f;
1512  int mask = (1 << shift) - 1;
1513  addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1514  prev_shift = shift;
1515  }
1516  if (level != depth) {
1517  KMP_CPU_FREE(oldMask);
1518  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1519  return -1;
1520  }
1521 
1522  retval[nApics] = AddrUnsPair(addr, proc);
1523  nApics++;
1524  }
1525 
1526  //
1527  // We've collected all the info we need.
1528  // Restore the old affinity mask for this thread.
1529  //
1530  __kmp_set_system_affinity(oldMask, TRUE);
1531 
1532  //
1533  // If there's only one thread context to bind to, return now.
1534  //
1535  KMP_ASSERT(nApics > 0);
1536  if (nApics == 1) {
1537  __kmp_ncores = nPackages = 1;
1538  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1539  __kmp_ht_enabled = FALSE;
1540  if (__kmp_affinity_verbose) {
1541  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1542  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1543 
1544  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1545  if (__kmp_affinity_respect_mask) {
1546  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1547  } else {
1548  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1549  }
1550  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1551  KMP_INFORM(Uniform, "KMP_AFFINITY");
1552  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1553  __kmp_nThreadsPerCore, __kmp_ncores);
1554  }
1555 
1556  if (__kmp_affinity_type == affinity_none) {
1557  __kmp_free(retval);
1558  KMP_CPU_FREE(oldMask);
1559  return 0;
1560  }
1561 
1562  //
1563  // Form an Address object which only includes the package level.
1564  //
1565  Address addr(1);
1566  addr.labels[0] = retval[0].first.labels[pkgLevel];
1567  retval[0].first = addr;
1568 
1569  if (__kmp_affinity_gran_levels < 0) {
1570  __kmp_affinity_gran_levels = 0;
1571  }
1572 
1573  if (__kmp_affinity_verbose) {
1574  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1575  }
1576 
1577  *address2os = retval;
1578  KMP_CPU_FREE(oldMask);
1579  return 1;
1580  }
1581 
1582  //
1583  // Sort the table by physical Id.
1584  //
1585  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1586 
1587  //
1588  // Find the radix at each of the levels.
1589  //
1590  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1591  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1592  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1593  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1594  for (level = 0; level < depth; level++) {
1595  totals[level] = 1;
1596  maxCt[level] = 1;
1597  counts[level] = 1;
1598  last[level] = retval[0].first.labels[level];
1599  }
1600 
1601  //
1602  // From here on, the iteration variable "level" runs from the finest
1603  // level to the coarsest, i.e. we iterate forward through
1604  // (*address2os)[].first.labels[] - in the previous loops, we iterated
1605  // backwards.
1606  //
1607  for (proc = 1; (int)proc < nApics; proc++) {
1608  int level;
1609  for (level = 0; level < depth; level++) {
1610  if (retval[proc].first.labels[level] != last[level]) {
1611  int j;
1612  for (j = level + 1; j < depth; j++) {
1613  totals[j]++;
1614  counts[j] = 1;
1615  // The line below causes incorrect topology information to be printed
1616  // when the max value for some level (maxCt[level]) is encountered
1617  // earlier than a smaller value while going through the array.
1618  // For example, if pkg0 has 4 cores and pkg1 has 2 cores, then maxCt[1] == 2
1619  // whereas it should be 4.
1620  // TODO!!! Check if it can be commented safely
1621  //maxCt[j] = 1;
1622  last[j] = retval[proc].first.labels[j];
1623  }
1624  totals[level]++;
1625  counts[level]++;
1626  if (counts[level] > maxCt[level]) {
1627  maxCt[level] = counts[level];
1628  }
1629  last[level] = retval[proc].first.labels[level];
1630  break;
1631  }
1632  else if (level == depth - 1) {
1633  __kmp_free(last);
1634  __kmp_free(maxCt);
1635  __kmp_free(counts);
1636  __kmp_free(totals);
1637  __kmp_free(retval);
1638  KMP_CPU_FREE(oldMask);
1639  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1640  return -1;
1641  }
1642  }
1643  }
1644 
1645  //
1646  // When affinity is off, this routine will still be called to set
1647  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1648  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1649  // correctly, and return if affinity is not enabled.
1650  //
1651  if (threadLevel >= 0) {
1652  __kmp_nThreadsPerCore = maxCt[threadLevel];
1653  }
1654  else {
1655  __kmp_nThreadsPerCore = 1;
1656  }
1657  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1658 
1659  nPackages = totals[pkgLevel];
1660 
1661  if (coreLevel >= 0) {
1662  __kmp_ncores = totals[coreLevel];
1663  nCoresPerPkg = maxCt[coreLevel];
1664  }
1665  else {
1666  __kmp_ncores = nPackages;
1667  nCoresPerPkg = 1;
1668  }
1669 
1670  //
1671  // Check to see if the machine topology is uniform
1672  //
1673  unsigned prod = maxCt[0];
1674  for (level = 1; level < depth; level++) {
1675  prod *= maxCt[level];
1676  }
1677  bool uniform = (prod == totals[level - 1]);
1678 
1679  //
1680  // Print the machine topology summary.
1681  //
1682  if (__kmp_affinity_verbose) {
1683  char mask[KMP_AFFIN_MASK_PRINT_LEN];
1684  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1685 
1686  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1687  if (__kmp_affinity_respect_mask) {
1688  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1689  } else {
1690  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1691  }
1692  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1693  if (uniform) {
1694  KMP_INFORM(Uniform, "KMP_AFFINITY");
1695  } else {
1696  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1697  }
1698 
1699  kmp_str_buf_t buf;
1700  __kmp_str_buf_init(&buf);
1701 
1702  __kmp_str_buf_print(&buf, "%d", totals[0]);
1703  for (level = 1; level <= pkgLevel; level++) {
1704  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1705  }
1706  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1707  __kmp_nThreadsPerCore, __kmp_ncores);
1708 
1709  __kmp_str_buf_free(&buf);
1710  }
1711 
1712  if (__kmp_affinity_type == affinity_none) {
1713  __kmp_free(last);
1714  __kmp_free(maxCt);
1715  __kmp_free(counts);
1716  __kmp_free(totals);
1717  __kmp_free(retval);
1718  KMP_CPU_FREE(oldMask);
1719  return 0;
1720  }
1721 
1722  //
1723  // Find any levels with radix 1, and remove them from the map
1724  // (except for the package level).
1725  //
1726  int new_depth = 0;
1727  for (level = 0; level < depth; level++) {
1728  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1729  continue;
1730  }
1731  new_depth++;
1732  }
1733 
1734  //
1735  // If we are removing any levels, allocate a new vector to return,
1736  // and copy the relevant information to it.
1737  //
1738  if (new_depth != depth) {
1739  AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1740  sizeof(AddrUnsPair) * nApics);
1741  for (proc = 0; (int)proc < nApics; proc++) {
1742  Address addr(new_depth);
1743  new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1744  }
1745  int new_level = 0;
1746  for (level = 0; level < depth; level++) {
1747  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1748  if (level == threadLevel) {
1749  threadLevel = -1;
1750  }
1751  else if ((threadLevel >= 0) && (level < threadLevel)) {
1752  threadLevel--;
1753  }
1754  if (level == coreLevel) {
1755  coreLevel = -1;
1756  }
1757  else if ((coreLevel >= 0) && (level < coreLevel)) {
1758  coreLevel--;
1759  }
1760  if (level < pkgLevel) {
1761  pkgLevel--;
1762  }
1763  continue;
1764  }
1765  for (proc = 0; (int)proc < nApics; proc++) {
1766  new_retval[proc].first.labels[new_level]
1767  = retval[proc].first.labels[level];
1768  }
1769  new_level++;
1770  }
1771 
1772  __kmp_free(retval);
1773  retval = new_retval;
1774  depth = new_depth;
1775  }
1776 
1777  if (__kmp_affinity_gran_levels < 0) {
1778  //
1779  // Set the granularity level based on what levels are modeled
1780  // in the machine topology map.
1781  //
1782  __kmp_affinity_gran_levels = 0;
1783  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1784  __kmp_affinity_gran_levels++;
1785  }
1786  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1787  __kmp_affinity_gran_levels++;
1788  }
1789  if (__kmp_affinity_gran > affinity_gran_package) {
1790  __kmp_affinity_gran_levels++;
1791  }
1792  }
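 //
 // For example, with a three-level package/core/thread map and
 // granularity=core, only the thread level lies below the granularity,
 // so __kmp_affinity_gran_levels ends up as 1 and thread-level labels
 // are ignored when the masks are formed.
 //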
1793 
1794  if (__kmp_affinity_verbose) {
1795  __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1796  coreLevel, threadLevel);
1797  }
1798 
1799  __kmp_free(last);
1800  __kmp_free(maxCt);
1801  __kmp_free(counts);
1802  __kmp_free(totals);
1803  KMP_CPU_FREE(oldMask);
1804  *address2os = retval;
1805  return depth;
1806 }
1807 
1808 
1809 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1810 
1811 
1812 #define osIdIndex 0
1813 #define threadIdIndex 1
1814 #define coreIdIndex 2
1815 #define pkgIdIndex 3
1816 #define nodeIdIndex 4
1817 
1818 typedef unsigned *ProcCpuInfo;
1819 static unsigned maxIndex = pkgIdIndex;
1820 
1821 
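//
// qsort() comparators for the raw cpuinfo records: the first orders
// records by OS processor id; the second orders them by physical ids,
// most-significant field (highest index) first, with the OS id as the
// final tie-breaker.
//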
1822 static int
1823 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1824 {
1825  const unsigned *aa = (const unsigned *)a;
1826  const unsigned *bb = (const unsigned *)b;
1827  if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1828  if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1829  return 0;
1830 }
1831 
1832 
1833 static int
1834 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1835 {
1836  unsigned i;
1837  const unsigned *aa = *((const unsigned **)a);
1838  const unsigned *bb = *((const unsigned **)b);
1839  for (i = maxIndex; ; i--) {
1840  if (aa[i] < bb[i]) return -1;
1841  if (aa[i] > bb[i]) return 1;
1842  if (i == osIdIndex) break;
1843  }
1844  return 0;
1845 }
1846 
1847 
1848 //
1849 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1850 // affinity map.
1851 //
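//
// For reference, a record in this format typically looks something like
// the following (the exact set of fields varies by platform and kernel):
//
//     processor       : 0
//     physical id     : 0
//     core id         : 0
//
// with a blank line terminating each processor record.
//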
1852 static int
1853 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1854  kmp_i18n_id_t *const msg_id, FILE *f)
1855 {
1856  *address2os = NULL;
1857  *msg_id = kmp_i18n_null;
1858 
1859  //
1860  // Scan the file once, counting the number of "processor" (osId) fields,
1861  // and finding the highest value of <n> in any node_<n> field.
1862  //
1863  char buf[256];
1864  unsigned num_records = 0;
1865  while (! feof(f)) {
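 //
 // Plant a sentinel in the last byte of the buffer. fgets() overwrites
 // it with '\0' only when a line fills the entire buffer; the second
 // pass below uses the same trick to detect over-long lines.
 //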
1866  buf[sizeof(buf) - 1] = 1;
1867  if (! fgets(buf, sizeof(buf), f)) {
1868  //
1869  // Read errors presumably because of EOF
1870  //
1871  break;
1872  }
1873 
1874  char s1[] = "processor";
1875  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1876  num_records++;
1877  continue;
1878  }
1879 
1880  //
1881  // FIXME - this will match "node_<n> <garbage>"
1882  //
1883  unsigned level;
1884  if (sscanf(buf, "node_%d id", &level) == 1) {
1885  if (nodeIdIndex + level >= maxIndex) {
1886  maxIndex = nodeIdIndex + level;
1887  }
1888  continue;
1889  }
1890  }
1891 
1892  //
1893  // Check for empty file / no valid processor records, or too many.
1894  // The number of records can't exceed the number of valid bits in the
1895  // affinity mask.
1896  //
1897  if (num_records == 0) {
1898  *line = 0;
1899  *msg_id = kmp_i18n_str_NoProcRecords;
1900  return -1;
1901  }
1902  if (num_records > (unsigned)__kmp_xproc) {
1903  *line = 0;
1904  *msg_id = kmp_i18n_str_TooManyProcRecords;
1905  return -1;
1906  }
1907 
1908  //
1909  // Set the file pointer back to the beginning, so that we can scan the
1910  // file again, this time performing a full parse of the data.
1911  // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1912  // Adding an extra element at the end allows us to remove a lot of extra
1913  // checks for termination conditions.
1914  //
1915  if (fseek(f, 0, SEEK_SET) != 0) {
1916  *line = 0;
1917  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1918  return -1;
1919  }
1920 
1921  //
1922  // Allocate the array of records to store the proc info in. The dummy
1923  // element at the end makes the logic in filling them out easier to code.
1924  //
1925  unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1926  * sizeof(unsigned *));
1927  unsigned i;
1928  for (i = 0; i <= num_records; i++) {
1929  threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1930  * sizeof(unsigned));
1931  }
1932 
1933 #define CLEANUP_THREAD_INFO \
1934  for (i = 0; i <= num_records; i++) { \
1935  __kmp_free(threadInfo[i]); \
1936  } \
1937  __kmp_free(threadInfo);
1938 
1939  //
1940  // A value of UINT_MAX means that we didn't find the field
1941  //
1942  unsigned __index;
1943 
1944 #define INIT_PROC_INFO(p) \
1945  for (__index = 0; __index <= maxIndex; __index++) { \
1946  (p)[__index] = UINT_MAX; \
1947  }
1948 
1949  for (i = 0; i <= num_records; i++) {
1950  INIT_PROC_INFO(threadInfo[i]);
1951  }
1952 
1953  unsigned num_avail = 0;
1954  *line = 0;
1955  while (! feof(f)) {
1956  //
1957  // Create an inner scoping level, so that all the goto targets at the
1958  // end of the loop appear in an outer scoping level. This avoids
1959  // warnings about jumping past an initialization to a target in the
1960  // same block.
1961  //
1962  {
1963  buf[sizeof(buf) - 1] = 1;
1964  bool long_line = false;
1965  if (! fgets(buf, sizeof(buf), f)) {
1966  //
1967  // Read errors presumably because of EOF
1968  //
1969  // If there is valid data in threadInfo[num_avail], then fake
1970  // a blank line to ensure that the last address gets parsed.
1971  //
1972  bool valid = false;
1973  for (i = 0; i <= maxIndex; i++) {
1974  if (threadInfo[num_avail][i] != UINT_MAX) {
1975  valid = true;
1976  }
1977  }
1978  if (! valid) {
1979  break;
1980  }
1981  buf[0] = 0;
1982  } else if (!buf[sizeof(buf) - 1]) {
1983  //
1984  // The line is longer than the buffer. Set a flag and don't
1985  // emit an error if we were going to ignore the line, anyway.
1986  //
1987  long_line = true;
1988 
1989 #define CHECK_LINE \
1990  if (long_line) { \
1991  CLEANUP_THREAD_INFO; \
1992  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
1993  return -1; \
1994  }
1995  }
1996  (*line)++;
1997 
1998  char s1[] = "processor";
1999  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2000  CHECK_LINE;
2001  char *p = strchr(buf + sizeof(s1) - 1, ':');
2002  unsigned val;
2003  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2004  if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2005  threadInfo[num_avail][osIdIndex] = val;
2006 #if KMP_OS_LINUX && USE_SYSFS_INFO
2007  char path[256];
2008  snprintf(path, sizeof(path),
2009  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2010  threadInfo[num_avail][osIdIndex]);
2011  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2012 
2013  snprintf(path, sizeof(path),
2014  "/sys/devices/system/cpu/cpu%u/topology/core_id",
2015  threadInfo[num_avail][osIdIndex]);
2016  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2017  continue;
2018 #else
2019  }
2020  char s2[] = "physical id";
2021  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2022  CHECK_LINE;
2023  char *p = strchr(buf + sizeof(s2) - 1, ':');
2024  unsigned val;
2025  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2026  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2027  threadInfo[num_avail][pkgIdIndex] = val;
2028  continue;
2029  }
2030  char s3[] = "core id";
2031  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2032  CHECK_LINE;
2033  char *p = strchr(buf + sizeof(s3) - 1, ':');
2034  unsigned val;
2035  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2036  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2037  threadInfo[num_avail][coreIdIndex] = val;
2038  continue;
2039 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2040  }
2041  char s4[] = "thread id";
2042  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2043  CHECK_LINE;
2044  char *p = strchr(buf + sizeof(s4) - 1, ':');
2045  unsigned val;
2046  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2047  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2048  threadInfo[num_avail][threadIdIndex] = val;
2049  continue;
2050  }
2051  unsigned level;
2052  if (sscanf(buf, "node_%d id", &level) == 1) {
2053  CHECK_LINE;
2054  char *p = strchr(buf + sizeof(s4) - 1, ':');
2055  unsigned val;
2056  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
2057  KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2058  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2059  threadInfo[num_avail][nodeIdIndex + level] = val;
2060  continue;
2061  }
2062 
2063  //
2064  // We didn't recognize the leading token on the line.
2065  // There are lots of leading tokens that we don't recognize -
2066  // if the line isn't empty, go on to the next line.
2067  //
2068  if ((*buf != 0) && (*buf != '\n')) {
2069  //
2070  // If the line is longer than the buffer, read characters
2071  // until we find a newline.
2072  //
2073  if (long_line) {
2074  int ch;
2075  while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2076  }
2077  continue;
2078  }
2079 
2080  //
2081  // A newline has signalled the end of the processor record.
2082  // Check that there aren't too many procs specified.
2083  //
2084  if ((int)num_avail == __kmp_xproc) {
2085  CLEANUP_THREAD_INFO;
2086  *msg_id = kmp_i18n_str_TooManyEntries;
2087  return -1;
2088  }
2089 
2090  //
2091  // Check for missing fields. The osId field must be there, and we
2092  // currently require that the physical id field is specified, also.
2093  //
2094  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2095  CLEANUP_THREAD_INFO;
2096  *msg_id = kmp_i18n_str_MissingProcField;
2097  return -1;
2098  }
2099  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2100  CLEANUP_THREAD_INFO;
2101  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2102  return -1;
2103  }
2104 
2105  //
2106  // Skip this proc if it is not included in the machine model.
2107  //
2108  if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2109  INIT_PROC_INFO(threadInfo[num_avail]);
2110  continue;
2111  }
2112 
2113  //
2114  // We have a successful parse of this proc's info.
2115  // Increment the counter, and prepare for the next proc.
2116  //
2117  num_avail++;
2118  KMP_ASSERT(num_avail <= num_records);
2119  INIT_PROC_INFO(threadInfo[num_avail]);
2120  }
2121  continue;
2122 
2123  no_val:
2124  CLEANUP_THREAD_INFO;
2125  *msg_id = kmp_i18n_str_MissingValCpuinfo;
2126  return -1;
2127 
2128  dup_field:
2129  CLEANUP_THREAD_INFO;
2130  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2131  return -1;
2132  }
2133  *line = 0;
2134 
2135 # if KMP_MIC && REDUCE_TEAM_SIZE
2136  unsigned teamSize = 0;
2137 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2138 
2139  // check for num_records == __kmp_xproc ???
2140 
2141  //
2142  // If there's only one thread context to bind to, form an Address object
2143  // with depth 1 and return immediately (or, if affinity is off, set
2144  // address2os to NULL and return).
2145  //
2146  // If it is configured to omit the package level when there is only a
2147  // single package, the logic at the end of this routine won't work if
2148  // there is only a single thread - it would try to form an Address
2149  // object with depth 0.
2150  //
2151  KMP_ASSERT(num_avail > 0);
2152  KMP_ASSERT(num_avail <= num_records);
2153  if (num_avail == 1) {
2154  __kmp_ncores = 1;
2155  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2156  __kmp_ht_enabled = FALSE;
2157  if (__kmp_affinity_verbose) {
2158  if (! KMP_AFFINITY_CAPABLE()) {
2159  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2160  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2161  KMP_INFORM(Uniform, "KMP_AFFINITY");
2162  }
2163  else {
2164  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2165  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2166  fullMask);
2167  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2168  if (__kmp_affinity_respect_mask) {
2169  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2170  } else {
2171  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2172  }
2173  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2174  KMP_INFORM(Uniform, "KMP_AFFINITY");
2175  }
2176  int index;
2177  kmp_str_buf_t buf;
2178  __kmp_str_buf_init(&buf);
2179  __kmp_str_buf_print(&buf, "1");
2180  for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2181  __kmp_str_buf_print(&buf, " x 1");
2182  }
2183  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2184  __kmp_str_buf_free(&buf);
2185  }
2186 
2187  if (__kmp_affinity_type == affinity_none) {
2188  CLEANUP_THREAD_INFO;
2189  return 0;
2190  }
2191 
2192  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2193  Address addr(1);
2194  addr.labels[0] = threadInfo[0][pkgIdIndex];
2195  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2196 
2197  if (__kmp_affinity_gran_levels < 0) {
2198  __kmp_affinity_gran_levels = 0;
2199  }
2200 
2201  if (__kmp_affinity_verbose) {
2202  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2203  }
2204 
2205  CLEANUP_THREAD_INFO;
2206  return 1;
2207  }
2208 
2209  //
2210  // Sort the threadInfo table by physical Id.
2211  //
2212  qsort(threadInfo, num_avail, sizeof(*threadInfo),
2213  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2214 
2215  //
2216  // The table is now sorted by pkgId / coreId / threadId, but we really
2217  // don't know the radix of any of the fields. pkgId's may be sparsely
2218  // assigned among the chips on a system. Although coreId's are usually
2219  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2220  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2221  //
2222  // For that matter, we don't know what coresPerPkg and threadsPerCore
2223  // (or the total # packages) are at this point - we want to determine
2224  // that now. We only have an upper bound on the first two figures.
2225  //
2226  unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2227  * sizeof(unsigned));
2228  unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2229  * sizeof(unsigned));
2230  unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2231  * sizeof(unsigned));
2232  unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2233  * sizeof(unsigned));
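 //
 // counts[i] - #distinct level-i ids seen under the current parent
 // maxCt[i]  - the maximum such count over all parents at level i
 // totals[i] - total #distinct nodes seen at level i
 // lastId[i] - the level-i id of the previously processed record
 //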
2234 
2235  bool assign_thread_ids = false;
2236  unsigned threadIdCt;
2237  unsigned index;
2238 
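 //
 // Walk the sorted records, counting the distinct ids at each level.
 // If two records agree at every level (duplicate physical ids) and no
 // explicit thread ids were supplied, restart the scan with
 // assign_thread_ids = true and synthesize the thread ids; if explicit
 // thread ids were supplied, such duplicates are an error.
 //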
2239  restart_radix_check:
2240  threadIdCt = 0;
2241 
2242  //
2243  // Initialize the counter arrays with data from threadInfo[0].
2244  //
2245  if (assign_thread_ids) {
2246  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2247  threadInfo[0][threadIdIndex] = threadIdCt++;
2248  }
2249  else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2250  threadIdCt = threadInfo[0][threadIdIndex] + 1;
2251  }
2252  }
2253  for (index = 0; index <= maxIndex; index++) {
2254  counts[index] = 1;
2255  maxCt[index] = 1;
2256  totals[index] = 1;
2257  lastId[index] = threadInfo[0][index];
2258  }
2259 
2260  //
2261  // Run through the rest of the OS procs.
2262  //
2263  for (i = 1; i < num_avail; i++) {
2264  //
2265  // Find the most significant index whose id differs
2266  // from the id for the previous OS proc.
2267  //
2268  for (index = maxIndex; index >= threadIdIndex; index--) {
2269  if (assign_thread_ids && (index == threadIdIndex)) {
2270  //
2271  // Auto-assign the thread id field if it wasn't specified.
2272  //
2273  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2274  threadInfo[i][threadIdIndex] = threadIdCt++;
2275  }
2276 
2277  //
2278  // Apparently the thread id field was specified for some
2279  // entries and not others. Start the thread id counter
2280  // off at the next higher thread id.
2281  //
2282  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2283  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2284  }
2285  }
2286  if (threadInfo[i][index] != lastId[index]) {
2287  //
2288  // Run through all indices which are less significant,
2289  // and reset the counts to 1.
2290  //
2291  // At all levels up to and including index, we need to
2292  // increment the totals and record the last id.
2293  //
2294  unsigned index2;
2295  for (index2 = threadIdIndex; index2 < index; index2++) {
2296  totals[index2]++;
2297  if (counts[index2] > maxCt[index2]) {
2298  maxCt[index2] = counts[index2];
2299  }
2300  counts[index2] = 1;
2301  lastId[index2] = threadInfo[i][index2];
2302  }
2303  counts[index]++;
2304  totals[index]++;
2305  lastId[index] = threadInfo[i][index];
2306 
2307  if (assign_thread_ids && (index > threadIdIndex)) {
2308 
2309 # if KMP_MIC && REDUCE_TEAM_SIZE
2310  //
2311  // The default team size is the total #threads in the machine
2312  // minus 1 thread for every core that has 3 or more threads.
2313  //
2314  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2315 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2316 
2317  //
2318  // Restart the thread counter, as we are on a new core.
2319  //
2320  threadIdCt = 0;
2321 
2322  //
2323  // Auto-assign the thread id field if it wasn't specified.
2324  //
2325  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2326  threadInfo[i][threadIdIndex] = threadIdCt++;
2327  }
2328 
2329  //
2330  // Apparently the thread id field was specified for some
2331  // entries and not others. Start the thread id counter
2332  // off at the next higher thread id.
2333  //
2334  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2335  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2336  }
2337  }
2338  break;
2339  }
2340  }
2341  if (index < threadIdIndex) {
2342  //
2343  // If thread ids were specified, it is an error if they are not
2344  // unique. Also, check that we haven't already restarted the
2345  // loop (to be safe - shouldn't need to).
2346  //
2347  if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2348  || assign_thread_ids) {
2349  __kmp_free(lastId);
2350  __kmp_free(totals);
2351  __kmp_free(maxCt);
2352  __kmp_free(counts);
2353  CLEANUP_THREAD_INFO;
2354  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2355  return -1;
2356  }
2357 
2358  //
2359  // If the thread ids were not specified and we see entries
2360  // that are duplicates, start the loop over and
2361  // assign the thread ids manually.
2362  //
2363  assign_thread_ids = true;
2364  goto restart_radix_check;
2365  }
2366  }
2367 
2368 # if KMP_MIC && REDUCE_TEAM_SIZE
2369  //
2370  // The default team size is the total #threads in the machine
2371  // minus 1 thread for every core that has 3 or more threads.
2372  //
2373  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
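 //
 // For example, on a hypothetical 60-core part with 4 HW threads per
 // core, each core contributes 4 - 1 = 3, giving teamSize = 60 * 3 = 180.
 //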
2374 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2375 
2376  for (index = threadIdIndex; index <= maxIndex; index++) {
2377  if (counts[index] > maxCt[index]) {
2378  maxCt[index] = counts[index];
2379  }
2380  }
2381 
2382  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2383  nCoresPerPkg = maxCt[coreIdIndex];
2384  nPackages = totals[pkgIdIndex];
2385 
2386  //
2387  // Check to see if the machine topology is uniform
2388  //
2389  unsigned prod = totals[maxIndex];
2390  for (index = threadIdIndex; index < maxIndex; index++) {
2391  prod *= maxCt[index];
2392  }
2393  bool uniform = (prod == totals[threadIdIndex]);
2394 
2395  //
2396  // When affinity is off, this routine will still be called to set
2397  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
2398  // nCoresPerPkg, & nPackages. Make sure all these vars are set
2399  // correctly, and return now if affinity is not enabled.
2400  //
2401  __kmp_ht_enabled = (maxCt[threadIdIndex] > 1); // threads per core > 1
2402  __kmp_ncores = totals[coreIdIndex];
2403 
2404  if (__kmp_affinity_verbose) {
2405  if (! KMP_AFFINITY_CAPABLE()) {
2406  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2407  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2408  if (uniform) {
2409  KMP_INFORM(Uniform, "KMP_AFFINITY");
2410  } else {
2411  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2412  }
2413  }
2414  else {
2415  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2416  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2417  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2418  if (__kmp_affinity_respect_mask) {
2419  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2420  } else {
2421  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2422  }
2423  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2424  if (uniform) {
2425  KMP_INFORM(Uniform, "KMP_AFFINITY");
2426  } else {
2427  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2428  }
2429  }
2430  kmp_str_buf_t buf;
2431  __kmp_str_buf_init(&buf);
2432 
2433  __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2434  for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2435  __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2436  }
2437  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2438  maxCt[threadIdIndex], __kmp_ncores);
2439 
2440  __kmp_str_buf_free(&buf);
2441  }
2442 
2443 # if KMP_MIC && REDUCE_TEAM_SIZE
2444  //
2445  // Set the default team size.
2446  //
2447  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2448  __kmp_dflt_team_nth = teamSize;
2449  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2450  __kmp_dflt_team_nth));
2451  }
2452 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2453 
2454  if (__kmp_affinity_type == affinity_none) {
2455  __kmp_free(lastId);
2456  __kmp_free(totals);
2457  __kmp_free(maxCt);
2458  __kmp_free(counts);
2459  CLEANUP_THREAD_INFO;
2460  return 0;
2461  }
2462 
2463  //
2464  // Count the number of levels which have more nodes at that level than
2465  // at the parent's level (with there being an implicit root node of
2466  // the top level). This is equivalent to saying that there is at least
2467  // one node at this level which has a sibling. These levels are in the
2468  // map, and the package level is always in the map.
2469  //
2470  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2471  int level = 0;
2472  for (index = threadIdIndex; index < maxIndex; index++) {
2473  KMP_ASSERT(totals[index] >= totals[index + 1]);
2474  inMap[index] = (totals[index] > totals[index + 1]);
2475  }
2476  inMap[maxIndex] = (totals[maxIndex] > 1);
2477  inMap[pkgIdIndex] = true;
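 //
 // For example, on a single-package machine with 4 cores and 2 threads
 // per core, totals[] from the thread level upward is 8, 4, 1, 1, ...,
 // so only the thread, core, and (forced) package levels are kept and
 // the resulting depth is 3.
 //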
2478 
2479  int depth = 0;
2480  for (index = threadIdIndex; index <= maxIndex; index++) {
2481  if (inMap[index]) {
2482  depth++;
2483  }
2484  }
2485  KMP_ASSERT(depth > 0);
2486 
2487  //
2488  // Construct the data structure that is to be returned.
2489  //
2490  *address2os = (AddrUnsPair*)
2491  __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2492  int pkgLevel = -1;
2493  int coreLevel = -1;
2494  int threadLevel = -1;
2495 
2496  for (i = 0; i < num_avail; ++i) {
2497  Address addr(depth);
2498  unsigned os = threadInfo[i][osIdIndex];
2499  int src_index;
2500  int dst_index = 0;
2501 
2502  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2503  if (! inMap[src_index]) {
2504  continue;
2505  }
2506  addr.labels[dst_index] = threadInfo[i][src_index];
2507  if (src_index == pkgIdIndex) {
2508  pkgLevel = dst_index;
2509  }
2510  else if (src_index == coreIdIndex) {
2511  coreLevel = dst_index;
2512  }
2513  else if (src_index == threadIdIndex) {
2514  threadLevel = dst_index;
2515  }
2516  dst_index++;
2517  }
2518  (*address2os)[i] = AddrUnsPair(addr, os);
2519  }
2520 
2521  if (__kmp_affinity_gran_levels < 0) {
2522  //
2523  // Set the granularity level based on what levels are modeled
2524  // in the machine topology map.
2525  //
2526  unsigned src_index;
2527  __kmp_affinity_gran_levels = 0;
2528  for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2529  if (! inMap[src_index]) {
2530  continue;
2531  }
2532  switch (src_index) {
2533  case threadIdIndex:
2534  if (__kmp_affinity_gran > affinity_gran_thread) {
2535  __kmp_affinity_gran_levels++;
2536  }
2537 
2538  break;
2539  case coreIdIndex:
2540  if (__kmp_affinity_gran > affinity_gran_core) {
2541  __kmp_affinity_gran_levels++;
2542  }
2543  break;
2544 
2545  case pkgIdIndex:
2546  if (__kmp_affinity_gran > affinity_gran_package) {
2547  __kmp_affinity_gran_levels++;
2548  }
2549  break;
2550  }
2551  }
2552  }
2553 
2554  if (__kmp_affinity_verbose) {
2555  __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2556  coreLevel, threadLevel);
2557  }
2558 
2559  __kmp_free(inMap);
2560  __kmp_free(lastId);
2561  __kmp_free(totals);
2562  __kmp_free(maxCt);
2563  __kmp_free(counts);
2564  CLEANUP_THREAD_INFO;
2565  return depth;
2566 }
2567 
2568 
2569 //
2570 // Create and return a table of affinity masks, indexed by OS thread ID.
2571 // This routine handles OR'ing together all the affinity masks of threads
2572 // that are sufficiently close, if granularity > fine.
2573 //
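//
// For example, with granularity=core on a machine with 2 HW threads per
// core, both OS procs on a core receive the same mask containing both of
// their bits, and numUnique is the number of cores rather than the
// number of OS procs.
//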
2574 static kmp_affin_mask_t *
2575 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2576  AddrUnsPair *address2os, unsigned numAddrs)
2577 {
2578  //
2579  // First form a table of affinity masks in order of OS thread id.
2580  //
2581  unsigned depth;
2582  unsigned maxOsId;
2583  unsigned i;
2584 
2585  KMP_ASSERT(numAddrs > 0);
2586  depth = address2os[0].first.depth;
2587 
2588  maxOsId = 0;
2589  for (i = 0; i < numAddrs; i++) {
2590  unsigned osId = address2os[i].second;
2591  if (osId > maxOsId) {
2592  maxOsId = osId;
2593  }
2594  }
2595  kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2596  (maxOsId + 1) * __kmp_affin_mask_size);
2597 
2598  //
2599  // Sort the address2os table according to physical order. Doing so
2600  // will put all threads on the same core/package/node in consecutive
2601  // locations.
2602  //
2603  qsort(address2os, numAddrs, sizeof(*address2os),
2604  __kmp_affinity_cmp_Address_labels);
2605 
2606  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2607  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2608  KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2609  }
2610  if (__kmp_affinity_gran_levels >= (int)depth) {
2611  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2612  && (__kmp_affinity_type != affinity_none))) {
2613  KMP_WARNING(AffThreadsMayMigrate);
2614  }
2615  }
2616 
2617  //
2618  // Run through the table, forming the masks for all threads on each
2619  // core. Threads on the same core will have identical "Address"
2620  // objects, not considering the last level, which must be the thread
2621  // id. All threads on a core will appear consecutively.
2622  //
2623  unsigned unique = 0;
2624  unsigned j = 0; // index of 1st thread on core
2625  unsigned leader = 0;
2626  Address *leaderAddr = &(address2os[0].first);
2627  kmp_affin_mask_t *sum
2628  = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2629  KMP_CPU_ZERO(sum);
2630  KMP_CPU_SET(address2os[0].second, sum);
2631  for (i = 1; i < numAddrs; i++) {
2632  //
2633  // If this thread is sufficiently close to the leader (within the
2634  // granularity setting), then set the bit for this os thread in the
2635  // affinity mask for this group, and go on to the next thread.
2636  //
2637  if (leaderAddr->isClose(address2os[i].first,
2638  __kmp_affinity_gran_levels)) {
2639  KMP_CPU_SET(address2os[i].second, sum);
2640  continue;
2641  }
2642 
2643  //
2644  // For every thread in this group, copy the mask to the thread's
2645  // entry in the osId2Mask table. Mark the first address as a
2646  // leader.
2647  //
2648  for (; j < i; j++) {
2649  unsigned osId = address2os[j].second;
2650  KMP_DEBUG_ASSERT(osId <= maxOsId);
2651  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2652  KMP_CPU_COPY(mask, sum);
2653  address2os[j].first.leader = (j == leader);
2654  }
2655  unique++;
2656 
2657  //
2658  // Start a new mask.
2659  //
2660  leader = i;
2661  leaderAddr = &(address2os[i].first);
2662  KMP_CPU_ZERO(sum);
2663  KMP_CPU_SET(address2os[i].second, sum);
2664  }
2665 
2666  //
2667  // For every thread in last group, copy the mask to the thread's
2668  // entry in the osId2Mask table.
2669  //
2670  for (; j < i; j++) {
2671  unsigned osId = address2os[j].second;
2672  KMP_DEBUG_ASSERT(osId <= maxOsId);
2673  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2674  KMP_CPU_COPY(mask, sum);
2675  address2os[j].first.leader = (j == leader);
2676  }
2677  unique++;
2678 
2679  *maxIndex = maxOsId;
2680  *numUnique = unique;
2681  return osId2Mask;
2682 }
2683 
2684 
2685 //
2686 // Stuff for the affinity proclist parsers. It's easier to declare these vars
2687 // as file-static than to try to pass them through the calling sequence of
2688 // the recursive-descent OMP_PLACES parser.
2689 //
2690 static kmp_affin_mask_t *newMasks;
2691 static int numNewMasks;
2692 static int nextNewMask;
2693 
2694 #define ADD_MASK(_mask) \
2695  { \
2696  if (nextNewMask >= numNewMasks) { \
2697  numNewMasks *= 2; \
2698  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2699  numNewMasks * __kmp_affin_mask_size); \
2700  } \
2701  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2702  nextNewMask++; \
2703  }
2704 
2705 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2706  { \
2707  if (((_osId) > _maxOsId) || \
2708  (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2709  if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2710  && (__kmp_affinity_type != affinity_none))) { \
2711  KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2712  } \
2713  } \
2714  else { \
2715  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2716  } \
2717  }
2718 
2719 
2720 //
2721 // Re-parse the proclist (for the explicit affinity type), and form the list
2722 // of affinity newMasks indexed by gtid.
2723 //
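//
// For example, a proclist such as "3,0-2,8-15:2,{12,13}" is accepted:
// single OS proc ids, ranges with an optional stride, and braced sets
// whose members are OR'ed into a single mask.
//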
2724 static void
2725 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2726  unsigned int *out_numMasks, const char *proclist,
2727  kmp_affin_mask_t *osId2Mask, int maxOsId)
2728 {
2729  const char *scan = proclist;
2730  const char *next = proclist;
2731 
2732  //
2733  // We use malloc() for the temporary mask vector,
2734  // so that we can use realloc() to extend it.
2735  //
2736  numNewMasks = 2;
2737  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2738  * __kmp_affin_mask_size);
2739  nextNewMask = 0;
2740  kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2741  __kmp_affin_mask_size);
2742  int setSize = 0;
2743 
2744  for (;;) {
2745  int start, end, stride;
2746 
2747  SKIP_WS(scan);
2748  next = scan;
2749  if (*next == '\0') {
2750  break;
2751  }
2752 
2753  if (*next == '{') {
2754  int num;
2755  setSize = 0;
2756  next++; // skip '{'
2757  SKIP_WS(next);
2758  scan = next;
2759 
2760  //
2761  // Read the first integer in the set.
2762  //
2763  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2764  "bad proclist");
2765  SKIP_DIGITS(next);
2766  num = __kmp_str_to_int(scan, *next);
2767  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2768 
2769  //
2770  // Copy the mask for that osId to the sum (union) mask.
2771  //
2772  if ((num > maxOsId) ||
2773  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2774  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2775  && (__kmp_affinity_type != affinity_none))) {
2776  KMP_WARNING(AffIgnoreInvalidProcID, num);
2777  }
2778  KMP_CPU_ZERO(sumMask);
2779  }
2780  else {
2781  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2782  setSize = 1;
2783  }
2784 
2785  for (;;) {
2786  //
2787  // Check for end of set.
2788  //
2789  SKIP_WS(next);
2790  if (*next == '}') {
2791  next++; // skip '}'
2792  break;
2793  }
2794 
2795  //
2796  // Skip optional comma.
2797  //
2798  if (*next == ',') {
2799  next++;
2800  }
2801  SKIP_WS(next);
2802 
2803  //
2804  // Read the next integer in the set.
2805  //
2806  scan = next;
2807  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2808  "bad explicit proc list");
2809 
2810  SKIP_DIGITS(next);
2811  num = __kmp_str_to_int(scan, *next);
2812  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2813 
2814  //
2815  // Add the mask for that osId to the sum mask.
2816  //
2817  if ((num > maxOsId) ||
2818  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2819  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2820  && (__kmp_affinity_type != affinity_none))) {
2821  KMP_WARNING(AffIgnoreInvalidProcID, num);
2822  }
2823  }
2824  else {
2825  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2826  setSize++;
2827  }
2828  }
2829  if (setSize > 0) {
2830  ADD_MASK(sumMask);
2831  }
2832 
2833  SKIP_WS(next);
2834  if (*next == ',') {
2835  next++;
2836  }
2837  scan = next;
2838  continue;
2839  }
2840 
2841  //
2842  // Read the first integer.
2843  //
2844  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2845  SKIP_DIGITS(next);
2846  start = __kmp_str_to_int(scan, *next);
2847  KMP_ASSERT2(start >= 0, "bad explicit proc list");
2848  SKIP_WS(next);
2849 
2850  //
2851  // If this isn't a range, then add a mask to the list and go on.
2852  //
2853  if (*next != '-') {
2854  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2855 
2856  //
2857  // Skip optional comma.
2858  //
2859  if (*next == ',') {
2860  next++;
2861  }
2862  scan = next;
2863  continue;
2864  }
2865 
2866  //
2867  // This is a range. Skip over the '-' and read in the 2nd int.
2868  //
2869  next++; // skip '-'
2870  SKIP_WS(next);
2871  scan = next;
2872  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2873  SKIP_DIGITS(next);
2874  end = __kmp_str_to_int(scan, *next);
2875  KMP_ASSERT2(end >= 0, "bad explicit proc list");
2876 
2877  //
2878  // Check for a stride parameter
2879  //
2880  stride = 1;
2881  SKIP_WS(next);
2882  if (*next == ':') {
2883  //
2884  // A stride is specified. Skip over the ':' and read the 3rd int.
2885  //
2886  int sign = +1;
2887  next++; // skip ':'
2888  SKIP_WS(next);
2889  scan = next;
2890  if (*next == '-') {
2891  sign = -1;
2892  next++;
2893  SKIP_WS(next);
2894  scan = next;
2895  }
2896  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2897  "bad explicit proc list");
2898  SKIP_DIGITS(next);
2899  stride = __kmp_str_to_int(scan, *next);
2900  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2901  stride *= sign;
2902  }
2903 
2904  //
2905  // Do some range checks.
2906  //
2907  KMP_ASSERT2(stride != 0, "bad explicit proc list");
2908  if (stride > 0) {
2909  KMP_ASSERT2(start <= end, "bad explicit proc list");
2910  }
2911  else {
2912  KMP_ASSERT2(start >= end, "bad explicit proc list");
2913  }
2914  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2915 
2916  //
2917  // Add the mask for each OS proc # to the list.
2918  //
2919  if (stride > 0) {
2920  do {
2921  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2922  start += stride;
2923  } while (start <= end);
2924  }
2925  else {
2926  do {
2927  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2928  start += stride;
2929  } while (start >= end);
2930  }
2931 
2932  //
2933  // Skip optional comma.
2934  //
2935  SKIP_WS(next);
2936  if (*next == ',') {
2937  next++;
2938  }
2939  scan = next;
2940  }
2941 
2942  *out_numMasks = nextNewMask;
2943  if (nextNewMask == 0) {
2944  *out_masks = NULL;
2945  KMP_INTERNAL_FREE(newMasks);
2946  return;
2947  }
2948  *out_masks
2949  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2950  memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2951  __kmp_free(sumMask);
2952  KMP_INTERNAL_FREE(newMasks);
2953 }
2954 
2955 
2956 # if OMP_40_ENABLED
2957 
2958 /*-----------------------------------------------------------------------------
2959 
2960 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2961 places. Again, here is the grammar:
2962 
2963 place_list := place
2964 place_list := place , place_list
2965 place := num
2966 place := place : num
2967 place := place : num : signed
2968 place := { subplace_list }
2969 place := ! place // (lowest priority)
2970 subplace_list := subplace
2971 subplace_list := subplace , subplace_list
2972 subplace := num
2973 subplace := num : num
2974 subplace := num : num : signed
2975 signed := num
2976 signed := + signed
2977 signed := - signed
2978 
2979 -----------------------------------------------------------------------------*/
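//
// For example, "{0,1,2,3},{4,5,6,7}" describes two places of four OS
// procs each, and "{0:4}:2:4" expands to the same two places: the
// subplace {0,1,2,3} replicated twice with a stride of 4.
//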
2980 
2981 static void
2982 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2983  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2984 {
2985  const char *next;
2986 
2987  for (;;) {
2988  int start, count, stride, i;
2989 
2990  //
2991  // Read in the starting proc id
2992  //
2993  SKIP_WS(*scan);
2994  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2995  "bad explicit places list");
2996  next = *scan;
2997  SKIP_DIGITS(next);
2998  start = __kmp_str_to_int(*scan, *next);
2999  KMP_ASSERT(start >= 0);
3000  *scan = next;
3001 
3002  //
3003  // valid follow sets are ',' ':' and '}'
3004  //
3005  SKIP_WS(*scan);
3006  if (**scan == '}' || **scan == ',') {
3007  if ((start > maxOsId) ||
3008  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3009  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3010  && (__kmp_affinity_type != affinity_none))) {
3011  KMP_WARNING(AffIgnoreInvalidProcID, start);
3012  }
3013  }
3014  else {
3015  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3016  (*setSize)++;
3017  }
3018  if (**scan == '}') {
3019  break;
3020  }
3021  (*scan)++; // skip ','
3022  continue;
3023  }
3024  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3025  (*scan)++; // skip ':'
3026 
3027  //
3028  // Read count parameter
3029  //
3030  SKIP_WS(*scan);
3031  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3032  "bad explicit places list");
3033  next = *scan;
3034  SKIP_DIGITS(next);
3035  count = __kmp_str_to_int(*scan, *next);
3036  KMP_ASSERT(count >= 0);
3037  *scan = next;
3038 
3039  //
3040  // valid follow sets are ',' ':' and '}'
3041  //
3042  SKIP_WS(*scan);
3043  if (**scan == '}' || **scan == ',') {
3044  for (i = 0; i < count; i++) {
3045  if ((start > maxOsId) ||
3046  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3047  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3048  && (__kmp_affinity_type != affinity_none))) {
3049  KMP_WARNING(AffIgnoreInvalidProcID, start);
3050  }
3051  break; // don't proliferate warnings for large count
3052  }
3053  else {
3054  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3055  start++;
3056  (*setSize)++;
3057  }
3058  }
3059  if (**scan == '}') {
3060  break;
3061  }
3062  (*scan)++; // skip ','
3063  continue;
3064  }
3065  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3066  (*scan)++; // skip ':'
3067 
3068  //
3069  // Read stride parameter
3070  //
3071  int sign = +1;
3072  for (;;) {
3073  SKIP_WS(*scan);
3074  if (**scan == '+') {
3075  (*scan)++; // skip '+'
3076  continue;
3077  }
3078  if (**scan == '-') {
3079  sign *= -1;
3080  (*scan)++; // skip '-'
3081  continue;
3082  }
3083  break;
3084  }
3085  SKIP_WS(*scan);
3086  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3087  "bad explicit places list");
3088  next = *scan;
3089  SKIP_DIGITS(next);
3090  stride = __kmp_str_to_int(*scan, *next);
3091  KMP_ASSERT(stride >= 0);
3092  *scan = next;
3093  stride *= sign;
3094 
3095  //
3096  // valid follow sets are ',' and '}'
3097  //
3098  SKIP_WS(*scan);
3099  if (**scan == '}' || **scan == ',') {
3100  for (i = 0; i < count; i++) {
3101  if ((start > maxOsId) ||
3102  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3103  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3104  && (__kmp_affinity_type != affinity_none))) {
3105  KMP_WARNING(AffIgnoreInvalidProcID, start);
3106  }
3107  break; // don't proliferate warnings for large count
3108  }
3109  else {
3110  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3111  start += stride;
3112  (*setSize)++;
3113  }
3114  }
3115  if (**scan == '}') {
3116  break;
3117  }
3118  (*scan)++; // skip ','
3119  continue;
3120  }
3121 
3122  KMP_ASSERT2(0, "bad explicit places list");
3123  }
3124 }
3125 
3126 
3127 static void
3128 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3129  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3130 {
3131  const char *next;
3132 
3133  //
3134  // valid follow sets are '{' '!' and num
3135  //
3136  SKIP_WS(*scan);
3137  if (**scan == '{') {
3138  (*scan)++; // skip '{'
3139  __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask,
3140  setSize);
3141  KMP_ASSERT2(**scan == '}', "bad explicit places list");
3142  (*scan)++; // skip '}'
3143  }
3144  else if (**scan == '!') {
3145  (*scan)++; // skip '!' before parsing the place it negates
3146  __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3147  KMP_CPU_COMPLEMENT(tempMask);
3148  }
3149  else if ((**scan >= '0') && (**scan <= '9')) {
3150  next = *scan;
3151  SKIP_DIGITS(next);
3152  int num = __kmp_str_to_int(*scan, *next);
3153  KMP_ASSERT(num >= 0);
3154  if ((num > maxOsId) ||
3155  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3156  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3157  && (__kmp_affinity_type != affinity_none))) {
3158  KMP_WARNING(AffIgnoreInvalidProcID, num);
3159  }
3160  }
3161  else {
3162  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3163  (*setSize)++;
3164  }
3165  *scan = next; // skip num
3166  }
3167  else {
3168  KMP_ASSERT2(0, "bad explicit places list");
3169  }
3170 }
3171 
3172 
3173 //static void
3174 void
3175 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3176  unsigned int *out_numMasks, const char *placelist,
3177  kmp_affin_mask_t *osId2Mask, int maxOsId)
3178 {
3179  const char *scan = placelist;
3180  const char *next = placelist;
3181 
3182  numNewMasks = 2;
3183  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3184  * __kmp_affin_mask_size);
3185  nextNewMask = 0;
3186 
3187  kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3188  __kmp_affin_mask_size);
3189  KMP_CPU_ZERO(tempMask);
3190  int setSize = 0;
3191 
3192  for (;;) {
3193  __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3194 
3195  //
3196  // valid follow sets are ',' ':' and EOL
3197  //
3198  SKIP_WS(scan);
3199  if (*scan == '\0' || *scan == ',') {
3200  if (setSize > 0) {
3201  ADD_MASK(tempMask);
3202  }
3203  KMP_CPU_ZERO(tempMask);
3204  setSize = 0;
3205  if (*scan == '\0') {
3206  break;
3207  }
3208  scan++; // skip ','
3209  continue;
3210  }
3211 
3212  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3213  scan++; // skip ':'
3214 
3215  //
3216  // Read count parameter
3217  //
3218  SKIP_WS(scan);
3219  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3220  "bad explicit places list");
3221  next = scan;
3222  SKIP_DIGITS(next);
3223  int count = __kmp_str_to_int(scan, *next);
3224  KMP_ASSERT(count >= 0);
3225  scan = next;
3226 
3227  //
3228  // valid follow sets are ',' ':' and EOL
3229  //
3230  SKIP_WS(scan);
3231  int stride;
3232  if (*scan == '\0' || *scan == ',') {
3233  stride = +1;
3234  }
3235  else {
3236  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3237  scan++; // skip ':'
3238 
3239  //
3240  // Read stride parameter
3241  //
3242  int sign = +1;
3243  for (;;) {
3244  SKIP_WS(scan);
3245  if (*scan == '+') {
3246  scan++; // skip '+'
3247  continue;
3248  }
3249  if (*scan == '-') {
3250  sign *= -1;
3251  scan++; // skip '-'
3252  continue;
3253  }
3254  break;
3255  }
3256  SKIP_WS(scan);
3257  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3258  "bad explicit places list");
3259  next = scan;
3260  SKIP_DIGITS(next);
3261  stride = __kmp_str_to_int(scan, *next);
3262  KMP_DEBUG_ASSERT(stride >= 0);
3263  scan = next;
3264  stride *= sign;
3265  }
3266 
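 //
 // Replicate the place: emit the current mask, then shift every set bit
 // by 'stride' OS proc ids (upward for a positive stride, downward for a
 // negative one), dropping bits that land on unavailable or out-of-range
 // procs. This is repeated 'count' times.
 //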
3267  if (stride > 0) {
3268  int i;
3269  for (i = 0; i < count; i++) {
3270  int j;
3271  if (setSize == 0) {
3272  break;
3273  }
3274  ADD_MASK(tempMask);
3275  setSize = 0;
3276  for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3277  if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3278  KMP_CPU_CLR(j, tempMask);
3279  }
3280  else if ((j > maxOsId) ||
3281  (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3282  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3283  && (__kmp_affinity_type != affinity_none))) {
3284  KMP_WARNING(AffIgnoreInvalidProcID, j);
3285  }
3286  KMP_CPU_CLR(j, tempMask);
3287  }
3288  else {
3289  KMP_CPU_SET(j, tempMask);
3290  setSize++;
3291  }
3292  }
3293  for (; j >= 0; j--) {
3294  KMP_CPU_CLR(j, tempMask);
3295  }
3296  }
3297  }
3298  else {
3299  int i;
3300  for (i = 0; i < count; i++) {
3301  int j;
3302  if (setSize == 0) {
3303  break;
3304  }
3305  ADD_MASK(tempMask);
3306  setSize = 0;
3307  for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
3308  j++) {
3309  if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3310  KMP_CPU_CLR(j, tempMask);
3311  }
3312  else if ((j > maxOsId) ||
3313  (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3314  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3315  && (__kmp_affinity_type != affinity_none))) {
3316  KMP_WARNING(AffIgnoreInvalidProcID, j);
3317  }
3318  KMP_CPU_CLR(j, tempMask);
3319  }
3320  else {
3321  KMP_CPU_SET(j, tempMask);
3322  setSize++;
3323  }
3324  }
3325  for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3326  KMP_CPU_CLR(j, tempMask);
3327  }
3328  }
3329  }
3330  KMP_CPU_ZERO(tempMask);
3331  setSize = 0;
3332 
3333  //
3334  // valid follow sets are ',' and EOL
3335  //
3336  SKIP_WS(scan);
3337  if (*scan == '\0') {
3338  break;
3339  }
3340  if (*scan == ',') {
3341  scan++; // skip ','
3342  continue;
3343  }
3344 
3345  KMP_ASSERT2(0, "bad explicit places list");
3346  }
3347 
3348  *out_numMasks = nextNewMask;
3349  if (nextNewMask == 0) {
3350  *out_masks = NULL;
3351  KMP_INTERNAL_FREE(newMasks);
3352  return;
3353  }
3354  *out_masks
3355  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3356  memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3357  __kmp_free(tempMask);
3358  KMP_INTERNAL_FREE(newMasks);
3359 }
3360 
3361 # endif /* OMP_40_ENABLED */
3362 
3363 #undef ADD_MASK
3364 #undef ADD_MASK_OSID
3365 
3366 
3367 # if KMP_MIC
3368 
3369 static void
3370 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3371 {
3372  if ( __kmp_place_num_cores == 0 ) {
3373  if ( __kmp_place_num_threads_per_core == 0 ) {
3374  return; // no cores limiting actions requested, exit
3375  }
3376  __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3377  }
3378  if ( !__kmp_affinity_uniform_topology() ) {
3379  KMP_WARNING( AffThrPlaceNonUniform );
3380  return; // don't support non-uniform topology
3381  }
3382  if ( depth != 3 ) {
3383  KMP_WARNING( AffThrPlaceNonThreeLevel );
3384  return; // don't support non-3-level topology
3385  }
3386  if ( __kmp_place_num_threads_per_core == 0 ) {
3387  __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3388  }
3389  if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3390  KMP_WARNING( AffThrPlaceManyCores );
3391  return;
3392  }
3393 
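 //
 // Build a filtered copy of the topology map. For example, with 8 cores
 // per package, a core offset of 2, 4 requested cores, and 2 requested
 // threads per core, only cores 2..5 of each package are kept, along
 // with the first 2 HW thread contexts of each kept core.
 //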
3394  AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3395  nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3396  int i, j, k, n_old = 0, n_new = 0;
3397  for ( i = 0; i < nPackages; ++i ) {
3398  for ( j = 0; j < nCoresPerPkg; ++j ) {
3399  if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3400  n_old += __kmp_nThreadsPerCore; // skip not-requested core
3401  } else {
3402  for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3403  if ( k < __kmp_place_num_threads_per_core ) {
3404  newAddr[n_new] = (*pAddr)[n_old]; // copy requested core's data to new location
3405  n_new++;
3406  }
3407  n_old++;
3408  }
3409  }
3410  }
3411  }
3412  nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3413  __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3414  __kmp_avail_proc = n_new; // correct avail_proc
3415  __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3416 
3417  __kmp_free( *pAddr );
3418  *pAddr = newAddr; // replace old topology with new one
3419 }
3420 
3421 # endif /* KMP_MIC */
3422 
3423 
3424 static AddrUnsPair *address2os = NULL;
3425 static int * procarr = NULL;
3426 static int __kmp_aff_depth = 0;
3427 
3428 static void
3429 __kmp_aux_affinity_initialize(void)
3430 {
3431  if (__kmp_affinity_masks != NULL) {
3432  KMP_ASSERT(fullMask != NULL);
3433  return;
3434  }
3435 
3436  //
3437  // Create the "full" mask - this defines all of the processors that we
3438  // consider to be in the machine model. If respect is set, then it is
3439  // the initialization thread's affinity mask. Otherwise, it is all
3440  // processors that we know about on the machine.
3441  //
3442  if (fullMask == NULL) {
3443  fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3444  }
3445  if (KMP_AFFINITY_CAPABLE()) {
3446  if (__kmp_affinity_respect_mask) {
3447  __kmp_get_system_affinity(fullMask, TRUE);
3448 
3449  //
3450  // Count the number of available processors.
3451  //
3452  unsigned i;
3453  __kmp_avail_proc = 0;
3454  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3455  if (! KMP_CPU_ISSET(i, fullMask)) {
3456  continue;
3457  }
3458  __kmp_avail_proc++;
3459  }
3460  if (__kmp_avail_proc > __kmp_xproc) {
3461  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3462  && (__kmp_affinity_type != affinity_none))) {
3463  KMP_WARNING(ErrorInitializeAffinity);
3464  }
3465  __kmp_affinity_type = affinity_none;
3466  __kmp_affin_mask_size = 0;
3467  return;
3468  }
3469  }
3470  else {
3471  __kmp_affinity_entire_machine_mask(fullMask);
3472  __kmp_avail_proc = __kmp_xproc;
3473  }
3474  }
3475 
3476  int depth = -1;
3477  kmp_i18n_id_t msg_id = kmp_i18n_null;
3478 
3479  //
3480  // For backward compatibility, setting KMP_CPUINFO_FILE =>
3481  // KMP_TOPOLOGY_METHOD=cpuinfo
3482  //
3483  if ((__kmp_cpuinfo_file != NULL) &&
3484  (__kmp_affinity_top_method == affinity_top_method_all)) {
3485  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3486  }
3487 
3488  if (__kmp_affinity_top_method == affinity_top_method_all) {
3489  //
3490  // In the default code path, errors are not fatal - we just try using
3491  // another method. We only emit a warning message if affinity is on,
3492  // or the verbose flag is set, and the nowarnings flag was not set.
3493  //
3494  const char *file_name = NULL;
3495  int line = 0;
3496 
3497 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3498 
3499  if (__kmp_affinity_verbose) {
3500  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3501  }
3502 
3503  file_name = NULL;
3504  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3505  if (depth == 0) {
3506  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3507  KMP_ASSERT(address2os == NULL);
3508  return;
3509  }
3510 
3511  if (depth < 0) {
3512  if (__kmp_affinity_verbose) {
3513  if (msg_id != kmp_i18n_null) {
3514  KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3515  KMP_I18N_STR(DecodingLegacyAPIC));
3516  }
3517  else {
3518  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3519  }
3520  }
3521 
3522  file_name = NULL;
3523  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3524  if (depth == 0) {
3525  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3526  KMP_ASSERT(address2os == NULL);
3527  return;
3528  }
3529  }
3530 
3531 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3532 
3533 # if KMP_OS_LINUX
3534 
3535  if (depth < 0) {
3536  if (__kmp_affinity_verbose) {
3537  if (msg_id != kmp_i18n_null) {
3538  KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3539  }
3540  else {
3541  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3542  }
3543  }
3544 
3545  FILE *f = fopen("/proc/cpuinfo", "r");
3546  if (f == NULL) {
3547  msg_id = kmp_i18n_str_CantOpenCpuinfo;
3548  }
3549  else {
3550  file_name = "/proc/cpuinfo";
3551  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3552  fclose(f);
3553  if (depth == 0) {
3554  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3555  KMP_ASSERT(address2os == NULL);
3556  return;
3557  }
3558  }
3559  }
3560 
3561 # endif /* KMP_OS_LINUX */
3562 
3563 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3564 
3565  if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3566  if (__kmp_affinity_verbose) {
3567  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3568  }
3569 
3570  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3571  KMP_ASSERT(depth != 0);
3572  }
3573 
3574 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
3575 
3576  if (depth < 0) {
3577  if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3578  if (file_name == NULL) {
3579  KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3580  }
3581  else if (line == 0) {
3582  KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3583  }
3584  else {
3585  KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3586  }
3587  }
3588  // FIXME - print msg if msg_id = kmp_i18n_null ???
3589 
3590  file_name = "";
3591  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3592  if (depth == 0) {
3593  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3594  KMP_ASSERT(address2os == NULL);
3595  return;
3596  }
3597  KMP_ASSERT(depth > 0);
3598  KMP_ASSERT(address2os != NULL);
3599  }
3600  }
3601 
3602  //
3603  // If the user has specified that a particular topology discovery method
3604  // is to be used, then we abort if that method fails. The exception is
3605  // group affinity, which might have been implicitly set.
3606  //
3607 
3608 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3609 
3610  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3611  if (__kmp_affinity_verbose) {
3612  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3613  KMP_I18N_STR(Decodingx2APIC));
3614  }
3615 
3616  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3617  if (depth == 0) {
3618  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3619  KMP_ASSERT(address2os == NULL);
3620  return;
3621  }
3622  if (depth < 0) {
3623  KMP_ASSERT(msg_id != kmp_i18n_null);
3624  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3625  }
3626  }
3627  else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3628  if (__kmp_affinity_verbose) {
3629  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3630  KMP_I18N_STR(DecodingLegacyAPIC));
3631  }
3632 
3633  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3634  if (depth == 0) {
3635  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3636  KMP_ASSERT(address2os == NULL);
3637  return;
3638  }
3639  if (depth < 0) {
3640  KMP_ASSERT(msg_id != kmp_i18n_null);
3641  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3642  }
3643  }
3644 
3645 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3646 
3647  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3648  const char *filename;
3649  if (__kmp_cpuinfo_file != NULL) {
3650  filename = __kmp_cpuinfo_file;
3651  }
3652  else {
3653  filename = "/proc/cpuinfo";
3654  }
3655 
3656  if (__kmp_affinity_verbose) {
3657  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3658  }
3659 
3660  FILE *f = fopen(filename, "r");
3661  if (f == NULL) {
3662  int code = errno;
3663  if (__kmp_cpuinfo_file != NULL) {
3664  __kmp_msg(
3665  kmp_ms_fatal,
3666  KMP_MSG(CantOpenFileForReading, filename),
3667  KMP_ERR(code),
3668  KMP_HNT(NameComesFrom_CPUINFO_FILE),
3669  __kmp_msg_null
3670  );
3671  }
3672  else {
3673  __kmp_msg(
3674  kmp_ms_fatal,
3675  KMP_MSG(CantOpenFileForReading, filename),
3676  KMP_ERR(code),
3677  __kmp_msg_null
3678  );
3679  }
3680  }
3681  int line = 0;
3682  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3683  fclose(f);
3684  if (depth < 0) {
3685  KMP_ASSERT(msg_id != kmp_i18n_null);
3686  if (line > 0) {
3687  KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3688  }
3689  else {
3690  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3691  }
3692  }
3693  if (__kmp_affinity_type == affinity_none) {
3694  KMP_ASSERT(depth == 0);
3695  KMP_ASSERT(address2os == NULL);
3696  return;
3697  }
3698  }
3699 
3700 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3701 
3702  else if (__kmp_affinity_top_method == affinity_top_method_group) {
3703  if (__kmp_affinity_verbose) {
3704  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3705  }
3706 
3707  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3708  KMP_ASSERT(depth != 0);
3709  if (depth < 0) {
3710  KMP_ASSERT(msg_id != kmp_i18n_null);
3711  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3712  }
3713  }
3714 
3715 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
3716 
3717  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3718  if (__kmp_affinity_verbose) {
3719  KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3720  }
3721 
3722  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3723  if (depth == 0) {
3724  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3725  KMP_ASSERT(address2os == NULL);
3726  return;
3727  }
3728  // should not fail
3729  KMP_ASSERT(depth > 0);
3730  KMP_ASSERT(address2os != NULL);
3731  }
3732 
3733  if (address2os == NULL) {
3734  if (KMP_AFFINITY_CAPABLE()
3735  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3736  && (__kmp_affinity_type != affinity_none)))) {
3737  KMP_WARNING(ErrorInitializeAffinity);
3738  }
3739  __kmp_affinity_type = affinity_none;
3740  __kmp_affin_mask_size = 0;
3741  return;
3742  }
3743 
3744 # if KMP_MIC
3745  __kmp_apply_thread_places(&address2os, depth);
3746 # endif
3747 
3748  //
3749  // Create the table of masks, indexed by thread Id.
3750  //
3751  unsigned maxIndex;
3752  unsigned numUnique;
3753  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3754  address2os, __kmp_avail_proc);
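 // As used below (see KMP_CPU_INDEX(osId2Mask, osId) in the sortAddresses code),
 // osId2Mask appears to be indexed by OS proc id up to maxIndex, and numUnique
 // is the number of distinct granule masks that were created.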
3755  if (__kmp_affinity_gran_levels == 0) {
3756  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3757  }
3758 
3759  //
3760  // Set the childNums vector in all Address objects. This must be done
3761  // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3762  // which takes into account the setting of __kmp_affinity_compact.
3763  //
3764  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3765 
3766  switch (__kmp_affinity_type) {
3767 
3768  case affinity_explicit:
3769  KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3770 # if OMP_40_ENABLED
3771  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3772 # endif
3773  {
3774  __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3775  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3776  maxIndex);
3777  }
3778 # if OMP_40_ENABLED
3779  else {
3780  __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3781  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3782  maxIndex);
3783  }
3784 # endif
3785  if (__kmp_affinity_num_masks == 0) {
3786  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3787  && (__kmp_affinity_type != affinity_none))) {
3788  KMP_WARNING(AffNoValidProcID);
3789  }
3790  __kmp_affinity_type = affinity_none;
3791  return;
3792  }
3793  break;
3794 
3795  //
3796  // The other affinity types rely on sorting the Addresses according
3797  // to some permutation of the machine topology tree. Set
3798  // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3799  // then jump to a common code fragment to do the sort and create
3800  // the array of affinity masks.
3801  //
3802 
3803  case affinity_logical:
3804  __kmp_affinity_compact = 0;
3805  if (__kmp_affinity_offset) {
3806  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3807  % __kmp_avail_proc;
3808  }
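 // Illustration (hypothetical values): with __kmp_nThreadsPerCore == 2,
 // __kmp_avail_proc == 8, and a user offset of 3, the statement above yields
 // (2 * 3) % 8 == 6, i.e. the offset is scaled by the number of thread
 // contexts per core and wrapped at the number of available procs.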
3809  goto sortAddresses;
3810 
3811  case affinity_physical:
3812  if (__kmp_nThreadsPerCore > 1) {
3813  __kmp_affinity_compact = 1;
3814  if (__kmp_affinity_compact >= depth) {
3815  __kmp_affinity_compact = 0;
3816  }
3817  } else {
3818  __kmp_affinity_compact = 0;
3819  }
3820  if (__kmp_affinity_offset) {
3821  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3822  % __kmp_avail_proc;
3823  }
3824  goto sortAddresses;
3825 
3826  case affinity_scatter:
3827  if (__kmp_affinity_compact >= depth) {
3828  __kmp_affinity_compact = 0;
3829  }
3830  else {
3831  __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3832  }
3833  goto sortAddresses;
3834 
3835  case affinity_compact:
3836  if (__kmp_affinity_compact >= depth) {
3837  __kmp_affinity_compact = depth - 1;
3838  }
3839  goto sortAddresses;
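 // Illustration (hypothetical values): with depth == 3 (package/core/thread),
 // KMP_AFFINITY=scatter maps a user compact value of 0 to
 // __kmp_affinity_compact == depth - 1 - 0 == 2, while KMP_AFFINITY=compact
 // leaves 0 unchanged and clamps out-of-range values to depth - 1 == 2.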
3840 
3841 # if KMP_MIC
3842  case affinity_balanced:
3843  // Balanced works only for the case of a single package and uniform topology
3844  if( nPackages > 1 ) {
3845  if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3846  KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3847  }
3848  __kmp_affinity_type = affinity_none;
3849  return;
3850  } else if( __kmp_affinity_uniform_topology() ) {
3851  break;
3852  } else { // Non-uniform topology
3853 
3854  // Save the depth for further usage
3855  __kmp_aff_depth = depth;
3856 
3857  // Number of hyper threads per core in HT machine
3858  int nth_per_core = __kmp_nThreadsPerCore;
3859 
3860  int core_level;
3861  if( nth_per_core > 1 ) {
3862  core_level = depth - 2;
3863  } else {
3864  core_level = depth - 1;
3865  }
3866  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3867  int nproc = nth_per_core * ncores;
3868 
3869  procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3870  for( int i = 0; i < nproc; i++ ) {
3871  procarr[ i ] = -1;
3872  }
3873 
3874  for( int i = 0; i < __kmp_avail_proc; i++ ) {
3875  int proc = address2os[ i ].second;
3876  // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3877  // If there is only one thread per core then depth == 2: level 0 - package,
3878  // level 1 - core.
3879  int level = depth - 1;
3880 
3881  // __kmp_nth_per_core == 1
3882  int thread = 0;
3883  int core = address2os[ i ].first.labels[ level ];
3884  // If the thread level exists, that is we have more than one thread context per core
3885  if( nth_per_core > 1 ) {
3886  thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3887  core = address2os[ i ].first.labels[ level - 1 ];
3888  }
3889  procarr[ core * nth_per_core + thread ] = proc;
3890  }
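 // At this point procarr is laid out core-major: procarr[core * nth_per_core + thread]
 // holds the OS proc id bound to that thread context, or -1 if that context is
 // absent on this non-uniform machine; __kmp_balanced_affinity() consumes it below.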
3891 
3892  break;
3893  }
3894 # endif
3895 
3896  sortAddresses:
3897  //
3898  // Allocate the gtid->affinity mask table.
3899  //
3900  if (__kmp_affinity_dups) {
3901  __kmp_affinity_num_masks = __kmp_avail_proc;
3902  }
3903  else {
3904  __kmp_affinity_num_masks = numUnique;
3905  }
3906 
3907 # if OMP_40_ENABLED
3908  if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3909  && ( __kmp_affinity_num_places > 0 )
3910  && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3911  __kmp_affinity_num_masks = __kmp_affinity_num_places;
3912  }
3913 # endif
3914 
3915  __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3916  __kmp_affinity_num_masks * __kmp_affin_mask_size);
3917 
3918  //
3919  // Sort the address2os table according to the current setting of
3920  // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3921  //
3922  qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3923  __kmp_affinity_cmp_Address_child_num);
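 // The "leader" flag, presumably set by __kmp_create_masks above, marks the first
 // Address of each granule, so skipping non-leaders when __kmp_affinity_dups is
 // off should yield exactly one mask per unique granule (numUnique of them).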
3924  {
3925  int i;
3926  unsigned j;
3927  for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3928  if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3929  continue;
3930  }
3931  unsigned osId = address2os[i].second;
3932  kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3933  kmp_affin_mask_t *dest
3934  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3935  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3936  KMP_CPU_COPY(dest, src);
3937  if (++j >= __kmp_affinity_num_masks) {
3938  break;
3939  }
3940  }
3941  KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3942  }
3943  break;
3944 
3945  default:
3946  KMP_ASSERT2(0, "Unexpected affinity setting");
3947  }
3948 
3949  __kmp_free(osId2Mask);
3950  machine_hierarchy.init(address2os, __kmp_avail_proc);
3951 }
3952 
3953 
3954 void
3955 __kmp_affinity_initialize(void)
3956 {
3957  //
3958  // Much of the code above was written assuming that if a machine was not
3959  // affinity capable, then __kmp_affinity_type == affinity_none. We now
3960  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3961  //
3962  // There are too many checks for __kmp_affinity_type == affinity_none
3963  // in this code. Instead of trying to change them all, check if
3964  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3965  // affinity_none, call the real initialization routine, then restore
3966  // __kmp_affinity_type to affinity_disabled.
3967  //
3968  int disabled = (__kmp_affinity_type == affinity_disabled);
3969  if (! KMP_AFFINITY_CAPABLE()) {
3970  KMP_ASSERT(disabled);
3971  }
3972  if (disabled) {
3973  __kmp_affinity_type = affinity_none;
3974  }
3975  __kmp_aux_affinity_initialize();
3976  if (disabled) {
3977  __kmp_affinity_type = affinity_disabled;
3978  }
3979 }
3980 
3981 
3982 void
3983 __kmp_affinity_uninitialize(void)
3984 {
3985  if (__kmp_affinity_masks != NULL) {
3986  __kmp_free(__kmp_affinity_masks);
3987  __kmp_affinity_masks = NULL;
3988  }
3989  if (fullMask != NULL) {
3990  KMP_CPU_FREE(fullMask);
3991  fullMask = NULL;
3992  }
3993  __kmp_affinity_num_masks = 0;
3994 # if OMP_40_ENABLED
3995  __kmp_affinity_num_places = 0;
3996 # endif
3997  if (__kmp_affinity_proclist != NULL) {
3998  __kmp_free(__kmp_affinity_proclist);
3999  __kmp_affinity_proclist = NULL;
4000  }
4001  if( address2os != NULL ) {
4002  __kmp_free( address2os );
4003  address2os = NULL;
4004  }
4005  if( procarr != NULL ) {
4006  __kmp_free( procarr );
4007  procarr = NULL;
4008  }
4009 }
4010 
4011 
4012 void
4013 __kmp_affinity_set_init_mask(int gtid, int isa_root)
4014 {
4015  if (! KMP_AFFINITY_CAPABLE()) {
4016  return;
4017  }
4018 
4019  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4020  if (th->th.th_affin_mask == NULL) {
4021  KMP_CPU_ALLOC(th->th.th_affin_mask);
4022  }
4023  else {
4024  KMP_CPU_ZERO(th->th.th_affin_mask);
4025  }
4026 
4027  //
4028  // Copy the thread mask to the kmp_info_t structure.
4029  // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. the one
4030  // that has all of the OS proc ids set; when __kmp_affinity_respect_mask is
4031  // set, the full mask is the same as the mask of the initialization
4032  // thread.
4033  //
4034  kmp_affin_mask_t *mask;
4035  int i;
4036 
4037 # if OMP_40_ENABLED
4038  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4039 # endif
4040  {
4041  if ((__kmp_affinity_type == affinity_none)
4042 # if KMP_MIC
4043  || (__kmp_affinity_type == affinity_balanced)
4044 # endif
4045  ) {
4046 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
4047  if (__kmp_num_proc_groups > 1) {
4048  return;
4049  }
4050 # endif
4051  KMP_ASSERT(fullMask != NULL);
4052  i = KMP_PLACE_ALL;
4053  mask = fullMask;
4054  }
4055  else {
4056  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4057  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4058  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4059  }
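 // Illustration (hypothetical values): with __kmp_affinity_num_masks == 4 and
 // __kmp_affinity_offset == 1, gtid 0 gets place 1, gtid 1 gets place 2,
 // gtid 2 gets place 3, and gtid 3 wraps around to place 0.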
4060  }
4061 # if OMP_40_ENABLED
4062  else {
4063  if ((! isa_root)
4064  || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4065 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
4066  if (__kmp_num_proc_groups > 1) {
4067  return;
4068  }
4069 # endif
4070  KMP_ASSERT(fullMask != NULL);
4071  i = KMP_PLACE_ALL;
4072  mask = fullMask;
4073  }
4074  else {
4075  //
4076  // int i = some hash function or just a counter that doesn't
4077  // always start at 0. Use gtid for now.
4078  //
4079  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4080  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4081  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4082  }
4083  }
4084 # endif
4085 
4086 # if OMP_40_ENABLED
4087  th->th.th_current_place = i;
4088  if (isa_root) {
4089  th->th.th_new_place = i;
4090  th->th.th_first_place = 0;
4091  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4092  }
4093 
4094  if (i == KMP_PLACE_ALL) {
4095  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4096  gtid));
4097  }
4098  else {
4099  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4100  gtid, i));
4101  }
4102 # else
4103  if (i == -1) {
4104  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4105  gtid));
4106  }
4107  else {
4108  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4109  gtid, i));
4110  }
4111 # endif /* OMP_40_ENABLED */
4112 
4113  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4114 
4115  if (__kmp_affinity_verbose) {
4116  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4117  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4118  th->th.th_affin_mask);
4119  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4120  buf);
4121  }
4122 
4123 # if KMP_OS_WINDOWS
4124  //
4125  // On Windows* OS, the process affinity mask might have changed.
4126  // If the user didn't request affinity and this call fails,
4127  // just continue silently. See CQ171393.
4128  //
4129  if ( __kmp_affinity_type == affinity_none ) {
4130  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4131  }
4132  else
4133 # endif
4134  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4135 }
4136 
4137 
4138 # if OMP_40_ENABLED
4139 
4140 void
4141 __kmp_affinity_set_place(int gtid)
4142 {
4143  int retval;
4144 
4145  if (! KMP_AFFINITY_CAPABLE()) {
4146  return;
4147  }
4148 
4149  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4150 
4151  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4152  gtid, th->th.th_new_place, th->th.th_current_place));
4153 
4154  //
4155  // Check that the new place is within this thread's partition.
4156  //
4157  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4158  KMP_ASSERT(th->th.th_new_place >= 0);
4159  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4160  if (th->th.th_first_place <= th->th.th_last_place) {
4161  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4162  && (th->th.th_new_place <= th->th.th_last_place));
4163  }
4164  else {
4165  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4166  || (th->th.th_new_place >= th->th.th_last_place));
4167  }
4168 
4169  //
4170  // Copy the thread mask to the kmp_info_t structure,
4171  // and set this thread's affinity.
4172  //
4173  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4174  th->th.th_new_place);
4175  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4176  th->th.th_current_place = th->th.th_new_place;
4177 
4178  if (__kmp_affinity_verbose) {
4179  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4180  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4181  th->th.th_affin_mask);
4182  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4183  gtid, buf);
4184  }
4185  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4186 }
4187 
4188 # endif /* OMP_40_ENABLED */
4189 
4190 
4191 int
4192 __kmp_aux_set_affinity(void **mask)
4193 {
4194  int gtid;
4195  kmp_info_t *th;
4196  int retval;
4197 
4198  if (! KMP_AFFINITY_CAPABLE()) {
4199  return -1;
4200  }
4201 
4202  gtid = __kmp_entry_gtid();
4203  KA_TRACE(1000, ;{
4204  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4205  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4206  (kmp_affin_mask_t *)(*mask));
4207  __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4208  gtid, buf);
4209  });
4210 
4211  if (__kmp_env_consistency_check) {
4212  if ((mask == NULL) || (*mask == NULL)) {
4213  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4214  }
4215  else {
4216  unsigned proc;
4217  int num_procs = 0;
4218 
4219  for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4220  if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4221  continue;
4222  }
4223  num_procs++;
4224  if (! KMP_CPU_ISSET(proc, fullMask)) {
4225  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4226  break;
4227  }
4228  }
4229  if (num_procs == 0) {
4230  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4231  }
4232 
4233 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
4234  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4235  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4236  }
4237 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
4238 
4239  }
4240  }
4241 
4242  th = __kmp_threads[gtid];
4243  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4244  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4245  if (retval == 0) {
4246  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4247  }
4248 
4249 # if OMP_40_ENABLED
4250  th->th.th_current_place = KMP_PLACE_UNDEFINED;
4251  th->th.th_new_place = KMP_PLACE_UNDEFINED;
4252  th->th.th_first_place = 0;
4253  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4254 
4255  //
4256  // Turn off 4.0 affinity for the current thread at this parallel level.
4257  //
4258  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4259 # endif
4260 
4261  return retval;
4262 }
4263 
4264 
4265 int
4266 __kmp_aux_get_affinity(void **mask)
4267 {
4268  int gtid;
4269  int retval;
4270  kmp_info_t *th;
4271 
4272  if (! KMP_AFFINITY_CAPABLE()) {
4273  return -1;
4274  }
4275 
4276  gtid = __kmp_entry_gtid();
4277  th = __kmp_threads[gtid];
4278  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4279 
4280  KA_TRACE(1000, ;{
4281  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4282  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4283  th->th.th_affin_mask);
4284  __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4285  });
4286 
4287  if (__kmp_env_consistency_check) {
4288  if ((mask == NULL) || (*mask == NULL)) {
4289  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4290  }
4291  }
4292 
4293 # if !KMP_OS_WINDOWS
4294 
4295  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4296  KA_TRACE(1000, ;{
4297  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4298  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4299  (kmp_affin_mask_t *)(*mask));
4300  __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4301  });
4302  return retval;
4303 
4304 # else
4305 
4306  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4307  return 0;
4308 
4309 # endif /* KMP_OS_WINDOWS */
4310 
4311 }
4312 
4313 int
4314 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4315 {
4316  int retval;
4317 
4318  if (! KMP_AFFINITY_CAPABLE()) {
4319  return -1;
4320  }
4321 
4322  KA_TRACE(1000, ;{
4323  int gtid = __kmp_entry_gtid();
4324  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4325  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4326  (kmp_affin_mask_t *)(*mask));
4327  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4328  proc, gtid, buf);
4329  });
4330 
4331  if (__kmp_env_consistency_check) {
4332  if ((mask == NULL) || (*mask == NULL)) {
4333  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4334  }
4335  }
4336 
4337  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4338  return -1;
4339  }
4340  if (! KMP_CPU_ISSET(proc, fullMask)) {
4341  return -2;
4342  }
4343 
4344  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4345  return 0;
4346 }
4347 
4348 
4349 int
4350 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4351 {
4352  int retval;
4353 
4354  if (! KMP_AFFINITY_CAPABLE()) {
4355  return -1;
4356  }
4357 
4358  KA_TRACE(1000, ;{
4359  int gtid = __kmp_entry_gtid();
4360  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4361  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4362  (kmp_affin_mask_t *)(*mask));
4363  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4364  proc, gtid, buf);
4365  });
4366 
4367  if (__kmp_env_consistency_check) {
4368  if ((mask == NULL) || (*mask == NULL)) {
4369  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4370  }
4371  }
4372 
4373  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4374  return -1;
4375  }
4376  if (! KMP_CPU_ISSET(proc, fullMask)) {
4377  return -2;
4378  }
4379 
4380  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4381  return 0;
4382 }
4383 
4384 
4385 int
4386 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4387 {
4388  int retval;
4389 
4390  if (! KMP_AFFINITY_CAPABLE()) {
4391  return -1;
4392  }
4393 
4394  KA_TRACE(1000, ;{
4395  int gtid = __kmp_entry_gtid();
4396  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4397  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4398  (kmp_affin_mask_t *)(*mask));
4399  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4400  proc, gtid, buf);
4401  });
4402 
4403  if (__kmp_env_consistency_check) {
4404  if ((mask == NULL) || (*mask == NULL)) {
4405  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4406  }
4407  }
4408 
4409  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4410  return 0;
4411  }
4412  if (! KMP_CPU_ISSET(proc, fullMask)) {
4413  return 0;
4414  }
4415 
4416  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4417 }
4418 
4419 # if KMP_MIC
4420 
4421 // Dynamic affinity settings - Affinity balanced
4422 void __kmp_balanced_affinity( int tid, int nthreads )
4423 {
4424  if( __kmp_affinity_uniform_topology() ) {
4425  int coreID;
4426  int threadID;
4427  // Number of hyper threads per core in HT machine
4428  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4429  // Number of cores
4430  int ncores = __kmp_ncores;
4431  // How many threads will be bound to each core
4432  int chunk = nthreads / ncores;
4433  // How many cores will have an additional thread bound to them - the "big" cores
4434  int big_cores = nthreads % ncores;
4435  // Number of threads on the big cores
4436  int big_nth = ( chunk + 1 ) * big_cores;
4437  if( tid < big_nth ) {
4438  coreID = tid / (chunk + 1 );
4439  threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4440  } else { //tid >= big_nth
4441  coreID = ( tid - big_cores ) / chunk;
4442  threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4443  }
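 // Worked example (hypothetical values): nthreads == 10 and ncores == 4 give
 // chunk == 2, big_cores == 2, big_nth == 6; tids 0-5 land on the two "big"
 // cores (3 threads each) and tids 6-9 on the remaining two cores (2 each).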
4444 
4445  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4446  "Illegal set affinity operation when not capable");
4447 
4448  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4449  KMP_CPU_ZERO(mask);
4450 
4451  // Granularity == thread
4452  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4453  int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4454  KMP_CPU_SET( osID, mask);
4455  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4456  for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4457  int osID;
4458  osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4459  KMP_CPU_SET( osID, mask);
4460  }
4461  }
4462  if (__kmp_affinity_verbose) {
4463  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4464  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4465  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4466  tid, buf);
4467  }
4468  __kmp_set_system_affinity( mask, TRUE );
4469  } else { // Non-uniform topology
4470 
4471  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4472  KMP_CPU_ZERO(mask);
4473 
4474  // Number of hyper threads per core in HT machine
4475  int nth_per_core = __kmp_nThreadsPerCore;
4476  int core_level;
4477  if( nth_per_core > 1 ) {
4478  core_level = __kmp_aff_depth - 2;
4479  } else {
4480  core_level = __kmp_aff_depth - 1;
4481  }
4482 
4483  // Number of cores - maximum value; it does not count trailing cores with 0 processors
4484  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4485 
4486  // For performance, handle the special case nthreads == __kmp_avail_proc separately
4487  if( nthreads == __kmp_avail_proc ) {
4488  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4489  int osID = address2os[ tid ].second;
4490  KMP_CPU_SET( osID, mask);
4491  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4492  int coreID = address2os[ tid ].first.labels[ core_level ];
4493  // Count the osIDs found for the current core; there can be at most nth_per_core
4494  // of them. Since address2os is sorted, we can break when cnt == nth_per_core.
4495  int cnt = 0;
4496  for( int i = 0; i < __kmp_avail_proc; i++ ) {
4497  int osID = address2os[ i ].second;
4498  int core = address2os[ i ].first.labels[ core_level ];
4499  if( core == coreID ) {
4500  KMP_CPU_SET( osID, mask);
4501  cnt++;
4502  if( cnt == nth_per_core ) {
4503  break;
4504  }
4505  }
4506  }
4507  }
4508  } else if( nthreads <= __kmp_ncores ) {
4509 
4510  int core = 0;
4511  for( int i = 0; i < ncores; i++ ) {
4512  // Check whether this core (from procarr[]) has any available processor
4513  int in_mask = 0;
4514  for( int j = 0; j < nth_per_core; j++ ) {
4515  if( procarr[ i * nth_per_core + j ] != - 1 ) {
4516  in_mask = 1;
4517  break;
4518  }
4519  }
4520  if( in_mask ) {
4521  if( tid == core ) {
4522  for( int j = 0; j < nth_per_core; j++ ) {
4523  int osID = procarr[ i * nth_per_core + j ];
4524  if( osID != -1 ) {
4525  KMP_CPU_SET( osID, mask );
4526  // For granularity=thread it is enough to set the first available osID for this core
4527  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4528  break;
4529  }
4530  }
4531  }
4532  break;
4533  } else {
4534  core++;
4535  }
4536  }
4537  }
4538 
4539  } else { // nthreads > __kmp_ncores
4540 
4541  // Array to save the number of processors at each core
4542  int nproc_at_core[ ncores ];
4543  // Array to save the number of cores with exactly "x" available processors
4544  int ncores_with_x_procs[ nth_per_core + 1 ];
4545  // Array to save the number of cores with at least x (i.e. x to nth_per_core) procs
4546  int ncores_with_x_to_max_procs[ nth_per_core + 1 ];
4547 
4548  for( int i = 0; i <= nth_per_core; i++ ) {
4549  ncores_with_x_procs[ i ] = 0;
4550  ncores_with_x_to_max_procs[ i ] = 0;
4551  }
4552 
4553  for( int i = 0; i < ncores; i++ ) {
4554  int cnt = 0;
4555  for( int j = 0; j < nth_per_core; j++ ) {
4556  if( procarr[ i * nth_per_core + j ] != -1 ) {
4557  cnt++;
4558  }
4559  }
4560  nproc_at_core[ i ] = cnt;
4561  ncores_with_x_procs[ cnt ]++;
4562  }
4563 
4564  for( int i = 0; i <= nth_per_core; i++ ) {
4565  for( int j = i; j <= nth_per_core; j++ ) {
4566  ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4567  }
4568  }
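 // Illustration (hypothetical values): 3 cores with (2, 1, 2) available procs and
 // nth_per_core == 2 give ncores_with_x_procs == {0, 1, 2} and
 // ncores_with_x_to_max_procs == {3, 3, 2}.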
4569 
4570  // Max number of processors
4571  int nproc = nth_per_core * ncores;
4572  // An array to keep the number of threads assigned to each thread context
4573  int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4574  for( int i = 0; i < nproc; i++ ) {
4575  newarr[ i ] = 0;
4576  }
4577 
4578  int nth = nthreads;
4579  int flag = 0;
4580  while( nth > 0 ) {
4581  for( int j = 1; j <= nth_per_core; j++ ) {
4582  int cnt = ncores_with_x_to_max_procs[ j ];
4583  for( int i = 0; i < ncores; i++ ) {
4584  // Skip the core with 0 processors
4585  if( nproc_at_core[ i ] == 0 ) {
4586  continue;
4587  }
4588  for( int k = 0; k < nth_per_core; k++ ) {
4589  if( procarr[ i * nth_per_core + k ] != -1 ) {
4590  if( newarr[ i * nth_per_core + k ] == 0 ) {
4591  newarr[ i * nth_per_core + k ] = 1;
4592  cnt--;
4593  nth--;
4594  break;
4595  } else {
4596  if( flag != 0 ) {
4597  newarr[ i * nth_per_core + k ] ++;
4598  cnt--;
4599  nth--;
4600  break;
4601  }
4602  }
4603  }
4604  }
4605  if( cnt == 0 || nth == 0 ) {
4606  break;
4607  }
4608  }
4609  if( nth == 0 ) {
4610  break;
4611  }
4612  }
4613  flag = 1;
4614  }
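 // Illustration of the distribution above (hypothetical values): 2 cores with 2
 // contexts each and nthreads == 6 first give every context one thread, then the
 // flag allows doubling up, ending with newarr == {2, 1, 2, 1}.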
4615  int sum = 0;
4616  for( int i = 0; i < nproc; i++ ) {
4617  sum += newarr[ i ];
4618  if( sum > tid ) {
4619  // Granularity == thread
4620  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4621  int osID = procarr[ i ];
4622  KMP_CPU_SET( osID, mask);
4623  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4624  int coreID = i / nth_per_core;
4625  for( int ii = 0; ii < nth_per_core; ii++ ) {
4626  int osID = procarr[ coreID * nth_per_core + ii ];
4627  if( osID != -1 ) {
4628  KMP_CPU_SET( osID, mask);
4629  }
4630  }
4631  }
4632  break;
4633  }
4634  }
4635  __kmp_free( newarr );
4636  }
4637 
4638  if (__kmp_affinity_verbose) {
4639  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4640  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4641  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4642  tid, buf);
4643  }
4644  __kmp_set_system_affinity( mask, TRUE );
4645  }
4646 }
4647 
4648 # endif /* KMP_MIC */
4649 
4650 #else
4651  // affinity not supported
4652 
4653 kmp_uint32 mac_skipPerLevel[7];
4654 kmp_uint32 mac_depth;
4655 kmp_uint8 mac_leaf_kids;
4656 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4657  static int first = 1;
4658  if (first) {
4659  const kmp_uint32 maxLevels = 7;
4660  kmp_uint32 numPerLevel[maxLevels];
4661 
4662  for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4663  numPerLevel[i] = 1;
4664  mac_skipPerLevel[i] = 1;
4665  }
4666 
4667  mac_depth = 2;
4668  numPerLevel[0] = nproc;
4669 
4670  kmp_uint32 branch = 4;
4671  if (numPerLevel[0] == 1) branch = nproc/4;
4672  if (branch<4) branch=4;
4673  for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
4674  while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4675  if (numPerLevel[d] & 1) numPerLevel[d]++;
4676  numPerLevel[d] = numPerLevel[d] >> 1;
4677  if (numPerLevel[d+1] == 1) mac_depth++;
4678  numPerLevel[d+1] = numPerLevel[d+1] << 1;
4679  }
4680  if(numPerLevel[0] == 1) {
4681  branch = branch >> 1;
4682  if (branch<4) branch = 4;
4683  }
4684  }
4685 
4686  for (kmp_uint32 i=1; i<mac_depth; ++i)
4687  mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
4688  mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
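 // Sketch of the result (hypothetical value): with nproc == 16 the code above
 // yields numPerLevel == {4, 4, 1, ...}, mac_depth == 3, mac_skipPerLevel ==
 // {1, 4, 16, 1, ...}, and mac_leaf_kids == 3, i.e. a branching factor of 4.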
4689  first=0;
4690  }
4691  thr_bar->depth = mac_depth;
4692  thr_bar->base_leaf_kids = mac_leaf_kids;
4693  thr_bar->skip_per_level = mac_skipPerLevel;
4694 }
4695 
4696 #endif // KMP_AFFINITY_SUPPORTED