Intel® OpenMP* Runtime Library
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  * $Revision: 42613 $
4  * $Date: 2013-08-23 13:29:50 -0500 (Fri, 23 Aug 2013) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2013 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "kmp.h"
38 #include "kmp_i18n.h"
39 #include "kmp_io.h"
40 #include "kmp_str.h"
41 
42 
43 #if KMP_OS_WINDOWS || KMP_OS_LINUX
44 
45 //
46 // Print the affinity mask to the character array in a pretty format.
47 //
48 char *
49 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
50 {
51  KMP_ASSERT(buf_len >= 40);
52  char *scan = buf;
53  char *end = buf + buf_len - 1;
54 
55  //
56  // Find first element / check for empty set.
57  //
58  size_t i;
59  for (i = 0; i < KMP_CPU_SETSIZE; i++) {
60  if (KMP_CPU_ISSET(i, mask)) {
61  break;
62  }
63  }
64  if (i == KMP_CPU_SETSIZE) {
65  sprintf(scan, "{<empty>}");
66  while (*scan != '\0') scan++;
67  KMP_ASSERT(scan <= end);
68  return buf;
69  }
70 
71  sprintf(scan, "{%ld", i);
72  while (*scan != '\0') scan++;
73  i++;
74  for (; i < KMP_CPU_SETSIZE; i++) {
75  if (! KMP_CPU_ISSET(i, mask)) {
76  continue;
77  }
78 
79  //
80  // Check for buffer overflow. A string of the form ",<n>" will have
81  // at most 10 characters, plus we want to leave room to print ",...}"
82  // if the set is too large to print for a total of 15 characters.
83  // We already left room for '\0' in setting end.
84  //
85  if (end - scan < 15) {
86  break;
87  }
88  sprintf(scan, ",%-ld", i);
89  while (*scan != '\0') scan++;
90  }
91  if (i < KMP_CPU_SETSIZE) {
92  sprintf(scan, ",...");
93  while (*scan != '\0') scan++;
94  }
95  sprintf(scan, "}");
96  while (*scan != '\0') scan++;
97  KMP_ASSERT(scan <= end);
98  return buf;
99 }
100 
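// Illustrative usage sketch (not part of the original source): callers in this
// file pass a KMP_AFFIN_MASK_PRINT_LEN byte buffer and a previously obtained
// mask, then hand the formatted string to an informational message, e.g.:
//
//   char buf[KMP_AFFIN_MASK_PRINT_LEN];
//   __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
//   KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
//
// For a mask containing procs 0 through 3 the buffer would read "{0,1,2,3}".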
101 
102 void
103 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
104 {
105  KMP_CPU_ZERO(mask);
106 
107 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
108 
109  if (__kmp_num_proc_groups > 1) {
110  int group;
111  struct GROUP_AFFINITY ga;
112  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
113  for (group = 0; group < __kmp_num_proc_groups; group++) {
114  int i;
115  int num = __kmp_GetActiveProcessorCount(group);
116  for (i = 0; i < num; i++) {
117  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
118  }
119  }
120  }
121  else
122 
123 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
124 
125  {
126  int proc;
127  for (proc = 0; proc < __kmp_xproc; proc++) {
128  KMP_CPU_SET(proc, mask);
129  }
130  }
131 }
132 
133 
134 //
135 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
136 // functions.
137 //
138 // The icc codegen emits sections with extremely long names, of the form
139 // ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
140 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
141 // some sort of memory corruption or table overflow that is triggered by
142 // these long strings. I checked the latest version of the linker -
143 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
144 // fixed.
145 //
146 // Unfortunately, my attempts to reproduce it in a smaller example have
147 // failed - I'm not sure what the prospects are of getting it fixed
148 // properly - but we need a reproducer smaller than all of libiomp.
149 //
150 // Work around the problem by avoiding inline constructors in such builds.
151 // We do this for all platforms, not just Linux* OS - non-inline functions are
152  // more debuggable and provide better coverage than inline functions.
153 // Use inline functions in shipping libs, for performance.
154 //
155 
156 # if !defined(KMP_DEBUG) && !defined(COVER)
157 
158 class Address {
159 public:
160  static const unsigned maxDepth = 32;
161  unsigned labels[maxDepth];
162  unsigned childNums[maxDepth];
163  unsigned depth;
164  unsigned leader;
165  Address(unsigned _depth)
166  : depth(_depth), leader(FALSE) {
167  }
168  Address &operator=(const Address &b) {
169  depth = b.depth;
170  for (unsigned i = 0; i < depth; i++) {
171  labels[i] = b.labels[i];
172  childNums[i] = b.childNums[i];
173  }
174  leader = FALSE;
175  return *this;
176  }
177  bool operator==(const Address &b) const {
178  if (depth != b.depth)
179  return false;
180  for (unsigned i = 0; i < depth; i++)
181  if(labels[i] != b.labels[i])
182  return false;
183  return true;
184  }
185  bool isClose(const Address &b, int level) const {
186  if (depth != b.depth)
187  return false;
188  if ((unsigned)level >= depth)
189  return true;
190  for (unsigned i = 0; i < (depth - level); i++)
191  if(labels[i] != b.labels[i])
192  return false;
193  return true;
194  }
195  bool operator!=(const Address &b) const {
196  return !operator==(b);
197  }
198 };
199 
200 class AddrUnsPair {
201 public:
202  Address first;
203  unsigned second;
204  AddrUnsPair(Address _first, unsigned _second)
205  : first(_first), second(_second) {
206  }
207  AddrUnsPair &operator=(const AddrUnsPair &b)
208  {
209  first = b.first;
210  second = b.second;
211  return *this;
212  }
213 };
214 
215 # else
216 
217 class Address {
218 public:
219  static const unsigned maxDepth = 32;
220  unsigned labels[maxDepth];
221  unsigned childNums[maxDepth];
222  unsigned depth;
223  unsigned leader;
224  Address(unsigned _depth);
225  Address &operator=(const Address &b);
226  bool operator==(const Address &b) const;
227  bool isClose(const Address &b, int level) const;
228  bool operator!=(const Address &b) const;
229 };
230 
231 Address::Address(unsigned _depth)
232 {
233  depth = _depth;
234  leader = FALSE;
235 }
236 
237 Address &Address::operator=(const Address &b) {
238  depth = b.depth;
239  for (unsigned i = 0; i < depth; i++) {
240  labels[i] = b.labels[i];
241  childNums[i] = b.childNums[i];
242  }
243  leader = FALSE;
244  return *this;
245 }
246 
247 bool Address::operator==(const Address &b) const {
248  if (depth != b.depth)
249  return false;
250  for (unsigned i = 0; i < depth; i++)
251  if(labels[i] != b.labels[i])
252  return false;
253  return true;
254 }
255 
256 bool Address::isClose(const Address &b, int level) const {
257  if (depth != b.depth)
258  return false;
259  if ((unsigned)level >= depth)
260  return true;
261  for (unsigned i = 0; i < (depth - level); i++)
262  if(labels[i] != b.labels[i])
263  return false;
264  return true;
265 }
266 
267 bool Address::operator!=(const Address &b) const {
268  return !operator==(b);
269 }
270 
271 class AddrUnsPair {
272 public:
273  Address first;
274  unsigned second;
275  AddrUnsPair(Address _first, unsigned _second);
276  AddrUnsPair &operator=(const AddrUnsPair &b);
277 };
278 
279 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
280  : first(_first), second(_second)
281 {
282 }
283 
284 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
285 {
286  first = b.first;
287  second = b.second;
288  return *this;
289 }
290 
291 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
292 
293 
294 static int
295 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
296 {
297  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
298  ->first);
299  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
300  ->first);
301  unsigned depth = aa->depth;
302  unsigned i;
303  KMP_DEBUG_ASSERT(depth == bb->depth);
304  for (i = 0; i < depth; i++) {
305  if (aa->labels[i] < bb->labels[i]) return -1;
306  if (aa->labels[i] > bb->labels[i]) return 1;
307  }
308  return 0;
309 }
310 
311 
312 static int
313 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
314 {
315  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
316  ->first);
317  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
318  ->first);
319  unsigned depth = aa->depth;
320  unsigned i;
321  KMP_DEBUG_ASSERT(depth == bb->depth);
322  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
323  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
324  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
325  int j = depth - i - 1;
326  if (aa->childNums[j] < bb->childNums[j]) return -1;
327  if (aa->childNums[j] > bb->childNums[j]) return 1;
328  }
329  for (; i < depth; i++) {
330  int j = i - __kmp_affinity_compact;
331  if (aa->childNums[j] < bb->childNums[j]) return -1;
332  if (aa->childNums[j] > bb->childNums[j]) return 1;
333  }
334  return 0;
335 }
336 
337 
338 //
339 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
340 // called to renumber the labels from [0..n] and place them into the child_num
341 // vector of the address object. This is done in case the labels used for
342  // the children at one node of the hierarchy differ from those used for
343  // another node at the same level. Example: suppose the machine has 2 nodes
344  // with 2 packages each. The first node contains packages 601 and 602, and
345  // the second node contains packages 603 and 604. If we try to sort the table
346 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
347 // because we are paying attention to the labels themselves, not the ordinal
348 // child numbers. By using the child numbers in the sort, the result is
349 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
350 //
351 static void
352 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
353  int numAddrs)
354 {
355  KMP_DEBUG_ASSERT(numAddrs > 0);
356  int depth = address2os->first.depth;
357  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
358  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
359  * sizeof(unsigned));
360  int labCt;
361  for (labCt = 0; labCt < depth; labCt++) {
362  address2os[0].first.childNums[labCt] = counts[labCt] = 0;
363  lastLabel[labCt] = address2os[0].first.labels[labCt];
364  }
365  int i;
366  for (i = 1; i < numAddrs; i++) {
367  for (labCt = 0; labCt < depth; labCt++) {
368  if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
369  int labCt2;
370  for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
371  counts[labCt2] = 0;
372  lastLabel[labCt2] = address2os[i].first.labels[labCt2];
373  }
374  counts[labCt]++;
375  lastLabel[labCt] = address2os[i].first.labels[labCt];
376  break;
377  }
378  }
379  for (labCt = 0; labCt < depth; labCt++) {
380  address2os[i].first.childNums[labCt] = counts[labCt];
381  }
382  for (; labCt < (int)Address::maxDepth; labCt++) {
383  address2os[i].first.childNums[labCt] = 0;
384  }
385  }
386 }
387 
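// Illustrative example (not part of the original source), using the
// hypothetical node/package labels from the comment above: given four
// depth-2 addresses with labels {0,601}, {0,602}, {1,603}, {1,604}, this
// routine assigns childNums {0,0}, {0,1}, {1,0}, {1,1} respectively, so a
// subsequent sort by child number (with __kmp_affinity_compact == 2)
// compares the innermost level first and yields the interleaved order
// 601, 603, 602, 604.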
388 
389 //
390 // All of the __kmp_affinity_create_*_map() routines should set
391 // __kmp_affinity_masks to a vector of affinity mask objects of length
392 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
393 // return the number of levels in the machine topology tree (zero if
394 // __kmp_affinity_type == affinity_none).
395 //
396 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
397 // to the affinity mask for the initialization thread. They need to save and
398 // restore the mask, and it could be needed later, so saving it is just an
399 // optimization to avoid calling kmp_get_system_affinity() again.
400 //
401 static kmp_affin_mask_t *fullMask = NULL;
402 
403 kmp_affin_mask_t *
404 __kmp_affinity_get_fullMask() { return fullMask; }
405 
406 
407 static int nCoresPerPkg, nPackages;
408 int __kmp_nThreadsPerCore;
409 
410 //
411 // __kmp_affinity_uniform_topology() doesn't work when called from
412 // places which support arbitrarily many levels in the machine topology
413 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
414  // and __kmp_affinity_create_x2apicid_map().
415 //
416 inline static bool
417 __kmp_affinity_uniform_topology()
418 {
419  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
420 }
421 
422 
423 //
424 // Print out the detailed machine topology map, i.e. the physical locations
425 // of each OS proc.
426 //
427 static void
428 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
429  int pkgLevel, int coreLevel, int threadLevel)
430 {
431  int proc;
432 
433  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
434  for (proc = 0; proc < len; proc++) {
435  int level;
436  kmp_str_buf_t buf;
437  __kmp_str_buf_init(&buf);
438  for (level = 0; level < depth; level++) {
439  if (level == threadLevel) {
440  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
441  }
442  else if (level == coreLevel) {
443  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
444  }
445  else if (level == pkgLevel) {
446  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
447  }
448  else if (level > pkgLevel) {
449  __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
450  level - pkgLevel - 1);
451  }
452  else {
453  __kmp_str_buf_print(&buf, "L%d ", level);
454  }
455  __kmp_str_buf_print(&buf, "%d ",
456  address2os[proc].first.labels[level]);
457  }
458  KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
459  buf.str);
460  __kmp_str_buf_free(&buf);
461  }
462 }
463 
464 
465 //
466 // If we don't know how to retrieve the machine's processor topology, or
467 // encounter an error in doing so, this routine is called to form a "flat"
468 // mapping of os thread id's <-> processor id's.
469 //
470 static int
471 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
472  kmp_i18n_id_t *const msg_id)
473 {
474  *address2os = NULL;
475  *msg_id = kmp_i18n_null;
476 
477  //
478  // Even if __kmp_affinity_type == affinity_none, this routine might still
479  // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
480  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
481  //
482  if (! KMP_AFFINITY_CAPABLE()) {
483  KMP_ASSERT(__kmp_affinity_type == affinity_none);
484  __kmp_ncores = nPackages = __kmp_xproc;
485  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
486  __kmp_ht_enabled = FALSE;
487  if (__kmp_affinity_verbose) {
488  KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
489  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
490  KMP_INFORM(Uniform, "KMP_AFFINITY");
491  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
492  __kmp_nThreadsPerCore, __kmp_ncores);
493  }
494  return 0;
495  }
496 
497  //
498  // When affinity is off, this routine will still be called to set
499  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
500  // nCoresPerPkg, & nPackages. Make sure all these vars are set
501  // correctly, and return now if affinity is not enabled.
502  //
503  __kmp_ncores = nPackages = __kmp_avail_proc;
504  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
505  __kmp_ht_enabled = FALSE;
506  if (__kmp_affinity_verbose) {
507  char buf[KMP_AFFIN_MASK_PRINT_LEN];
508  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
509 
510  KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
511  if (__kmp_affinity_respect_mask) {
512  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
513  } else {
514  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
515  }
516  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
517  KMP_INFORM(Uniform, "KMP_AFFINITY");
518  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
519  __kmp_nThreadsPerCore, __kmp_ncores);
520  }
521  if (__kmp_affinity_type == affinity_none) {
522  return 0;
523  }
524 
525  //
526  // Construct the data structure to be returned.
527  //
528  *address2os = (AddrUnsPair*)
529  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
530  int avail_ct = 0;
531  unsigned int i;
532  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
533  //
534  // Skip this proc if it is not included in the machine model.
535  //
536  if (! KMP_CPU_ISSET(i, fullMask)) {
537  continue;
538  }
539 
540  Address addr(1);
541  addr.labels[0] = i;
542  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
543  }
544  if (__kmp_affinity_verbose) {
545  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
546  }
547 
548  if (__kmp_affinity_gran_levels < 0) {
549  //
550  // Only the package level is modeled in the machine topology map,
551  // so the #levels of granularity is either 0 or 1.
552  //
553  if (__kmp_affinity_gran > affinity_gran_package) {
554  __kmp_affinity_gran_levels = 1;
555  }
556  else {
557  __kmp_affinity_gran_levels = 0;
558  }
559  }
560  return 1;
561 }
562 
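// Illustrative note (not part of the original source): the flat map models a
// depth-1 topology, so an available OS proc such as 5 is stored as the pair
// (Address with labels[0] == 5, osId == 5). Accordingly,
// __kmp_affinity_gran_levels ends up as either 0 (granularity of package or
// finer) or 1 (anything coarser than package).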
563 
564 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
565 
566 //
567 // If multiple Windows* OS processor groups exist, we can create a 2-level
568 // topology map with the groups at level 0 and the individual procs at
569 // level 1.
570 //
571 // This facilitates letting the threads float among all procs in a group,
572 // if granularity=group (the default when there are multiple groups).
573 //
574 static int
575 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
576  kmp_i18n_id_t *const msg_id)
577 {
578  *address2os = NULL;
579  *msg_id = kmp_i18n_null;
580 
581  //
582  // If we don't have multiple processor groups, return now.
583  // The flat mapping will be used.
584  //
585  if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
586  // FIXME set *msg_id
587  return -1;
588  }
589 
590  //
591  // Construct the data structure to be returned.
592  //
593  *address2os = (AddrUnsPair*)
594  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
595  int avail_ct = 0;
596  int i;
597  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
598  //
599  // Skip this proc if it is not included in the machine model.
600  //
601  if (! KMP_CPU_ISSET(i, fullMask)) {
602  continue;
603  }
604 
605  Address addr(2);
606  addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
607  addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
608  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
609 
610  if (__kmp_affinity_verbose) {
611  KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
612  addr.labels[1]);
613  }
614  }
615 
616  if (__kmp_affinity_gran_levels < 0) {
617  if (__kmp_affinity_gran == affinity_gran_group) {
618  __kmp_affinity_gran_levels = 1;
619  }
620  else if ((__kmp_affinity_gran == affinity_gran_fine)
621  || (__kmp_affinity_gran == affinity_gran_thread)) {
622  __kmp_affinity_gran_levels = 0;
623  }
624  else {
625  const char *gran_str = NULL;
626  if (__kmp_affinity_gran == affinity_gran_core) {
627  gran_str = "core";
628  }
629  else if (__kmp_affinity_gran == affinity_gran_package) {
630  gran_str = "package";
631  }
632  else if (__kmp_affinity_gran == affinity_gran_node) {
633  gran_str = "node";
634  }
635  else {
636  KMP_ASSERT(0);
637  }
638 
639  // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
640  __kmp_affinity_gran_levels = 0;
641  }
642  }
643  return 2;
644 }
645 
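// Illustrative example (not part of the original source): on a system with
// two 64-proc processor groups, OS proc 70 is assigned the depth-2 address
// {group 1, proc-within-group 6}, since 70 / 64 == 1 and 70 % 64 == 6
// (CHAR_BIT * sizeof(DWORD_PTR) == 64 on 64-bit Windows* OS).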
646 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
647 
648 
649 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
650 
651 static int
652 __kmp_cpuid_mask_width(int count) {
653  int r = 0;
654 
655  while((1<<r) < count)
656  ++r;
657  return r;
658 }
659 
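// For reference (illustrative, not part of the original source): the mask
// width is the number of bits needed to encode "count" distinct values, e.g.
// __kmp_cpuid_mask_width(6) == 3 because 1 << 3 == 8 >= 6, and
// __kmp_cpuid_mask_width(1) == 0.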
660 
661 class apicThreadInfo {
662 public:
663  unsigned osId; // param to __kmp_affinity_bind_thread
664  unsigned apicId; // from cpuid after binding
665  unsigned maxCoresPerPkg; // ""
666  unsigned maxThreadsPerPkg; // ""
667  unsigned pkgId; // inferred from above values
668  unsigned coreId; // ""
669  unsigned threadId; // ""
670 };
671 
672 
673 static int
674 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
675 {
676  const apicThreadInfo *aa = (const apicThreadInfo *)a;
677  const apicThreadInfo *bb = (const apicThreadInfo *)b;
678  if (aa->osId < bb->osId) return -1;
679  if (aa->osId > bb->osId) return 1;
680  return 0;
681 }
682 
683 
684 static int
685 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
686 {
687  const apicThreadInfo *aa = (const apicThreadInfo *)a;
688  const apicThreadInfo *bb = (const apicThreadInfo *)b;
689  if (aa->pkgId < bb->pkgId) return -1;
690  if (aa->pkgId > bb->pkgId) return 1;
691  if (aa->coreId < bb->coreId) return -1;
692  if (aa->coreId > bb->coreId) return 1;
693  if (aa->threadId < bb->threadId) return -1;
694  if (aa->threadId > bb->threadId) return 1;
695  return 0;
696 }
697 
698 
699 //
700 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
701 // an algorithm which cycles through the available os threads, setting
702 // the current thread's affinity mask to that thread, and then retrieves
703 // the Apic Id for each thread context using the cpuid instruction.
704 //
705 static int
706 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
707  kmp_i18n_id_t *const msg_id)
708 {
709  int rc;
710  *address2os = NULL;
711  *msg_id = kmp_i18n_null;
712 
713 # if KMP_MIC
714  {
715  // The code below will use cpuid(4).
716  // Check if cpuid(4) is supported.
717  // FIXME? - this really doesn't need to be specific to MIC.
718  kmp_cpuid buf;
719  __kmp_x86_cpuid(0, 0, &buf);
720  if (buf.eax < 4) {
721  *msg_id = kmp_i18n_str_NoLeaf4Support;
722  return -1;
723  }
724  }
725 # endif // KMP_MIC
726 
727  //
728  // Even if __kmp_affinity_type == affinity_none, this routine is still
729  // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
730  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
731  //
732  // The algorithm used starts by setting the affinity to each available
733  // thread and retrieving info from the cpuid instruction, so if we are not
734  // capable of calling __kmp_get_system_affinity()/__kmp_set_system_affinity(),
735  // then we need to do something else.
736  //
737  if (! KMP_AFFINITY_CAPABLE()) {
738  //
739  // Hack to try and infer the machine topology using only the data
740  // available from cpuid on the current thread, and __kmp_xproc.
741  //
742  KMP_ASSERT(__kmp_affinity_type == affinity_none);
743 
744  //
745  // Get an upper bound on the number of threads per package using
746  // cpuid(1).
747  //
748  // On some OS/chip combinations where HT is supported by the chip
749  // but is disabled, this value will be 2 on a single core chip.
750  // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
751  //
752  kmp_cpuid buf;
753  __kmp_x86_cpuid(1, 0, &buf);
754  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
755  if (maxThreadsPerPkg == 0) {
756  maxThreadsPerPkg = 1;
757  }
758 
759  //
760  // The num cores per pkg comes from cpuid(4).
761  // 1 must be added to the encoded value.
762  //
763  // The author of cpu_count.cpp treated this as only an upper bound
764  // on the number of cores, but I haven't seen any cases where it
765  // was greater than the actual number of cores, so we will treat
766  // it as exact in this block of code.
767  //
768  // First, we need to check if cpuid(4) is supported on this chip.
769  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
770  // has the value n or greater.
771  //
772  __kmp_x86_cpuid(0, 0, &buf);
773  if (buf.eax >= 4) {
774  __kmp_x86_cpuid(4, 0, &buf);
775  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
776  }
777  else {
778  nCoresPerPkg = 1;
779  }
780 
781  //
782  // There is no way to reliably tell if HT is enabled without issuing
783  // the cpuid instruction from every thread, and correlating the cpuid
784  // info, so if the machine is not affinity capable, we assume that HT
785  // is off. We have seen quite a few machines where maxThreadsPerPkg
786  // is 2, yet the machine does not support HT.
787  //
788  // - Older OSes are usually found on machines with older chips, which
789  // do not support HT.
790  //
791  // - The performance penalty for mistakenly identifying a machine as
792  // HT when it isn't (which results in blocktime being incorrectly set
793  // to 0) is greater than the penalty for mistakenly identifying
794  // a machine as being 1 thread/core when it is really HT enabled
795  // (which results in blocktime being incorrectly set to a positive
796  // value).
797  //
798  __kmp_ncores = __kmp_xproc;
799  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
800  __kmp_nThreadsPerCore = 1;
801  __kmp_ht_enabled = FALSE;
802  if (__kmp_affinity_verbose) {
803  KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
804  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
805  if (__kmp_affinity_uniform_topology()) {
806  KMP_INFORM(Uniform, "KMP_AFFINITY");
807  } else {
808  KMP_INFORM(NonUniform, "KMP_AFFINITY");
809  }
810  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
811  __kmp_nThreadsPerCore, __kmp_ncores);
812  }
813  return 0;
814  }
815 
816  //
817  //
818  // From here on, we can assume that it is safe to call
819  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
820  // even if __kmp_affinity_type = affinity_none.
821  //
822 
823  //
824  // Save the affinity mask for the current thread.
825  //
826  kmp_affin_mask_t *oldMask;
827  KMP_CPU_ALLOC(oldMask);
828  KMP_ASSERT(oldMask != NULL);
829  __kmp_get_system_affinity(oldMask, TRUE);
830 
831  //
832  // Run through each of the available contexts, binding the current thread
833  // to it, and obtaining the pertinent information using the cpuid instr.
834  //
835  // The relevant information is:
836  //
837  // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
838  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
839  //
840  // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
841  // value of this field determines the width of the core# + thread#
842  // fields in the Apic Id. It is also an upper bound on the number
843  // of threads per package, but it has been verified that situations
844  // happen where it is not exact. In particular, on certain OS/chip
845  // combinations where Intel(R) Hyper-Threading Technology is supported
846  // by the chip but has
847  // been disabled, the value of this field will be 2 (for a single core
848  // chip). On other OS/chip combinations supporting
849  // Intel(R) Hyper-Threading Technology, the value of
850  // this field will be 1 when Intel(R) Hyper-Threading Technology is
851  // disabled and 2 when it is enabled.
852  //
853  // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
854  // value of this field (+1) determines the width of the core# field in
855  // the Apic Id. The comments in "cpucount.cpp" say that this value is
856  // an upper bound, but the IA-32 architecture manual says that it is
857  // exactly the number of cores per package, and I haven't seen any
858  // case where it wasn't.
859  //
860  // From this information, deduce the package Id, core Id, and thread Id,
861  // and set the corresponding fields in the apicThreadInfo struct.
862  //
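// Illustrative worked example (not part of the original source; the values
// are hypothetical): suppose cpuid(1) reports maxThreadsPerPkg = 8, cpuid(4)
// reports maxCoresPerPkg = 4, and the bound thread's apicId is 0x1d. Then
//   widthCT  = __kmp_cpuid_mask_width(8) = 3
//   widthC   = __kmp_cpuid_mask_width(4) = 2, so widthT = 3 - 2 = 1
//   pkgId    = 0x1d >> 3       = 3
//   coreId   = (0x1d >> 1) & 3 = 2
//   threadId = 0x1d & 1        = 1
// which matches the decomposition performed in the loop below.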
863  unsigned i;
864  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
865  __kmp_avail_proc * sizeof(apicThreadInfo));
866  unsigned nApics = 0;
867  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
868  //
869  // Skip this proc if it is not included in the machine model.
870  //
871  if (! KMP_CPU_ISSET(i, fullMask)) {
872  continue;
873  }
874  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
875 
876  __kmp_affinity_bind_thread(i);
877  threadInfo[nApics].osId = i;
878 
879  //
880  // The apic id and max threads per pkg come from cpuid(1).
881  //
882  kmp_cpuid buf;
883  __kmp_x86_cpuid(1, 0, &buf);
884  if (! ((buf.edx >> 9) & 1)) {
885  __kmp_set_system_affinity(oldMask, TRUE);
886  __kmp_free(threadInfo);
887  KMP_CPU_FREE(oldMask);
888  *msg_id = kmp_i18n_str_ApicNotPresent;
889  return -1;
890  }
891  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
892  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
893  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
894  threadInfo[nApics].maxThreadsPerPkg = 1;
895  }
896 
897  //
898  // Max cores per pkg comes from cpuid(4).
899  // 1 must be added to the encoded value.
900  //
901  // First, we need to check if cpuid(4) is supported on this chip.
902  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
903  // has the value n or greater.
904  //
905  __kmp_x86_cpuid(0, 0, &buf);
906  if (buf.eax >= 4) {
907  __kmp_x86_cpuid(4, 0, &buf);
908  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
909  }
910  else {
911  threadInfo[nApics].maxCoresPerPkg = 1;
912  }
913 
914  //
915  // Infer the pkgId / coreId / threadId using only the info
916  // obtained locally.
917  //
918  int widthCT = __kmp_cpuid_mask_width(
919  threadInfo[nApics].maxThreadsPerPkg);
920  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
921 
922  int widthC = __kmp_cpuid_mask_width(
923  threadInfo[nApics].maxCoresPerPkg);
924  int widthT = widthCT - widthC;
925  if (widthT < 0) {
926  //
927  // I've never seen this one happen, but I suppose it could, if
928  // the cpuid instruction on a chip was really screwed up.
929  // Make sure to restore the affinity mask before the tail call.
930  //
931  __kmp_set_system_affinity(oldMask, TRUE);
932  __kmp_free(threadInfo);
933  KMP_CPU_FREE(oldMask);
934  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
935  return -1;
936  }
937 
938  int maskC = (1 << widthC) - 1;
939  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
940  &maskC;
941 
942  int maskT = (1 << widthT) - 1;
943  threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
944 
945  nApics++;
946  }
947 
948  //
949  // We've collected all the info we need.
950  // Restore the old affinity mask for this thread.
951  //
952  __kmp_set_system_affinity(oldMask, TRUE);
953 
954  //
955  // If there's only one thread context to bind to, form an Address object
956  // with depth 1 and return immediately (or, if affinity is off, set
957  // address2os to NULL and return).
958  //
959  // If it is configured to omit the package level when there is only a
960  // single package, the logic at the end of this routine won't work if
961  // there is only a single thread - it would try to form an Address
962  // object with depth 0.
963  //
964  KMP_ASSERT(nApics > 0);
965  if (nApics == 1) {
966  __kmp_ncores = nPackages = 1;
967  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
968  __kmp_ht_enabled = FALSE;
969  if (__kmp_affinity_verbose) {
970  char buf[KMP_AFFIN_MASK_PRINT_LEN];
971  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
972 
973  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
974  if (__kmp_affinity_respect_mask) {
975  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
976  } else {
977  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
978  }
979  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
980  KMP_INFORM(Uniform, "KMP_AFFINITY");
981  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
982  __kmp_nThreadsPerCore, __kmp_ncores);
983  }
984 
985  if (__kmp_affinity_type == affinity_none) {
986  __kmp_free(threadInfo);
987  KMP_CPU_FREE(oldMask);
988  return 0;
989  }
990 
991  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
992  Address addr(1);
993  addr.labels[0] = threadInfo[0].pkgId;
994  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
995 
996  if (__kmp_affinity_gran_levels < 0) {
997  __kmp_affinity_gran_levels = 0;
998  }
999 
1000  if (__kmp_affinity_verbose) {
1001  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1002  }
1003 
1004  __kmp_free(threadInfo);
1005  KMP_CPU_FREE(oldMask);
1006  return 1;
1007  }
1008 
1009  //
1010  // Sort the threadInfo table by physical Id.
1011  //
1012  qsort(threadInfo, nApics, sizeof(*threadInfo),
1013  __kmp_affinity_cmp_apicThreadInfo_phys_id);
1014 
1015  //
1016  // The table is now sorted by pkgId / coreId / threadId, but we really
1017  // don't know the radix of any of the fields. pkgId's may be sparsely
1018  // assigned among the chips on a system. Although coreId's are usually
1019  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1020  // [0..threadsPerCore-1], we don't want to make any such assumptions.
1021  //
1022  // For that matter, we don't know what coresPerPkg and threadsPerCore
1023  // (or the total # packages) are at this point - we want to determine
1024  // that now. We only have an upper bound on the first two figures.
1025  //
1026  // We also perform a consistency check at this point: the values returned
1027  // by the cpuid instruction for any thread bound to a given package had
1028  // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1029  //
1030  nPackages = 1;
1031  nCoresPerPkg = 1;
1032  __kmp_nThreadsPerCore = 1;
1033  unsigned nCores = 1;
1034 
1035  unsigned pkgCt = 1; // to determine radii
1036  unsigned lastPkgId = threadInfo[0].pkgId;
1037  unsigned coreCt = 1;
1038  unsigned lastCoreId = threadInfo[0].coreId;
1039  unsigned threadCt = 1;
1040  unsigned lastThreadId = threadInfo[0].threadId;
1041 
1042  // intra-pkg consistency checks
1043  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1044  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1045 
1046  for (i = 1; i < nApics; i++) {
1047  if (threadInfo[i].pkgId != lastPkgId) {
1048  nCores++;
1049  pkgCt++;
1050  lastPkgId = threadInfo[i].pkgId;
1051  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1052  coreCt = 1;
1053  lastCoreId = threadInfo[i].coreId;
1054  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1055  threadCt = 1;
1056  lastThreadId = threadInfo[i].threadId;
1057 
1058  //
1059  // This is a different package, so go on to the next iteration
1060  // without doing any consistency checks. Reset the consistency
1061  // check vars, though.
1062  //
1063  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1064  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1065  continue;
1066  }
1067 
1068  if (threadInfo[i].coreId != lastCoreId) {
1069  nCores++;
1070  coreCt++;
1071  lastCoreId = threadInfo[i].coreId;
1072  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1073  threadCt = 1;
1074  lastThreadId = threadInfo[i].threadId;
1075  }
1076  else if (threadInfo[i].threadId != lastThreadId) {
1077  threadCt++;
1078  lastThreadId = threadInfo[i].threadId;
1079  }
1080  else {
1081  __kmp_free(threadInfo);
1082  KMP_CPU_FREE(oldMask);
1083  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1084  return -1;
1085  }
1086 
1087  //
1088  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1089  // fields agree between all the threads bound to a given package.
1090  //
1091  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1092  || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1093  __kmp_free(threadInfo);
1094  KMP_CPU_FREE(oldMask);
1095  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1096  return -1;
1097  }
1098  }
1099  nPackages = pkgCt;
1100  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1101  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1102 
1103  //
1104  // When affinity is off, this routine will still be called to set
1105  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1106  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1107  // correctly, and return now if affinity is not enabled.
1108  //
1109  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1110  __kmp_ncores = nCores;
1111  if (__kmp_affinity_verbose) {
1112  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1113  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1114 
1115  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1116  if (__kmp_affinity_respect_mask) {
1117  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1118  } else {
1119  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1120  }
1121  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1122  if (__kmp_affinity_uniform_topology()) {
1123  KMP_INFORM(Uniform, "KMP_AFFINITY");
1124  } else {
1125  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1126  }
1127  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1128  __kmp_nThreadsPerCore, __kmp_ncores);
1129 
1130  }
1131 
1132  if (__kmp_affinity_type == affinity_none) {
1133  __kmp_free(threadInfo);
1134  KMP_CPU_FREE(oldMask);
1135  return 0;
1136  }
1137 
1138  //
1139  // Now that we've determined the number of packages, the number of cores
1140  // per package, and the number of threads per core, we can construct the
1141  // data structure that is to be returned.
1142  //
1143  int pkgLevel = 0;
1144  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1145  int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1146  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1147 
1148  KMP_ASSERT(depth > 0);
1149  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1150 
1151  for (i = 0; i < nApics; ++i) {
1152  Address addr(depth);
1153  unsigned os = threadInfo[i].osId;
1154  int d = 0;
1155 
1156  if (pkgLevel >= 0) {
1157  addr.labels[d++] = threadInfo[i].pkgId;
1158  }
1159  if (coreLevel >= 0) {
1160  addr.labels[d++] = threadInfo[i].coreId;
1161  }
1162  if (threadLevel >= 0) {
1163  addr.labels[d++] = threadInfo[i].threadId;
1164  }
1165  (*address2os)[i] = AddrUnsPair(addr, os);
1166  }
1167 
1168  if (__kmp_affinity_gran_levels < 0) {
1169  //
1170  // Set the granularity level based on what levels are modeled
1171  // in the machine topology map.
1172  //
1173  __kmp_affinity_gran_levels = 0;
1174  if ((threadLevel >= 0)
1175  && (__kmp_affinity_gran > affinity_gran_thread)) {
1176  __kmp_affinity_gran_levels++;
1177  }
1178  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1179  __kmp_affinity_gran_levels++;
1180  }
1181  if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1182  __kmp_affinity_gran_levels++;
1183  }
1184  }
1185 
1186  if (__kmp_affinity_verbose) {
1187  __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1188  coreLevel, threadLevel);
1189  }
1190 
1191  __kmp_free(threadInfo);
1192  KMP_CPU_FREE(oldMask);
1193  return depth;
1194 }
1195 
1196 
1197 //
1198 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1199 // architectures support a newer interface for specifying the x2APIC Ids,
1200 // based on cpuid leaf 11.
1201 //
1202 static int
1203 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1204  kmp_i18n_id_t *const msg_id)
1205 {
1206  kmp_cpuid buf;
1207 
1208  *address2os = NULL;
1209  *msg_id = kmp_i18n_null;
1210 
1211  //
1212  // Check to see if cpuid leaf 11 is supported.
1213  //
1214  __kmp_x86_cpuid(0, 0, &buf);
1215  if (buf.eax < 11) {
1216  *msg_id = kmp_i18n_str_NoLeaf11Support;
1217  return -1;
1218  }
1219  __kmp_x86_cpuid(11, 0, &buf);
1220  if (buf.ebx == 0) {
1221  *msg_id = kmp_i18n_str_NoLeaf11Support;
1222  return -1;
1223  }
1224 
1225  //
1226  // Find the number of levels in the machine topology. While we're at it,
1227  // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1228  // try to get more accurate values later by explicitly counting them,
1229  // but get reasonable defaults now, in case we return early.
1230  //
1231  int level;
1232  int threadLevel = -1;
1233  int coreLevel = -1;
1234  int pkgLevel = -1;
1235  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1236 
1237  for (level = 0;; level++) {
1238  if (level > 31) {
1239  //
1240  // FIXME: Hack for DPD200163180
1241  //
1242  // If level is big then something went wrong -> exiting
1243  //
1244  // There could actually be 32 valid levels in the machine topology,
1245  // but so far, the only machine we have seen which does not exit
1246  // this loop before iteration 32 has fubar x2APIC settings.
1247  //
1248  // For now, just reject this case based upon loop trip count.
1249  //
1250  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1251  return -1;
1252  }
1253  __kmp_x86_cpuid(11, level, &buf);
1254  if (buf.ebx == 0) {
1255  if (pkgLevel < 0) {
1256  //
1257  // Will infer nPackages from __kmp_xproc
1258  //
1259  pkgLevel = level;
1260  level++;
1261  }
1262  break;
1263  }
1264  int kind = (buf.ecx >> 8) & 0xff;
1265  if (kind == 1) {
1266  //
1267  // SMT level
1268  //
1269  threadLevel = level;
1270  coreLevel = -1;
1271  pkgLevel = -1;
1272  __kmp_nThreadsPerCore = buf.ebx & 0xff;
1273  if (__kmp_nThreadsPerCore == 0) {
1274  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1275  return -1;
1276  }
1277  }
1278  else if (kind == 2) {
1279  //
1280  // core level
1281  //
1282  coreLevel = level;
1283  pkgLevel = -1;
1284  nCoresPerPkg = buf.ebx & 0xff;
1285  if (nCoresPerPkg == 0) {
1286  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1287  return -1;
1288  }
1289  }
1290  else {
1291  if (level <= 0) {
1292  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1293  return -1;
1294  }
1295  if (pkgLevel >= 0) {
1296  continue;
1297  }
1298  pkgLevel = level;
1299  nPackages = buf.ebx & 0xff;
1300  if (nPackages == 0) {
1301  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1302  return -1;
1303  }
1304  }
1305  }
1306  int depth = level;
1307 
1308  //
1309  // In the above loop, "level" was counted from the finest level (usually
1310  // thread) to the coarsest. The caller expects that we will place the
1311  // labels in (*address2os)[].first.labels[] in the inverse order, so
1312  // we need to invert the vars saying which level means what.
1313  //
1314  if (threadLevel >= 0) {
1315  threadLevel = depth - threadLevel - 1;
1316  }
1317  if (coreLevel >= 0) {
1318  coreLevel = depth - coreLevel - 1;
1319  }
1320  KMP_DEBUG_ASSERT(pkgLevel >= 0);
1321  pkgLevel = depth - pkgLevel - 1;
1322 
1323  //
1324  // The algorithm used starts by setting the affinity to each available
1325  // thread and retrieving info from the cpuid instruction, so if we are not
1326  // capable of calling __kmp_get_system_affinity()/__kmp_set_system_affinity(),
1327  // then we need to do something else - use the defaults that we calculated
1328  // from issuing cpuid without binding to each proc.
1329  //
1330  if (! KMP_AFFINITY_CAPABLE())
1331  {
1332  //
1333  // Hack to try and infer the machine topology using only the data
1334  // available from cpuid on the current thread, and __kmp_xproc.
1335  //
1336  KMP_ASSERT(__kmp_affinity_type == affinity_none);
1337 
1338  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1339  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1340  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1341  if (__kmp_affinity_verbose) {
1342  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1343  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1344  if (__kmp_affinity_uniform_topology()) {
1345  KMP_INFORM(Uniform, "KMP_AFFINITY");
1346  } else {
1347  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1348  }
1349  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1350  __kmp_nThreadsPerCore, __kmp_ncores);
1351  }
1352  return 0;
1353  }
1354 
1355  //
1356  //
1357  // From here on, we can assume that it is safe to call
1358  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1359  // even if __kmp_affinity_type = affinity_none.
1360  //
1361 
1362  //
1363  // Save the affinity mask for the current thread.
1364  //
1365  kmp_affin_mask_t *oldMask;
1366  KMP_CPU_ALLOC(oldMask);
1367  __kmp_get_system_affinity(oldMask, TRUE);
1368 
1369  //
1370  // Allocate the data structure to be returned.
1371  //
1372  AddrUnsPair *retval = (AddrUnsPair *)
1373  __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1374 
1375  //
1376  // Run through each of the available contexts, binding the current thread
1377  // to it, and obtaining the pertinent information using the cpuid instr.
1378  //
1379  unsigned int proc;
1380  int nApics = 0;
1381  for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1382  //
1383  // Skip this proc if it is not included in the machine model.
1384  //
1385  if (! KMP_CPU_ISSET(proc, fullMask)) {
1386  continue;
1387  }
1388  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1389 
1390  __kmp_affinity_bind_thread(proc);
1391 
1392  //
1393  // Extract the labels for each level in the machine topology map
1394  // from the Apic ID.
1395  //
1396  Address addr(depth);
1397  int prev_shift = 0;
1398 
1399  for (level = 0; level < depth; level++) {
1400  __kmp_x86_cpuid(11, level, &buf);
1401  unsigned apicId = buf.edx;
1402  if (buf.ebx == 0) {
1403  if (level != depth - 1) {
1404  KMP_CPU_FREE(oldMask);
1405  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1406  return -1;
1407  }
1408  addr.labels[depth - level - 1] = apicId >> prev_shift;
1409  level++;
1410  break;
1411  }
1412  int shift = buf.eax & 0x1f;
1413  int mask = (1 << shift) - 1;
1414  addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1415  prev_shift = shift;
1416  }
1417  if (level != depth) {
1418  KMP_CPU_FREE(oldMask);
1419  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1420  return -1;
1421  }
1422 
1423  retval[nApics] = AddrUnsPair(addr, proc);
1424  nApics++;
1425  }
1426 
1427  //
1428  // We've collected all the info we need.
1429  // Restore the old affinity mask for this thread.
1430  //
1431  __kmp_set_system_affinity(oldMask, TRUE);
1432 
1433  //
1434  // If there's only one thread context to bind to, return now.
1435  //
1436  KMP_ASSERT(nApics > 0);
1437  if (nApics == 1) {
1438  __kmp_ncores = nPackages = 1;
1439  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1440  __kmp_ht_enabled = FALSE;
1441  if (__kmp_affinity_verbose) {
1442  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1443  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1444 
1445  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1446  if (__kmp_affinity_respect_mask) {
1447  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1448  } else {
1449  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1450  }
1451  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1452  KMP_INFORM(Uniform, "KMP_AFFINITY");
1453  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1454  __kmp_nThreadsPerCore, __kmp_ncores);
1455  }
1456 
1457  if (__kmp_affinity_type == affinity_none) {
1458  __kmp_free(retval);
1459  KMP_CPU_FREE(oldMask);
1460  return 0;
1461  }
1462 
1463  //
1464  // Form an Address object which only includes the package level.
1465  //
1466  Address addr(1);
1467  addr.labels[0] = retval[0].first.labels[pkgLevel];
1468  retval[0].first = addr;
1469 
1470  if (__kmp_affinity_gran_levels < 0) {
1471  __kmp_affinity_gran_levels = 0;
1472  }
1473 
1474  if (__kmp_affinity_verbose) {
1475  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1476  }
1477 
1478  *address2os = retval;
1479  KMP_CPU_FREE(oldMask);
1480  return 1;
1481  }
1482 
1483  //
1484  // Sort the table by physical Id.
1485  //
1486  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1487 
1488  //
1489  // Find the radix at each of the levels.
1490  //
1491  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1492  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1493  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1494  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1495  for (level = 0; level < depth; level++) {
1496  totals[level] = 1;
1497  maxCt[level] = 1;
1498  counts[level] = 1;
1499  last[level] = retval[0].first.labels[level];
1500  }
1501 
1502  //
1503  // From here on, the iteration variable "level" runs from the finest
1504  // level to the coarsest, i.e. we iterate forward through
1505  // (*address2os)[].first.labels[] - in the previous loops, we iterated
1506  // backwards.
1507  //
1508  for (proc = 1; (int)proc < nApics; proc++) {
1509  int level;
1510  for (level = 0; level < depth; level++) {
1511  if (retval[proc].first.labels[level] != last[level]) {
1512  int j;
1513  for (j = level + 1; j < depth; j++) {
1514  totals[j]++;
1515  counts[j] = 1;
1516  // The line below causes incorrect topology information to be printed
1517  // whenever the maximum value for some level (maxCt[level]) is encountered
1518  // earlier in the array than a smaller value.
1519  // For example, suppose pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2
1520  // whereas it must be 4.
1521  // TODO!!! Check if it can be commented safely
1522  //maxCt[j] = 1;
1523  last[j] = retval[proc].first.labels[j];
1524  }
1525  totals[level]++;
1526  counts[level]++;
1527  if (counts[level] > maxCt[level]) {
1528  maxCt[level] = counts[level];
1529  }
1530  last[level] = retval[proc].first.labels[level];
1531  break;
1532  }
1533  else if (level == depth - 1) {
1534  __kmp_free(last);
1535  __kmp_free(maxCt);
1536  __kmp_free(counts);
1537  __kmp_free(totals);
1538  __kmp_free(retval);
1539  KMP_CPU_FREE(oldMask);
1540  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1541  return -1;
1542  }
1543  }
1544  }
1545 
1546  //
1547  // When affinity is off, this routine will still be called to set
1548  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1549  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1550  // correctly, and return if affinity is not enabled.
1551  //
1552  if (threadLevel >= 0) {
1553  __kmp_nThreadsPerCore = maxCt[threadLevel];
1554  }
1555  else {
1556  __kmp_nThreadsPerCore = 1;
1557  }
1558  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1559 
1560  nPackages = totals[pkgLevel];
1561 
1562  if (coreLevel >= 0) {
1563  __kmp_ncores = totals[coreLevel];
1564  nCoresPerPkg = maxCt[coreLevel];
1565  }
1566  else {
1567  __kmp_ncores = nPackages;
1568  nCoresPerPkg = 1;
1569  }
1570 
1571  //
1572  // Check to see if the machine topology is uniform
1573  //
1574  unsigned prod = maxCt[0];
1575  for (level = 1; level < depth; level++) {
1576  prod *= maxCt[level];
1577  }
1578  bool uniform = (prod == totals[level - 1]);
1579 
1580  //
1581  // Print the machine topology summary.
1582  //
1583  if (__kmp_affinity_verbose) {
1584  char mask[KMP_AFFIN_MASK_PRINT_LEN];
1585  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1586 
1587  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1588  if (__kmp_affinity_respect_mask) {
1589  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1590  } else {
1591  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1592  }
1593  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1594  if (uniform) {
1595  KMP_INFORM(Uniform, "KMP_AFFINITY");
1596  } else {
1597  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1598  }
1599 
1600  kmp_str_buf_t buf;
1601  __kmp_str_buf_init(&buf);
1602 
1603  __kmp_str_buf_print(&buf, "%d", totals[0]);
1604  for (level = 1; level <= pkgLevel; level++) {
1605  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1606  }
1607  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1608  __kmp_nThreadsPerCore, __kmp_ncores);
1609 
1610  __kmp_str_buf_free(&buf);
1611  }
1612 
1613  if (__kmp_affinity_type == affinity_none) {
1614  __kmp_free(last);
1615  __kmp_free(maxCt);
1616  __kmp_free(counts);
1617  __kmp_free(totals);
1618  __kmp_free(retval);
1619  KMP_CPU_FREE(oldMask);
1620  return 0;
1621  }
1622 
1623  //
1624  // Find any levels with radix 1, and remove them from the map
1625  // (except for the package level).
1626  //
1627  int new_depth = 0;
1628  for (level = 0; level < depth; level++) {
1629  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1630  continue;
1631  }
1632  new_depth++;
1633  }
1634 
1635  //
1636  // If we are removing any levels, allocate a new vector to return,
1637  // and copy the relevant information to it.
1638  //
1639  if (new_depth != depth) {
1640  AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1641  sizeof(AddrUnsPair) * nApics);
1642  for (proc = 0; (int)proc < nApics; proc++) {
1643  Address addr(new_depth);
1644  new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1645  }
1646  int new_level = 0;
1647  for (level = 0; level < depth; level++) {
1648  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1649  if (level == threadLevel) {
1650  threadLevel = -1;
1651  }
1652  else if ((threadLevel >= 0) && (level < threadLevel)) {
1653  threadLevel--;
1654  }
1655  if (level == coreLevel) {
1656  coreLevel = -1;
1657  }
1658  else if ((coreLevel >= 0) && (level < coreLevel)) {
1659  coreLevel--;
1660  }
1661  if (level < pkgLevel) {
1662  pkgLevel--;
1663  }
1664  continue;
1665  }
1666  for (proc = 0; (int)proc < nApics; proc++) {
1667  new_retval[proc].first.labels[new_level]
1668  = retval[proc].first.labels[level];
1669  }
1670  new_level++;
1671  }
1672 
1673  __kmp_free(retval);
1674  retval = new_retval;
1675  depth = new_depth;
1676  }
1677 
1678  if (__kmp_affinity_gran_levels < 0) {
1679  //
1680  // Set the granularity level based on what levels are modeled
1681  // in the machine topology map.
1682  //
1683  __kmp_affinity_gran_levels = 0;
1684  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1685  __kmp_affinity_gran_levels++;
1686  }
1687  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1688  __kmp_affinity_gran_levels++;
1689  }
1690  if (__kmp_affinity_gran > affinity_gran_package) {
1691  __kmp_affinity_gran_levels++;
1692  }
1693  }
1694 
1695  if (__kmp_affinity_verbose) {
1696  __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1697  coreLevel, threadLevel);
1698  }
1699 
1700  __kmp_free(last);
1701  __kmp_free(maxCt);
1702  __kmp_free(counts);
1703  __kmp_free(totals);
1704  KMP_CPU_FREE(oldMask);
1705  *address2os = retval;
1706  return depth;
1707 }
1708 
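// Illustrative worked example (not part of the original source; the leaf 11
// data is hypothetical): suppose the SMT level reports a shift width of 1,
// the core level reports a cumulative shift width of 5, depth == 3, and a
// bound thread reads x2APIC id 0x2b. The extraction loop above then yields
//   labels[2] (thread)  = 0x2b & ((1 << 1) - 1)         = 1
//   labels[1] (core)    = (0x2b & ((1 << 5) - 1)) >> 1  = 5
//   labels[0] (package) = 0x2b >> 5                     = 1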
1709 
1710 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1711 
1712 
1713 #define osIdIndex 0
1714 #define threadIdIndex 1
1715 #define coreIdIndex 2
1716 #define pkgIdIndex 3
1717 #define nodeIdIndex 4
1718 
1719 typedef unsigned *ProcCpuInfo;
1720 static unsigned maxIndex = pkgIdIndex;
1721 
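// Illustrative note (not part of the original source): each parsed record is
// stored as an array of maxIndex + 1 unsigned values indexed by the constants
// above, e.g. record[osIdIndex] holds the OS processor number and
// record[pkgIdIndex] the physical package id; any field not found in the file
// is left as UINT_MAX.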
1722 
1723 static int
1724 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1725 {
1726  const unsigned *aa = (const unsigned *)a;
1727  const unsigned *bb = (const unsigned *)b;
1728  if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1729  if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1730  return 0;
1731 };
1732 
1733 
1734 static int
1735 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1736 {
1737  unsigned i;
1738  const unsigned *aa = *((const unsigned **)a);
1739  const unsigned *bb = *((const unsigned **)b);
1740  for (i = maxIndex; ; i--) {
1741  if (aa[i] < bb[i]) return -1;
1742  if (aa[i] > bb[i]) return 1;
1743  if (i == osIdIndex) break;
1744  }
1745  return 0;
1746 }
1747 
1748 
1749 //
1750 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1751 // affinity map.
1752 //
1753 static int
1754 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1755  kmp_i18n_id_t *const msg_id, FILE *f)
1756 {
1757  *address2os = NULL;
1758  *msg_id = kmp_i18n_null;
1759 
1760  //
1761  // Scan the file, count the number of "processor" (osId) fields, and
1762  // find the highest value of <n> for a node_<n> field.
1763  //
1764  char buf[256];
1765  unsigned num_records = 0;
1766  while (! feof(f)) {
1767  buf[sizeof(buf) - 1] = 1;
1768  if (! fgets(buf, sizeof(buf), f)) {
1769  //
1770  // Read errors presumably because of EOF
1771  //
1772  break;
1773  }
1774 
1775  char s1[] = "processor";
1776  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1777  num_records++;
1778  continue;
1779  }
1780 
1781  //
1782  // FIXME - this will match "node_<n> <garbage>"
1783  //
1784  unsigned level;
1785  if (sscanf(buf, "node_%d id", &level) == 1) {
1786  if (nodeIdIndex + level >= maxIndex) {
1787  maxIndex = nodeIdIndex + level;
1788  }
1789  continue;
1790  }
1791  }
1792 
1793  //
1794  // Check for empty file / no valid processor records, or too many.
1795  // The number of records can't exceed the number of valid bits in the
1796  // affinity mask.
1797  //
1798  if (num_records == 0) {
1799  *line = 0;
1800  *msg_id = kmp_i18n_str_NoProcRecords;
1801  return -1;
1802  }
1803  if (num_records > (unsigned)__kmp_xproc) {
1804  *line = 0;
1805  *msg_id = kmp_i18n_str_TooManyProcRecords;
1806  return -1;
1807  }
1808 
1809  //
1810  // Set the file pointer back to the beginning, so that we can scan the
1811  // file again, this time performing a full parse of the data.
1812  // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1813  // Adding an extra element at the end allows us to remove a lot of extra
1814  // checks for termination conditions.
1815  //
1816  if (fseek(f, 0, SEEK_SET) != 0) {
1817  *line = 0;
1818  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1819  return -1;
1820  }
1821 
1822  //
1823  // Allocate the array of records to store the proc info in. The dummy
1824  // element at the end makes the logic in filling them out easier to code.
1825  //
1826  unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1827  * sizeof(unsigned *));
1828  unsigned i;
1829  for (i = 0; i <= num_records; i++) {
1830  threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1831  * sizeof(unsigned));
1832  }
1833 
1834 #define CLEANUP_THREAD_INFO \
1835  for (i = 0; i <= num_records; i++) { \
1836  __kmp_free(threadInfo[i]); \
1837  } \
1838  __kmp_free(threadInfo);
1839 
1840  //
1841  // A value of UINT_MAX means that we didn't find the field
1842  //
1843  unsigned __index;
1844 
1845 #define INIT_PROC_INFO(p) \
1846  for (__index = 0; __index <= maxIndex; __index++) { \
1847  (p)[__index] = UINT_MAX; \
1848  }
1849 
1850  for (i = 0; i <= num_records; i++) {
1851  INIT_PROC_INFO(threadInfo[i]);
1852  }
1853 
1854  unsigned num_avail = 0;
1855  *line = 0;
1856  while (! feof(f)) {
1857  //
1858  // Create an inner scoping level, so that all the goto targets at the
1859  // end of the loop appear in an outer scoping level. This avoids
1860  // warnings about jumping past an initialization to a target in the
1861  // same block.
1862  //
1863  {
1864  buf[sizeof(buf) - 1] = 1;
1865  bool long_line = false;
1866  if (! fgets(buf, sizeof(buf), f)) {
1867  //
1868  // Read errors presumably because of EOF
1869  //
1870  // If there is valid data in threadInfo[num_avail], then fake
1871  // a blank line to ensure that the last address gets parsed.
1872  //
1873  bool valid = false;
1874  for (i = 0; i <= maxIndex; i++) {
1875  if (threadInfo[num_avail][i] != UINT_MAX) {
1876  valid = true;
1877  }
1878  }
1879  if (! valid) {
1880  break;
1881  }
1882  buf[0] = 0;
1883  } else if (!buf[sizeof(buf) - 1]) {
1884  //
1885  // The line is longer than the buffer. Set a flag and don't
1886  // emit an error if we were going to ignore the line, anyway.
1887  //
1888  long_line = true;
1889 
1890 #define CHECK_LINE \
1891  if (long_line) { \
1892  CLEANUP_THREAD_INFO; \
1893  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
1894  return -1; \
1895  }
1896  }
1897  (*line)++;
1898 
1899  char s1[] = "processor";
1900  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1901  CHECK_LINE;
1902  char *p = strchr(buf + sizeof(s1) - 1, ':');
1903  unsigned val;
1904  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1905  if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1906  threadInfo[num_avail][osIdIndex] = val;
1907  continue;
1908  }
1909  char s2[] = "physical id";
1910  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1911  CHECK_LINE;
1912  char *p = strchr(buf + sizeof(s2) - 1, ':');
1913  unsigned val;
1914  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1915  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1916  threadInfo[num_avail][pkgIdIndex] = val;
1917  continue;
1918  }
1919  char s3[] = "core id";
1920  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
1921  CHECK_LINE;
1922  char *p = strchr(buf + sizeof(s3) - 1, ':');
1923  unsigned val;
1924  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1925  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
1926  threadInfo[num_avail][coreIdIndex] = val;
1927  continue;
1928  }
1929  char s4[] = "thread id";
1930  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
1931  CHECK_LINE;
1932  char *p = strchr(buf + sizeof(s4) - 1, ':');
1933  unsigned val;
1934  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1935  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
1936  threadInfo[num_avail][threadIdIndex] = val;
1937  continue;
1938  }
1939  unsigned level;
1940  if (sscanf(buf, "node_%d id", &level) == 1) {
1941  CHECK_LINE;
1942  char *p = strchr(buf + sizeof(s4) - 1, ':');
1943  unsigned val;
1944  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1945  KMP_ASSERT(nodeIdIndex + level <= maxIndex);
1946  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
1947  threadInfo[num_avail][nodeIdIndex + level] = val;
1948  continue;
1949  }
1950 
1951  //
1952  // We didn't recognize the leading token on the line.
1953  // There are lots of leading tokens that we don't recognize -
1954  // if the line isn't empty, go on to the next line.
1955  //
1956  if ((*buf != 0) && (*buf != '\n')) {
1957  //
1958  // If the line is longer than the buffer, read characters
1959  // until we find a newline.
1960  //
1961  if (long_line) {
1962  int ch;
1963  while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
1964  }
1965  continue;
1966  }
1967 
1968  //
1969  // A newline has signalled the end of the processor record.
1970  // Check that there aren't too many procs specified.
1971  //
1972  if (num_avail == __kmp_xproc) {
1973  CLEANUP_THREAD_INFO;
1974  *msg_id = kmp_i18n_str_TooManyEntries;
1975  return -1;
1976  }
1977 
1978  //
1979  // Check for missing fields. The osId field must be there, and we
1980  // currently require that the physical id field is specified, also.
1981  //
1982  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
1983  CLEANUP_THREAD_INFO;
1984  *msg_id = kmp_i18n_str_MissingProcField;
1985  return -1;
1986  }
1987  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
1988  CLEANUP_THREAD_INFO;
1989  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
1990  return -1;
1991  }
1992 
1993  //
1994  // Skip this proc if it is not included in the machine model.
1995  //
1996  if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
1997  INIT_PROC_INFO(threadInfo[num_avail]);
1998  continue;
1999  }
2000 
2001  //
2002  // We have a successful parse of this proc's info.
2003  // Increment the counter, and prepare for the next proc.
2004  //
2005  num_avail++;
2006  KMP_ASSERT(num_avail <= num_records);
2007  INIT_PROC_INFO(threadInfo[num_avail]);
2008  }
2009  continue;
2010 
2011  no_val:
2012  CLEANUP_THREAD_INFO;
2013  *msg_id = kmp_i18n_str_MissingValCpuinfo;
2014  return -1;
2015 
2016  dup_field:
2017  CLEANUP_THREAD_INFO;
2018  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2019  return -1;
2020  }
2021  *line = 0;
2022 
2023 # if KMP_MIC && REDUCE_TEAM_SIZE
2024  unsigned teamSize = 0;
2025 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2026 
2027  // check for num_records == __kmp_xproc ???
2028 
2029  //
2030  // If there's only one thread context to bind to, form an Address object
2031  // with depth 1 and return immediately (or, if affinity is off, set
2032  // address2os to NULL and return).
2033  //
2034  // If it is configured to omit the package level when there is only a
2035  // single package, the logic at the end of this routine won't work if
2036  // there is only a single thread - it would try to form an Address
2037  // object with depth 0.
2038  //
2039  KMP_ASSERT(num_avail > 0);
2040  KMP_ASSERT(num_avail <= num_records);
2041  if (num_avail == 1) {
2042  __kmp_ncores = 1;
2043  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2044  __kmp_ht_enabled = FALSE;
2045  if (__kmp_affinity_verbose) {
2046  if (! KMP_AFFINITY_CAPABLE()) {
2047  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2048  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2049  KMP_INFORM(Uniform, "KMP_AFFINITY");
2050  }
2051  else {
2052  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2053  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2054  fullMask);
2055  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2056  if (__kmp_affinity_respect_mask) {
2057  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2058  } else {
2059  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2060  }
2061  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2062  KMP_INFORM(Uniform, "KMP_AFFINITY");
2063  }
2064  int index;
2065  kmp_str_buf_t buf;
2066  __kmp_str_buf_init(&buf);
2067  __kmp_str_buf_print(&buf, "1");
2068  for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2069  __kmp_str_buf_print(&buf, " x 1");
2070  }
2071  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2072  __kmp_str_buf_free(&buf);
2073  }
2074 
2075  if (__kmp_affinity_type == affinity_none) {
2076  CLEANUP_THREAD_INFO;
2077  return 0;
2078  }
2079 
2080  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2081  Address addr(1);
2082  addr.labels[0] = threadInfo[0][pkgIdIndex];
2083  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2084 
2085  if (__kmp_affinity_gran_levels < 0) {
2086  __kmp_affinity_gran_levels = 0;
2087  }
2088 
2089  if (__kmp_affinity_verbose) {
2090  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2091  }
2092 
2093  CLEANUP_THREAD_INFO;
2094  return 1;
2095  }
2096 
2097  //
2098  // Sort the threadInfo table by physical Id.
2099  //
2100  qsort(threadInfo, num_avail, sizeof(*threadInfo),
2101  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2102 
2103  //
2104  // The table is now sorted by pkgId / coreId / threadId, but we really
2105  // don't know the radix of any of the fields. pkgId's may be sparsely
2106  // assigned among the chips on a system. Although coreId's are usually
2107  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2108  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2109  //
2110  // For that matter, we don't know what coresPerPkg and threadsPerCore
2111  // (or the total # packages) are at this point - we want to determine
2112  // that now. We only have an upper bound on the first two figures.
2113  //
2114  unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2115  * sizeof(unsigned));
2116  unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2117  * sizeof(unsigned));
2118  unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2119  * sizeof(unsigned));
2120  unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2121  * sizeof(unsigned));
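 //
 // While scanning, for each level index:
 //   counts[index] - #ids seen at this level under the current parent
 //   maxCt[index]  - max of counts[index] over all parents seen so far
 //   totals[index] - total #distinct nodes seen at this level
 //   lastId[index] - id at this level for the previously scanned proc
 //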
2122 
2123  bool assign_thread_ids = false;
2124  unsigned threadIdCt;
2125  unsigned index;
2126 
2127  restart_radix_check:
2128  threadIdCt = 0;
2129 
2130  //
2131  // Initialize the counter arrays with data from threadInfo[0].
2132  //
2133  if (assign_thread_ids) {
2134  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2135  threadInfo[0][threadIdIndex] = threadIdCt++;
2136  }
2137  else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2138  threadIdCt = threadInfo[0][threadIdIndex] + 1;
2139  }
2140  }
2141  for (index = 0; index <= maxIndex; index++) {
2142  counts[index] = 1;
2143  maxCt[index] = 1;
2144  totals[index] = 1;
2145  lastId[index] = threadInfo[0][index];
2146  }
2147 
2148  //
2149  // Run through the rest of the OS procs.
2150  //
2151  for (i = 1; i < num_avail; i++) {
2152  //
2153  // Find the most significant index whose id differs
2154  // from the id for the previous OS proc.
2155  //
2156  for (index = maxIndex; index >= threadIdIndex; index--) {
2157  if (assign_thread_ids && (index == threadIdIndex)) {
2158  //
2159  // Auto-assign the thread id field if it wasn't specified.
2160  //
2161  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2162  threadInfo[i][threadIdIndex] = threadIdCt++;
2163  }
2164 
2165  //
2166  // Apparently the thread id field was specified for some
2167  // entries and not others. Start the thread id counter
2168  // off at the next higher thread id.
2169  //
2170  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2171  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2172  }
2173  }
2174  if (threadInfo[i][index] != lastId[index]) {
2175  //
2176  // Run through all indices which are less significant,
2177  // and reset the counts to 1.
2178  //
2179  // At all levels up to and including index, we need to
2180  // increment the totals and record the last id.
2181  //
2182  unsigned index2;
2183  for (index2 = threadIdIndex; index2 < index; index2++) {
2184  totals[index2]++;
2185  if (counts[index2] > maxCt[index2]) {
2186  maxCt[index2] = counts[index2];
2187  }
2188  counts[index2] = 1;
2189  lastId[index2] = threadInfo[i][index2];
2190  }
2191  counts[index]++;
2192  totals[index]++;
2193  lastId[index] = threadInfo[i][index];
2194 
2195  if (assign_thread_ids && (index > threadIdIndex)) {
2196 
2197 # if KMP_MIC && REDUCE_TEAM_SIZE
2198  //
2199  // The default team size is the total #threads in the machine
2200  // minus 1 thread for every core that has 3 or more threads.
2201  //
2202  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2203 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2204 
2205  //
2206  // Restart the thread counter, as we are on a new core.
2207  //
2208  threadIdCt = 0;
2209 
2210  //
2211  // Auto-assign the thread id field if it wasn't specified.
2212  //
2213  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2214  threadInfo[i][threadIdIndex] = threadIdCt++;
2215  }
2216 
2217  //
2218  // Apparently the thread id field was specified for some
2219  // entries and not others. Start the thread id counter
2220  // off at the next higher thread id.
2221  //
2222  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2223  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2224  }
2225  }
2226  break;
2227  }
2228  }
2229  if (index < threadIdIndex) {
2230  //
2231  // If thread ids were specified, it is an error if they are not
2232  // unique. Also, check that we haven't already restarted the
2233  // loop (to be safe - shouldn't need to).
2234  //
2235  if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2236  || assign_thread_ids) {
2237  __kmp_free(lastId);
2238  __kmp_free(totals);
2239  __kmp_free(maxCt);
2240  __kmp_free(counts);
2241  CLEANUP_THREAD_INFO;
2242  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2243  return -1;
2244  }
2245 
2246  //
2247  // If the thread ids were not specified and we see entries
2248  // that are duplicates, start the loop over and
2249  // assign the thread ids manually.
2250  //
2251  assign_thread_ids = true;
2252  goto restart_radix_check;
2253  }
2254  }
2255 
2256 # if KMP_MIC && REDUCE_TEAM_SIZE
2257  //
2258  // The default team size is the total #threads in the machine
2259  // minus 1 thread for every core that has 3 or more threads.
2260  //
2261  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2262 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2263 
2264  for (index = threadIdIndex; index <= maxIndex; index++) {
2265  if (counts[index] > maxCt[index]) {
2266  maxCt[index] = counts[index];
2267  }
2268  }
2269 
2270  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2271  nCoresPerPkg = maxCt[coreIdIndex];
2272  nPackages = totals[pkgIdIndex];
2273 
2274  //
2275  // Check to see if the machine topology is uniform
2276  //
2277  unsigned prod = totals[maxIndex];
2278  for (index = threadIdIndex; index < maxIndex; index++) {
2279  prod *= maxCt[index];
2280  }
2281  bool uniform = (prod == totals[threadIdIndex]);
2282 
2283  //
2284  // When affinity is off, this routine will still be called to set
2285  // __kmp_ht_enabled and __kmp_ncores, as well as __kmp_nThreadsPerCore,
2286  // nCoresPerPkg, and nPackages. Make sure all these vars are set
2287  // correctly, and return now if affinity is not enabled.
2288  //
2289  __kmp_ht_enabled = (maxCt[threadIdIndex] > 1); // threads per core > 1
2290  __kmp_ncores = totals[coreIdIndex];
2291 
2292  if (__kmp_affinity_verbose) {
2293  if (! KMP_AFFINITY_CAPABLE()) {
2294  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2295  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2296  if (uniform) {
2297  KMP_INFORM(Uniform, "KMP_AFFINITY");
2298  } else {
2299  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2300  }
2301  }
2302  else {
2303  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2304  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2305  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2306  if (__kmp_affinity_respect_mask) {
2307  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2308  } else {
2309  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2310  }
2311  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2312  if (uniform) {
2313  KMP_INFORM(Uniform, "KMP_AFFINITY");
2314  } else {
2315  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2316  }
2317  }
2318  kmp_str_buf_t buf;
2319  __kmp_str_buf_init(&buf);
2320 
2321  __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2322  for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2323  __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2324  }
2325  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2326  maxCt[threadIdIndex], __kmp_ncores);
2327 
2328  __kmp_str_buf_free(&buf);
2329  }
2330 
2331 # if KMP_MIC && REDUCE_TEAM_SIZE
2332  //
2333  // Set the default team size.
2334  //
2335  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2336  __kmp_dflt_team_nth = teamSize;
2337  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2338  __kmp_dflt_team_nth));
2339  }
2340 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2341 
2342  if (__kmp_affinity_type == affinity_none) {
2343  __kmp_free(lastId);
2344  __kmp_free(totals);
2345  __kmp_free(maxCt);
2346  __kmp_free(counts);
2347  CLEANUP_THREAD_INFO;
2348  return 0;
2349  }
2350 
2351  //
2352  // Count the number of levels which have more nodes at that level than
2353  // at the parent's level (with there being an implicit root node above
2354  // the top level). This is equivalent to saying that there is at least
2355  // one node at this level which has a sibling. These levels are in the
2356  // map, and the package level is always in the map.
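 //
 // Illustrative example: 16 OS procs reported as 2 packages x 8 cores
 // x 1 thread (and no node_<n> fields) give totals[threadIdIndex] == 16,
 // totals[coreIdIndex] == 16, and totals[pkgIdIndex] == 2; the thread
 // level is omitted (no core has sibling threads), so depth becomes 2.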
2357  //
2358  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2359  int level = 0;
2360  for (index = threadIdIndex; index < maxIndex; index++) {
2361  KMP_ASSERT(totals[index] >= totals[index + 1]);
2362  inMap[index] = (totals[index] > totals[index + 1]);
2363  }
2364  inMap[maxIndex] = (totals[maxIndex] > 1);
2365  inMap[pkgIdIndex] = true;
2366 
2367  int depth = 0;
2368  for (index = threadIdIndex; index <= maxIndex; index++) {
2369  if (inMap[index]) {
2370  depth++;
2371  }
2372  }
2373  KMP_ASSERT(depth > 0);
2374 
2375  //
2376  // Construct the data structure that is to be returned.
2377  //
2378  *address2os = (AddrUnsPair*)
2379  __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2380  int pkgLevel = -1;
2381  int coreLevel = -1;
2382  int threadLevel = -1;
2383 
2384  for (i = 0; i < num_avail; ++i) {
2385  Address addr(depth);
2386  unsigned os = threadInfo[i][osIdIndex];
2387  int src_index;
2388  int dst_index = 0;
2389 
2390  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2391  if (! inMap[src_index]) {
2392  continue;
2393  }
2394  addr.labels[dst_index] = threadInfo[i][src_index];
2395  if (src_index == pkgIdIndex) {
2396  pkgLevel = dst_index;
2397  }
2398  else if (src_index == coreIdIndex) {
2399  coreLevel = dst_index;
2400  }
2401  else if (src_index == threadIdIndex) {
2402  threadLevel = dst_index;
2403  }
2404  dst_index++;
2405  }
2406  (*address2os)[i] = AddrUnsPair(addr, os);
2407  }
2408 
2409  if (__kmp_affinity_gran_levels < 0) {
2410  //
2411  // Set the granularity level based on what levels are modeled
2412  // in the machine topology map.
2413  //
2414  unsigned src_index;
2415  __kmp_affinity_gran_levels = 0;
2416  for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2417  if (! inMap[src_index]) {
2418  continue;
2419  }
2420  switch (src_index) {
2421  case threadIdIndex:
2422  if (__kmp_affinity_gran > affinity_gran_thread) {
2423  __kmp_affinity_gran_levels++;
2424  }
2425 
2426  break;
2427  case coreIdIndex:
2428  if (__kmp_affinity_gran > affinity_gran_core) {
2429  __kmp_affinity_gran_levels++;
2430  }
2431  break;
2432 
2433  case pkgIdIndex:
2434  if (__kmp_affinity_gran > affinity_gran_package) {
2435  __kmp_affinity_gran_levels++;
2436  }
2437  break;
2438  }
2439  }
2440  }
2441 
2442  if (__kmp_affinity_verbose) {
2443  __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2444  coreLevel, threadLevel);
2445  }
2446 
2447  __kmp_free(inMap);
2448  __kmp_free(lastId);
2449  __kmp_free(totals);
2450  __kmp_free(maxCt);
2451  __kmp_free(counts);
2452  CLEANUP_THREAD_INFO;
2453  return depth;
2454 }
2455 
2456 
2457 //
2458 // Create and return a table of affinity masks, indexed by OS thread ID.
2459 // This routine handles OR'ing together all the affinity masks of threads
2460 // that are sufficiently close, if granularity > fine.
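//
// For example (illustrative only): with granularity=core on a machine
// where OS procs 0 and 1 share a core, the entries for both procs in the
// returned table hold the same two-bit mask {0,1}, and that mask is
// counted once in *numUnique.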
2461 //
2462 static kmp_affin_mask_t *
2463 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2464  AddrUnsPair *address2os, unsigned numAddrs)
2465 {
2466  //
2467  // First form a table of affinity masks in order of OS thread id.
2468  //
2469  unsigned depth;
2470  unsigned maxOsId;
2471  unsigned i;
2472 
2473  KMP_ASSERT(numAddrs > 0);
2474  depth = address2os[0].first.depth;
2475 
2476  maxOsId = 0;
2477  for (i = 0; i < numAddrs; i++) {
2478  unsigned osId = address2os[i].second;
2479  if (osId > maxOsId) {
2480  maxOsId = osId;
2481  }
2482  }
2483  kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2484  (maxOsId + 1) * __kmp_affin_mask_size);
2485 
2486  //
2487  // Sort the address2os table according to physical order. Doing so
2488  // will put all threads on the same core/package/node in consecutive
2489  // locations.
2490  //
2491  qsort(address2os, numAddrs, sizeof(*address2os),
2492  __kmp_affinity_cmp_Address_labels);
2493 
2494  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2495  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2496  KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2497  }
2498  if (__kmp_affinity_gran_levels >= (int)depth) {
2499  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2500  && (__kmp_affinity_type != affinity_none))) {
2501  KMP_WARNING(AffThreadsMayMigrate);
2502  }
2503  }
2504 
2505  //
2506  // Run through the table, forming the masks for all threads on each
2507  // core. Threads on the same core will have identical "Address"
2508  // objects, not considering the last level, which must be the thread
2509  // id. All threads on a core will appear consecutively.
2510  //
2511  unsigned unique = 0;
2512  unsigned j = 0; // index of 1st thread on core
2513  unsigned leader = 0;
2514  Address *leaderAddr = &(address2os[0].first);
2515  kmp_affin_mask_t *sum
2516  = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2517  KMP_CPU_ZERO(sum);
2518  KMP_CPU_SET(address2os[0].second, sum);
2519  for (i = 1; i < numAddrs; i++) {
2520  //
2521  // If this thread is sufficiently close to the leader (within the
2522  // granularity setting), then set the bit for this os thread in the
2523  // affinity mask for this group, and go on to the next thread.
2524  //
2525  if (leaderAddr->isClose(address2os[i].first,
2526  __kmp_affinity_gran_levels)) {
2527  KMP_CPU_SET(address2os[i].second, sum);
2528  continue;
2529  }
2530 
2531  //
2532  // For every thread in this group, copy the mask to the thread's
2533  // entry in the osId2Mask table. Mark the first address as a
2534  // leader.
2535  //
2536  for (; j < i; j++) {
2537  unsigned osId = address2os[j].second;
2538  KMP_DEBUG_ASSERT(osId <= maxOsId);
2539  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2540  KMP_CPU_COPY(mask, sum);
2541  address2os[j].first.leader = (j == leader);
2542  }
2543  unique++;
2544 
2545  //
2546  // Start a new mask.
2547  //
2548  leader = i;
2549  leaderAddr = &(address2os[i].first);
2550  KMP_CPU_ZERO(sum);
2551  KMP_CPU_SET(address2os[i].second, sum);
2552  }
2553 
2554  //
2555  // For every thread in last group, copy the mask to the thread's
2556  // entry in the osId2Mask table.
2557  //
2558  for (; j < i; j++) {
2559  unsigned osId = address2os[j].second;
2560  KMP_DEBUG_ASSERT(osId <= maxOsId);
2561  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2562  KMP_CPU_COPY(mask, sum);
2563  address2os[j].first.leader = (j == leader);
2564  }
2565  unique++;
2566 
2567  *maxIndex = maxOsId;
2568  *numUnique = unique;
2569  return osId2Mask;
2570 }
2571 
2572 
2573 //
2574 // Stuff for the affinity proclist parsers. It's easier to declare these vars
2575 // as file-static than to try to pass them through the calling sequence of
2576 // the recursive-descent OMP_PLACES parser.
2577 //
2578 static kmp_affin_mask_t *newMasks;
2579 static int numNewMasks;
2580 static int nextNewMask;
2581 
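//
// ADD_MASK appends a copy of _mask to the growing newMasks vector,
// doubling the vector's capacity when necessary. ADD_MASK_OSID checks
// that _osId is a valid, available OS proc id before adding its mask;
// otherwise it may emit a warning and skips the id.
//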
2582 #define ADD_MASK(_mask) \
2583  { \
2584  if (nextNewMask >= numNewMasks) { \
2585  numNewMasks *= 2; \
2586  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2587  numNewMasks * __kmp_affin_mask_size); \
2588  } \
2589  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2590  nextNewMask++; \
2591  }
2592 
2593 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2594  { \
2595  if (((_osId) > _maxOsId) || \
2596  (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX(_osId2Mask, (_osId))))) {\
2597  if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2598  && (__kmp_affinity_type != affinity_none))) { \
2599  KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2600  } \
2601  } \
2602  else { \
2603  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2604  } \
2605  }
2606 
2607 
2608 //
2609 // Re-parse the proclist (for the explicit affinity type), and form the list
2610 // of affinity newMasks indexed by gtid.
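//
// For example (illustrative only, and assuming each OS proc's mask in
// osId2Mask contains only that proc): the proclist "0,2-6:2,{8,9,10}"
// produces five masks: {0}, {2}, {4}, {6}, and {8,9,10}.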
2611 //
2612 static void
2613 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2614  unsigned int *out_numMasks, const char *proclist,
2615  kmp_affin_mask_t *osId2Mask, int maxOsId)
2616 {
2617  const char *scan = proclist;
2618  const char *next = proclist;
2619 
2620  //
2621  // We use malloc() for the temporary mask vector,
2622  // so that we can use realloc() to extend it.
2623  //
2624  numNewMasks = 2;
2625  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2626  * __kmp_affin_mask_size);
2627  nextNewMask = 0;
2628  kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2629  __kmp_affin_mask_size);
2630  int setSize = 0;
2631 
2632  for (;;) {
2633  int start, end, stride;
2634 
2635  SKIP_WS(scan);
2636  next = scan;
2637  if (*next == '\0') {
2638  break;
2639  }
2640 
2641  if (*next == '{') {
2642  int num;
2643  setSize = 0;
2644  next++; // skip '{'
2645  SKIP_WS(next);
2646  scan = next;
2647 
2648  //
2649  // Read the first integer in the set.
2650  //
2651  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2652  "bad proclist");
2653  SKIP_DIGITS(next);
2654  num = __kmp_str_to_int(scan, *next);
2655  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2656 
2657  //
2658  // Copy the mask for that osId to the sum (union) mask.
2659  //
2660  if ((num > maxOsId) ||
2661  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2662  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2663  && (__kmp_affinity_type != affinity_none))) {
2664  KMP_WARNING(AffIgnoreInvalidProcID, num);
2665  }
2666  KMP_CPU_ZERO(sumMask);
2667  }
2668  else {
2669  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2670  setSize = 1;
2671  }
2672 
2673  for (;;) {
2674  //
2675  // Check for end of set.
2676  //
2677  SKIP_WS(next);
2678  if (*next == '}') {
2679  next++; // skip '}'
2680  break;
2681  }
2682 
2683  //
2684  // Skip optional comma.
2685  //
2686  if (*next == ',') {
2687  next++;
2688  }
2689  SKIP_WS(next);
2690 
2691  //
2692  // Read the next integer in the set.
2693  //
2694  scan = next;
2695  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2696  "bad explicit proc list");
2697 
2698  SKIP_DIGITS(next);
2699  num = __kmp_str_to_int(scan, *next);
2700  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2701 
2702  //
2703  // Add the mask for that osId to the sum mask.
2704  //
2705  if ((num > maxOsId) ||
2706  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2707  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2708  && (__kmp_affinity_type != affinity_none))) {
2709  KMP_WARNING(AffIgnoreInvalidProcID, num);
2710  }
2711  }
2712  else {
2713  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2714  setSize++;
2715  }
2716  }
2717  if (setSize > 0) {
2718  ADD_MASK(sumMask);
2719  }
2720 
2721  SKIP_WS(next);
2722  if (*next == ',') {
2723  next++;
2724  }
2725  scan = next;
2726  continue;
2727  }
2728 
2729  //
2730  // Read the first integer.
2731  //
2732  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2733  SKIP_DIGITS(next);
2734  start = __kmp_str_to_int(scan, *next);
2735  KMP_ASSERT2(start >= 0, "bad explicit proc list");
2736  SKIP_WS(next);
2737 
2738  //
2739  // If this isn't a range, then add a mask to the list and go on.
2740  //
2741  if (*next != '-') {
2742  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2743 
2744  //
2745  // Skip optional comma.
2746  //
2747  if (*next == ',') {
2748  next++;
2749  }
2750  scan = next;
2751  continue;
2752  }
2753 
2754  //
2755  // This is a range. Skip over the '-' and read in the 2nd int.
2756  //
2757  next++; // skip '-'
2758  SKIP_WS(next);
2759  scan = next;
2760  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2761  SKIP_DIGITS(next);
2762  end = __kmp_str_to_int(scan, *next);
2763  KMP_ASSERT2(end >= 0, "bad explicit proc list");
2764 
2765  //
2766  // Check for a stride parameter
2767  //
2768  stride = 1;
2769  SKIP_WS(next);
2770  if (*next == ':') {
2771  //
2772  // A stride is specified. Skip over the ':" and read the 3rd int.
2773  //
2774  int sign = +1;
2775  next++; // skip ':'
2776  SKIP_WS(next);
2777  scan = next;
2778  if (*next == '-') {
2779  sign = -1;
2780  next++;
2781  SKIP_WS(next);
2782  scan = next;
2783  }
2784  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2785  "bad explicit proc list");
2786  SKIP_DIGITS(next);
2787  stride = __kmp_str_to_int(scan, *next);
2788  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2789  stride *= sign;
2790  }
2791 
2792  //
2793  // Do some range checks.
2794  //
2795  KMP_ASSERT2(stride != 0, "bad explicit proc list");
2796  if (stride > 0) {
2797  KMP_ASSERT2(start <= end, "bad explicit proc list");
2798  }
2799  else {
2800  KMP_ASSERT2(start >= end, "bad explicit proc list");
2801  }
2802  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2803 
2804  //
2805  // Add the mask for each OS proc # to the list.
2806  //
2807  if (stride > 0) {
2808  do {
2809  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2810  start += stride;
2811  } while (start <= end);
2812  }
2813  else {
2814  do {
2815  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2816  start += stride;
2817  } while (start >= end);
2818  }
2819 
2820  //
2821  // Skip optional comma.
2822  //
2823  SKIP_WS(next);
2824  if (*next == ',') {
2825  next++;
2826  }
2827  scan = next;
2828  }
2829 
2830  *out_numMasks = nextNewMask;
2831  if (nextNewMask == 0) {
2832  *out_masks = NULL;
2833  KMP_INTERNAL_FREE(newMasks);
2834  return;
2835  }
2836  *out_masks
2837  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2838  memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2839  __kmp_free(sumMask);
2840  KMP_INTERNAL_FREE(newMasks);
2841 }
2842 
2843 
2844 # if OMP_40_ENABLED
2845 
2846 /*-----------------------------------------------------------------------------
2847 
2848 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2849 places. Again, here is the grammar:
2850 
2851 place_list := place
2852 place_list := place , place_list
2853 place := num
2854 place := place : num
2855 place := place : num : signed
2856 place := { subplace_list }
2857 place := ! place // (lowest priority)
2858 subplace_list := subplace
2859 subplace_list := subplace , subplace_list
2860 subplace := num
2861 subplace := num : num
2862 subplace := num : num : signed
2863 signed := num
2864 signed := + signed
2865 signed := - signed
2866 
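For example (illustrative only, assuming OS procs 0-15 are available and
each proc's mask contains only that proc), all of the following are
accepted by this grammar:

    {0,1,2,3},{4,5,6,7}    two places of four procs each
    {0:4},{4:4}            the same two places, written as
                           <lower bound> : <length> subplaces
    {0:4}:4:4              four places: {0-3}, {4-7}, {8-11}, {12-15}
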
2867 -----------------------------------------------------------------------------*/
2868 
2869 static void
2870 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2871  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2872 {
2873  const char *next;
2874 
2875  for (;;) {
2876  int start, count, stride, i;
2877 
2878  //
2879  // Read in the starting proc id
2880  //
2881  SKIP_WS(*scan);
2882  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2883  "bad explicit places list");
2884  next = *scan;
2885  SKIP_DIGITS(next);
2886  start = __kmp_str_to_int(*scan, *next);
2887  KMP_ASSERT(start >= 0);
2888  *scan = next;
2889 
2890  //
2891  // valid follow sets are ',' ':' and '}'
2892  //
2893  SKIP_WS(*scan);
2894  if (**scan == '}' || **scan == ',') {
2895  if ((start > maxOsId) ||
2896  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2897  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2898  && (__kmp_affinity_type != affinity_none))) {
2899  KMP_WARNING(AffIgnoreInvalidProcID, start);
2900  }
2901  }
2902  else {
2903  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2904  (*setSize)++;
2905  }
2906  if (**scan == '}') {
2907  break;
2908  }
2909  (*scan)++; // skip ','
2910  continue;
2911  }
2912  KMP_ASSERT2(**scan == ':', "bad explicit places list");
2913  (*scan)++; // skip ':'
2914 
2915  //
2916  // Read count parameter
2917  //
2918  SKIP_WS(*scan);
2919  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2920  "bad explicit places list");
2921  next = *scan;
2922  SKIP_DIGITS(next);
2923  count = __kmp_str_to_int(*scan, *next);
2924  KMP_ASSERT(count >= 0);
2925  *scan = next;
2926 
2927  //
2928  // valid follow sets are ',' ':' and '}'
2929  //
2930  SKIP_WS(*scan);
2931  if (**scan == '}' || **scan == ',') {
2932  for (i = 0; i < count; i++) {
2933  if ((start > maxOsId) ||
2934  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2935  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2936  && (__kmp_affinity_type != affinity_none))) {
2937  KMP_WARNING(AffIgnoreInvalidProcID, start);
2938  }
2939  break; // don't proliferate warnings for large count
2940  }
2941  else {
2942  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2943  start++;
2944  (*setSize)++;
2945  }
2946  }
2947  if (**scan == '}') {
2948  break;
2949  }
2950  (*scan)++; // skip ','
2951  continue;
2952  }
2953  KMP_ASSERT2(**scan == ':', "bad explicit places list");
2954  (*scan)++; // skip ':'
2955 
2956  //
2957  // Read stride parameter
2958  //
2959  int sign = +1;
2960  for (;;) {
2961  SKIP_WS(*scan);
2962  if (**scan == '+') {
2963  (*scan)++; // skip '+'
2964  continue;
2965  }
2966  if (**scan == '-') {
2967  sign *= -1;
2968  (*scan)++; // skip '-'
2969  continue;
2970  }
2971  break;
2972  }
2973  SKIP_WS(*scan);
2974  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2975  "bad explicit places list");
2976  next = *scan;
2977  SKIP_DIGITS(next);
2978  stride = __kmp_str_to_int(*scan, *next);
2979  KMP_ASSERT(stride >= 0);
2980  *scan = next;
2981  stride *= sign;
2982 
2983  //
2984  // valid follow sets are ',' and '}'
2985  //
2986  SKIP_WS(*scan);
2987  if (**scan == '}' || **scan == ',') {
2988  for (i = 0; i < count; i++) {
2989  if ((start > maxOsId) ||
2990  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2991  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2992  && (__kmp_affinity_type != affinity_none))) {
2993  KMP_WARNING(AffIgnoreInvalidProcID, start);
2994  }
2995  break; // don't proliferate warnings for large count
2996  }
2997  else {
2998  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2999  start += stride;
3000  (*setSize)++;
3001  }
3002  }
3003  if (**scan == '}') {
3004  break;
3005  }
3006  (*scan)++; // skip ','
3007  continue;
3008  }
3009 
3010  KMP_ASSERT2(0, "bad explicit places list");
3011  }
3012 }
3013 
3014 
3015 static void
3016 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3017  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3018 {
3019  const char *next;
3020 
3021  //
3022  // valid follow sets are '{' '!' and num
3023  //
3024  SKIP_WS(*scan);
3025  if (**scan == '{') {
3026  (*scan)++; // skip '{'
3027  __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3028  setSize);
3029  KMP_ASSERT2(**scan == '}', "bad explicit places list");
3030  (*scan)++; // skip '}'
3031  }
3032  else if (**scan == '!') {
3033  __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3034  KMP_CPU_COMPLEMENT(tempMask);
3035  (*scan)++; // skip '!'
3036  }
3037  else if ((**scan >= '0') && (**scan <= '9')) {
3038  next = *scan;
3039  SKIP_DIGITS(next);
3040  int num = __kmp_str_to_int(*scan, *next);
3041  KMP_ASSERT(num >= 0);
3042  if ((num > maxOsId) ||
3043  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3044  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3045  && (__kmp_affinity_type != affinity_none))) {
3046  KMP_WARNING(AffIgnoreInvalidProcID, num);
3047  }
3048  }
3049  else {
3050  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3051  (*setSize)++;
3052  }
3053  *scan = next; // skip num
3054  }
3055  else {
3056  KMP_ASSERT2(0, "bad explicit places list");
3057  }
3058 }
3059 
3060 
3061 static void
3062 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3063  unsigned int *out_numMasks, const char *placelist,
3064  kmp_affin_mask_t *osId2Mask, int maxOsId)
3065 {
3066  const char *scan = placelist;
3067  const char *next = placelist;
3068 
3069  numNewMasks = 2;
3070  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3071  * __kmp_affin_mask_size);
3072  nextNewMask = 0;
3073 
3074  kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3075  __kmp_affin_mask_size);
3076  KMP_CPU_ZERO(tempMask);
3077  int setSize = 0;
3078 
3079  for (;;) {
3080  int start, count, stride;
3081 
3082  __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3083 
3084  //
3085  // valid follow sets are ',' ':' and EOL
3086  //
3087  SKIP_WS(scan);
3088  if (*scan == '\0' || *scan == ',') {
3089  if (setSize > 0) {
3090  ADD_MASK(tempMask);
3091  }
3092  KMP_CPU_ZERO(tempMask);
3093  setSize = 0;
3094  if (*scan == '\0') {
3095  break;
3096  }
3097  scan++; // skip ','
3098  continue;
3099  }
3100 
3101  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3102  scan++; // skip ':'
3103 
3104  //
3105  // Read count parameter
3106  //
3107  SKIP_WS(scan);
3108  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3109  "bad explicit places list");
3110  next = scan;
3111  SKIP_DIGITS(next);
3112  count = __kmp_str_to_int(scan, *next);
3113  KMP_ASSERT(count >= 0);
3114  scan = next;
3115 
3116  //
3117  // valid follow sets are ',' ':' and EOL
3118  //
3119  SKIP_WS(scan);
3120  if (*scan == '\0' || *scan == ',') {
3121  int i;
3122  for (i = 0; i < count; i++) {
3123  int j;
3124  if (setSize == 0) {
3125  break;
3126  }
3127  ADD_MASK(tempMask);
3128  setSize = 0;
3129  for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j > 0; j--) {
3130  //
3131  // Use a temp var in case macro is changed to evaluate
3132  // args multiple times.
3133  //
3134  if (KMP_CPU_ISSET(j - stride, tempMask)) {
3135  KMP_CPU_SET(j, tempMask);
3136  setSize++;
3137  }
3138  else {
3139  KMP_CPU_CLR(j, tempMask);
3140  }
3141  }
3142  for (; j >= 0; j--) {
3143  KMP_CPU_CLR(j, tempMask);
3144  }
3145  }
3146  KMP_CPU_ZERO(tempMask);
3147  setSize = 0;
3148 
3149  if (*scan == '\0') {
3150  break;
3151  }
3152  scan++; // skip ','
3153  continue;
3154  }
3155 
3156  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3157  scan++; // skip ':'
3158 
3159  //
3160  // Read stride parameter
3161  //
3162  int sign = +1;
3163  for (;;) {
3164  SKIP_WS(scan);
3165  if (*scan == '+') {
3166  scan++; // skip '+'
3167  continue;
3168  }
3169  if (*scan == '-') {
3170  sign *= -1;
3171  scan++; // skip '-'
3172  continue;
3173  }
3174  break;
3175  }
3176  SKIP_WS(scan);
3177  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3178  "bad explicit places list");
3179  next = scan;
3180  SKIP_DIGITS(next);
3181  stride = __kmp_str_to_int(scan, *next);
3182  KMP_DEBUG_ASSERT(stride >= 0);
3183  scan = next;
3184  stride *= sign;
3185 
3186  if (stride > 0) {
3187  int i;
3188  for (i = 0; i < count; i++) {
3189  int j;
3190  if (setSize == 0) {
3191  break;
3192  }
3193  ADD_MASK(tempMask);
3194  setSize = 0;
3195  for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3196  if (KMP_CPU_ISSET(j - stride, tempMask)) {
3197  KMP_CPU_SET(j, tempMask);
3198  setSize++;
3199  }
3200  else {
3201  KMP_CPU_CLR(j, tempMask);
3202  }
3203  }
3204  for (; j >= 0; j--) {
3205  KMP_CPU_CLR(j, tempMask);
3206  }
3207  }
3208  }
3209  else {
3210  int i;
3211  for (i = 0; i < count; i++) {
3212  unsigned j;
3213  if (setSize == 0) {
3214  break;
3215  }
3216  ADD_MASK(tempMask);
3217  setSize = 0;
3218  for (j = 0; j < (__kmp_affin_mask_size * CHAR_BIT) + stride;
3219  j++) {
3220  if (KMP_CPU_ISSET(j - stride, tempMask)) {
3221  KMP_CPU_SET(j, tempMask);
3222  setSize++;
3223  }
3224  else {
3225  KMP_CPU_CLR(j, tempMask);
3226  }
3227  }
3228  for (; j < __kmp_affin_mask_size * CHAR_BIT; j++) {
3229  KMP_CPU_CLR(j, tempMask);
3230  }
3231  }
3232  }
3233  KMP_CPU_ZERO(tempMask);
3234  setSize = 0;
3235 
3236  //
3237  // valid follow sets are ',' and EOL
3238  //
3239  SKIP_WS(scan);
3240  if (*scan == '\0') {
3241  break;
3242  }
3243  if (*scan == ',') {
3244  scan++; // skip ','
3245  continue;
3246  }
3247 
3248  KMP_ASSERT2(0, "bad explicit places list");
3249  }
3250 
3251  *out_numMasks = nextNewMask;
3252  if (nextNewMask == 0) {
3253  *out_masks = NULL;
3254  KMP_INTERNAL_FREE(newMasks);
3255  return;
3256  }
3257  *out_masks
3258  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3259  memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3260  __kmp_free(tempMask);
3261  KMP_INTERNAL_FREE(newMasks);
3262 }
3263 
3264 # endif /* OMP_40_ENABLED */
3265 
3266 #undef ADD_MASK
3267 #undef ADD_MASK_OSID
3268 
3269 
3270 # if KMP_MIC
3271 
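//
// Trim the topology map so that only __kmp_place_num_cores cores per
// package (starting at __kmp_place_core_offset) and the first
// __kmp_place_num_threads_per_core contexts per core remain, then update
// nCoresPerPkg, __kmp_nThreadsPerCore, __kmp_avail_proc, and __kmp_ncores
// to match. Only uniform, 3-level (package/core/thread) topologies are
// supported.
//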
3272 static void
3273 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3274 {
3275  if ( __kmp_place_num_cores == 0 ) {
3276  if ( __kmp_place_num_threads_per_core == 0 ) {
3277  return; // no cores limiting actions requested, exit
3278  }
3279  __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3280  }
3281  if ( !__kmp_affinity_uniform_topology() || depth != 3 ) {
3282  KMP_WARNING( AffThrPlaceUnsupported );
3283  return; // non-uniform topologies and non-3-level architectures are not supported
3284  }
3285  if ( __kmp_place_num_threads_per_core == 0 ) {
3286  __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3287  }
3288  if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3289  KMP_WARNING( AffThrPlaceManyCores );
3290  return;
3291  }
3292 
3293  AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3294  nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3295  int i, j, k, n_old = 0, n_new = 0;
3296  for ( i = 0; i < nPackages; ++i ) {
3297  for ( j = 0; j < nCoresPerPkg; ++j ) {
3298  if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3299  n_old += __kmp_nThreadsPerCore; // skip not-requested core
3300  } else {
3301  for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3302  if ( k < __kmp_place_num_threads_per_core ) {
3303  newAddr[n_new] = (*pAddr)[n_old]; // copy requested core's data to its new location
3304  n_new++;
3305  }
3306  n_old++;
3307  }
3308  }
3309  }
3310  }
3311  nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3312  __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3313  __kmp_avail_proc = n_new; // correct avail_proc
3314  __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3315 
3316  __kmp_free( *pAddr );
3317  *pAddr = newAddr; // replace old topology with new one
3318 }
3319 
3320 # endif /* KMP_MIC */
3321 
3322 
3323 static AddrUnsPair *address2os = NULL;
3324 static int * procarr = NULL;
3325 static int __kmp_aff_depth = 0;
3326 
3327 static void
3328 __kmp_aux_affinity_initialize(void)
3329 {
3330  if (__kmp_affinity_masks != NULL) {
3331  KMP_ASSERT(fullMask != NULL);
3332  return;
3333  }
3334 
3335  //
3336  // Create the "full" mask - this defines all of the processors that we
3337  // consider to be in the machine model. If respect is set, then it is
3338  // the initialization thread's affinity mask. Otherwise, it is all
3339  // processors that we know about on the machine.
3340  //
3341  if (fullMask == NULL) {
3342  fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3343  }
3344  if (KMP_AFFINITY_CAPABLE()) {
3345  if (__kmp_affinity_respect_mask) {
3346  __kmp_get_system_affinity(fullMask, TRUE);
3347 
3348  //
3349  // Count the number of available processors.
3350  //
3351  unsigned i;
3352  __kmp_avail_proc = 0;
3353  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3354  if (! KMP_CPU_ISSET(i, fullMask)) {
3355  continue;
3356  }
3357  __kmp_avail_proc++;
3358  }
3359  if (__kmp_avail_proc > __kmp_xproc) {
3360  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3361  && (__kmp_affinity_type != affinity_none))) {
3362  KMP_WARNING(ErrorInitializeAffinity);
3363  }
3364  __kmp_affinity_type = affinity_none;
3365  __kmp_affin_mask_size = 0;
3366  return;
3367  }
3368  }
3369  else {
3370  __kmp_affinity_entire_machine_mask(fullMask);
3371  __kmp_avail_proc = __kmp_xproc;
3372  }
3373  }
3374 
3375  int depth = -1;
3376  kmp_i18n_id_t msg_id = kmp_i18n_null;
3377 
3378  //
3379  // For backward compatibility, setting KMP_CPUINFO_FILE =>
3380  // KMP_TOPOLOGY_METHOD=cpuinfo
3381  //
3382  if ((__kmp_cpuinfo_file != NULL) &&
3383  (__kmp_affinity_top_method == affinity_top_method_all)) {
3384  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3385  }
3386 
3387  if (__kmp_affinity_top_method == affinity_top_method_all) {
3388  //
3389  // In the default code path, errors are not fatal - we just try using
3390  // another method. We only emit a warning message if affinity is on,
3391  // or the verbose flag is set, and the nowarnings flag was not set.
3392  //
3393  const char *file_name = NULL;
3394  int line = 0;
3395 
3396 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3397 
3398  if (__kmp_affinity_verbose) {
3399  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3400  }
3401 
3402  file_name = NULL;
3403  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3404  if (depth == 0) {
3405  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3406  KMP_ASSERT(address2os == NULL);
3407  return;
3408  }
3409 
3410  if (depth < 0) {
3411  if ((msg_id != kmp_i18n_null)
3412  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3413  && (__kmp_affinity_type != affinity_none)))) {
3414 # if KMP_MIC
3415  if (__kmp_affinity_verbose) {
3416  KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3417  KMP_I18N_STR(DecodingLegacyAPIC));
3418  }
3419 # else
3420  KMP_WARNING(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3421  KMP_I18N_STR(DecodingLegacyAPIC));
3422 # endif
3423  }
3424 
3425  file_name = NULL;
3426  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3427  if (depth == 0) {
3428  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3429  KMP_ASSERT(address2os == NULL);
3430  return;
3431  }
3432  }
3433 
3434 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3435 
3436 # if KMP_OS_LINUX
3437 
3438  if (depth < 0) {
3439  if ((msg_id != kmp_i18n_null)
3440  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3441  && (__kmp_affinity_type != affinity_none)))) {
3442 # if KMP_MIC
3443  if (__kmp_affinity_verbose) {
3444  KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3445  }
3446 # else
3447  KMP_WARNING(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3448 # endif
3449  }
3450  else if (__kmp_affinity_verbose) {
3451  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3452  }
3453 
3454  FILE *f = fopen("/proc/cpuinfo", "r");
3455  if (f == NULL) {
3456  msg_id = kmp_i18n_str_CantOpenCpuinfo;
3457  }
3458  else {
3459  file_name = "/proc/cpuinfo";
3460  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3461  fclose(f);
3462  if (depth == 0) {
3463  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3464  KMP_ASSERT(address2os == NULL);
3465  return;
3466  }
3467  }
3468  }
3469 
3470 # endif /* KMP_OS_LINUX */
3471 
3472  if (depth < 0) {
3473  if (msg_id != kmp_i18n_null
3474  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3475  && (__kmp_affinity_type != affinity_none)))) {
3476  if (file_name == NULL) {
3477  KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3478  }
3479  else if (line == 0) {
3480  KMP_WARNING(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3481  }
3482  else {
3483  KMP_WARNING(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3484  }
3485  }
3486 
3487  file_name = "";
3488  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3489  if (depth == 0) {
3490  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3491  KMP_ASSERT(address2os == NULL);
3492  return;
3493  }
3494  KMP_ASSERT(depth > 0);
3495  KMP_ASSERT(address2os != NULL);
3496  }
3497  }
3498 
3499  //
3500  // If the user has specified that a particular topology discovery method
3501  // is to be used, then we abort if that method fails. The exception is
3502  // group affinity, which might have been implicitly set.
3503  //
3504 
3505 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3506 
3507  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3508  if (__kmp_affinity_verbose) {
3509  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3510  KMP_I18N_STR(Decodingx2APIC));
3511  }
3512 
3513  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3514  if (depth == 0) {
3515  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3516  KMP_ASSERT(address2os == NULL);
3517  return;
3518  }
3519 
3520  if (depth < 0) {
3521  KMP_ASSERT(msg_id != kmp_i18n_null);
3522  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3523  }
3524  }
3525  else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3526  if (__kmp_affinity_verbose) {
3527  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3528  KMP_I18N_STR(DecodingLegacyAPIC));
3529  }
3530 
3531  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3532  if (depth == 0) {
3533  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3534  KMP_ASSERT(address2os == NULL);
3535  return;
3536  }
3537 
3538  if (depth < 0) {
3539  KMP_ASSERT(msg_id != kmp_i18n_null);
3540  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3541  }
3542  }
3543 
3544 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3545 
3546  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3547  const char *filename;
3548  if (__kmp_cpuinfo_file != NULL) {
3549  filename = __kmp_cpuinfo_file;
3550  }
3551  else {
3552  filename = "/proc/cpuinfo";
3553  }
3554 
3555  if (__kmp_affinity_verbose) {
3556  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3557  }
3558 
3559  FILE *f = fopen(filename, "r");
3560  if (f == NULL) {
3561  int code = errno;
3562  if (__kmp_cpuinfo_file != NULL) {
3563  __kmp_msg(
3564  kmp_ms_fatal,
3565  KMP_MSG(CantOpenFileForReading, filename),
3566  KMP_ERR(code),
3567  KMP_HNT(NameComesFrom_CPUINFO_FILE),
3568  __kmp_msg_null
3569  );
3570  }
3571  else {
3572  __kmp_msg(
3573  kmp_ms_fatal,
3574  KMP_MSG(CantOpenFileForReading, filename),
3575  KMP_ERR(code),
3576  __kmp_msg_null
3577  );
3578  }
3579  }
3580  int line = 0;
3581  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3582  fclose(f);
3583  if (depth < 0) {
3584  KMP_ASSERT(msg_id != kmp_i18n_null);
3585  if (line > 0) {
3586  KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3587  }
3588  else {
3589  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3590  }
3591  }
3592  if (__kmp_affinity_type == affinity_none) {
3593  KMP_ASSERT(depth == 0);
3594  KMP_ASSERT(address2os == NULL);
3595  return;
3596  }
3597  }
3598 
3599 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3600 
3601  else if (__kmp_affinity_top_method == affinity_top_method_group) {
3602  if (__kmp_affinity_verbose) {
3603  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3604  }
3605 
3606  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3607  KMP_ASSERT(depth != 0);
3608 
3609  if (depth < 0) {
3610  if ((msg_id != kmp_i18n_null)
3611  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3612  && (__kmp_affinity_type != affinity_none)))) {
3613  KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3614  }
3615 
3616  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3617  if (depth == 0) {
3618  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3619  KMP_ASSERT(address2os == NULL);
3620  return;
3621  }
3622  // should not fail
3623  KMP_ASSERT(depth > 0);
3624  KMP_ASSERT(address2os != NULL);
3625  }
3626  }
3627 
3628 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
3629 
3630  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3631  if (__kmp_affinity_verbose) {
3632  KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3633  }
3634 
3635  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3636  if (depth == 0) {
3637  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3638  KMP_ASSERT(address2os == NULL);
3639  return;
3640  }
3641  // should not fail
3642  KMP_ASSERT(depth > 0);
3643  KMP_ASSERT(address2os != NULL);
3644  }
3645 
3646  if (address2os == NULL) {
3647  if (KMP_AFFINITY_CAPABLE()
3648  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3649  && (__kmp_affinity_type != affinity_none)))) {
3650  KMP_WARNING(ErrorInitializeAffinity);
3651  }
3652  __kmp_affinity_type = affinity_none;
3653  __kmp_affin_mask_size = 0;
3654  return;
3655  }
3656 
3657 # if KMP_MIC
3658  __kmp_apply_thread_places(&address2os, depth);
3659 # endif
3660 
3661  //
3662  // Create the table of masks, indexed by thread Id.
3663  //
3664  unsigned maxIndex;
3665  unsigned numUnique;
3666  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3667  address2os, __kmp_avail_proc);
3668  if (__kmp_affinity_gran_levels == 0) {
3669  KMP_DEBUG_ASSERT(numUnique == __kmp_avail_proc);
3670  }
3671 
3672  //
3673  // Set the childNums vector in all Address objects. This must be done
3674  // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3675  // which takes into account the setting of __kmp_affinity_compact.
3676  //
3677  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3678 
3679  switch (__kmp_affinity_type) {
3680 
3681  case affinity_explicit:
3682  KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3683 # if OMP_40_ENABLED
3684  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3685 # endif
3686  {
3687  __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3688  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3689  maxIndex);
3690  }
3691 # if OMP_40_ENABLED
3692  else {
3693  __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3694  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3695  maxIndex);
3696  }
3697 # endif
3698  if (__kmp_affinity_num_masks == 0) {
3699  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3700  && (__kmp_affinity_type != affinity_none))) {
3701  KMP_WARNING(AffNoValidProcID);
3702  }
3703  __kmp_affinity_type = affinity_none;
3704  return;
3705  }
3706  break;
3707 
3708  //
3709  // The other affinity types rely on sorting the Addresses according
3710  // to some permutation of the machine topology tree. Set
3711  // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3712  // then jump to a common code fragment to do the sort and create
3713  // the array of affinity masks.
3714  //
3715 
3716  case affinity_logical:
3717  __kmp_affinity_compact = 0;
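// Illustrative example (assumed values): '*' and '%' bind equally tight, so
// the offset becomes (__kmp_nThreadsPerCore * offset) % __kmp_avail_proc;
// e.g. with 2 threads per core, 8 available procs and a user offset of 3,
// the rescaled offset is (2 * 3) % 8 == 6.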
3718  if (__kmp_affinity_offset) {
3719  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3720  % __kmp_avail_proc;
3721  }
3722  goto sortAddresses;
3723 
3724  case affinity_physical:
3725  if (__kmp_nThreadsPerCore > 1) {
3726  __kmp_affinity_compact = 1;
3727  if (__kmp_affinity_compact >= depth) {
3728  __kmp_affinity_compact = 0;
3729  }
3730  } else {
3731  __kmp_affinity_compact = 0;
3732  }
3733  if (__kmp_affinity_offset) {
3734  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3735  % __kmp_avail_proc;
3736  }
3737  goto sortAddresses;
3738 
3739  case affinity_scatter:
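// Scatter reuses the compact sorting machinery with the level flipped
// (depth - 1 - compact), so consecutive masks are spread across the
// outermost topology levels instead of being packed together.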
3740  if (__kmp_affinity_compact >= depth) {
3741  __kmp_affinity_compact = 0;
3742  }
3743  else {
3744  __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3745  }
3746  goto sortAddresses;
3747 
3748  case affinity_compact:
3749  if (__kmp_affinity_compact >= depth) {
3750  __kmp_affinity_compact = depth - 1;
3751  }
3752  goto sortAddresses;
3753 
3754 # if KMP_MIC
3755  case affinity_balanced:
3756  // Balanced affinity works only on a single package; uniform and non-uniform topologies are handled separately below
3757  if( nPackages > 1 ) {
3758  if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3759  KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3760  }
3761  __kmp_affinity_type = affinity_none;
3762  return;
3763  } else if( __kmp_affinity_uniform_topology() ) {
3764  break;
3765  } else { // Non-uniform topology
3766 
3767  // Save the depth for further usage
3768  __kmp_aff_depth = depth;
3769 
3770  // Number of hyper threads per core in HT machine
3771  int nth_per_core = __kmp_nThreadsPerCore;
3772 
3773  int core_level;
3774  if( nth_per_core > 1 ) {
3775  core_level = depth - 2;
3776  } else {
3777  core_level = depth - 1;
3778  }
3779  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3780  int nproc = nth_per_core * ncores;
3781 
3782  procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3783  for( int i = 0; i < nproc; i++ ) {
3784  procarr[ i ] = -1;
3785  }
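// procarr[] is indexed by (core * nth_per_core + thread) and holds the OS
// proc id of that hardware context, or -1 if no available proc maps to it
// (possible on a non-uniform topology where some cores expose fewer contexts).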
3786 
3787  for( int i = 0; i < __kmp_avail_proc; i++ ) {
3788  int proc = address2os[ i ].second;
3789  // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3790  // If there is only one thread per core then depth == 2: level 0 - package,
3791  // level 1 - core.
3792  int level = depth - 1;
3793 
3794  // Defaults for the case of one thread context per core (nth_per_core == 1)
3795  int thread = 0;
3796  int core = address2os[ i ].first.labels[ level ];
3797  // If the thread level exists, that is, there is more than one thread context per core
3798  if( nth_per_core > 1 ) {
3799  thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3800  core = address2os[ i ].first.labels[ level - 1 ];
3801  }
3802  procarr[ core * nth_per_core + thread ] = proc;
3803  }
3804 
3805  break;
3806  }
3807 # endif
3808 
3809  sortAddresses:
3810  //
3811  // Allocate the gtid->affinity mask table.
3812  //
3813  if (__kmp_affinity_dups) {
3814  __kmp_affinity_num_masks = __kmp_avail_proc;
3815  }
3816  else {
3817  __kmp_affinity_num_masks = numUnique;
3818  }
3819 
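// If an explicit number of places was requested (presumably via an OMP 4.0
// place list) and it is smaller than the number of distinct masks, honor the
// smaller count.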
3820 # if OMP_40_ENABLED
3821  if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3822  && ( __kmp_affinity_num_places > 0 )
3823  && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3824  __kmp_affinity_num_masks = __kmp_affinity_num_places;
3825  }
3826 # endif
3827 
3828  __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3829  __kmp_affinity_num_masks * __kmp_affin_mask_size);
3830 
3831  //
3832  // Sort the address2os table according to the current setting of
3833  // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3834  //
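// When __kmp_affinity_dups is off, only the "leader" Address of each
// granularity group (as marked by __kmp_create_masks()) contributes a mask,
// so each place appears exactly once.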
3835  qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3836  __kmp_affinity_cmp_Address_child_num);
3837  {
3838  int i;
3839  unsigned j;
3840  for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3841  if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3842  continue;
3843  }
3844  unsigned osId = address2os[i].second;
3845  kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3846  kmp_affin_mask_t *dest
3847  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3848  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3849  KMP_CPU_COPY(dest, src);
3850  if (++j >= __kmp_affinity_num_masks) {
3851  break;
3852  }
3853  }
3854  KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3855  }
3856  break;
3857 
3858  default:
3859  KMP_ASSERT2(0, "Unexpected affinity setting");
3860  }
3861 
3862  __kmp_free(osId2Mask);
3863 }
3864 
3865 
3866 void
3867 __kmp_affinity_initialize(void)
3868 {
3869  //
3870  // Much of the code above was written assuming that if a machine was not
3871  // affinity capable, then __kmp_affinity_type == affinity_none. We now
3872  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3873  //
3874  // There are too many checks for __kmp_affinity_type == affinity_none
3875  // in this code. Instead of trying to change them all, check if
3876  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3877  // affinity_none, call the real initialization routine, then restore
3878  // __kmp_affinity_type to affinity_disabled.
3879  //
3880  int disabled = (__kmp_affinity_type == affinity_disabled);
3881  if (! KMP_AFFINITY_CAPABLE()) {
3882  KMP_ASSERT(disabled);
3883  }
3884  if (disabled) {
3885  __kmp_affinity_type = affinity_none;
3886  }
3887  __kmp_aux_affinity_initialize();
3888  if (disabled) {
3889  __kmp_affinity_type = affinity_disabled;
3890  }
3891 }
3892 
3893 
3894 void
3895 __kmp_affinity_uninitialize(void)
3896 {
3897  if (__kmp_affinity_masks != NULL) {
3898  __kmp_free(__kmp_affinity_masks);
3899  __kmp_affinity_masks = NULL;
3900  }
3901  if (fullMask != NULL) {
3902  KMP_CPU_FREE(fullMask);
3903  fullMask = NULL;
3904  }
3905  __kmp_affinity_num_masks = 0;
3906 # if OMP_40_ENABLED
3907  __kmp_affinity_num_places = 0;
3908 # endif
3909  if (__kmp_affinity_proclist != NULL) {
3910  __kmp_free(__kmp_affinity_proclist);
3911  __kmp_affinity_proclist = NULL;
3912  }
3913  if( address2os != NULL ) {
3914  __kmp_free( address2os );
3915  address2os = NULL;
3916  }
3917  if( procarr != NULL ) {
3918  __kmp_free( procarr );
3919  procarr = NULL;
3920  }
3921 }
3922 
3923 
3924 void
3925 __kmp_affinity_set_init_mask(int gtid, int isa_root)
3926 {
3927  if (! KMP_AFFINITY_CAPABLE()) {
3928  return;
3929  }
3930 
3931  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3932  if (th->th.th_affin_mask == NULL) {
3933  KMP_CPU_ALLOC(th->th.th_affin_mask);
3934  }
3935  else {
3936  KMP_CPU_ZERO(th->th.th_affin_mask);
3937  }
3938 
3939  //
3940  // Copy the thread mask to the kmp_info_t structure.
3941  // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
3942  // that has all of the OS proc ids set; if __kmp_affinity_respect_mask is
3943  // set, the full mask is the same as the mask of the initialization
3944  // thread.
3945  //
3946  kmp_affin_mask_t *mask;
3947  int i;
3948 
3949 # if OMP_40_ENABLED
3950  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3951 # endif
3952  {
3953  if ((__kmp_affinity_type == affinity_none)
3954 # if KMP_MIC
3955  || (__kmp_affinity_type == affinity_balanced)
3956 # endif
3957  ) {
3958 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3959  if (__kmp_num_proc_groups > 1) {
3960  return;
3961  }
3962 # endif
3963  KMP_ASSERT(fullMask != NULL);
3964  i = -1;
3965  mask = fullMask;
3966  }
3967  else {
3968  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
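// Illustrative example (assumed values): with __kmp_affinity_num_masks == 4
// and __kmp_affinity_offset == 1, gtids 0..3 are assigned places 1, 2, 3, 0.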
3969  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3970  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3971  }
3972  }
3973 # if OMP_40_ENABLED
3974  else {
3975  if ((! isa_root)
3976  || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
3977 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3978  if (__kmp_num_proc_groups > 1) {
3979  return;
3980  }
3981 # endif
3982  KMP_ASSERT(fullMask != NULL);
3983  i = KMP_PLACE_ALL;
3984  mask = fullMask;
3985  }
3986  else {
3987  //
3988  // int i = some hash function or just a counter that doesn't
3989  // always start at 0. Use gtid for now.
3990  //
3991  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3992  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3993  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3994  }
3995  }
3996 # endif
3997 
3998 # if OMP_40_ENABLED
3999  th->th.th_current_place = i;
4000  if (isa_root) {
4001  th->th.th_new_place = i;
4002  th->th.th_first_place = 0;
4003  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4004  }
4005 
4006  if (i == KMP_PLACE_ALL) {
4007  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4008  gtid));
4009  }
4010  else {
4011  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4012  gtid, i));
4013  }
4014 # else
4015  if (i == -1) {
4016  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4017  gtid));
4018  }
4019  else {
4020  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4021  gtid, i));
4022  }
4023 # endif /* OMP_40_ENABLED */
4024 
4025  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4026 
4027  if (__kmp_affinity_verbose) {
4028  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4029  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4030  th->th.th_affin_mask);
4031  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", gtid, buf);
4032  }
4033 
4034 # if KMP_OS_WINDOWS
4035  //
4036  // On Windows* OS, the process affinity mask might have changed.
4037  // If the user didn't request affinity and this call fails,
4038  // just continue silently. See CQ171393.
4039  //
4040  if ( __kmp_affinity_type == affinity_none ) {
4041  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4042  }
4043  else
4044 # endif
4045  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4046 }
4047 
4048 
4049 # if OMP_40_ENABLED
4050 
4051 void
4052 __kmp_affinity_set_place(int gtid)
4053 {
4054  int retval;
4055 
4056  if (! KMP_AFFINITY_CAPABLE()) {
4057  return;
4058  }
4059 
4060  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4061 
4062  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4063  gtid, th->th.th_new_place, th->th.th_current_place));
4064 
4065  //
4066  // Check that the new place is within this thread's partition.
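// (A partition may wrap around the end of the place list, in which case
// th_first_place > th_last_place and the else branch below applies.)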
4067  //
4068  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4069  KMP_DEBUG_ASSERT(th->th.th_new_place >= 0);
4070  KMP_DEBUG_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4071  if (th->th.th_first_place <= th->th.th_last_place) {
4072  KMP_DEBUG_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4073  && (th->th.th_new_place <= th->th.th_last_place));
4074  }
4075  else {
4076  KMP_DEBUG_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4077  || (th->th.th_new_place >= th->th.th_last_place));
4078  }
4079 
4080  //
4081  // Copy the thread mask to the kmp_info_t structure,
4082  // and set this thread's affinity.
4083  //
4084  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4085  th->th.th_new_place);
4086  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4087  th->th.th_current_place = th->th.th_new_place;
4088 
4089  if (__kmp_affinity_verbose) {
4090  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4091  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4092  th->th.th_affin_mask);
4093  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", gtid, buf);
4094  }
4095  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4096 }
4097 
4098 # endif /* OMP_40_ENABLED */
4099 
4100 
4101 int
4102 __kmp_aux_set_affinity(void **mask)
4103 {
4104  int gtid;
4105  kmp_info_t *th;
4106  int retval;
4107 
4108  if (! KMP_AFFINITY_CAPABLE()) {
4109  return -1;
4110  }
4111 
4112  gtid = __kmp_entry_gtid();
4113  KA_TRACE(1000, ;{
4114  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4115  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4116  (kmp_affin_mask_t *)(*mask));
4117  __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4118  gtid, buf);
4119  });
4120 
4121  if (__kmp_env_consistency_check) {
4122  if ((mask == NULL) || (*mask == NULL)) {
4123  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4124  }
4125  else {
4126  unsigned proc;
4127  int num_procs = 0;
4128 
4129  for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4130  if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4131  continue;
4132  }
4133  num_procs++;
4134  if (! KMP_CPU_ISSET(proc, fullMask)) {
4135  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4136  break;
4137  }
4138  }
4139  if (num_procs == 0) {
4140  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4141  }
4142 
4143 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
4144  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4145  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4146  }
4147 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
4148 
4149  }
4150  }
4151 
4152  th = __kmp_threads[gtid];
4153  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4154  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4155  if (retval == 0) {
4156  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4157  }
4158 
4159 # if OMP_40_ENABLED
4160  th->th.th_current_place = KMP_PLACE_UNDEFINED;
4161  th->th.th_new_place = KMP_PLACE_UNDEFINED;
4162  th->th.th_first_place = 0;
4163  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4164 # endif
4165 
4166  return retval;
4167 }
4168 
4169 
4170 int
4171 __kmp_aux_get_affinity(void **mask)
4172 {
4173  int gtid;
4174  int retval;
4175  kmp_info_t *th;
4176 
4177  if (! KMP_AFFINITY_CAPABLE()) {
4178  return -1;
4179  }
4180 
4181  gtid = __kmp_entry_gtid();
4182  th = __kmp_threads[gtid];
4183  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4184 
4185  KA_TRACE(1000, ;{
4186  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4187  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4188  th->th.th_affin_mask);
4189  __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4190  });
4191 
4192  if (__kmp_env_consistency_check) {
4193  if ((mask == NULL) || (*mask == NULL)) {
4194  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4195  }
4196  }
4197 
4198 # if !KMP_OS_WINDOWS
4199 
4200  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4201  KA_TRACE(1000, ;{
4202  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4203  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4204  (kmp_affin_mask_t *)(*mask));
4205  __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4206  });
4207  return retval;
4208 
4209 # else
4210 
4211  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4212  return 0;
4213 
4214 # endif /* KMP_OS_WINDOWS */
4215 
4216 }
4217 
4218 
4219 int
4220 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4221 {
4222  int retval;
4223 
4224  if (! KMP_AFFINITY_CAPABLE()) {
4225  return -1;
4226  }
4227 
4228  KA_TRACE(1000, ;{
4229  int gtid = __kmp_entry_gtid();
4230  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4231  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4232  (kmp_affin_mask_t *)(*mask));
4233  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4234  proc, gtid, buf);
4235  });
4236 
4237  if (__kmp_env_consistency_check) {
4238  if ((mask == NULL) || (*mask == NULL)) {
4239  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4240  }
4241  }
4242 
4243  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4244  return -1;
4245  }
4246  if (! KMP_CPU_ISSET(proc, fullMask)) {
4247  return -2;
4248  }
4249 
4250  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4251  return 0;
4252 }
4253 
4254 
4255 int
4256 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4257 {
4258  int retval;
4259 
4260  if (! KMP_AFFINITY_CAPABLE()) {
4261  return -1;
4262  }
4263 
4264  KA_TRACE(1000, ;{
4265  int gtid = __kmp_entry_gtid();
4266  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4267  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4268  (kmp_affin_mask_t *)(*mask));
4269  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4270  proc, gtid, buf);
4271  });
4272 
4273  if (__kmp_env_consistency_check) {
4274  if ((mask == NULL) || (*mask == NULL)) {
4275  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4276  }
4277  }
4278 
4279  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4280  return -1;
4281  }
4282  if (! KMP_CPU_ISSET(proc, fullMask)) {
4283  return -2;
4284  }
4285 
4286  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4287  return 0;
4288 }
4289 
4290 
4291 int
4292 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4293 {
4294  int retval;
4295 
4296  if (! KMP_AFFINITY_CAPABLE()) {
4297  return -1;
4298  }
4299 
4300  KA_TRACE(1000, ;{
4301  int gtid = __kmp_entry_gtid();
4302  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4303  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4304  (kmp_affin_mask_t *)(*mask));
4305  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4306  proc, gtid, buf);
4307  });
4308 
4309  if (__kmp_env_consistency_check) {
4310  if ((mask == NULL) || (*mask == NULL)) {
4311  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4312  }
4313  }
4314 
4315  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4316  return 0;
4317  }
4318  if (! KMP_CPU_ISSET(proc, fullMask)) {
4319  return 0;
4320  }
4321 
4322  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4323 }
4324 
4325 # if KMP_MIC
4326 
4327 // Dynamic affinity settings - Affinity balanced
4328 void __kmp_balanced_affinity( int tid, int nthreads )
4329 {
4330  if( __kmp_affinity_uniform_topology() ) {
4331  int coreID;
4332  int threadID;
4333  // Number of hyper threads per core in HT machine
4334  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4335  // Number of cores
4336  int ncores = __kmp_ncores;
4337  // How many threads will be bound to each core
4338  int chunk = nthreads / ncores;
4340  // How many cores will have an additional thread bound to them - the "big" cores
4340  int big_cores = nthreads % ncores;
4341  // Number of threads on the big cores
4342  int big_nth = ( chunk + 1 ) * big_cores;
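// Illustrative example (assumed values): nthreads == 5 and ncores == 3 give
// chunk == 1, big_cores == 2, big_nth == 4: tids 0..3 go to the two "big"
// cores (two threads each) and tid 4 goes to core 2.  For tid >= big_nth,
// (tid - big_cores) / chunk equals big_cores + (tid - big_nth) / chunk
// because big_nth == (chunk + 1) * big_cores.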
4343  if( tid < big_nth ) {
4344  coreID = tid / (chunk + 1 );
4345  threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4346  } else { //tid >= big_nth
4347  coreID = ( tid - big_cores ) / chunk;
4348  threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4349  }
4350 
4351  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4352  "Illegal set affinity operation when not capable");
4353 
4354  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4355  KMP_CPU_ZERO(mask);
4356 
4357  // Granularity == thread
4358  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4359  int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4360  KMP_CPU_SET( osID, mask);
4361  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4362  for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4363  int osID;
4364  osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4365  KMP_CPU_SET( osID, mask);
4366  }
4367  }
4368  if (__kmp_affinity_verbose) {
4369  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4370  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4371  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
4372  }
4373  __kmp_set_system_affinity( mask, TRUE );
4374  } else { // Non-uniform topology
4375 
4376  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4377  KMP_CPU_ZERO(mask);
4378 
4379  // Number of hyper threads per core in HT machine
4380  int nth_per_core = __kmp_nThreadsPerCore;
4381  int core_level;
4382  if( nth_per_core > 1 ) {
4383  core_level = __kmp_aff_depth - 2;
4384  } else {
4385  core_level = __kmp_aff_depth - 1;
4386  }
4387 
4388  // Number of cores (maximum core id + 1); trailing cores with 0 available processors are not counted
4389  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4390 
4391  // For performance gain consider the special case nthreads == __kmp_avail_proc
4392  if( nthreads == __kmp_avail_proc ) {
4393  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4394  int osID = address2os[ tid ].second;
4395  KMP_CPU_SET( osID, mask);
4396  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4397  int coreID = address2os[ tid ].first.labels[ core_level ];
4398  // Count the osIDs found for the current core; there can be at most nth_per_core of them;
4399  // since address2os is sorted we can break when cnt == nth_per_core
4400  int cnt = 0;
4401  for( int i = 0; i < __kmp_avail_proc; i++ ) {
4402  int osID = address2os[ i ].second;
4403  int core = address2os[ i ].first.labels[ core_level ];
4404  if( core == coreID ) {
4405  KMP_CPU_SET( osID, mask);
4406  cnt++;
4407  if( cnt == nth_per_core ) {
4408  break;
4409  }
4410  }
4411  }
4412  }
4413  } else if( nthreads <= __kmp_ncores ) {
4414 
4415  int core = 0;
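// One thread per usable core: walk the cores that have at least one
// available context; the tid-th such core receives this thread, and its
// contexts are added to the mask (only the first one for fine/thread
// granularity).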
4416  for( int i = 0; i < ncores; i++ ) {
4418  // Check whether this core has at least one available processor in procarr[]
4418  int in_mask = 0;
4419  for( int j = 0; j < nth_per_core; j++ ) {
4420  if( procarr[ i * nth_per_core + j ] != - 1 ) {
4421  in_mask = 1;
4422  break;
4423  }
4424  }
4425  if( in_mask ) {
4426  if( tid == core ) {
4427  for( int j = 0; j < nth_per_core; j++ ) {
4428  int osID = procarr[ i * nth_per_core + j ];
4429  if( osID != -1 ) {
4430  KMP_CPU_SET( osID, mask );
4431  // For granularity=thread it is enough to set the first available osID for this core
4432  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4433  break;
4434  }
4435  }
4436  }
4437  break;
4438  } else {
4439  core++;
4440  }
4441  }
4442  }
4443 
4444  } else { // nthreads > __kmp_ncores
4445 
4446  // Array to save the number of processors at each core
4447  int nproc_at_core[ ncores ];
4448  // Array to save the number of cores with "x" available processors;
4449  int ncores_with_x_procs[ nth_per_core + 1 ];
4450  // Array to save the number of cores with # procs from x to nth_per_core
4451  int ncores_with_x_to_max_procs[ nth_per_core + 1 ];
4452 
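// Illustrative example (assumed values): with nth_per_core == 4 and four
// cores exposing {4, 4, 2, 0} available contexts, nproc_at_core == {4,4,2,0},
// ncores_with_x_procs == {1,0,1,0,2} and
// ncores_with_x_to_max_procs == {4,3,3,2,2}.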
4453  for( int i = 0; i <= nth_per_core; i++ ) {
4454  ncores_with_x_procs[ i ] = 0;
4455  ncores_with_x_to_max_procs[ i ] = 0;
4456  }
4457 
4458  for( int i = 0; i < ncores; i++ ) {
4459  int cnt = 0;
4460  for( int j = 0; j < nth_per_core; j++ ) {
4461  if( procarr[ i * nth_per_core + j ] != -1 ) {
4462  cnt++;
4463  }
4464  }
4465  nproc_at_core[ i ] = cnt;
4466  ncores_with_x_procs[ cnt ]++;
4467  }
4468 
4469  for( int i = 0; i <= nth_per_core; i++ ) {
4470  for( int j = i; j <= nth_per_core; j++ ) {
4471  ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4472  }
4473  }
4474 
4475  // Max number of processors
4476  int nproc = nth_per_core * ncores;
4477  // Array to keep the number of threads assigned to each hardware context
4478  int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4479  for( int i = 0; i < nproc; i++ ) {
4480  newarr[ i ] = 0;
4481  }
4482 
4483  int nth = nthreads;
4484  int flag = 0;
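// Roughly: the first sweep of the while loop (flag == 0) places at most one
// thread on each available hardware context; subsequent sweeps (flag == 1)
// stack the remaining threads onto those contexts, so newarr[] ends up with
// the number of threads assigned to each context.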
4485  while( nth > 0 ) {
4486  for( int j = 1; j <= nth_per_core; j++ ) {
4487  int cnt = ncores_with_x_to_max_procs[ j ];
4488  for( int i = 0; i < ncores; i++ ) {
4489  // Skip cores with 0 available processors
4490  if( nproc_at_core[ i ] == 0 ) {
4491  continue;
4492  }
4493  for( int k = 0; k < nth_per_core; k++ ) {
4494  if( procarr[ i * nth_per_core + k ] != -1 ) {
4495  if( newarr[ i * nth_per_core + k ] == 0 ) {
4496  newarr[ i * nth_per_core + k ] = 1;
4497  cnt--;
4498  nth--;
4499  break;
4500  } else {
4501  if( flag != 0 ) {
4502  newarr[ i * nth_per_core + k ] ++;
4503  cnt--;
4504  nth--;
4505  break;
4506  }
4507  }
4508  }
4509  }
4510  if( cnt == 0 || nth == 0 ) {
4511  break;
4512  }
4513  }
4514  if( nth == 0 ) {
4515  break;
4516  }
4517  }
4518  flag = 1;
4519  }
4520  int sum = 0;
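// Walk the per-context thread counts; the first context at which the running
// sum exceeds tid is the one this thread binds to (a context with
// newarr[i] == k hosts k consecutive tids).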
4521  for( int i = 0; i < nproc; i++ ) {
4522  sum += newarr[ i ];
4523  if( sum > tid ) {
4524  // Granularity == thread
4525  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4526  int osID = procarr[ i ];
4527  KMP_CPU_SET( osID, mask);
4528  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4529  int coreID = i / nth_per_core;
4530  for( int ii = 0; ii < nth_per_core; ii++ ) {
4531  int osID = procarr[ coreID * nth_per_core + ii ];
4532  if( osID != -1 ) {
4533  KMP_CPU_SET( osID, mask);
4534  }
4535  }
4536  }
4537  break;
4538  }
4539  }
4540  __kmp_free( newarr );
4541  }
4542 
4543  if (__kmp_affinity_verbose) {
4544  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4545  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4546  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
4547  }
4548  __kmp_set_system_affinity( mask, TRUE );
4549  }
4550 }
4551 
4552 # endif /* KMP_MIC */
4553 
4554 #elif KMP_OS_DARWIN
4555  // affinity not supported
4556 #else
4557  #error "Unknown or unsupported OS"
4558 #endif // KMP_OS_WINDOWS || KMP_OS_LINUX
4559