Author: avg
Date: Mon Apr  4 16:09:29 2016
New Revision: 297558
URL: https://svnweb.freebsd.org/changeset/base/297558

Log:
  new x86 smp topology detection code
  
  Previously, the code determined a topology of processing units
  (hardware threads, cores, packages) and then deduced a cache topology
  using certain assumptions.  The new code builds a topology that
  includes both processing units and caches using the information
  provided by the hardware.
  
  At the moment, the discovered full topology is used only to create
  a scheduling topology for SCHED_ULE.
  There is no KPI for other kernel uses.
  
  Summary:
  - based on APIC ID derivation rules for Intel and AMD CPUs (see the
    worked example after this list)
  - can handle non-uniform topologies
  - requires homogeneous APIC ID assignment (same bit widths for ID
    components)
  - topology for dual-node AMD CPUs may not be optimal
  - topology for latest AMD CPU models may not be optimal as the code is
    several years old
  - supports only thread/package/core/cache nodes
  
  Todo:
    - AMD dual-node processors
    - latest AMD processors
    - NUMA nodes
    - checking for homogeneity of the APIC ID assignment across packages
    - more flexible cache placement within topology
    - expose topology to userland, e.g., via sysctl nodes
  
  Long term todo:
    - KPI for CPU sharing and affinity with respect to various resources
      (e.g., two logical processors may share the same FPU, etc)
  
  Reviewed by:  mav
  Tested by:    mav
  MFC after:    1 month
  Differential Revision:        https://reviews.freebsd.org/D2728

Modified:
  head/sys/kern/subr_smp.c
  head/sys/sys/smp.h
  head/sys/x86/x86/mp_x86.c

Modified: head/sys/kern/subr_smp.c
==============================================================================
--- head/sys/kern/subr_smp.c    Mon Apr  4 15:56:14 2016        (r297557)
+++ head/sys/kern/subr_smp.c    Mon Apr  4 16:09:29 2016        (r297558)
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/bus.h>
 #include <sys/lock.h>
+#include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/sched.h>
@@ -51,6 +52,10 @@ __FBSDID("$FreeBSD$");
 #include "opt_sched.h"
 
 #ifdef SMP
+MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");
+#endif
+
+#ifdef SMP
 volatile cpuset_t stopped_cpus;
 volatile cpuset_t started_cpus;
 volatile cpuset_t suspended_cpus;
@@ -556,7 +561,7 @@ smp_rendezvous(void (* setup_func)(void 
        smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
 }
 
-static struct cpu_group group[MAXCPU];
+static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
 
 struct cpu_group *
 smp_topo(void)
@@ -616,6 +621,17 @@ smp_topo(void)
 }
 
 struct cpu_group *
+smp_topo_alloc(u_int count)
+{
+       static u_int index;
+       u_int curr;
+
+       curr = index;
+       index += count;
+       return (&group[curr]);
+}
+
+struct cpu_group *
 smp_topo_none(void)
 {
        struct cpu_group *top;
@@ -861,3 +877,233 @@ sysctl_kern_smp_active(SYSCTL_HANDLER_AR
        return (error);
 }
 
+
+#ifdef SMP
+void
+topo_init_node(struct topo_node *node)
+{
+
+       bzero(node, sizeof(*node));
+       TAILQ_INIT(&node->children);
+}
+
+void
+topo_init_root(struct topo_node *root)
+{
+
+       topo_init_node(root);
+       root->type = TOPO_TYPE_SYSTEM;
+}
+
+struct topo_node *
+topo_add_node_by_hwid(struct topo_node *parent, int hwid,
+    topo_node_type type, uintptr_t subtype)
+{
+       struct topo_node *node;
+
+       TAILQ_FOREACH_REVERSE(node, &parent->children,
+           topo_children, siblings) {
+               if (node->hwid == hwid
+                   && node->type == type && node->subtype == subtype) {
+                       return (node);
+               }
+       }
+
+       node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
+       topo_init_node(node);
+       node->parent = parent;
+       node->hwid = hwid;
+       node->type = type;
+       node->subtype = subtype;
+       TAILQ_INSERT_TAIL(&parent->children, node, siblings);
+       parent->nchildren++;
+
+       return (node);
+}
+
+struct topo_node *
+topo_find_node_by_hwid(struct topo_node *parent, int hwid,
+    topo_node_type type, uintptr_t subtype)
+{
+
+       struct topo_node *node;
+
+       TAILQ_FOREACH(node, &parent->children, siblings) {
+               if (node->hwid == hwid
+                   && node->type == type && node->subtype == subtype) {
+                       return (node);
+               }
+       }
+
+       return (NULL);
+}
+
+void
+topo_promote_child(struct topo_node *child)
+{
+       struct topo_node *next;
+       struct topo_node *node;
+       struct topo_node *parent;
+
+       parent = child->parent;
+       next = TAILQ_NEXT(child, siblings);
+       TAILQ_REMOVE(&parent->children, child, siblings);
+       TAILQ_INSERT_HEAD(&parent->children, child, siblings);
+
+       while (next != NULL) {
+               node = next;
+               next = TAILQ_NEXT(node, siblings);
+               TAILQ_REMOVE(&parent->children, node, siblings);
+               TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
+               child = node;
+       }
+}
+
+struct topo_node *
+topo_next_node(struct topo_node *top, struct topo_node *node)
+{
+       struct topo_node *next;
+
+       if ((next = TAILQ_FIRST(&node->children)) != NULL)
+               return (next);
+
+       if ((next = TAILQ_NEXT(node, siblings)) != NULL)
+               return (next);
+
+       while ((node = node->parent) != top)
+               if ((next = TAILQ_NEXT(node, siblings)) != NULL)
+                       return (next);
+
+       return (NULL);
+}
+
+struct topo_node *
+topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
+{
+       struct topo_node *next;
+
+       if ((next = TAILQ_NEXT(node, siblings)) != NULL)
+               return (next);
+
+       while ((node = node->parent) != top)
+               if ((next = TAILQ_NEXT(node, siblings)) != NULL)
+                       return (next);
+
+       return (NULL);
+}
+
+void
+topo_set_pu_id(struct topo_node *node, cpuid_t id)
+{
+
+       KASSERT(node->type == TOPO_TYPE_PU,
+           ("topo_set_pu_id: wrong node type: %u", node->type));
+       KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
+           ("topo_set_pu_id: cpuset already not empty"));
+       node->id = id;
+       CPU_SET(id, &node->cpuset);
+       node->cpu_count = 1;
+       node->subtype = 1;
+
+       while ((node = node->parent) != NULL) {
+               if (CPU_ISSET(id, &node->cpuset))
+                       break;
+               CPU_SET(id, &node->cpuset);
+               node->cpu_count++;
+       }
+}
+
+int
+topo_analyze(struct topo_node *topo_root, int all,
+    int *pkg_count, int *cores_per_pkg, int *thrs_per_core)
+{
+       struct topo_node *pkg_node;
+       struct topo_node *core_node;
+       struct topo_node *pu_node;
+       int thrs_per_pkg;
+       int cpp_counter;
+       int tpc_counter;
+       int tpp_counter;
+
+       *pkg_count = 0;
+       *cores_per_pkg = -1;
+       *thrs_per_core = -1;
+       thrs_per_pkg = -1;
+       pkg_node = topo_root;
+       while (pkg_node != NULL) {
+               if (pkg_node->type != TOPO_TYPE_PKG) {
+                       pkg_node = topo_next_node(topo_root, pkg_node);
+                       continue;
+               }
+               if (!all && CPU_EMPTY(&pkg_node->cpuset)) {
+                       pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
+                       continue;
+               }
+
+               (*pkg_count)++;
+
+               cpp_counter = 0;
+               tpp_counter = 0;
+               core_node = pkg_node;
+               while (core_node != NULL) {
+                       if (core_node->type == TOPO_TYPE_CORE) {
+                               if (!all && CPU_EMPTY(&core_node->cpuset)) {
+                                       core_node =
+                                           topo_next_nonchild_node(pkg_node,
+                                               core_node);
+                                       continue;
+                               }
+
+                               cpp_counter++;
+
+                               tpc_counter = 0;
+                               pu_node = core_node;
+                               while (pu_node != NULL) {
+                                       if (pu_node->type == TOPO_TYPE_PU &&
+                                           (all || !CPU_EMPTY(&pu_node->cpuset)))
+                                               tpc_counter++;
+                                       pu_node = topo_next_node(core_node,
+                                           pu_node);
+                               }
+
+                               if (*thrs_per_core == -1)
+                                       *thrs_per_core = tpc_counter;
+                               else if (*thrs_per_core != tpc_counter)
+                                       return (0);
+
+                               core_node = topo_next_nonchild_node(pkg_node,
+                                   core_node);
+                       } else {
+                               /* PU node directly under PKG. */
+                               if (core_node->type == TOPO_TYPE_PU &&
+                                  (all || !CPU_EMPTY(&core_node->cpuset)))
+                                       tpp_counter++;
+                               core_node = topo_next_node(pkg_node,
+                                   core_node);
+                       }
+               }
+
+               if (*cores_per_pkg == -1)
+                       *cores_per_pkg = cpp_counter;
+               else if (*cores_per_pkg != cpp_counter)
+                       return (0);
+               if (thrs_per_pkg == -1)
+                       thrs_per_pkg = tpp_counter;
+               else if (thrs_per_pkg != tpp_counter)
+                       return (0);
+
+               pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
+       }
+
+       KASSERT(*pkg_count > 0,
+               ("bug in topology or analysis"));
+       if (*cores_per_pkg == 0) {
+               KASSERT(*thrs_per_core == -1 && thrs_per_pkg > 0,
+                       ("bug in topology or analysis"));
+               *thrs_per_core = thrs_per_pkg;
+       }
+
+       return (1);
+}
+#endif /* SMP */
+

Modified: head/sys/sys/smp.h
==============================================================================
--- head/sys/sys/smp.h  Mon Apr  4 15:56:14 2016        (r297557)
+++ head/sys/sys/smp.h  Mon Apr  4 16:09:29 2016        (r297558)
@@ -17,9 +17,52 @@
 #ifndef LOCORE
 
 #include <sys/cpuset.h>
+#include <sys/queue.h>
 
 /*
- * Topology of a NUMA or HTT system.
+ * Types of nodes in the topological tree.
+ */
+typedef enum {
+       /* No node has this type; can be used in topo API calls. */
+       TOPO_TYPE_DUMMY,
+       /* Processing unit aka computing unit aka logical CPU. */
+       TOPO_TYPE_PU,
+       /* Physical subdivision of a package. */
+       TOPO_TYPE_CORE,
+       /* CPU L1/L2/L3 cache. */
+       TOPO_TYPE_CACHE,
+       /* Package aka chip, equivalent to socket. */
+       TOPO_TYPE_PKG,
+       /* NUMA node. */
+       TOPO_TYPE_NODE,
+       /* Other logical or physical grouping of PUs. */
+       /* E.g. PUs on the same die, or PUs sharing an FPU. */
+       TOPO_TYPE_GROUP,
+       /* The whole system. */
+       TOPO_TYPE_SYSTEM
+} topo_node_type;
+
+/* Hardware identifier of a topology component. */
+typedef        unsigned int hwid_t;
+/* Logical CPU identifier. */
+typedef        int cpuid_t;
+
+/* A node in the topology. */
+struct topo_node {
+       struct topo_node                        *parent;
+       TAILQ_HEAD(topo_children, topo_node)    children;
+       TAILQ_ENTRY(topo_node)                  siblings;
+       cpuset_t                                cpuset;
+       topo_node_type                          type;
+       uintptr_t                               subtype;
+       hwid_t                                  hwid;
+       cpuid_t                                 id;
+       int                                     nchildren;
+       int                                     cpu_count;
+};
+
+/*
+ * Scheduling topology of a NUMA or SMP system.
  *
  * The top level topology is an array of pointers to groups.  Each group
  * contains a bitmask of cpus in its group or subgroups.  It may also
@@ -52,6 +95,8 @@ typedef struct cpu_group *cpu_group_t;
 #define        CG_SHARE_L2     2
 #define        CG_SHARE_L3     3
 
+#define MAX_CACHE_LEVELS       CG_SHARE_L3
+
 /*
  * Behavior modifiers for load balancing and affinity.
  */
@@ -60,10 +105,29 @@ typedef struct cpu_group *cpu_group_t;
 #define        CG_FLAG_THREAD  (CG_FLAG_HTT | CG_FLAG_SMT)     /* Any threading. */
 
 /*
- * Convenience routines for building topologies.
+ * Convenience routines for building and traversing topologies.
  */
 #ifdef SMP
+void topo_init_node(struct topo_node *node);
+void topo_init_root(struct topo_node *root);
+struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid,
+    topo_node_type type, uintptr_t subtype);
+struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid,
+    topo_node_type type, uintptr_t subtype);
+void topo_promote_child(struct topo_node *child);
+struct topo_node * topo_next_node(struct topo_node *top,
+    struct topo_node *node);
+struct topo_node * topo_next_nonchild_node(struct topo_node *top,
+    struct topo_node *node);
+void topo_set_pu_id(struct topo_node *node, cpuid_t id);
+int topo_analyze(struct topo_node *topo_root, int all, int *pkg_count,
+    int *cores_per_pkg, int *thrs_per_core);
+
+#define        TOPO_FOREACH(i, root)   \
+       for (i = root; i != NULL; i = topo_next_node(root, i))
+
 struct cpu_group *smp_topo(void);
+struct cpu_group *smp_topo_alloc(u_int count);
 struct cpu_group *smp_topo_none(void);
 struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags);
 struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share,

Modified: head/sys/x86/x86/mp_x86.c
==============================================================================
--- head/sys/x86/x86/mp_x86.c   Mon Apr  4 15:56:14 2016        (r297557)
+++ head/sys/x86/x86/mp_x86.c   Mon Apr  4 16:09:29 2016        (r297558)
@@ -133,19 +133,28 @@ volatile int aps_ready = 0;
  * the APs.
  */
 struct cpu_info cpu_info[MAX_APIC_ID + 1];
-int cpu_apic_ids[MAXCPU];
 int apic_cpuids[MAX_APIC_ID + 1];
+int cpu_apic_ids[MAXCPU];
 
 /* Holds pending bitmap based IPIs per CPU */
 volatile u_int cpu_ipi_pending[MAXCPU];
 
-int cpu_logical;               /* logical cpus per core */
-int cpu_cores;                 /* cores per package */
-
 static void    release_aps(void *dummy);
 
-static u_int   hyperthreading_cpus;    /* logical cpus sharing L1 cache */
 static int     hyperthreading_allowed = 1;
+SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
+       &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
+
+static struct topo_node topo_root;
+
+static int pkg_id_shift;
+static int core_id_shift;
+static int disabled_cpus;
+
+struct cache_info {
+       int     id_shift;
+       int     present;
+} static caches[MAX_CACHE_LEVELS];
 
 void
 mem_range_AP_init(void)
@@ -155,60 +164,125 @@ mem_range_AP_init(void)
                mem_range_softc.mr_op->initAP(&mem_range_softc);
 }
 
-static void
-topo_probe_amd(void)
+/*
+ * Round up to the next power of two, if necessary, and then
+ * take log2.
+ * Returns -1 if argument is zero.
+ */
+static __inline int
+mask_width(u_int x)
 {
-       int core_id_bits;
-       int id;
 
-       /* AMD processors do not support HTT. */
-       cpu_logical = 1;
+       return (fls(x << (1 - powerof2(x))) - 1);
+}
+
+static int
+add_deterministic_cache(int type, int level, int share_count)
+{
 
-       if ((amd_feature2 & AMDID2_CMP) == 0) {
-               cpu_cores = 1;
-               return;
+       if (type == 0)
+               return (0);
+       if (type > 3) {
+               printf("unexpected cache type %d\n", type);
+               return (1);
        }
-
-       core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
-           AMDID_COREID_SIZE_SHIFT;
-       if (core_id_bits == 0) {
-               cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1;
-               return;
+       if (type == 2) /* ignore instruction cache */
+               return (1);
+       if (level == 0 || level > MAX_CACHE_LEVELS) {
+               printf("unexpected cache level %d\n", type);
+               return (1);
        }
 
-       /* Fam 10h and newer should get here. */
-       for (id = 0; id <= MAX_APIC_ID; id++) {
-               /* Check logical CPU availability. */
-               if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
-                       continue;
-               /* Check if logical CPU has the same package ID. */
-               if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits))
-                       continue;
-               cpu_cores++;
+       if (caches[level - 1].present) {
+               printf("WARNING: multiple entries for L%u data cache\n", level);
+               printf("%u => %u\n", caches[level - 1].id_shift,
+                   mask_width(share_count));
+       }
+       caches[level - 1].id_shift = mask_width(share_count);
+       caches[level - 1].present = 1;
+
+       if (caches[level - 1].id_shift > pkg_id_shift) {
+               printf("WARNING: L%u data cache covers more "
+                   "APIC IDs than a package\n", level);
+               printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
+               caches[level - 1].id_shift = pkg_id_shift;
+       }
+       if (caches[level - 1].id_shift < core_id_shift) {
+               printf("WARNING: L%u data cache covers less "
+                   "APIC IDs than a core\n", level);
+               printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
+               caches[level - 1].id_shift = core_id_shift;
        }
+
+       return (1);
 }
 
-/*
- * Round up to the next power of two, if necessary, and then
- * take log2.
- * Returns -1 if argument is zero.
- */
-static __inline int
-mask_width(u_int x)
+static void
+topo_probe_amd(void)
 {
+       u_int p[4];
+       int level;
+       int share_count;
+       int type;
+       int i;
 
-       return (fls(x << (1 - powerof2(x))) - 1);
+       /* No multi-core capability. */
+       if ((amd_feature2 & AMDID2_CMP) == 0)
+               return;
+
+       /* For families 10h and newer. */
+       pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
+           AMDID_COREID_SIZE_SHIFT;
+
+       /* For 0Fh family. */
+       if (pkg_id_shift == 0)
+               pkg_id_shift =
+                   mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
+
+       if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
+               for (i = 0; ; i++) {
+                       cpuid_count(0x8000001d, i, p);
+                       type = p[0] & 0x1f;
+                       level = (p[0] >> 5) & 0x7;
+                       share_count = 1 + ((p[0] >> 14) & 0xfff);
+
+                       if (!add_deterministic_cache(type, level, share_count))
+                               break;
+               }
+       } else {
+               if (cpu_exthigh >= 0x80000005) {
+                       cpuid_count(0x80000005, 0, p);
+                       if (((p[2] >> 24) & 0xff) != 0) {
+                               caches[0].id_shift = 0;
+                               caches[0].present = 1;
+                       }
+               }
+               if (cpu_exthigh >= 0x80000006) {
+                       cpuid_count(0x80000006, 0, p);
+                       if (((p[2] >> 16) & 0xffff) != 0) {
+                               caches[1].id_shift = 0;
+                               caches[1].present = 1;
+                       }
+                       if (((p[3] >> 18) & 0x3fff) != 0) {
+
+                               /*
+                                * TODO: Account for dual-node processors
+                                * where each node within a package has its own
+                                * L3 cache.
+                                */
+                               caches[2].id_shift = pkg_id_shift;
+                               caches[2].present = 1;
+                       }
+               }
+       }
 }
 
 static void
-topo_probe_0x4(void)
+topo_probe_intel_0x4(void)
 {
        u_int p[4];
-       int pkg_id_bits;
-       int core_id_bits;
        int max_cores;
        int max_logical;
-       int id;
 
        /* Both zero and one here mean one logical processor per package. */
        max_logical = (cpu_feature & CPUID_HTT) != 0 ?
@@ -216,180 +290,432 @@ topo_probe_0x4(void)
        if (max_logical <= 1)
                return;
 
-       /*
-        * Because of uniformity assumption we examine only
-        * those logical processors that belong to the same
-        * package as BSP.  Further, we count number of
-        * logical processors that belong to the same core
-        * as BSP thus deducing number of threads per core.
-        */
        if (cpu_high >= 0x4) {
                cpuid_count(0x04, 0, p);
                max_cores = ((p[0] >> 26) & 0x3f) + 1;
        } else
                max_cores = 1;
-       core_id_bits = mask_width(max_logical/max_cores);
-       if (core_id_bits < 0)
-               return;
-       pkg_id_bits = core_id_bits + mask_width(max_cores);
-
-       for (id = 0; id <= MAX_APIC_ID; id++) {
-               /* Check logical CPU availability. */
-               if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
-                       continue;
-               /* Check if logical CPU has the same package ID. */
-               if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
-                       continue;
-               cpu_cores++;
-               /* Check if logical CPU has the same package and core IDs. */
-               if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
-                       cpu_logical++;
-       }
 
-       KASSERT(cpu_cores >= 1 && cpu_logical >= 1,
-           ("topo_probe_0x4 couldn't find BSP"));
-
-       cpu_cores /= cpu_logical;
-       hyperthreading_cpus = cpu_logical;
+       core_id_shift = mask_width(max_logical/max_cores);
+       KASSERT(core_id_shift >= 0,
+           ("intel topo: max_cores > max_logical\n"));
+       pkg_id_shift = core_id_shift + mask_width(max_cores);
 }
 
 static void
-topo_probe_0xb(void)
+topo_probe_intel_0xb(void)
 {
        u_int p[4];
        int bits;
-       int cnt;
-       int i;
-       int logical;
        int type;
-       int x;
+       int i;
+
+       /* Fall back if CPU leaf 11 doesn't really exist. */
+       cpuid_count(0x0b, 0, p);
+       if (p[1] == 0) {
+               topo_probe_intel_0x4();
+               return;
+       }
 
        /* We only support three levels for now. */
-       for (i = 0; i < 3; i++) {
+       for (i = 0; ; i++) {
                cpuid_count(0x0b, i, p);
 
-               /* Fall back if CPU leaf 11 doesn't really exist. */
-               if (i == 0 && p[1] == 0) {
-                       topo_probe_0x4();
-                       return;
-               }
-
                bits = p[0] & 0x1f;
-               logical = p[1] &= 0xffff;
                type = (p[2] >> 8) & 0xff;
-               if (type == 0 || logical == 0)
+
+               if (type == 0)
                        break;
-               /*
-                * Because of uniformity assumption we examine only
-                * those logical processors that belong to the same
-                * package as BSP.
-                */
-               for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
-                       if (!cpu_info[x].cpu_present ||
-                           cpu_info[x].cpu_disabled)
-                               continue;
-                       if (x >> bits == boot_cpu_id >> bits)
-                               cnt++;
-               }
+
+               /* TODO: check for duplicate (re-)assignment */
                if (type == CPUID_TYPE_SMT)
-                       cpu_logical = cnt;
+                       core_id_shift = bits;
                else if (type == CPUID_TYPE_CORE)
-                       cpu_cores = cnt;
+                       pkg_id_shift = bits;
+               else
+                       printf("unknown CPU level type %d\n", type);
+       }
+
+       if (pkg_id_shift < core_id_shift) {
+               printf("WARNING: core covers more APIC IDs than a package\n");
+               core_id_shift = pkg_id_shift;
+       }
+}
+
+static void
+topo_probe_intel_caches(void)
+{
+       u_int p[4];
+       int level;
+       int share_count;
+       int type;
+       int i;
+
+       if (cpu_high < 0x4) {
+               /*
+                * Available cache level and sizes can be determined
+                * via CPUID leaf 2, but that requires a huge table of hardcoded
+                * values, so for now just assume L1 and L2 caches potentially
+                * shared only by HTT processing units, if HTT is present.
+                */
+               caches[0].id_shift = pkg_id_shift;
+               caches[0].present = 1;
+               caches[1].id_shift = pkg_id_shift;
+               caches[1].present = 1;
+               return;
+       }
+
+       for (i = 0; ; i++) {
+               cpuid_count(0x4, i, p);
+               type = p[0] & 0x1f;
+               level = (p[0] >> 5) & 0x7;
+               share_count = 1 + ((p[0] >> 14) & 0xfff);
+
+               if (!add_deterministic_cache(type, level, share_count))
+                       break;
        }
-       if (cpu_logical == 0)
-               cpu_logical = 1;
-       cpu_cores /= cpu_logical;
+}
+
+static void
+topo_probe_intel(void)
+{
+
+       /*
+        * See Intel(R) 64 Architecture Processor
+        * Topology Enumeration article for details.
+        *
+        * Note that the 0x1 <= cpu_high < 4 case should be
+        * compatible with topo_probe_intel_0x4() logic when
+        * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
+        * or it should trigger the fallback otherwise.
+        */
+       if (cpu_high >= 0xb)
+               topo_probe_intel_0xb();
+       else if (cpu_high >= 0x1)
+               topo_probe_intel_0x4();
+
+       topo_probe_intel_caches();
 }
 
 /*
- * Both topology discovery code and code that consumes topology
- * information assume top-down uniformity of the topology.
- * That is, all physical packages must be identical and each
- * core in a package must have the same number of threads.
  * Topology information is queried only on BSP, on which this
  * code runs and for which it can query CPUID information.
- * Then topology is extrapolated on all packages using the
- * uniformity assumption.
+ * Then the topology is extrapolated to all packages using the
+ * assumption that the mapping from APIC ID to hardware component
+ * ID is homogeneous.
+ * That doesn't necessarily imply that the topology is uniform.
  */
 void
 topo_probe(void)
 {
        static int cpu_topo_probed = 0;
+       struct x86_topo_layer {
+               int type;
+               int subtype;
+               int id_shift;
+       } topo_layers[MAX_CACHE_LEVELS + 3];
+       struct topo_node *parent;
+       struct topo_node *node;
+       int layer;
+       int nlayers;
+       int node_id;
+       int i;
 
        if (cpu_topo_probed)
                return;
 
        CPU_ZERO(&logical_cpus_mask);
+
        if (mp_ncpus <= 1)
-               cpu_cores = cpu_logical = 1;
+               ; /* nothing */
        else if (cpu_vendor_id == CPU_VENDOR_AMD)
                topo_probe_amd();
-       else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
-               /*
-                * See Intel(R) 64 Architecture Processor
-                * Topology Enumeration article for details.
-                *
-                * Note that 0x1 <= cpu_high < 4 case should be
-                * compatible with topo_probe_0x4() logic when
-                * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
-                * or it should trigger the fallback otherwise.
-                */
-               if (cpu_high >= 0xb)
-                       topo_probe_0xb();
-               else if (cpu_high >= 0x1)
-                       topo_probe_0x4();
-       }
+       else if (cpu_vendor_id == CPU_VENDOR_INTEL)
+               topo_probe_intel();
+
+       KASSERT(pkg_id_shift >= core_id_shift,
+           ("bug in APIC topology discovery"));
+
+       nlayers = 0;
+       bzero(topo_layers, sizeof(topo_layers));
+
+       topo_layers[nlayers].type = TOPO_TYPE_PKG;
+       topo_layers[nlayers].id_shift = pkg_id_shift;
+       if (bootverbose)
+               printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
+       nlayers++;
 
        /*
-        * Fallback: assume each logical CPU is in separate
-        * physical package.  That is, no multi-core, no SMT.
-        */
-       if (cpu_cores == 0 || cpu_logical == 0)
-               cpu_cores = cpu_logical = 1;
+        * Consider all caches to be within a package/chip
+        * and "in front" of all sub-components like
+        * cores and hardware threads.
+        */
+       for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
+               if (caches[i].present) {
+                       KASSERT(caches[i].id_shift <= pkg_id_shift,
+                               ("bug in APIC topology discovery"));
+                       KASSERT(caches[i].id_shift >= core_id_shift,
+                               ("bug in APIC topology discovery"));
+
+                       topo_layers[nlayers].type = TOPO_TYPE_CACHE;
+                       topo_layers[nlayers].subtype = i + 1;
+                       topo_layers[nlayers].id_shift = caches[i].id_shift;
+                       if (bootverbose)
+                               printf("L%u cache ID shift: %u\n",
+                                   topo_layers[nlayers].subtype,
+                                   topo_layers[nlayers].id_shift);
+                       nlayers++;
+               }
+       }
+
+       if (pkg_id_shift > core_id_shift) {
+               topo_layers[nlayers].type = TOPO_TYPE_CORE;
+               topo_layers[nlayers].id_shift = core_id_shift;
+               if (bootverbose)
+                       printf("Core ID shift: %u\n",
+                           topo_layers[nlayers].id_shift);
+               nlayers++;
+       }
+
+       topo_layers[nlayers].type = TOPO_TYPE_PU;
+       topo_layers[nlayers].id_shift = 0;
+       nlayers++;
+
+       topo_init_root(&topo_root);
+       for (i = 0; i <= MAX_APIC_ID; ++i) {
+               if (!cpu_info[i].cpu_present)
+                       continue;
+
+               parent = &topo_root;
+               for (layer = 0; layer < nlayers; ++layer) {
+                       node_id = i >> topo_layers[layer].id_shift;
+                       parent = topo_add_node_by_hwid(parent, node_id,
+                           topo_layers[layer].type,
+                           topo_layers[layer].subtype);
+               }
+       }
+
+       parent = &topo_root;
+       for (layer = 0; layer < nlayers; ++layer) {
+               node_id = boot_cpu_id >> topo_layers[layer].id_shift;
+               node = topo_find_node_by_hwid(parent, node_id,
+                   topo_layers[layer].type,
+                   topo_layers[layer].subtype);
+               topo_promote_child(node);
+               parent = node;
+       }
+
        cpu_topo_probed = 1;
 }
 
-struct cpu_group *
-cpu_topo(void)
+/*
+ * Assign logical CPU IDs to local APICs.
+ */
+void
+assign_cpu_ids(void)
 {
-       int cg_flags;
+       struct topo_node *node;
+       u_int smt_mask;
+
+       smt_mask = (1u << core_id_shift) - 1;
 
        /*
-        * Determine whether any threading flags are
-        * necessry.
+        * Assign CPU IDs to local APIC IDs and disable any CPUs
+        * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
         */
-       topo_probe();
-       if (cpu_logical > 1 && hyperthreading_cpus)
-               cg_flags = CG_FLAG_HTT;
-       else if (cpu_logical > 1)
-               cg_flags = CG_FLAG_SMT;
+       mp_ncpus = 0;
+       TOPO_FOREACH(node, &topo_root) {
+               if (node->type != TOPO_TYPE_PU)
+                       continue;
+
+               if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
+                       cpu_info[node->hwid].cpu_hyperthread = 1;
+
+               if (resource_disabled("lapic", node->hwid)) {
+                       if (node->hwid != boot_cpu_id)
+                               cpu_info[node->hwid].cpu_disabled = 1;
+                       else
+                               printf("Cannot disable BSP, APIC ID = %d\n",
+                                   node->hwid);
+               }
+
+               if (!hyperthreading_allowed &&
+                   cpu_info[node->hwid].cpu_hyperthread)
+                       cpu_info[node->hwid].cpu_disabled = 1;
+
+               if (mp_ncpus >= MAXCPU)
+                       cpu_info[node->hwid].cpu_disabled = 1;
+
+               if (cpu_info[node->hwid].cpu_disabled) {
+                       disabled_cpus++;
+                       continue;
+               }
+
+               cpu_apic_ids[mp_ncpus] = node->hwid;
+               apic_cpuids[node->hwid] = mp_ncpus;
+               topo_set_pu_id(node, mp_ncpus);
+               mp_ncpus++;
+       }
+
+       KASSERT(mp_maxid >= mp_ncpus - 1,
+           ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
+           mp_ncpus));
+}
+
+/*
+ * Print various information about the SMP system hardware and setup.
+ */
+void
+cpu_mp_announce(void)
+{
+       struct topo_node *node;
+       const char *hyperthread;
+       int pkg_count;
+       int cores_per_pkg;
+       int thrs_per_core;
+
+       printf("FreeBSD/SMP: ");
+       if (topo_analyze(&topo_root, 1, &pkg_count,
+           &cores_per_pkg, &thrs_per_core)) {
+               printf("%d package(s)", pkg_count);
+               if (cores_per_pkg > 0)
+                       printf(" x %d core(s)", cores_per_pkg);
+               if (thrs_per_core > 1)
+                   printf(" x %d hardware threads", thrs_per_core);
+       } else {
+               printf("Non-uniform topology");
+       }
+       printf("\n");
+
+       if (disabled_cpus) {
+               printf("FreeBSD/SMP Online: ");
+               if (topo_analyze(&topo_root, 0, &pkg_count,
+                   &cores_per_pkg, &thrs_per_core)) {
+                       printf("%d package(s)", pkg_count);
+                       if (cores_per_pkg > 0)
+                               printf(" x %d core(s)", cores_per_pkg);
+                       if (thrs_per_core > 1)
+                           printf(" x %d hardware threads", thrs_per_core);
+               } else {
+                       printf("Non-uniform topology");
+               }
+               printf("\n");
+       }
+
+       if (!bootverbose)
+               return;
+
+       TOPO_FOREACH(node, &topo_root) {
+               switch (node->type) {
+               case TOPO_TYPE_PKG:
+                       printf("Package HW ID = %u (%#x)\n",
+                           node->hwid, node->hwid);
+                       break;
+               case TOPO_TYPE_CORE:
+                       printf("\tCore HW ID = %u (%#x)\n",

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***