Currently sched_debug can be added to the kernel commandline parameters
to dump domain information during boot. This method is not practical with
a large number of CPUs.

This patch adds per-cpu entries to debugfs under a sched directory.
Reading the per-cpu file shows the domain information in a human-readable
format:

$ cat /sys/kernel/debug/sched/cpu0
domain 0 / SMT:
    flags: 0x2af:  load-balance new-idle exec fork affine cpu-capacity share-pkg-resources
    span: 0-7
    groups:
        0 (cpu_capacity = 147)
        1 (cpu_capacity = 147)
        2 (cpu_capacity = 147)
        3 (cpu_capacity = 147)
        4 (cpu_capacity = 147)
        5 (cpu_capacity = 147)
        6 (cpu_capacity = 147)
        7 (cpu_capacity = 147)

domain 2 / DIE:
    flags: 0x102f:  load-balance new-idle exec fork affine prefer-sibling
    span: 0-127
    groups:
        0-7 (cpu_capacity = 1176)
        8-15 (cpu_capacity = 1176)
        16-23 (cpu_capacity = 1176)
        24-31 (cpu_capacity = 1176)
        32-39 (cpu_capacity = 1176)
        40-47 (cpu_capacity = 1176)
        48-55 (cpu_capacity = 1176)
        56-63 (cpu_capacity = 1176)
        64-71 (cpu_capacity = 1176)
        72-79 (cpu_capacity = 1176)
        80-87 (cpu_capacity = 1176)
        88-95 (cpu_capacity = 1176)
        96-103 (cpu_capacity = 1176)
        104-111 (cpu_capacity = 1176)
        112-119 (cpu_capacity = 1176)
        120-127 (cpu_capacity = 1176)

domain 3 / NUMA:
    flags: 0x642f:  load-balance new-idle exec fork affine serialize overlap numa
    span: 0-1023
    groups:
        0-127 (cpu_capacity = 18816)
        128-255 (cpu_capacity = 18816)
        256-383 (cpu_capacity = 18816)
        384-511 (cpu_capacity = 18816)
        512-639 (cpu_capacity = 18816)
        640-767 (cpu_capacity = 18816)
        768-895 (cpu_capacity = 18816)
        896-1023 (cpu_capacity = 18816)

Before spending too much time formalizing this I wanted to see if you guys
would entertain the idea of making this info available via debugfs. It does
move the existing sched_features file to sched/features -- not sure how 
acceptable it is to move files in debugfs.

TO-DO: handle hotplug

Signed-off-by: David Ahern <david.ah...@oracle.com>
---
 kernel/sched/core.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 164 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62671f53202a..b4d8d0c8260e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -268,12 +268,173 @@ static const struct file_operations sched_feat_fops = {
        .release        = single_release,
 };
 
+static const char * const sd_flag_names[] = {  /* entry i is the printable name of bit i (1 << i) in sd->flags */
+       "load-balance",         /* bit 0  (0x0001) */
+       "new-idle",             /* bit 1  (0x0002) */
+       "exec",                 /* bit 2  (0x0004) */
+       "fork",                 /* bit 3  (0x0008) */
+       "wake",                 /* bit 4  (0x0010) */
+       "affine",               /* bit 5  (0x0020) */
+       "",                     /* bit 6  (0x0040): no name, prints as an empty string */
+       "cpu-capacity",         /* bit 7  (0x0080) */
+       "power-domain",         /* bit 8  (0x0100) */
+       "share-pkg-resources",  /* bit 9  (0x0200) */
+       "serialize",            /* bit 10 (0x0400) */
+       "asym-packing",         /* bit 11 (0x0800) */
+       "prefer-sibling",       /* bit 12 (0x1000) */
+       "overlap",              /* bit 13 (0x2000) */
+       "numa",                 /* bit 14 (0x4000) */
+       "",                     /* bit 15 (0x8000): no name, prints as an empty string */
+};
+/*
+ * Print one scheduling domain of @cpu into the seq_file @m.
+ *
+ * Output consists of the domain level and name, the raw flags word followed
+ * by the decoded names from sd_flag_names[], the CPU span, and one line per
+ * group listing its CPUs (plus cpu_capacity when it differs from
+ * SCHED_CAPACITY_SCALE). Inconsistencies detected along the way are reported
+ * inline as "ERROR:" lines instead of aborting the dump.
+ *
+ * @m:   seq_file receiving the text
+ * @sd:  domain to describe; must not be NULL
+ * @cpu: CPU whose domain hierarchy is being dumped
+ */
+static void sched_cpu_domain_show(struct seq_file *m, struct sched_domain *sd,
+                                 int cpu)
+{
+       /*
+        * Union of the CPUs of every group visited so far.
+        * NOTE(review): a struct cpumask on the stack grows with NR_CPUS;
+        * consider cpumask_var_t if this is built for very large configs.
+        */
+       struct cpumask groupmask;
+       struct sched_group *group = sd->groups;
+       unsigned int i;
+
+       cpumask_clear(&groupmask);
+
+       seq_printf(m, "domain %d / %s:\n", sd->level, sd->name);
+       seq_printf(m, "    flags: 0x%x: ", sd->flags);
+
+       /* Bit i of sd->flags is described by sd_flag_names[i]. */
+       for (i = 0; i < ARRAY_SIZE(sd_flag_names); ++i) {
+               if (sd->flags & (1U << i))
+                       seq_printf(m, " %s", sd_flag_names[i]);
+       }
+       seq_puts(m, "\n");
+
+       if (!(sd->flags & SD_LOAD_BALANCE) && sd->parent)
+               seq_puts(m, "           ERROR: !SD_LOAD_BALANCE domain has parent\n");
+
+       seq_printf(m, "    span: %*pbl\n",
+                  cpumask_pr_args(sched_domain_span(sd)));
+
+       if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+               seq_printf(m, "    ERROR: domain->span does not contain CPU%d\n", cpu);
+
+       /*
+        * Only look inside the first group if there is one; a NULL group
+        * list is reported by the loop below rather than dereferenced here.
+        */
+       if (group && !cpumask_test_cpu(cpu, sched_group_cpus(group)))
+               seq_printf(m, "    ERROR: domain->groups does not contain CPU%d\n", cpu);
+
+       seq_puts(m, "    groups:\n");
+       do {
+               if (!group) {
+                       seq_puts(m, "            ERROR: group is NULL\n");
+                       break;
+               }
+
+               /*
+                * Even though we initialize ->capacity to something semi-sane,
+                * we leave capacity_orig unset. This allows us to detect if
+                * domain iteration is still funny without causing /0 traps.
+                */
+               if (!group->sgc->capacity_orig) {
+                       seq_puts(m, "        ERROR: domain->cpu_capacity not set\n");
+                       break;
+               }
+
+               if (!cpumask_weight(sched_group_cpus(group))) {
+                       seq_puts(m, "        ERROR: empty group\n");
+                       break;
+               }
+
+               /* Groups may share CPUs only when the domain is SD_OVERLAP. */
+               if (!(sd->flags & SD_OVERLAP) &&
+                   cpumask_intersects(&groupmask, sched_group_cpus(group))) {
+                       seq_puts(m, "        ERROR: repeated CPUs\n");
+                       break;
+               }
+
+               cpumask_or(&groupmask, &groupmask, sched_group_cpus(group));
+
+               seq_printf(m, "        %*pbl",
+                          cpumask_pr_args(sched_group_cpus(group)));
+
+               if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+                       seq_printf(m, " (cpu_capacity = %d)",
+                                  group->sgc->capacity);
+               }
+               seq_puts(m, "\n");
+
+               /* The group list is circular: stop once back at the head. */
+               group = group->next;
+       } while (group != sd->groups);
+
+       if (!cpumask_equal(sched_domain_span(sd), &groupmask))
+               seq_puts(m, "    ERROR: groups don't span domain->span\n");
+
+       if (sd->parent &&
+           !cpumask_subset(&groupmask, sched_domain_span(sd->parent))) {
+               seq_puts(m, "    ERROR: parent span is not a superset of domain->span\n");
+       }
+}
+
+/*
+ * seq_file show callback for a per-CPU debugfs file (sched/cpuN).
+ *
+ * The CPU number is carried in m->private (stored there as a long cast to a
+ * pointer when the file was created). Every domain attached to that CPU is
+ * printed via sched_cpu_domain_show(), separated by blank lines.
+ *
+ * Always returns 0; an out-of-range CPU is reported in the output text.
+ */
+static int sched_cpu_show(struct seq_file *m, void *unused)
+{
+       struct sched_domain *sd;
+       int cpu = (int) ((long) m->private);
+
+       /* Valid CPU numbers are 0 .. CONFIG_NR_CPUS - 1. */
+       if (cpu < 0 || cpu >= CONFIG_NR_CPUS) {
+               seq_printf(m, "invalid CPU, %d\n", cpu);
+               return 0;
+       }
+
+       /*
+        * The domain tree walked by for_each_domain() is RCU-protected and
+        * can be rebuilt concurrently, so hold the RCU read lock for the
+        * whole walk.
+        */
+       rcu_read_lock();
+       for_each_domain(cpu, sd) {
+               sched_cpu_domain_show(m, sd, cpu);
+               seq_puts(m, "\n");
+       }
+       rcu_read_unlock();
+
+       return 0;
+}
+
+/*
+ * open() handler for the per-CPU debugfs files: hands the value stored in
+ * inode->i_private to sched_cpu_show() through single_open().
+ */
+static int sched_cpu_open(struct inode *inode, struct file *filp)
+{
+       void *cpu_cookie = inode->i_private;
+
+       return single_open(filp, sched_cpu_show, cpu_cookie);
+}
+static const struct file_operations sched_cpu_fops = { /* file operations for the per-CPU files created by sched_debugfs_add_cpu() */
+       .open           = sched_cpu_open,       /* binds sched_cpu_show() via single_open() */
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+static struct dentry *d_sched_debug;   /* debugfs "sched" directory, set in sched_init_debug() */
+static struct dentry *d_sched_cpu[CONFIG_NR_CPUS];     /* "cpuN" file dentries, indexed by CPU number */
+
+/*
+ * Create the read-only debugfs file "cpu<N>" for @cpu under d_sched_debug
+ * and remember its dentry in d_sched_cpu[cpu]. The CPU number is passed to
+ * the file as its private data. A failed creation is only logged; the
+ * function always returns 0.
+ */
+static int sched_debugfs_add_cpu(int cpu)
+{
+       struct dentry *entry;
+       char name[32];
+
+       snprintf(name, sizeof(name), "cpu%d", cpu);
+
+       /* Widen to long first so the int-to-pointer cast is size-clean. */
+       entry = debugfs_create_file(name, 0444, d_sched_debug,
+                                   (void *) (long) cpu, &sched_cpu_fops);
+       d_sched_cpu[cpu] = entry;
+
+       if (!entry)
+               pr_warn("Failed to create debugfs entry for cpu %d\n", cpu);
+
+       return 0;
+}
+
 static __init int sched_init_debug(void)
 {
-       debugfs_create_file("sched_features", 0644, NULL, NULL,
+       int cpu;
+       int rc = 0;
+
+       d_sched_debug = debugfs_create_dir("sched", NULL);      /* parent directory for all scheduler debugfs files */
+       if (!d_sched_debug) {
+               pr_warn("Could not create debugfs 'sched' entry\n");
+               return 0;       /* failure is logged only; the initcall still reports success */
+       }
+
+       debugfs_create_file("features", 0644, d_sched_debug, NULL,     /* takes the place of the old top-level "sched_features" file */
                        &sched_feat_fops);
 
-       return 0;
+       for_each_online_cpu(cpu) {      /* one cpuN file per CPU that is online at init time */
+               rc = sched_debugfs_add_cpu(cpu);
+               if (rc)
+                       goto out;
+       }
+
+out:
+       return rc;
 }
 late_initcall(sched_init_debug);
 #endif /* CONFIG_SCHED_DEBUG */
@@ -6689,7 +6850,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 
                if (!cpumask_subset(sched_domain_span(child),
                                    sched_domain_span(sd))) {
-                       pr_err("BUG: arch topology borken\n");
+                       pr_err("BUG: arch topology broken\n");
 #ifdef CONFIG_SCHED_DEBUG
                        pr_err("     the %s domain not a subset of the %s domain\n",
                                        child->name, sd->name);
-- 
2.3.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to