On Fri, Jun 17, 2016 at 12:43:22PM -0500, Josh Poimboeuf wrote:
> NOTE: I didn't include any performance numbers because I wasn't able to
> get consistent results.  I tried the following on a Xeon E5-2420 v2 CPU:
>
>   $ for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do echo -n performance > $i; done
>   $ echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
>   $ echo 100 > /sys/devices/system/cpu/intel_pstate/min_perf_pct
>   $ echo 0 > /proc/sys/kernel/nmi_watchdog
>   $ taskset 0x10 perf stat -n -r10 perf bench sched pipe -l 1000000
>
> I was going to post the numbers from that, both with and without
> SCHEDSTATS, but then when I tried to repeat the test on a different day,
> the results were surprisingly different, with different conclusions.
>
> So any advice on measuring scheduler performance would be appreciated...
Yeah, it's a bit of a pain in general...

A) perf stat --null --repeat 50 -- perf bench sched messaging -g 50 -l 5000 | grep "seconds time elapsed"
B) perf stat --null --repeat 50 -- taskset 1 perf bench sched pipe | grep "seconds time elapsed"

1) tip/master + patches 1-4
2) tip/master + patches 1-5
3) tip/master + patches 1-5 + the reorder patch below

         1                2                3

A)  4.627767855      4.650429917      4.646208062
    4.633921933      4.641424424      4.612021058
    4.649536375      4.663144144      4.636815948
    4.630165619      4.649053552      4.613022902

B)  1.770732957      1.789534273      1.773334291
    1.761740716      1.795618428      1.773338681
    1.763761666      1.822316496      1.774385589

From this it looks like patch 5 does hurt a wee bit, but we can get most
of that back by reordering the structure a bit.

The results seem 'stable' across rebuilds and reboots (at the end I popped
all the patches, rebuilt, rebooted and re-benched configuration 1 again,
and obtained similar results).

Although it's possible that if we reorder first and then apply patch 5,
we'd just see a bigger regression. I've not bothered to check.

---
 include/linux/sched.h | 33 +++++++++++++++------------------
 kernel/sched/core.c   |  4 ++--
 kernel/sched/debug.c  |  6 +++---
 3 files changed, 20 insertions(+), 23 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1220,7 +1220,7 @@ struct uts_namespace;
 struct load_weight {
	unsigned long weight;
	u32 inv_weight;
-};
+} __packed;
 
 /*
  * The load_avg/util_avg accumulates an infinite geometric series
@@ -1315,44 +1315,40 @@ struct sched_statistics {
 
 struct sched_entity {
	struct load_weight	load;		/* for load-balancing */
+	unsigned int		on_rq;
	struct rb_node		run_node;
	struct list_head	group_node;
-	unsigned int		on_rq;
 
-	u64			exec_start;
+	u64			exec_start ____cacheline_aligned_in_smp;
	u64			sum_exec_runtime;
	u64			vruntime;
	u64			prev_sum_exec_runtime;
-
-	u64			nr_migrations;
-
	u64			wait_start;
	u64			sleep_start;
	u64			block_start;
+#ifdef CONFIG_SMP
+	/*
+	 * Per entity load average tracking.
+	 */
+	struct sched_avg	avg ____cacheline_aligned_in_smp;
+#endif
 
 #ifdef CONFIG_SCHEDSTATS
	struct sched_statistics statistics;
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	int			depth;
+	/*
+	 * mostly constant values, separate from modifications above
+	 */
+	int			depth ____cacheline_aligned_in_smp;
	struct sched_entity	*parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq		*cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq		*my_q;
 #endif
-
-#ifdef CONFIG_SMP
-	/*
-	 * Per entity load average tracking.
-	 *
-	 * Put into separate cache line so it does not
-	 * collide with read-mostly values above.
-	 */
-	struct sched_avg	avg ____cacheline_aligned_in_smp;
-#endif
-};
+} ____cacheline_aligned_in_smp;
 
 struct sched_rt_entity {
	struct list_head run_list;
@@ -1475,6 +1471,7 @@ struct task_struct {
	int prio, static_prio, normal_prio;
	unsigned int rt_priority;
	const struct sched_class *sched_class;
+	u64 nr_migrations;
	struct sched_entity se;
	struct sched_rt_entity rt;
 #ifdef CONFIG_CGROUP_SCHED
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1239,7 +1239,7 @@ void set_task_cpu(struct task_struct *p,
	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p);
-		p->se.nr_migrations++;
+		p->nr_migrations++;
		perf_event_task_migrate(p);
	}
 
@@ -2167,7 +2167,7 @@ static void __sched_fork(unsigned long c
	p->se.exec_start		= 0;
	p->se.sum_exec_runtime		= 0;
	p->se.prev_sum_exec_runtime	= 0;
-	p->se.nr_migrations		= 0;
+	p->nr_migrations		= 0;
	p->se.vruntime			= 0;
	INIT_LIST_HEAD(&p->se.group_node);
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -885,7 +885,7 @@ void proc_sched_show_task(struct task_st
 
	nr_switches = p->nvcsw + p->nivcsw;
 
-	P(se.nr_migrations);
+	P(nr_migrations);
 
	PN(se.wait_start);
	PN(se.sleep_start);
@@ -926,9 +926,9 @@ void proc_sched_show_task(struct task_st
		avg_atom = -1LL;
 
	avg_per_cpu = p->se.sum_exec_runtime;
-	if (p->se.nr_migrations) {
+	if (p->nr_migrations) {
		avg_per_cpu = div64_u64(avg_per_cpu,
-					p->se.nr_migrations);
+					p->nr_migrations);
	} else {
		avg_per_cpu = -1LL;
	}
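For reference, the point of the ____cacheline_aligned_in_smp games above is
plain false-sharing avoidance: the per-entity load average gets written from
remote CPUs, so it wants its own cache line, away from the read-mostly fields.
A minimal userspace sketch of the same layout trick (the struct, the field
names and the 64-byte line size are illustrative assumptions, not the kernel
definitions):

/* layout.c - sketch of cache-line separation.
 * Build: gcc -std=c11 -O2 layout.c -o layout && ./layout
 */
#include <stdalign.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHELINE 64	/* assumed L1 line size */

struct entity {
	/* read-mostly: stays clean and shared across CPU caches */
	unsigned long	weight;
	uint32_t	inv_weight;

	/* hot, frequently written: alignas() starts a new cache line, so
	 * stores here do not invalidate the read-mostly line above */
	alignas(CACHELINE) uint64_t	load_sum;
	uint32_t	util_sum;
};

int main(void)
{
	/* verify the written fields landed on their own line */
	printf("weight   @ offset %zu\n", offsetof(struct entity, weight));
	printf("load_sum @ offset %zu\n", offsetof(struct entity, load_sum));
	printf("sizeof(struct entity) = %zu\n", sizeof(struct entity));
	return 0;
}

The flip side is visible in the numbers above: every such split rounds the
structure up to a multiple of the line size, so over-doing it bloats
task_struct and can regress just as easily; presumably that's also what the
__packed on load_weight (16 -> 12 bytes on 64-bit) is clawing back.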