fair: Fix how load gets propagated from cfs_rq to its sched_entity

Peter Zijlstra Wed, 03 May 2017 11:01:52 -0700


This is on my IVB-EP, 2 sockets, 10 cores / socket, 2 threads / core.


workload is constrained to 1 socket.

root@ivb-ep:~/bench/schbench# numactl -N 0 ./schbench -m 2 -t 10 -s 10000 -c 
15000 -r 30
Latency percentiles (usec)
        50.0000th: 21
        75.0000th: 30
        90.0000th: 38
        95.0000th: 42
        *99.0000th: 49
        99.5000th: 53
        99.9000th: 15024
        min=0, max=15056


root@ivb-ep:~/bench/schbench# echo NO_FUDGE > /debug/sched_features; echo 
NO_FUDGE2 > /debug/sched_features ; mkdir /cgroup/ponies; echo $$ > 
/cgroup/ponies/tasks ; numactl -N 0 ./schbench -m 2 -t 10 -s 10000 -c 15000 -r 
30 ; echo $$ > /cgroup/tasks ; rmdir /cgroup/ponies
Latency percentiles (usec)
        50.0000th: 14
        75.0000th: 19
        90.0000th: 24
        95.0000th: 28
        *99.0000th: 57
        99.5000th: 15024
        99.9000th: 15024
        min=0, max=15060


root@ivb-ep:~/bench/schbench# echo NO_FUDGE > /debug/sched_features; echo 
FUDGE2 > /debug/sched_features ; mkdir /cgroup/ponies; echo $$ > 
/cgroup/ponies/tasks ; numactl -N 0 ./schbench -m 2 -t 10 -s 10000 -c 15000 -r 
30 ; echo $$ > /cgroup/tasks ; rmdir /cgroup/ponies
Latency percentiles (usec)
        50.0000th: 14
        75.0000th: 19
        90.0000th: 24
        95.0000th: 26
        *99.0000th: 38
        99.5000th: 49
        99.9000th: 9648
        min=0, max=15035


root@ivb-ep:~/bench/schbench# echo FUDGE > /debug/sched_features; echo FUDGE2 > 
/debug/sched_features ; mkdir /cgroup/ponies; echo $$ > /cgroup/ponies/tasks ; 
numactl -N 0 ./schbench -m 2 -t 10 -s 10000 -c 15000 -r 30 ; echo $$ > 
/cgroup/tasks ; rmdir /cgroup/ponies
Latency percentiles (usec)
        50.0000th: 14
        75.0000th: 19
        90.0000th: 24
        95.0000th: 27
        *99.0000th: 3060
        99.5000th: 7848
        99.9000th: 15024
        min=0, max=15041


root@ivb-ep:~/bench/schbench# echo 0 > /sys/module/fair/parameters/prop_type
root@ivb-ep:~/bench/schbench# echo FUDGE > /debug/sched_features; echo FUDGE2 > 
/debug/sched_features ; mkdir /cgroup/ponies; echo $$ > /cgroup/ponies/tasks ; 
numactl -N 0 ./schbench -m 2 -t 10 -s 10000 -c 15000 -r 30 ; echo $$ > 
/cgroup/tasks ; rmdir /cgroup/ponies
Latency percentiles (usec)
        50.0000th: 14
        75.0000th: 19
        90.0000th: 24
        95.0000th: 27
        *99.0000th: 52
        99.5000th: 4712
        99.9000th: 14640
        min=0, max=15033


Just FUDGE2 on its own seems to be the best on my system and is a change
that makes sense (and something Paul recently pointed out as well).

The implementation isn't particularly pretty or fast, but should
illustrate the idea.

Poking at the whole update_tg_cfs_load() thing only makes it worse after
that. And while I agree that that code is mind bending; it seems to work
OK-ish.

Tejun, Vincent, could you guys have a poke?

The thing is that if we assume se->avg.load_avg is correct, we should
already compute a correct cfs_rq->runnable_load_avg, we do all that
propagation right.

But because se->avg.load_avg is stuck in the 'past' because it's sum is
based on all its old weight, things don't quite work out. If we otoh
treat it as a runnable_sum and scale with weight, it seems to work out
fine.

Arguably sys_nice and all related crud should do the same, but nobody
really uses nice at any frequency, whereas we constantly change the
weight of our group entities.

---
 kernel/sched/fair.c     | 184 +++++++++++++++++++++++++++++++-----------------
 kernel/sched/features.h |   2 +
 2 files changed, 122 insertions(+), 64 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cd6c3f9..d6a33e6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,6 +32,7 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/moduleparam.h>
 
 #include <trace/events/sched.h>
 
@@ -2632,16 +2633,39 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+
+enum shares_type {
+       shares_runnable,
+       shares_avg,
+       shares_weight,
+};
+
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, enum shares_type 
shares_type)
 {
-       long tg_weight, load, shares;
+       long tg_weight, tg_shares, load, shares;
+       struct task_group *tg = cfs_rq->tg;
 
-       /*
-        * This really should be: cfs_rq->avg.load_avg, but instead we use
-        * cfs_rq->load.weight, which is its upper bound. This helps ramp up
-        * the shares for small weight interactive tasks.
-        */
-       load = scale_load_down(cfs_rq->load.weight);
+       tg_shares = READ_ONCE(tg->shares);
+
+       switch (shares_type) {
+       case shares_runnable:
+               load = cfs_rq->runnable_load_avg;
+               break;
+
+       default:
+       case shares_avg:
+               load = cfs_rq->avg.load_avg;
+               break;
+
+       case shares_weight:
+               /*
+                * This really should be: cfs_rq->avg.load_avg, but instead we
+                * use cfs_rq->load.weight, which is its upper bound. This
+                * helps ramp up the shares for small weight interactive tasks.
+                */
+               load = scale_load_down(cfs_rq->load.weight);
+               break;
+       }
 
        tg_weight = atomic_long_read(&tg->load_avg);
 
@@ -2665,23 +2689,33 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, 
struct task_group *tg)
         * case no task is runnable on a CPU MIN_SHARES=2 should be returned
         * instead of 0.
         */
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       if (shares > tg->shares)
-               shares = tg->shares;
-
-       return shares;
-}
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group 
*tg)
-{
-       return tg->shares;
+       return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
 # endif /* CONFIG_SMP */
 
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {                          \
+       typeof(_ptr) ptr = (_ptr);                              \
+       typeof(*ptr) val = (_val);                              \
+       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+       res = var - val;                                        \
+       if (res > var)                                          \
+               res = 0;                                        \
+       WRITE_ONCE(*ptr, res);                                  \
+} while (0)
+
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                            unsigned long weight)
 {
+       if (se->load.weight == weight)
+               return;
+
        if (se->on_rq) {
                /* commit outstanding execution time */
                if (cfs_rq->curr == se)
@@ -2689,10 +2723,40 @@ static void reweight_entity(struct cfs_rq *cfs_rq, 
struct sched_entity *se,
                account_entity_dequeue(cfs_rq, se);
        }
 
+       if (sched_feat(FUDGE2)) {
+               unsigned long new_weight = max(scale_load_down(weight), 1UL);
+               unsigned long old_weight = 
max(scale_load_down(se->load.weight), 1UL);
+
+               sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+               sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+
+               if (se->on_rq) {
+                       sub_positive(&cfs_rq->runnable_load_avg, 
se->avg.load_avg);
+                       sub_positive(&cfs_rq->runnable_load_sum, 
se->avg.load_sum);
+               }
+
+               se->avg.load_avg *= new_weight;
+               se->avg.load_sum *= new_weight;
+
+               se->avg.load_avg /= old_weight;
+               se->avg.load_sum /= old_weight;
+       }
+
        update_load_set(&se->load, weight);
 
-       if (se->on_rq)
+       if (se->on_rq) {
                account_entity_enqueue(cfs_rq, se);
+       }
+
+       if (sched_feat(FUDGE2)) {
+               cfs_rq->avg.load_avg += se->avg.load_avg;
+               cfs_rq->avg.load_sum += se->avg.load_sum;
+
+               if (se->on_rq) {
+                       cfs_rq->runnable_load_avg += se->avg.load_avg;
+                       cfs_rq->runnable_load_sum += se->avg.load_sum;
+               }
+       }
 }
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -2700,7 +2764,6 @@ static inline int throttled_hierarchy(struct cfs_rq 
*cfs_rq);
 static void update_cfs_shares(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq = group_cfs_rq(se);
-       struct task_group *tg;
        long shares;
 
        if (!cfs_rq)
@@ -2709,13 +2772,14 @@ static void update_cfs_shares(struct sched_entity *se)
        if (throttled_hierarchy(cfs_rq))
                return;
 
-       tg = cfs_rq->tg;
-
 #ifndef CONFIG_SMP
-       if (likely(se->load.weight == tg->shares))
+       shares = READ_ONCE(cfs_rq->tg->shares);
+
+       if (likely(se->load.weight == shares))
                return;
+#else
+       shares = calc_cfs_shares(cfs_rq, shares_weight);
 #endif
-       shares = calc_cfs_shares(cfs_rq, tg);
 
        reweight_entity(cfs_rq_of(se), se, shares);
 }
@@ -3070,42 +3134,51 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
        cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
 }
 
+static int prop_type = shares_avg;
+
+module_param(prop_type, int, 0644);
+
 /* Take into account change of load of a child task group */
 static inline void
 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-       long delta, load = gcfs_rq->avg.load_avg;
+       long delta, load;
 
        /*
         * If the load of group cfs_rq is null, the load of the
         * sched_entity will also be null so we can skip the formula
         */
-       if (load) {
-               long tg_load;
+       if (!sched_feat(FUDGE)) {
+               load = gcfs_rq->avg.load_avg;
+               if (load) {
+                       long tg_load;
 
-               /* Get tg's load and ensure tg_load > 0 */
-               tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+                       /* Get tg's load and ensure tg_load > 0 */
+                       tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
 
-               /* Ensure tg_load >= load and updated with current load*/
-               tg_load -= gcfs_rq->tg_load_avg_contrib;
-               tg_load += load;
+                       /* Ensure tg_load >= load and updated with current 
load*/
+                       tg_load -= gcfs_rq->tg_load_avg_contrib;
+                       tg_load += load;
 
-               /*
-                * We need to compute a correction term in the case that the
-                * task group is consuming more CPU than a task of equal
-                * weight. A task with a weight equals to tg->shares will have
-                * a load less or equal to scale_load_down(tg->shares).
-                * Similarly, the sched_entities that represent the task group
-                * at parent level, can't have a load higher than
-                * scale_load_down(tg->shares). And the Sum of sched_entities'
-                * load must be <= scale_load_down(tg->shares).
-                */
-               if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
-                       /* scale gcfs_rq's load into tg's shares*/
-                       load *= scale_load_down(gcfs_rq->tg->shares);
-                       load /= tg_load;
+                       /*
+                        * We need to compute a correction term in the case 
that the
+                        * task group is consuming more CPU than a task of equal
+                        * weight. A task with a weight equals to tg->shares 
will have
+                        * a load less or equal to scale_load_down(tg->shares).
+                        * Similarly, the sched_entities that represent the 
task group
+                        * at parent level, can't have a load higher than
+                        * scale_load_down(tg->shares). And the Sum of 
sched_entities'
+                        * load must be <= scale_load_down(tg->shares).
+                        */
+                       if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+                               /* scale gcfs_rq's load into tg's shares*/
+                               load *= scale_load_down(gcfs_rq->tg->shares);
+                               load /= tg_load;
+                       }
                }
+       } else {
+               load = calc_cfs_shares(gcfs_rq, prop_type);
        }
 
        delta = load - se->avg.load_avg;
@@ -3236,23 +3309,6 @@ static inline void cfs_rq_util_change(struct cfs_rq 
*cfs_rq)
        }
 }
 
-/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do {                          \
-       typeof(_ptr) ptr = (_ptr);                              \
-       typeof(*ptr) val = (_val);                              \
-       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
-       res = var - val;                                        \
-       if (res > var)                                          \
-               res = 0;                                        \
-       WRITE_ONCE(*ptr, res);                                  \
-} while (0)
-
 /**
  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
  * @now: current time, as per cfs_rq_clock_task()
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index dc4d148..4c517b4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -80,3 +80,5 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(FUDGE, true)
+SCHED_FEAT(FUDGE2, true)

Re: [PATCH v2 1/2] sched/fair: Fix how load gets propagated from cfs_rq to its sched_entity

Reply via email to