On 8/6/19 10:12 AM, Peter Zijlstra wrote:

>> I'm wondering if something simpler will work.  It is easier to maintain
>> fairness between the CPU threads.  A simple scheme may be: if the force
>> idle deficit on a CPU thread exceeds a threshold compared to its sibling,
>> we bias toward choosing the task on the suppressed CPU thread.
>> The fairness among the tenants per run queue is balanced out by cfs
>> fairness, so things should be fair if we maintain fairness in CPU
>> utilization between the two CPU threads.
> 
> IIRC pjt once did a simple 5ms flip flop between siblings.
> 

The following two patches, on top of v3, try out Peter's suggestion to
provide fairness between the CPU threads.
The change is in patch 2; patch 1 is simply a code reorganization.

It is only minimally tested and seems to provide fairness between
two will-it-scale cgroups.  I haven't tried it yet on something that
is less CPU intensive with lots of sleeping in between.

The idle-time accounting (account_core_idletime()) also still needs to
be hooked into the rt and dl sched classes; a rough sketch follows this
note.  Will do that later if this approach works for the fair class.
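
For reference, here is a rough, untested sketch of where that hook could
go for the rt class (the dl class would get an analogous call in
update_curr_dl()).  It assumes update_curr_rt() computes delta_exec for
rq->curr the same way the fair path does; the elided parts are the
existing code, unchanged.

/* Sketch only -- not part of the posted patches. */
static void update_curr_rt(struct rq *rq)
{
        struct task_struct *curr = rq->curr;
        u64 delta_exec;

        /* ... existing exec_start/delta_exec accounting stays as is ... */

        account_core_idletime(curr, delta_exec);  /* charge forced-idle budget */

        /* ... remainder of update_curr_rt() unchanged ... */
}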

Tim

----------------patch 1-----------------------

From ede10309986a6b1bcc82d317f86a5b06459d76bd Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.c...@linux.intel.com>
Date: Wed, 24 Jul 2019 13:58:18 -0700
Subject: [PATCH 1/2] sched: Move sched fair prio comparison to fair.c

Consolidate the task priority comparison of the fair class into fair.c.
This is a simple code reorganization with no functional changes.

Signed-off-by: Tim Chen <tim.c.c...@linux.intel.com>
---
 kernel/sched/core.c  | 21 ++-------------------
 kernel/sched/fair.c  | 21 +++++++++++++++++++++
 kernel/sched/sched.h |  1 +
 3 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e3cd9cb17809..567eba50dc38 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -105,25 +105,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
        if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
                return !dl_time_before(a->dl.deadline, b->dl.deadline);
 
-       if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
-               u64 a_vruntime = a->se.vruntime;
-               u64 b_vruntime = b->se.vruntime;
-
-               /*
-                * Normalize the vruntime if tasks are in different cpus.
-                */
-               if (task_cpu(a) != task_cpu(b)) {
-                       b_vruntime -= task_cfs_rq(b)->min_vruntime;
-                       b_vruntime += task_cfs_rq(a)->min_vruntime;
-
-                       trace_printk("(%d:%Lu,%Lu,%Lu) <> (%d:%Lu,%Lu,%Lu)\n",
-                                    a->pid, a_vruntime, a->se.vruntime, task_cfs_rq(a)->min_vruntime,
-                                    b->pid, b_vruntime, b->se.vruntime, task_cfs_rq(b)->min_vruntime);
-
-               }
-
-               return !((s64)(a_vruntime - b_vruntime) <= 0);
-       }
+       if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+               return prio_less_fair(a, b);
 
        return false;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 02bff10237d4..e289b6e1545b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -602,6 +602,27 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
        return delta;
 }
 
+bool prio_less_fair(struct task_struct *a, struct task_struct *b)
+{
+       u64 a_vruntime = a->se.vruntime;
+       u64 b_vruntime = b->se.vruntime;
+
+       /*
+        * Normalize the vruntime if tasks are in different cpus.
+        */
+       if (task_cpu(a) != task_cpu(b)) {
+               b_vruntime -= task_cfs_rq(b)->min_vruntime;
+               b_vruntime += task_cfs_rq(a)->min_vruntime;
+
+               trace_printk("(%d:%Lu,%Lu,%Lu) <> (%d:%Lu,%Lu,%Lu)\n",
+                            a->pid, a_vruntime, a->se.vruntime, task_cfs_rq(a)->min_vruntime,
+                            b->pid, b_vruntime, b->se.vruntime, task_cfs_rq(b)->min_vruntime);
+
+       }
+
+       return !((s64)(a_vruntime - b_vruntime) <= 0);
+}
+
 /*
  * The idea is to set a period in which each task runs once.
  *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e91c188a452c..bdabe7ce1152 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1015,6 +1015,7 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
 }
 
 extern void queue_core_balance(struct rq *rq);
+extern bool prio_less_fair(struct task_struct *a, struct task_struct *b);
 
 #else /* !CONFIG_SCHED_CORE */
 
-- 
2.20.1
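
As an aside, the comparison that patch 1 consolidates into
prio_less_fair() can be illustrated with a small stand-alone program
(plain user-space C, not kernel code; the demo helper name and the
sample numbers are made up).  b's vruntime is shifted onto a's cfs_rq
timeline via the two min_vruntime values, and the final test uses a
signed difference so that u64 wrap-around is handled correctly:

#include <stdio.h>
#include <stdint.h>

/* Returns 1 when "a" is considered lower priority than "b" (b wins the
 * pick), mirroring !((s64)(a_vruntime - b_vruntime) <= 0) above.
 */
static int prio_less_fair_demo(uint64_t a_vruntime, uint64_t a_min,
                               uint64_t b_vruntime, uint64_t b_min)
{
        /* Normalize b's vruntime onto a's cfs_rq timeline. */
        b_vruntime -= b_min;
        b_vruntime += a_min;

        return !((int64_t)(a_vruntime - b_vruntime) <= 0);
}

int main(void)
{
        /* a has run 500 past its min_vruntime, b only 200: b wins. */
        printf("%d\n", prio_less_fair_demo(1000, 500, 900, 700));   /* 1 */
        /* Both have run 700 past their min_vruntime: a is not "less". */
        printf("%d\n", prio_less_fair_demo(1200, 500, 1400, 700));  /* 0 */
        return 0;
}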


--------------------patch 2------------------------

From e27305b0042631382bd4f72260d579bf4c971d2f Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.c...@linux.intel.com>
Date: Tue, 6 Aug 2019 12:50:45 -0700
Subject: [PATCH 2/2] sched: Enforce fairness between cpu threads

A CPU thread could be suppressed by its sibling for an extended time.
Implement a budget for forced idling, so that all CPU threads get an
equal chance to run.

Signed-off-by: Tim Chen <tim.c.c...@linux.intel.com>
---
 kernel/sched/core.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c  | 12 ++++++++++++
 kernel/sched/sched.h |  4 ++++
 3 files changed, 56 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 567eba50dc38..e22042883723 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -207,6 +207,44 @@ static struct task_struct *sched_core_next(struct task_struct *p, unsigned long
        return p;
 }
 
+void account_core_idletime(struct task_struct *p, u64 exec)
+{
+       const struct cpumask *smt_mask;
+       struct rq *rq;
+       bool force_idle, refill;
+       int i, cpu;
+
+       rq = task_rq(p);
+       if (!sched_core_enabled(rq) || !p->core_cookie)
+               return;
+
+       cpu = task_cpu(p);
+       force_idle = false;
+       refill = true;
+       smt_mask = cpu_smt_mask(cpu);
+
+       for_each_cpu(i, smt_mask) {
+               if (cpu == i)
+                       continue;
+
+               if (cpu_rq(i)->core_forceidle)
+                       force_idle = true;
+
+               /* Only refill if everyone has run out of allowance */
+               if (cpu_rq(i)->core_idle_allowance > 0)
+                       refill = false;
+       }
+
+       if (force_idle)
+               rq->core_idle_allowance -= (s64) exec;
+
+       if (rq->core_idle_allowance < 0 && refill) {
+               for_each_cpu(i, smt_mask) {
+                       cpu_rq(i)->core_idle_allowance += (s64) SCHED_IDLE_ALLOWANCE;
+               }
+       }
+}
+
 /*
  * The static-key + stop-machine variable are needed such that:
  *
@@ -273,6 +311,7 @@ void sched_core_put(void)
 
 static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
 static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void account_core_idletime(struct task_struct *p, u64 exec) { }
 
 #endif /* CONFIG_SCHED_CORE */
 
@@ -6773,6 +6813,7 @@ void __init sched_init(void)
                rq->core_enabled = 0;
                rq->core_tree = RB_ROOT;
                rq->core_forceidle = false;
+               rq->core_idle_allowance = (s64) SCHED_IDLE_ALLOWANCE;
 
                rq->core_cookie = 0UL;
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e289b6e1545b..d4f9ea03296e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -611,6 +611,17 @@ bool prio_less_fair(struct task_struct *a, struct task_struct *b)
         * Normalize the vruntime if tasks are in different cpus.
         */
        if (task_cpu(a) != task_cpu(b)) {
+
+               if (a->core_cookie && b->core_cookie &&
+                   a->core_cookie != b->core_cookie) {
+                       /*
+                        * Will be force idling one thread,
+                        * pick the thread that has more allowance.
+                        */
+                       return (task_rq(a)->core_idle_allowance <=
+                               task_rq(b)->core_idle_allowance) ? true : false;
+               }
+
                b_vruntime -= task_cfs_rq(b)->min_vruntime;
                b_vruntime += task_cfs_rq(a)->min_vruntime;
 
@@ -817,6 +828,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
                trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
                cgroup_account_cputime(curtask, delta_exec);
                account_group_exec_runtime(curtask, delta_exec);
+               account_core_idletime(curtask, delta_exec);
        }
 
        account_cfs_rq_runtime(cfs_rq, delta_exec);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bdabe7ce1152..927334b2078c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -963,6 +963,7 @@ struct rq {
        struct task_struct      *core_pick;
        unsigned int            core_enabled;
        unsigned int            core_sched_seq;
+       s64                     core_idle_allowance;
        struct rb_root          core_tree;
        bool                    core_forceidle;
 
@@ -999,6 +1000,8 @@ static inline int cpu_of(struct rq *rq)
 }
 
 #ifdef CONFIG_SCHED_CORE
+#define SCHED_IDLE_ALLOWANCE   5000000         /* 5 msec */
+
 DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
 
 static inline bool sched_core_enabled(struct rq *rq)
@@ -1016,6 +1019,7 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
 
 extern void queue_core_balance(struct rq *rq);
 extern bool prio_less_fair(struct task_struct *a, struct task_struct *b);
+extern void account_core_idletime(struct task_struct *p, u64 exec);
 
 #else /* !CONFIG_SCHED_CORE */
 
-- 
2.20.1
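
To show the intended behaviour of the budget in patch 2, here is a small
stand-alone simulation of one SMT pair (plain user-space C, not kernel
code; the 1 ms tick and the two-element array are made up for the
example).  The running thread is charged while its sibling is forced
idle, the prio_less_fair() tie-break hands the core to whichever thread
has more allowance left, and both allowances are refilled only once
every thread has run out:

#include <stdio.h>
#include <stdint.h>

#define SCHED_IDLE_ALLOWANCE    5000000LL       /* 5 msec, as in the patch */
#define TICK_NS                 1000000LL       /* pretend 1 msec of exec per pick */

int main(void)
{
        int64_t allowance[2] = { SCHED_IDLE_ALLOWANCE, SCHED_IDLE_ALLOWANCE };
        int runner = 0;         /* sibling currently running a cookie'd task */
        int tick;

        for (tick = 0; tick < 20; tick++) {
                int sibling = !runner;

                /* The sibling is forced idle, so charge the runner's allowance. */
                allowance[runner] -= TICK_NS;

                /* Refill only once everyone has run out of allowance. */
                if (allowance[runner] < 0 && allowance[sibling] <= 0) {
                        allowance[0] += SCHED_IDLE_ALLOWANCE;
                        allowance[1] += SCHED_IDLE_ALLOWANCE;
                }

                printf("tick %2d: cpu%d ran, allowance = [%lld, %lld]\n",
                       tick, runner, (long long)allowance[0],
                       (long long)allowance[1]);

                /* Tie-break from prio_less_fair(): prefer more allowance next. */
                if (allowance[sibling] > allowance[runner])
                        runner = sibling;
        }
        return 0;
}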
