On 8/6/19 10:12 AM, Peter Zijlstra wrote:
>> I'm wondering if something simpler will work. It is easier to maintain
>> fairness between the CPU threads. A simple scheme may be: if the force
>> idle deficit on a CPU thread exceeds a threshold compared to its sibling,
>> we bias toward choosing the task on the suppressed CPU thread.
>> The fairness among the tenants per run queue is balanced out by cfs
>> fairness, so things should be fair if we maintain fairness in CPU
>> utilization between the two CPU threads.
>
> IIRC pjt once did a simple 5ms flip flop between siblings.
>
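As a rough standalone illustration of the deficit idea quoted above (the
struct, the names and the 5 msec threshold below are placeholders for
discussion, not kernel code):

/*
 * Toy userspace model: bias the next pick toward the hyperthread that has
 * been force idled noticeably longer than its sibling.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FORCE_IDLE_THRESHOLD_NS (5ULL * 1000 * 1000)    /* ~5 msec */

struct sibling_state {
        uint64_t force_idle_ns;         /* accumulated forced idle time */
};

/* Return true if task selection should be biased toward sibling 'a'. */
static bool bias_toward(const struct sibling_state *a,
                        const struct sibling_state *b)
{
        return a->force_idle_ns > b->force_idle_ns + FORCE_IDLE_THRESHOLD_NS;
}

int main(void)
{
        struct sibling_state t0 = { .force_idle_ns = 12ULL * 1000 * 1000 };
        struct sibling_state t1 = { .force_idle_ns =  3ULL * 1000 * 1000 };

        printf("bias toward thread 0: %s\n", bias_toward(&t0, &t1) ? "yes" : "no");
        printf("bias toward thread 1: %s\n", bias_toward(&t1, &t0) ? "yes" : "no");
        return 0;
}

In the real scheduler the comparison would sit in the core-scheduling pick
path; the snippet only shows the proposed decision rule.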
Trying out Peter's suggestions in the following two patches on v3 to
provide fairness between the CPU threads. The changes are in patch 2;
patch 1 is simply a code reorg.

It is only minimally tested and seems to provide fairness between two
will-it-scale cgroups. I haven't tried it yet on something less CPU
intensive with lots of sleeping in between.

I also still need to add the idle time accounting for the rt and dl
sched classes. I will do that later if this works for the fair class.

Tim

----------------patch 1-----------------------

>From ede10309986a6b1bcc82d317f86a5b06459d76bd Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.c...@linux.intel.com>
Date: Wed, 24 Jul 2019 13:58:18 -0700
Subject: [PATCH 1/2] sched: Move sched fair prio comparison to fair.c

Consolidate the task priority comparison of the fair class into fair.c.
This is a simple code reorganization with no functional changes.

Signed-off-by: Tim Chen <tim.c.c...@linux.intel.com>
---
 kernel/sched/core.c  | 21 ++-------------------
 kernel/sched/fair.c  | 21 +++++++++++++++++++++
 kernel/sched/sched.h |  1 +
 3 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e3cd9cb17809..567eba50dc38 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -105,25 +105,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
         if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
                 return !dl_time_before(a->dl.deadline, b->dl.deadline);
 
-        if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */
-                u64 a_vruntime = a->se.vruntime;
-                u64 b_vruntime = b->se.vruntime;
-
-                /*
-                 * Normalize the vruntime if tasks are in different cpus.
-                 */
-                if (task_cpu(a) != task_cpu(b)) {
-                        b_vruntime -= task_cfs_rq(b)->min_vruntime;
-                        b_vruntime += task_cfs_rq(a)->min_vruntime;
-
-                        trace_printk("(%d:%Lu,%Lu,%Lu) <> (%d:%Lu,%Lu,%Lu)\n",
-                                     a->pid, a_vruntime, a->se.vruntime, task_cfs_rq(a)->min_vruntime,
-                                     b->pid, b_vruntime, b->se.vruntime, task_cfs_rq(b)->min_vruntime);
-
-                }
-
-                return !((s64)(a_vruntime - b_vruntime) <= 0);
-        }
+        if (pa == MAX_RT_PRIO + MAX_NICE)       /* fair */
+                return prio_less_fair(a, b);
 
         return false;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 02bff10237d4..e289b6e1545b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -602,6 +602,27 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
         return delta;
 }
 
+bool prio_less_fair(struct task_struct *a, struct task_struct *b)
+{
+        u64 a_vruntime = a->se.vruntime;
+        u64 b_vruntime = b->se.vruntime;
+
+        /*
+         * Normalize the vruntime if tasks are in different cpus.
+         */
+        if (task_cpu(a) != task_cpu(b)) {
+                b_vruntime -= task_cfs_rq(b)->min_vruntime;
+                b_vruntime += task_cfs_rq(a)->min_vruntime;
+
+                trace_printk("(%d:%Lu,%Lu,%Lu) <> (%d:%Lu,%Lu,%Lu)\n",
+                             a->pid, a_vruntime, a->se.vruntime, task_cfs_rq(a)->min_vruntime,
+                             b->pid, b_vruntime, b->se.vruntime, task_cfs_rq(b)->min_vruntime);
+
+        }
+
+        return !((s64)(a_vruntime - b_vruntime) <= 0);
+}
+
 /*
  * The idea is to set a period in which each task runs once.
  *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e91c188a452c..bdabe7ce1152 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1015,6 +1015,7 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
 }
 
 extern void queue_core_balance(struct rq *rq);
+extern bool prio_less_fair(struct task_struct *a, struct task_struct *b);
 
 #else /* !CONFIG_SCHED_CORE */
-- 
2.20.1

--------------------patch 2------------------------

>From e27305b0042631382bd4f72260d579bf4c971d2f Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.c...@linux.intel.com>
Date: Tue, 6 Aug 2019 12:50:45 -0700
Subject: [PATCH 2/2] sched: Enforce fairness between cpu threads

A CPU thread could be suppressed by its sibling for an extended time.
Implement a budget for force idling so that all CPU threads get an
equal chance to run.

Signed-off-by: Tim Chen <tim.c.c...@linux.intel.com>
---
 kernel/sched/core.c  | 41 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c  | 12 ++++++++++++
 kernel/sched/sched.h |  4 ++++
 3 files changed, 57 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 567eba50dc38..e22042883723 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -207,6 +207,44 @@ static struct task_struct *sched_core_next(struct task_struct *p, unsigned long
         return p;
 }
 
+void account_core_idletime(struct task_struct *p, u64 exec)
+{
+        const struct cpumask *smt_mask;
+        struct rq *rq;
+        bool force_idle, refill;
+        int i, cpu;
+
+        rq = task_rq(p);
+        if (!sched_core_enabled(rq) || !p->core_cookie)
+                return;
+
+        cpu = task_cpu(p);
+        force_idle = false;
+        refill = true;
+        smt_mask = cpu_smt_mask(cpu);
+
+        for_each_cpu(i, smt_mask) {
+                if (cpu == i)
+                        continue;
+
+                if (cpu_rq(i)->core_forceidle)
+                        force_idle = true;
+
+                /* Only refill if everyone has run out of allowance */
+                if (cpu_rq(i)->core_idle_allowance > 0)
+                        refill = false;
+        }
+
+        if (force_idle)
+                rq->core_idle_allowance -= (s64) exec;
+
+        if (rq->core_idle_allowance < 0 && refill) {
+                for_each_cpu(i, smt_mask) {
+                        cpu_rq(i)->core_idle_allowance += (s64) SCHED_IDLE_ALLOWANCE;
+                }
+        }
+}
+
 /*
  * The static-key + stop-machine variable are needed such that:
  *
@@ -273,6 +311,8 @@ void sched_core_put(void)
 
 static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
 static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void account_core_idletime(struct task_struct *p, u64 exec) { }
+
 
 #endif /* CONFIG_SCHED_CORE */
 
@@ -6773,6 +6813,7 @@ void __init sched_init(void)
                 rq->core_enabled = 0;
                 rq->core_tree = RB_ROOT;
                 rq->core_forceidle = false;
+                rq->core_idle_allowance = (s64) SCHED_IDLE_ALLOWANCE;
 
                 rq->core_cookie = 0UL;
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e289b6e1545b..d4f9ea03296e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -611,6 +611,17 @@ bool prio_less_fair(struct task_struct *a, struct task_struct *b)
          * Normalize the vruntime if tasks are in different cpus.
          */
         if (task_cpu(a) != task_cpu(b)) {
+
+                if (a->core_cookie && b->core_cookie &&
+                    a->core_cookie != b->core_cookie) {
+                        /*
+                         * Will be force idling one thread,
+                         * pick the thread that has more allowance.
+                         */
+                        return (task_rq(a)->core_idle_allowance <=
+                                task_rq(b)->core_idle_allowance) ? true : false;
+                }
+
                 b_vruntime -= task_cfs_rq(b)->min_vruntime;
                 b_vruntime += task_cfs_rq(a)->min_vruntime;
 
@@ -817,6 +828,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
                 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
                 cgroup_account_cputime(curtask, delta_exec);
                 account_group_exec_runtime(curtask, delta_exec);
+                account_core_idletime(curtask, delta_exec);
         }
 
         account_cfs_rq_runtime(cfs_rq, delta_exec);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bdabe7ce1152..927334b2078c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -963,6 +963,7 @@ struct rq {
         struct task_struct      *core_pick;
         unsigned int            core_enabled;
         unsigned int            core_sched_seq;
+        s64                     core_idle_allowance;
         struct rb_root          core_tree;
         bool                    core_forceidle;
 
@@ -999,6 +1000,8 @@ static inline int cpu_of(struct rq *rq)
 }
 
 #ifdef CONFIG_SCHED_CORE
+#define SCHED_IDLE_ALLOWANCE    5000000         /* 5 msec */
+
 DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
 
 static inline bool sched_core_enabled(struct rq *rq)
@@ -1016,6 +1019,7 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
 
 extern void queue_core_balance(struct rq *rq);
 extern bool prio_less_fair(struct task_struct *a, struct task_struct *b);
+extern void account_core_idletime(struct task_struct *p, u64 exec);
 
 #else /* !CONFIG_SCHED_CORE */
-- 
2.20.1
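
For illustration, below is a rough standalone userspace model of the
allowance bookkeeping in patch 2. It assumes two SMT siblings running tasks
with different cookies and a 1 msec accounting tick; the helper names, the
pick rule and the numbers only loosely mirror the patch and are not meant to
be authoritative.

/*
 * Toy model of core_idle_allowance: the sibling that runs while the other
 * is force idled pays from its 5 msec allowance; once every allowance is
 * exhausted, all of them are refilled.
 */
#include <stdint.h>
#include <stdio.h>

#define SCHED_IDLE_ALLOWANCE    5000000LL       /* 5 msec, in nsec */
#define TICK_NS                 1000000LL       /* pretend 1 msec scheduling tick */

static int64_t allowance[2] = { SCHED_IDLE_ALLOWANCE, SCHED_IDLE_ALLOWANCE };

/* Pick the sibling allowed to run; the other one is force idled. */
static int pick_runner(void)
{
        /* Bias toward the sibling with more remaining allowance. */
        return allowance[0] >= allowance[1] ? 0 : 1;
}

static void account(int runner, int64_t exec)
{
        int idle = !runner;     /* only two siblings in this model */

        /* The running sibling pays for the time its sibling sat force idled. */
        allowance[runner] -= exec;

        /* Refill only once everyone has run out of allowance. */
        if (allowance[runner] < 0 && allowance[idle] <= 0) {
                allowance[0] += SCHED_IDLE_ALLOWANCE;
                allowance[1] += SCHED_IDLE_ALLOWANCE;
        }
}

int main(void)
{
        for (int tick = 0; tick < 20; tick++) {
                int runner = pick_runner();

                account(runner, TICK_NS);
                printf("tick %2d: cpu%d runs, allowance = { %lld, %lld }\n",
                       tick, runner, (long long)allowance[0],
                       (long long)allowance[1]);
        }
        return 0;
}

In this toy model the runner flips at nearly every pick, because the sibling
that just ran always has less allowance left, while the 5 msec budget bounds
how much forced idle time can accumulate before both allowances are refilled.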