We cannot switch a task group from regular scheduling to coscheduling atomically, as it would require locking the whole system. Instead, the switch is done runqueue by runqueue via cosched_set_scheduled().
This means that other CPUs may see an intermediate state when locking a bunch of runqueues, where the sdrq->is_root fields do not yield a consistent picture across a task group. Handle these cases. Signed-off-by: Jan H. Schönherr <jscho...@amazon.de> --- kernel/sched/fair.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 322a84ec9511..8da2033596ff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -646,6 +646,15 @@ static struct cfs_rq *current_cfs(struct rq *rq) { struct sdrq *sdrq = READ_ONCE(rq->sdrq_data.current_sdrq); + /* + * We might race with concurrent is_root-changes, causing + * current_sdrq to reference an sdrq which is no longer + * !is_root. Counter that by ascending the tg-hierarchy + * until we find an sdrq with !is_root. + */ + while (sdrq->is_root && sdrq->tg_parent) + sdrq = sdrq->tg_parent; + return sdrq->cfs_rq; } #else @@ -7141,6 +7150,23 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf se = pick_next_entity(cfs_rq, curr); cfs_rq = pick_next_cfs(se); + +#ifdef CONFIG_COSCHEDULING + if (cfs_rq && is_sd_se(se) && cfs_rq->sdrq.is_root) { + WARN_ON_ONCE(1); /* Untested code path */ + /* + * Race with is_root update. + * + * We just moved downwards in the hierarchy via an + * SD-SE, the CFS-RQ should have is_root set to zero. + * However, a reconfiguration may be in progress. We + * basically ignore that reconfiguration. + * + * Contrary to the case below, there is nothing to fix + * as all the set_next_entity() calls are done later. + */ + } +#endif } while (cfs_rq); if (is_sd_se(se)) @@ -7192,6 +7218,48 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = pick_next_cfs(se); + +#ifdef CONFIG_COSCHEDULING + if (cfs_rq && is_sd_se(se) && cfs_rq->sdrq.is_root) { + /* + * Race with is_root update. 
+ * + * We just moved downwards in the hierarchy via an + * SD-SE, the CFS-RQ should have is_root set to zero. + * However, a reconfiguration may be in progress. We + * basically ignore that reconfiguration, but we need + * to fix the picked path to correspond to that + * reconfiguration. + * + * Thus, we walk the hierarchy upwards again and do two + * things simultaneously: + * + * 1. put back picked entities which are not on the + * "correct" path, + * 2. pick the entities along the correct path. + * + * We do this until both paths upwards converge. + */ + struct sched_entity *se2 = cfs_rq->sdrq.tg_se; + bool top = false; + + WARN_ON_ONCE(1); /* Untested code path */ + while (se && se != se2) { + if (!top) { + put_prev_entity(cfs_rq_of(se), se); + if (cfs_rq_of(se) == top_cfs_rq) + top = true; + } + if (top) + se = cfs_rq_of(se)->sdrq.tg_se; + else + se = parent_entity(se); + set_next_entity(cfs_rq_of(se2), se2); + se2 = parent_entity(se2); + } + } +#endif + } while (cfs_rq); retidle: __maybe_unused; -- 2.9.3.1.gcba166c.dirty