We cannot switch a task group from regular scheduling to coscheduling
atomically, as it would require locking the whole system. Instead,
the switch is done runqueue by runqueue via cosched_set_scheduled().

This means that other CPUs may see an intermediate state when locking
a bunch of runqueues, where the sdrq->is_root fields do not yield
a consistent picture across a task group.

Handle these cases.

Signed-off-by: Jan H. Schönherr <jscho...@amazon.de>
---
 kernel/sched/fair.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 322a84ec9511..8da2033596ff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -646,6 +646,15 @@ static struct cfs_rq *current_cfs(struct rq *rq)
 {
        struct sdrq *sdrq = READ_ONCE(rq->sdrq_data.current_sdrq);
 
+       /*
+        * We might race with concurrent is_root-changes, causing
+        * current_sdrq to reference an sdrq which is no longer
+        * is_root. Counter that by ascending the tg-hierarchy until
+        * we find an sdrq with is_root (or run out of tg_parent).
+        */
+       while (!sdrq->is_root && sdrq->tg_parent)
+               sdrq = sdrq->tg_parent;
+
        return sdrq->cfs_rq;
 }
 #else
@@ -7141,6 +7150,23 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 
                se = pick_next_entity(cfs_rq, curr);
                cfs_rq = pick_next_cfs(se);
+
+#ifdef CONFIG_COSCHEDULING
+               if (cfs_rq && is_sd_se(se) && cfs_rq->sdrq.is_root) {
+                       WARN_ON_ONCE(1); /* Untested code path */
+                       /*
+                        * Race with is_root update.
+                        *
+                        * We just moved downwards in the hierarchy via an
+                        * SD-SE, the CFS-RQ should have is_root set to zero.
+                        * However, a reconfiguration may be in progress. We
+                        * basically ignore that reconfiguration.
+                        *
+                        * Contrary to the case below, there is nothing to fix
+                        * as all the set_next_entity() calls are done later.
+                        */
+               }
+#endif
        } while (cfs_rq);
 
        if (is_sd_se(se))
@@ -7192,6 +7218,48 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
                se = pick_next_entity(cfs_rq, NULL);
                set_next_entity(cfs_rq, se);
                cfs_rq = pick_next_cfs(se);
+
+#ifdef CONFIG_COSCHEDULING
+               if (cfs_rq && is_sd_se(se) && cfs_rq->sdrq.is_root) {
+                       /*
+                        * Race with is_root update.
+                        *
+                        * We just moved downwards in the hierarchy via an
+                        * SD-SE, the CFS-RQ should have is_root set to zero.
+                        * However, a reconfiguration may be in progress. We
+                        * basically ignore that reconfiguration, but we need
+                        * to fix the picked path to correspond to that
+                        * reconfiguration.
+                        *
+                        * Thus, we walk the hierarchy upwards again and do two
+                        * things simultaneously:
+                        *
+                        * 1. put back picked entities which are not on the
+                        *    "correct" path,
+                        * 2. pick the entities along the correct path.
+                        *
+                        * We do this until both paths upwards converge.
+                        */
+                       struct sched_entity *se2 = cfs_rq->sdrq.tg_se;
+                       bool top = false;
+
+                       WARN_ON_ONCE(1); /* Untested code path */
+                       while (se && se != se2) {
+                               if (!top) {
+                                       put_prev_entity(cfs_rq_of(se), se);
+                                       if (cfs_rq_of(se) == top_cfs_rq)
+                                               top = true;
+                               }
+                               if (top)
+                                       se = cfs_rq_of(se)->sdrq.tg_se;
+                               else
+                                       se = parent_entity(se);
+                               set_next_entity(cfs_rq_of(se2), se2);
+                               se2 = parent_entity(se2);
+                       }
+               }
+#endif
+
        } while (cfs_rq);
 
 retidle: __maybe_unused;
-- 
2.9.3.1.gcba166c.dirty

Reply via email to