SD-SEs require some attention during enqueuing and dequeuing. In some
aspects they behave similarly to TG-SEs; for example, we must not dequeue
an SD-SE while it still represents other load. But SD-SEs also differ:
their load is updated concurrently by multiple CPUs, and we have to be
careful about when to access them, as an SD-SE belongs to the next
hierarchy level, which is protected by a different lock.
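
On the dequeue side, the pattern when the hierarchy walk reaches an
SD-SE boils down to the following (a condensed sketch of the hunks
below, not additional code; is_sd_se(), leader_of() and
resched_cpu_locked() come from earlier patches in this series):

	if (is_sd_se(se)) {
		/* Still represents other children: keep it enqueued. */
		if (se->load.weight)
			break;

		/* Someone else already dequeued it on our behalf. */
		if (!se->on_rq)
			break;

		/* From here on up, the SD-SE counts as a single unit. */
		task_delta = 1;
	}

	dequeue_entity(cfs_rq, se, flags);

	/*
	 * A non-leader CPU took the SD-SE off the runqueue; the leader
	 * may want to select another task group now, so poke it.
	 */
	if (is_sd_se(se) && leader_of(se) != cpu_of(rq))
		resched_cpu_locked(leader_of(se));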

Make sure that enqueues and dequeues propagate correctly through the
hierarchy, and notify the leader whenever it may have to select another
task group.

Additionally, we define cfs_rq->h_nr_running to refer to the number of
tasks and SD-SEs below the CFS runqueue, without drilling down into
SD-SEs. (Phrased differently, h_nr_running counts non-TG-SEs along the
task group hierarchy.) This makes later adjustments for load balancing
more natural, as SD-SEs now appear similar to tasks, allowing coscheduled
sets to be balanced individually.
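
For illustration (hypothetical numbers): a CPU runqueue that holds two
plain tasks plus the SD-SE of a coscheduled task group TG has
h_nr_running == 3, no matter how many tasks are queued inside TG; TG's
tasks only show up in TG's own per-CPU runqueue:

	CPU cfs_rq:  task A, task B, SD-SE of TG   ->  h_nr_running == 3
	TG cfs_rq:   four tasks of TG              ->  h_nr_running == 4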

Signed-off-by: Jan H. Schönherr <jscho...@amazon.de>
---
 kernel/sched/fair.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 102 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 483db54ee20a..bc219c9c3097 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4600,17 +4600,40 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
                /* throttled entity or throttle-on-deactivate */
                if (!se->on_rq)
                        break;
+               if (is_sd_se(se)) {
+                       /*
+                        * don't dequeue sd_se if it represents other
+                        * children besides the dequeued one
+                        */
+                       if (se->load.weight)
+                               dequeue = 0;
+
+                       task_delta = 1;
+               }
 
                if (dequeue)
                        dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+               if (dequeue && is_sd_se(se)) {
+                       /*
+                        * If we dequeued an SD-SE and we are not the leader,
+                        * the leader might want to select another task group
+                        * right now.
+                        *
+                        * FIXME: Change leadership instead?
+                        */
+                       if (leader_of(se) != cpu_of(rq))
+                               resched_cpu_locked(leader_of(se));
+               }
+               if (!dequeue && is_sd_se(se))
+                       break;
                qcfs_rq->h_nr_running -= task_delta;
 
                if (qcfs_rq->load.weight)
                        dequeue = 0;
        }
 
-       if (!se)
-               sub_nr_running(rq, task_delta);
+       if (!se || !is_cpu_rq(hrq_of(cfs_rq_of(se))))
+               sub_nr_running(rq, cfs_rq->h_nr_running);
 
        rq_chain_unlock(&rc);
 
@@ -4641,8 +4664,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
        int enqueue = 1;
-       long task_delta;
+       long task_delta, orig_task_delta;
        struct rq_chain rc;
+#ifdef CONFIG_COSCHEDULING
+       int lcpu = rq->sdrq_data.leader;
+#endif
 
        SCHED_WARN_ON(!is_cpu_rq(rq));
 
@@ -4669,24 +4695,40 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
                return;
 
        task_delta = cfs_rq->h_nr_running;
+       orig_task_delta = task_delta;
        rq_chain_init(&rc, rq);
        for_each_sched_entity(se) {
                rq_chain_lock(&rc, se);
                update_sdse_load(se);
                if (se->on_rq)
                        enqueue = 0;
+               if (is_sd_se(se))
+                       task_delta = 1;
 
                cfs_rq = cfs_rq_of(se);
                if (enqueue)
                        enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+               if (!enqueue && is_sd_se(se))
+                       break;
                cfs_rq->h_nr_running += task_delta;
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
+
+#ifdef CONFIG_COSCHEDULING
+               /*
+                * FIXME: Pro-actively reschedule the leader, can't tell
+                *        currently whether we actually have to.
+                */
+               if (lcpu != cfs_rq->sdrq.data->leader) {
+                       lcpu = cfs_rq->sdrq.data->leader;
+                       resched_cpu_locked(lcpu);
+               }
+#endif /* CONFIG_COSCHEDULING */
        }
 
-       if (!se)
-               add_nr_running(rq, task_delta);
+       if (!se || !is_cpu_rq(hrq_of(cfs_rq_of(se))))
+               add_nr_running(rq, orig_task_delta);
 
        rq_chain_unlock(&rc);
 
@@ -5213,6 +5255,9 @@ bool enqueue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
 {
        struct cfs_rq *cfs_rq;
        struct rq_chain rc;
+#ifdef CONFIG_COSCHEDULING
+       int lcpu = rq->sdrq_data.leader;
+#endif
 
        rq_chain_init(&rc, rq);
        for_each_sched_entity(se) {
@@ -5221,6 +5266,8 @@ bool enqueue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
                if (se->on_rq)
                        break;
                cfs_rq = cfs_rq_of(se);
+               if (is_sd_se(se))
+                       task_delta = 1;
                enqueue_entity(cfs_rq, se, flags);
 
                /*
@@ -5234,6 +5281,22 @@ bool enqueue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
                cfs_rq->h_nr_running += task_delta;
 
                flags = ENQUEUE_WAKEUP;
+
+#ifdef CONFIG_COSCHEDULING
+               /*
+                * FIXME: Pro-actively reschedule the leader, can't tell
+                *        currently whether we actually have to.
+                *
+                *        There are some cases that slip through
+                *        check_preempt_curr(), like the leader not getting
+                *        notified (and not becoming aware of the addition
+                *        timely), when an RT task is running.
+                */
+               if (lcpu != cfs_rq->sdrq.data->leader) {
+                       lcpu = cfs_rq->sdrq.data->leader;
+                       resched_cpu_locked(lcpu);
+               }
+#endif /* CONFIG_COSCHEDULING */
        }
 
        for_each_sched_entity(se) {
@@ -5241,6 +5304,9 @@ bool enqueue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
                rq_chain_lock(&rc, se);
                update_sdse_load(se);
                cfs_rq = cfs_rq_of(se);
+
+               if (is_sd_se(se))
+                       task_delta = 0;
                cfs_rq->h_nr_running += task_delta;
 
                if (cfs_rq_throttled(cfs_rq))
@@ -5304,8 +5370,36 @@ bool dequeue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
                rq_chain_lock(&rc, se);
                update_sdse_load(se);
                cfs_rq = cfs_rq_of(se);
+
+               if (is_sd_se(se)) {
+                       /*
+                        * don't dequeue sd_se if it represents other
+                        * children besides the dequeued one
+                        */
+                       if (se->load.weight)
+                               break;
+
+                       /* someone else did our job */
+                       if (!se->on_rq)
+                               break;
+
+                       task_delta = 1;
+               }
+
                dequeue_entity(cfs_rq, se, flags);
 
+               if (is_sd_se(se)) {
+                       /*
+                        * If we dequeued an SD-SE and we are not the leader,
+                        * the leader might want to select another task group
+                        * right now.
+                        *
+                        * FIXME: Change leadership instead?
+                        */
+                       if (leader_of(se) != cpu_of(rq))
+                               resched_cpu_locked(leader_of(se));
+               }
+
                /*
                 * end evaluation on encountering a throttled cfs_rq
                 *
@@ -5339,6 +5433,9 @@ bool dequeue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
                rq_chain_lock(&rc, se);
                update_sdse_load(se);
                cfs_rq = cfs_rq_of(se);
+
+               if (is_sd_se(se))
+                       task_delta = 0;
                cfs_rq->h_nr_running -= task_delta;
 
                if (cfs_rq_throttled(cfs_rq))
-- 
2.9.3.1.gcba166c.dirty
