CONFIG_FAIR_USER_SCHED is great and I'm happy to see it enabled by default,
but IMHO it currently suffers from some limitations:

- on a single-user system, it's useful to give root processes twice as much
CPU as user processes, but I don't want nice 19 cron jobs like updatedb or
rpmq to get twice as much CPU as my nice -20 tasks.

- on a multi-user system, a user should be able to give back their CPU share
to other users. This is currently not possible with CONFIG_FAIR_USER_SCHED.

This implies that returning EPERM on nice(<0) becomes worthless: since a
user's total CPU share is fixed, lowering the nice value of one task is
equivalent to calling nice(>0) on every other task of that user, ignoring the
limits of the nice range.
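
A toy computation illustrates this (plain userspace C, not kernel code; the
weights are taken from the kernel's prio_to_weight[] table, and a fixed
per-user group weight is assumed):

#include <stdio.h>

int main(void)
{
        /* prio_to_weight[]: nice 0 -> 1024, nice -10 -> 9548 */
        unsigned long w_nice0 = 1024, w_nice_m10 = 9548;

        /*
         * With a fixed user group weight, a task's share of that group
         * is its weight divided by the sum of its siblings' weights.
         */

        /* Two nice 0 tasks: each gets half of the user's share. */
        printf("%.2f\n", (double)w_nice0 / (w_nice0 + w_nice0));    /* 0.50 */

        /*
         * Renice one of them to -10: the sibling drops to ~10% of the
         * user's share, but the user's total is unchanged, so other
         * users notice nothing -- the EPERM protected nobody.
         */
        printf("%.2f\n", (double)w_nice0 / (w_nice0 + w_nice_m10)); /* ~0.10 */
        return 0;
}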

To address these problems, this patch sets the weight of each user's cfs_rq
to the maximum weight of the processes on that cfs_rq, scaled by
/sys/kernel/uids/UID/cpu_share. More elaborate mathematics than taking the
max may be needed, but basic testing showed the expected fairness.
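
To illustrate the resulting arithmetic, here is a minimal userspace model
(user_cfs_rq_weight() is a name made up for this sketch; it mirrors
scale_tg_weight() from the patch, and a default cpu_share of 1024, i.e.
NICE_0_LOAD, is assumed):

#include <stdio.h>

#define NICE_0_LOAD 1024UL

/*
 * Mirrors scale_tg_weight(): the user's cfs_rq weight is the heaviest
 * task's weight scaled by the group's cpu_share, floored at 2.
 */
static unsigned long user_cfs_rq_weight(unsigned long max_task_weight,
                                        unsigned long cpu_share)
{
        unsigned long scaled = (max_task_weight * cpu_share) / NICE_0_LOAD;
        return scaled > 2UL ? scaled : 2UL;
}

int main(void)
{
        /* Heaviest runnable task at nice 0 (weight 1024), default share: */
        printf("%lu\n", user_cfs_rq_weight(1024, 1024));        /* 1024 */
        /* Same user after writing 512 to cpu_share: half the weight. */
        printf("%lu\n", user_cfs_rq_weight(1024, 512));         /* 512 */
        /* Only nice 19 tasks left (weight 15): the user weighs only 15. */
        printf("%lu\n", user_cfs_rq_weight(15, 1024));          /* 15 */
        return 0;
}

So a lone nice 19 updatedb no longer makes its user outweigh another user's
nice -20 tasks, and a user can voluntarily give CPU back by lowering
cpu_share.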

Signed-off-by: Guillaume Chazarain <[EMAIL PROTECTED]>
---

 include/linux/sched.h |    4 ++
 kernel/sched.c        |   50 +++++++++++++++++++----
 kernel/sched_fair.c   |  108 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 154 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 155d743..d6d2db9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -908,6 +908,10 @@ struct sched_entity {
        /* rq "owned" by this entity/group: */
        struct cfs_rq           *my_q;
 #endif
+#ifdef CONFIG_FAIR_USER_SCHED
+       /* used to track the max load.weight */
+       struct rb_node          max_load;
+#endif
 };
 
 struct task_struct {
diff --git a/kernel/sched.c b/kernel/sched.c
index 3f6bd11..df8114b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -260,6 +260,10 @@ struct cfs_rq {
        struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
        struct task_group *tg;    /* group that "owns" this runqueue */
 #endif
+#ifdef CONFIG_FAIR_USER_SCHED
+       /* used to track the sched_entity with the max load in this cfs_rq */
+       struct rb_root max_load_se;
+#endif
 };
 
 /* Real-Time classes' related field in a runqueue: */
@@ -7094,14 +7098,12 @@ done:
        task_rq_unlock(rq, &flags);
 }
 
+/* cfs_rq->rq->lock must be taken */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
        struct cfs_rq *cfs_rq = se->cfs_rq;
-       struct rq *rq = cfs_rq->rq;
        int on_rq;
 
-       spin_lock_irq(&rq->lock);
-
        on_rq = se->on_rq;
        if (on_rq)
                dequeue_entity(cfs_rq, se, 0);
@@ -7111,22 +7113,54 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 
        if (on_rq)
                enqueue_entity(cfs_rq, se, 0);
+}
 
-       spin_unlock_irq(&rq->lock);
+#ifdef CONFIG_FAIR_USER_SCHED
+static void update_group_share(struct task_group *tg, int cpu)
+{
+       struct rb_node *max_load_node = rb_last(&tg->cfs_rq[cpu]->max_load_se);
+       struct sched_entity *max_load_entry;
+       unsigned long shares;
+
+       if (!max_load_node)
+               /* empty cfs_rq */
+               return;
+
+       max_load_entry = rb_entry(max_load_node, struct sched_entity, max_load);
+       shares = scale_tg_weight(tg, max_load_entry->load.weight);
+       set_se_shares(tg->se[cpu], shares);
+}
+#else
+static void update_group_share(struct task_group *tg, int cpu)
+{
+       set_se_shares(tg->se[cpu], tg->shares);
 }
+#endif
 
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
-       int i;
+       int cpu;
+       unsigned long flags;
+
+       if (shares <= 1)
+               return -EINVAL;
+
+#ifdef CONFIG_FAIR_USER_SCHED
+       if ((shares * prio_to_weight[0]) / prio_to_weight[0] != shares)
+               /* The provided value would overflow in scale_tg_weight() */
+               return -EINVAL;
+#endif
 
        spin_lock(&tg->lock);
        if (tg->shares == shares)
                goto done;
 
        tg->shares = shares;
-       for_each_possible_cpu(i)
-               set_se_shares(tg->se[i], shares);
-
+       for_each_possible_cpu(cpu) {
+               spin_lock_irqsave(&tg->cfs_rq[cpu]->rq->lock, flags);
+               update_group_share(tg, cpu);
+               spin_unlock_irqrestore(&tg->cfs_rq[cpu]->rq->lock, flags);
+       }
 done:
        spin_unlock(&tg->lock);
        return 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 01859f6..70ed34e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -135,6 +135,112 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
        return se->vruntime - cfs_rq->min_vruntime;
 }
 
+#ifdef CONFIG_FAIR_USER_SCHED
+static void set_se_shares(struct sched_entity *se, unsigned long shares);
+
+static unsigned long scale_tg_weight(struct task_group *tg, unsigned long weight)
+{
+       unsigned long scaled_weight = (weight * tg->shares) / NICE_0_LOAD;
+       return max(scaled_weight, 2UL);
+}
+
+static void enqueue_max_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       struct rb_node **link;
+       struct rb_node *parent = NULL;
+       struct sched_entity *entry;
+       unsigned long scaled_weight;
+
+       if (!cfs_rq->tg)
+               /* this is the main cfs_rq, not a user specific one */
+               return;
+
+       /*
+        * We cannot take cfs_rq->tg->lock. As we already lock the rq, we
+        * could deadlock in sched_group_set_shares()
+        */
+
+       /*
+        * Insert the sched_entity at its place in cfs_rq->max_load_se, given
+        * its load.weight
+        */
+       link = &cfs_rq->max_load_se.rb_node;
+       while (*link) {
+               parent = *link;
+               entry = rb_entry(parent, struct sched_entity, max_load);
+               if (se->load.weight < entry->load.weight)
+                       link = &parent->rb_left;
+               else
+                       link = &parent->rb_right;
+       }
+       rb_link_node(&se->max_load, parent, link);
+       rb_insert_color(&se->max_load, &cfs_rq->max_load_se);
+
+       /*
+        * Update the weight of the cfs_rq if we inserted a higher weight
+        * sched_entity
+        */
+       scaled_weight = scale_tg_weight(cfs_rq->tg, se->load.weight);
+       if (scaled_weight > cfs_rq->load.weight)
+               set_se_shares(se->parent, scaled_weight);
+}
+
+static void dequeue_max_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       struct rb_node *prev_node;
+       struct rb_node *next_node;
+       struct sched_entity *prev_entry;
+       unsigned long scaled_weight;
+
+       if (!cfs_rq->tg)
+               /* this is the main cfs_rq, not a user specific one */
+               return;
+
+       /*
+        * We cannot take cfs_rq->tg->lock. As we already lock the rq, we
+        * could deadlock in sched_group_set_shares()
+        */
+
+       next_node = rb_next(&se->max_load);
+       if (!next_node)
+               prev_node = rb_prev(&se->max_load);
+       rb_erase(&se->max_load, &cfs_rq->max_load_se);
+
+       if (next_node)
+               /* The removed sched_entity was not the highest weight */
+               return;
+
+       /*
+        * Here we know we removed the highest weight sched_entity, so
+        * prev_node now contains the highest weight sched_entity
+        */
+
+       if (!prev_node)
+               /* The cfs_rq is now empty */
+               return;
+
+       prev_entry = rb_entry(prev_node, struct sched_entity, max_load);
+       if (prev_entry->load.weight == se->load.weight)
+               /*
+                * A sched_entity in the cfs_rq had the same weight, so the
+                * max weight did not change
+                */
+               return;
+
+       scaled_weight = scale_tg_weight(cfs_rq->tg, prev_entry->load.weight);
+       if (scaled_weight != cfs_rq->load.weight)
+               set_se_shares(prev_entry->parent, scaled_weight);
+}
+#else
+static inline void enqueue_max_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void dequeue_max_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
 /*
  * Enqueue an entity into the rb-tree:
  */
@@ -173,6 +279,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
        rb_link_node(&se->run_node, parent, link);
        rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+       enqueue_max_load(cfs_rq, se);
 }
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -181,6 +288,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                cfs_rq->rb_leftmost = rb_next(&se->run_node);
 
        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+       dequeue_max_load(cfs_rq, se);
 }
 
 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
