[PATCH -v3] sched, numa: Use {cpu, pid} to create task groups for shared faults

Peter Zijlstra Fri, 02 Aug 2013 09:48:32 -0700

Here's the latest; it seems to not crash and appears to actually do as
advertised.


---
Subject: sched, numa: Use {cpu, pid} to create task groups for shared faults
From: Peter Zijlstra <[email protected]>
Date: Tue Jul 30 10:40:20 CEST 2013

A very simple/straight forward shared fault task grouping
implementation.

Signed-off-by: Peter Zijlstra <[email protected]>
---
 include/linux/sched.h |    3 
 kernel/sched/core.c   |    3 
 kernel/sched/fair.c   |  174 +++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h  |    5 -
 4 files changed, 171 insertions(+), 14 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1341,6 +1341,9 @@ struct task_struct {
        u64 node_stamp;                 /* migration stamp  */
        struct callback_head numa_work;
 
+       struct list_head numa_entry;
+       struct numa_group *numa_group;
+
        /*
         * Exponential decaying average of faults on a per-node basis.
         * Scheduling placement decisions are made based on the these counts.
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1733,6 +1733,9 @@ static void __sched_fork(struct task_str
        p->numa_work.next = &p->numa_work;
        p->numa_faults = NULL;
        p->numa_faults_buffer = NULL;
+
+       INIT_LIST_HEAD(&p->numa_entry);
+       p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1149,6 +1149,17 @@ static void numa_migrate_preferred(struc
        p->numa_migrate_retry = jiffies + HZ/10;
 }
 
+struct numa_group {
+       atomic_t refcount;
+
+       spinlock_t lock; /* nr_tasks, tasks */
+       int nr_tasks;
+       struct list_head task_list;
+
+       struct rcu_head rcu;
+       atomic_long_t faults[0];
+};
+
 static void task_numa_placement(struct task_struct *p)
 {
        int seq, nid, max_nid = -1;
@@ -1157,6 +1168,7 @@ static void task_numa_placement(struct t
        seq = ACCESS_ONCE(p->mm->numa_scan_seq);
        if (p->numa_scan_seq == seq)
                return;
+
        p->numa_scan_seq = seq;
        p->numa_migrate_seq++;
        p->numa_scan_period_max = task_scan_max(p);
@@ -1167,14 +1179,24 @@ static void task_numa_placement(struct t
                int priv, i;
 
                for (priv = 0; priv < 2; priv++) {
+                       long diff;
+
                        i = task_faults_idx(nid, priv);
 
+                       diff = -p->numa_faults[i];
+
                        /* Decay existing window, copy faults since last scan */
                        p->numa_faults[i] >>= 1;
                        p->numa_faults[i] += p->numa_faults_buffer[i];
                        p->numa_faults_buffer[i] = 0;
 
+                       diff += p->numa_faults[i];
                        faults += p->numa_faults[i];
+
+                       if (p->numa_group) {
+                               /* safe because we can only change our own 
group */
+                               atomic_long_add(diff, 
&p->numa_group->faults[i]);
+                       }
                }
 
                if (faults > max_faults) {
@@ -1211,6 +1233,130 @@ static void task_numa_placement(struct t
        }
 }
 
+static inline int get_numa_group(struct numa_group *grp)
+{
+       return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+       if (atomic_dec_and_test(&grp->refcount))
+               kfree_rcu(grp, rcu);
+}
+
+static void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+       if (l1 > l2)
+               swap(l1, l2);
+
+       spin_lock(l1);
+       spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static void task_numa_group(struct task_struct *p, int cpu, int pid)
+{
+       struct numa_group *grp, *my_grp;
+       struct task_struct *tsk;
+       bool join = false;
+       int i;
+
+       if (unlikely(!p->numa_group)) {
+               unsigned int size = sizeof(struct numa_group) +
+                                   2*nr_node_ids*sizeof(atomic_long_t);
+
+               grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+               if (!grp)
+                       return;
+
+               atomic_set(&grp->refcount, 1);
+               spin_lock_init(&grp->lock);
+               INIT_LIST_HEAD(&grp->task_list);
+
+               for (i = 0; i < 2*nr_node_ids; i++)
+                       atomic_long_set(&grp->faults[i], p->numa_faults[i]);
+
+               list_add(&p->numa_entry, &grp->task_list);
+               grp->nr_tasks++;
+               rcu_assign_pointer(p->numa_group, grp);
+       }
+
+       rcu_read_lock();
+       tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+       if ((tsk->pid & LAST__PID_MASK) != pid)
+               goto unlock;
+
+       grp = rcu_dereference(tsk->numa_group);
+       if (!grp)
+               goto unlock;
+
+       my_grp = p->numa_group;
+       if (grp == my_grp)
+               goto unlock;
+
+       /*
+        * Only join the other group if its bigger; if we're the bigger group,
+        * the other task will join us.
+        */
+       if (my_grp->nr_tasks > grp->nr_tasks)
+               goto unlock;
+
+       /*
+        * Tie-break on the grp address.
+        */
+       if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+               goto unlock;
+
+       if (!get_numa_group(grp))
+               goto unlock;
+
+       join = true;
+
+unlock:
+       rcu_read_unlock();
+
+       if (!join)
+               return;
+
+       for (i = 0; i < 2*nr_node_ids; i++) {
+               atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]);
+               atomic_long_add(p->numa_faults[i], &grp->faults[i]);
+       }
+
+       double_lock(&my_grp->lock, &grp->lock);
+
+       list_move(&p->numa_entry, &grp->task_list);
+       my_grp->nr_tasks--;
+       grp->nr_tasks++;
+
+       spin_unlock(&my_grp->lock);
+       spin_unlock(&grp->lock);
+
+       rcu_assign_pointer(p->numa_group, grp);
+
+       put_numa_group(my_grp);
+}
+
+void task_numa_free(struct task_struct *p)
+{
+       struct numa_group *grp = p->numa_group;
+       int i;
+
+       kfree(p->numa_faults);
+
+       if (grp) {
+               for (i = 0; i < 2*nr_node_ids; i++)
+                       atomic_long_sub(p->numa_faults[i], &grp->faults[i]);
+
+               spin_lock(&grp->lock);
+               list_del(&p->numa_entry);
+               grp->nr_tasks--;
+               spin_unlock(&grp->lock);
+               rcu_assign_pointer(p->numa_group, NULL);
+               put_numa_group(grp);
+       }
+}
+
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
@@ -1226,21 +1372,12 @@ void task_numa_fault(int last_cpupid, in
        if (!p->mm)
                return;
 
-       /*
-        * First accesses are treated as private, otherwise consider accesses
-        * to be private if the accessing pid has not changed
-        */
-       if (!cpupid_pid_unset(last_cpupid))
-               priv = ((p->pid & LAST__PID_MASK) == 
cpupid_to_pid(last_cpupid));
-       else
-               priv = 1;
-
        /* Allocate buffer to track faults on a per-node basis */
        if (unlikely(!p->numa_faults)) {
                int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
 
                /* numa_faults and numa_faults_buffer share the allocation */
-               p->numa_faults = kzalloc(size * 2, GFP_KERNEL);
+               p->numa_faults = kzalloc(size * 2, GFP_KERNEL | __GFP_NOWARN);
                if (!p->numa_faults)
                        return;
 
@@ -1249,6 +1386,23 @@ void task_numa_fault(int last_cpupid, in
        }
 
        /*
+        * First accesses are treated as private, otherwise consider accesses
+        * to be private if the accessing pid has not changed
+        */
+       if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+               priv = 1;
+       } else {
+               int cpu, pid;
+
+               cpu = cpupid_to_cpu(last_cpupid);
+               pid = cpupid_to_pid(last_cpupid);
+
+               priv = (pid == (p->pid & LAST__PID_MASK));
+               if (!priv)
+                       task_numa_group(p, cpu, pid);
+       }
+
+       /*
         * If pages are properly placed (did not migrate) then scan slower.
         * This is reset periodically in case of phase changes
         *
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -556,10 +556,7 @@ static inline u64 rq_clock_task(struct r
 #ifdef CONFIG_NUMA_BALANCING
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
-static inline void task_numa_free(struct task_struct *p)
-{
-       kfree(p->numa_faults);
-}
+extern void task_numa_free(struct task_struct *p);
 #else /* CONFIG_NUMA_BALANCING */
 static inline void task_numa_free(struct task_struct *p)
 {
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -v3] sched, numa: Use {cpu, pid} to create task groups for shared faults

Reply via email to