On 2014/8/12 5:05, David Rientjes wrote:
> On Mon, 11 Aug 2014, Vladimir Davydov wrote:
> 
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -1963,7 +1963,7 @@ zonelist_scan:
>>>  
>>>     /*
>>>      * Scan zonelist, looking for a zone with enough free.
>>> -    * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
>>> +    * See __cpuset_node_allowed() comment in kernel/cpuset.c.
>>>      */
>>>     for_each_zone_zonelist_nodemask(zone, z, zonelist,
>>>                                             high_zoneidx, nodemask) {
>>> @@ -1974,7 +1974,7 @@ zonelist_scan:
>>>                             continue;
>>>             if (cpusets_enabled() &&
>>>                     (alloc_flags & ALLOC_CPUSET) &&
>>> -                   !cpuset_zone_allowed_softwall(zone, gfp_mask))
>>> +                   !cpuset_zone_allowed(zone, gfp_mask))
>>>                             continue;
>>
>> So, this is get_page_from_freelist. It's called from
>> __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit
>> set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET
>> bit set only for __GFP_WAIT allocations. That said, w/o your patch we
>> try to respect cpusets for all allocations, including atomic, and only
>> ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT
>> allocations, while with your patch we always ignore cpusets for
>> !__GFP_WAIT allocations. Not sure if it really matters though, because
>> usually one uses cpuset.mems in conjunction with cpuset.cpus and it
>> won't make any difference then. It also doesn't conflict with any cpuset
>> documentation.
>>
> 
> Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this.  

I'm not quite sure. That code has been there before I got involved in cpuset.

> The only thing that we get by falling back to the page allocator slowpath 
> is that kswapd gets woken up before the allocation is attempted without 
> ALLOC_CPUSET.  It seems pointless to wakeup kswapd when the allocation can 
> succeed on any node.  Even with the patch, if the allocation fails because 
> all nodes are below their min watermark, then we still fallback to the 
> slowpath and wake up kswapd but there's nothing much else we can do 
> because it's !__GFP_WAIT.
> .

But I tend to agree with you. But if we want to do this, we should split this
change from the cleanup.

Regarding to the cleanup, I found there used to be a single 
cpuset_node_allowed(),
and your cleanup is exactly a revert of that ancient commit:

commit 02a0e53d8227aff5e62e0433f82c12c1c2805fd6
Author: Paul Jackson <p...@sgi.com>
Date:   Wed Dec 13 00:34:25 2006 -0800

    [PATCH] cpuset: rework cpuset_zone_allowed api

Seems the major intention was to avoid accident sleep-in-atomic bugs, because
callback_mutex might be held.

I don't see there's any reason callback_mutex can't be a spinlock. I thought
about this when Gu Zhen fixed the bug that callback_mutex is nested inside
rcu_read_lock().

--
 kernel/cpuset.c | 81 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 32 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index baa155c..9d9e239 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
 
 /*
  * CPU / memory hotplug is handled asynchronously.
@@ -848,6 +848,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
+       unsigned long flags;
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
        bool need_rebuild_sched_domains = false;
@@ -875,9 +876,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
                        continue;
                rcu_read_unlock();
 
-               mutex_lock(&callback_mutex);
+               spin_lock_irqsave(&callback_lock, flags);
                cpumask_copy(cp->effective_cpus, new_cpus);
-               mutex_unlock(&callback_mutex);
+               spin_unlock_irqrestore(&callback_lock, flags);
 
                WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
                        !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -910,6 +911,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                          const char *buf)
 {
+       unsigned long flags;
        int retval;
 
        /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -942,9 +944,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset 
*trialcs,
        if (retval < 0)
                return retval;
 
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_lock, flags);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqrestore(&callback_lock, flags);
 
        /* use trialcs->cpus_allowed as a temp variable */
        update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1105,6 +1107,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
  */
 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 {
+       unsigned long flags;
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
 
@@ -1132,8 +1135,9 @@ static void update_nodemasks_hier(struct cpuset *cs, 
nodemask_t *new_mems)
                rcu_read_unlock();
 
                mutex_lock(&callback_mutex);
+               spin_lock_irqsave(&callback_lock, flags);
                cp->effective_mems = *new_mems;
-               mutex_unlock(&callback_mutex);
+               spin_unlock_irqrestore(&callback_lock, flags);
 
                WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
                        !nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1162,6 +1166,7 @@ static void update_nodemasks_hier(struct cpuset *cs, 
nodemask_t *new_mems)
 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                           const char *buf)
 {
+       unsigned long flags;
        int retval;
 
        /*
@@ -1201,9 +1206,9 @@ static int update_nodemask(struct cpuset *cs, struct 
cpuset *trialcs,
        if (retval < 0)
                goto done;
 
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_lock, flags);
        cs->mems_allowed = trialcs->mems_allowed;
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqrestore(&callback_lock, flags);
 
        /* use trialcs->mems_allowed as a temp variable */
        update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1264,6 +1269,7 @@ static void update_tasks_flags(struct cpuset *cs)
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
                       int turning_on)
 {
+       unsigned long flags;
        struct cpuset *trialcs;
        int balance_flag_changed;
        int spread_flag_changed;
@@ -1288,9 +1294,9 @@ static int update_flag(cpuset_flagbits_t bit, struct 
cpuset *cs,
        spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
                        || (is_spread_page(cs) != is_spread_page(trialcs)));
 
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_lock, flags);
        cs->flags = trialcs->flags;
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqrestore(&callback_lock, flags);
 
        if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
                rebuild_sched_domains_locked();
@@ -1690,11 +1696,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, 
void *v)
        ssize_t count;
        char *buf, *s;
        int ret = 0;
+       unsigned long flags;
 
        count = seq_get_buf(sf, &buf);
        s = buf;
 
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_mutex, flags);
 
        switch (type) {
        case FILE_CPULIST:
@@ -1721,7 +1728,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, 
void *v)
                seq_commit(sf, -1);
        }
 out_unlock:
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqstore(&callback_lock, flags);
        return ret;
 }
 
@@ -1924,6 +1931,7 @@ static int cpuset_css_online(struct cgroup_subsys_state 
*css)
        struct cpuset *parent = parent_cs(cs);
        struct cpuset *tmp_cs;
        struct cgroup_subsys_state *pos_css;
+       unsigned long flags;
 
        if (!parent)
                return 0;
@@ -1938,12 +1946,12 @@ static int cpuset_css_online(struct cgroup_subsys_state 
*css)
 
        cpuset_inc();
 
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_lock, flags);
        if (cgroup_on_dfl(cs->css.cgroup)) {
                cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                cs->effective_mems = parent->effective_mems;
        }
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqrestore(&callback_lock, flags);
 
        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                goto out_unlock;
@@ -1970,10 +1978,10 @@ static int cpuset_css_online(struct cgroup_subsys_state 
*css)
        }
        rcu_read_unlock();
 
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_lock, flags);
        cs->mems_allowed = parent->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
-       mutex_unlock(&callback_mutex);
+       spin_lock_irqrestore(&callback_lock, flags);
 out_unlock:
        mutex_unlock(&cpuset_mutex);
        return 0;
@@ -2011,8 +2019,10 @@ static void cpuset_css_free(struct cgroup_subsys_state 
*css)
 
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
+       unsigned long flags;
+
        mutex_lock(&cpuset_mutex);
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_lock, flags);
 
        if (cgroup_on_dfl(root_css->cgroup)) {
                cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2023,7 +2033,7 @@ static void cpuset_bind(struct cgroup_subsys_state 
*root_css)
                top_cpuset.mems_allowed = top_cpuset.effective_mems;
        }
 
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqrestore(&callback_lock, flags);
        mutex_unlock(&cpuset_mutex);
 }
 
@@ -2107,13 +2117,14 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
                            bool cpus_updated, bool mems_updated)
 {
        bool is_empty;
+       unsigned long flags;
 
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_lock, flags);
        cpumask_copy(cs->cpus_allowed, new_cpus);
        cpumask_copy(cs->effective_cpus, new_cpus);
        cs->mems_allowed = *new_mems;
        cs->effective_mems = *new_mems;
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqrestore(&callback_lock, flags);
 
        /*
         * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2145,15 +2156,17 @@ hotplug_update_tasks(struct cpuset *cs,
                     struct cpumask *new_cpus, nodemask_t *new_mems,
                     bool cpus_updated, bool mems_updated)
 {
+       unsigned long flags;
+
        if (cpumask_empty(new_cpus))
                cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
        if (nodes_empty(*new_mems))
                *new_mems = parent_cs(cs)->effective_mems;
 
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_lock, flags);
        cpumask_copy(cs->effective_cpus, new_cpus);
        cs->effective_mems = *new_mems;
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqrestore(&callback_lock, flags);
 
        if (cpus_updated)
                update_tasks_cpumask(cs);
@@ -2227,6 +2240,7 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
        bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
+       unsigned long flags;
 
        mutex_lock(&cpuset_mutex);
 
@@ -2239,21 +2253,21 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
 
        /* synchronize cpus_allowed to cpu_active_mask */
        if (cpus_updated) {
-               mutex_lock(&callback_mutex);
+               spin_lock_irqsave(&callback_lock, flags);
                if (!on_dfl)
                        cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
                cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
-               mutex_unlock(&callback_mutex);
+               spin_unlock_irqrestore(&callback_lock, flags);
                /* we don't mess with cpumasks of tasks in top_cpuset */
        }
 
        /* synchronize mems_allowed to N_MEMORY */
        if (mems_updated) {
-               mutex_lock(&callback_mutex);
+               spin_lock_irqsave(&callback_lock, flags);
                if (!on_dfl)
                        top_cpuset.mems_allowed = new_mems;
                top_cpuset.effective_mems = new_mems;
-               mutex_unlock(&callback_mutex);
+               spin_unlock_irqrestore(&callback_lock, flags);
                update_tasks_nodemask(&top_cpuset);
        }
 
@@ -2346,11 +2360,13 @@ void __init cpuset_init_smp(void)
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
-       mutex_lock(&callback_mutex);
+       unsigned long flags;
+
+       spin_lock_irqsave(&callback_lock, flags);
        rcu_read_lock();
        guarantee_online_cpus(task_cs(tsk), pmask);
        rcu_read_unlock();
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqrestore(&callback_lock, flags);
 }
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2396,12 +2412,13 @@ void cpuset_init_current_mems_allowed(void)
 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
        nodemask_t mask;
+       unsigned long flags;
 
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_lock, flags);
        rcu_read_lock();
        guarantee_online_mems(task_cs(tsk), &mask);
        rcu_read_unlock();
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqrestore(&callback_lock, flags);
 
        return mask;
 }
@@ -2514,14 +2531,14 @@ int __cpuset_node_allowed_softwall(int node, gfp_t 
gfp_mask)
                return 1;
 
        /* Not hardwall and node outside mems_allowed: scan up cpusets */
-       mutex_lock(&callback_mutex);
+       spin_lock_irqsave(&callback_mutex, flags);
 
        rcu_read_lock();
        cs = nearest_hardwall_ancestor(task_cs(current));
        allowed = node_isset(node, cs->mems_allowed);
        rcu_read_unlock();
 
-       mutex_unlock(&callback_mutex);
+       spin_unlock_irqrestore(&callback_lock, flags);
        return allowed;
 }
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to