The current cpuset partition code is able to dynamically update the
sched domains of a running system to perform what is essentially the
"isolcpus=domain,..." boot command line feature at run time.

To enable runtime modification of nohz_full, we will have to make use
of the CPU hotplug functionality to facilitate the proper addition
or removal of nohz_full CPUs. In other words, we can't hold the
cpu_hotplug_lock while doing so. Given the current lock ordering, we
will need to introduce a new top level mutex to ensure proper mutual
exclusion for cpuset state updates that may require the use of CPU
hotplug. This patch introduces a new top level isolcpus_update_mutex
for that purpose. This new mutex will be acquired whenever the cpuset
partition states or the set of isolated CPUs may have to be changed.
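
For reference, the resulting lock acquisition sequence in a typical
cpuset write path looks roughly like the following sketch (condensed
from the cpuset_write_resmask() change below; error handling omitted):

	mutex_lock(&isolcpus_update_mutex);	/* new top level mutex */
	cpus_read_lock();			/* cpu_hotplug_lock */
	mutex_lock(&cpuset_mutex);

	/* ... modify cpuset/partition state ... */

	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
	update_isolation_cpumasks();	/* now outside cpu_hotplug_lock */
	mutex_unlock(&isolcpus_update_mutex);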

update_unbound_workqueue_cpumask() is now renamed to
update_isolation_cpumasks() and moved outside of cpu_hotplug_lock
critical sections to enable its future extension to invoke CPU hotplug.

A new global isolcpus_update_state structure is added to track whether
update_isolation_cpumasks() needs to be invoked, so the existing
partition_xcpus_add/del() functions and their callers can now be
simplified.
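
Condensed, the deferred update now looks like this (simplified from
isolated_cpus_update() and update_isolation_cpumasks() in the diff
below):

	/* under callback_lock, whenever isolated_cpus is changed */
	isolcpus_update_state.updating = true;
	cpumask_or(isolcpus_update_state.cpus, isolcpus_update_state.cpus,
		   xcpus);

	/* later, outside cpu_hotplug_lock, under isolcpus_update_mutex */
	if (isolcpus_update_state.updating) {
		int ret = workqueue_unbound_exclude_cpumask(isolated_cpus);

		WARN_ON_ONCE(ret < 0);
		cpumask_clear(isolcpus_update_state.cpus);
		isolcpus_update_state.updating = false;
	}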

Signed-off-by: Waiman Long <[email protected]>
---
 kernel/cgroup/cpuset.c | 149 ++++++++++++++++++++++++-----------------
 1 file changed, 86 insertions(+), 63 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 27adb04df675..2190efd33efb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -215,29 +215,39 @@ static struct cpuset top_cpuset = {
 };
 
 /*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
- * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
- * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
- * structures. Note that cpuset_mutex needs to be a mutex as it is used in
- * paths that rely on priority inheritance (e.g. scheduler - on RT) for
- * correctness.
+ * CPUSET Locking Convention
+ * -------------------------
  *
- * A task must hold both locks to modify cpusets.  If a task holds
- * cpuset_mutex, it blocks others, ensuring that it is the only task able to
- * also acquire callback_lock and be able to modify cpusets.  It can perform
- * various checks on the cpuset structure first, knowing nothing will change.
- * It can also allocate memory while just holding cpuset_mutex.  While it is
- * performing these checks, various callback routines can briefly acquire
- * callback_lock to query cpusets.  Once it is ready to make the changes, it
- * takes callback_lock, blocking everyone else.
+ * Below are the four global locks guarding cpuset structures in lock
+ * acquisition order:
+ *  - isolcpus_update_mutex
+ *  - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock)
+ *  - cpuset_mutex
+ *  - callback_lock (raw spinlock)
  *
- * Calls to the kernel memory allocator can not be made while holding
- * callback_lock, as that would risk double tripping on callback_lock
- * from one of the callbacks into the cpuset code from within
- * __alloc_pages().
+ * The first lock, isolcpus_update_mutex, should only be held if the existing
+ * set of isolated CPUs (in isolated partitions) or any of the partition
+ * states may be changed. Otherwise, it can be skipped. It is used to prevent
+ * concurrent updates to the set of isolated CPUs.
  *
- * If a task is only holding callback_lock, then it has read-only
- * access to cpusets.
+ * A task must hold all the remaining three locks to modify externally visible
+ * or used fields of cpusets, though some of the internally used cpuset fields
+ * can be modified by holding cpu_hotplug_lock and cpuset_mutex only. If only
+ * reliable read access to the externally used fields is needed, a task can
+ * hold either cpuset_mutex or callback_lock.
+ *
+ * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others,
+ * ensuring that it is the only task able to also acquire callback_lock and
+ * be able to modify cpusets.  It can perform various checks on the cpuset
+ * structure first, knowing nothing will change. It can also allocate memory
+ * without holding callback_lock. While it is performing these checks, various
+ * callback routines can briefly acquire callback_lock to query cpusets.  Once
+ * it is ready to make the changes, it takes callback_lock, blocking everyone
+ * else.
+ *
+ * Calls to the kernel memory allocator cannot be made while holding
+ * callback_lock, which is a spinlock, as the memory allocator may sleep or
+ * call back into cpuset code and acquire callback_lock.
  *
  * Now, the task_struct fields mems_allowed and mempolicy may be changed
  * by other task, we use alloc_lock in the task_struct fields to protect
@@ -248,6 +258,7 @@ static struct cpuset top_cpuset = {
  * cpumasks and nodemasks.
  */
 
+static DEFINE_MUTEX(isolcpus_update_mutex);
 static DEFINE_MUTEX(cpuset_mutex);
 
 void cpuset_lock(void)
@@ -272,6 +283,17 @@ void cpuset_callback_unlock_irq(void)
        spin_unlock_irq(&callback_lock);
 }
 
+/*
+ * Isolcpus update state (protected by isolcpus_update_mutex)
+ *
+ * It contains data related to updating the isolated CPUs configuration in
+ * isolated partitions.
+ */
+static struct {
+       bool updating;          /* Isolcpus updating in progress */
+       cpumask_var_t cpus;     /* CPUs to be updated */
+} isolcpus_update_state;
+
 static struct workqueue_struct *cpuset_migrate_mm_wq;
 
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
@@ -1273,6 +1295,9 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus
                cpumask_or(isolated_cpus, isolated_cpus, xcpus);
        else
                cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
+
+       isolcpus_update_state.updating = true;
+       cpumask_or(isolcpus_update_state.cpus, isolcpus_update_state.cpus, xcpus);
 }
 
 /*
@@ -1280,31 +1305,26 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus
  * @new_prs: new partition_root_state
  * @parent: parent cpuset
  * @xcpus: exclusive CPUs to be added
- * Return: true if isolated_cpus modified, false otherwise
  *
  * Remote partition if parent == NULL
  */
-static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
+static void partition_xcpus_add(int new_prs, struct cpuset *parent,
                                struct cpumask *xcpus)
 {
-       bool isolcpus_updated;
-
        WARN_ON_ONCE(new_prs < 0);
        lockdep_assert_held(&callback_lock);
        if (!parent)
                parent = &top_cpuset;
 
-
        if (parent == &top_cpuset)
                cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
 
-       isolcpus_updated = (new_prs != parent->partition_root_state);
-       if (isolcpus_updated)
+       if (new_prs != parent->partition_root_state)
                isolated_cpus_update(parent->partition_root_state, new_prs,
                                     xcpus);
 
        cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
-       return isolcpus_updated;
+       return;
 }
 
 /*
@@ -1312,15 +1332,12 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
  * @old_prs: old partition_root_state
  * @parent: parent cpuset
  * @xcpus: exclusive CPUs to be removed
- * Return: true if isolated_cpus modified, false otherwise
  *
  * Remote partition if parent == NULL
  */
-static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
+static void partition_xcpus_del(int old_prs, struct cpuset *parent,
                                struct cpumask *xcpus)
 {
-       bool isolcpus_updated;
-
        WARN_ON_ONCE(old_prs < 0);
        lockdep_assert_held(&callback_lock);
        if (!parent)
@@ -1329,27 +1346,33 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
        if (parent == &top_cpuset)
                cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
 
-       isolcpus_updated = (old_prs != parent->partition_root_state);
-       if (isolcpus_updated)
+       if (old_prs != parent->partition_root_state)
                isolated_cpus_update(old_prs, parent->partition_root_state,
                                     xcpus);
 
        cpumask_and(xcpus, xcpus, cpu_active_mask);
        cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
-       return isolcpus_updated;
+       return;
 }
 
-static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
+/**
+ * update_isolation_cpumasks - Update external isolation CPU masks
+ *
+ * The following external CPU masks will be updated if necessary:
+ * - workqueue unbound cpumask
+ */
+static void update_isolation_cpumasks(void)
 {
        int ret;
 
-       lockdep_assert_cpus_held();
-
-       if (!isolcpus_updated)
+       if (!isolcpus_update_state.updating)
                return;
 
        ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
        WARN_ON_ONCE(ret < 0);
+
+       cpumask_clear(isolcpus_update_state.cpus);
+       isolcpus_update_state.updating = false;
 }
 
 /**
@@ -1441,8 +1464,6 @@ static inline bool is_local_partition(struct cpuset *cs)
 static int remote_partition_enable(struct cpuset *cs, int new_prs,
                                   struct tmpmasks *tmp)
 {
-       bool isolcpus_updated;
-
        /*
         * The user must have sysadmin privilege.
         */
@@ -1466,11 +1487,10 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
                return PERR_INVCPUS;
 
        spin_lock_irq(&callback_lock);
-       isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
+       partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
        list_add(&cs->remote_sibling, &remote_children);
        cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
        spin_unlock_irq(&callback_lock);
-       update_unbound_workqueue_cpumask(isolcpus_updated);
        cpuset_force_rebuild();
        cs->prs_err = 0;
 
@@ -1493,15 +1513,12 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
  */
 static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
 {
-       bool isolcpus_updated;
-
        WARN_ON_ONCE(!is_remote_partition(cs));
        WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
 
        spin_lock_irq(&callback_lock);
        list_del_init(&cs->remote_sibling);
-       isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
-                                              NULL, cs->effective_xcpus);
+       partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus);
        if (cs->prs_err)
                cs->partition_root_state = -cs->partition_root_state;
        else
@@ -1511,7 +1528,6 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
        compute_effective_exclusive_cpumask(cs, NULL, NULL);
        reset_partition_data(cs);
        spin_unlock_irq(&callback_lock);
-       update_unbound_workqueue_cpumask(isolcpus_updated);
        cpuset_force_rebuild();
 
        /*
@@ -1536,7 +1552,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
 {
        bool adding, deleting;
        int prs = cs->partition_root_state;
-       int isolcpus_updated = 0;
 
        if (WARN_ON_ONCE(!is_remote_partition(cs)))
                return;
@@ -1569,9 +1584,9 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
 
        spin_lock_irq(&callback_lock);
        if (adding)
-               isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
+               partition_xcpus_add(prs, NULL, tmp->addmask);
        if (deleting)
-               isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
+               partition_xcpus_del(prs, NULL, tmp->delmask);
        /*
         * Need to update effective_xcpus and exclusive_cpus now as
         * update_sibling_cpumasks() below may iterate back to the same cs.
@@ -1580,7 +1595,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
        if (xcpus)
                cpumask_copy(cs->exclusive_cpus, xcpus);
        spin_unlock_irq(&callback_lock);
-       update_unbound_workqueue_cpumask(isolcpus_updated);
        if (adding || deleting)
                cpuset_force_rebuild();
 
@@ -1662,7 +1676,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
        int old_prs, new_prs;
        int part_error = PERR_NONE;     /* Partition error? */
        int subparts_delta = 0;
-       int isolcpus_updated = 0;
        struct cpumask *xcpus = user_xcpus(cs);
        bool nocpu;
 
@@ -1932,18 +1945,15 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
         * and vice versa.
         */
        if (adding)
-               isolcpus_updated += partition_xcpus_del(old_prs, parent,
-                                                       tmp->addmask);
+               partition_xcpus_del(old_prs, parent, tmp->addmask);
        if (deleting)
-               isolcpus_updated += partition_xcpus_add(new_prs, parent,
-                                                       tmp->delmask);
+               partition_xcpus_add(new_prs, parent, tmp->delmask);
 
        if (is_partition_valid(parent)) {
                parent->nr_subparts += subparts_delta;
                WARN_ON_ONCE(parent->nr_subparts < 0);
        }
        spin_unlock_irq(&callback_lock);
-       update_unbound_workqueue_cpumask(isolcpus_updated);
 
        if ((old_prs != new_prs) && (cmd == partcmd_update))
                update_partition_exclusive_flag(cs, new_prs);
@@ -2968,7 +2978,6 @@ static int update_prstate(struct cpuset *cs, int new_prs)
        else if (isolcpus_updated)
                isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
        spin_unlock_irq(&callback_lock);
-       update_unbound_workqueue_cpumask(isolcpus_updated);
 
        /* Force update if switching back to member & update effective_xcpus */
        update_cpumasks_hier(cs, &tmpmask, !new_prs);
@@ -3224,6 +3233,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
        int retval = -ENODEV;
 
        buf = strstrip(buf);
+       mutex_lock(&isolcpus_update_mutex);
        cpus_read_lock();
        mutex_lock(&cpuset_mutex);
        if (!is_cpuset_online(cs))
@@ -3256,6 +3266,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 out_unlock:
        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
+       update_isolation_cpumasks();
+       mutex_unlock(&isolcpus_update_mutex);
        flush_workqueue(cpuset_migrate_mm_wq);
        return retval ?: nbytes;
 }
@@ -3358,12 +3370,15 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
        else
                return -EINVAL;
 
+       mutex_lock(&isolcpus_update_mutex);
        cpus_read_lock();
        mutex_lock(&cpuset_mutex);
        if (is_cpuset_online(cs))
                retval = update_prstate(cs, val);
        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
+       update_isolation_cpumasks();
+       mutex_unlock(&isolcpus_update_mutex);
        return retval ?: nbytes;
 }
 
@@ -3586,15 +3601,22 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
 {
        struct cpuset *cs = css_cs(css);
 
+       mutex_lock(&isolcpus_update_mutex);
+       /*
+        * At this point the partition root state can no longer be changed by the user.
+        */
+       if (!is_partition_valid(cs))
+               goto out;
+
        cpus_read_lock();
        mutex_lock(&cpuset_mutex);
-
        /* Reset valid partition back to member */
-       if (is_partition_valid(cs))
-               update_prstate(cs, PRS_MEMBER);
-
+       update_prstate(cs, PRS_MEMBER);
        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
+       update_isolation_cpumasks();
+out:
+       mutex_unlock(&isolcpus_update_mutex);
 
 }
 
@@ -3751,6 +3773,7 @@ int __init cpuset_init(void)
        BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
        BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
        BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
+       BUG_ON(!zalloc_cpumask_var(&isolcpus_update_state.cpus, GFP_KERNEL));
 
        cpumask_setall(top_cpuset.cpus_allowed);
        nodes_setall(top_cpuset.mems_allowed);
-- 
2.50.0

