On 2026/2/13 0:46, Waiman Long wrote:
> As any change to isolated_cpus is going to be propagated to the
> HK_TYPE_DOMAIN housekeeping cpumask, it can be problematic if
> housekeeping cpumasks are directly being modified from the CPU hotplug
> code path. This is especially the case if we are going to enable dynamic
> update to the nohz_full housekeeping cpumask (HK_TYPE_KERNEL_NOISE)
> in the near future with the help of CPU hotplug.
> 
> Avoid these potential problems by changing the cpuset code to not
> updating isolated_cpus when calling from CPU hotplug. A new special
> PRS_INVALID_ISOLCPUS is added to indicate the current cpuset is an
> invalid partition but its effective_xcpus are still in isolated_cpus.
> This special state will be set if an isolated partition becomes invalid
> due to the shutdown of the last active CPU in that partition. We also
> need to keep the effective_xcpus even if exclusive_cpus isn't set.
> 
> When changes are made to "cpuset.cpus", "cpuset.cpus.exclusive" or
> "cpuset.cpus.partition" of a PRS_INVALID_ISOLCPUS cpuset, its state
> will be reset back to PRS_INVALID_ISOLATED and its effective_xcpus will
> be removed from isolated_cpus before proceeding.
> 
> As CPU hotplug will no longer update isolated_cpus, some of the test
> cases in test_cpuset_prs.h will have to be updated to match the new
> expected results. Some new test cases are also added to confirm that
> "cpuset.cpus.isolated" and HK_TYPE_DOMAIN housekeeping cpumask will
> both be updated.
> 
> Signed-off-by: Waiman Long <[email protected]>
> ---
>  kernel/cgroup/cpuset.c                        | 85 ++++++++++++++++---
>  .../selftests/cgroup/test_cpuset_prs.sh       | 21 +++--
>  2 files changed, 87 insertions(+), 19 deletions(-)
> 
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index c792380f9b60..48b7f275085b 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -159,6 +159,8 @@ static bool force_sd_rebuild;                     /* RWCS 
> */
>   *   2 - partition root without load balancing (isolated)
>   *  -1 - invalid partition root
>   *  -2 - invalid isolated partition root
> + *  -3 - invalid isolated partition root but with effective xcpus still
> + *    in isolated_cpus (set from CPU hotplug side)
>   *
>   *  There are 2 types of partitions - local or remote. Local partitions are
>   *  those whose parents are partition root themselves. Setting of
> @@ -187,6 +189,7 @@ static bool force_sd_rebuild;                     /* RWCS 
> */
>  #define PRS_ISOLATED         2
>  #define PRS_INVALID_ROOT     -1
>  #define PRS_INVALID_ISOLATED -2
> +#define PRS_INVALID_ISOLCPUS -3 /* Effective xcpus still in isolated_cpus */
>  

How about adding a helper?

bool hotplug_invalidate_isolate(struct cpuset *cs)
{
        if (current->flags & PF_KTHREAD) &&
                (cs->partition_root_state == PRS_INVALID_ISOLATED);
}

>  /*
>   * Temporary cpumasks for working with partitions that are passed among
> @@ -382,6 +385,30 @@ static inline bool is_in_v2_mode(void)
>             (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
>  }
>  
> +/*
> + * If the given cpuset has a partition state of PRS_INVALID_ISOLCPUS,
> + * remove its effective_xcpus from isolated_cpus and reset its state to
> + * PRS_INVALID_ISOLATED. Also clear effective_xcpus if exclusive_cpus is
> + * empty.
> + */
> +static void fix_invalid_isolcpus(struct cpuset *cs, struct cpuset *trialcs)
> +{
> +     if (likely(cs->partition_root_state != PRS_INVALID_ISOLCPUS))
> +             return;

        if (likely(cs->partition_root_state != PRS_INVALID_ISOLATED ||
                   cpumask_empty(cs->effective_xcpus)))
                return;

> +     WARN_ON_ONCE(cpumask_empty(cs->effective_xcpus));
> +     spin_lock_irq(&callback_lock);
> +     cpumask_andnot(isolated_cpus, isolated_cpus, cs->effective_xcpus);
> +     if (cpumask_empty(cs->exclusive_cpus))
> +             cpumask_clear(cs->effective_xcpus);
> +     cs->partition_root_state = PRS_INVALID_ISOLATED;
> +     spin_unlock_irq(&callback_lock);
> +     isolated_cpus_updating = true;
> +     if (trialcs) {
> +             trialcs->partition_root_state = PRS_INVALID_ISOLATED;
> +             cpumask_copy(trialcs->effective_xcpus, cs->effective_xcpus);
> +     }
> +}
> +
>  /**
>   * partition_is_populated - check if partition has tasks
>   * @cs: partition root to be checked
> @@ -1160,7 +1187,8 @@ static void reset_partition_data(struct cpuset *cs)
>  
>       lockdep_assert_held(&callback_lock);
>  
> -     if (cpumask_empty(cs->exclusive_cpus)) {
> +     if (cpumask_empty(cs->exclusive_cpus) &&
> +         (cs->partition_root_state != PRS_INVALID_ISOLCPUS)) {

        if (cpumask_empty(cs->exclusive_cpus) &&
            !hotplug_invalidate_isolate(cs))

>               cpumask_clear(cs->effective_xcpus);
>               if (is_cpu_exclusive(cs))
>                       clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
> @@ -1189,6 +1217,10 @@ static void isolated_cpus_update(int old_prs, int 
> new_prs, struct cpumask *xcpus
>                       return;
>               cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
>       }
> +     /*
> +      * Shouldn't update isolated_cpus from CPU hotplug
> +      */
> +     WARN_ON_ONCE(current->flags & PF_KTHREAD);
>       isolated_cpus_updating = true;
>  }
>  
> @@ -1208,7 +1240,6 @@ static void partition_xcpus_add(int new_prs, struct 
> cpuset *parent,
>       if (!parent)
>               parent = &top_cpuset;
>  
> -
>       if (parent == &top_cpuset)
>               cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
>  
> @@ -1224,11 +1255,12 @@ static void partition_xcpus_add(int new_prs, struct 
> cpuset *parent,
>   * @old_prs: old partition_root_state
>   * @parent: parent cpuset
>   * @xcpus: exclusive CPUs to be removed
> + * @no_isolcpus: don't update isolated_cpus
>   *
>   * Remote partition if parent == NULL
>   */
>  static void partition_xcpus_del(int old_prs, struct cpuset *parent,
> -                             struct cpumask *xcpus)
> +                             struct cpumask *xcpus, bool no_isolcpus)

remove.

>  {
>       WARN_ON_ONCE(old_prs < 0);
>       lockdep_assert_held(&callback_lock);
> @@ -1238,7 +1270,7 @@ static void partition_xcpus_del(int old_prs, struct 
> cpuset *parent,
>       if (parent == &top_cpuset)
>               cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
>  
> -     if (old_prs != parent->partition_root_state)
> +     if ((old_prs != parent->partition_root_state) && !no_isolcpus)

        if ((old_prs != parent->partition_root_state) &&
            !hotplug_invalidate_isolate(cs))

>               isolated_cpus_update(old_prs, parent->partition_root_state,
>                                    xcpus);
>  
> @@ -1496,6 +1528,8 @@ static int remote_partition_enable(struct cpuset *cs, 
> int new_prs,
>   */
>  static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
>  {
> +     int old_prs = cs->partition_root_state;
> +
>       WARN_ON_ONCE(!is_remote_partition(cs));
>       /*
>        * When a CPU is offlined, top_cpuset may end up with no available CPUs,
> @@ -1508,14 +1542,24 @@ static void remote_partition_disable(struct cpuset 
> *cs, struct tmpmasks *tmp)
>  
>       spin_lock_irq(&callback_lock);
>       cs->remote_partition = false;
> -     partition_xcpus_del(cs->partition_root_state, NULL, 
> cs->effective_xcpus);
>       if (cs->prs_err)
>               cs->partition_root_state = -cs->partition_root_state;
>       else
>               cs->partition_root_state = PRS_MEMBER;
> +     /*
> +      * Don't update isolated_cpus if calling from CPU hotplug kthread
> +      */
> +     if ((current->flags & PF_KTHREAD) &&
> +         (cs->partition_root_state == PRS_INVALID_ISOLATED))
> +             cs->partition_root_state = PRS_INVALID_ISOLCPUS;
>  

remove.

> -     /* effective_xcpus may need to be changed */
> -     compute_excpus(cs, cs->effective_xcpus);
> +     partition_xcpus_del(old_prs, NULL, cs->effective_xcpus,
> +                         cs->partition_root_state == PRS_INVALID_ISOLCPUS);
> +     /*
> +      * effective_xcpus may need to be changed
> +      */
> +     if (cs->partition_root_state != PRS_INVALID_ISOLCPUS)
> +             compute_excpus(cs, cs->effective_xcpus);
>       reset_partition_data(cs);
>       spin_unlock_irq(&callback_lock);
>       update_isolation_cpumasks();
> @@ -1580,7 +1624,7 @@ static void remote_cpus_update(struct cpuset *cs, 
> struct cpumask *xcpus,
>       if (adding)
>               partition_xcpus_add(prs, NULL, tmp->addmask);
>       if (deleting)
> -             partition_xcpus_del(prs, NULL, tmp->delmask);
> +             partition_xcpus_del(prs, NULL, tmp->delmask, false);
>       /*
>        * Need to update effective_xcpus and exclusive_cpus now as
>        * update_sibling_cpumasks() below may iterate back to the same cs.
> @@ -1893,6 +1937,10 @@ static int update_parent_effective_cpumask(struct 
> cpuset *cs, int cmd,
>                       if (!part_error)
>                               new_prs = -old_prs;
>                       break;
> +             case PRS_INVALID_ISOLCPUS:
> +                     if (!part_error)
> +                             new_prs = PRS_ISOLATED;
> +                     break;

remove

>               }
>       }
>  
> @@ -1923,12 +1971,19 @@ static int update_parent_effective_cpumask(struct 
> cpuset *cs, int cmd,
>       if (old_prs != new_prs)
>               cs->partition_root_state = new_prs;
>  
> +     /*
> +      * Don't update isolated_cpus if calling from CPU hotplug kthread
> +      */
> +     if ((current->flags & PF_KTHREAD) &&
> +         (cs->partition_root_state == PRS_INVALID_ISOLATED))
> +             cs->partition_root_state = PRS_INVALID_ISOLCPUS;

remove

>       /*
>        * Adding to parent's effective_cpus means deletion CPUs from cs
>        * and vice versa.
>        */
>       if (adding)
> -             partition_xcpus_del(old_prs, parent, tmp->addmask);
> +             partition_xcpus_del(old_prs, parent, tmp->addmask,
> +                                 cs->partition_root_state == 
> PRS_INVALID_ISOLCPUS);

remove

>       if (deleting)
>               partition_xcpus_add(new_prs, parent, tmp->delmask);
>  
> @@ -2317,6 +2372,7 @@ static void partition_cpus_change(struct cpuset *cs, 
> struct cpuset *trialcs,
>       if (cs_is_member(cs))
>               return;
>  
> +     fix_invalid_isolcpus(cs, trialcs);
>       prs_err = validate_partition(cs, trialcs);
>       if (prs_err)
>               trialcs->prs_err = cs->prs_err = prs_err;
> @@ -2818,6 +2874,7 @@ static int update_prstate(struct cpuset *cs, int 
> new_prs)
>       if (alloc_tmpmasks(&tmpmask))
>               return -ENOMEM;
>  
> +     fix_invalid_isolcpus(cs, NULL);
>       err = update_partition_exclusive_flag(cs, new_prs);
>       if (err)
>               goto out;
> @@ -3268,6 +3325,7 @@ static int cpuset_partition_show(struct seq_file *seq, 
> void *v)
>               type = "root";
>               fallthrough;
>       case PRS_INVALID_ISOLATED:
> +     case PRS_INVALID_ISOLCPUS:
>               if (!type)
>                       type = "isolated";
>               err = perr_strings[READ_ONCE(cs->prs_err)];
> @@ -3463,9 +3521,9 @@ static void cpuset_css_offline(struct 
> cgroup_subsys_state *css)
>  }
>  
>  /*
> - * If a dying cpuset has the 'cpus.partition' enabled, turn it off by
> - * changing it back to member to free its exclusive CPUs back to the pool to
> - * be used by other online cpusets.
> + * If a dying cpuset has the 'cpus.partition' enabled or is in the
> + * PRS_INVALID_ISOLCPUS state, turn it off by changing it back to member to
> + * free its exclusive CPUs back to the pool to be used by other online 
> cpusets.
>   */
>  static void cpuset_css_killed(struct cgroup_subsys_state *css)
>  {
> @@ -3473,7 +3531,8 @@ static void cpuset_css_killed(struct 
> cgroup_subsys_state *css)
>  
>       cpuset_full_lock();
>       /* Reset valid partition back to member */
> -     if (is_partition_valid(cs))
> +     if (is_partition_valid(cs) ||
> +         (cs->partition_root_state == PRS_INVALID_ISOLCPUS))
        if (is_partition_valid(cs) ||
            (cs->partition_root_state == PRS_INVALID_ISOLATED &&
             !cpumask_empty(cs->effective_xcpus)))
>               update_prstate(cs, PRS_MEMBER);
>       cpuset_full_unlock();
>  }

Can this be simpler?

> diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh 
> b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
> index 5dff3ad53867..380506157f70 100755
> --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
> +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
> @@ -234,6 +234,7 @@ TEST_MATRIX=(
>       "$SETUP_A123_PARTITIONS    .     C2-3    .      .      .     0 
> A1:|A2:2|A3:3 A1:P1|A2:P1|A3:P1"
>  
>       # CPU offlining cases:
> +     # cpuset.cpus.isolated should no longer be updated.
>       "   C0-1     .      .    C2-3    S+    C4-5     .     O2=0   0 
> A1:0-1|B1:3"
>       "C0-3:P1:S+ C2-3:P1 .      .     O2=0    .      .      .     0 
> A1:0-1|A2:3"
>       "C0-3:P1:S+ C2-3:P1 .      .     O2=0   O2=1    .      .     0 
> A1:0-1|A2:2-3"
> @@ -245,8 +246,9 @@ TEST_MATRIX=(
>       "C2-3:P1:S+  C3:P2  .      .     O2=0   O2=1    .      .     0 
> A1:2|A2:3 A1:P1|A2:P2"
>       "C2-3:P1:S+  C3:P1  .      .     O2=0    .      .      .     0 A1:|A2:3 
> A1:P1|A2:P1"
>       "C2-3:P1:S+  C3:P1  .      .     O3=0    .      .      .     0 A1:2|A2: 
> A1:P1|A2:P1"
> -     "C2-3:P1:S+  C3:P1  .      .    T:O2=0   .      .      .     0 
> A1:3|A2:3 A1:P1|A2:P-1"
> -     "C2-3:P1:S+  C3:P1  .      .      .    T:O3=0   .      .     0 
> A1:2|A2:2 A1:P1|A2:P-1"
> +     "C2-3:P1:S+  C3:P2  .      .    T:O2=0   .      .      .     0 
> A1:3|A2:3 A1:P1|A2:P-2"
> +     "C1-3:P1:S+  C3:P2  .      .      .    T:O3=0   .      .     0 
> A1:1-2|A2:1-2|XA2:3 A1:P1|A2:P-2 3"
> +     "C1-3:P1:S+  C3:P2  .      .      .    T:O3=0  O3=1    .     0 
> A1:1-2|A2:3|XA2:3 A1:P1|A2:P2  3"
>       "$SETUP_A123_PARTITIONS    .     O1=0    .      .      .     0 
> A1:|A2:2|A3:3 A1:P1|A2:P1|A3:P1"
>       "$SETUP_A123_PARTITIONS    .     O2=0    .      .      .     0 
> A1:1|A2:|A3:3 A1:P1|A2:P1|A3:P1"
>       "$SETUP_A123_PARTITIONS    .     O3=0    .      .      .     0 
> A1:1|A2:2|A3: A1:P1|A2:P1|A3:P1"
> @@ -299,13 +301,14 @@ TEST_MATRIX=(
>                                                                      
> A1:P0|A2:P2|A3:P-1 2-4"
>  
>       # Remote partition offline tests
> +     # CPU offline shouldn't change 
> cpuset.cpus.{isolated,exclusive.effective}
>       " C0-3:S+ C1-3:S+ C2-3     .    X2-3   X2-3 X2-3:P2:O2=0 .   0 
> A1:0-1|A2:1|A3:3 A1:P0|A3:P2 2-3"
>       " C0-3:S+ C1-3:S+ C2-3     .    X2-3   X2-3 X2-3:P2:O2=0 O2=1 0 
> A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3"
> -     " C0-3:S+ C1-3:S+  C3      .    X2-3   X2-3    P2:O3=0   .   0 
> A1:0-2|A2:1-2|A3: A1:P0|A3:P2 3"
> -     " C0-3:S+ C1-3:S+  C3      .    X2-3   X2-3   T:P2:O3=0  .   0 
> A1:0-2|A2:1-2|A3:1-2 A1:P0|A3:P-2 3|"
> +     " C0-3:S+ C1-3:S+  C3      .    X2-3   X2-3    P2:O3=0   .   0 
> A1:0-2|A2:1-2|A3:|XA3:3 A1:P0|A3:P2 3"
> +     " C0-3:S+ C1-3:S+  C3      .    X2-3   X2-3   T:P2:O3=0  .   0 
> A1:0-2|A2:1-2|A3:1-2|XA3:3 A1:P0|A3:P-2 3"
>  
>       # An invalidated remote partition cannot self-recover from hotplug
> -     " C0-3:S+ C1-3:S+  C2      .    X2-3   X2-3   T:P2:O2=0 O2=1 0 
> A1:0-3|A2:1-3|A3:2 A1:P0|A3:P-2 ."
> +     " C0-3:S+ C1-3:S+  C2      .    X2-3   X2-3   T:P2:O2=0 O2=1 0 
> A1:0-3|A2:1-3|A3:2|XA3:2 A1:P0|A3:P-2 2"
>  
>       # cpus.exclusive.effective clearing test
>       " C0-3:S+ C1-3:S+  C2      .   X2-3:X    .      .      .     0 
> A1:0-3|A2:1-3|A3:2|XA1:"
> @@ -764,7 +767,7 @@ check_cgroup_states()
>  # only CPUs in isolated partitions as well as those that are isolated at
>  # boot time.
>  #
> -# $1 - expected isolated cpu list(s) <isolcpus1>{,<isolcpus2>}
> +# $1 - expected isolated cpu list(s) <isolcpus1>{|<isolcpus2>}
>  # <isolcpus1> - expected sched/domains value
>  # <isolcpus2> - cpuset.cpus.isolated value = <isolcpus1> if not defined
>  #
> @@ -773,6 +776,7 @@ check_isolcpus()
>       EXPECTED_ISOLCPUS=$1
>       ISCPUS=${CGROUP2}/cpuset.cpus.isolated
>       ISOLCPUS=$(cat $ISCPUS)
> +     HKICPUS=$(cat /sys/devices/system/cpu/isolated)
>       LASTISOLCPU=
>       SCHED_DOMAINS=/sys/kernel/debug/sched/domains
>       if [[ $EXPECTED_ISOLCPUS = . ]]
> @@ -810,6 +814,11 @@ check_isolcpus()
>       ISOLCPUS=
>       EXPECTED_ISOLCPUS=$EXPECTED_SDOMAIN
>  
> +     #
> +     # The inverse of HK_TYPE_DOMAIN cpumask in $HKICPUS should match 
> $ISOLCPUS
> +     #
> +     [[ "$ISOLCPUS" != "$HKICPUS" ]] && return 1
> +
>       #
>       # Use the sched domain in debugfs to check isolated CPUs, if available
>       #

-- 
Best regards,
Ridong


Reply via email to