Hi Yury, Thanks for taking a look at this.
On Thu, Jun 26, 2025 at 12:41:08AM +0530, Shrikanth Hegde wrote:
Reference patch for how an architecture can make use of this infra.
This is not meant to be merged. Instead the vp_manual_hint should either
come from hardware or could be derived using steal time.
If you don't add any code that manages the 'avoid' mask on the host
side, all of this becomes dead code.
Ok.
Maybe I can keep this debug file until we get the infra where
the hint is either provided by hardware by means of an hcall or
calculated based on steal time.
I think I will have to polish this a bit and move it to an appropriate place
if this is to be kept.
When the provided hint is less than the total number of CPUs in the system,
it will enable the cpu avoid static key and mark the CPUs numbered at or
above the hint as avoid.
Signed-off-by: Shrikanth Hegde <sshe...@linux.ibm.com>
---
arch/powerpc/include/asm/paravirt.h | 2 ++
arch/powerpc/kernel/smp.c | 50 +++++++++++++++++++++++++++++
2 files changed, 52 insertions(+)
diff --git a/arch/powerpc/include/asm/paravirt.h
b/arch/powerpc/include/asm/paravirt.h
index b78b82d66057..b6497e0b60d8 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -10,6 +10,8 @@
#include <asm/hvcall.h>
#endif
+DECLARE_STATIC_KEY_FALSE(paravirt_cpu_avoid_enabled);
+
#ifdef CONFIG_PPC_SPLPAR
#include <linux/smp.h>
#include <asm/kvm_guest.h>
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5ac7084eebc0..e00cdc4de441 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -64,6 +64,7 @@
#include <asm/systemcfg.h>
#include <trace/events/ipi.h>
+#include <linux/debugfs.h>
#ifdef DEBUG
#include <asm/udbg.h>
@@ -82,6 +83,7 @@ bool has_big_cores __ro_after_init;
bool coregroup_enabled __ro_after_init;
bool thread_group_shares_l2 __ro_after_init;
bool thread_group_shares_l3 __ro_after_init;
+static int vp_manual_hint = NR_CPUS;
DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
@@ -1727,6 +1729,7 @@ static void __init build_sched_topology(void)
BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
set_sched_topology(powerpc_topology);
+ vp_manual_hint = num_present_cpus();
}
void __init smp_cpus_done(unsigned int max_cpus)
@@ -1807,4 +1810,51 @@ void __noreturn arch_cpu_idle_dead(void)
start_secondary_resume();
}
+/*
+ * debugfs hint to mark CPUs as Avoid. This will help in restricting
+ * the workload to the specified number of CPUs.
+ * For example, writing 40 to vp_manual_hint means the workload will
+ * run on CPUs 0-39.
+ */
+
+static int pv_vp_manual_hint_set(void *data, u64 val)
+{
+ int cpu;
+
+ if (val == 0 || vp_manual_hint > num_present_cpus())
This should be
if (val == 0 || val > num_present_cpus())
+ vp_manual_hint = num_present_cpus();
+
+ if (val != vp_manual_hint)
+ vp_manual_hint = val;
This all is effectively just:
vp_manual_hint = val;
Isn't it?
Yes, with some checks for sane values.
+ if (vp_manual_hint < num_present_cpus())
+ static_branch_enable(&paravirt_cpu_avoid_enabled);
+ else
+ static_branch_disable(&paravirt_cpu_avoid_enabled);
+
+ for_each_present_cpu(cpu) {
+ if (cpu >= vp_manual_hint)
+ set_cpu_avoid(cpu, true);
+ else
+ set_cpu_avoid(cpu, false);
+ }
+ return 0;
+}
+
+static int pv_vp_manual_hint_get(void *data, u64 *val)
+{
+ *val = vp_manual_hint;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_pv_vp_manual_hint, pv_vp_manual_hint_get,
pv_vp_manual_hint_set, "%llu\n");
+
+static __init int paravirt_debugfs_init(void)
+{
+ if (is_shared_processor())
+ debugfs_create_file("vp_manual_hint", 0600, arch_debugfs_dir, NULL,
&fops_pv_vp_manual_hint);
+ return 0;
+}
+
+device_initcall(paravirt_debugfs_init)
#endif
--
2.43.0