date:20250704

[PATCH v6 02/18] xen/xsm: wrap around xsm_sysctl with CONFIG_SYSCTL

2025-07-04 Thread Penny Zheng

As function xsm_sysctl() is solely invoked in sysctl.c, we need to
wrap around it with CONFIG_SYSCTL

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
 xen/include/xsm/xsm.h | 4 
 xen/xsm/dummy.c   | 2 ++
 xen/xsm/flask/hooks.c | 4 
 3 files changed, 10 insertions(+)

diff --git a/xen/include/xsm/xsm.h b/xen/include/xsm/xsm.h
index 24acc16125..22e2429f52 100644
--- a/xen/include/xsm/xsm.h
+++ b/xen/include/xsm/xsm.h
@@ -261,7 +261,11 @@ static inline int xsm_domctl(xsm_default_t def, struct 
domain *d,
 
 static inline int xsm_sysctl(xsm_default_t def, int cmd)
 {
+#ifdef CONFIG_SYSCTL
 return alternative_call(xsm_ops.sysctl, cmd);
+#else
+return -EOPNOTSUPP;
+#endif
 }
 
 static inline int xsm_readconsole(xsm_default_t def, uint32_t clear)
diff --git a/xen/xsm/dummy.c b/xen/xsm/dummy.c
index 93fbfc43cc..93a0665ecc 100644
--- a/xen/xsm/dummy.c
+++ b/xen/xsm/dummy.c
@@ -22,7 +22,9 @@ static const struct xsm_ops __initconst_cf_clobber dummy_ops 
= {
 .sysctl_scheduler_op   = xsm_sysctl_scheduler_op,
 .set_target= xsm_set_target,
 .domctl= xsm_domctl,
+#ifdef CONFIG_SYSCTL
 .sysctl= xsm_sysctl,
+#endif
 .readconsole   = xsm_readconsole,
 
 .evtchn_unbound= xsm_evtchn_unbound,
diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
index 6a53487ea4..3a43e5a1d6 100644
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -856,6 +856,7 @@ static int cf_check flask_domctl(struct domain *d, unsigned 
int cmd,
 }
 }
 
+#ifdef CONFIG_SYSCTL
 static int cf_check flask_sysctl(int cmd)
 {
 switch ( cmd )
@@ -933,6 +934,7 @@ static int cf_check flask_sysctl(int cmd)
 return avc_unknown_permission("sysctl", cmd);
 }
 }
+#endif /* CONFIG_SYSCTL */
 
 static int cf_check flask_readconsole(uint32_t clear)
 {
@@ -1884,7 +1886,9 @@ static const struct xsm_ops __initconst_cf_clobber 
flask_ops = {
 .sysctl_scheduler_op = flask_sysctl_scheduler_op,
 .set_target = flask_set_target,
 .domctl = flask_domctl,
+#ifdef CONFIG_SYSCTL
 .sysctl = flask_sysctl,
+#endif
 .readconsole = flask_readconsole,
 
 .evtchn_unbound = flask_evtchn_unbound,
-- 
2.34.1

[PATCH v6 01/18] xen: introduce CONFIG_SYSCTL

2025-07-04 Thread Penny Zheng

From: Stefano Stabellini 

We introduce a new Kconfig CONFIG_SYSCTL, which shall only be disabled
on some dom0less systems or PV shim on x86, to reduce Xen footprint.

Making SYSCTL without prompt is transient and it will be adjusted in the final
patch. And the consequence of introducing "CONFIG_SYSCTL=y" in .config file
generated from pvshim_defconfig is transient too, which will also be adjusted
in the final patch.

Signed-off-by: Stefano Stabellini 
Signed-off-by: Sergiy Kibrik 
Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
v2 -> v3:
- remove "intend to" in commit message
---
v3 -> v4:
- introduce the option without prompt (simply "defbool y") to eliminate the
need for transient "#ifdef CONFIG_SYSCTL"
- calling out the transient scenario in commit message
---
v4 -> v5:
- a simple definition "def_bool y" for CONFIG_SYSCTL is enough here
- refactor commit message
- blank line ahead of endmenu
---
 xen/common/Kconfig | 8 
 1 file changed, 8 insertions(+)

diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index 867710134a..fb5c60517a 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -611,4 +611,12 @@ config SYSTEM_SUSPEND
 
  If unsure, say N.
 
+menu "Supported hypercall interfaces"
+   visible if EXPERT
+
+config SYSCTL
+   def_bool y
+
+endmenu
+
 endmenu
-- 
2.34.1

[PATCH v6 04/18] xen/sysctl: make CONFIG_TRACEBUFFER depend on CONFIG_SYSCTL

2025-07-04 Thread Penny Zheng

Users could only access trace buffers via hypercall XEN_SYSCTL_tbuf_op,
so we shall make CONFIG_TRACEBUFFER depend on CONFIG_SYSCTL

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
 xen/common/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index fb5c60517a..dbd9747d1f 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -564,6 +564,7 @@ config DTB_FILE
 config TRACEBUFFER
bool "Enable tracing infrastructure" if EXPERT
default y
+   depends on SYSCTL
help
  Enable tracing infrastructure and pre-defined tracepoints within Xen.
  This will allow live information about Xen's execution and performance
-- 
2.34.1

[PATCH v6 10/18] xen/sysctl: wrap around XEN_SYSCTL_page_offline_op

2025-07-04 Thread Penny Zheng

The following functions are only to deal with XEN_SYSCTL_page_offline_op,
then shall be wrapped:
- xsm_page_offline()
- online_page()
- query_page_offline()

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
v1 -> v2:
- add transient #ifdef in sysctl.c for correct compilation
- no need to wrap declarations
- place the #ifdef inside the function body to have less redundancy
---
v3 -> v4:
- remove transient "#ifdef CONFIG_SYSCTL"
---
 xen/common/page_alloc.c | 2 ++
 xen/include/xsm/xsm.h   | 6 ++
 xen/xsm/dummy.c | 2 ++
 xen/xsm/flask/hooks.c   | 6 ++
 4 files changed, 16 insertions(+)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index b204f22f50..dec4cb2ab1 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1779,6 +1779,7 @@ int offline_page(mfn_t mfn, int broken, uint32_t *status)
 return 0;
 }
 
+#ifdef CONFIG_SYSCTL
 /*
  * Online the memory.
  *   The caller should make sure end_pfn <= max_page,
@@ -1863,6 +1864,7 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
 
 return 0;
 }
+#endif /* CONFIG_SYSCTL */
 
 /*
  * This function should only be called with valid pages from the same NUMA
diff --git a/xen/include/xsm/xsm.h b/xen/include/xsm/xsm.h
index 042a99449f..5ac99904c4 100644
--- a/xen/include/xsm/xsm.h
+++ b/xen/include/xsm/xsm.h
@@ -138,7 +138,9 @@ struct xsm_ops {
 int (*resource_setup_gsi)(int gsi);
 int (*resource_setup_misc)(void);
 
+#ifdef CONFIG_SYSCTL
 int (*page_offline)(uint32_t cmd);
+#endif
 int (*hypfs_op)(void);
 
 long (*do_xsm_op)(XEN_GUEST_HANDLE_PARAM(void) op);
@@ -597,7 +599,11 @@ static inline int xsm_resource_setup_misc(xsm_default_t 
def)
 
 static inline int xsm_page_offline(xsm_default_t def, uint32_t cmd)
 {
+#ifdef CONFIG_SYSCTL
 return alternative_call(xsm_ops.page_offline, cmd);
+#else
+return -EOPNOTSUPP;
+#endif
 }
 
 static inline int xsm_hypfs_op(xsm_default_t def)
diff --git a/xen/xsm/dummy.c b/xen/xsm/dummy.c
index cd0e844fcf..d46413ad8c 100644
--- a/xen/xsm/dummy.c
+++ b/xen/xsm/dummy.c
@@ -96,7 +96,9 @@ static const struct xsm_ops __initconst_cf_clobber dummy_ops 
= {
 .resource_setup_gsi= xsm_resource_setup_gsi,
 .resource_setup_misc   = xsm_resource_setup_misc,
 
+#ifdef CONFIG_SYSCTL
 .page_offline  = xsm_page_offline,
+#endif
 .hypfs_op  = xsm_hypfs_op,
 .hvm_param = xsm_hvm_param,
 .hvm_param_altp2mhvm   = xsm_hvm_param_altp2mhvm,
diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
index df7e10775b..45c12aa662 100644
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -1206,10 +1206,12 @@ static int cf_check flask_resource_unplug_core(void)
 return avc_current_has_perm(SECINITSID_DOMXEN, SECCLASS_RESOURCE, 
RESOURCE__UNPLUG, NULL);
 }
 
+#ifdef CONFIG_SYSCTL
 static int flask_resource_use_core(void)
 {
 return avc_current_has_perm(SECINITSID_DOMXEN, SECCLASS_RESOURCE, 
RESOURCE__USE, NULL);
 }
+#endif /* CONFIG_SYSCTL */
 
 static int cf_check flask_resource_plug_pci(uint32_t machine_bdf)
 {
@@ -1274,6 +1276,7 @@ static int cf_check flask_resource_setup_misc(void)
 return avc_current_has_perm(SECINITSID_XEN, SECCLASS_RESOURCE, 
RESOURCE__SETUP, NULL);
 }
 
+#ifdef CONFIG_SYSCTL
 static inline int cf_check flask_page_offline(uint32_t cmd)
 {
 switch ( cmd )
@@ -1288,6 +1291,7 @@ static inline int cf_check flask_page_offline(uint32_t 
cmd)
 return avc_unknown_permission("page_offline", cmd);
 }
 }
+#endif /* CONFIG_SYSCTL */
 
 static inline int cf_check flask_hypfs_op(void)
 {
@@ -1948,7 +1952,9 @@ static const struct xsm_ops __initconst_cf_clobber 
flask_ops = {
 .resource_setup_gsi = flask_resource_setup_gsi,
 .resource_setup_misc = flask_resource_setup_misc,
 
+#ifdef CONFIG_SYSCTL
 .page_offline = flask_page_offline,
+#endif
 .hypfs_op = flask_hypfs_op,
 .hvm_param = flask_hvm_param,
 .hvm_param_altp2mhvm = flask_hvm_param_altp2mhvm,
-- 
2.34.1

Re: [RFC PATCH] xen/flask: estimate max sidtable size

2025-07-04 Thread Jan Beulich

On 04.07.2025 12:10, Sergiy Kibrik wrote:
> 01.07.25 13:42, Jan Beulich:
>> On 30.06.2025 10:55, Sergiy Kibrik wrote:
>>> @@ -54,4 +54,7 @@ $(obj)/policy.bin: FORCE
>>> FLASK_BUILD_DIR=$(FLASK_BUILD_DIR) POLICY_FILENAME=$(POLICY_SRC)
>>> cmp -s $(POLICY_SRC) $@ || cp $(POLICY_SRC) $@
>>>   
>>> +$(obj)/%/se_limits.h: $(obj)/policy.bin
>>> +   $(srcdir)/policy/mkselim.sh $^ $@
>>
>> Hmm, that's using the built-in policy, isn't it? What if later another
>> policy is loaded? Wouldn't it be possible to have ...
>>
>>> --- a/xen/xsm/flask/ss/sidtab.c
>>> +++ b/xen/xsm/flask/ss/sidtab.c
>>> @@ -13,6 +13,7 @@
>>>   #include "flask.h"
>>>   #include "security.h"
>>>   #include "sidtab.h"
>>> +#include "se_limits.h"
>>>   
>>>   #define SIDTAB_HASH(sid) ((sid) & SIDTAB_HASH_MASK)
>>>   
>>> @@ -228,7 +229,7 @@ int sidtab_context_to_sid(struct sidtab *s, struct 
>>> context *context,
>>>   if ( sid )
>>>   goto unlock_out;
>>>   /* No SID exists for the context.  Allocate a new one. */
>>> -if ( s->next_sid == UINT_MAX || s->shutdown )
>>> +if ( s->next_sid == SEPOL_SID_LIMIT || s->shutdown )
>>
>> ... more than this many SIDs? What if CONFIG_XSM_FLASK_POLICY isn't even set?
>>
> 
> It's using a policy from tools/flask/policy, yes. But not a built-in 
> policy, just reusing a bit of code from that code. The idea is that we 
> can have CONFIG_XSM_FLASK_POLICY option disabled yet still be able to 
> calculate SEPOL_SID_LIMIT.
> 
> As for loading another policy at runtime -- the calculated 
> SEPOL_SID_LIMIT=384 for current master flask policy is still pretty big 
> limit. From what I can see -- much less No. contexts are being used on a 
> running system, because most of calculated combinations of 
> user/role/type are not really usable (e.g. contexts with xen_t or 
> xenboot_t types and user_1 user are not expected etc). So there should 
> be enough room even for more complex custom policies.

But still there could be odd ones. Imo such a static limit can then only be
introduced via Kconfig option.

Jan

Re: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC sub-cmd

2025-07-04 Thread Jan Beulich

On 04.07.2025 10:56, Penny, Zheng wrote:
> [Public]
> 
>> -Original Message-
>> From: Jan Beulich 
>> Sent: Friday, July 4, 2025 4:33 PM
>> To: Penny, Zheng ; Andryuk, Jason
>> 
>> Cc: Huang, Ray ; Anthony PERARD
>> ; Juergen Gross ; Andrew
>> Cooper ; Orzel, Michal ;
>> Julien Grall ; Roger Pau Monné ; 
>> Stefano
>> Stabellini ; xen-devel@lists.xenproject.org
>> Subject: Re: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC sub-
>> cmd
>>
>> On 04.07.2025 10:13, Penny, Zheng wrote:
>>> [Public]
>>>
 -Original Message-
 From: Jan Beulich 
 Sent: Tuesday, June 17, 2025 6:08 PM
 To: Penny, Zheng 
 Cc: Huang, Ray ; Anthony PERARD
 ; Juergen Gross ; Andrew
 Cooper ; Orzel, Michal
 ; Julien Grall ; Roger Pau
 Monné ; Stefano Stabellini
 ; xen-devel@lists.xenproject.org
 Subject: Re: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC
 sub- cmd

 On 27.05.2025 10:48, Penny Zheng wrote:
> --- a/tools/misc/xenpm.c
> +++ b/tools/misc/xenpm.c
> @@ -69,6 +69,7 @@ void show_help(void)
>  " set-max-cstate|'unlimited' 
> [|'unlimited']\n"
>  " set the C-State 
> limitation ( >= 0)
>> and\n"
>  " optionally the 
> C-sub-state limitation
 ( >= 0)\n"
> +" get-cpufreq-cppc  [cpuid]   list cpu cppc 
> parameter of CPU
  or all\n"
>  " set-cpufreq-cppc  [cpuid] 
> [balance|performance|powersave]
 *\n"
>  " set Hardware P-State 
> (HWP) parameters\n"
>  " on CPU  or all 
> if omitted.\n"
> @@ -812,33 +813,7 @@ static void print_cpufreq_para(int cpuid,
> struct xc_get_cpufreq_para *p_cpufreq)
>
>  printf("scaling_driver   : %s\n", p_cpufreq->scaling_driver);
>
> -if ( hwp )
> -{
> -const xc_cppc_para_t *cppc = &p_cpufreq->u.cppc_para;
> -
> -printf("cppc variables   :\n");
> -printf("  hardware limits: lowest [%"PRIu32"] lowest 
> nonlinear
 [%"PRIu32"]\n",
> -   cppc->lowest, cppc->lowest_nonlinear);
> -printf(" : nominal [%"PRIu32"] highest 
> [%"PRIu32"]\n",
> -   cppc->nominal, cppc->highest);
> -printf("  configured limits  : min [%"PRIu32"] max [%"PRIu32"] 
> energy
>> perf
 [%"PRIu32"]\n",
> -   cppc->minimum, cppc->maximum, cppc->energy_perf);
> -
> -if ( cppc->features & XEN_SYSCTL_CPPC_FEAT_ACT_WINDOW )
> -{
> -unsigned int activity_window;
> -const char *units;
> -
> -activity_window = calculate_activity_window(cppc, &units);
> -printf(" : activity_window [%"PRIu32" 
> %s]\n",
> -   activity_window, units);
> -}
> -
> -printf(" : desired [%"PRIu32"%s]\n",
> -   cppc->desired,
> -   cppc->desired ? "" : " hw autonomous");
> -}
> -else
> +if ( !hwp )
>  {
>  if ( p_cpufreq->gov_num )
>  printf("scaling_avail_gov: %s\n",

 I'm not sure it is a good idea to alter what is being output for 
 get-cpufreq-para.
 People may simply miss that output then, without having any
 indication where it went.
>>>
>>> Hwp is more like amd-cppc in active mode. It also does not rely on Xen
>>> governor to do performance tuning, so in previous design, we could borrow
>> governor filed to output cppc info However after introducing amd-cppc passive
>> mode, we have request to output both governor and CPPC info. And if 
>> continuing
>> expanding get-cpufreq-para to include CPPC info, it will make the parent 
>> stuct
>> xen_sysctl.u exceed exceed 128 bytes.
>>
>> How is the xenpm command "get-cpufreq-para" related to the sysctl interface 
>> struct
>> size? If you need to invoke two sysctl sub-ops to produce all relevant 
>> "get-cpufreq-
>> para" output, so be it I would say.
>>
> 
> Because we are limiting each sysctl-subcmd-struct, such as " struct 
> xen_sysctl_pm_op ", 128 bytes in "struct xen_sysctl"，They are all combined as 
> a union.
> Also, in "struct xen_sysctl_pm_op", its descending sub-op structs, including 
> "struct xen_get_cpufreq_para", are all combined as union too
> ```
> struct xen_sysctl_pm_op {
> ... ...
> union {
> struct xen_get_cpufreq_para get_para;
> struct xen_set_cpufreq_gov  set_gov;
> struct xen_set_cpufreq_para set_para;
> struct xen_set_cppc_paraset_cppc;
> uint64_aligned_t get_avgfreq;
> uint32_tset_sched_opt_smt;
> #define XEN

[PATCH v6 18/18] xen/sysctl: wrap around sysctl hypercall

2025-07-04 Thread Penny Zheng

From: Stefano Stabellini 

Wrap sysctl hypercall def and sysctl.o with CONFIG_SYSCTL, and since
PV_SHIM_EXCLUSIVE needs sorting in the future, we move them out of
PV_SHIM_EXCLUSIVE condition at the same time.

We need to make SYSCTL with prompt back, add help info and set default
value as y. We shall at least provide "# CONFIG_SYSCTL is not set" in preset
configs for PV shim on x86.

Signed-off-by: Stefano Stabellini 
Signed-off-by: Sergiy Kibrik 
Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
v1 -> v2:
- remove all transient "#ifdef CONFIG_SYSCTL"-s in sysctl.c
---
v2 -> v3:
- move out of CONFIG_PV_SHIM_EXCLUSIVE condition
---
v3 -> v4:
- make SYSCTL with prompt
- state unsetting SYSCTL in pvshim_defconfig
---
v4 -> v5:
- adapt to the new changes in commit "xen: introduce CONFIG_SYSCTL"
- expand help info also for PV shim
- refactor commit message
---
 xen/arch/x86/configs/pvshim_defconfig | 1 +
 xen/common/Kconfig| 6 +-
 xen/common/Makefile   | 2 +-
 xen/include/hypercall-defs.c  | 8 ++--
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/configs/pvshim_defconfig 
b/xen/arch/x86/configs/pvshim_defconfig
index 9dc91c33e3..aab5940e62 100644
--- a/xen/arch/x86/configs/pvshim_defconfig
+++ b/xen/arch/x86/configs/pvshim_defconfig
@@ -28,3 +28,4 @@ CONFIG_EXPERT=y
 # CONFIG_GDBSX is not set
 # CONFIG_PM_OP is not set
 # CONFIG_PM_STATS is not set
+# CONFIG_SYSCTL is not set
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index 65f07289dd..64865112a1 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -616,7 +616,11 @@ menu "Supported hypercall interfaces"
visible if EXPERT
 
 config SYSCTL
-   def_bool y
+   bool "Enable sysctl hypercall"
+   default y
+   help
+ This option shall only be disabled on some dom0less systems, or
+ PV shim on x86, to reduce Xen footprint.
 
 endmenu
 
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 98f0873056..15ab048244 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -49,6 +49,7 @@ obj-y += spinlock.o
 obj-$(CONFIG_STACK_PROTECTOR) += stack-protector.o
 obj-y += stop_machine.o
 obj-y += symbols.o
+obj-$(CONFIG_SYSCTL) += sysctl.o
 obj-y += tasklet.o
 obj-y += time.o
 obj-y += timer.o
@@ -70,7 +71,6 @@ obj-$(CONFIG_COMPAT) += $(addprefix compat/,domain.o memory.o 
multicall.o xlat.o
 ifneq ($(CONFIG_PV_SHIM_EXCLUSIVE),y)
 obj-y += domctl.o
 obj-$(CONFIG_VM_EVENT) += monitor.o
-obj-y += sysctl.o
 endif
 
 extra-y := symbols-dummy.o
diff --git a/xen/include/hypercall-defs.c b/xen/include/hypercall-defs.c
index 7720a29ade..c1081d87a2 100644
--- a/xen/include/hypercall-defs.c
+++ b/xen/include/hypercall-defs.c
@@ -194,8 +194,10 @@ kexec_op(unsigned long op, void *uarg)
 #ifdef CONFIG_IOREQ_SERVER
 dm_op(domid_t domid, unsigned int nr_bufs, xen_dm_op_buf_t *bufs)
 #endif
-#ifndef CONFIG_PV_SHIM_EXCLUSIVE
+#ifdef CONFIG_SYSCTL
 sysctl(xen_sysctl_t *u_sysctl)
+#endif
+#ifndef CONFIG_PV_SHIM_EXCLUSIVE
 domctl(xen_domctl_t *u_domctl)
 paging_domctl_cont(xen_domctl_t *u_domctl)
 platform_op(xen_platform_op_t *u_xenpf_op)
@@ -273,8 +275,10 @@ physdev_op compat   do   hvm   
   hvm  do_arm
 #ifdef CONFIG_HVM
 hvm_op do   do   do   do   do
 #endif
-#ifndef CONFIG_PV_SHIM_EXCLUSIVE
+#ifdef CONFIG_SYSCTL
 sysctl do   do   do   do   do
+#endif
+#ifndef CONFIG_PV_SHIM_EXCLUSIVE
 domctl do   do   do   do   do
 #endif
 #ifdef CONFIG_KEXEC
-- 
2.34.1

[PATCH v6 17/18] xen/x86: remove "depends on !PV_SHIM_EXCLUSIVE"

2025-07-04 Thread Penny Zheng

Remove all "depends on !PV_SHIM_EXCLUSIVE" (also the functionally
equivalent "if !...") in Kconfig file, since negative dependancy will badly
affect allyesconfig. Although "if !PV_SHIM_EXCLUSIVE" for CONFIG_VGA is not
truly a dependency, setting PV_SHIM_EXCLUSIVE y still makes it unconfigurable.
So we remove it here too

Add "#CONFIG_xxx is not set" for above options in presets for x86 PV shim, as
the explicit declaration is to ephasize setting for the shim is different
from the general default.

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
This commit shall be commited together with the next one, which is also the
last commit.
---
v2 -> v3:
- remove comment for PV_SHIM_EXCLUSIVE
---
v3 -> v4:
- explicitly state "CONFIG_xxx is not set" in "pvshim_defconfig"
- Add "default y" for SHADOW_PAGING and TBOOT
- refactor commit message
---
v4 -> v5:
- For not breaking allyesconfig, changes to defaults are actually not needed.
So remove them all
- Leave one blank lines
---
v5 -> v6
- add description for change: removing "if !PV_SHIM_EXCLUSIVE" for CONFIG_VGA
---
 xen/arch/x86/Kconfig  | 4 
 xen/arch/x86/hvm/Kconfig  | 1 -
 xen/drivers/video/Kconfig | 2 +-
 3 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
index 752d5141bb..b45f17a16e 100644
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -289,8 +289,6 @@ config PV_SHIM_EXCLUSIVE
 
  If unsure, say N.
 
-if !PV_SHIM_EXCLUSIVE
-
 config HYPERV_GUEST
bool "Hyper-V Guest"
select GUEST
@@ -299,8 +297,6 @@ config HYPERV_GUEST
 
  If unsure, say N.
 
-endif
-
 config REQUIRE_NX
bool "Require NX (No eXecute) support"
help
diff --git a/xen/arch/x86/hvm/Kconfig b/xen/arch/x86/hvm/Kconfig
index 2def0f98e2..b903764bda 100644
--- a/xen/arch/x86/hvm/Kconfig
+++ b/xen/arch/x86/hvm/Kconfig
@@ -1,6 +1,5 @@
 menuconfig HVM
bool "HVM support"
-   depends on !PV_SHIM_EXCLUSIVE
default !PV_SHIM
select COMPAT
select IOREQ_SERVER
diff --git a/xen/drivers/video/Kconfig b/xen/drivers/video/Kconfig
index 245030beea..0a51e87eb2 100644
--- a/xen/drivers/video/Kconfig
+++ b/xen/drivers/video/Kconfig
@@ -3,7 +3,7 @@ config VIDEO
bool
 
 config VGA
-   bool "VGA support" if !PV_SHIM_EXCLUSIVE
+   bool "VGA support"
select VIDEO
depends on X86
default y if !PV_SHIM_EXCLUSIVE
-- 
2.34.1

Re: [PATCH v4 59/65] accel: Always register AccelOpsClass::get_virtual_clock() handler

2025-07-04 Thread Mads Ynddal



> On 2 Jul 2025, at 20.53, Philippe Mathieu-Daudé  wrote:
> 
> In order to dispatch over AccelOpsClass::get_virtual_clock(),
> we need it always defined, not calling a hidden handler under
> the hood. Make AccelOpsClass::get_virtual_clock() mandatory.
> Register the default cpus_kick_thread() for each accelerator.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
> include/system/accel-ops.h| 2 ++
> accel/hvf/hvf-accel-ops.c | 1 +
> accel/kvm/kvm-accel-ops.c | 1 +
> accel/tcg/tcg-accel-ops.c | 2 ++
> accel/xen/xen-all.c   | 1 +
> system/cpus.c | 7 ---
> target/i386/nvmm/nvmm-accel-ops.c | 1 +
> target/i386/whpx/whpx-accel-ops.c | 1 +
> 8 files changed, 13 insertions(+), 3 deletions(-)

Reviewed-by: Mads Ynddal

Re: [PATCH v4 58/65] accel: Always register AccelOpsClass::get_elapsed_ticks() handler

2025-07-04 Thread Mads Ynddal



> On 2 Jul 2025, at 20.53, Philippe Mathieu-Daudé  wrote:
> 
> In order to dispatch over AccelOpsClass::get_elapsed_ticks(),
> we need it always defined, not calling a hidden handler under
> the hood. Make AccelOpsClass::get_elapsed_ticks() mandatory.
> Register the default cpus_kick_thread() for each accelerator.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
> include/system/accel-ops.h| 1 +
> accel/hvf/hvf-accel-ops.c | 2 ++
> accel/kvm/kvm-accel-ops.c | 3 +++
> accel/qtest/qtest.c   | 2 ++
> accel/tcg/tcg-accel-ops.c | 3 +++
> accel/xen/xen-all.c   | 2 ++
> system/cpus.c | 6 ++
> target/i386/nvmm/nvmm-accel-ops.c | 3 +++
> target/i386/whpx/whpx-accel-ops.c | 3 +++
> 9 files changed, 21 insertions(+), 4 deletions(-)
> 

Reviewed-by: Mads Ynddal

Re: [RFC PATCH] xen/flask: estimate max sidtable size

2025-07-04 Thread Sergiy Kibrik


01.07.25 13:42, Jan Beulich:

On 30.06.2025 10:55, Sergiy Kibrik wrote:

Currently Xen lacks a defined largest number of security IDs it can potentially
use. The number of SIDs are naturally limited by number of security contexts
provided by a given security policy, i.e. how many combination of user, role
and type there can be, and is dependant on the policy being used.
Thus in Xen the number of allocated entries in sidtable is hard-limited by 
UINT_MAX.
However in the embedded environment configured for safety it is desirable to
avoid guest-triggered dynamic memory allocations at runtime, or at least limit
them to some decent amounts. So we seek to estimate this limit.

This patch suggests one way to do it using Xen's flask policy.
List of users, roles and types is read from binary policy using setools utils,
then it is used to count the No. of combinations these values can give.
This No. of combinations then can be used in code as a practical replacement
of UINT_MAX limit. Also it can be used later to pre-allocate sidtable at boot
and avoid runtime entries allocation altogether.

Signed-off-by: Sergiy Kibrik 
---
This RFC presents a concept of estimating a max possible sidtable size.
Can we discuss how valid this concept is? Currently it yields 420 as max SID,
is it a reasonable number?


As this is policy dependent - what policy did you use to obtain that 420?


it's actually from my custom extended policy, for policy in Xen master 
branch it's 384.






@@ -54,4 +54,7 @@ $(obj)/policy.bin: FORCE
FLASK_BUILD_DIR=$(FLASK_BUILD_DIR) POLICY_FILENAME=$(POLICY_SRC)
cmp -s $(POLICY_SRC) $@ || cp $(POLICY_SRC) $@
  
+$(obj)/%/se_limits.h: $(obj)/policy.bin

+   $(srcdir)/policy/mkselim.sh $^ $@


Hmm, that's using the built-in policy, isn't it? What if later another
policy is loaded? Wouldn't it be possible to have ...


--- a/xen/xsm/flask/ss/sidtab.c
+++ b/xen/xsm/flask/ss/sidtab.c
@@ -13,6 +13,7 @@
  #include "flask.h"
  #include "security.h"
  #include "sidtab.h"
+#include "se_limits.h"
  
  #define SIDTAB_HASH(sid) ((sid) & SIDTAB_HASH_MASK)
  
@@ -228,7 +229,7 @@ int sidtab_context_to_sid(struct sidtab *s, struct context *context,

  if ( sid )
  goto unlock_out;
  /* No SID exists for the context.  Allocate a new one. */
-if ( s->next_sid == UINT_MAX || s->shutdown )
+if ( s->next_sid == SEPOL_SID_LIMIT || s->shutdown )


... more than this many SIDs? What if CONFIG_XSM_FLASK_POLICY isn't even set?



It's using a policy from tools/flask/policy, yes. But not a built-in 
policy, just reusing a bit of code from that code. The idea is that we 
can have CONFIG_XSM_FLASK_POLICY option disabled yet still be able to 
calculate SEPOL_SID_LIMIT.


As for loading another policy at runtime -- the calculated 
SEPOL_SID_LIMIT=384 for current master flask policy is still pretty big 
limit. From what I can see -- much less No. contexts are being used on a 
running system, because most of calculated combinations of 
user/role/type are not really usable (e.g. contexts with xen_t or 
xenboot_t types and user_1 user are not expected etc). So there should 
be enough room even for more complex custom policies.



It also doesn't really become clear to me how you avoid or even (meaningfully)
bound memory allocation here. A table of several hundred entries is still a
decent size. If you really knew the max size up front, why couldn't the table
be allocated statically. (Sadly the table allocation isn't in context, as you
don't even touch that code, wherever it lives.)



As said before, this limit is crude and still far from the number of 
actually usable contexts. So allocating this memory beforehand can be 
kind of wasteful, as most of it will probably never be used.


  -Sergiy

Re: [PATCH 5/6] x86/idle: Drop incorrect smp_mb() in mwait_idle_with_hints()

2025-07-04 Thread Andrew Cooper

On 03/07/2025 2:30 pm, Jan Beulich wrote:
> On 03.07.2025 14:37, Andrew Cooper wrote:
>> On 03/07/2025 10:24 am, Jan Beulich wrote:
>>> On 02.07.2025 16:41, Andrew Cooper wrote:
 With the recent simplifications, it becomes obvious that smp_mb() isn't the
 right barrier; all we need is a compiler barrier.

 Include this in monitor() itself, along with an explantion.
>>> Ah, here we go. As per my comment on patch 4, would this perhaps better move
>>> ahead (which however would require a bit of an adjustment to the 
>>> description)?
>> As said, it's not necessary in practice.
> As said where? All you say here is that a memory barrier is needed. Perhaps
> my use of "ahead" was ambiguous? I meant "move ahead in the series", not
> "move ahead in code". Apart from this as a possibility I fear I don't
> understand.

I did take it to mean "ahead in the series".

Your comment in patch 4 talks about alternative(), alternative_io() and
barriers.  I stated (admittedly without reference) that the barrier
between two alternatives() doesn't matter because of their volatileness.

It can move in the series, because it is genuinely independent and
unrelated to patch 4 AFAICT.

>> Nothing good can come of having any loads hoisted above MWAIT, but from
>> a programmers point of view, it's indistinguishable from e.g. taking an
>> SMI.  If there's a correctness issue, it's not MWAIT's fault.
> Well, yes, but what in the code is it that tells the compiler not to? Up
> to and including LTO, should we ever get that to work again. This
> specifically may be why mwait() may need to gain one, despite not itself
> dealing with any memory (operands).

In practice, mwait() is surrounded by spec_ctrl_{enter,exit}_idle() and
nothing is crossing those.  I'm going to leave the mwait side of things
alone for now.

But even with LTO, if hoisting a read causes a problem, that's a bug in
whatever got hoisted, not in MWAIT.

~Andrew

RE: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC sub-cmd

2025-07-04 Thread Penny, Zheng

[Public]

> -Original Message-
> From: Jan Beulich 
> Sent: Friday, July 4, 2025 4:33 PM
> To: Penny, Zheng ; Andryuk, Jason
> 
> Cc: Huang, Ray ; Anthony PERARD
> ; Juergen Gross ; Andrew
> Cooper ; Orzel, Michal ;
> Julien Grall ; Roger Pau Monné ; Stefano
> Stabellini ; xen-devel@lists.xenproject.org
> Subject: Re: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC sub-
> cmd
>
> On 04.07.2025 10:13, Penny, Zheng wrote:
> > [Public]
> >
> >> -Original Message-
> >> From: Jan Beulich 
> >> Sent: Tuesday, June 17, 2025 6:08 PM
> >> To: Penny, Zheng 
> >> Cc: Huang, Ray ; Anthony PERARD
> >> ; Juergen Gross ; Andrew
> >> Cooper ; Orzel, Michal
> >> ; Julien Grall ; Roger Pau
> >> Monné ; Stefano Stabellini
> >> ; xen-devel@lists.xenproject.org
> >> Subject: Re: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC
> >> sub- cmd
> >>
> >> On 27.05.2025 10:48, Penny Zheng wrote:
> >>> --- a/tools/misc/xenpm.c
> >>> +++ b/tools/misc/xenpm.c
> >>> @@ -69,6 +69,7 @@ void show_help(void)
> >>>  " set-max-cstate|'unlimited' 
> >>> [|'unlimited']\n"
> >>>  " set the C-State 
> >>> limitation ( >= 0)
> and\n"
> >>>  " optionally the 
> >>> C-sub-state limitation
> >> ( >= 0)\n"
> >>> +" get-cpufreq-cppc  [cpuid]   list cpu cppc 
> >>> parameter of CPU
> >>  or all\n"
> >>>  " set-cpufreq-cppc  [cpuid] 
> >>> [balance|performance|powersave]
> >> *\n"
> >>>  " set Hardware P-State 
> >>> (HWP) parameters\n"
> >>>  " on CPU  or all 
> >>> if omitted.\n"
> >>> @@ -812,33 +813,7 @@ static void print_cpufreq_para(int cpuid,
> >>> struct xc_get_cpufreq_para *p_cpufreq)
> >>>
> >>>  printf("scaling_driver   : %s\n", p_cpufreq->scaling_driver);
> >>>
> >>> -if ( hwp )
> >>> -{
> >>> -const xc_cppc_para_t *cppc = &p_cpufreq->u.cppc_para;
> >>> -
> >>> -printf("cppc variables   :\n");
> >>> -printf("  hardware limits: lowest [%"PRIu32"] lowest 
> >>> nonlinear
> >> [%"PRIu32"]\n",
> >>> -   cppc->lowest, cppc->lowest_nonlinear);
> >>> -printf(" : nominal [%"PRIu32"] highest 
> >>> [%"PRIu32"]\n",
> >>> -   cppc->nominal, cppc->highest);
> >>> -printf("  configured limits  : min [%"PRIu32"] max [%"PRIu32"] 
> >>> energy
> perf
> >> [%"PRIu32"]\n",
> >>> -   cppc->minimum, cppc->maximum, cppc->energy_perf);
> >>> -
> >>> -if ( cppc->features & XEN_SYSCTL_CPPC_FEAT_ACT_WINDOW )
> >>> -{
> >>> -unsigned int activity_window;
> >>> -const char *units;
> >>> -
> >>> -activity_window = calculate_activity_window(cppc, &units);
> >>> -printf(" : activity_window [%"PRIu32" 
> >>> %s]\n",
> >>> -   activity_window, units);
> >>> -}
> >>> -
> >>> -printf(" : desired [%"PRIu32"%s]\n",
> >>> -   cppc->desired,
> >>> -   cppc->desired ? "" : " hw autonomous");
> >>> -}
> >>> -else
> >>> +if ( !hwp )
> >>>  {
> >>>  if ( p_cpufreq->gov_num )
> >>>  printf("scaling_avail_gov: %s\n",
> >>
> >> I'm not sure it is a good idea to alter what is being output for 
> >> get-cpufreq-para.
> >> People may simply miss that output then, without having any
> >> indication where it went.
> >
> > Hwp is more like amd-cppc in active mode. It also does not rely on Xen
> > governor to do performance tuning, so in previous design, we could borrow
> governor filed to output cppc info However after introducing amd-cppc passive
> mode, we have request to output both governor and CPPC info. And if continuing
> expanding get-cpufreq-para to include CPPC info, it will make the parent stuct
> xen_sysctl.u exceed exceed 128 bytes.
>
> How is the xenpm command "get-cpufreq-para" related to the sysctl interface 
> struct
> size? If you need to invoke two sysctl sub-ops to produce all relevant 
> "get-cpufreq-
> para" output, so be it I would say.
>

Because we are limiting each sysctl-subcmd-struct, such as " struct 
xen_sysctl_pm_op ", 128 bytes in "struct xen_sysctl"，They are all combined as a 
union.
Also, in "struct xen_sysctl_pm_op", its descending sub-op structs, including 
"struct xen_get_cpufreq_para", are all combined as union too
```
struct xen_sysctl_pm_op {
... ...
union {
struct xen_get_cpufreq_para get_para;
struct xen_set_cpufreq_gov  set_gov;
struct xen_set_cpufreq_para set_para;
struct xen_set_cppc_paraset_cppc;
uint64_aligned_t get_avgfreq;
uint32_tset_sched_opt_smt;
#define XEN_SYSCTL_CX_UNLIMITED 0xU
uint32_tget_max_cstate;
uint32_t

[PATCH] xen/arm32: Tidy up setup_mm()

2025-07-04 Thread Michal Orzel

The current look and feel of setup_mm() leaves a lot to be desired. The
scope of variables is not the best, many variables are not really needed
while some others are set but not used. The first iteration of membanks
is split from the loop for no reason. Tidy up this function for better
readability.

No functional change.

Signed-off-by: Michal Orzel 
---
 xen/arch/arm/arm32/mmu/mm.c | 28 +---
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/xen/arch/arm/arm32/mmu/mm.c b/xen/arch/arm/arm32/mmu/mm.c
index e6d9b49acd3c..5e4766ddcf65 100644
--- a/xen/arch/arm/arm32/mmu/mm.c
+++ b/xen/arch/arm/arm32/mmu/mm.c
@@ -74,8 +74,7 @@ static paddr_t __init fit_xenheap_in_static_heap(uint32_t 
size, paddr_t align)
 void __init setup_mm(void)
 {
 const struct membanks *mem = bootinfo_get_mem();
-paddr_t ram_start, ram_end, ram_size, e, bank_start, bank_end, bank_size;
-paddr_t static_heap_end = 0, static_heap_size = 0;
+paddr_t ram_start = INVALID_PADDR, ram_end = 0, ram_size = 0, e;
 unsigned long heap_pages, xenheap_pages, domheap_pages;
 unsigned int i;
 const uint32_t ctr = READ_CP32(CTR);
@@ -89,19 +88,14 @@ void __init setup_mm(void)
 
 init_pdx();
 
-ram_start = mem->bank[0].start;
-ram_size  = mem->bank[0].size;
-ram_end   = ram_start + ram_size;
-
-for ( i = 1; i < mem->nr_banks; i++ )
+for ( i = 0; i < mem->nr_banks; i++ )
 {
-bank_start = mem->bank[i].start;
-bank_size = mem->bank[i].size;
-bank_end = bank_start + bank_size;
+const struct membank *bank = &mem->bank[i];
+paddr_t bank_end = bank->start + bank->size;
 
-ram_size  = ram_size + bank_size;
-ram_start = min(ram_start,bank_start);
-ram_end   = max(ram_end,bank_end);
+ram_size = ram_size + bank->size;
+ram_start = min(ram_start, bank->start);
+ram_end = max(ram_end, bank_end);
 }
 
 total_pages = ram_size >> PAGE_SHIFT;
@@ -109,18 +103,14 @@ void __init setup_mm(void)
 if ( using_static_heap )
 {
 const struct membanks *reserved_mem = bootinfo_get_reserved_mem();
+paddr_t static_heap_size = 0;
 
 for ( i = 0 ; i < reserved_mem->nr_banks; i++ )
 {
 if ( reserved_mem->bank[i].type != MEMBANK_STATIC_HEAP )
 continue;
 
-bank_start = reserved_mem->bank[i].start;
-bank_size = reserved_mem->bank[i].size;
-bank_end = bank_start + bank_size;
-
-static_heap_size += bank_size;
-static_heap_end = max(static_heap_end, bank_end);
+static_heap_size += reserved_mem->bank[i].size;
 }
 
 heap_pages = static_heap_size >> PAGE_SHIFT;
-- 
2.25.1

Re: [PATCH v6 00/18] xen: introduce CONFIG_SYSCTL

2025-07-04 Thread Jan Beulich

On 04.07.2025 11:29, Penny Zheng wrote:
> It can be beneficial for some dom0less systems to further reduce Xen footprint
> via disabling some hypercalls handling code, which may not to be used &
> required in such systems. Each hypercall has a separate option to keep
> configuration flexible.
> 
> Options to disable hypercalls:
> - sysctl
> - domctl
> - hvm
> - physdev
> - platform
> 
> This patch serie is only focusing on introducing CONFIG_SYSCTL. Different
> options will be covered in different patch serie.
> 
> Features, like LIVEPATCH, Overlay DTB, which fully rely on sysctl op, will
> be wrapped with CONFIG_SYSCTL, to reduce Xen footprint as much as possible.
> 
> It is derived from Stefano Stabellini's commit "xen: introduce kconfig 
> options to
> disable hypercalls"(
> https://lore.kernel.org/xen-devel/20241219092917.3006174-1-sergiy_kib...@epam.com)
> 
> ---
> Commit "xen/x86: remove "depends on !PV_SHIM_EXCLUSIVE"" and commit "
> xen/sysctl: wrap around sysctl hypercall" shall be commited together.
> ---
> Penny Zheng (16):
>   xen/xsm: wrap around xsm_sysctl with CONFIG_SYSCTL
>   xen/sysctl: wrap around XEN_SYSCTL_readconsole
>   xen/sysctl: make CONFIG_TRACEBUFFER depend on CONFIG_SYSCTL
>   xen/sysctl: wrap around XEN_SYSCTL_sched_id
>   xen/sysctl: wrap around XEN_SYSCTL_perfc_op
>   xen/sysctl: wrap around XEN_SYSCTL_lockprof_op
>   xen/pmstat: introduce CONFIG_PM_OP
>   xen/sysctl: introduce CONFIG_PM_STATS
>   xen/sysctl: wrap around XEN_SYSCTL_page_offline_op
>   xen/sysctl: wrap around XEN_SYSCTL_cpupool_op
>   xen/sysctl: wrap around XEN_SYSCTL_scheduler_op
>   xen/sysctl: wrap around XEN_SYSCTL_physinfo
>   xen/sysctl: make CONFIG_COVERAGE depend on CONFIG_SYSCTL
>   xen/sysctl: make CONFIG_LIVEPATCH depend on CONFIG_SYSCTL
>   xen/sysctl: wrap around arch-specific arch_do_sysctl
>   xen/x86: remove "depends on !PV_SHIM_EXCLUSIVE"
> 
> Stefano Stabellini (2):
>   xen: introduce CONFIG_SYSCTL
>   xen/sysctl: wrap around sysctl hypercall

This doesn't look to be based on latest staging, where some of the changes above
are already present. I specifically tried to get some of the stuff in so that
the next re-posting wouldn't be as large.

Jan

[PATCH v6 16/18] xen/sysctl: wrap around arch-specific arch_do_sysctl

2025-07-04 Thread Penny Zheng

Function arch_do_sysctl is to perform arch-specific sysctl op.
Some functions, like psr_get_info() for x86, DTB overlay support for arm,
are solely available through sysctl op, then they all shall be wrapped
with CONFIG_SYSCTL

Also, remove all #ifdef CONFIG_SYSCTL-s in arch-specific sysctl.c, as
we put the guardian in Makefile for the whole file.
Since PV_SHIM_EXCLUSIVE needs sorting in the future, we move
obj-$(CONFIG_SYSCTL) += sysctl.o out of PV_SHIM_EXCLUSIVE condition.

Signed-off-by: Stefano Stabellini 
Signed-off-by: Sergiy Kibrik 
Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
Acked-by: Jan Beulich 
---
v1 -> v2:
- use "depends on" for config OVERLAY_DTB
- no need to wrap declaration
- add transient #ifdef in sysctl.c for correct compilation
---
v2 -> v3:
- move obj-$(CONFIG_SYSCTL) += sysctl.o out of PV_SHIM_EXCLUSIVE condition
- move copyback out of #ifdef
- add #else process for default label
---
v3 -> v4:
- remove transient "#ifdef CONFIG_SYSCTL"
- move #ifdef ahead of the comment
---
 xen/arch/arm/Kconfig   |  1 +
 xen/arch/arm/Makefile  |  2 +-
 xen/arch/arm/sysctl.c  |  2 --
 xen/arch/riscv/stubs.c |  2 +-
 xen/arch/x86/Makefile  |  2 +-
 xen/arch/x86/psr.c | 18 ++
 xen/arch/x86/sysctl.c  |  2 --
 7 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/xen/arch/arm/Kconfig b/xen/arch/arm/Kconfig
index 3f25da3ca5..e076419d5e 100644
--- a/xen/arch/arm/Kconfig
+++ b/xen/arch/arm/Kconfig
@@ -139,6 +139,7 @@ config HAS_ITS
 
 config OVERLAY_DTB
bool "DTB overlay support (UNSUPPORTED)" if UNSUPPORTED
+   depends on SYSCTL
help
  Dynamic addition/removal of Xen device tree nodes using a dtbo.
 
diff --git a/xen/arch/arm/Makefile b/xen/arch/arm/Makefile
index ab0a0c2be6..f833cdf207 100644
--- a/xen/arch/arm/Makefile
+++ b/xen/arch/arm/Makefile
@@ -51,7 +51,7 @@ obj-y += setup.o
 obj-y += shutdown.o
 obj-y += smp.o
 obj-y += smpboot.o
-obj-y += sysctl.o
+obj-$(CONFIG_SYSCTL) += sysctl.o
 obj-y += time.o
 obj-y += traps.o
 obj-y += vcpreg.o
diff --git a/xen/arch/arm/sysctl.c b/xen/arch/arm/sysctl.c
index 2d350b700a..32cab4feff 100644
--- a/xen/arch/arm/sysctl.c
+++ b/xen/arch/arm/sysctl.c
@@ -15,7 +15,6 @@
 #include 
 #include 
 
-#ifdef CONFIG_SYSCTL
 void arch_do_physinfo(struct xen_sysctl_physinfo *pi)
 {
 pi->capabilities |= XEN_SYSCTL_PHYSCAP_hvm | XEN_SYSCTL_PHYSCAP_hap;
@@ -23,7 +22,6 @@ void arch_do_physinfo(struct xen_sysctl_physinfo *pi)
 pi->arch_capabilities |= MASK_INSR(sve_encode_vl(get_sys_vl_len()),
XEN_SYSCTL_PHYSCAP_ARM_SVE_MASK);
 }
-#endif
 
 long arch_do_sysctl(struct xen_sysctl *sysctl,
 XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl)
diff --git a/xen/arch/riscv/stubs.c b/xen/arch/riscv/stubs.c
index f86a1c17cb..8918cebf35 100644
--- a/xen/arch/riscv/stubs.c
+++ b/xen/arch/riscv/stubs.c
@@ -302,6 +302,7 @@ unsigned long raw_copy_from_guest(void *to, const void 
__user *from,
 BUG_ON("unimplemented");
 }
 
+#ifdef CONFIG_SYSCTL
 /* sysctl.c */
 
 long arch_do_sysctl(struct xen_sysctl *sysctl,
@@ -310,7 +311,6 @@ long arch_do_sysctl(struct xen_sysctl *sysctl,
 BUG_ON("unimplemented");
 }
 
-#ifdef CONFIG_SYSCTL
 void arch_do_physinfo(struct xen_sysctl_physinfo *pi)
 {
 BUG_ON("unimplemented");
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index ce724a9daa..96d63219e7 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -63,6 +63,7 @@ obj-y += smpboot.o
 obj-y += spec_ctrl.o
 obj-y += srat.o
 obj-y += string.o
+obj-$(CONFIG_SYSCTL) += sysctl.o
 obj-y += time.o
 obj-y += traps-setup.o
 obj-y += traps.o
@@ -78,7 +79,6 @@ ifneq ($(CONFIG_PV_SHIM_EXCLUSIVE),y)
 obj-y += domctl.o
 obj-y += platform_hypercall.o
 obj-$(CONFIG_COMPAT) += x86_64/platform_hypercall.o
-obj-y += sysctl.o
 endif
 
 extra-y += asm-macros.i
diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c
index 5815a35335..499d320e61 100644
--- a/xen/arch/x86/psr.c
+++ b/xen/arch/x86/psr.c
@@ -133,9 +133,11 @@ static const struct feat_props {
  */
 enum psr_type alt_type;
 
+#ifdef CONFIG_SYSCTL
 /* get_feat_info is used to return feature HW info through sysctl. */
 bool (*get_feat_info)(const struct feat_node *feat,
   uint32_t data[], unsigned int array_len);
+#endif
 
 /* write_msr is used to write out feature MSR register. */
 void (*write_msr)(unsigned int cos, uint32_t val, enum psr_type type);
@@ -418,6 +420,7 @@ static bool mba_init_feature(const struct cpuid_leaf *regs,
 return true;
 }
 
+#ifdef CONFIG_SYSCTL
 static bool cf_check cat_get_feat_info(
 const struct feat_node *feat, uint32_t data[], unsigned int array_len)
 {
@@ -430,6 +433,7 @@ static bool cf_check cat_get_feat_info(
 
 return true;
 }
+#endif /* CONFIG_SYSCTL */
 
 /* L3 CAT props */
 static void cf_check l3_cat_write_msr(
@@ -442,11 +446,14 @@ static const struct feat_props l3_cat_props = {
 .cos_num = 1,
 .typ

[PATCH v6 12/18] xen/sysctl: wrap around XEN_SYSCTL_scheduler_op

2025-07-04 Thread Penny Zheng

Function sched_adjust_global is designed for XEN_SYSCTL_scheduler_op, so
itself and its calling flow, like .adjust_global, shall all be wrapped.

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
Acked-by: Stewart Hildebrand  #a653
---
v1 -> v2:
- no need to wrap declarations
- add transient #ifdef in sysctl.c for correct compilation
---
v2 -> v3:
- move #endif up ahead of the blank line
---
v3 -> v4:
- remove transient "#ifdef CONFIG_SYSCTL"
---
 xen/common/sched/arinc653.c | 6 ++
 xen/common/sched/core.c | 2 ++
 xen/common/sched/credit.c   | 4 
 xen/common/sched/credit2.c  | 4 
 xen/common/sched/private.h  | 4 
 xen/include/xsm/xsm.h   | 4 
 xen/xsm/dummy.c | 2 ++
 xen/xsm/flask/hooks.c   | 4 
 8 files changed, 30 insertions(+)

diff --git a/xen/common/sched/arinc653.c b/xen/common/sched/arinc653.c
index 432ccfe662..3c014c9934 100644
--- a/xen/common/sched/arinc653.c
+++ b/xen/common/sched/arinc653.c
@@ -220,6 +220,7 @@ static void update_schedule_units(const struct scheduler 
*ops)
   SCHED_PRIV(ops)->schedule[i].unit_id);
 }
 
+#ifdef CONFIG_SYSCTL
 /**
  * This function is called by the adjust_global scheduler hook to put
  * in place a new ARINC653 schedule.
@@ -334,6 +335,7 @@ arinc653_sched_get(
 
 return 0;
 }
+#endif /* CONFIG_SYSCTL */
 
 /**
  * Scheduler callback functions   *
@@ -653,6 +655,7 @@ a653_switch_sched(struct scheduler *new_ops, unsigned int 
cpu,
 return &sr->_lock;
 }
 
+#ifdef CONFIG_SYSCTL
 /**
  * Xen scheduler callback function to perform a global (not domain-specific)
  * adjustment. It is used by the ARINC 653 scheduler to put in place a new
@@ -692,6 +695,7 @@ a653sched_adjust_global(const struct scheduler *ops,
 
 return rc;
 }
+#endif /* CONFIG_SYSCTL */
 
 /**
  * This structure defines our scheduler for Xen.
@@ -726,7 +730,9 @@ static const struct scheduler sched_arinc653_def = {
 .switch_sched   = a653_switch_sched,
 
 .adjust = NULL,
+#ifdef CONFIG_SYSCTL
 .adjust_global  = a653sched_adjust_global,
+#endif
 
 .dump_settings  = NULL,
 .dump_cpu_state = NULL,
diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index 13fdf57e57..ea95dea65a 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -2112,6 +2112,7 @@ long sched_adjust(struct domain *d, struct 
xen_domctl_scheduler_op *op)
 return ret;
 }
 
+#ifdef CONFIG_SYSCTL
 long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
 {
 struct cpupool *pool;
@@ -2140,6 +2141,7 @@ long sched_adjust_global(struct xen_sysctl_scheduler_op 
*op)
 
 return rc;
 }
+#endif /* CONFIG_SYSCTL */
 
 static void vcpu_periodic_timer_work_locked(struct vcpu *v)
 {
diff --git a/xen/common/sched/credit.c b/xen/common/sched/credit.c
index a6bb321e7d..6dcf6b2c8b 100644
--- a/xen/common/sched/credit.c
+++ b/xen/common/sched/credit.c
@@ -1256,6 +1256,7 @@ __csched_set_tslice(struct csched_private *prv, unsigned 
int timeslice_ms)
 prv->credit = prv->credits_per_tslice * prv->ncpus;
 }
 
+#ifdef CONFIG_SYSCTL
 static int cf_check
 csched_sys_cntl(const struct scheduler *ops,
 struct xen_sysctl_scheduler_op *sc)
@@ -1298,6 +1299,7 @@ csched_sys_cntl(const struct scheduler *ops,
 out:
 return rc;
 }
+#endif /* CONFIG_SYSCTL */
 
 static void *cf_check
 csched_alloc_domdata(const struct scheduler *ops, struct domain *dom)
@@ -2288,7 +2290,9 @@ static const struct scheduler sched_credit_def = {
 
 .adjust = csched_dom_cntl,
 .adjust_affinity= csched_aff_cntl,
+#ifdef CONFIG_SYSCTL
 .adjust_global  = csched_sys_cntl,
+#endif
 
 .pick_resource  = csched_res_pick,
 .do_schedule= csched_schedule,
diff --git a/xen/common/sched/credit2.c b/xen/common/sched/credit2.c
index 0a83f23725..0b3b61df57 100644
--- a/xen/common/sched/credit2.c
+++ b/xen/common/sched/credit2.c
@@ -3131,6 +3131,7 @@ csched2_aff_cntl(const struct scheduler *ops, struct 
sched_unit *unit,
 __clear_bit(__CSFLAG_pinned, &svc->flags);
 }
 
+#ifdef CONFIG_SYSCTL
 static int cf_check csched2_sys_cntl(
 const struct scheduler *ops, struct xen_sysctl_scheduler_op *sc)
 {
@@ -3162,6 +3163,7 @@ static int cf_check csched2_sys_cntl(
 
 return 0;
 }
+#endif /* CONFIG_SYSCTL */
 
 static void *cf_check
 csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom)
@@ -4232,7 +4234,9 @@ static const struct scheduler sched_credit2_def = {
 
 .adjust = csched2_dom_cntl,
 .adjust_affinity= csched2_aff_cntl,
+#ifdef CONFIG_SYSCTL
 .adjust_global  = csched2_sys_cntl,
+#endif
 
 .pick_resource  = csched2_res_pick,
 .migrate= csched2_unit_migrate,
diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h
index c0e7c96d24..d6884550cd 100644
--- a/xen/common/sched/private.h
+++ b/xen/common/sched/priv

[PATCH v6 15/18] xen/sysctl: make CONFIG_LIVEPATCH depend on CONFIG_SYSCTL

2025-07-04 Thread Penny Zheng

LIVEPATCH mechanism relies on LIVEPATCH_SYSCTL hypercall, so CONFIG_LIVEPATCH
shall depend on CONFIG_SYSCTL

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
v1 -> v2:
- commit message refactor
---
 xen/common/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index 998e672fa7..65f07289dd 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -472,7 +472,7 @@ config CRYPTO
 config LIVEPATCH
bool "Live patching support"
default X86
-   depends on "$(XEN_HAS_BUILD_ID)" = "y"
+   depends on "$(XEN_HAS_BUILD_ID)" = "y" && SYSCTL
select CC_SPLIT_SECTIONS
help
  Allows a running Xen hypervisor to be dynamically patched using
-- 
2.34.1

[PATCH v6 13/18] xen/sysctl: wrap around XEN_SYSCTL_physinfo

2025-07-04 Thread Penny Zheng

The following functions are only used to deal with XEN_SYSCTL_physinfo,
then they shall be wrapped:
- arch_do_physinfo()
- get_outstanding_claims()

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
Acked-by: Jan Beulich 
---
v1 -> v2:
- no need to wrap declaration
- add transient #ifdef in sysctl.c for correct compilation
---
v2 -> v3:
- move #endif up ahead of the blank line
---
v3 -> v4:
- remove transient "#ifdef CONFIG_SYSCTL"
---
 xen/arch/arm/sysctl.c   | 2 ++
 xen/arch/riscv/stubs.c  | 2 ++
 xen/arch/x86/sysctl.c   | 2 ++
 xen/common/page_alloc.c | 2 ++
 4 files changed, 8 insertions(+)

diff --git a/xen/arch/arm/sysctl.c b/xen/arch/arm/sysctl.c
index 32cab4feff..2d350b700a 100644
--- a/xen/arch/arm/sysctl.c
+++ b/xen/arch/arm/sysctl.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 
+#ifdef CONFIG_SYSCTL
 void arch_do_physinfo(struct xen_sysctl_physinfo *pi)
 {
 pi->capabilities |= XEN_SYSCTL_PHYSCAP_hvm | XEN_SYSCTL_PHYSCAP_hap;
@@ -22,6 +23,7 @@ void arch_do_physinfo(struct xen_sysctl_physinfo *pi)
 pi->arch_capabilities |= MASK_INSR(sve_encode_vl(get_sys_vl_len()),
XEN_SYSCTL_PHYSCAP_ARM_SVE_MASK);
 }
+#endif
 
 long arch_do_sysctl(struct xen_sysctl *sysctl,
 XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl)
diff --git a/xen/arch/riscv/stubs.c b/xen/arch/riscv/stubs.c
index e396b67cd3..f86a1c17cb 100644
--- a/xen/arch/riscv/stubs.c
+++ b/xen/arch/riscv/stubs.c
@@ -310,10 +310,12 @@ long arch_do_sysctl(struct xen_sysctl *sysctl,
 BUG_ON("unimplemented");
 }
 
+#ifdef CONFIG_SYSCTL
 void arch_do_physinfo(struct xen_sysctl_physinfo *pi)
 {
 BUG_ON("unimplemented");
 }
+#endif /* CONFIG_SYSCTL */
 
 /* p2m.c */
 
diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
index 1b04947516..f64addbe2b 100644
--- a/xen/arch/x86/sysctl.c
+++ b/xen/arch/x86/sysctl.c
@@ -91,6 +91,7 @@ static long cf_check smt_up_down_helper(void *data)
 return ret;
 }
 
+#ifdef CONFIG_SYSCTL
 void arch_do_physinfo(struct xen_sysctl_physinfo *pi)
 {
 memcpy(pi->hw_cap, boot_cpu_data.x86_capability,
@@ -104,6 +105,7 @@ void arch_do_physinfo(struct xen_sysctl_physinfo *pi)
 if ( IS_ENABLED(CONFIG_SHADOW_PAGING) )
 pi->capabilities |= XEN_SYSCTL_PHYSCAP_shadow;
 }
+#endif /* CONFIG_SYSCTL */
 
 long arch_do_sysctl(
 struct xen_sysctl *sysctl, XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl)
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index dec4cb2ab1..8177d12f50 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -601,6 +601,7 @@ out:
 return ret;
 }
 
+#ifdef CONFIG_SYSCTL
 void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages)
 {
 spin_lock(&heap_lock);
@@ -608,6 +609,7 @@ void get_outstanding_claims(uint64_t *free_pages, uint64_t 
*outstanding_pages)
 *free_pages = avail_heap_pages(MEMZONE_XEN + 1, NR_ZONES - 1, -1);
 spin_unlock(&heap_lock);
 }
+#endif /* CONFIG_SYSCTL */
 
 static bool __read_mostly first_node_initialised;
 #ifndef CONFIG_SEPARATE_XENHEAP
-- 
2.34.1

[PATCH v6 14/18] xen/sysctl: make CONFIG_COVERAGE depend on CONFIG_SYSCTL

2025-07-04 Thread Penny Zheng

Users rely on SYSCTL_coverage_op hypercall to interact with the coverage data,
that is, according operations shall be wrapped around with CONFIG_SYSCTL.
Right now, it is compiled under CONFIG_COVERAGE, so we shall make
CONFIG_COVERAGE depend on CONFIG_SYSCTL.

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
v1 -> v2:
- commit message refactor
---
v3 -> v4:
- commit message refactor
---
v4 -> v5:
- commit message refactor
- switch the dependency order of "!LIVEPATCH" and "SYSCTL"
---
 xen/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xen/Kconfig.debug b/xen/Kconfig.debug
index d14093017e..38a134fa3b 100644
--- a/xen/Kconfig.debug
+++ b/xen/Kconfig.debug
@@ -37,7 +37,7 @@ config SELF_TESTS
 
 config COVERAGE
bool "Code coverage support"
-   depends on !LIVEPATCH
+   depends on SYSCTL && !LIVEPATCH
select SUPPRESS_DUPLICATE_SYMBOL_WARNINGS if !ENFORCE_UNIQUE_SYMBOLS
help
  Enable code coverage support.
-- 
2.34.1

[PATCH v6 09/18] xen/sysctl: introduce CONFIG_PM_STATS

2025-07-04 Thread Penny Zheng

We introduce a new Kconfig CONFIG_PM_STATS for wrapping all operations
regarding performance management statistics.
The major codes reside in xen/drivers/acpi/pmstat.c, including the
pm-statistic-related sysctl op: do_get_pm_info().
CONFIG_PM_STATS also shall depend on CONFIG_SYSCTL

We shall also provide "# CONFIG_PM_STATS is not set" in preset configs for
PV shim on x86.

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
Acked-by: Jan Beulich 
---
v1 -> v2:
- rename to CONFIG_PM_STATS
- fix indention and stray semicolon
- make code movements into a new commit
- No need to wrap inline functions and declarations
---
v2 -> v3:
- sepearte functions related to do_pm_op() into a new commit
- both braces shall be moved to the line with the closing parenthesis
---
v3 -> v4:
- be consistent with the comment on the #endif
---
v4 -> v5
- add blank line before endmenu
---
v5 -> v6
- add "# CONFIG_PM_STATS is not set" in preset configs for PV shim on x86
---
 xen/arch/x86/acpi/cpu_idle.c  |  2 ++
 xen/arch/x86/configs/pvshim_defconfig |  1 +
 xen/common/Kconfig|  8 
 xen/common/sysctl.c   |  2 +-
 xen/drivers/acpi/Makefile |  2 +-
 xen/include/acpi/cpufreq/processor_perf.h | 10 ++
 6 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 6c3a10e6fb..3a5524691b 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -1490,6 +1490,7 @@ static void amd_cpuidle_init(struct acpi_processor_power 
*power)
 vendor_override = -1;
 }
 
+#ifdef CONFIG_PM_STATS
 uint32_t pmstat_get_cx_nr(unsigned int cpu)
 {
 return processor_powers[cpu] ? processor_powers[cpu]->count : 0;
@@ -1609,6 +1610,7 @@ int pmstat_reset_cx_stat(unsigned int cpu)
 {
 return 0;
 }
+#endif /* CONFIG_PM_STATS */
 
 void cpuidle_disable_deep_cstate(void)
 {
diff --git a/xen/arch/x86/configs/pvshim_defconfig 
b/xen/arch/x86/configs/pvshim_defconfig
index bacd04c963..9dc91c33e3 100644
--- a/xen/arch/x86/configs/pvshim_defconfig
+++ b/xen/arch/x86/configs/pvshim_defconfig
@@ -27,3 +27,4 @@ CONFIG_EXPERT=y
 # CONFIG_DEBUG is not set
 # CONFIG_GDBSX is not set
 # CONFIG_PM_OP is not set
+# CONFIG_PM_STATS is not set
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index 943cf0a950..998e672fa7 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -628,4 +628,12 @@ config PM_OP
  This option shall enable userspace performance management control
  to do power/performance analyzing and tuning.
 
+config PM_STATS
+   bool "Enable Performance Management Statistics"
+   depends on ACPI && HAS_CPUFREQ && SYSCTL
+   default y
+   help
+ Enable collection of performance management statistics to aid in
+ analyzing and tuning power/performance characteristics of the system
+
 endmenu
diff --git a/xen/common/sysctl.c b/xen/common/sysctl.c
index daf57fbe56..5207664252 100644
--- a/xen/common/sysctl.c
+++ b/xen/common/sysctl.c
@@ -170,7 +170,7 @@ long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) 
u_sysctl)
 op->u.availheap.avail_bytes <<= PAGE_SHIFT;
 break;
 
-#if defined (CONFIG_ACPI) && defined (CONFIG_HAS_CPUFREQ)
+#ifdef CONFIG_PM_STATS
 case XEN_SYSCTL_get_pmstat:
 ret = do_get_pm_info(&op->u.get_pmstat);
 break;
diff --git a/xen/drivers/acpi/Makefile b/xen/drivers/acpi/Makefile
index 1d811a51a7..477408afbe 100644
--- a/xen/drivers/acpi/Makefile
+++ b/xen/drivers/acpi/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_X86) += apei/
 obj-bin-y += tables.init.o
 obj-$(CONFIG_ACPI_NUMA) += numa.o
 obj-y += osl.o
-obj-$(CONFIG_HAS_CPUFREQ) += pmstat.o
+obj-$(CONFIG_PM_STATS) += pmstat.o
 obj-$(CONFIG_PM_OP) += pm-op.o
 
 obj-$(CONFIG_X86) += hwregs.o
diff --git a/xen/include/acpi/cpufreq/processor_perf.h 
b/xen/include/acpi/cpufreq/processor_perf.h
index fa28b14faf..caa768626c 100644
--- a/xen/include/acpi/cpufreq/processor_perf.h
+++ b/xen/include/acpi/cpufreq/processor_perf.h
@@ -9,9 +9,19 @@
 
 unsigned int powernow_register_driver(void);
 unsigned int get_measured_perf(unsigned int cpu, unsigned int flag);
+#ifdef CONFIG_PM_STATS
 void cpufreq_statistic_update(unsigned int cpu, uint8_t from, uint8_t to);
 int  cpufreq_statistic_init(unsigned int cpu);
 void cpufreq_statistic_exit(unsigned int cpu);
+#else
+static inline void cpufreq_statistic_update(unsigned int cpu, uint8_t from,
+uint8_t to) {}
+static inline int cpufreq_statistic_init(unsigned int cpu)
+{
+return 0;
+}
+static inline void cpufreq_statistic_exit(unsigned int cpu) {}
+#endif /* CONFIG_PM_STATS */
 
 int  cpufreq_limit_change(unsigned int cpu);
 
-- 
2.34.1

[PATCH v6 08/18] xen/pmstat: introduce CONFIG_PM_OP

2025-07-04 Thread Penny Zheng

We move the following functions into a new file drivers/acpi/pm-op.c, as
they are all more fitting in performance controling and only called by
do_pm_op():
 - get_cpufreq_para()
 - set_cpufreq_para()
 - set_cpufreq_gov()
 - set_cpufreq_cppc()
 - cpufreq_driver_getavg()
 - cpufreq_update_turbo()
 - cpufreq_get_turbo_status()
We introduce a new Kconfig CONFIG_PM_OP to wrap the new file.

Also, although the following helpers are only called by do_pm_op(), they have
dependency on local variable, we wrap them with CONFIG_PM_OP in place:
 - write_userspace_scaling_setspeed()
 - write_ondemand_sampling_rate()
 - write_ondemand_up_threshold()
 - get_cpufreq_ondemand_para()
 - cpufreq_driver.update()
 - get_hwp_para()
Various style corrections shall be applied at the same time while moving these
functions, including:
 - add extra space before and after bracket of if() and switch()
 - fix indentation
 - drop all the unnecessary inner figure braces

We shall also provide "# CONFIG_PM_OP is not set" in preset configs for
PV shim on x86.

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
Acked-by: Jan Beulich 
---
v2 -> v3
- new commit
---
v3 -> v4:
- rename the file to pm-op.c
- drop all the unnecessary inner figure braces
- be consistent with the comment on the #endif
---
v4 -> v5:
- add blank line before endmenu
---
v5 -> v6:
- rebase changes from "xen/cpufreq: normalize hwp driver check with 
hwp_active()"
and "xen/cpufreq: move "init" flag into common structure"
- add "# CONFIG_PM_OP is not set" in preset configs for PV shim on x86
---
 xen/arch/x86/acpi/cpufreq/hwp.c  |   6 +
 xen/arch/x86/acpi/cpufreq/powernow.c |   4 +
 xen/arch/x86/configs/pvshim_defconfig|   1 +
 xen/common/Kconfig   |   8 +
 xen/common/sysctl.c  |   2 +
 xen/drivers/acpi/Makefile|   1 +
 xen/drivers/acpi/pm-op.c | 395 +++
 xen/drivers/acpi/pmstat.c| 355 -
 xen/drivers/cpufreq/cpufreq_misc_governors.c |   2 +
 xen/drivers/cpufreq/cpufreq_ondemand.c   |   2 +
 xen/drivers/cpufreq/utility.c|  41 --
 xen/include/acpi/cpufreq/cpufreq.h   |   3 -
 12 files changed, 421 insertions(+), 399 deletions(-)
 create mode 100644 xen/drivers/acpi/pm-op.c

diff --git a/xen/arch/x86/acpi/cpufreq/hwp.c b/xen/arch/x86/acpi/cpufreq/hwp.c
index d5fa3d47ca..e4c09244ab 100644
--- a/xen/arch/x86/acpi/cpufreq/hwp.c
+++ b/xen/arch/x86/acpi/cpufreq/hwp.c
@@ -466,6 +466,7 @@ static int cf_check hwp_cpufreq_cpu_exit(struct 
cpufreq_policy *policy)
 return 0;
 }
 
+#ifdef CONFIG_PM_OP
 /*
  * The SDM reads like turbo should be disabled with MSR_IA32_PERF_CTL and
  * PERF_CTL_TURBO_DISENGAGE, but that does not seem to actually work, at least
@@ -508,6 +509,7 @@ static int cf_check hwp_cpufreq_update(unsigned int cpu, 
struct cpufreq_policy *
 
 return per_cpu(hwp_drv_data, cpu)->ret;
 }
+#endif /* CONFIG_PM_OP */
 
 static const struct cpufreq_driver __initconst_cf_clobber
 hwp_cpufreq_driver = {
@@ -516,9 +518,12 @@ hwp_cpufreq_driver = {
 .target = hwp_cpufreq_target,
 .init   = hwp_cpufreq_cpu_init,
 .exit   = hwp_cpufreq_cpu_exit,
+#ifdef CONFIG_PM_OP
 .update = hwp_cpufreq_update,
+#endif
 };
 
+#ifdef CONFIG_PM_OP
 int get_hwp_para(unsigned int cpu,
  struct xen_cppc_para *cppc_para)
 {
@@ -639,6 +644,7 @@ int set_hwp_para(struct cpufreq_policy *policy,
 
 return hwp_cpufreq_target(policy, 0, 0);
 }
+#endif /* CONFIG_PM_OP */
 
 int __init hwp_register_driver(void)
 {
diff --git a/xen/arch/x86/acpi/cpufreq/powernow.c 
b/xen/arch/x86/acpi/cpufreq/powernow.c
index 69364e1855..12fca45b45 100644
--- a/xen/arch/x86/acpi/cpufreq/powernow.c
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c
@@ -49,6 +49,7 @@ static void cf_check transition_pstate(void *pstate)
 wrmsrl(MSR_PSTATE_CTRL, *(unsigned int *)pstate);
 }
 
+#ifdef CONFIG_PM_OP
 static void cf_check update_cpb(void *data)
 {
 struct cpufreq_policy *policy = data;
@@ -77,6 +78,7 @@ static int cf_check powernow_cpufreq_update(
 
 return 0;
 }
+#endif /* CONFIG_PM_OP */
 
 static int cf_check powernow_cpufreq_target(
 struct cpufreq_policy *policy,
@@ -324,7 +326,9 @@ powernow_cpufreq_driver = {
 .target = powernow_cpufreq_target,
 .init   = powernow_cpufreq_cpu_init,
 .exit   = powernow_cpufreq_cpu_exit,
+#ifdef CONFIG_PM_OP
 .update = powernow_cpufreq_update
+#endif
 };
 
 unsigned int __init powernow_register_driver(void)
diff --git a/xen/arch/x86/configs/pvshim_defconfig 
b/xen/arch/x86/configs/pvshim_defconfig
index 2ad27f898e..bacd04c963 100644
--- a/xen/arch/x86/configs/pvshim_defconfig
+++ b/xen/arch/x86/configs/pvshim_defconfig
@@ -26,3 +26,4 @@ CONFIG_EXPERT=y
 # CONFIG_INTEL_IOMMU is not set
 # CONFIG_DEBUG is not set
 # CONFIG_GDBSX is not set
+# CONFIG_PM_OP is not set
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index dbd9747d1

[PATCH v6 07/18] xen/sysctl: wrap around XEN_SYSCTL_lockprof_op

2025-07-04 Thread Penny Zheng

The following function is only to serve spinlock profiling via
XEN_SYSCTL_lockprof_op, so it shall be wrapped:
- spinlock_profile_control()

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
v2 -> v3:
- add the blank line
---
v3 -> v4:
- remove transient "#ifdef CONFIG_SYSCTL"
---
 xen/common/spinlock.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xen/common/spinlock.c b/xen/common/spinlock.c
index 38caa10a2e..0389293b09 100644
--- a/xen/common/spinlock.c
+++ b/xen/common/spinlock.c
@@ -690,6 +690,7 @@ void cf_check spinlock_profile_reset(unsigned char key)
 spinlock_profile_iterate(spinlock_profile_reset_elem, NULL);
 }
 
+#ifdef CONFIG_SYSCTL
 typedef struct {
 struct xen_sysctl_lockprof_op *pc;
 int  rc;
@@ -749,6 +750,7 @@ int spinlock_profile_control(struct xen_sysctl_lockprof_op 
*pc)
 
 return rc;
 }
+#endif /* CONFIG_SYSCTL */
 
 void _lock_profile_register_struct(
 int32_t type, struct lock_profile_qhead *qhead, int32_t idx)
-- 
2.34.1

[PATCH v6 03/18] xen/sysctl: wrap around XEN_SYSCTL_readconsole

2025-07-04 Thread Penny Zheng

The following functions is to deal with XEN_SYSCTL_readconsole sub-op, and
shall be wrapped:
- xsm_readconsole()
- read_console_ring()

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
v2 -> v3:
- move #endif up ahead of the blank line
---
v3 -> v4:
- remove transient "#ifdef CONFIG_SYSCTL"
---
 xen/drivers/char/console.c | 2 ++
 xen/include/xsm/xsm.h  | 4 
 xen/xsm/dummy.c| 2 +-
 xen/xsm/flask/hooks.c  | 4 ++--
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c
index ba5a809a99..c5afac9f72 100644
--- a/xen/drivers/char/console.c
+++ b/xen/drivers/char/console.c
@@ -368,6 +368,7 @@ static void conring_puts(const char *str, size_t len)
 conringc = conringp - conring_size;
 }
 
+#ifdef CONFIG_SYSCTL
 long read_console_ring(struct xen_sysctl_readconsole *op)
 {
 XEN_GUEST_HANDLE_PARAM(char) str;
@@ -410,6 +411,7 @@ long read_console_ring(struct xen_sysctl_readconsole *op)
 
 return 0;
 }
+#endif /* CONFIG_SYSCTL */
 
 
 /*
diff --git a/xen/include/xsm/xsm.h b/xen/include/xsm/xsm.h
index 22e2429f52..042a99449f 100644
--- a/xen/include/xsm/xsm.h
+++ b/xen/include/xsm/xsm.h
@@ -270,7 +270,11 @@ static inline int xsm_sysctl(xsm_default_t def, int cmd)
 
 static inline int xsm_readconsole(xsm_default_t def, uint32_t clear)
 {
+#ifdef CONFIG_SYSCTL
 return alternative_call(xsm_ops.readconsole, clear);
+#else
+return -EOPNOTSUPP;
+#endif
 }
 
 static inline int xsm_evtchn_unbound(
diff --git a/xen/xsm/dummy.c b/xen/xsm/dummy.c
index 93a0665ecc..cd0e844fcf 100644
--- a/xen/xsm/dummy.c
+++ b/xen/xsm/dummy.c
@@ -24,8 +24,8 @@ static const struct xsm_ops __initconst_cf_clobber dummy_ops 
= {
 .domctl= xsm_domctl,
 #ifdef CONFIG_SYSCTL
 .sysctl= xsm_sysctl,
-#endif
 .readconsole   = xsm_readconsole,
+#endif
 
 .evtchn_unbound= xsm_evtchn_unbound,
 .evtchn_interdomain= xsm_evtchn_interdomain,
diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
index 3a43e5a1d6..df7e10775b 100644
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -934,7 +934,6 @@ static int cf_check flask_sysctl(int cmd)
 return avc_unknown_permission("sysctl", cmd);
 }
 }
-#endif /* CONFIG_SYSCTL */
 
 static int cf_check flask_readconsole(uint32_t clear)
 {
@@ -945,6 +944,7 @@ static int cf_check flask_readconsole(uint32_t clear)
 
 return domain_has_xen(current->domain, perms);
 }
+#endif /* CONFIG_SYSCTL */
 
 static inline uint32_t resource_to_perm(uint8_t access)
 {
@@ -1888,8 +1888,8 @@ static const struct xsm_ops __initconst_cf_clobber 
flask_ops = {
 .domctl = flask_domctl,
 #ifdef CONFIG_SYSCTL
 .sysctl = flask_sysctl,
-#endif
 .readconsole = flask_readconsole,
+#endif
 
 .evtchn_unbound = flask_evtchn_unbound,
 .evtchn_interdomain = flask_evtchn_interdomain,
-- 
2.34.1

[PATCH v6 05/18] xen/sysctl: wrap around XEN_SYSCTL_sched_id

2025-07-04 Thread Penny Zheng

The following function shall be wrapped:
- scheduler_id()

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
v3 -> v4:
- remove transient "#ifdef CONFIG_SYSCTL"
---
 xen/common/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index 9043414290..13fdf57e57 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -2069,11 +2069,13 @@ long do_set_timer_op(s_time_t timeout)
 return 0;
 }
 
+#ifdef CONFIG_SYSCTL
 /* scheduler_id - fetch ID of current scheduler */
 int scheduler_id(void)
 {
 return operations.sched_id;
 }
+#endif
 
 /* Adjust scheduling parameter for a given domain. */
 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
-- 
2.34.1

[PATCH v6 00/18] xen: introduce CONFIG_SYSCTL

2025-07-04 Thread Penny Zheng

It can be beneficial for some dom0less systems to further reduce Xen footprint
via disabling some hypercalls handling code, which may not to be used &
required in such systems. Each hypercall has a separate option to keep
configuration flexible.

Options to disable hypercalls:
- sysctl
- domctl
- hvm
- physdev
- platform

This patch serie is only focusing on introducing CONFIG_SYSCTL. Different
options will be covered in different patch serie.

Features, like LIVEPATCH, Overlay DTB, which fully rely on sysctl op, will
be wrapped with CONFIG_SYSCTL, to reduce Xen footprint as much as possible.

It is derived from Stefano Stabellini's commit "xen: introduce kconfig options 
to
disable hypercalls"(
https://lore.kernel.org/xen-devel/20241219092917.3006174-1-sergiy_kib...@epam.com)

---
Commit "xen/x86: remove "depends on !PV_SHIM_EXCLUSIVE"" and commit "
xen/sysctl: wrap around sysctl hypercall" shall be commited together.
---
Penny Zheng (16):
  xen/xsm: wrap around xsm_sysctl with CONFIG_SYSCTL
  xen/sysctl: wrap around XEN_SYSCTL_readconsole
  xen/sysctl: make CONFIG_TRACEBUFFER depend on CONFIG_SYSCTL
  xen/sysctl: wrap around XEN_SYSCTL_sched_id
  xen/sysctl: wrap around XEN_SYSCTL_perfc_op
  xen/sysctl: wrap around XEN_SYSCTL_lockprof_op
  xen/pmstat: introduce CONFIG_PM_OP
  xen/sysctl: introduce CONFIG_PM_STATS
  xen/sysctl: wrap around XEN_SYSCTL_page_offline_op
  xen/sysctl: wrap around XEN_SYSCTL_cpupool_op
  xen/sysctl: wrap around XEN_SYSCTL_scheduler_op
  xen/sysctl: wrap around XEN_SYSCTL_physinfo
  xen/sysctl: make CONFIG_COVERAGE depend on CONFIG_SYSCTL
  xen/sysctl: make CONFIG_LIVEPATCH depend on CONFIG_SYSCTL
  xen/sysctl: wrap around arch-specific arch_do_sysctl
  xen/x86: remove "depends on !PV_SHIM_EXCLUSIVE"

Stefano Stabellini (2):
  xen: introduce CONFIG_SYSCTL
  xen/sysctl: wrap around sysctl hypercall

 xen/Kconfig.debug|   2 +-
 xen/arch/arm/Kconfig |   1 +
 xen/arch/arm/Makefile|   2 +-
 xen/arch/riscv/stubs.c   |   2 +
 xen/arch/x86/Kconfig |   4 -
 xen/arch/x86/Makefile|   2 +-
 xen/arch/x86/acpi/cpu_idle.c |   2 +
 xen/arch/x86/acpi/cpufreq/hwp.c  |   6 +
 xen/arch/x86/acpi/cpufreq/powernow.c |   4 +
 xen/arch/x86/configs/pvshim_defconfig|   3 +
 xen/arch/x86/hvm/Kconfig |   1 -
 xen/arch/x86/psr.c   |  18 +
 xen/common/Kconfig   |  31 +-
 xen/common/Makefile  |   2 +-
 xen/common/page_alloc.c  |   4 +
 xen/common/perfc.c   |   2 +
 xen/common/sched/arinc653.c  |   6 +
 xen/common/sched/core.c  |   4 +
 xen/common/sched/cpupool.c   |   8 +
 xen/common/sched/credit.c|   4 +
 xen/common/sched/credit2.c   |   4 +
 xen/common/sched/private.h   |   4 +
 xen/common/spinlock.c|   2 +
 xen/common/sysctl.c  |   4 +-
 xen/drivers/acpi/Makefile|   3 +-
 xen/drivers/acpi/pm-op.c | 395 +++
 xen/drivers/acpi/pmstat.c| 355 -
 xen/drivers/char/console.c   |   2 +
 xen/drivers/cpufreq/cpufreq_misc_governors.c |   2 +
 xen/drivers/cpufreq/cpufreq_ondemand.c   |   2 +
 xen/drivers/cpufreq/utility.c|  41 --
 xen/drivers/video/Kconfig|   2 +-
 xen/include/acpi/cpufreq/cpufreq.h   |   3 -
 xen/include/acpi/cpufreq/processor_perf.h|  10 +
 xen/include/hypercall-defs.c |   8 +-
 xen/include/xsm/xsm.h|  18 +
 xen/xsm/dummy.c  |   6 +
 xen/xsm/flask/hooks.c|  14 +
 38 files changed, 569 insertions(+), 414 deletions(-)
 create mode 100644 xen/drivers/acpi/pm-op.c

-- 
2.34.1

[PATCH v6 11/18] xen/sysctl: wrap around XEN_SYSCTL_cpupool_op

2025-07-04 Thread Penny Zheng

Function cpupool_do_sysctl is designed for doing cpupool related sysctl
operations, and shall be wrapped.

The following static functions are only called by cpupool_do_sysctl(), then
shall be wrapped too:
- cpupool_get_next_by_id
- cpupool_destroy
- cpupool_unassign_cpu_helper
- cpupool_unassign_cpu

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
v1 -> v2:
- no need to wrap declaration
- add transient #ifdef in sysctl.c for correct compilation
---
v2 -> v3
- move #endif up ahead of the blank line
---
v3 -> v4:
- remove transient "#ifdef CONFIG_SYSCTL"
---
 xen/common/sched/cpupool.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c
index 3d02c7b706..f5459c2779 100644
--- a/xen/common/sched/cpupool.c
+++ b/xen/common/sched/cpupool.c
@@ -241,10 +241,12 @@ struct cpupool *cpupool_get_by_id(unsigned int poolid)
 return __cpupool_get_by_id(poolid, true);
 }
 
+#ifdef CONFIG_SYSCTL
 static struct cpupool *cpupool_get_next_by_id(unsigned int poolid)
 {
 return __cpupool_get_by_id(poolid, false);
 }
+#endif /* CONFIG_SYSCTL */
 
 void cpupool_put(struct cpupool *pool)
 {
@@ -352,6 +354,7 @@ static struct cpupool *cpupool_create(unsigned int poolid,
 
 return ERR_PTR(ret);
 }
+#ifdef CONFIG_SYSCTL
 /*
  * destroys the given cpupool
  * returns 0 on success, 1 else
@@ -379,6 +382,7 @@ static int cpupool_destroy(struct cpupool *c)
 debugtrace_printk("cpupool_destroy(pool=%u)\n", c->cpupool_id);
 return 0;
 }
+#endif /* CONFIG_SYSCTL */
 
 /*
  * Move domain to another cpupool
@@ -568,6 +572,7 @@ static int cpupool_unassign_cpu_start(struct cpupool *c, 
unsigned int cpu)
 return ret;
 }
 
+#ifdef CONFIG_SYSCTL
 static long cf_check cpupool_unassign_cpu_helper(void *info)
 {
 struct cpupool *c = info;
@@ -633,6 +638,7 @@ static int cpupool_unassign_cpu(struct cpupool *c, unsigned 
int cpu)
 }
 return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
 }
+#endif /* CONFIG_SYSCTL */
 
 /*
  * add a new domain to a cpupool
@@ -810,6 +816,7 @@ static void cpupool_cpu_remove_forced(unsigned int cpu)
 rcu_read_unlock(&sched_res_rculock);
 }
 
+#ifdef CONFIG_SYSCTL
 /*
  * do cpupool related sysctl operations
  */
@@ -975,6 +982,7 @@ int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op)
 
 return ret;
 }
+#endif /* CONFIG_SYSCTL */
 
 unsigned int cpupool_get_id(const struct domain *d)
 {
-- 
2.34.1

[PATCH v6 06/18] xen/sysctl: wrap around XEN_SYSCTL_perfc_op

2025-07-04 Thread Penny Zheng

perfc_control() and perfc_copy_info() are responsible for providing control
of perf counters via XEN_SYSCTL_perfc_op in DOM0, so they both shall
be wrapped.

Signed-off-by: Penny Zheng 
Reviewed-by: Stefano Stabellini 
---
v2 -> v3:
- add the blank line
---
v3 -> v4:
- remove transient "#ifdef CONFIG_SYSCTL"
---
 xen/common/perfc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xen/common/perfc.c b/xen/common/perfc.c
index 8302b7cf6d..0f3b89af2c 100644
--- a/xen/common/perfc.c
+++ b/xen/common/perfc.c
@@ -149,6 +149,7 @@ void cf_check perfc_reset(unsigned char key)
 }
 }
 
+#ifdef CONFIG_SYSCTL
 static struct xen_sysctl_perfc_desc perfc_d[NR_PERFCTRS];
 static xen_sysctl_perfc_val_t *perfc_vals;
 static unsigned int  perfc_nbr_vals;
@@ -265,6 +266,7 @@ int perfc_control(struct xen_sysctl_perfc_op *pc)
 
 return rc;
 }
+#endif /* CONFIG_SYSCTL */
 
 /*
  * Local variables:
-- 
2.34.1

RE: [PATCH v6 00/18] xen: introduce CONFIG_SYSCTL

2025-07-04 Thread Penny, Zheng

[Public]

> -Original Message-
> From: Jan Beulich 
> Sent: Friday, July 4, 2025 5:49 PM
> To: Penny, Zheng 
> Cc: Huang, Ray ; Andrew Cooper
> ; Anthony PERARD ;
> Orzel, Michal ; Julien Grall ; Roger Pau
> Monné ; Stefano Stabellini ; 
> Daniel
> P. Smith ; Dario Faggioli ;
> Juergen Gross ; George Dunlap ;
> Nathan Studer ; Stewart Hildebrand
> ; Bertrand Marquis ; Volodymyr
> Babchuk ; Alistair Francis
> ; Bob Eshleman ;
> Connor Davis ; Oleksii Kurochko
> ; xen-devel@lists.xenproject.org; xen-
> de...@dornerworks.com
> Subject: Re: [PATCH v6 00/18] xen: introduce CONFIG_SYSCTL
>
> On 04.07.2025 11:29, Penny Zheng wrote:
> > It can be beneficial for some dom0less systems to further reduce Xen
> > footprint via disabling some hypercalls handling code, which may not
> > to be used & required in such systems. Each hypercall has a separate
> > option to keep configuration flexible.
> >
> > Options to disable hypercalls:
> > - sysctl
> > - domctl
> > - hvm
> > - physdev
> > - platform
> >
> > This patch serie is only focusing on introducing CONFIG_SYSCTL.
> > Different options will be covered in different patch serie.
> >
> > Features, like LIVEPATCH, Overlay DTB, which fully rely on sysctl op,
> > will be wrapped with CONFIG_SYSCTL, to reduce Xen footprint as much as
> possible.
> >
> > It is derived from Stefano Stabellini's commit "xen: introduce kconfig
> > options to disable hypercalls"(
> > https://lore.kernel.org/xen-devel/20241219092917.3006174-1-Sergiy_Kibr
> > i...@epam.com)
> >
> > ---
> > Commit "xen/x86: remove "depends on !PV_SHIM_EXCLUSIVE"" and commit "
> > xen/sysctl: wrap around sysctl hypercall" shall be commited together.
> > ---
> > Penny Zheng (16):
> >   xen/xsm: wrap around xsm_sysctl with CONFIG_SYSCTL
> >   xen/sysctl: wrap around XEN_SYSCTL_readconsole
> >   xen/sysctl: make CONFIG_TRACEBUFFER depend on CONFIG_SYSCTL
> >   xen/sysctl: wrap around XEN_SYSCTL_sched_id
> >   xen/sysctl: wrap around XEN_SYSCTL_perfc_op
> >   xen/sysctl: wrap around XEN_SYSCTL_lockprof_op
> >   xen/pmstat: introduce CONFIG_PM_OP
> >   xen/sysctl: introduce CONFIG_PM_STATS
> >   xen/sysctl: wrap around XEN_SYSCTL_page_offline_op
> >   xen/sysctl: wrap around XEN_SYSCTL_cpupool_op
> >   xen/sysctl: wrap around XEN_SYSCTL_scheduler_op
> >   xen/sysctl: wrap around XEN_SYSCTL_physinfo
> >   xen/sysctl: make CONFIG_COVERAGE depend on CONFIG_SYSCTL
> >   xen/sysctl: make CONFIG_LIVEPATCH depend on CONFIG_SYSCTL
> >   xen/sysctl: wrap around arch-specific arch_do_sysctl
> >   xen/x86: remove "depends on !PV_SHIM_EXCLUSIVE"
> >
> > Stefano Stabellini (2):
> >   xen: introduce CONFIG_SYSCTL
> >   xen/sysctl: wrap around sysctl hypercall
>
> This doesn't look to be based on latest staging, where some of the changes 
> above
> are already present. I specifically tried to get some of the stuff in so that 
> the next
> re-posting wouldn't be as large.

Sorry, I'll rebase

>
> Jan

Re: [PATCH v6 30/39] accel: Propagate AccelState to AccelClass::init_machine()

2025-07-04 Thread Zhao Liu

On Thu, Jul 03, 2025 at 07:32:36PM +0200, Philippe Mathieu-Daudé wrote:
> Date: Thu,  3 Jul 2025 19:32:36 +0200
> From: Philippe Mathieu-Daudé 
> Subject: [PATCH v6 30/39] accel: Propagate AccelState to
>  AccelClass::init_machine()
> X-Mailer: git-send-email 2.49.0
> 
> In order to avoid init_machine() to call current_accel(),
> pass AccelState along.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> Reviewed-by: Richard Henderson 
> Reviewed-by: Alex Bennée 
> ---
>  include/qemu/accel.h| 2 +-
>  accel/accel-system.c| 2 +-
>  accel/hvf/hvf-all.c | 2 +-
>  accel/kvm/kvm-all.c | 2 +-
>  accel/qtest/qtest.c | 2 +-
>  accel/tcg/tcg-all.c | 2 +-
>  accel/xen/xen-all.c | 2 +-
>  bsd-user/main.c | 2 +-
>  linux-user/main.c   | 2 +-
>  target/i386/nvmm/nvmm-all.c | 2 +-
>  target/i386/whpx/whpx-all.c | 2 +-
>  11 files changed, 11 insertions(+), 11 deletions(-)

...

> diff --git a/accel/accel-system.c b/accel/accel-system.c
> index b5b368c6a9c..fb8abe38594 100644
> --- a/accel/accel-system.c
> +++ b/accel/accel-system.c
> @@ -37,7 +37,7 @@ int accel_init_machine(AccelState *accel, MachineState *ms)
>  int ret;
>  ms->accelerator = accel;
>  *(acc->allowed) = true;
> -ret = acc->init_machine(ms);
> +ret = acc->init_machine(accel, ms);

Now we've already set "ms->accelerator", so that we could get @accel
by ms->accelerator.

But considerring the user emulation, where the @ms is NULL, and for
these cases, it needs to bring current_accel() back in patch 32.

Anyway, this solution is also fine for me, so,

Reviewed-by: Zhao Liu 


...But there're still more comments/questions about user emulation:

> --- a/bsd-user/main.c
> +++ b/bsd-user/main.c
> @@ -474,7 +474,7 @@ int main(int argc, char **argv)
>   opt_one_insn_per_tb, &error_abort);
>  object_property_set_int(OBJECT(accel), "tb-size",
>  opt_tb_size, &error_abort);
> -ac->init_machine(NULL);
> +ac->init_machine(accel, NULL);

Not the issue about this patch though,

it seems user emulation doesn't set acc->allowed. At least TCG enabled
is necessary, I guess?

>  }
>  
>  /*
> diff --git a/linux-user/main.c b/linux-user/main.c
> index 5ac5b55dc65..a9142ee7268 100644
> --- a/linux-user/main.c
> +++ b/linux-user/main.c
> @@ -820,7 +820,7 @@ int main(int argc, char **argv, char **envp)
>   opt_one_insn_per_tb, &error_abort);
>  object_property_set_int(OBJECT(accel), "tb-size",
>  opt_tb_size, &error_abort);
> -ac->init_machine(NULL);
> +ac->init_machine(accel, NULL);

Ditto.

>  }
>  
>  /*

Thanks,
Zhao

Re: [PATCH v4 56/65] accel: Expose and register generic_handle_interrupt()

2025-07-04 Thread Mads Ynddal



> On 2 Jul 2025, at 20.53, Philippe Mathieu-Daudé  wrote:
> 
> In order to dispatch over AccelOpsClass::handle_interrupt(),
> we need it always defined, not calling a hidden handler under
> the hood. Make AccelOpsClass::handle_interrupt() mandatory.
> Expose generic_handle_interrupt() prototype and register it
> for each accelerator.
> 
> Suggested-by: Richard Henderson 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
> include/system/accel-ops.h| 3 +++
> accel/hvf/hvf-accel-ops.c | 1 +
> accel/kvm/kvm-accel-ops.c | 1 +
> accel/qtest/qtest.c   | 1 +
> accel/xen/xen-all.c   | 1 +
> system/cpus.c | 9 +++--
> target/i386/nvmm/nvmm-accel-ops.c | 1 +
> target/i386/whpx/whpx-accel-ops.c | 1 +
> 8 files changed, 12 insertions(+), 6 deletions(-)
> 

Reviewed-by: Mads Ynddal

Re: [PATCH v6 38/39] accel: Extract AccelClass definition to 'accel/accel-ops.h'

2025-07-04 Thread Richard Henderson


On 7/3/25 11:32, Philippe Mathieu-Daudé wrote:

Only accelerator implementations (and the common accelator
code) need to know about AccelClass internals. Move the
definition out but forward declare AccelState and AccelClass.

Signed-off-by: Philippe Mathieu-Daudé
---
  MAINTAINERS |  2 +-
  include/accel/accel-ops.h   | 50 +
  include/qemu/accel.h| 40 ++---
  include/system/hvf_int.h|  3 ++-
  include/system/kvm_int.h|  1 +
  accel/accel-common.c|  1 +
  accel/accel-system.c|  1 +
  accel/hvf/hvf-all.c |  1 +
  accel/kvm/kvm-all.c |  1 +
  accel/qtest/qtest.c |  1 +
  accel/tcg/tcg-accel-ops.c   |  1 +
  accel/tcg/tcg-all.c |  1 +
  accel/xen/xen-all.c |  1 +
  bsd-user/main.c |  1 +
  gdbstub/system.c|  1 +
  linux-user/main.c   |  1 +
  system/memory.c |  1 +
  target/i386/nvmm/nvmm-all.c |  1 +
  target/i386/whpx/whpx-all.c |  1 +
  19 files changed, 70 insertions(+), 40 deletions(-)
  create mode 100644 include/accel/accel-ops.h


Reviewed-by: Richard Henderson 

r~

Re: [PATCH v6 37/39] accel: Rename 'system/accel-ops.h' -> 'accel/accel-cpu-ops.h'

2025-07-04 Thread Richard Henderson


On 7/3/25 11:32, Philippe Mathieu-Daudé wrote:

Unfortunately "system/accel-ops.h" handlers are not only
system-specific. For example, the cpu_reset_hold() hook
is part of the vCPU creation, after it is realized.

Mechanical rename to drop 'system' using:

   $ sed -i -e s_system/accel-ops.h_accel/accel-cpu-ops.h_g \
   $(git grep -l system/accel-ops.h)

Signed-off-by: Philippe Mathieu-Daudé
---
  include/{system/accel-ops.h => accel/accel-cpu-ops.h} | 8 
  accel/accel-common.c  | 2 +-
  accel/accel-system.c  | 2 +-
  accel/hvf/hvf-accel-ops.c | 2 +-
  accel/kvm/kvm-accel-ops.c | 2 +-
  accel/qtest/qtest.c   | 2 +-
  accel/tcg/tcg-accel-ops.c | 2 +-
  accel/xen/xen-all.c   | 2 +-
  cpu-target.c  | 2 +-
  gdbstub/system.c  | 2 +-
  system/cpus.c | 2 +-
  target/i386/nvmm/nvmm-accel-ops.c | 2 +-
  target/i386/whpx/whpx-accel-ops.c | 2 +-
  13 files changed, 16 insertions(+), 16 deletions(-)
  rename include/{system/accel-ops.h => accel/accel-cpu-ops.h} (96%)


Reviewed-by: Richard Henderson 

r~

Re: [PATCH v5 3/9] xen/riscv: imsic_init() implementation

2025-07-04 Thread Oleksii Kurochko



On 6/30/25 4:27 PM, Jan Beulich wrote:

--- a/xen/arch/riscv/include/asm/smp.h
+++ b/xen/arch/riscv/include/asm/smp.h
@@ -3,6 +3,7 @@
  #define ASM__RISCV__SMP_H
  
  #include 

+#include 
  #include 
  
  #include 

@@ -18,6 +19,18 @@ static inline unsigned long cpuid_to_hartid(unsigned long 
cpuid)
  return pcpu_info[cpuid].hart_id;
  }
  
+static inline unsigned int hartid_to_cpuid(unsigned long hartid)

+{
+for ( unsigned int cpuid = 0; cpuid < ARRAY_SIZE(pcpu_info); cpuid++ )

We had been there before, I think: Why "cpuid", not "cpu" (as we have it about
everywhere else)?


To be in sync with other already merged functions, f.e. 
set_cpuid_to_hartid(cpuid, hartid).

~ Oleksii

Re: [PATCH 3/6] xen/softirq: Rework arch_skip_send_event_check() into arch_pend_softirq()

2025-07-04 Thread Andrew Cooper

On 04/07/2025 9:25 am, Jan Beulich wrote:
> On 04.07.2025 09:55, Roger Pau Monné wrote:
>> On Fri, Jul 04, 2025 at 09:23:29AM +0200, Jan Beulich wrote:
>>> On 03.07.2025 18:21, Roger Pau Monné wrote:
 On Wed, Jul 02, 2025 at 03:41:18PM +0100, Andrew Cooper wrote:
> --- a/xen/include/xen/softirq.h
> +++ b/xen/include/xen/softirq.h
> @@ -23,6 +23,22 @@ enum {
>  
>  #define NR_SOFTIRQS (NR_COMMON_SOFTIRQS + NR_ARCH_SOFTIRQS)
>  
> +/*
> + * Ensure softirq @nr is pending on @cpu.  Return true if an IPI can be
> + * skipped, false if the IPI cannot be skipped.
> + */
> +#ifndef arch_pend_softirq
> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned 
> int cpu)
 Nit: I would maybe it arch_set_softirq(), I find `pend` not that clear
 (I would rather fully spell `pending` instead).
>>> I, too, did wonder about the naming here. But using "pending" as you suggest
>>> has the effect of giving the function a name we would normally associate 
>>> with
>>> a predicate ("Is it pending?"), whereas here the function is used to _mark_ 
>>> a
>>> softirq as pending. Hence in the end I didn't comment at all; I'd be fine
>>> with "set", but I'm also okay with "pend".
>> It's a set and check kind of function, so I don't care much.  Just
>> found the pend a bit too short, I don't think we usually abbreviate
>> pending to pend.
> Aiui it's not an abbreviation, but kind of a verb (inverse-)derived from 
> pending.
> I don't know whether that's "official"; my dictionary doesn't have it.

It's used frequently in some circles, meaning "to make pending".

But I've changed the name because I don't care enough to argue.

~Andrew

Re: [PATCH 4/6] x86/idle: Implement a new MWAIT IPI-elision algorithm

2025-07-04 Thread Andrew Cooper

On 04/07/2025 8:24 am, Roger Pau Monné wrote:
> On Thu, Jul 03, 2025 at 06:48:23PM +0100, Andrew Cooper wrote:
>> On 03/07/2025 5:36 pm, Roger Pau Monné wrote:
>>> On Wed, Jul 02, 2025 at 03:41:19PM +0100, Andrew Cooper wrote:
 In order elide IPIs, we must be able to identify whether a target CPU is in
 MWAIT at the point it is woken up.  i.e. the store to wake it up must also
 identify the state.

 Create a new in_mwait variable beside __softirq_pending, so we can use a
 CMPXCHG to set the softirq while also observing the status safely.  
 Implement
 an x86 version of arch_pend_softirq() which does this.

 In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
 precisely what it means.  X86_BUG_MONITOR can be accounted for simply by 
 not
 advertising in_mwait.

 Signed-off-by: Andrew Cooper 
 ---
 CC: Jan Beulich 
 CC: Roger Pau Monné 
 CC: Anthony PERARD 
 CC: Michal Orzel 
 CC: Julien Grall 
 CC: Stefano Stabellini 

 This is modelled after Linux's TIF_NEED_RESCHED (single bit equivelent of 
 all
 of __softirq_pending), and TIF_POLLING_NRFLAG (arch-neutral "in_mwait").

 In Linux, they're both in the same flags field, which adds complexity.  In
 Xen, __softirq_pending is already unsigned long for everything other than 
 x86,
 so adding an arch-neutral "in_mwait" would need wider changes.
 ---
  xen/arch/x86/acpi/cpu_idle.c   | 20 +-
  xen/arch/x86/include/asm/hardirq.h | 14 +++-
  xen/arch/x86/include/asm/softirq.h | 34 ++
  3 files changed, 66 insertions(+), 2 deletions(-)

 diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
 index caa0fef0b3b1..13040ef467a0 100644
 --- a/xen/arch/x86/acpi/cpu_idle.c
 +++ b/xen/arch/x86/acpi/cpu_idle.c
 @@ -439,7 +439,21 @@ __initcall(cpu_idle_key_init);
  void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
  {
  unsigned int cpu = smp_processor_id();
 -const unsigned int *this_softirq_pending = &softirq_pending(cpu);
 +irq_cpustat_t *stat = &irq_stat[cpu];
 +const unsigned int *this_softirq_pending = &stat->__softirq_pending;
 +
 +/*
 + * By setting in_mwait, we promise to other CPUs that we'll notice 
 changes
 + * to __softirq_pending without being sent an IPI.  We achieve this by
 + * either not going to sleep, or by having hardware notice on our 
 behalf.
 + *
 + * Some errata exist where MONITOR doesn't work properly, and the
 + * workaround is to force the use of an IPI.  Cause this to happen by
 + * simply not advertising outselves as being in_mwait.
 + */
 +alternative_io("movb $1, %[in_mwait]",
 +   "", X86_BUG_MONITOR,
 +   [in_mwait] "=m" (stat->in_mwait));
  
  monitor(this_softirq_pending, 0, 0);
  smp_mb();
 @@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned 
 int ecx)
  mwait(eax, ecx);
  spec_ctrl_exit_idle(info);
  }
 +
 +alternative_io("movb $0, %[in_mwait]",
 +   "", X86_BUG_MONITOR,
 +   [in_mwait] "=m" (stat->in_mwait));
>>> Isn't it a bit overkill to use alternatives here when you could have a
>>> conditional based on a global variable to decide whether to set/clear
>>> in_mwait?
>> I view it differently.  Why should the common case suffer overhead
>> (extra memory read and conditional branch) to work around 3 buggy pieces
>> of hardware in a common path?
> TBH I don't think the extra read and conditional would make any
> difference in this specific path performance wise.  Either the CPU is
> going to sleep and has nothing to do, or the cost of getting back from
> idle will shadow the overhead of an extra read and conditional.
>
> It's all a question of taste I guess.  If it was me I would set/clear
> stat->in_mwait unconditionally in mwait_idle_with_hints(), and then in
> arch_pend_softirq() I would return:
>
> return new & (1UL << 32) || force_mwait_ipi_wakeup;
>
> Or AFAICT you could possibly skip the cmpxchg() in the
> force_mwait_ipi_wakeup case in arch_pend_softirq(), and just do:
>
> if ( force_mwait_ipi_wakeup )
> return test_and_set_bit(nr, &softirq_pending(cpu));
>
> Because in that case Xen doesn't care about the in_mwait status.  It
> would be an optimization for the buggy hardware that already has to
> deal with the extra cost of always sending an IPI.

These will both function, but they're both poor options.

They're in a loop over a cpumask, and because of the full barriers in
the atomic options, the read cannot be hoisted or the loop split,
because the compiler has been told "the value may change on each loop
iteration".


This was a code-gen/perf note I h

Re: [PATCH 4/6] x86/idle: Implement a new MWAIT IPI-elision algorithm

2025-07-04 Thread Andrew Cooper

On 04/07/2025 8:52 am, Roger Pau Monné wrote:
> On Wed, Jul 02, 2025 at 03:41:19PM +0100, Andrew Cooper wrote:
>> diff --git a/xen/arch/x86/include/asm/softirq.h 
>> b/xen/arch/x86/include/asm/softirq.h
>> index e4b194f069fb..069e5716a68d 100644
>> --- a/xen/arch/x86/include/asm/softirq.h
>> +++ b/xen/arch/x86/include/asm/softirq.h
>> @@ -1,6 +1,8 @@
>>  #ifndef __ASM_SOFTIRQ_H__
>>  #define __ASM_SOFTIRQ_H__
>>  
>> +#include 
>> +
>>  #define NMI_SOFTIRQ(NR_COMMON_SOFTIRQS + 0)
>>  #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
>>  #define VCPU_KICK_SOFTIRQ  (NR_COMMON_SOFTIRQS + 2)
>> @@ -9,4 +11,36 @@
>>  #define HVM_DPCI_SOFTIRQ   (NR_COMMON_SOFTIRQS + 4)
>>  #define NR_ARCH_SOFTIRQS   5
>>  
>> +/*
>> + * Ensure softirq @nr is pending on @cpu.  Return true if an IPI can be
>> + * skipped, false if the IPI cannot be skipped.
>> + *
>> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order 
>> to
>> + * set softirq @nr while also observing in_mwait in a race-free way.
>> + */
>> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int 
>> cpu)
>> +{
>> +uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
>> +uint64_t old, new;
>> +unsigned int softirq = 1U << nr;
>> +
>> +old = ACCESS_ONCE(*ptr);
>> +
>> +do {
>> +if ( old & softirq )
>> +/* Softirq already pending, nothing to do. */
>> +return true;
>> +
>> +new = old | softirq;
>> +
>> +} while ( (old = cmpxchg(ptr, old, new)) != new );
>> +
>> +/*
>> + * We have caused the softirq to become pending.  If in_mwait was set, 
>> the
>> + * target CPU will notice the modification and act on it.
>> + */
>> +return new & (1UL << 32);
> Maybe I haven't got enough coffee yet, but if we do the cmpxchg()
> won't it be enough to send the IPI when softirq is first set, but not
> necessarily for each extra softirq that's set if there's already one
> enabled?

It was me who didn't have enough coffee when writing the code.  Already
fixed locally.

~Andrew

[PATCH v2 1/6] x86/idle: Remove broken MWAIT implementation

2025-07-04 Thread Andrew Cooper

cpuidle_wakeup_mwait() is a TOCTOU race.  The cpumask_and() sampling
cpuidle_mwait_flags can take a arbitrary period of time, and there's no
guarantee that the target CPUs are still in MWAIT when writing into
mwait_wakeup(cpu).

The consequence of the race is that we'll fail to IPI certain targets.  Also,
there's no guarantee that mwait_idle_with_hints() will raise a TIMER_SOFTIRQ
on it's way out.

The fundamental bug is that the "in_mwait" variable needs to be in the
monitored line, and not in a separate cpuidle_mwait_flags variable, in order
to do this in a race-free way.

Arranging to fix this while keeping the old implementation is prohibitive, so
strip the current one out in order to implement the new one cleanly.  In the
interim, this causes IPIs to be used unconditionally which is safe albeit
suboptimal.

Fixes: 3d521e933e1b ("cpuidle: mwait on softirq_pending & remove wakeup ipis")
Fixes: 1adb34ea846d ("CPUIDLE: re-implement mwait wakeup process")
Signed-off-by: Andrew Cooper 
Reviewed-by: Roger Pau Monné 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Anthony PERARD 
CC: Michal Orzel 
CC: Julien Grall 
CC: Stefano Stabellini 
---
 xen/arch/x86/acpi/cpu_idle.c   | 48 --
 xen/arch/x86/hpet.c|  2 --
 xen/arch/x86/include/asm/hardirq.h |  9 +++---
 xen/include/xen/cpuidle.h  |  2 --
 xen/include/xen/irq_cpustat.h  |  1 -
 5 files changed, 9 insertions(+), 53 deletions(-)

diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 6c3a10e6fb4e..141f0f0facf6 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -436,27 +436,6 @@ static int __init cf_check cpu_idle_key_init(void)
 }
 __initcall(cpu_idle_key_init);
 
-/*
- * The bit is set iff cpu use monitor/mwait to enter C state
- * with this flag set, CPU can be waken up from C state
- * by writing to specific memory address, instead of sending an IPI.
- */
-static cpumask_t cpuidle_mwait_flags;
-
-void cpuidle_wakeup_mwait(cpumask_t *mask)
-{
-cpumask_t target;
-unsigned int cpu;
-
-cpumask_and(&target, mask, &cpuidle_mwait_flags);
-
-/* CPU is MWAITing on the cpuidle_mwait_wakeup flag. */
-for_each_cpu(cpu, &target)
-mwait_wakeup(cpu) = 0;
-
-cpumask_andnot(mask, mask, &target);
-}
-
 /* Force sending of a wakeup IPI regardless of mwait usage. */
 bool __ro_after_init force_mwait_ipi_wakeup;
 
@@ -465,42 +444,25 @@ bool arch_skip_send_event_check(unsigned int cpu)
 if ( force_mwait_ipi_wakeup )
 return false;
 
-/*
- * This relies on softirq_pending() and mwait_wakeup() to access data
- * on the same cache line.
- */
-smp_mb();
-return !!cpumask_test_cpu(cpu, &cpuidle_mwait_flags);
+return false;
 }
 
 void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
 {
 unsigned int cpu = smp_processor_id();
-s_time_t expires = per_cpu(timer_deadline, cpu);
-const void *monitor_addr = &mwait_wakeup(cpu);
+const unsigned int *this_softirq_pending = &softirq_pending(cpu);
 
-monitor(monitor_addr, 0, 0);
+monitor(this_softirq_pending, 0, 0);
 smp_mb();
 
-/*
- * Timer deadline passing is the event on which we will be woken via
- * cpuidle_mwait_wakeup. So check it now that the location is armed.
- */
-if ( (expires > NOW() || expires == 0) && !softirq_pending(cpu) )
+if ( !*this_softirq_pending )
 {
 struct cpu_info *info = get_cpu_info();
 
-cpumask_set_cpu(cpu, &cpuidle_mwait_flags);
-
 spec_ctrl_enter_idle(info);
 mwait(eax, ecx);
 spec_ctrl_exit_idle(info);
-
-cpumask_clear_cpu(cpu, &cpuidle_mwait_flags);
 }
-
-if ( expires <= NOW() && expires > 0 )
-raise_softirq(TIMER_SOFTIRQ);
 }
 
 static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
@@ -901,7 +863,7 @@ void cf_check acpi_dead_idle(void)
 
 if ( cx->entry_method == ACPI_CSTATE_EM_FFH )
 {
-void *mwait_ptr = &mwait_wakeup(smp_processor_id());
+void *mwait_ptr = &softirq_pending(smp_processor_id());
 
 /*
  * Cache must be flushed as the last operation before sleeping.
diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c
index 1bca8c8b670d..192de433cfd1 100644
--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -188,8 +188,6 @@ static void evt_do_broadcast(cpumask_t *mask)
 if ( __cpumask_test_and_clear_cpu(cpu, mask) )
 raise_softirq(TIMER_SOFTIRQ);
 
-cpuidle_wakeup_mwait(mask);
-
 if ( !cpumask_empty(mask) )
cpumask_raise_softirq(mask, TIMER_SOFTIRQ);
 }
diff --git a/xen/arch/x86/include/asm/hardirq.h 
b/xen/arch/x86/include/asm/hardirq.h
index 342361cb6fdd..f3e93cc9b507 100644
--- a/xen/arch/x86/include/asm/hardirq.h
+++ b/xen/arch/x86/include/asm/hardirq.h
@@ -5,11 +5,10 @@
 #include 
 
 typedef struct {
-   unsigned int __softirq_pending;
-   unsigned int __local_irq_count;
-   unsigned int nmi_co

[PATCH v2 5/6] x86/idle: Implement a new MWAIT IPI-elision algorithm

2025-07-04 Thread Andrew Cooper

In order elide IPIs, we must be able to identify whether a target CPU is in
MWAIT at the point it is woken up.  i.e. the store to wake it up must also
identify the state.

Create a new in_mwait variable beside __softirq_pending, so we can use a
CMPXCHG to set the softirq while also observing the status safely.  Implement
an x86 version of arch_pend_softirq() which does this.

In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
precisely what it means.  X86_BUG_MONITOR can be accounted for simply by not
advertising in_mwait.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Anthony PERARD 
CC: Michal Orzel 
CC: Julien Grall 
CC: Stefano Stabellini 

This is modelled after Linux's TIF_NEED_RESCHED (single bit equivelent of all
of __softirq_pending), and TIF_POLLING_NRFLAG (arch-neutral "in_mwait").

In Linux, they're both in the same flags field, which adds complexity.  In
Xen, __softirq_pending is already unsigned long for everything other than x86,
so adding an arch-neutral "in_mwait" would need wider changes.

v2:
 * Fix cmpxchg() expression.
 * Use BUILD_BUG_ON()s to check opencoding of in_mwait.

TODO: We want to introduce try_cmpxchg() which is better at the C and code-gen
level.
---
 xen/arch/x86/acpi/cpu_idle.c   | 20 -
 xen/arch/x86/include/asm/hardirq.h | 14 -
 xen/arch/x86/include/asm/softirq.h | 48 ++
 3 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 4f69df5a7438..c5d7a6c6fe2a 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -443,7 +443,21 @@ __initcall(cpu_idle_key_init);
 void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
 {
 unsigned int cpu = smp_processor_id();
-const unsigned int *this_softirq_pending = &softirq_pending(cpu);
+irq_cpustat_t *stat = &irq_stat[cpu];
+const unsigned int *this_softirq_pending = &stat->__softirq_pending;
+
+/*
+ * By setting in_mwait, we promise to other CPUs that we'll notice changes
+ * to __softirq_pending without being sent an IPI.  We achieve this by
+ * either not going to sleep, or by having hardware notice on our behalf.
+ *
+ * Some errata exist where MONITOR doesn't work properly, and the
+ * workaround is to force the use of an IPI.  Cause this to happen by
+ * simply not advertising ourselves as being in_mwait.
+ */
+alternative_io("movb $1, %[in_mwait]",
+   "", X86_BUG_MONITOR,
+   [in_mwait] "=m" (stat->in_mwait));
 
 monitor(this_softirq_pending, 0, 0);
 
@@ -455,6 +469,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int 
ecx)
 mwait(eax, ecx);
 spec_ctrl_exit_idle(info);
 }
+
+alternative_io("movb $0, %[in_mwait]",
+   "", X86_BUG_MONITOR,
+   [in_mwait] "=m" (stat->in_mwait));
 }
 
 static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
diff --git a/xen/arch/x86/include/asm/hardirq.h 
b/xen/arch/x86/include/asm/hardirq.h
index f3e93cc9b507..1647cff04dc8 100644
--- a/xen/arch/x86/include/asm/hardirq.h
+++ b/xen/arch/x86/include/asm/hardirq.h
@@ -5,7 +5,19 @@
 #include 
 
 typedef struct {
-unsigned int __softirq_pending;
+/*
+ * The layout is important.  Any CPU can set bits in __softirq_pending,
+ * but in_mwait is a status bit owned by the CPU.  softirq_mwait_raw must
+ * cover both, and must be in a single cacheline.
+ */
+union {
+struct {
+unsigned int __softirq_pending;
+bool in_mwait;
+};
+uint64_t softirq_mwait_raw;
+};
+
 unsigned int __local_irq_count;
 unsigned int nmi_count;
 unsigned int mce_count;
diff --git a/xen/arch/x86/include/asm/softirq.h 
b/xen/arch/x86/include/asm/softirq.h
index e4b194f069fb..55b65c9747b1 100644
--- a/xen/arch/x86/include/asm/softirq.h
+++ b/xen/arch/x86/include/asm/softirq.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_SOFTIRQ_H__
 #define __ASM_SOFTIRQ_H__
 
+#include 
+
 #define NMI_SOFTIRQ(NR_COMMON_SOFTIRQS + 0)
 #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
 #define VCPU_KICK_SOFTIRQ  (NR_COMMON_SOFTIRQS + 2)
@@ -9,4 +11,50 @@
 #define HVM_DPCI_SOFTIRQ   (NR_COMMON_SOFTIRQS + 4)
 #define NR_ARCH_SOFTIRQS   5
 
+/*
+ * Ensure softirq @nr is pending on @cpu.  Return true if an IPI can be
+ * skipped, false if the IPI cannot be skipped.
+ *
+ * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
+ * set softirq @nr while also observing in_mwait in a race-free way.
+ */
+static always_inline bool arch_set_softirq(unsigned int nr, unsigned int cpu)
+{
+uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
+uint64_t prev, old, new;
+unsigned int softirq = 1U << nr;
+
+old = ACCESS_ONCE(*ptr);
+
+for ( ;; )
+{
+if ( old & softirq )
+/* Softir

[PATCH v2 6/6] x86/idle: Fix buggy "x86/mwait-idle: enable interrupts before C1 on Xeons"

2025-07-04 Thread Andrew Cooper

The check of this_softirq_pending must be performed with irqs disabled, but
this property was broken by an attempt to optimise entry/exit latency.

Commit c227233ad64c in Linux (which we copied into Xen) was fixed up by
edc8fc01f608 in Linux, which we have so far missed.

Going to sleep without waking on interrupts is nonsensical outside of
play_dead(), so overload this to select between two possible MWAITs, the
second using the STI shadow to cover MWAIT for exactly the same reason as we
do in safe_halt().

Fixes: b17e0ec72ede ("x86/mwait-idle: enable interrupts before C1 on Xeons")
Signed-off-by: Andrew Cooper 
Reviewed-by: Jan Beulich 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Anthony PERARD 
CC: Michal Orzel 
CC: Julien Grall 
CC: Stefano Stabellini 
---
 xen/arch/x86/acpi/cpu_idle.c  | 16 +++-
 xen/arch/x86/cpu/mwait-idle.c |  8 ++--
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index c5d7a6c6fe2a..e50a9234a0d4 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -80,6 +80,13 @@ static always_inline void mwait(unsigned int eax, unsigned 
int ecx)
:: "a" (eax), "c" (ecx) );
 }
 
+static always_inline void sti_mwait_cli(unsigned int eax, unsigned int ecx)
+{
+/* STI shadow covers MWAIT. */
+asm volatile ( "sti; mwait; cli"
+   :: "a" (eax), "c" (ecx) );
+}
+
 #define GET_HW_RES_IN_NS(msr, val) \
 do { rdmsrl(msr, val); val = tsc_ticks2ns(val); } while( 0 )
 #define GET_MC6_RES(val)  GET_HW_RES_IN_NS(0x664, val)
@@ -461,12 +468,19 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int 
ecx)
 
 monitor(this_softirq_pending, 0, 0);
 
+ASSERT(!local_irq_is_enabled());
+
 if ( !*this_softirq_pending )
 {
 struct cpu_info *info = get_cpu_info();
 
 spec_ctrl_enter_idle(info);
-mwait(eax, ecx);
+
+if ( ecx & MWAIT_ECX_INTERRUPT_BREAK )
+mwait(eax, ecx);
+else
+sti_mwait_cli(eax, ecx);
+
 spec_ctrl_exit_idle(info);
 }
 
diff --git a/xen/arch/x86/cpu/mwait-idle.c b/xen/arch/x86/cpu/mwait-idle.c
index 5c16f5ad3a82..5e98011bfd0c 100644
--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -946,12 +946,8 @@ static void cf_check mwait_idle(void)
 
update_last_cx_stat(power, cx, before);
 
-   if (cx->irq_enable_early)
-   local_irq_enable();
-
-   mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
-
-   local_irq_disable();
+   mwait_idle_with_hints(cx->address,
+ cx->irq_enable_early ? 0 : 
MWAIT_ECX_INTERRUPT_BREAK);
 
after = alternative_call(cpuidle_get_tick);
 
-- 
2.39.5

[PATCH v2 0/6] x86/idle: Multiple MWAIT fixes

2025-07-04 Thread Andrew Cooper

The two main bugs were identified in Linux first, and I've modelled Xen's fix
similarly.

Patches 1-4 want committing together.  They do bisect and operate correctly,
but the range takes out an optimisation in order to reimplement it correctly.

https://gitlab.com/xen-project/hardware/xen-staging/-/pipelines/1905583317

Andrew Cooper (6):
  x86/idle: Remove broken MWAIT implementation
  x86/idle: Drop incorrect smp_mb() in mwait_idle_with_hints()
  x86/idle: Convert force_mwait_ipi_wakeup to X86_BUG_MONITOR
  xen/softirq: Rework arch_skip_send_event_check() into arch_set_softirq()
  x86/idle: Implement a new MWAIT IPI-elision algorithm
  x86/idle: Fix buggy "x86/mwait-idle: enable interrupts before C1 on Xeons"

 xen/arch/x86/acpi/cpu_idle.c   | 92 +++---
 xen/arch/x86/cpu/intel.c   |  2 +-
 xen/arch/x86/cpu/mwait-idle.c  |  8 +--
 xen/arch/x86/hpet.c|  2 -
 xen/arch/x86/include/asm/cpufeatures.h |  1 +
 xen/arch/x86/include/asm/hardirq.h | 21 --
 xen/arch/x86/include/asm/mwait.h   |  3 -
 xen/arch/x86/include/asm/softirq.h | 48 +-
 xen/common/softirq.c   |  8 +--
 xen/include/xen/cpuidle.h  |  2 -
 xen/include/xen/irq_cpustat.h  |  1 -
 xen/include/xen/softirq.h  | 16 +
 12 files changed, 124 insertions(+), 80 deletions(-)

-- 
2.39.5

[PATCH v2 2/6] x86/idle: Drop incorrect smp_mb() in mwait_idle_with_hints()

2025-07-04 Thread Andrew Cooper

With the recent simplifications, it becomes obvious that smp_mb() isn't the
right barrier.  Strictly speaking, MONITOR is ordered as a load, but smp_rmb()
isn't correct either, as this only pertains to local ordering.  All we need is
a compiler barrier().

Merge the barier() into the monitor() itself, along with an explantion.

No functional change.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Anthony PERARD 
CC: Michal Orzel 
CC: Julien Grall 
CC: Stefano Stabellini 

v2:
 * Move earlier in the series, not that it matters IMO.
 * Expand the commit message.
---
 xen/arch/x86/acpi/cpu_idle.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 141f0f0facf6..68dd44be5bb0 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -66,8 +66,12 @@ static always_inline void monitor(
 alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR,
   [addr] "a" (addr));
 
+/*
+ * The memory clobber is a compiler barrier.  Subseqeunt reads from the
+ * monitored cacheline must not be reordered over MONITOR.
+ */
 asm volatile ( "monitor"
-   :: "a" (addr), "c" (ecx), "d" (edx) );
+   :: "a" (addr), "c" (ecx), "d" (edx) : "memory" );
 }
 
 static always_inline void mwait(unsigned int eax, unsigned int ecx)
@@ -453,7 +457,6 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int 
ecx)
 const unsigned int *this_softirq_pending = &softirq_pending(cpu);
 
 monitor(this_softirq_pending, 0, 0);
-smp_mb();
 
 if ( !*this_softirq_pending )
 {
-- 
2.39.5

[PATCH v2 4/6] xen/softirq: Rework arch_skip_send_event_check() into arch_set_softirq()

2025-07-04 Thread Andrew Cooper

x86 is the only architecture wanting an optimisation here, but the
test_and_set_bit() is a store into the monitored line (i.e. will wake up the
target) and, prior to the removal of the broken IPI-elision algorithm, was
racy, causing unnecessary IPIs to be sent.

To do this in a race-free way, the store to the monited line needs to also
sample the status of the target in one atomic action.  Implement a new arch
helper with different semantics; to make the softirq pending and decide about
IPIs together.  For now, implement the default helper.  It will be overridden
by x86 in a subsequent change.

No functional change.

Signed-off-by: Andrew Cooper 
Reviewed-by: Roger Pau Monné 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Anthony PERARD 
CC: Michal Orzel 
CC: Julien Grall 
CC: Stefano Stabellini 

v2:
 * Rename new helper to arch_set_softirq()
 * Expand commit message
---
 xen/arch/x86/acpi/cpu_idle.c   |  5 -
 xen/arch/x86/include/asm/softirq.h |  2 --
 xen/common/softirq.c   |  8 ++--
 xen/include/xen/softirq.h  | 16 
 4 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 07056a91a29e..4f69df5a7438 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -440,11 +440,6 @@ static int __init cf_check cpu_idle_key_init(void)
 }
 __initcall(cpu_idle_key_init);
 
-bool arch_skip_send_event_check(unsigned int cpu)
-{
-return false;
-}
-
 void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
 {
 unsigned int cpu = smp_processor_id();
diff --git a/xen/arch/x86/include/asm/softirq.h 
b/xen/arch/x86/include/asm/softirq.h
index 415ee866c79d..e4b194f069fb 100644
--- a/xen/arch/x86/include/asm/softirq.h
+++ b/xen/arch/x86/include/asm/softirq.h
@@ -9,6 +9,4 @@
 #define HVM_DPCI_SOFTIRQ   (NR_COMMON_SOFTIRQS + 4)
 #define NR_ARCH_SOFTIRQS   5
 
-bool arch_skip_send_event_check(unsigned int cpu);
-
 #endif /* __ASM_SOFTIRQ_H__ */
diff --git a/xen/common/softirq.c b/xen/common/softirq.c
index 60f344e8425e..dc3aabce3330 100644
--- a/xen/common/softirq.c
+++ b/xen/common/softirq.c
@@ -94,9 +94,7 @@ void cpumask_raise_softirq(const cpumask_t *mask, unsigned 
int nr)
 raise_mask = &per_cpu(batch_mask, this_cpu);
 
 for_each_cpu(cpu, mask)
-if ( !test_and_set_bit(nr, &softirq_pending(cpu)) &&
- cpu != this_cpu &&
- !arch_skip_send_event_check(cpu) )
+if ( !arch_set_softirq(nr, cpu) && cpu != this_cpu )
 __cpumask_set_cpu(cpu, raise_mask);
 
 if ( raise_mask == &send_mask )
@@ -107,9 +105,7 @@ void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
 {
 unsigned int this_cpu = smp_processor_id();
 
-if ( test_and_set_bit(nr, &softirq_pending(cpu))
- || (cpu == this_cpu)
- || arch_skip_send_event_check(cpu) )
+if ( arch_set_softirq(nr, cpu) || cpu == this_cpu )
 return;
 
 if ( !per_cpu(batching, this_cpu) || in_irq() )
diff --git a/xen/include/xen/softirq.h b/xen/include/xen/softirq.h
index e9f79ec0ce3e..48f17e49efa1 100644
--- a/xen/include/xen/softirq.h
+++ b/xen/include/xen/softirq.h
@@ -23,6 +23,22 @@ enum {
 
 #define NR_SOFTIRQS (NR_COMMON_SOFTIRQS + NR_ARCH_SOFTIRQS)
 
+/*
+ * Ensure softirq @nr is pending on @cpu.  Return true if an IPI can be
+ * skipped, false if the IPI cannot be skipped.
+ */
+#ifndef arch_set_softirq
+static always_inline bool arch_set_softirq(unsigned int nr, unsigned int cpu)
+{
+/*
+ * Try to set the softirq pending.  If we set the bit (i.e. the old bit
+ * was 0), we're responsible to send the IPI.  If the softirq was already
+ * pending (i.e. the old bit was 1), no IPI is needed.
+ */
+return test_and_set_bit(nr, &softirq_pending(cpu));
+}
+#endif
+
 typedef void (*softirq_handler)(void);
 
 void do_softirq(void);
-- 
2.39.5

[PATCH v2 3/6] x86/idle: Convert force_mwait_ipi_wakeup to X86_BUG_MONITOR

2025-07-04 Thread Andrew Cooper

We're going to want alternative-patch based on it.

No functional change.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Anthony PERARD 
CC: Michal Orzel 
CC: Julien Grall 
CC: Stefano Stabellini 
---
 xen/arch/x86/acpi/cpu_idle.c   | 6 --
 xen/arch/x86/cpu/intel.c   | 2 +-
 xen/arch/x86/include/asm/cpufeatures.h | 1 +
 xen/arch/x86/include/asm/mwait.h   | 3 ---
 4 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 68dd44be5bb0..07056a91a29e 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -440,14 +440,8 @@ static int __init cf_check cpu_idle_key_init(void)
 }
 __initcall(cpu_idle_key_init);
 
-/* Force sending of a wakeup IPI regardless of mwait usage. */
-bool __ro_after_init force_mwait_ipi_wakeup;
-
 bool arch_skip_send_event_check(unsigned int cpu)
 {
-if ( force_mwait_ipi_wakeup )
-return false;
-
 return false;
 }
 
diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
index 5215b5405c76..f7bd0d777289 100644
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -436,7 +436,7 @@ static void __init probe_mwait_errata(void)
 {
 printk(XENLOG_WARNING
"Forcing IPI MWAIT wakeup due to CPU erratum\n");
-force_mwait_ipi_wakeup = true;
+setup_force_cpu_cap(X86_BUG_MONITOR);
 }
 }
 
diff --git a/xen/arch/x86/include/asm/cpufeatures.h 
b/xen/arch/x86/include/asm/cpufeatures.h
index 84c93292c80c..56231b00f15d 100644
--- a/xen/arch/x86/include/asm/cpufeatures.h
+++ b/xen/arch/x86/include/asm/cpufeatures.h
@@ -53,6 +53,7 @@ XEN_CPUFEATURE(USE_VMCALL,X86_SYNTH(30)) /* Use 
VMCALL instead of VMMCAL
 #define X86_BUG_CLFLUSH_MFENCEX86_BUG( 2) /* MFENCE needed to serialise 
CLFLUSH */
 #define X86_BUG_IBPB_NO_RET   X86_BUG( 3) /* IBPB doesn't flush the 
RSB/RAS */
 #define X86_BUG_CLFLUSH_MONITOR   X86_BUG( 4) /* MONITOR requires CLFLUSH */
+#define X86_BUG_MONITOR   X86_BUG( 5) /* MONITOR doesn't always notice 
writes (force IPIs) */
 
 #define X86_SPEC_NO_LFENCE_ENTRY_PV X86_BUG(16) /* (No) safety LFENCE for 
SPEC_CTRL_ENTRY_PV. */
 #define X86_SPEC_NO_LFENCE_ENTRY_INTR X86_BUG(17) /* (No) safety LFENCE for 
SPEC_CTRL_ENTRY_INTR. */
diff --git a/xen/arch/x86/include/asm/mwait.h b/xen/arch/x86/include/asm/mwait.h
index c52cd3f51011..000a692f6d19 100644
--- a/xen/arch/x86/include/asm/mwait.h
+++ b/xen/arch/x86/include/asm/mwait.h
@@ -13,9 +13,6 @@
 
 #define MWAIT_ECX_INTERRUPT_BREAK  0x1
 
-/* Force sending of a wakeup IPI regardless of mwait usage. */
-extern bool force_mwait_ipi_wakeup;
-
 void mwait_idle_with_hints(unsigned int eax, unsigned int ecx);
 #ifdef CONFIG_INTEL
 bool mwait_pc10_supported(void);
-- 
2.39.5

[PATCH 5.15.y v3] xen: replace xen_remap() with memremap()

2025-07-04 Thread Teddy Astie

From: Juergen Gross 

[ upstream commit 41925b105e345ebc84cedb64f59d20cb14a62613 ]

xen_remap() is used to establish mappings for frames not under direct
control of the kernel: for Xenstore and console ring pages, and for
grant pages of non-PV guests.

Today xen_remap() is defined to use ioremap() on x86 (doing uncached
mappings), and ioremap_cache() on Arm (doing cached mappings).

Uncached mappings for those use cases are bad for performance, so they
should be avoided if possible. As all use cases of xen_remap() don't
require uncached mappings (the mapped area is always physical RAM),
a mapping using the standard WB cache mode is fine.

As sparse is flagging some of the xen_remap() use cases to be not
appropriate for iomem(), as the result is not annotated with the
__iomem modifier, eliminate xen_remap() completely and replace all
use cases with memremap() specifying the MEMREMAP_WB caching mode.

xen_unmap() can be replaced with memunmap().

Reported-by: kernel test robot 
Signed-off-by: Juergen Gross 
Reviewed-by: Boris Ostrovsky 
Acked-by: Stefano Stabellini 
Link: https://lore.kernel.org/r/20220530082634.6339-1-jgr...@suse.com
Signed-off-by: Juergen Gross 
Signed-off-by: Teddy Astie  [backport to 5.15.y]
---
v3:
- add missing hvc_xen.c change
v2:
- also remove xen_remap/xen_unmap on ARM
---
 arch/x86/include/asm/xen/page.h   | 3 ---
 drivers/tty/hvc/hvc_xen.c | 2 +-
 drivers/xen/grant-table.c | 6 +++---
 drivers/xen/xenbus/xenbus_probe.c | 3 +--
 include/xen/arm/page.h| 3 ---
 5 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 1a162e559753..c183b7f9efef 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -355,9 +355,6 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr);
 void make_lowmem_page_readonly(void *vaddr);
 void make_lowmem_page_readwrite(void *vaddr);
 
-#define xen_remap(cookie, size) ioremap((cookie), (size))
-#define xen_unmap(cookie) iounmap((cookie))
-
 static inline bool xen_arch_need_swiotlb(struct device *dev,
 phys_addr_t phys,
 dma_addr_t dev_addr)
diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c
index 141acc662eba..6a11a4177a16 100644
--- a/drivers/tty/hvc/hvc_xen.c
+++ b/drivers/tty/hvc/hvc_xen.c
@@ -270,7 +270,7 @@ static int xen_hvm_console_init(void)
if (r < 0 || v == 0)
goto err;
gfn = v;
-   info->intf = xen_remap(gfn << XEN_PAGE_SHIFT, XEN_PAGE_SIZE);
+   info->intf = memremap(gfn << XEN_PAGE_SHIFT, XEN_PAGE_SIZE, 
MEMREMAP_WB);
if (info->intf == NULL)
goto err;
info->vtermno = HVC_COOKIE;
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 0a2d24d6ac6f..a10e0741bec5 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -743,7 +743,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr)
if (xen_auto_xlat_grant_frames.count)
return -EINVAL;
 
-   vaddr = xen_remap(addr, XEN_PAGE_SIZE * max_nr_gframes);
+   vaddr = memremap(addr, XEN_PAGE_SIZE * max_nr_gframes, MEMREMAP_WB);
if (vaddr == NULL) {
pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n",
&addr);
@@ -751,7 +751,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr)
}
pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL);
if (!pfn) {
-   xen_unmap(vaddr);
+   memunmap(vaddr);
return -ENOMEM;
}
for (i = 0; i < max_nr_gframes; i++)
@@ -770,7 +770,7 @@ void gnttab_free_auto_xlat_frames(void)
if (!xen_auto_xlat_grant_frames.count)
return;
kfree(xen_auto_xlat_grant_frames.pfn);
-   xen_unmap(xen_auto_xlat_grant_frames.vaddr);
+   memunmap(xen_auto_xlat_grant_frames.vaddr);
 
xen_auto_xlat_grant_frames.pfn = NULL;
xen_auto_xlat_grant_frames.count = 0;
diff --git a/drivers/xen/xenbus/xenbus_probe.c 
b/drivers/xen/xenbus/xenbus_probe.c
index 2068f83556b7..77ca24611293 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -982,8 +982,7 @@ static int __init xenbus_init(void)
 #endif
xen_store_gfn = (unsigned long)v;
xen_store_interface =
-   xen_remap(xen_store_gfn << XEN_PAGE_SHIFT,
- XEN_PAGE_SIZE);
+   memremap(xen_store_gfn << XEN_PAGE_SHIFT, 
XEN_PAGE_SIZE, MEMREMAP_WB);
break;
default:
pr_warn("Xenstore state unknown\n");
diff --git a/include/xen/arm/page.h b/include/xen/arm/page.h
index ac1b65470563..f831cfeca000 100644
--- a/include/xen/arm/page.h
+++ b/include/xen/arm/page.h
@@ -109,9 +109,6 @@ static inline bool set_phys_to_machine(unsigned long pfn, 
unsigned

Re: [PATCH v7 4/7] xen: Make the maximum number of altp2m views configurable for x86

2025-07-04 Thread Petr Beneš

On Wed, Jul 2, 2025 at 4:11 PM Jan Beulich  wrote:
> > --- a/xen/include/xen/sched.h
> > +++ b/xen/include/xen/sched.h
> > @@ -619,6 +619,8 @@ struct domain
> >  unsigned int guest_request_sync  : 1;
> >  } monitor;
> >
> > +unsigned int nr_altp2m;/* Number of altp2m tables. */
>
> We may have had that discussion earlier on, but it has been too long ago now:
> Why is this not an x86-only field (i.e. in struct arch_domain)? Or one
> dependent upon CONFIG_ALTP2M?

altp2m is not architecture specific. Though having it guarded with
CONFIG_ALTP2M is a good idea. Of course, each of its ~40 usages (that
is outside of altp2m.c) would have to be guarded, too.

P.

Re: [XEN PATCH 0/5] address violation of MISRA C Rule 5.5

2025-07-04 Thread Dmytro Prokopchuk1

Pipeline:
https://gitlab.com/xen-project/people/dimaprkp4k/xen/-/pipelines/1907188989

On 7/4/25 23:39, Dmytro Prokopchuk1 wrote:
> This patch series fully eliminates MISRA C Rule 5.5
> violations for ARM64.
>
> The previous thread is here:
> https://lore.kernel.org/xen-devel/48c7830931a98b2bf70ef1509f309b262b9e5792.1745427770.git.victorm.l...@amd.com/
> where that violation was proposed to be deviated.
>
> Dmytro Prokopchuk (5):
>gnttab: address violation of MISRA C Rule 5.5
>iommu: address violation of MISRA C Rule 5.5
>x86/irq: address violation of MISRA C Rule 5.5
>device-tree: address violation of MISRA C Rule 5.5
>xen/bitops: address violation of MISRA C Rule 5.5
>
>   xen/arch/x86/irq.c|  2 +-
>   xen/common/device-tree/domain-build.c |  9 -
>   xen/common/grant_table.c  | 22 +-
>   xen/include/xen/bitops.h  | 24 
>   xen/include/xen/fdt-domain-build.h|  4 ++--
>   xen/include/xen/iommu.h   |  5 ++---
>   xen/include/xen/irq.h |  4 ++--
>   7 files changed, 36 insertions(+), 34 deletions(-)
>

Re: [PATCH 0/3] hvmloader: add new SMBIOS tables (7,8,9,26,27,28)

2025-07-04 Thread Petr Beneš

On Wed, Jul 2, 2025 at 8:47 AM Jan Beulich  wrote:
>
> On 02.07.2025 01:45, Petr Beneš wrote:
> > From: Petr Beneš 
> >
> > Resubmitting patch from Anton Belousov and addressing review comments
> > from Jan: 
> > https://old-list-archives.xen.org/archives/html/xen-devel/2022-01/msg00725.html
>
> In which case shouldn't this submission have a version number, explicitly
> larger than 1?

Fair. I wasn't sure, since I'm the one who's posting the patch now.
What version number should I use next? Should I go with 2 or use
something else?

P.

Re: [PATCH v2 3/6] x86/idle: Convert force_mwait_ipi_wakeup to X86_BUG_MONITOR

2025-07-04 Thread Roger Pau Monné

On Fri, Jul 04, 2025 at 05:34:07PM +0100, Andrew Cooper wrote:
> We're going to want alternative-patch based on it.
> 
> No functional change.
> 
> Signed-off-by: Andrew Cooper 

Acked-by: Roger Pau Monné 

Thanks, Roger.

Re: [PATCH v2 2/6] x86/idle: Drop incorrect smp_mb() in mwait_idle_with_hints()

2025-07-04 Thread Roger Pau Monné

On Fri, Jul 04, 2025 at 05:34:06PM +0100, Andrew Cooper wrote:
> With the recent simplifications, it becomes obvious that smp_mb() isn't the
> right barrier.  Strictly speaking, MONITOR is ordered as a load, but smp_rmb()
> isn't correct either, as this only pertains to local ordering.  All we need is
> a compiler barrier().
> 
> Merge the barier() into the monitor() itself, along with an explantion.
> 
> No functional change.
> 
> Signed-off-by: Andrew Cooper 

Acked-by: Roger Pau Monné 

Thanks, Roger.

Re: [PATCH 3/3] hvmloader: add new SMBIOS tables (7,8,9,26,27,28)

2025-07-04 Thread Petr Beneš

On Wed, Jul 2, 2025 at 9:15 AM Jan Beulich  wrote:
>
> On 02.07.2025 01:45, Petr Beneš wrote:
> > From: Petr Beneš 
>
> This isn't in line with the first S-o-b, nor with the fact that in the cover
> letter you say this was previously submitted (and hence authored?) by Anton.

Can you please point me to the right direction? I have no idea what
tags should I specify here.

P.

[PATCH v7 1/8] vpci/header: Emulate extended capability list for dom0

2025-07-04 Thread Jiqian Chen

Add a new function to emulate extended capability list for dom0,
and call it in init_header(). So that it will be easy to hide a
extended capability whose initialization fails.

As for the extended capability list of domU, just move the logic
into above function and keep hiding it for domU.

Signed-off-by: Jiqian Chen 
---
cc: "Roger Pau Monné" 
---
v6->v7 changes:
* Change word "guest" to "DomU" in vpci_init_ext_capability_list().
* Change parameter of vpci_init_ext_capability_list() to be const.
* Delete check "if ( !header )" in the while loop of 
vpci_init_ext_capability_list().
* Change the loop from while to do while in vpci_init_ext_capability_list().

v5->v6 changes:
* Delete unnecessary parameter "ttl" in vpci_init_ext_capability_list()
  since vpci_add_register() can already detect the overlaps.

v4->v5 changes:
* Add check: if capability list of hardware has a overlap, print warning and 
return 0.

v3->v4 changes:
* Add check "if ( !header )   return 0;" to avoid adding handler for
  device that has no extended capabilities.

v2->v3 changes:
* In vpci_init_ext_capability_list(), when domain is domU, directly return after
  adding a handler(hiding all extended capability for domU).
* In vpci_init_ext_capability_list(), change condition to be "while ( pos >= 
0x100U && ttl-- )"
  instead of "while ( pos && ttl-- )".
* Add new function vpci_hw_write32, and pass it to extended capability handler 
for dom0.

v1->v2 changes:
new patch

Best regards,
Jiqian Chen.
---
 xen/drivers/vpci/header.c | 44 ---
 xen/drivers/vpci/vpci.c   |  6 ++
 xen/include/xen/vpci.h|  2 ++
 3 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index d26cbba08ee1..8ee8052cd4a3 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -836,6 +836,39 @@ static int vpci_init_capability_list(struct pci_dev *pdev)
   PCI_STATUS_RSVDZ_MASK);
 }
 
+static int vpci_init_ext_capability_list(const struct pci_dev *pdev)
+{
+unsigned int pos = PCI_CFG_SPACE_SIZE;
+
+if ( !is_hardware_domain(pdev->domain) )
+/* Extended capabilities read as zero, write ignore for DomU */
+return vpci_add_register(pdev->vpci, vpci_read_val, NULL,
+ pos, 4, (void *)0);
+
+do
+{
+uint32_t header = pci_conf_read32(pdev->sbdf, pos);
+int rc;
+
+rc = vpci_add_register(pdev->vpci, vpci_read_val, vpci_hw_write32,
+   pos, 4, (void *)(uintptr_t)header);
+if ( rc == -EEXIST )
+{
+printk(XENLOG_WARNING
+   "%pd %pp: overlap in extended cap list, offset %#x\n",
+   pdev->domain, &pdev->sbdf, pos);
+return 0;
+}
+
+if ( rc )
+return rc;
+
+pos = PCI_EXT_CAP_NEXT(header);
+} while ( pos >= PCI_CFG_SPACE_SIZE );
+
+return 0;
+}
+
 static int cf_check init_header(struct pci_dev *pdev)
 {
 uint16_t cmd;
@@ -888,14 +921,9 @@ static int cf_check init_header(struct pci_dev *pdev)
 if ( rc )
 return rc;
 
-if ( !is_hwdom )
-{
-/* Extended capabilities read as zero, write ignore */
-rc = vpci_add_register(pdev->vpci, vpci_read_val, NULL, 0x100, 4,
-   (void *)0);
-if ( rc )
-return rc;
-}
+rc = vpci_init_ext_capability_list(pdev);
+if ( rc )
+return rc;
 
 if ( pdev->ignore_bars )
 return 0;
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 09988f04c27c..8474c0e3b995 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -267,6 +267,12 @@ void cf_check vpci_hw_write16(
 pci_conf_write16(pdev->sbdf, reg, val);
 }
 
+void cf_check vpci_hw_write32(
+const struct pci_dev *pdev, unsigned int reg, uint32_t val, void *data)
+{
+pci_conf_write32(pdev->sbdf, reg, val);
+}
+
 int vpci_add_register_mask(struct vpci *vpci, vpci_read_t *read_handler,
vpci_write_t *write_handler, unsigned int offset,
unsigned int size, void *data, uint32_t ro_mask,
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index fc8d5b470b0b..61d16cc8b897 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -80,6 +80,8 @@ void cf_check vpci_hw_write8(
 const struct pci_dev *pdev, unsigned int reg, uint32_t val, void *data);
 void cf_check vpci_hw_write16(
 const struct pci_dev *pdev, unsigned int reg, uint32_t val, void *data);
+void cf_check vpci_hw_write32(
+const struct pci_dev *pdev, unsigned int reg, uint32_t val, void *data);
 
 /*
  * Check for pending vPCI operations on this vcpu. Returns true if the vcpu
-- 
2.34.1

[PATCH v7 3/8] vpci: Hide legacy capability when it fails to initialize

2025-07-04 Thread Jiqian Chen

When vpci fails to initialize a legacy capability of device, it just
returns an error and vPCI gets disabled for the whole device.  That
most likely renders the device unusable, plus possibly causing issues
to Xen itself if guest attempts to program the native MSI or MSI-X
capabilities if present.

So, add new function to hide legacy capability when initialization
fails. And remove the failed legacy capability from the vpci emulated
legacy capability list.

Signed-off-by: Jiqian Chen 
---
cc: "Roger Pau Monné" 
---
v6->v7 changes:
* Change the pointer parameter of vpci_get_register(),
  vpci_get_previous_cap_register() and vpci_capability_hide() to be const.

v5->v6 changes:
* Rename parameter rm to r in vpci_get_register().
* Use for loop to compact the code of vpci_get_previous_cap_register().
* Rename prev_next_r to prev_r in vpci_capability_hide(().
* Add printing when cap init, cleanup and hide fail.

v4->v5 changes:
* Modify vpci_get_register() to delete some unnecessary check, so that
  I don't need to move function vpci_register_cmp().
* Rename vpci_capability_mask() to vpci_capability_hide().

v3->v4 changes:
* Modify the commit message.
* In function vpci_get_previous_cap_register(), add an ASSERT_UNREACHABLE() if 
offset below 0x40.
* Modify vpci_capability_mask() to return error instead of using ASSERT.
* Use vpci_remove_register to remove PCI_CAP_LIST_ID register instead of open 
code.
* Add check "if ( !offset )" in vpci_capability_mask().

v2->v3 changes:
* Separated from the last version patch "vpci: Hide capability when it fails to 
initialize"
* Whole implementation changed because last version is wrong.
  This version adds a new helper function vpci_get_register() and uses it to get
  target handler and previous handler from vpci->handlers, then remove the 
target.

v1->v2 changes:
* Removed the "priorities" of initializing capabilities since it isn't used 
anymore.
* Added new function vpci_capability_mask() and vpci_ext_capability_mask() to
  remove failed capability from list.
* Called vpci_make_msix_hole() in the end of init_msix().

Best regards,
Jiqian Chen.
---
 xen/drivers/vpci/vpci.c | 109 +++-
 1 file changed, 108 insertions(+), 1 deletion(-)

diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index e7e5b64f1be4..a91c3d4a1415 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -83,6 +83,88 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
 
 #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
 
+static struct vpci_register *vpci_get_register(const struct vpci *vpci,
+   unsigned int offset,
+   unsigned int size)
+{
+struct vpci_register *r;
+
+ASSERT(spin_is_locked(&vpci->lock));
+
+list_for_each_entry ( r, &vpci->handlers, node )
+{
+if ( r->offset == offset && r->size == size )
+return r;
+
+if ( offset <= r->offset )
+break;
+}
+
+return NULL;
+}
+
+static struct vpci_register *vpci_get_previous_cap_register(
+const struct vpci *vpci, unsigned int offset)
+{
+uint32_t next;
+struct vpci_register *r;
+
+if ( offset < 0x40 )
+{
+ASSERT_UNREACHABLE();
+return NULL;
+}
+
+for ( r = vpci_get_register(vpci, PCI_CAPABILITY_LIST, 1); r;
+  r = next >= 0x40 ? vpci_get_register(vpci,
+   next + PCI_CAP_LIST_NEXT, 1)
+   : NULL )
+{
+next = (uint32_t)(uintptr_t)r->private;
+ASSERT(next == (uintptr_t)r->private);
+if ( next == offset )
+break;
+}
+
+return r;
+}
+
+static int vpci_capability_hide(const struct pci_dev *pdev, unsigned int cap)
+{
+const unsigned int offset = pci_find_cap_offset(pdev->sbdf, cap);
+struct vpci_register *prev_r, *next_r;
+struct vpci *vpci = pdev->vpci;
+
+if ( !offset )
+{
+ASSERT_UNREACHABLE();
+return 0;
+}
+
+spin_lock(&vpci->lock);
+prev_r = vpci_get_previous_cap_register(vpci, offset);
+next_r = vpci_get_register(vpci, offset + PCI_CAP_LIST_NEXT, 1);
+if ( !prev_r || !next_r )
+{
+spin_unlock(&vpci->lock);
+return -ENODEV;
+}
+
+prev_r->private = next_r->private;
+/*
+ * Not calling vpci_remove_register() here is to avoid redoing
+ * the register search
+ */
+list_del(&next_r->node);
+spin_unlock(&vpci->lock);
+xfree(next_r);
+
+if ( !is_hardware_domain(pdev->domain) )
+return vpci_remove_register(vpci, offset + PCI_CAP_LIST_ID, 1);
+
+return 0;
+}
+
 static int vpci_init_capabilities(struct pci_dev *pdev)
 {
 for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
@@ -103,7 +185,32 @@ static int vpci_init_capabilities(struct pci_dev *pdev)
 
 rc = capability->init(pdev);
 if ( rc )
-return rc;
+{
+const

[PATCH v7 7/8] vpci/msi: Free MSI resources when init_msi() fails

2025-07-04 Thread Jiqian Chen

When init_msi() fails, current logic return fail and free MSI-related
resources in vpci_deassign_device(). But the previous new changes will
hide MSI capability and return success, it can't reach
vpci_deassign_device() to remove resources if hiding success, so those
resources must be removed in cleanup function of MSI.

To do that, implement cleanup function for MSI.

Signed-off-by: Jiqian Chen 
---
cc: "Roger Pau Monné" 
---
v6->v7 changes:
* Change the pointer parameter of cleanup_msi() to be const.
* When vpci_remove_registers() in cleanup_msi() fails, not to return
  directly, instead try to free msi and re-add ctrl handler.
* Pass pdev->vpci into vpci_add_register() instead of pdev->vpci->msi in
  init_msi() since we need that every handler realize that msi is NULL
  when msi is free but handlers are still in there.

v5->v6 changes:
No.

v4->v5 changes:
* Change definition "static void cleanup_msi" to "static int cf_check 
cleanup_msi"
  since cleanup hook is changed to be int.
* Add a read-only register for MSI Control Register in the end of cleanup_msi.

v3->v4 changes:
* Change function name from fini_msi() to cleanup_msi().
* Remove unnecessary comment.
* Change to use XFREE to free vpci->msi.

v2->v3 changes:
* Remove all fail path, and use fini_msi() hook instead.
* Change the method to calculating the size of msi registers.

v1->v2 changes:
* Added a new function fini_msi to free all MSI resources instead of using an 
array
  to record registered registers.

Best regards,
Jiqian Chen.
---
 xen/drivers/vpci/msi.c | 111 ++---
 1 file changed, 94 insertions(+), 17 deletions(-)

diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
index c3eba4e14870..09b91a685df5 100644
--- a/xen/drivers/vpci/msi.c
+++ b/xen/drivers/vpci/msi.c
@@ -25,7 +25,11 @@
 static uint32_t cf_check control_read(
 const struct pci_dev *pdev, unsigned int reg, void *data)
 {
-const struct vpci_msi *msi = data;
+const struct vpci *vpci = data;
+const struct vpci_msi *msi = vpci->msi;
+
+if ( !msi )
+return pci_conf_read16(pdev->sbdf, reg);
 
 return MASK_INSR(fls(pdev->msi_maxvec) - 1, PCI_MSI_FLAGS_QMASK) |
MASK_INSR(fls(msi->vectors) - 1, PCI_MSI_FLAGS_QSIZE) |
@@ -37,12 +41,16 @@ static uint32_t cf_check control_read(
 static void cf_check control_write(
 const struct pci_dev *pdev, unsigned int reg, uint32_t val, void *data)
 {
-struct vpci_msi *msi = data;
+struct vpci *vpci = data;
+struct vpci_msi *msi = vpci->msi;
 unsigned int vectors = min_t(uint8_t,
  1u << MASK_EXTR(val, PCI_MSI_FLAGS_QSIZE),
  pdev->msi_maxvec);
 bool new_enabled = val & PCI_MSI_FLAGS_ENABLE;
 
+if ( !msi )
+return;
+
 /*
  * No change if the enable field and the number of vectors is
  * the same or the device is not enabled, in which case the
@@ -101,7 +109,11 @@ static void update_msi(const struct pci_dev *pdev, struct 
vpci_msi *msi)
 static uint32_t cf_check address_read(
 const struct pci_dev *pdev, unsigned int reg, void *data)
 {
-const struct vpci_msi *msi = data;
+const struct vpci *vpci = data;
+const struct vpci_msi *msi = vpci->msi;
+
+if ( !msi )
+return ~(uint32_t)0;
 
 return msi->address;
 }
@@ -109,7 +121,11 @@ static uint32_t cf_check address_read(
 static void cf_check address_write(
 const struct pci_dev *pdev, unsigned int reg, uint32_t val, void *data)
 {
-struct vpci_msi *msi = data;
+struct vpci *vpci = data;
+struct vpci_msi *msi = vpci->msi;
+
+if ( !msi )
+return;
 
 /* Clear low part. */
 msi->address &= ~0xULL;
@@ -122,7 +138,11 @@ static void cf_check address_write(
 static uint32_t cf_check address_hi_read(
 const struct pci_dev *pdev, unsigned int reg, void *data)
 {
-const struct vpci_msi *msi = data;
+const struct vpci *vpci = data;
+const struct vpci_msi *msi = vpci->msi;
+
+if ( !msi )
+return ~(uint32_t)0;
 
 return msi->address >> 32;
 }
@@ -130,7 +150,11 @@ static uint32_t cf_check address_hi_read(
 static void cf_check address_hi_write(
 const struct pci_dev *pdev, unsigned int reg, uint32_t val, void *data)
 {
-struct vpci_msi *msi = data;
+struct vpci *vpci = data;
+struct vpci_msi *msi = vpci->msi;
+
+if ( !msi )
+return;
 
 /* Clear and update high part. */
 msi->address  = (uint32_t)msi->address;
@@ -143,7 +167,11 @@ static void cf_check address_hi_write(
 static uint32_t cf_check data_read(
 const struct pci_dev *pdev, unsigned int reg, void *data)
 {
-const struct vpci_msi *msi = data;
+const struct vpci *vpci = data;
+const struct vpci_msi *msi = vpci->msi;
+
+if ( !msi )
+return ~(uint32_t)0;
 
 return msi->data;
 }
@@ -151,7 +179,11 @@ static uint32_t cf_check data_read(
 static void cf_check data_write(
 const struct pci_dev *

[PATCH v7 4/8] vpci: Hide extended capability when it fails to initialize

2025-07-04 Thread Jiqian Chen

When vpci fails to initialize a extended capability of device, it
just returns an error and vPCI gets disabled for the whole device.

So, add function to hide extended capability when initialization
fails. And remove the failed extended capability handler from vpci
extended capability list.

Signed-off-by: Jiqian Chen 
---
Comment left over for the situation that "capability in 0x100 can't be removed 
case"
from Jan in last version, need Roger's input.

Jan:
Can we rely on OSes recognizing ID 0 as "just skip"?
Since the size isn't encoded in the header, there might be issues lurking here.
---
cc: "Roger Pau Monné" 
cc: Andrew Cooper 
cc: Anthony PERARD 
cc: Michal Orzel 
cc: Jan Beulich 
cc: Julien Grall 
cc: Stefano Stabellini 
---
v6->v7 changes:
* Change the pointer parameter of vpci_get_previous_ext_cap_register()
  and vpci_ext_capability_hide() to be const.

v5->v6 changes:
* Change to use for loop to compact code of 
vpci_get_previous_ext_cap_register().
* Rename parameter rm to r in vpci_ext_capability_hide().
* Change comment to describ the case that hide capability of position
  0x100U.

v4->v5 changes:
* Modify the hex digits of PCI_EXT_CAP_NEXT_MASK and PCI_EXT_CAP_NEXT to be low 
case.
* Rename vpci_ext_capability_mask to vpci_ext_capability_hide.

v3->v4 changes:
* Change definition of PCI_EXT_CAP_NEXT to be "#define PCI_EXT_CAP_NEXT(header)
  (MASK_EXTR(header, PCI_EXT_CAP_NEXT_MASK) & 0xFFCU)" to avoid redundancy.
* Modify the commit message.
* Change vpci_ext_capability_mask() to return error instead of using ASSERT.
* Set the capability ID part to be zero when we need to hide the capability of 
position 0x100U.
* Add check "if ( !offset )" in vpci_ext_capability_mask().

v2->v3 changes:
* Separated from the last version patch "vpci: Hide capability when it fails to 
initialize".
* Whole implementation changed because last version is wrong.
  This version gets target handler and previous handler from vpci->handlers, 
then remove the target.
* Note: a case in function vpci_ext_capability_mask() needs to be discussed,
  because it may change the offset of next capability when the offset of target
  capability is 0x100U(the first extended capability), my implementation is 
just to
  ignore and let hardware to handle the target capability.

v1->v2 changes:
* Removed the "priorities" of initializing capabilities since it isn't used 
anymore.
* Added new function vpci_capability_mask() and vpci_ext_capability_mask() to
  remove failed capability from list.
* Called vpci_make_msix_hole() in the end of init_msix().

Best regards,
Jiqian Chen.
---
 xen/drivers/vpci/vpci.c| 88 ++
 xen/include/xen/pci_regs.h |  5 ++-
 2 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index a91c3d4a1415..8be4b53533a3 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -165,6 +165,92 @@ static int vpci_capability_hide(const struct pci_dev 
*pdev, unsigned int cap)
 return 0;
 }
 
+static struct vpci_register *vpci_get_previous_ext_cap_register(
+const struct vpci *vpci, unsigned int offset)
+{
+unsigned int pos = PCI_CFG_SPACE_SIZE;
+struct vpci_register *r;
+
+if ( offset <= PCI_CFG_SPACE_SIZE )
+{
+ASSERT_UNREACHABLE();
+return NULL;
+}
+
+for ( r = vpci_get_register(vpci, pos, 4); r;
+  r = pos > PCI_CFG_SPACE_SIZE ? vpci_get_register(vpci, pos, 4)
+   : NULL )
+{
+uint32_t header = (uint32_t)(uintptr_t)r->private;
+
+ASSERT(header == (uintptr_t)r->private);
+
+pos = PCI_EXT_CAP_NEXT(header);
+if ( pos == offset )
+break;
+}
+
+return r;
+}
+
+static int vpci_ext_capability_hide(
+const struct pci_dev *pdev, unsigned int cap)
+{
+const unsigned int offset = pci_find_ext_capability(pdev->sbdf, cap);
+struct vpci_register *r, *prev_r;
+struct vpci *vpci = pdev->vpci;
+uint32_t header, pre_header;
+
+if ( offset < PCI_CFG_SPACE_SIZE )
+{
+ASSERT_UNREACHABLE();
+return 0;
+}
+
+spin_lock(&vpci->lock);
+r = vpci_get_register(vpci, offset, 4);
+if ( !r )
+{
+spin_unlock(&vpci->lock);
+return -ENODEV;
+}
+
+header = (uint32_t)(uintptr_t)r->private;
+if ( offset == PCI_CFG_SPACE_SIZE )
+{
+if ( PCI_EXT_CAP_NEXT(header) <= PCI_CFG_SPACE_SIZE )
+r->private = (void *)(uintptr_t)0;
+else
+/*
+ * The first extended capability (0x100) can not be removed from
+ * the linked list, so instead mask its capability ID to return 0
+ * and force OSes to skip it.
+ */
+r->private = (void *)(uintptr_t)(header & ~PCI_EXT_CAP_ID(header));
+
+spin_unlock(&vpci->lock);
+return 0;
+}
+
+prev_r = vpci_get_previous_ext_cap_register(vpci, offset);
+if ( !prev_r )

[PATCH v7 6/8] vpci/rebar: Free Rebar resources when init_rebar() fails

2025-07-04 Thread Jiqian Chen

When init_rebar() fails, current logic return fail and free Rebar-related
resources in vpci_deassign_device(). But the previous new changes will
hide Rebar capability and return success, it can't reach
vpci_deassign_device() to remove resources if hiding success, so those
resources must be removed in cleanup function of Rebar.

To do that, implement cleanup function for Rebar.

Signed-off-by: Jiqian Chen 
---
cc: "Roger Pau Monné" 
---
v6->v7 changes:
* Change the pointer parameter of cleanup_rebar() to be const.
* Print error when vpci_remove_registers() fail in cleanup_rebar().

v5->v6 changes:
No.

v4->v5 changes:
* Change definition "static void cleanup_rebar" to "static int cf_check 
cleanup_rebar" since cleanup hook is changed to be int.

v3->v4 changes:
* Change function name from fini_rebar() to cleanup_rebar().
* Change the error number to be E2BIG and ENXIO in init_rebar().

v2->v3 changes:
* Use fini_rebar() to remove all register instead of in the failure path of 
init_rebar();

v1->v2 changes:
* Called vpci_remove_registers() to remove all possible registered registers 
instead of using a array to record all registered register.

Best regards,
Jiqian Chen.
---
 xen/drivers/vpci/rebar.c | 41 +---
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/xen/drivers/vpci/rebar.c b/xen/drivers/vpci/rebar.c
index 3c18792d9bcd..39ab9c2eb7d9 100644
--- a/xen/drivers/vpci/rebar.c
+++ b/xen/drivers/vpci/rebar.c
@@ -49,6 +49,32 @@ static void cf_check rebar_ctrl_write(const struct pci_dev 
*pdev,
 bar->guest_addr = bar->addr;
 }
 
+static int cf_check cleanup_rebar(const struct pci_dev *pdev)
+{
+int rc;
+uint32_t ctrl;
+unsigned int nbars;
+unsigned int rebar_offset = pci_find_ext_capability(pdev->sbdf,
+PCI_EXT_CAP_ID_REBAR);
+
+if ( !rebar_offset || !is_hardware_domain(pdev->domain) )
+{
+ASSERT_UNREACHABLE();
+return 0;
+}
+
+ctrl = pci_conf_read32(pdev->sbdf, rebar_offset + PCI_REBAR_CTRL(0));
+nbars = MASK_EXTR(ctrl, PCI_REBAR_CTRL_NBAR_MASK);
+
+rc = vpci_remove_registers(pdev->vpci, rebar_offset + PCI_REBAR_CAP(0),
+   PCI_REBAR_CTRL(nbars - 1));
+if ( rc )
+printk(XENLOG_ERR "%pd %pp: fail to remove Rebar handlers rc=%d\n",
+   pdev->domain, &pdev->sbdf, rc);
+
+return rc;
+}
+
 static int cf_check init_rebar(struct pci_dev *pdev)
 {
 uint32_t ctrl;
@@ -80,7 +106,7 @@ static int cf_check init_rebar(struct pci_dev *pdev)
 {
 printk(XENLOG_ERR "%pd %pp: too big BAR number %u in REBAR_CTRL\n",
pdev->domain, &pdev->sbdf, index);
-continue;
+return -E2BIG;
 }
 
 bar = &pdev->vpci->header.bars[index];
@@ -88,7 +114,7 @@ static int cf_check init_rebar(struct pci_dev *pdev)
 {
 printk(XENLOG_ERR "%pd %pp: BAR%u is not in memory space\n",
pdev->domain, &pdev->sbdf, index);
-continue;
+return -ENXIO;
 }
 
 rc = vpci_add_register(pdev->vpci, vpci_hw_read32, rebar_ctrl_write,
@@ -97,14 +123,7 @@ static int cf_check init_rebar(struct pci_dev *pdev)
 {
 printk(XENLOG_ERR "%pd %pp: BAR%u fail to add reg of REBAR_CTRL 
rc=%d\n",
pdev->domain, &pdev->sbdf, index, rc);
-/*
- * Ideally we would hide the ReBar capability on error, but code
- * for doing so still needs to be written. Use continue instead
- * to keep any already setup register hooks, as returning an
- * error will cause the hardware domain to get unmediated access
- * to all device registers.
- */
-continue;
+return rc;
 }
 
 bar->resizable_sizes =
@@ -118,7 +137,7 @@ static int cf_check init_rebar(struct pci_dev *pdev)
 
 return 0;
 }
-REGISTER_VPCI_EXTCAP(REBAR, init_rebar, NULL);
+REGISTER_VPCI_EXTCAP(REBAR, init_rebar, cleanup_rebar);
 
 /*
  * Local variables:
-- 
2.34.1

[PATCH v7 0/8] Support hiding capability when its initialization fails

2025-07-04 Thread Jiqian Chen

Hi,

This series is to
emulate extended capability list for dom0, including patch #1.
hide legacy and extended capability when its initialization fails, including 
patch #2, #3, #4.
remove all related registers and other resources when initializing capability 
fails, including patch #5, #6, #7, #8.

Best regards,
Jiqian Chen.
---
cc: Andrew Cooper 
cc: Anthony PERARD 
cc: Michal Orzel 
cc: Jan Beulich 
cc: Julien Grall 
cc: "Roger Pau Monné" 
cc: Stefano Stabellini 
---
Jiqian Chen (8):
  vpci/header: Emulate extended capability list for dom0
  vpci: Refactor REGISTER_VPCI_INIT
  vpci: Hide legacy capability when it fails to initialize
  vpci: Hide extended capability when it fails to initialize
  vpci: Refactor vpci_remove_register to remove matched registers
  vpci/rebar: Free Rebar resources when init_rebar() fails
  vpci/msi: Free MSI resources when init_msi() fails
  vpci/msix: Free MSIX resources when init_msix() fails

 tools/tests/vpci/main.c|   4 +-
 xen/arch/arm/xen.lds.S |   3 +-
 xen/arch/ppc/xen.lds.S |   3 +-
 xen/arch/riscv/xen.lds.S   |   3 +-
 xen/arch/x86/xen.lds.S |   2 +-
 xen/drivers/vpci/header.c  |  60 +---
 xen/drivers/vpci/msi.c | 111 ---
 xen/drivers/vpci/msix.c|  63 -
 xen/drivers/vpci/rebar.c   |  41 --
 xen/drivers/vpci/vpci.c| 279 +
 xen/include/xen/pci_regs.h |   5 +-
 xen/include/xen/vpci.h |  38 +++--
 xen/include/xen/xen.lds.h  |   2 +-
 13 files changed, 507 insertions(+), 107 deletions(-)

-- 
2.34.1

[PATCH v7 2/8] vpci: Refactor REGISTER_VPCI_INIT

2025-07-04 Thread Jiqian Chen

Refactor REGISTER_VPCI_INIT to contain more capability specific
information, this will benefit further follow-on changes to hide
capability when initialization fails.

What's more, change the definition of init_header() since it is
not a capability and it is needed for all devices' PCI config space.

After refactor, the "priority" of initializing capabilities isn't
needed anymore, so delete its related codes.

Note:
Call vpci_make_msix_hole() in the end of init_msix() since the change
of sequence of init_header() and init_msix(). And delete the call of
vpci_make_msix_hole() in modify_decoding() since it is not needed.

The cleanup hook is also added in this change, even if it's still
unused. Further changes will make use of it.

Signed-off-by: Jiqian Chen 
---
There is a byte alignment problem in the array __start_vpci_array, which can be 
solved after
"[PATCH] x86: don't have gcc over-align data" is merged.
---
cc: "Roger Pau Monné" 
cc: Andrew Cooper 
cc: Anthony PERARD 
cc: Michal Orzel 
cc: Jan Beulich 
cc: Julien Grall 
cc: Stefano Stabellini 
---
v6->v7 changes:
* Change the pointer parameter of cleanup hook of vpci_capability_t to be const.
  If change parameter of init hook to be const will affect init_msix, and it 
assigns pdev
  to struct vpci_msix, so keep no const to expanding the impact.
* Delete the vpci_make_msix_hole() call in modify_decoding().
* Change __start_vpci_array from vpci_capability_t* array to vpci_capability_t 
array.
* Change the name "finit##_t" to be "name##_entry" and add a "name" parameter 
to macro
  REGISTER_VPCI_CAPABILITY.

v5->v6 changes:
* Rename REGISTER_PCI_CAPABILITY to REGISTER_VPCI_CAPABILITY.
* Move vpci_capability_t entry from ".data.vpci" to ".data.rel.ro.vpci" and
  move the instances of VPCI_ARRAY in the linker scripts before *(.data.rel.ro).
* Change _start/end_vpci_array[] to be const pointer array.

v4->v5 changes:
* Rename REGISTER_VPCI_CAP to REGISTER_PCI_CAPABILITY, rename 
REGISTER_VPCI_LEGACY_CAP to
  REGISTER_VPCI_CAP, rename REGISTER_VPCI_EXTENDED_CAP to REGISTER_VPCI_EXTCAP.
* Change cleanup hook of vpci_capability_t from void to int.

v3->v4 changes
* Delete the useless trailing dot of section ".data.vpci".
* Add description about priority since this patch removes the initializing 
priority of
  capabilities and priority is not needed anymore.
* Change the hook name from fini to cleanup.
* Change the name x and y to be finit and fclean.
* Remove the unnecessary check "!capability->init"

v2->v3 changes:
* This is separated from patch "vpci: Hide capability when it fails to 
initialize" of v2.
* Delete __maybe_unused attribute of "out" in function vpci_assign_devic().
* Rename REGISTER_VPCI_EXTEND_CAP to REGISTER_VPCI_EXTENDED_CAP.

v1->v2 changes:
* Removed the "priorities" of initializing capabilities since it isn't used 
anymore.
* Added new function vpci_capability_mask() and vpci_ext_capability_mask() to 
remove
  failed capability from list.
* Called vpci_make_msix_hole() in the end of init_msix().

Best regards,
Jiqian Chen.
---
 xen/arch/arm/xen.lds.S|  3 +--
 xen/arch/ppc/xen.lds.S|  3 +--
 xen/arch/riscv/xen.lds.S  |  3 +--
 xen/arch/x86/xen.lds.S|  2 +-
 xen/drivers/vpci/header.c | 16 +-
 xen/drivers/vpci/msi.c|  2 +-
 xen/drivers/vpci/msix.c   | 11 +++---
 xen/drivers/vpci/rebar.c  |  2 +-
 xen/drivers/vpci/vpci.c   | 44 ++-
 xen/include/xen/vpci.h| 32 ++--
 xen/include/xen/xen.lds.h |  2 +-
 11 files changed, 71 insertions(+), 49 deletions(-)

diff --git a/xen/arch/arm/xen.lds.S b/xen/arch/arm/xen.lds.S
index 5bfbe1e92c1e..9f30c3a13ed1 100644
--- a/xen/arch/arm/xen.lds.S
+++ b/xen/arch/arm/xen.lds.S
@@ -57,6 +57,7 @@ SECTIONS
 
*(.rodata)
*(.rodata.*)
+   VPCI_ARRAY
*(.data.rel.ro)
*(.data.rel.ro.*)
 
@@ -64,8 +65,6 @@ SECTIONS
__proc_info_start = .;
*(.proc.info)
__proc_info_end = .;
-
-   VPCI_ARRAY
   } :text
 
 #if defined(BUILD_ID)
diff --git a/xen/arch/ppc/xen.lds.S b/xen/arch/ppc/xen.lds.S
index 1366e2819eed..1de0b77fc6b9 100644
--- a/xen/arch/ppc/xen.lds.S
+++ b/xen/arch/ppc/xen.lds.S
@@ -51,11 +51,10 @@ SECTIONS
 
 *(.rodata)
 *(.rodata.*)
+VPCI_ARRAY
 *(.data.rel.ro)
 *(.data.rel.ro.*)
 
-VPCI_ARRAY
-
 . = ALIGN(POINTER_ALIGN);
 } :text
 
diff --git a/xen/arch/riscv/xen.lds.S b/xen/arch/riscv/xen.lds.S
index 8c3c06de01f6..edcadff90bfe 100644
--- a/xen/arch/riscv/xen.lds.S
+++ b/xen/arch/riscv/xen.lds.S
@@ -46,11 +46,10 @@ SECTIONS
 
 *(.rodata)
 *(.rodata.*)
+VPCI_ARRAY
 *(.data.rel.ro)
 *(.data.rel.ro.*)
 
-VPCI_ARRAY
-
 . = ALIGN(POINTER_ALIGN);
 } :text
 
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index 636c7768aa3c..8e9cac75b09e 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -135,6 +135,7 @@ SECTIONS

[PATCH v7 8/8] vpci/msix: Free MSIX resources when init_msix() fails

2025-07-04 Thread Jiqian Chen

When init_msix() fails, current logic return fail and free MSIX-related
resources in vpci_deassign_device(). But the previous new changes will
hide MSIX capability and return success, it can't reach
vpci_deassign_device() to remove resources if hiding success, so those
resources must be removed in cleanup function of MSIX.

To do that, implement cleanup function for MSIX.

Signed-off-by: Jiqian Chen 
---
cc: "Roger Pau Monné" 
---
v6->v7 changes:
* Change the pointer parameter of cleanup_msix() to be const.
* When vpci_remove_registers() in cleanup_msix() fails, not to return
  directly, instead try to free msix and re-add ctrl handler.
* Pass pdev->vpci into vpci_add_register() instead of pdev->vpci->msix in
  init_msix() since we need that every handler realize that msix is NULL
  when msix is freed but handlers are still in there.

v5->v6 changes:
* Change the logic to add dummy handler when !vpci->msix in cleanup_msix().

v4->v5 changes:
* Change definition "static void cleanup_msix" to "static int cf_check 
cleanup_msix"
  since cleanup hook is changed to be int.
* Add a read-only register for MSIX Control Register in the end of 
cleanup_msix().

v3->v4 changes:
* Change function name from fini_msix() to cleanup_msix().
* Change to use XFREE to free vpci->msix.
* In cleanup function, change the sequence of check and remove action according 
to
  init_msix().

v2->v3 changes:
* Remove unnecessary clean operations in fini_msix().

v1->v2 changes:
new patch.

Best regards,
Jiqian Chen.
---
 xen/drivers/vpci/msix.c | 54 ++---
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
index a1692b9d9f6a..114280337f3f 100644
--- a/xen/drivers/vpci/msix.c
+++ b/xen/drivers/vpci/msix.c
@@ -36,7 +36,11 @@
 static uint32_t cf_check control_read(
 const struct pci_dev *pdev, unsigned int reg, void *data)
 {
-const struct vpci_msix *msix = data;
+const struct vpci *vpci = data;
+const struct vpci_msix *msix = vpci->msix;
+
+if ( !msix )
+return pci_conf_read16(pdev->sbdf, reg);
 
 return (msix->max_entries - 1) |
(msix->enabled ? PCI_MSIX_FLAGS_ENABLE : 0) |
@@ -74,12 +78,16 @@ static void update_entry(struct vpci_msix_entry *entry,
 static void cf_check control_write(
 const struct pci_dev *pdev, unsigned int reg, uint32_t val, void *data)
 {
-struct vpci_msix *msix = data;
+struct vpci *vpci = data;
+struct vpci_msix *msix = vpci->msix;
 bool new_masked = val & PCI_MSIX_FLAGS_MASKALL;
 bool new_enabled = val & PCI_MSIX_FLAGS_ENABLE;
 unsigned int i;
 int rc;
 
+if ( !msix )
+return;
+
 if ( new_masked == msix->masked && new_enabled == msix->enabled )
 return;
 
@@ -656,6 +664,44 @@ static int vpci_make_msix_hole(const struct pci_dev *pdev)
 return 0;
 }
 
+static int cf_check cleanup_msix(const struct pci_dev *pdev)
+{
+int rc;
+struct vpci *vpci = pdev->vpci;
+const unsigned int msix_pos = pdev->msix_pos;
+
+if ( !msix_pos )
+return 0;
+
+rc = vpci_remove_registers(vpci, msix_control_reg(msix_pos), 2);
+if ( rc )
+printk(XENLOG_WARNING "%pd %pp: fail to remove MSIX handlers rc=%d\n",
+   pdev->domain, &pdev->sbdf, rc);
+
+if ( vpci->msix )
+{
+for ( unsigned int i = 0; i < ARRAY_SIZE(vpci->msix->table); i++ )
+if ( vpci->msix->table[i] )
+iounmap(vpci->msix->table[i]);
+
+list_del(&vpci->msix->next);
+XFREE(vpci->msix);
+}
+
+/*
+ * The driver may not traverse the capability list and think device
+ * supports MSIX by default. So here let the control register of MSIX
+ * be Read-Only is to ensure MSIX disabled.
+ */
+rc = vpci_add_register(vpci, vpci_hw_read16, NULL,
+   msix_control_reg(msix_pos), 2, NULL);
+if ( rc )
+printk(XENLOG_ERR "%pd %pp: fail to add MSIX ctrl handler rc=%d\n",
+   pdev->domain, &pdev->sbdf, rc);
+
+return rc;
+}
+
 static int cf_check init_msix(struct pci_dev *pdev)
 {
 struct domain *d = pdev->domain;
@@ -677,7 +723,7 @@ static int cf_check init_msix(struct pci_dev *pdev)
 return -ENOMEM;
 
 rc = vpci_add_register(pdev->vpci, control_read, control_write,
-   msix_control_reg(msix_offset), 2, msix);
+   msix_control_reg(msix_offset), 2, pdev->vpci);
 if ( rc )
 {
 xfree(msix);
@@ -710,7 +756,7 @@ static int cf_check init_msix(struct pci_dev *pdev)
 
 return rc;
 }
-REGISTER_VPCI_CAP(MSIX, init_msix, NULL);
+REGISTER_VPCI_CAP(MSIX, init_msix, cleanup_msix);
 
 /*
  * Local variables:
-- 
2.34.1

[PATCH v7 5/8] vpci: Refactor vpci_remove_register to remove matched registers

2025-07-04 Thread Jiqian Chen

vpci_remove_register() only supports removing a register in a time,
but the follow-on changes need to remove all registers within a range.
So, refactor it to support removing all registers in a given region.

And it is no issue to remove a non exist register, so remove the
__must_check prefix.

Signed-off-by: Jiqian Chen 
Reviewed-by: Roger Pau Monné 
---
cc: "Roger Pau Monné" 
cc: Anthony PERARD 
---
v6->v7 changes:
No.

v5->v6 changes:
* Modify commit message.
* Add Roger's Reviewed-by.

v4->v5 changes:
No.

v3->v4 changes:
* Use list_for_each_entry_safe instead of list_for_each_entry.
* Return ERANGE if overlap.

v2->v3 changes:
* Add new check to return error if registers overlap but not inside range.

v1->v2 changes:
new patch

Best regards,
Jiqian Chen.
---
 tools/tests/vpci/main.c |  4 ++--
 xen/drivers/vpci/vpci.c | 38 --
 xen/include/xen/vpci.h  |  4 ++--
 3 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/tools/tests/vpci/main.c b/tools/tests/vpci/main.c
index 33223db3eb77..ca72877d60cd 100644
--- a/tools/tests/vpci/main.c
+++ b/tools/tests/vpci/main.c
@@ -132,10 +132,10 @@ static void vpci_write32_mask(const struct pci_dev *pdev, 
unsigned int reg,
   rsvdz_mask))
 
 #define VPCI_REMOVE_REG(off, size)  \
-assert(!vpci_remove_register(test_pdev.vpci, off, size))
+assert(!vpci_remove_registers(test_pdev.vpci, off, size))
 
 #define VPCI_REMOVE_INVALID_REG(off, size)  \
-assert(vpci_remove_register(test_pdev.vpci, off, size))
+assert(vpci_remove_registers(test_pdev.vpci, off, size))
 
 /* Read a 32b register using all possible sizes. */
 void multiread4_check(unsigned int reg, uint32_t val)
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 8be4b53533a3..71ba30bbbd6b 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -152,7 +152,7 @@ static int vpci_capability_hide(const struct pci_dev *pdev, 
unsigned int cap)
 
 prev_r->private = next_r->private;
 /*
- * Not calling vpci_remove_register() here is to avoid redoing
+ * Not calling vpci_remove_registers() here is to avoid redoing
  * the register search
  */
 list_del(&next_r->node);
@@ -160,7 +160,7 @@ static int vpci_capability_hide(const struct pci_dev *pdev, 
unsigned int cap)
 xfree(next_r);
 
 if ( !is_hardware_domain(pdev->domain) )
-return vpci_remove_register(vpci, offset + PCI_CAP_LIST_ID, 1);
+return vpci_remove_registers(vpci, offset + PCI_CAP_LIST_ID, 1);
 
 return 0;
 }
@@ -553,34 +553,36 @@ int vpci_add_register_mask(struct vpci *vpci, vpci_read_t 
*read_handler,
 return 0;
 }
 
-int vpci_remove_register(struct vpci *vpci, unsigned int offset,
- unsigned int size)
+int vpci_remove_registers(struct vpci *vpci, unsigned int start,
+  unsigned int size)
 {
-const struct vpci_register r = { .offset = offset, .size = size };
-struct vpci_register *rm;
+struct vpci_register *rm, *tmp;
+unsigned int end = start + size;
 
 spin_lock(&vpci->lock);
-list_for_each_entry ( rm, &vpci->handlers, node )
+list_for_each_entry_safe ( rm, tmp, &vpci->handlers, node )
 {
-int cmp = vpci_register_cmp(&r, rm);
-
-/*
- * NB: do not use a switch so that we can use break to
- * get out of the list loop earlier if required.
- */
-if ( !cmp && rm->offset == offset && rm->size == size )
+/* Remove rm if rm is inside the range. */
+if ( rm->offset >= start && rm->offset + rm->size <= end )
 {
 list_del(&rm->node);
-spin_unlock(&vpci->lock);
 xfree(rm);
-return 0;
+continue;
 }
-if ( cmp <= 0 )
+
+/* Return error if registers overlap but not inside. */
+if ( rm->offset + rm->size > start && rm->offset < end )
+{
+spin_unlock(&vpci->lock);
+return -ERANGE;
+}
+
+if ( start < rm->offset )
 break;
 }
 spin_unlock(&vpci->lock);
 
-return -ENOENT;
+return 0;
 }
 
 /* Wrappers for performing reads/writes to the underlying hardware. */
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 61287e5d2e12..91bb407c728c 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -70,8 +70,8 @@ static inline int __must_check vpci_add_register(struct vpci 
*vpci,
   size, data, 0, 0, 0, 0);
 }
 
-int __must_check vpci_remove_register(struct vpci *vpci, unsigned int offset,
-  unsigned int size);
+int vpci_remove_registers(struct vpci *vpci, unsigned int start,
+  unsigned int size);
 
 /* Generic read/write handlers for the PCI config space. */
 uint32_t vpci_read(pci_sbdf_t sbdf, unsigned i

RE: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC sub-cmd

2025-07-04 Thread Penny, Zheng

[Public]

> -Original Message-
> From: Jan Beulich 
> Sent: Tuesday, June 17, 2025 6:08 PM
> To: Penny, Zheng 
> Cc: Huang, Ray ; Anthony PERARD
> ; Juergen Gross ; Andrew
> Cooper ; Orzel, Michal ;
> Julien Grall ; Roger Pau Monné ; Stefano
> Stabellini ; xen-devel@lists.xenproject.org
> Subject: Re: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC sub-
> cmd
>
> On 27.05.2025 10:48, Penny Zheng wrote:
> > --- a/tools/misc/xenpm.c
> > +++ b/tools/misc/xenpm.c
> > @@ -69,6 +69,7 @@ void show_help(void)
> >  " set-max-cstate|'unlimited' 
> > [|'unlimited']\n"
> >  " set the C-State 
> > limitation ( >= 0) and\n"
> >  " optionally the 
> > C-sub-state limitation
> ( >= 0)\n"
> > +" get-cpufreq-cppc  [cpuid]   list cpu cppc parameter 
> > of CPU
>  or all\n"
> >  " set-cpufreq-cppc  [cpuid] [balance|performance|powersave]
> *\n"
> >  " set Hardware P-State 
> > (HWP) parameters\n"
> >  " on CPU  or all if 
> > omitted.\n"
> > @@ -812,33 +813,7 @@ static void print_cpufreq_para(int cpuid, struct
> > xc_get_cpufreq_para *p_cpufreq)
> >
> >  printf("scaling_driver   : %s\n", p_cpufreq->scaling_driver);
> >
> > -if ( hwp )
> > -{
> > -const xc_cppc_para_t *cppc = &p_cpufreq->u.cppc_para;
> > -
> > -printf("cppc variables   :\n");
> > -printf("  hardware limits: lowest [%"PRIu32"] lowest nonlinear
> [%"PRIu32"]\n",
> > -   cppc->lowest, cppc->lowest_nonlinear);
> > -printf(" : nominal [%"PRIu32"] highest 
> > [%"PRIu32"]\n",
> > -   cppc->nominal, cppc->highest);
> > -printf("  configured limits  : min [%"PRIu32"] max [%"PRIu32"] 
> > energy perf
> [%"PRIu32"]\n",
> > -   cppc->minimum, cppc->maximum, cppc->energy_perf);
> > -
> > -if ( cppc->features & XEN_SYSCTL_CPPC_FEAT_ACT_WINDOW )
> > -{
> > -unsigned int activity_window;
> > -const char *units;
> > -
> > -activity_window = calculate_activity_window(cppc, &units);
> > -printf(" : activity_window [%"PRIu32" 
> > %s]\n",
> > -   activity_window, units);
> > -}
> > -
> > -printf(" : desired [%"PRIu32"%s]\n",
> > -   cppc->desired,
> > -   cppc->desired ? "" : " hw autonomous");
> > -}
> > -else
> > +if ( !hwp )
> >  {
> >  if ( p_cpufreq->gov_num )
> >  printf("scaling_avail_gov: %s\n",
>
> I'm not sure it is a good idea to alter what is being output for 
> get-cpufreq-para.
> People may simply miss that output then, without having any indication where 
> it
> went.

Hwp is more like amd-cppc in active mode. It also does not rely on Xen governor 
to do performance tuning, so in previous design, we could borrow governor filed 
to output cppc info
However after introducing amd-cppc passive mode, we have request to output both 
governor and CPPC info. And if continuing expanding get-cpufreq-para to include 
CPPC info, it will make the parent stuct xen_sysctl.u exceed exceed 128 bytes. 
So I'm left to create a new subcmd to specifically deal with CPPC info
I could leave above output for get-cpufreq-para unchanged. Then we will have 
duplicate CPPC info in two commands. Or is it fine to do that?

>
>
> Jan

Re: [PATCH v5 10/18] xen/cpufreq: introduce a new amd cppc driver for cpufreq scaling

2025-07-04 Thread Jan Beulich

On 04.07.2025 09:23, Penny, Zheng wrote:
> [Public]
> 
>> -Original Message-
>> From: Jan Beulich 
>> Sent: Friday, July 4, 2025 2:21 PM
>> To: Penny, Zheng 
>> Cc: Huang, Ray ; Andrew Cooper
>> ; Roger Pau Monné ; xen-
>> de...@lists.xenproject.org
>> Subject: Re: [PATCH v5 10/18] xen/cpufreq: introduce a new amd cppc driver 
>> for
>> cpufreq scaling
>>
>> On 04.07.2025 05:40, Penny, Zheng wrote:
 -Original Message-
 From: Jan Beulich 
 Sent: Wednesday, July 2, 2025 6:48 PM

 On 02.07.2025 11:49, Penny, Zheng wrote:
>> -Original Message-
>> From: Jan Beulich 
>> Sent: Tuesday, June 17, 2025 12:00 AM
>> To: Penny, Zheng 
>>
>> On 27.05.2025 10:48, Penny Zheng wrote:
>>> +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy 
>>> *policy,
>>> +unsigned int target_freq,
>>> +unsigned int relation) {
>>> +unsigned int cpu = policy->cpu;
>>> +const struct amd_cppc_drv_data *data =
>>> +per_cpu(amd_cppc_drv_data,
 cpu);
>>> +uint8_t des_perf;
>>> +int res;
>>> +
>>> +if ( unlikely(!target_freq) )
>>> +return 0;
>>> +
>>> +res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
>>> +if ( res )
>>> +return res;
>>> +
>>> +/*
>>> + * Setting with "lowest_nonlinear_perf" to ensure governoring
>>> + * performance in P-state range.
>>> + */
>>> +amd_cppc_write_request(policy->cpu, data-
>>> caps.lowest_nonlinear_perf,
>>> +   des_perf, data->caps.highest_perf);
>>
>> I fear I don't understand the comment, and hence it remains unclear
>> to me why lowest_nonlinear_perf is being used here.
>
> How about
> ```
> Choose lowest nonlinear performance as the minimum performance level
> at which
 the platform may run.
> Lowest nonlinear performance is the lowest performance level at
> which nonlinear power savings are achieved, Above this threshold,
> lower performance
 levels should be generally more energy efficient than higher performance 
 levels.
> ```

 I finally had to go to the ACPI spec to understand what this is
 about. There looks to be an implication that lowest <=
 lowest_nonlinear, and states in that range would correspond more to
 T-states than to P-states. With that I think I agree with the use
>>>
>>> Yes, It doesn't have definitive conclusion about relation between
>>> lowest and lowest_nonlinear In our internal FW designed spec, it
>>> always shows lowest_nonlinear corresponds to P2
>>>
 of lowest_nonlinear here. The comment, however, could do with moving
 farther away from merely quoting the pretty abstract text in the ACPI
 spec, as such quoting doesn't help in clarifying terminology used,
 when that terminology also isn't explained anywhere else in the code base.
>>>
>>>
>>> How about we add detailed explanations about each terminology in the
>>> beginning declaration , see:
>>> ```
>>> +/*
>>> + * Field highest_perf, nominal_perf, lowest_nonlinear_perf, and
>>> +lowest_perf
>>> + * contain the values read from CPPC capability MSR.
>>> + * Field highest_perf represents highest performance, which is the
>>> +absolute
>>> + * maximum performance an individual processor may reach, assuming
>>> +ideal
>>> + * conditions
>>> + * Field nominal_perf represents maximum sustained performance level
>>> +of the
>>> + * processor, assuming ideal operating conditions.
>>> + * Field lowest_nonlinear_perf represents Lowest Nonlinear
>>> +Performance, which
>>> + * is the lowest performance level at which nonlinear power savings
>>> +are
>>> + * achieved. Above this threshold, lower performance levels should be
>>> + * generally more energy efficient than higher performance levels.
>>
>> Which is still only the vague statement also found in the spec. This says 
>> nothing
>> about what happens below that level, or what the relationship to other 
>> fields is.
>>
>>> + * Field lowest_perf represents the absolute lowest performance level
>>> +of the
>>> + * platform.
>>> + *
>>> + * Field max_perf, min_perf, des_perf store the values for CPPC request 
>>> MSR.
>>> + * Field max_perf conveys the maximum performance level at which the
>>> +platform
>>> + * may run. And it may be set to any performance value in the range
>>> + * [lowest_perf, highest_perf], inclusive.
>>> + * Field min_perf conveys the minimum performance level at which the
>>> +platform
>>> + * may run. And it may be set to any performance value in the range
>>> + * [lowest_perf, highest_perf], inclusive but must be less than or
>>> +equal to
>>> + * max_perf.
>>> + * Field des_perf conveys performance level Xen is requesting. And it
>>> +may be
>>> + * set to any performance value in the range [min_perf, max_perf

RE: [PATCH v5 12/18] xen/cpufreq: get performance policy from governor set via xenpm

2025-07-04 Thread Penny, Zheng

[Public]

> -Original Message-
> From: Jan Beulich 
> Sent: Tuesday, June 17, 2025 12:22 AM
> To: Penny, Zheng 
> Cc: Huang, Ray ; xen-devel@lists.xenproject.org
> Subject: Re: [PATCH v5 12/18] xen/cpufreq: get performance policy from 
> governor
> set via xenpm
>
> On 27.05.2025 10:48, Penny Zheng wrote:
> > --- a/xen/drivers/acpi/pmstat.c
> > +++ b/xen/drivers/acpi/pmstat.c
> > @@ -319,6 +319,14 @@ static int set_cpufreq_gov(struct xen_sysctl_pm_op
> *op)
> >  if (new_policy.governor == NULL)
> >  return -EINVAL;
> >
> > +new_policy.policy = cpufreq_policy_from_governor(new_policy.governor);
> > +if ( new_policy.policy == CPUFREQ_POLICY_UNKNOWN )
> > +{
> > +printk("Failed to get performance policy from %s, and users
> > + shall write epp values to deliver preference towards performance
> > + over efficiency",
>
> This message is excessively long and is lacking a newline (i.e. effectively 
> two:
> one in the middle and one at the end; yet better would be to find less verbose
> wording). What's worse, how would a "user" go about "writing epp values"?
>

Maybe change it to
```
printk("Failed to get performance policy from %s, Try \"xenpm 
set-cpufreq-cppc\"\n", ...);
```

> Jan

Re: [PATCH 4/6] x86/idle: Implement a new MWAIT IPI-elision algorithm

2025-07-04 Thread Roger Pau Monné

On Wed, Jul 02, 2025 at 03:41:19PM +0100, Andrew Cooper wrote:
> diff --git a/xen/arch/x86/include/asm/softirq.h 
> b/xen/arch/x86/include/asm/softirq.h
> index e4b194f069fb..069e5716a68d 100644
> --- a/xen/arch/x86/include/asm/softirq.h
> +++ b/xen/arch/x86/include/asm/softirq.h
> @@ -1,6 +1,8 @@
>  #ifndef __ASM_SOFTIRQ_H__
>  #define __ASM_SOFTIRQ_H__
>  
> +#include 
> +
>  #define NMI_SOFTIRQ(NR_COMMON_SOFTIRQS + 0)
>  #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
>  #define VCPU_KICK_SOFTIRQ  (NR_COMMON_SOFTIRQS + 2)
> @@ -9,4 +11,36 @@
>  #define HVM_DPCI_SOFTIRQ   (NR_COMMON_SOFTIRQS + 4)
>  #define NR_ARCH_SOFTIRQS   5
>  
> +/*
> + * Ensure softirq @nr is pending on @cpu.  Return true if an IPI can be
> + * skipped, false if the IPI cannot be skipped.
> + *
> + * We use a CMPXCHG covering both __softirq_pending and in_mwait, in order to
> + * set softirq @nr while also observing in_mwait in a race-free way.
> + */
> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int 
> cpu)
> +{
> +uint64_t *ptr = &irq_stat[cpu].softirq_mwait_raw;
> +uint64_t old, new;
> +unsigned int softirq = 1U << nr;
> +
> +old = ACCESS_ONCE(*ptr);
> +
> +do {
> +if ( old & softirq )
> +/* Softirq already pending, nothing to do. */
> +return true;
> +
> +new = old | softirq;
> +
> +} while ( (old = cmpxchg(ptr, old, new)) != new );
> +
> +/*
> + * We have caused the softirq to become pending.  If in_mwait was set, 
> the
> + * target CPU will notice the modification and act on it.
> + */
> +return new & (1UL << 32);

Maybe I haven't got enough coffee yet, but if we do the cmpxchg()
won't it be enough to send the IPI when softirq is first set, but not
necessarily for each extra softirq that's set if there's already one
enabled?  IOW:

uint64_t new, softirq = 1U << nr;
uint64_t old = ACCESS_ONCE(*ptr);

do {
if ( old & softirq )
/* Softirq already pending, nothing to do. */
return true;

new = old | softirq;

} while ( (old = cmpxchg(ptr, old, new)) != (new & ~softirq) );

/*
 * We have caused the softirq to become pending when it was
 * previously unset.  If in_mwait was set, the target CPU will
 * notice the modification and act on it.
 *
 * Reduce the logic to simply check whether the old value was
 * different than 0: it will either be the in_mwait field or any
 * already pending softirqs.
 */
return old;

Thanks, Roger.

[PATCH 0/2] xen/arm: Creating base for future PDX changes

2025-07-04 Thread Michal Orzel

Two small patches laying a base for future PDX changes. See discussion here:
https://lore.kernel.org/xen-devel/alpine.DEB.2.22.394.2507031103360.605088@ubuntu-linux-20-04-desktop/T/#m157f07f957bdad3d807ff53d05e75688a8ae6837

Michal Orzel (2):
  xen/arm64: Panic if direct map is too small
  xen/arm: Skip loops in init_pdx() when no PDX compression is used

 xen/arch/arm/arm32/mmu/mm.c |  1 -
 xen/arch/arm/arm64/mmu/mm.c |  4 +++-
 xen/arch/arm/setup.c| 10 --
 3 files changed, 11 insertions(+), 4 deletions(-)

-- 
2.25.1

[PATCH 1/2] xen/arm64: Panic if direct map is too small

2025-07-04 Thread Michal Orzel

Harden the code by panicing if direct map is too small for current memory
layout taking into account possible PDX compression. Otherwise the assert
is observed:
Assertion '(mfn_to_pdx(maddr_to_mfn(ma)) - directmap_base_pdx) < 
(DIRECTMAP_SIZE >> PAGE_SHIFT)' failed at ./arch/arm/include/asm/mmu/mm.h:72

At the moment, we don't set max_pdx denoting maximum usable PDX which
should be based on max_page. Consolidate setting of max_page and max_pdx
in init_pdx() for both arm32 and arm64. max_pdx will be used in the
future to set up frametable mappings respecting the PDX grouping.

Signed-off-by: Michal Orzel 
---
A similar check for frametable will be introduced with other changes to
frametable setting in the future.
---
 xen/arch/arm/arm32/mmu/mm.c | 1 -
 xen/arch/arm/arm64/mmu/mm.c | 4 +++-
 xen/arch/arm/setup.c| 6 +-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/xen/arch/arm/arm32/mmu/mm.c b/xen/arch/arm/arm32/mmu/mm.c
index 4d22f35618aa..e6d9b49acd3c 100644
--- a/xen/arch/arm/arm32/mmu/mm.c
+++ b/xen/arch/arm/arm32/mmu/mm.c
@@ -190,7 +190,6 @@ void __init setup_mm(void)
 
 /* Frame table covers all of RAM region, including holes */
 setup_frametable_mappings(ram_start, ram_end);
-max_page = PFN_DOWN(ram_end);
 
 /*
  * The allocators may need to use map_domain_page() (such as for
diff --git a/xen/arch/arm/arm64/mmu/mm.c b/xen/arch/arm/arm64/mmu/mm.c
index a0a2dd8cc762..3e64be6ae664 100644
--- a/xen/arch/arm/arm64/mmu/mm.c
+++ b/xen/arch/arm/arm64/mmu/mm.c
@@ -224,6 +224,9 @@ static void __init setup_directmap_mappings(unsigned long 
base_mfn,
  */
 directmap_virt_start = DIRECTMAP_VIRT_START +
 (base_mfn - mfn_gb) * PAGE_SIZE;
+
+if ( (max_pdx - directmap_base_pdx) > (DIRECTMAP_SIZE >> PAGE_SHIFT) )
+panic("Direct map is too small\n");
 }
 
 if ( base_mfn < mfn_x(directmap_mfn_start) )
@@ -278,7 +281,6 @@ void __init setup_mm(void)
 directmap_mfn_end = maddr_to_mfn(ram_end);
 
 setup_frametable_mappings(ram_start, ram_end);
-max_page = PFN_DOWN(ram_end);
 
 init_staticmem_pages();
 init_sharedmem_pages();
diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
index 58acc2d0d4b8..93b730ffb5fb 100644
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -254,7 +254,7 @@ static void __init relocate_fdt(const void **dtb_vaddr, 
size_t dtb_size)
 void __init init_pdx(void)
 {
 const struct membanks *mem = bootinfo_get_mem();
-paddr_t bank_start, bank_size, bank_end;
+paddr_t bank_start, bank_size, bank_end, ram_end = 0;
 
 /*
  * Arm does not have any restrictions on the bits to compress. Pass 0 to
@@ -290,10 +290,14 @@ void __init init_pdx(void)
 bank_start = mem->bank[bank].start;
 bank_size = mem->bank[bank].size;
 bank_end = bank_start + bank_size;
+ram_end = max(ram_end, bank_end);
 
 set_pdx_range(paddr_to_pfn(bank_start),
   paddr_to_pfn(bank_end));
 }
+
+max_page = PFN_DOWN(ram_end);
+max_pdx = pfn_to_pdx(max_page - 1) + 1;
 }
 
 size_t __read_mostly dcache_line_bytes;
-- 
2.25.1

[PATCH 2/2] xen/arm: Skip loops in init_pdx() when no PDX compression is used

2025-07-04 Thread Michal Orzel

When CONFIG_PDX_COMPRESSION=n, pdx_init_mask(), pdx_region_mask() and
pfn_pdx_hole_setup() are just stubs doing nothing. It does not make
sense to keep the two loops iterating over all the memory banks.

Signed-off-by: Michal Orzel 
---
 xen/arch/arm/setup.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
index 93b730ffb5fb..12b76a0a9837 100644
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -255,7 +255,9 @@ void __init init_pdx(void)
 {
 const struct membanks *mem = bootinfo_get_mem();
 paddr_t bank_start, bank_size, bank_end, ram_end = 0;
+int bank;
 
+#ifdef CONFIG_PDX_COMPRESSION
 /*
  * Arm does not have any restrictions on the bits to compress. Pass 0 to
  * let the common code further restrict the mask.
@@ -264,7 +266,6 @@ void __init init_pdx(void)
  * update this function too.
  */
 uint64_t mask = pdx_init_mask(0x0);
-int bank;
 
 for ( bank = 0 ; bank < mem->nr_banks; bank++ )
 {
@@ -284,6 +285,7 @@ void __init init_pdx(void)
 }
 
 pfn_pdx_hole_setup(mask >> PAGE_SHIFT);
+#endif
 
 for ( bank = 0 ; bank < mem->nr_banks; bank++ )
 {
-- 
2.25.1

Re: [PATCH 3/6] xen/softirq: Rework arch_skip_send_event_check() into arch_pend_softirq()

2025-07-04 Thread Roger Pau Monné

On Fri, Jul 04, 2025 at 09:23:29AM +0200, Jan Beulich wrote:
> On 03.07.2025 18:21, Roger Pau Monné wrote:
> > On Wed, Jul 02, 2025 at 03:41:18PM +0100, Andrew Cooper wrote:
> >> --- a/xen/include/xen/softirq.h
> >> +++ b/xen/include/xen/softirq.h
> >> @@ -23,6 +23,22 @@ enum {
> >>  
> >>  #define NR_SOFTIRQS (NR_COMMON_SOFTIRQS + NR_ARCH_SOFTIRQS)
> >>  
> >> +/*
> >> + * Ensure softirq @nr is pending on @cpu.  Return true if an IPI can be
> >> + * skipped, false if the IPI cannot be skipped.
> >> + */
> >> +#ifndef arch_pend_softirq
> >> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int 
> >> cpu)
> > 
> > Nit: I would maybe it arch_set_softirq(), I find `pend` not that clear
> > (I would rather fully spell `pending` instead).
> 
> I, too, did wonder about the naming here. But using "pending" as you suggest
> has the effect of giving the function a name we would normally associate with
> a predicate ("Is it pending?"), whereas here the function is used to _mark_ a
> softirq as pending. Hence in the end I didn't comment at all; I'd be fine
> with "set", but I'm also okay with "pend".

It's a set and check kind of function, so I don't care much.  Just
found the pend a bit too short, I don't think we usually abbreviate
pending to pend.  In fact we use pend in more than one variable name
to store the end of a physical memory region.

Thanks, Roger.

Re: [PATCH 3/6] xen/softirq: Rework arch_skip_send_event_check() into arch_pend_softirq()

2025-07-04 Thread Jan Beulich

On 03.07.2025 18:21, Roger Pau Monné wrote:
> On Wed, Jul 02, 2025 at 03:41:18PM +0100, Andrew Cooper wrote:
>> --- a/xen/include/xen/softirq.h
>> +++ b/xen/include/xen/softirq.h
>> @@ -23,6 +23,22 @@ enum {
>>  
>>  #define NR_SOFTIRQS (NR_COMMON_SOFTIRQS + NR_ARCH_SOFTIRQS)
>>  
>> +/*
>> + * Ensure softirq @nr is pending on @cpu.  Return true if an IPI can be
>> + * skipped, false if the IPI cannot be skipped.
>> + */
>> +#ifndef arch_pend_softirq
>> +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int 
>> cpu)
> 
> Nit: I would maybe it arch_set_softirq(), I find `pend` not that clear
> (I would rather fully spell `pending` instead).

I, too, did wonder about the naming here. But using "pending" as you suggest
has the effect of giving the function a name we would normally associate with
a predicate ("Is it pending?"), whereas here the function is used to _mark_ a
softirq as pending. Hence in the end I didn't comment at all; I'd be fine
with "set", but I'm also okay with "pend".

Jan

Re: [PATCH 4/6] x86/idle: Implement a new MWAIT IPI-elision algorithm

2025-07-04 Thread Roger Pau Monné

On Thu, Jul 03, 2025 at 06:48:23PM +0100, Andrew Cooper wrote:
> On 03/07/2025 5:36 pm, Roger Pau Monné wrote:
> > On Wed, Jul 02, 2025 at 03:41:19PM +0100, Andrew Cooper wrote:
> >> In order elide IPIs, we must be able to identify whether a target CPU is in
> >> MWAIT at the point it is woken up.  i.e. the store to wake it up must also
> >> identify the state.
> >>
> >> Create a new in_mwait variable beside __softirq_pending, so we can use a
> >> CMPXCHG to set the softirq while also observing the status safely.  
> >> Implement
> >> an x86 version of arch_pend_softirq() which does this.
> >>
> >> In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
> >> precisely what it means.  X86_BUG_MONITOR can be accounted for simply by 
> >> not
> >> advertising in_mwait.
> >>
> >> Signed-off-by: Andrew Cooper 
> >> ---
> >> CC: Jan Beulich 
> >> CC: Roger Pau Monné 
> >> CC: Anthony PERARD 
> >> CC: Michal Orzel 
> >> CC: Julien Grall 
> >> CC: Stefano Stabellini 
> >>
> >> This is modelled after Linux's TIF_NEED_RESCHED (single bit equivelent of 
> >> all
> >> of __softirq_pending), and TIF_POLLING_NRFLAG (arch-neutral "in_mwait").
> >>
> >> In Linux, they're both in the same flags field, which adds complexity.  In
> >> Xen, __softirq_pending is already unsigned long for everything other than 
> >> x86,
> >> so adding an arch-neutral "in_mwait" would need wider changes.
> >> ---
> >>  xen/arch/x86/acpi/cpu_idle.c   | 20 +-
> >>  xen/arch/x86/include/asm/hardirq.h | 14 +++-
> >>  xen/arch/x86/include/asm/softirq.h | 34 ++
> >>  3 files changed, 66 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
> >> index caa0fef0b3b1..13040ef467a0 100644
> >> --- a/xen/arch/x86/acpi/cpu_idle.c
> >> +++ b/xen/arch/x86/acpi/cpu_idle.c
> >> @@ -439,7 +439,21 @@ __initcall(cpu_idle_key_init);
> >>  void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
> >>  {
> >>  unsigned int cpu = smp_processor_id();
> >> -const unsigned int *this_softirq_pending = &softirq_pending(cpu);
> >> +irq_cpustat_t *stat = &irq_stat[cpu];
> >> +const unsigned int *this_softirq_pending = &stat->__softirq_pending;
> >> +
> >> +/*
> >> + * By setting in_mwait, we promise to other CPUs that we'll notice 
> >> changes
> >> + * to __softirq_pending without being sent an IPI.  We achieve this by
> >> + * either not going to sleep, or by having hardware notice on our 
> >> behalf.
> >> + *
> >> + * Some errata exist where MONITOR doesn't work properly, and the
> >> + * workaround is to force the use of an IPI.  Cause this to happen by
> >> + * simply not advertising outselves as being in_mwait.
> >> + */
> >> +alternative_io("movb $1, %[in_mwait]",
> >> +   "", X86_BUG_MONITOR,
> >> +   [in_mwait] "=m" (stat->in_mwait));
> >>  
> >>  monitor(this_softirq_pending, 0, 0);
> >>  smp_mb();
> >> @@ -452,6 +466,10 @@ void mwait_idle_with_hints(unsigned int eax, unsigned 
> >> int ecx)
> >>  mwait(eax, ecx);
> >>  spec_ctrl_exit_idle(info);
> >>  }
> >> +
> >> +alternative_io("movb $0, %[in_mwait]",
> >> +   "", X86_BUG_MONITOR,
> >> +   [in_mwait] "=m" (stat->in_mwait));
> > Isn't it a bit overkill to use alternatives here when you could have a
> > conditional based on a global variable to decide whether to set/clear
> > in_mwait?
> 
> I view it differently.  Why should the common case suffer overhead
> (extra memory read and conditional branch) to work around 3 buggy pieces
> of hardware in a common path?

TBH I don't think the extra read and conditional would make any
difference in this specific path performance wise.  Either the CPU is
going to sleep and has nothing to do, or the cost of getting back from
idle will shadow the overhead of an extra read and conditional.

It's all a question of taste I guess.  If it was me I would set/clear
stat->in_mwait unconditionally in mwait_idle_with_hints(), and then in
arch_pend_softirq() I would return:

return new & (1UL << 32) || force_mwait_ipi_wakeup;

Or AFAICT you could possibly skip the cmpxchg() in the
force_mwait_ipi_wakeup case in arch_pend_softirq(), and just do:

if ( force_mwait_ipi_wakeup )
return test_and_set_bit(nr, &softirq_pending(cpu));

Because in that case Xen doesn't care about the in_mwait status.  It
would be an optimization for the buggy hardware that already has to
deal with the extra cost of always sending an IPI.

> >> +};
> >> +uint64_t softirq_mwait_raw;
> >> +};
> > This could be a named union type ...
> >
> >> +
> >>  unsigned int __local_irq_count;
> >>  unsigned int nmi_count;
> >>  unsigned int mce_count;
> >> diff --git a/xen/arch/x86/include/asm/softirq.h 
> >> b/xen/arch/x86/include/asm/softirq.h
> >> index e4b194f069fb..069e5716a68d 100644
> >> --- a/xen/arch

RE: [PATCH v5 10/18] xen/cpufreq: introduce a new amd cppc driver for cpufreq scaling

2025-07-04 Thread Penny, Zheng

[Public]

> -Original Message-
> From: Jan Beulich 
> Sent: Friday, July 4, 2025 2:21 PM
> To: Penny, Zheng 
> Cc: Huang, Ray ; Andrew Cooper
> ; Roger Pau Monné ; xen-
> de...@lists.xenproject.org
> Subject: Re: [PATCH v5 10/18] xen/cpufreq: introduce a new amd cppc driver for
> cpufreq scaling
>
> On 04.07.2025 05:40, Penny, Zheng wrote:
> >> -Original Message-
> >> From: Jan Beulich 
> >> Sent: Wednesday, July 2, 2025 6:48 PM
> >>
> >> On 02.07.2025 11:49, Penny, Zheng wrote:
>  -Original Message-
>  From: Jan Beulich 
>  Sent: Tuesday, June 17, 2025 12:00 AM
>  To: Penny, Zheng 
> 
>  On 27.05.2025 10:48, Penny Zheng wrote:
> > +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy 
> > *policy,
> > +unsigned int target_freq,
> > +unsigned int relation) {
> > +unsigned int cpu = policy->cpu;
> > +const struct amd_cppc_drv_data *data =
> > +per_cpu(amd_cppc_drv_data,
> >> cpu);
> > +uint8_t des_perf;
> > +int res;
> > +
> > +if ( unlikely(!target_freq) )
> > +return 0;
> > +
> > +res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
> > +if ( res )
> > +return res;
> > +
> > +/*
> > + * Setting with "lowest_nonlinear_perf" to ensure governoring
> > + * performance in P-state range.
> > + */
> > +amd_cppc_write_request(policy->cpu, data-
> >caps.lowest_nonlinear_perf,
> > +   des_perf, data->caps.highest_perf);
> 
>  I fear I don't understand the comment, and hence it remains unclear
>  to me why lowest_nonlinear_perf is being used here.
> >>>
> >>> How about
> >>> ```
> >>> Choose lowest nonlinear performance as the minimum performance level
> >>> at which
> >> the platform may run.
> >>> Lowest nonlinear performance is the lowest performance level at
> >>> which nonlinear power savings are achieved, Above this threshold,
> >>> lower performance
> >> levels should be generally more energy efficient than higher performance 
> >> levels.
> >>> ```
> >>
> >> I finally had to go to the ACPI spec to understand what this is
> >> about. There looks to be an implication that lowest <=
> >> lowest_nonlinear, and states in that range would correspond more to
> >> T-states than to P-states. With that I think I agree with the use
> >
> > Yes, It doesn't have definitive conclusion about relation between
> > lowest and lowest_nonlinear In our internal FW designed spec, it
> > always shows lowest_nonlinear corresponds to P2
> >
> >> of lowest_nonlinear here. The comment, however, could do with moving
> >> farther away from merely quoting the pretty abstract text in the ACPI
> >> spec, as such quoting doesn't help in clarifying terminology used,
> >> when that terminology also isn't explained anywhere else in the code base.
> >
> >
> > How about we add detailed explanations about each terminology in the
> > beginning declaration , see:
> > ```
> > +/*
> > + * Field highest_perf, nominal_perf, lowest_nonlinear_perf, and
> > +lowest_perf
> > + * contain the values read from CPPC capability MSR.
> > + * Field highest_perf represents highest performance, which is the
> > +absolute
> > + * maximum performance an individual processor may reach, assuming
> > +ideal
> > + * conditions
> > + * Field nominal_perf represents maximum sustained performance level
> > +of the
> > + * processor, assuming ideal operating conditions.
> > + * Field lowest_nonlinear_perf represents Lowest Nonlinear
> > +Performance, which
> > + * is the lowest performance level at which nonlinear power savings
> > +are
> > + * achieved. Above this threshold, lower performance levels should be
> > + * generally more energy efficient than higher performance levels.
>
> Which is still only the vague statement also found in the spec. This says 
> nothing
> about what happens below that level, or what the relationship to other fields 
> is.
>
> > + * Field lowest_perf represents the absolute lowest performance level
> > +of the
> > + * platform.
> > + *
> > + * Field max_perf, min_perf, des_perf store the values for CPPC request 
> > MSR.
> > + * Field max_perf conveys the maximum performance level at which the
> > +platform
> > + * may run. And it may be set to any performance value in the range
> > + * [lowest_perf, highest_perf], inclusive.
> > + * Field min_perf conveys the minimum performance level at which the
> > +platform
> > + * may run. And it may be set to any performance value in the range
> > + * [lowest_perf, highest_perf], inclusive but must be less than or
> > +equal to
> > + * max_perf.
> > + * Field des_perf conveys performance level Xen is requesting. And it
> > +may be
> > + * set to any performance value in the range [min_perf, max_perf], 
> > inclusive.
> > + */
> > +struct amd_cppc_drv_data
> > +{
>

Re: [PATCH v6 37/39] accel: Rename 'system/accel-ops.h' -> 'accel/accel-cpu-ops.h'

2025-07-04 Thread Zhao Liu

On Thu, Jul 03, 2025 at 07:32:43PM +0200, Philippe Mathieu-Daudé wrote:
> Date: Thu,  3 Jul 2025 19:32:43 +0200
> From: Philippe Mathieu-Daudé 
> Subject: [PATCH v6 37/39] accel: Rename 'system/accel-ops.h' ->
>  'accel/accel-cpu-ops.h'
> X-Mailer: git-send-email 2.49.0
> 
> Unfortunately "system/accel-ops.h" handlers are not only
> system-specific. For example, the cpu_reset_hold() hook
> is part of the vCPU creation, after it is realized.
> 
> Mechanical rename to drop 'system' using:
> 
>   $ sed -i -e s_system/accel-ops.h_accel/accel-cpu-ops.h_g \
>   $(git grep -l system/accel-ops.h)
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  include/{system/accel-ops.h => accel/accel-cpu-ops.h} | 8 
>  accel/accel-common.c  | 2 +-
>  accel/accel-system.c  | 2 +-
>  accel/hvf/hvf-accel-ops.c | 2 +-
>  accel/kvm/kvm-accel-ops.c | 2 +-
>  accel/qtest/qtest.c   | 2 +-
>  accel/tcg/tcg-accel-ops.c | 2 +-
>  accel/xen/xen-all.c   | 2 +-
>  cpu-target.c  | 2 +-
>  gdbstub/system.c  | 2 +-
>  system/cpus.c | 2 +-
>  target/i386/nvmm/nvmm-accel-ops.c | 2 +-
>  target/i386/whpx/whpx-accel-ops.c | 2 +-
>  13 files changed, 16 insertions(+), 16 deletions(-)
>  rename include/{system/accel-ops.h => accel/accel-cpu-ops.h} (96%)

...

> -#ifndef ACCEL_OPS_H
> -#define ACCEL_OPS_H
> +#ifndef ACCEL_CPU_OPS_H
> +#define ACCEL_CPU_OPS_H

Daniel mentioned "QEMU_" prefix is "best practice":

https://lore.kernel.org/qemu-devel/aadsmexeay45n...@redhat.com/

But I also think there's no need to change anything here for now. If
you agree, we can move in this direction in the future. So

Reviewed-by: Zhao Liu

Re: [PATCH 3/6] xen/softirq: Rework arch_skip_send_event_check() into arch_pend_softirq()

2025-07-04 Thread Jan Beulich

On 04.07.2025 09:55, Roger Pau Monné wrote:
> On Fri, Jul 04, 2025 at 09:23:29AM +0200, Jan Beulich wrote:
>> On 03.07.2025 18:21, Roger Pau Monné wrote:
>>> On Wed, Jul 02, 2025 at 03:41:18PM +0100, Andrew Cooper wrote:
 --- a/xen/include/xen/softirq.h
 +++ b/xen/include/xen/softirq.h
 @@ -23,6 +23,22 @@ enum {
  
  #define NR_SOFTIRQS (NR_COMMON_SOFTIRQS + NR_ARCH_SOFTIRQS)
  
 +/*
 + * Ensure softirq @nr is pending on @cpu.  Return true if an IPI can be
 + * skipped, false if the IPI cannot be skipped.
 + */
 +#ifndef arch_pend_softirq
 +static always_inline bool arch_pend_softirq(unsigned int nr, unsigned int 
 cpu)
>>>
>>> Nit: I would maybe it arch_set_softirq(), I find `pend` not that clear
>>> (I would rather fully spell `pending` instead).
>>
>> I, too, did wonder about the naming here. But using "pending" as you suggest
>> has the effect of giving the function a name we would normally associate with
>> a predicate ("Is it pending?"), whereas here the function is used to _mark_ a
>> softirq as pending. Hence in the end I didn't comment at all; I'd be fine
>> with "set", but I'm also okay with "pend".
> 
> It's a set and check kind of function, so I don't care much.  Just
> found the pend a bit too short, I don't think we usually abbreviate
> pending to pend.

Aiui it's not an abbreviation, but kind of a verb (inverse-)derived from 
pending.
I don't know whether that's "official"; my dictionary doesn't have it.

Jan

Re: [PATCH v5 12/18] xen/cpufreq: get performance policy from governor set via xenpm

2025-07-04 Thread Jan Beulich

On 04.07.2025 09:51, Penny, Zheng wrote:
> [Public]
> 
>> -Original Message-
>> From: Jan Beulich 
>> Sent: Tuesday, June 17, 2025 12:22 AM
>> To: Penny, Zheng 
>> Cc: Huang, Ray ; xen-devel@lists.xenproject.org
>> Subject: Re: [PATCH v5 12/18] xen/cpufreq: get performance policy from 
>> governor
>> set via xenpm
>>
>> On 27.05.2025 10:48, Penny Zheng wrote:
>>> --- a/xen/drivers/acpi/pmstat.c
>>> +++ b/xen/drivers/acpi/pmstat.c
>>> @@ -319,6 +319,14 @@ static int set_cpufreq_gov(struct xen_sysctl_pm_op
>> *op)
>>>  if (new_policy.governor == NULL)
>>>  return -EINVAL;
>>>
>>> +new_policy.policy = cpufreq_policy_from_governor(new_policy.governor);
>>> +if ( new_policy.policy == CPUFREQ_POLICY_UNKNOWN )
>>> +{
>>> +printk("Failed to get performance policy from %s, and users
>>> + shall write epp values to deliver preference towards performance
>>> + over efficiency",
>>
>> This message is excessively long and is lacking a newline (i.e. effectively 
>> two:
>> one in the middle and one at the end; yet better would be to find less 
>> verbose
>> wording). What's worse, how would a "user" go about "writing epp values"?
>>
> 
> Maybe change it to
> ```
> printk("Failed to get performance policy from %s, Try \"xenpm 
> set-cpufreq-cppc\"\n", ...);
> ```

Looks quite a bit better, thanks.

Jan

Re: [PATCH v6 28/39] accel: Expose and register generic_handle_interrupt()

2025-07-04 Thread Philippe Mathieu-Daudé


On 4/7/25 08:38, Xiaoyao Li wrote:

On 7/4/2025 1:32 AM, Philippe Mathieu-Daudé wrote:

In order to dispatch over AccelOpsClass::handle_interrupt(),
we need it always defined, 


It seems I can only understand it until I see the code to really require 
it to be mandatory.


See 
https://lore.kernel.org/qemu-devel/acd1d192-f016-48d3-90e1-39d70eac4...@linaro.org/




But anyway, the change itself is correct.

Reviewed-by: Xiaoyao Li 


Thanks!

Re: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC sub-cmd

2025-07-04 Thread Jan Beulich

On 04.07.2025 10:13, Penny, Zheng wrote:
> [Public]
> 
>> -Original Message-
>> From: Jan Beulich 
>> Sent: Tuesday, June 17, 2025 6:08 PM
>> To: Penny, Zheng 
>> Cc: Huang, Ray ; Anthony PERARD
>> ; Juergen Gross ; Andrew
>> Cooper ; Orzel, Michal ;
>> Julien Grall ; Roger Pau Monné ; 
>> Stefano
>> Stabellini ; xen-devel@lists.xenproject.org
>> Subject: Re: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC sub-
>> cmd
>>
>> On 27.05.2025 10:48, Penny Zheng wrote:
>>> --- a/tools/misc/xenpm.c
>>> +++ b/tools/misc/xenpm.c
>>> @@ -69,6 +69,7 @@ void show_help(void)
>>>  " set-max-cstate|'unlimited' 
>>> [|'unlimited']\n"
>>>  " set the C-State 
>>> limitation ( >= 0) and\n"
>>>  " optionally the 
>>> C-sub-state limitation
>> ( >= 0)\n"
>>> +" get-cpufreq-cppc  [cpuid]   list cpu cppc parameter 
>>> of CPU
>>  or all\n"
>>>  " set-cpufreq-cppc  [cpuid] [balance|performance|powersave]
>> *\n"
>>>  " set Hardware P-State 
>>> (HWP) parameters\n"
>>>  " on CPU  or all if 
>>> omitted.\n"
>>> @@ -812,33 +813,7 @@ static void print_cpufreq_para(int cpuid, struct
>>> xc_get_cpufreq_para *p_cpufreq)
>>>
>>>  printf("scaling_driver   : %s\n", p_cpufreq->scaling_driver);
>>>
>>> -if ( hwp )
>>> -{
>>> -const xc_cppc_para_t *cppc = &p_cpufreq->u.cppc_para;
>>> -
>>> -printf("cppc variables   :\n");
>>> -printf("  hardware limits: lowest [%"PRIu32"] lowest nonlinear
>> [%"PRIu32"]\n",
>>> -   cppc->lowest, cppc->lowest_nonlinear);
>>> -printf(" : nominal [%"PRIu32"] highest 
>>> [%"PRIu32"]\n",
>>> -   cppc->nominal, cppc->highest);
>>> -printf("  configured limits  : min [%"PRIu32"] max [%"PRIu32"] 
>>> energy perf
>> [%"PRIu32"]\n",
>>> -   cppc->minimum, cppc->maximum, cppc->energy_perf);
>>> -
>>> -if ( cppc->features & XEN_SYSCTL_CPPC_FEAT_ACT_WINDOW )
>>> -{
>>> -unsigned int activity_window;
>>> -const char *units;
>>> -
>>> -activity_window = calculate_activity_window(cppc, &units);
>>> -printf(" : activity_window [%"PRIu32" 
>>> %s]\n",
>>> -   activity_window, units);
>>> -}
>>> -
>>> -printf(" : desired [%"PRIu32"%s]\n",
>>> -   cppc->desired,
>>> -   cppc->desired ? "" : " hw autonomous");
>>> -}
>>> -else
>>> +if ( !hwp )
>>>  {
>>>  if ( p_cpufreq->gov_num )
>>>  printf("scaling_avail_gov: %s\n",
>>
>> I'm not sure it is a good idea to alter what is being output for 
>> get-cpufreq-para.
>> People may simply miss that output then, without having any indication where 
>> it
>> went.
> 
> Hwp is more like amd-cppc in active mode. It also does not rely on Xen 
> governor to do performance tuning, so in previous design, we could borrow 
> governor filed to output cppc info
> However after introducing amd-cppc passive mode, we have request to output 
> both governor and CPPC info. And if continuing expanding get-cpufreq-para to 
> include CPPC info, it will make the parent stuct xen_sysctl.u exceed exceed 
> 128 bytes.

How is the xenpm command "get-cpufreq-para" related to the sysctl interface 
struct
size? If you need to invoke two sysctl sub-ops to produce all relevant
"get-cpufreq-para" output, so be it I would say.

> So I'm left to create a new subcmd to specifically deal with CPPC info
> I could leave above output for get-cpufreq-para unchanged. Then we will have 
> duplicate CPPC info in two commands. Or is it fine to do that?

Duplicate information (in distinct commands) isn't a problem either, imo.

Jason, you did the HWP work here - any thoughts?

Jan

Re: [PATCH v5 14/18] xen/cpufreq: introduce GET_CPUFREQ_CPPC sub-cmd

2025-07-04 Thread Jan Beulich

On 04.07.2025 10:32, Jan Beulich wrote:
> On 04.07.2025 10:13, Penny, Zheng wrote:
>>> -Original Message-
>>> From: Jan Beulich 
>>> Sent: Tuesday, June 17, 2025 6:08 PM
>>>
>>> On 27.05.2025 10:48, Penny Zheng wrote:
 --- a/tools/misc/xenpm.c
 +++ b/tools/misc/xenpm.c
 @@ -69,6 +69,7 @@ void show_help(void)
  " set-max-cstate|'unlimited' 
 [|'unlimited']\n"
  " set the C-State 
 limitation ( >= 0) and\n"
  " optionally the 
 C-sub-state limitation
>>> ( >= 0)\n"
 +" get-cpufreq-cppc  [cpuid]   list cpu cppc parameter 
 of CPU
>>>  or all\n"
  " set-cpufreq-cppc  [cpuid] 
 [balance|performance|powersave]
>>> *\n"
  " set Hardware P-State 
 (HWP) parameters\n"
  " on CPU  or all 
 if omitted.\n"
 @@ -812,33 +813,7 @@ static void print_cpufreq_para(int cpuid, struct
 xc_get_cpufreq_para *p_cpufreq)

  printf("scaling_driver   : %s\n", p_cpufreq->scaling_driver);

 -if ( hwp )
 -{
 -const xc_cppc_para_t *cppc = &p_cpufreq->u.cppc_para;
 -
 -printf("cppc variables   :\n");
 -printf("  hardware limits: lowest [%"PRIu32"] lowest nonlinear
>>> [%"PRIu32"]\n",
 -   cppc->lowest, cppc->lowest_nonlinear);
 -printf(" : nominal [%"PRIu32"] highest 
 [%"PRIu32"]\n",
 -   cppc->nominal, cppc->highest);
 -printf("  configured limits  : min [%"PRIu32"] max [%"PRIu32"] 
 energy perf
>>> [%"PRIu32"]\n",
 -   cppc->minimum, cppc->maximum, cppc->energy_perf);
 -
 -if ( cppc->features & XEN_SYSCTL_CPPC_FEAT_ACT_WINDOW )
 -{
 -unsigned int activity_window;
 -const char *units;
 -
 -activity_window = calculate_activity_window(cppc, &units);
 -printf(" : activity_window [%"PRIu32" 
 %s]\n",
 -   activity_window, units);
 -}
 -
 -printf(" : desired [%"PRIu32"%s]\n",
 -   cppc->desired,
 -   cppc->desired ? "" : " hw autonomous");
 -}
 -else
 +if ( !hwp )
  {
  if ( p_cpufreq->gov_num )
  printf("scaling_avail_gov: %s\n",
>>>
>>> I'm not sure it is a good idea to alter what is being output for 
>>> get-cpufreq-para.
>>> People may simply miss that output then, without having any indication 
>>> where it
>>> went.
>>
>> Hwp is more like amd-cppc in active mode. It also does not rely on Xen 
>> governor to do performance tuning, so in previous design, we could borrow 
>> governor filed to output cppc info
>> However after introducing amd-cppc passive mode, we have request to output 
>> both governor and CPPC info. And if continuing expanding get-cpufreq-para to 
>> include CPPC info, it will make the parent stuct xen_sysctl.u exceed exceed 
>> 128 bytes.
> 
> How is the xenpm command "get-cpufreq-para" related to the sysctl interface 
> struct
> size? If you need to invoke two sysctl sub-ops to produce all relevant
> "get-cpufreq-para" output, so be it I would say.
> 
>> So I'm left to create a new subcmd to specifically deal with CPPC info
>> I could leave above output for get-cpufreq-para unchanged. Then we will have 
>> duplicate CPPC info in two commands. Or is it fine to do that?
> 
> Duplicate information (in distinct commands) isn't a problem either, imo.

But, ftaod, duplicate code producing the same information is. Such code would
want breaking out into a helper function then. (And yet further ftaod, if the
code only produces identical information, but from different input structures,
such breaking out of course wouldn't normally be an option.)

Jan

Re: [PATCH v6 38/39] accel: Extract AccelClass definition to 'accel/accel-ops.h'

2025-07-04 Thread Zhao Liu

On Thu, Jul 03, 2025 at 07:32:44PM +0200, Philippe Mathieu-Daudé wrote:
> Date: Thu,  3 Jul 2025 19:32:44 +0200
> From: Philippe Mathieu-Daudé 
> Subject: [PATCH v6 38/39] accel: Extract AccelClass definition to
>  'accel/accel-ops.h'
> X-Mailer: git-send-email 2.49.0
> 
> Only accelerator implementations (and the common accelator
> code) need to know about AccelClass internals. Move the
> definition out but forward declare AccelState and AccelClass.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  MAINTAINERS |  2 +-
>  include/accel/accel-ops.h   | 50 +
>  include/qemu/accel.h| 40 ++---
>  include/system/hvf_int.h|  3 ++-
>  include/system/kvm_int.h|  1 +
>  accel/accel-common.c|  1 +
>  accel/accel-system.c|  1 +
>  accel/hvf/hvf-all.c |  1 +
>  accel/kvm/kvm-all.c |  1 +
>  accel/qtest/qtest.c |  1 +
>  accel/tcg/tcg-accel-ops.c   |  1 +
>  accel/tcg/tcg-all.c |  1 +
>  accel/xen/xen-all.c |  1 +
>  bsd-user/main.c |  1 +
>  gdbstub/system.c|  1 +
>  linux-user/main.c   |  1 +
>  system/memory.c |  1 +
>  target/i386/nvmm/nvmm-all.c |  1 +
>  target/i386/whpx/whpx-all.c |  1 +
>  19 files changed, 70 insertions(+), 40 deletions(-)
>  create mode 100644 include/accel/accel-ops.h

Reviewed-by: Zhao Liu

Re: [PATCH v2 5/6] x86/idle: Implement a new MWAIT IPI-elision algorithm

2025-07-04 Thread Roger Pau Monné

On Fri, Jul 04, 2025 at 05:34:09PM +0100, Andrew Cooper wrote:
> In order elide IPIs, we must be able to identify whether a target CPU is in
> MWAIT at the point it is woken up.  i.e. the store to wake it up must also
> identify the state.
> 
> Create a new in_mwait variable beside __softirq_pending, so we can use a
> CMPXCHG to set the softirq while also observing the status safely.  Implement
> an x86 version of arch_pend_softirq() which does this.
> 
> In mwait_idle_with_hints(), advertise in_mwait, with an explanation of
> precisely what it means.  X86_BUG_MONITOR can be accounted for simply by not
> advertising in_mwait.
> 
> Signed-off-by: Andrew Cooper 

Acked-by: Roger Pau Monné 

Thanks, Roger.

[XEN PATCH 5/5] xen/bitops: address violation of MISRA C Rule 5.5

2025-07-04 Thread Dmytro Prokopchuk1

Address a violation of MISRA C:2012 Rule 5.5:
"Identifiers shall be distinct from macro names".

Reports for service MC3A2.R5.5:
xen/include/xen/bitops.h: non-compliant function '__test_and_set_bit(int, 
volatile void*)'
xen/include/xen/bitops.h: non-compliant macro '__test_and_set_bit'
xen/include/xen/bitops.h: non-compliant function '__test_and_clear_bit(int, 
volatile void*)'
xen/include/xen/bitops.h: non-compliant macro '__test_and_clear_bit'
xen/include/xen/bitops.h: non-compliant function '__test_and_change_bit(int, 
volatile void*)'
xen/include/xen/bitops.h: non-compliant macro '__test_and_change_bit'
xen/include/xen/bitops.h: non-compliant function 'test_bit(int, const volatile 
void*)'
xen/include/xen/bitops.h: non-compliant macro 'test_bit'

The primary issue stems from the macro and function
having identical names, which is confusing and
non-compliant with common coding standards.
Change the functions names by adding two underscores at the end.
No functional changes.

Signed-off-by: Dmytro Prokopchuk 
---
 xen/include/xen/bitops.h | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/xen/include/xen/bitops.h b/xen/include/xen/bitops.h
index a4d31ec02a..b292470ad7 100644
--- a/xen/include/xen/bitops.h
+++ b/xen/include/xen/bitops.h
@@ -120,7 +120,7 @@ static always_inline bool generic_test_bit(int nr, const 
volatile void *addr)
 }
 
 /**
- * __test_and_set_bit - Set a bit and return its old value
+ * __test_and_set_bit__ - Set a bit and return its old value
  * @nr: Bit to set
  * @addr: Address to count from
  *
@@ -129,7 +129,7 @@ static always_inline bool generic_test_bit(int nr, const 
volatile void *addr)
  * but actually fail.  You must protect multiple accesses with a lock.
  */
 static always_inline bool
-__test_and_set_bit(int nr, volatile void *addr)
+__test_and_set_bit__(int nr, volatile void *addr)
 {
 #ifndef arch__test_and_set_bit
 #define arch__test_and_set_bit generic__test_and_set_bit
@@ -139,11 +139,11 @@ __test_and_set_bit(int nr, volatile void *addr)
 }
 #define __test_and_set_bit(nr, addr) ({ \
 if ( bitop_bad_size(addr) ) __bitop_bad_size(); \
-__test_and_set_bit(nr, addr);   \
+__test_and_set_bit__(nr, addr); \
 })
 
 /**
- * __test_and_clear_bit - Clear a bit and return its old value
+ * __test_and_clear_bit__ - Clear a bit and return its old value
  * @nr: Bit to clear
  * @addr: Address to count from
  *
@@ -152,7 +152,7 @@ __test_and_set_bit(int nr, volatile void *addr)
  * but actually fail.  You must protect multiple accesses with a lock.
  */
 static always_inline bool
-__test_and_clear_bit(int nr, volatile void *addr)
+__test_and_clear_bit__(int nr, volatile void *addr)
 {
 #ifndef arch__test_and_clear_bit
 #define arch__test_and_clear_bit generic__test_and_clear_bit
@@ -162,11 +162,11 @@ __test_and_clear_bit(int nr, volatile void *addr)
 }
 #define __test_and_clear_bit(nr, addr) ({   \
 if ( bitop_bad_size(addr) ) __bitop_bad_size(); \
-__test_and_clear_bit(nr, addr); \
+__test_and_clear_bit__(nr, addr);   \
 })
 
 /**
- * __test_and_change_bit - Change a bit and return its old value
+ * __test_and_change_bit__ - Change a bit and return its old value
  * @nr: Bit to change
  * @addr: Address to count from
  *
@@ -175,7 +175,7 @@ __test_and_clear_bit(int nr, volatile void *addr)
  * but actually fail.  You must protect multiple accesses with a lock.
  */
 static always_inline bool
-__test_and_change_bit(int nr, volatile void *addr)
+__test_and_change_bit__(int nr, volatile void *addr)
 {
 #ifndef arch__test_and_change_bit
 #define arch__test_and_change_bit generic__test_and_change_bit
@@ -185,11 +185,11 @@ __test_and_change_bit(int nr, volatile void *addr)
 }
 #define __test_and_change_bit(nr, addr) ({  \
 if ( bitop_bad_size(addr) ) __bitop_bad_size(); \
-__test_and_change_bit(nr, addr);\
+__test_and_change_bit__(nr, addr);  \
 })
 
 /**
- * test_bit - Determine whether a bit is set
+ * test_bit__ - Determine whether a bit is set
  * @nr: bit number to test
  * @addr: Address to start counting from
  *
@@ -197,7 +197,7 @@ __test_and_change_bit(int nr, volatile void *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static always_inline bool test_bit(int nr, const volatile void *addr)
+static always_inline bool test_bit__(int nr, const volatile void *addr)
 {
 #ifndef arch_test_bit
 #define arch_test_bit generic_test_bit
@@ -207,7 +207,7 @@ static always_inline bool test_bit(int nr, const volatile 
void *addr)
 }
 #define test_bit(nr, addr) ({   \
 if ( bitop_bad_size(addr) ) __bitop_bad_size(); \
-test_bit(nr, addr); \
+test_bit__(nr, addr);   \
 })
 
 /* ---

[XEN PATCH 4/5] device-tree: address violation of MISRA C Rule 5.5

2025-07-04 Thread Dmytro Prokopchuk1

Address a violation of MISRA C:2012 Rule 5.5:
"Identifiers shall be distinct from macro names".

Reports for service MC3A2.R5.5:
xen/include/xen/fdt-domain-build.h: non-compliant parameter 'copy_to_guest'
xen/include/xen/guest_access.h: non-compliant macro 'copy_to_guest'

Rename 'copy_to_guest' function parameter to 'cb' for compliance.
No functional changes.

Signed-off-by: Dmytro Prokopchuk 
---
 xen/common/device-tree/domain-build.c | 9 -
 xen/include/xen/fdt-domain-build.h| 4 ++--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/xen/common/device-tree/domain-build.c 
b/xen/common/device-tree/domain-build.c
index cd01a8b4bc..2b009547d0 100644
--- a/xen/common/device-tree/domain-build.c
+++ b/xen/common/device-tree/domain-build.c
@@ -331,7 +331,7 @@ void __init allocate_memory(struct domain *d, struct 
kernel_info *kinfo)
 }
 
 void __init dtb_load(struct kernel_info *kinfo,
- copy_to_guest_phys_cb copy_to_guest)
+ copy_to_guest_phys_cb cb)
 {
 unsigned long left;
 
@@ -339,7 +339,7 @@ void __init dtb_load(struct kernel_info *kinfo,
kinfo->d, kinfo->dtb_paddr,
kinfo->dtb_paddr + fdt_totalsize(kinfo->fdt));
 
-left = copy_to_guest(kinfo->d, kinfo->dtb_paddr,
+left = cb(kinfo->d, kinfo->dtb_paddr,
  kinfo->fdt,
  fdt_totalsize(kinfo->fdt));
 
@@ -350,7 +350,7 @@ void __init dtb_load(struct kernel_info *kinfo,
 }
 
 void __init initrd_load(struct kernel_info *kinfo,
-copy_to_guest_phys_cb copy_to_guest)
+copy_to_guest_phys_cb cb)
 {
 const struct boot_module *mod = kinfo->initrd;
 paddr_t load_addr = kinfo->initrd_paddr;
@@ -393,8 +393,7 @@ void __init initrd_load(struct kernel_info *kinfo,
 if ( !initrd )
 panic("Unable to map the %pd initrd\n", kinfo->d);
 
-res = copy_to_guest(kinfo->d, load_addr,
-initrd, len);
+res = cb(kinfo->d, load_addr, initrd, len);
 if ( res != 0 )
 panic("Unable to copy the initrd in the %pd memory\n", kinfo->d);
 
diff --git a/xen/include/xen/fdt-domain-build.h 
b/xen/include/xen/fdt-domain-build.h
index 45981dbec0..3a20623cf5 100644
--- a/xen/include/xen/fdt-domain-build.h
+++ b/xen/include/xen/fdt-domain-build.h
@@ -50,10 +50,10 @@ typedef unsigned long (*copy_to_guest_phys_cb)(struct 
domain *d,
unsigned int len);
 
 void initrd_load(struct kernel_info *kinfo,
- copy_to_guest_phys_cb copy_to_guest);
+ copy_to_guest_phys_cb cb);
 
 void dtb_load(struct kernel_info *kinfo,
-  copy_to_guest_phys_cb copy_to_guest);
+  copy_to_guest_phys_cb cb);
 
 int find_unallocated_memory(const struct kernel_info *kinfo,
 const struct membanks *mem_banks[],
-- 
2.43.0

[XEN PATCH 2/5] iommu: address violation of MISRA C Rule 5.5

2025-07-04 Thread Dmytro Prokopchuk1

Address a violation of MISRA C:2012 Rule 5.5:
"Identifiers shall be distinct from macro names".

Reports for service MC3A2.R5.5:
xen/include/xen/iommu.h: non-compliant struct 'page_list_head'
xen/include/xen/mm.h: non-compliant macro 'page_list_head'
xen/drivers/passthrough/iommu.c: non-compliant macro 'iommu_quarantine'
xen/include/xen/iommu.h: non-compliant variable 'iommu_quarantine'

These external variables ('iommu_pt_cleanup_lock'
and 'iommu_pt_cleanup_list') are no longer used
in the codebase. Removing them eliminates dead
code and ensures compliance with coding standards.
Fixes: b5622eb627 (iommu: remove unused iommu_ops method and tasklet, 
2020-09-22)

The variable 'iommu_quarantine' makes sence to use
only if 'CONFIG_HAS_PCI=y', so place it inside '#ifdef'.

Signed-off-by: Dmytro Prokopchuk 
---
 xen/include/xen/iommu.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h
index ebfada1d88..57f338e2a0 100644
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -53,7 +53,9 @@ static inline bool dfn_eq(dfn_t x, dfn_t y)
 extern bool iommu_enable, iommu_enabled;
 extern bool force_iommu, iommu_verbose;
 /* Boolean except for the specific purposes of drivers/passthrough/iommu.c. */
+#ifdef CONFIG_HAS_PCI
 extern uint8_t iommu_quarantine;
+#endif /* CONFIG_HAS_PCI */
 #else
 #define iommu_enabled false
 #endif
@@ -500,9 +502,6 @@ void iommu_dev_iotlb_flush_timeout(struct domain *d, struct 
pci_dev *pdev);
  */
 DECLARE_PER_CPU(bool, iommu_dont_flush_iotlb);
 
-extern struct spinlock iommu_pt_cleanup_lock;
-extern struct page_list_head iommu_pt_cleanup_list;
-
 bool arch_iommu_use_permitted(const struct domain *d);
 
 #ifdef CONFIG_X86
-- 
2.43.0

[XEN PATCH 3/5] x86/irq: address violation of MISRA C Rule 5.5

2025-07-04 Thread Dmytro Prokopchuk1

Address a violation of MISRA C:2012 Rule 5.5:
"Identifiers shall be distinct from macro names".

Reports for service MC3A2.R5.5:
xen/include/xen/irq.h: non-compliant function `pirq_cleanup_check(struct pirq*, 
struct domain*)'
xen/include/xen/irq.h: non-compliant macro `pirq_cleanup_check'

The primary issue stems from the macro and function
having identical names, which is confusing and
non-compliant with common coding standards.

Change the function name by adding two underscores at the end.

Signed-off-by: Dmytro Prokopchuk 
---
 xen/arch/x86/irq.c| 2 +-
 xen/include/xen/irq.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index 556134f85a..d61720aa60 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1383,7 +1383,7 @@ struct pirq *alloc_pirq_struct(struct domain *d)
 return pirq;
 }
 
-void (pirq_cleanup_check)(struct pirq *pirq, struct domain *d)
+void pirq_cleanup_check__(struct pirq *pirq, struct domain *d)
 {
 /*
  * Check whether all fields have their default values, and delete
diff --git a/xen/include/xen/irq.h b/xen/include/xen/irq.h
index 95034c0d6b..02f686a999 100644
--- a/xen/include/xen/irq.h
+++ b/xen/include/xen/irq.h
@@ -183,10 +183,10 @@ extern struct pirq *pirq_get_info(struct domain *d, int 
pirq);
 #define pirq_to_evtchn(d, pirq) pirq_field(d, pirq, evtchn, 0)
 #define pirq_masked(d, pirq) pirq_field(d, pirq, masked, 0)
 
-void pirq_cleanup_check(struct pirq *pirq, struct domain *d);
+void pirq_cleanup_check__(struct pirq *pirq, struct domain *d);
 
 #define pirq_cleanup_check(pirq, d) \
-(!(pirq)->evtchn ? pirq_cleanup_check(pirq, d) : (void)0)
+(!(pirq)->evtchn ? pirq_cleanup_check__(pirq, d) : (void)0)
 
 extern void pirq_guest_eoi(struct pirq *pirq);
 extern void desc_guest_eoi(struct irq_desc *desc, struct pirq *pirq);
-- 
2.43.0

[XEN PATCH 1/5] gnttab: address violation of MISRA C Rule 5.5

2025-07-04 Thread Dmytro Prokopchuk1

Address a violation of MISRA C:2012 Rule 5.5:
"Identifiers shall be distinct from macro names".

Reports for service MC3A2.R5.5:
xen/common/grant_table.c: non-compliant macro 'update_gnttab_par'
xen/common/grant_table.c: non-compliant macro 'parse_gnttab_limit'

The macros above are intended to discard function arguments (unused1, unused2)
when compiling with different configurations of CONFIG_HYPFS.
This can lead to confusion and unexpected behavior
because the macro name and the function name are identical.
Split the code and create two distinct function signatures.
This ensures that the code behaves predictably and remains compliant.

Signed-off-by: Dmytro Prokopchuk 
---
 xen/common/grant_table.c | 22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index cf131c43a1..f3282a1d7b 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -126,18 +126,12 @@ static void __init cf_check 
max_maptrack_frames_init(struct param_hypfs *par)
 update_gnttab_par(opt_max_maptrack_frames, par,
   opt_max_maptrack_frames_val);
 }
-#else
-#define update_gnttab_par(v, unused1, unused2) update_gnttab_par(v)
-#define parse_gnttab_limit(a, v, unused1, unused2) parse_gnttab_limit(a, v)
-
-static void update_gnttab_par(unsigned int val, struct param_hypfs *par,
-  char *parval)
-{
-}
-#endif
 
 static int parse_gnttab_limit(const char *arg, unsigned int *valp,
   struct param_hypfs *par, char *parval)
+#else
+static int parse_gnttab_limit(const char *arg, unsigned int *valp)
+#endif
 {
 const char *e;
 unsigned long val;
@@ -150,7 +144,9 @@ static int parse_gnttab_limit(const char *arg, unsigned int 
*valp,
 return -ERANGE;
 
 *valp = val;
+#ifdef CONFIG_HYPFS
 update_gnttab_par(val, par, parval);
+#endif
 
 return 0;
 }
@@ -161,9 +157,13 @@ custom_runtime_param("gnttab_max_frames", 
parse_gnttab_max_frames,
 
 static int cf_check parse_gnttab_max_frames(const char *arg)
 {
+#ifdef CONFIG_HYPFS
 return parse_gnttab_limit(arg, &opt_max_grant_frames,
   param_2_parfs(parse_gnttab_max_frames),
   opt_max_grant_frames_val);
+#else
+return parse_gnttab_limit(arg, &opt_max_grant_frames);
+#endif
 }
 
 static int cf_check parse_gnttab_max_maptrack_frames(const char *arg);
@@ -173,9 +173,13 @@ custom_runtime_param("gnttab_max_maptrack_frames",
 
 static int cf_check parse_gnttab_max_maptrack_frames(const char *arg)
 {
+#ifdef CONFIG_HYPFS
 return parse_gnttab_limit(arg, &opt_max_maptrack_frames,
   param_2_parfs(parse_gnttab_max_maptrack_frames),
   opt_max_maptrack_frames_val);
+#else
+return parse_gnttab_limit(arg, &opt_max_maptrack_frames);
+#endif
 }
 
 #ifndef GNTTAB_MAX_VERSION
-- 
2.43.0

[XEN PATCH 0/5] address violation of MISRA C Rule 5.5

2025-07-04 Thread Dmytro Prokopchuk1

This patch series fully eliminates MISRA C Rule 5.5
violations for ARM64.

The previous thread is here:
https://lore.kernel.org/xen-devel/48c7830931a98b2bf70ef1509f309b262b9e5792.1745427770.git.victorm.l...@amd.com/
where that violation was proposed to be deviated.

Dmytro Prokopchuk (5):
  gnttab: address violation of MISRA C Rule 5.5
  iommu: address violation of MISRA C Rule 5.5
  x86/irq: address violation of MISRA C Rule 5.5
  device-tree: address violation of MISRA C Rule 5.5
  xen/bitops: address violation of MISRA C Rule 5.5

 xen/arch/x86/irq.c|  2 +-
 xen/common/device-tree/domain-build.c |  9 -
 xen/common/grant_table.c  | 22 +-
 xen/include/xen/bitops.h  | 24 
 xen/include/xen/fdt-domain-build.h|  4 ++--
 xen/include/xen/iommu.h   |  5 ++---
 xen/include/xen/irq.h |  4 ++--
 7 files changed, 36 insertions(+), 34 deletions(-)

-- 
2.43.0

Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()

2025-07-04 Thread Oleksii Kurochko



On 7/1/25 3:49 PM, Jan Beulich wrote:

On 10.06.2025 15:05, Oleksii Kurochko wrote:

This patch introduces p2m_set_entry() and its core helper __p2m_set_entry() for
RISC-V, based loosely on the Arm implementation, with several RISC-V-specific
modifications.

Key differences include:
- TLB Flushing: RISC-V allows caching of invalid PTEs and does not require
   break-before-make (BBM). As a result, the flushing logic is simplified.
   TLB invalidation can be deferred until p2m_write_unlock() is called.
   Consequently, the p2m->need_flush flag is always considered true and is
   removed.
- Page Table Traversal: The order of walking the page tables differs from Arm,
   and this implementation reflects that reversed traversal.
- Macro Adjustments: The macros P2M_ROOT_LEVEL, P2M_ROOT_ORDER, and
   P2M_ROOT_PAGES are updated to align with the new RISC-V implementation.

The main functionality is in __p2m_set_entry(), which handles mappings aligned
to page table block entries (e.g., 1GB, 2MB, or 4KB with 4KB granularity).

p2m_set_entry() breaks a region down into block-aligned mappings and calls
__p2m_set_entry() accordingly.

Stub implementations (to be completed later) include:
- p2m_free_entry()

What would a function of this name do?


Recursively visiting all leaf PTE's for sub-tree behind an entry, then calls
put_page() (which will free if there is no any reference to this page),
freeing intermediate page table (after all entries were freed) by removing
it from d->arch.paging.freelist, and removes correspondent page of intermediate 
page
table from p2m->pages list.


You can clear entries, but you can't
free them, can you?


Is is a question regarding terminology? I can't free entry itself, but a page 
table or
a page (if it is a leaf entry) on which it points could free.


--- a/xen/arch/riscv/include/asm/p2m.h
+++ b/xen/arch/riscv/include/asm/p2m.h
@@ -9,8 +9,13 @@
  #include 
  #include 
  
+#include 

  #include 
  
+#define P2M_ROOT_LEVEL  HYP_PT_ROOT_LEVEL

+#define P2M_ROOT_ORDER  XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL)

This is confusing, as in patch 6 we see that p2m root table order is 2.
Something needs doing about the naming, so the two sets of things can't
be confused.


Agree, confusing enough.

I will define|P2M_ROOT_ORDER| 
as|get_order_from_bytes(GUEST_ROOT_PAGE_TABLE_SIZE)|
(or declare a new variable to store this value).

Actually, the way it's currently defined was only needed 
for|p2m_get_root_pointer() |to find the root page table by GFN, 
but|XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL) |is used explicitly there, so I just 
missed doing a proper cleanup.



--- a/xen/arch/riscv/p2m.c
+++ b/xen/arch/riscv/p2m.c
@@ -231,6 +231,8 @@ int p2m_init(struct domain *d)
  INIT_PAGE_LIST_HEAD(&p2m->pages);
  
  p2m->vmid = INVALID_VMID;

+p2m->max_mapped_gfn = _gfn(0);
+p2m->lowest_mapped_gfn = _gfn(ULONG_MAX);
  
  p2m->default_access = p2m_access_rwx;
  
@@ -325,6 +327,214 @@ int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)

  return 0;
  }
  
+/*

+ * Find and map the root page table. The caller is responsible for
+ * unmapping the table.
+ *
+ * The function will return NULL if the offset of the root table is
+ * invalid.

Don't you mean "offset into ..."?


If you won't suggested that, I will think that the meaning of "of" and "into" 
is pretty close.
But it seems like semantically "into" is more accurate and better conveys the 
intent of the code.


+ */
+static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
+{
+unsigned long root_table_indx;
+
+root_table_indx = gfn_x(gfn) >> XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL);
+if ( root_table_indx >= P2M_ROOT_PAGES )
+return NULL;
+
+return __map_domain_page(p2m->root + root_table_indx);
+}
+
+static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)

The rule of thumb is to have inline functions only in header files, leaving
decisions to the compiler elsewhere.


I am not sure what you mean in the second part (after coma) of your sentence.


+{
+panic("%s: isn't implemented for now\n", __func__);
+
+return false;
+}

For this function in particular, though: Besides the "p2me" in the name
being somewhat odd (supposedly page table entries here are simply pte_t),
how is this going to be different from pte_is_valid()?


pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
what is a type stored in the radix tree (p2m->p2m_types):
  /*
   * In the case of the P2M, the valid bit is used for other purpose. Use
   * the type to check whether an entry is valid.
   */
  static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
  {
  return p2m_type_radix_get(p2m, pte) != p2m_invalid;
  }

It is done to track which page was modified by a guest.



+static inline void p2m_write_pte(pte_t *p, pte_t pte, bool clean_pte)
+{
+write_pte(p, pte);
+if ( clean_pte )
+clean_dcache_va_range(p, sizeof(*p));
+}
+
+static inl

86 matches

Mail list logo