Re: [PATCH 08/15] tools: Add domain_id and expert mode for overlay operations
Hi Anthony, On 5/1/2024 10:46 PM, Anthony PERARD wrote: On Wed, Apr 24, 2024 at 11:34:42AM +0800, Henry Wang wrote: From: Vikram Garhwal Add domain_id and expert mode for overlay assignment. This enables dynamic programming of nodes during runtime. Take the opportunity to fix the name mismatch in the xl command, the command name should be "dt-overlay" instead of "dt_overlay". I don't like much these unrelated / opportunistic changes in a patch, I'd rather have a separate patch. And in this case, if it was on a separate patch, that separated patch could gain: Fixes: 61765a07e3d8 ("tools/xl: Add new xl command overlay for device tree overlay support") and potentially backported. Ok. I can split this part to a separated commit. Signed-off-by: Vikram Garhwal Signed-off-by: Stefano Stabellini Signed-off-by: Henry Wang --- tools/include/libxl.h | 8 +-- tools/include/xenctrl.h | 5 +++-- tools/libs/ctrl/xc_dt_overlay.c | 7 -- tools/libs/light/libxl_dt_overlay.c | 17 +++ tools/xl/xl_vmcontrol.c | 34 ++--- 5 files changed, 58 insertions(+), 13 deletions(-) diff --git a/tools/include/libxl.h b/tools/include/libxl.h index 62cb07dea6..59a3e1b37c 100644 --- a/tools/include/libxl.h +++ b/tools/include/libxl.h @@ -2549,8 +2549,12 @@ libxl_device_pci *libxl_device_pci_list(libxl_ctx *ctx, uint32_t domid, void libxl_device_pci_list_free(libxl_device_pci* list, int num); #if defined(__arm__) || defined(__aarch64__) -int libxl_dt_overlay(libxl_ctx *ctx, void *overlay, - uint32_t overlay_size, uint8_t overlay_op); +#define LIBXL_DT_OVERLAY_ADD 1 +#define LIBXL_DT_OVERLAY_REMOVE2 + +int libxl_dt_overlay(libxl_ctx *ctx, uint32_t domain_id, void *overlay, + uint32_t overlay_size, uint8_t overlay_op, bool auto_mode, + bool domain_mapping); Sorry, you cannot change the API of an existing libxl function without providing something backward compatible. We have already a few example of this changes in libxl.h, e.g.: fded24ea8315 ("libxl: Make libxl_set_vcpuonline async") So, providing a wrapper called libxl_dt_overlay_0x041800() which call the new function. Ok, I will add an wrapper. #endif /* diff --git a/tools/libs/light/libxl_dt_overlay.c b/tools/libs/light/libxl_dt_overlay.c index a6c709a6dc..cdb62b28cf 100644 --- a/tools/libs/light/libxl_dt_overlay.c +++ b/tools/libs/light/libxl_dt_overlay.c @@ -57,10 +58,18 @@ int libxl_dt_overlay(libxl_ctx *ctx, void *overlay_dt, uint32_t overlay_dt_size, rc = 0; } -r = xc_dt_overlay(ctx->xch, overlay_dt, overlay_dt_size, overlay_op); +/* Check if user entered a valid domain id. */ +rc = libxl_domain_info(CTX, NULL, domid); +if (rc == ERROR_DOMAIN_NOTFOUND) { Why do you check specifically for "domain not found", what about other error? I agree this is indeed very confusing...I will rewrite this part properly in the next version. +LOGD(ERROR, domid, "Non-existant domain."); +return ERROR_FAIL; Use `goto out`, and you can let the function return ERROR_DOMAIN_NOTFOUND if that the error, we can just propagate the `rc` from libxl_domain_info(). Sure, will do the suggested way. +} + +r = xc_dt_overlay(ctx->xch, domid, overlay_dt, overlay_dt_size, overlay_op, + domain_mapping); if (r) { -LOG(ERROR, "%s: Adding/Removing overlay dtb failed.", __func__); +LOG(ERROR, "domain%d: Adding/Removing overlay dtb failed.", domid); You could replace the macro by LOGD, instead of handwriting "domain%d". Great suggestion. I will use LOGD. rc = ERROR_FAIL; } diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c index 98f6bd2e76..9674383ec3 100644 --- a/tools/xl/xl_vmcontrol.c +++ b/tools/xl/xl_vmcontrol.c @@ -1270,21 +1270,48 @@ int main_dt_overlay(int argc, char **argv) { const char *overlay_ops = NULL; const char *overlay_config_file = NULL; +uint32_t domain_id = 0; void *overlay_dtb = NULL; int rc; +bool auto_mode = true; +bool domain_mapping = false; uint8_t op; int overlay_dtb_size = 0; const int overlay_add_op = 1; const int overlay_remove_op = 2; -if (argc < 2) { -help("dt_overlay"); +if (argc < 3) { +help("dt-overlay"); return EXIT_FAILURE; } +if (argc > 5) { +fprintf(stderr, "Too many arguments\n"); +return ERROR_FAIL; +} + overlay_ops = argv[1]; overlay_config_file = argv[2]; +if (!strcmp(argv[argc - 1], "-e")) +auto_mode = false; + +if (argc == 4 || !auto_mode) { +domain_id = find_domain(argv[argc-1]); +domain_mapping = true; +} + +if (argc == 5 || !auto_mode) { +domain_id = find_domain(argv[argc-2]); +domain_mapping =
Re: [PATCH 09/15] tools/libs/light: Modify dtbo to domU linux dtbo format
Hi Anthony, On 5/1/2024 11:09 PM, Anthony PERARD wrote: On Wed, Apr 24, 2024 at 11:34:43AM +0800, Henry Wang wrote: diff --git a/tools/libs/light/libxl_dt_overlay.c b/tools/libs/light/libxl_dt_overlay.c index cdb62b28cf..eaf11a0f9c 100644 --- a/tools/libs/light/libxl_dt_overlay.c +++ b/tools/libs/light/libxl_dt_overlay.c @@ -41,6 +42,69 @@ static int check_overlay_fdt(libxl__gc *gc, void *fdt, size_t size) return 0; } +static int modify_overlay_for_domU(libxl__gc *gc, void *overlay_dt_domU, + size_t size) +{ +int rc = 0; +int virtual_interrupt_parent = GUEST_PHANDLE_GIC; +const struct fdt_property *fdt_prop_node = NULL; +int overlay; +int prop_len = 0; +int subnode = 0; +int fragment; +const char *prop_name; +const char *target_path = "/"; + +fdt_for_each_subnode(fragment, overlay_dt_domU, 0) { +prop_name = fdt_getprop(overlay_dt_domU, fragment, "target-path", +_len); +if (prop_name == NULL) { +LOG(ERROR, "target-path property not found\n"); LOG* macros already takes care of adding \n, no need to add an extra one. Sure, I will remove the "\n". +rc = ERROR_FAIL; +goto err; +} + +/* Change target path for domU dtb. */ +rc = fdt_setprop_string(overlay_dt_domU, fragment, "target-path", fdt_setprop_string() isn't a libxl function, store the return value in a variable named `r` instead.' Thanks for spotting this. Will change it to `r`. +target_path); +if (rc) { +LOG(ERROR, "Setting interrupt parent property failed for %s\n", +prop_name); +goto err; +} + +overlay = fdt_subnode_offset(overlay_dt_domU, fragment, "__overlay__"); + +fdt_for_each_subnode(subnode, overlay_dt_domU, overlay) +{ +const char *node_name = fdt_get_name(overlay_dt_domU, subnode, + NULL); + +fdt_prop_node = fdt_getprop(overlay_dt_domU, subnode, +"interrupt-parent", _len); +if (fdt_prop_node == NULL) { +LOG(DETAIL, "%s property not found for %s. Skip to next node\n", +"interrupt-parent", node_name); Why do you have "interrupt-parent" in a separate argument? Do you meant to do something like const char *some_name = "interrupt-parent"; and use that in the 4 different places that this string is used? (Using a variable mean that we (or the compiler) can make sure that they are all spelled correctly. Great suggestion! I will do this way. +continue; +} + +rc = fdt_setprop_inplace_u32(overlay_dt_domU, subnode, + "interrupt-parent", + virtual_interrupt_parent); +if (rc) { +LOG(ERROR, "Setting interrupt parent property failed for %s\n", +"interrupt-parent"); +goto err; +} +} +} + +return 0; Missed indentation. Will correct it. + +err: +return rc; A few things, looks like `rc` is always going to be ERROR_FAIL here, unless you find an libxl_error code that better describe the error, so you could forgo the `rc` variable. Also, if you don't need to clean up anything in the function or have a generic error message, you could simply "return " instead of using the "goto" style. Sure, I will simply use return because I don't really think there is anything to be cleaned up. +} + int libxl_dt_overlay(libxl_ctx *ctx, uint32_t domid, void *overlay_dt, uint32_t overlay_dt_size, uint8_t overlay_op, bool auto_mode, bool domain_mapping) @@ -73,6 +137,15 @@ int libxl_dt_overlay(libxl_ctx *ctx, uint32_t domid, void *overlay_dt, rc = ERROR_FAIL; } +/* + * auto_mode doesn't apply to dom0 as dom0 can get the physical + * description of the hardware. + */ +if (domid && auto_mode) { +if (overlay_op == LIBXL_DT_OVERLAY_ADD) Shouldn't libxl complain if the operation is different? I will add corresponding error handling code here. Thanks! Kind regards, Henry +rc = modify_overlay_for_domU(gc, overlay_dt, overlay_dt_size); +} + out: GC_FREE; return rc; Thanks,
Re: [PATCH 07/15] xen/overlay: Enable device tree overlay assignment to running domains
Hi Julien, On 4/30/2024 5:47 PM, Julien Grall wrote: On 30/04/2024 05:00, Henry Wang wrote: Hi Julien, Hi Henry, On 4/30/2024 1:34 AM, Julien Grall wrote: On 29/04/2024 04:36, Henry Wang wrote: Hi Jan, Julien, Stefano, Hi Henry, On 4/24/2024 2:05 PM, Jan Beulich wrote: On 24.04.2024 05:34, Henry Wang wrote: --- a/xen/include/public/sysctl.h +++ b/xen/include/public/sysctl.h @@ -1197,7 +1197,9 @@ struct xen_sysctl_dt_overlay { #define XEN_SYSCTL_DT_OVERLAY_ADD 1 #define XEN_SYSCTL_DT_OVERLAY_REMOVE 2 uint8_t overlay_op; /* IN: Add or remove. */ - uint8_t pad[3]; /* IN: Must be zero. */ + bool domain_mapping; /* IN: True of False. */ + uint8_t pad[2]; /* IN: Must be zero. */ + uint32_t domain_id; }; If you merely re-purposed padding fields, all would be fine without bumping the interface version. Yet you don't, albeit for an unclear reason: Why uint32_t rather than domid_t? And on top of that - why a separate boolean when you could use e.g. DOMID_INVALID to indicate "no domain mapping"? I think both of your suggestion make great sense. I will follow the suggestion in v2. That said - anything taking a domain ID is certainly suspicious in a sysctl. Judging from the description you really mean this to be a domctl. Anything else will require extra justification. I also think a domctl is better. I had a look at the history of the already merged series, it looks like in the first version of merged part 1 [1], the hypercall was implemented as the domctl in the beginning but later in v2 changed to sysctl. I think this makes sense as the scope of that time is just to make Xen aware of the device tree node via Xen device tree. However this is now a problem for the current part where the scope (and the end goal) is extended to assign the added device to Linux Dom0/DomU via device tree overlays. I am not sure which way is better, should we repurposing the sysctl to domctl or maybe add another domctl (I am worrying about the duplication because basically we need the same sysctl functionality but now with a domid in it)? What do you think? I am not entirely sure this is a good idea to try to add the device in Xen and attach it to the guests at the same time. Imagine the following situation: 1) Add and attach devices 2) The domain is rebooted 3) Detach and remove devices After step 2, you technically have a new domain. You could have also a case where this is a completely different guest. So the flow would look a little bit weird (you create the DT overlay with domain A but remove with domain B). So, at the moment, it feels like the add/attach (resp detech/remove) operations should happen separately. Thinking a bit more about it, there is another problem with the single hypercall appproach. The MMIOs will be mapped 1:1 to the guest. These region may clash with other part of the layout for domain created by the toolstack and dom0less (if the 1:1 option has not been enabled). I guess for that add, it would be possible to specify the mapping in the Device-Tree. But that would not work for the removal (this may be a different domain). On a somewhat similar topic, the number of IRQs supported by the vGIC is fixed at boot. How would that work with this patch? Seeing your comment here I now realized patch #5 is to address this issue. But I think we need to have a complete rework of the original patch to make the feature portable. We can continue the discussion in patch 5. Can you clarify why you want to add devices to Xen and attach to a guest within a single hypercall? Sorry I don't know if there is any specific thoughts on the design of using a single hypercall to do both add devices to Xen device tree and assign the device to the guest. In fact seeing your above comments, I think separating these two functionality to two xl commands using separated hypercalls would indeed be a better idea. Thank you for the suggestion! To make sure I understand correctly, would you mind confirming if below actions for v2 make sense to you? Thanks! - Only use the XEN_SYSCTL_DT_OVERLAY_{ADD, REMOVE} sysctls to add/remove overlay to Xen device tree Note that this would attach the devices to dom0 first. Maybe this is why it was decided to merge the two operations? An option would be to allow the devices to be attached to no-one. - Introduce the xl dt-overlay attach command and respective domctls to do the device assignment for the overlay to domain. We already have domctls to route IRQs and map MMIOs. So do we actually need new domctls? No I don't think so, like you and Stefano said in the other thread, I think I need to split the command to different hypercalls instead of only one hypercall and reuse the existing domctl. Kind regards, Henry Cheers,
Re: [PATCH 05/15] tools/libs/light: Increase nr_spi to 160
Hi Anthony, (+Arm maintainers) On 5/1/2024 9:58 PM, Anthony PERARD wrote: On Wed, Apr 24, 2024 at 11:34:39AM +0800, Henry Wang wrote: Increase number of spi to 160 i.e. gic_number_lines() for Xilinx ZynqMP - 32. This was done to allocate and assign IRQs to a running domain. Signed-off-by: Vikram Garhwal Signed-off-by: Stefano Stabellini Signed-off-by: Henry Wang --- tools/libs/light/libxl_arm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c index dd5c9f4917..50dbd0f2a9 100644 --- a/tools/libs/light/libxl_arm.c +++ b/tools/libs/light/libxl_arm.c @@ -181,7 +181,8 @@ int libxl__arch_domain_prepare_config(libxl__gc *gc, LOG(DEBUG, "Configure the domain"); -config->arch.nr_spis = nr_spis; +/* gic_number_lines() is 192 for Xilinx ZynqMP. min nr_spis = 192 - 32. */ +config->arch.nr_spis = MAX(nr_spis, 160); Is there a way that that Xen or libxl could find out what the minimum number of SPI needs to be? I am afraid currently there is none. Are we going to have to increase that minimum number every time a new platform comes along? It doesn't appear that libxl is using that `nr_spis` value and it is probably just given to Xen. So my guess is that Xen could simply take care of the minimum value, gic_number_lines() seems to be a Xen function. Xen will take care of the value of nr_spis for dom0 in create_dom0() dom0_cfg.arch.nr_spis = min(gic_number_lines(), (unsigned int) 992) - 32; and also for dom0less domUs in create_domUs(). However, it looks like Xen will not take care of the mininum value for libxl guests, the value from config->arch.nr_spis in guest config file will be directly passed to the domain_vgic_init() function from arch_domain_create(). I agree with you that we shouldn't just bump the number everytime when we have a new platform. Therefore, would it be a good idea to move the logic in this patch to arch_sanitise_domain_config()? Kind regards, Henry Thanks,
Re: [PATCH 04/15] tools/libs/light: Always enable IOMMU
Hi Anthony, On 5/1/2024 9:47 PM, Anthony PERARD wrote: On Wed, Apr 24, 2024 at 11:34:38AM +0800, Henry Wang wrote: For overlay with iommu functionality to work with running VMs, we need to enable IOMMU when iomem presents for the domains. Signed-off-by: Vikram Garhwal Signed-off-by: Henry Wang --- tools/libs/light/libxl_arm.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c index 1cb89fa584..dd5c9f4917 100644 --- a/tools/libs/light/libxl_arm.c +++ b/tools/libs/light/libxl_arm.c @@ -222,6 +222,12 @@ int libxl__arch_domain_prepare_config(libxl__gc *gc, config->arch.sve_vl = d_config->b_info.arch_arm.sve_vl / 128U; } +#ifdef LIBXL_HAVE_DT_OVERLAY libxl_arm.c is only build on Arm, so this should be defined, so no need to check. Ah sure, I was just thought in the future RISC-V/PPC may have the same, but you are correct. I will remove the check. +if (d_config->b_info.num_iomem) { +config->flags |= XEN_DOMCTL_CDF_iommu; Is this doing the same thing as the previous patch? I think so, yes, we need the IOMMU flag to be set if we want to assign a device from a DT node protected by IOMMU. Kind regards, Henry Thanks,
Re: [PATCH 07/15] xen/overlay: Enable device tree overlay assignment to running domains
Hi Stefano, Julien, On 5/3/2024 2:02 AM, Stefano Stabellini wrote: On Tue, 30 Apr 2024, Henry Wang wrote: Hi Julien, On 4/30/2024 1:34 AM, Julien Grall wrote: On 29/04/2024 04:36, Henry Wang wrote: Hi Jan, Julien, Stefano, Hi Henry, On 4/24/2024 2:05 PM, Jan Beulich wrote: On 24.04.2024 05:34, Henry Wang wrote: --- a/xen/include/public/sysctl.h +++ b/xen/include/public/sysctl.h @@ -1197,7 +1197,9 @@ struct xen_sysctl_dt_overlay { #define XEN_SYSCTL_DT_OVERLAY_ADD 1 #define XEN_SYSCTL_DT_OVERLAY_REMOVE 2 uint8_t overlay_op; /* IN: Add or remove. */ - uint8_t pad[3]; /* IN: Must be zero. */ + bool domain_mapping; /* IN: True of False. */ + uint8_t pad[2]; /* IN: Must be zero. */ + uint32_t domain_id; }; If you merely re-purposed padding fields, all would be fine without bumping the interface version. Yet you don't, albeit for an unclear reason: Why uint32_t rather than domid_t? And on top of that - why a separate boolean when you could use e.g. DOMID_INVALID to indicate "no domain mapping"? I think both of your suggestion make great sense. I will follow the suggestion in v2. That said - anything taking a domain ID is certainly suspicious in a sysctl. Judging from the description you really mean this to be a domctl. Anything else will require extra justification. I also think a domctl is better. I had a look at the history of the already merged series, it looks like in the first version of merged part 1 [1], the hypercall was implemented as the domctl in the beginning but later in v2 changed to sysctl. I think this makes sense as the scope of that time is just to make Xen aware of the device tree node via Xen device tree. However this is now a problem for the current part where the scope (and the end goal) is extended to assign the added device to Linux Dom0/DomU via device tree overlays. I am not sure which way is better, should we repurposing the sysctl to domctl or maybe add another domctl (I am worrying about the duplication because basically we need the same sysctl functionality but now with a domid in it)? What do you think? I am not entirely sure this is a good idea to try to add the device in Xen and attach it to the guests at the same time. Imagine the following situation: 1) Add and attach devices 2) The domain is rebooted 3) Detach and remove devices After step 2, you technically have a new domain. You could have also a case where this is a completely different guest. So the flow would look a little bit weird (you create the DT overlay with domain A but remove with domain B). So, at the moment, it feels like the add/attach (resp detech/remove) operations should happen separately. Can you clarify why you want to add devices to Xen and attach to a guest within a single hypercall? Sorry I don't know if there is any specific thoughts on the design of using a single hypercall to do both add devices to Xen device tree and assign the device to the guest. In fact seeing your above comments, I think separating these two functionality to two xl commands using separated hypercalls would indeed be a better idea. Thank you for the suggestion! To make sure I understand correctly, would you mind confirming if below actions for v2 make sense to you? Thanks! - Only use the XEN_SYSCTL_DT_OVERLAY_{ADD, REMOVE} sysctls to add/remove overlay to Xen device tree - Introduce the xl dt-overlay attach command and respective domctls to do the device assignment for the overlay to domain. I think two hypercalls is OK. The original idea was to have a single xl command to do the operation for user convenience (even that is not a hard requirement) but that can result easily in two hypercalls. Ok, sounds good. I will break the command to two hypercalls and try to reuse the existing domctls for assign/remove IRQ/MMIO ranges. Kind regards, Henry
Re: [PATCH 1/3] xen/arm/dom0less-build: Alloc magic pages for Dom0less DomUs from hypervisor
Hi Daniel, On 4/30/2024 6:22 PM, Daniel P. Smith wrote: On 4/29/24 22:55, Henry Wang wrote: Hi Daniel, On 4/30/2024 8:27 AM, Daniel P. Smith wrote: On 4/25/24 23:14, Henry Wang wrote: There are use cases (for example using the PV driver) in Dom0less setup that require Dom0less DomUs start immediately with Dom0, but initialize XenStore later after Dom0's successful boot and call to the init-dom0less application. An error message can seen from the init-dom0less application on 1:1 direct-mapped domains: ``` Allocating magic pages memory.c:238:d0v0 mfn 0x39000 doesn't belong to d1 Error on alloc magic pages ``` This is because currently the magic pages for Dom0less DomUs are populated by the init-dom0less app through populate_physmap(), and populate_physmap() automatically assumes gfn == mfn for 1:1 direct mapped domains. This cannot be true for the magic pages that are allocated later from the init-dom0less application executed in Dom0. For domain using statically allocated memory but not 1:1 direct-mapped, similar error "failed to retrieve a reserved page" can be seen as the reserved memory list is empty at that time. To solve above issue, this commit allocates the magic pages for Dom0less DomUs at the domain construction time. The base address/PFN of the magic page region will be noted and communicated to the init-dom0less application in Dom0. Might I suggest we not refer to these as magic pages? I would consider them as hypervisor reserved pages for the VM to have access to virtual platform capabilities. We may see this expand in the future for some unforeseen, new capability. I think magic page is a specific terminology to refer to these pages, see alloc_magic_pages() for both x86 and Arm. I will reword the last paragraph of the commit message to refer them as "hypervisor reserved pages (currently used as magic pages on Arm)" if this sounds good to you. I would highlight that is a term used in the toolstack, while is probably not the best, there is no reason to change in there, but the hypervisor does not carry that terminology. IMHO we should not introduce it there and be explicit about why the pages are getting reserved. Thanks for the suggestion. I will rework the commit message. Kind regards, Henry v/r, dps
Re: [PATCH v1.1] xen/commom/dt-overlay: Fix missing lock when remove the device
Hi Julien, On 5/3/2024 9:04 PM, Julien Grall wrote: Hi Henry, On 26/04/2024 02:55, Henry Wang wrote: If CONFIG_DEBUG=y, below assertion will be triggered: (XEN) Assertion 'rw_is_locked(_host_lock)' failed at drivers/passthrough/device_tree.c:146 (XEN) [ Xen-4.19-unstable arm64 debug=y Not tainted ] (XEN) CPU: 0 (XEN) PC: 0a257418 iommu_remove_dt_device+0x8c/0xd4 (XEN) LR: 0a2573a0 (XEN) SP: 8000fff7fb30 (XEN) CPSR: 0249 MODE:64-bit EL2h (Hypervisor, handler) [...] (XEN) Xen call trace: (XEN) [<0a257418>] iommu_remove_dt_device+0x8c/0xd4 (PC) (XEN) [<0a2573a0>] iommu_remove_dt_device+0x14/0xd4 (LR) (XEN) [<0a20797c>] dt-overlay.c#remove_node_resources+0x8c/0x90 (XEN) [<0a207f14>] dt-overlay.c#remove_nodes+0x524/0x648 (XEN) [<0a208460>] dt_overlay_sysctl+0x428/0xc68 (XEN) [<0a2707f8>] arch_do_sysctl+0x1c/0x2c (XEN) [<0a230b40>] do_sysctl+0x96c/0x9ec (XEN) [<0a271e08>] traps.c#do_trap_hypercall+0x1e8/0x288 (XEN) [<0a273490>] do_trap_guest_sync+0x448/0x63c (XEN) [<0a25c480>] entry.o#guest_sync_slowpath+0xa8/0xd8 (XEN) (XEN) (XEN) (XEN) Panic on CPU 0: (XEN) Assertion 'rw_is_locked(_host_lock)' failed at drivers/passthrough/device_tree.c:146 (XEN) This is because iommu_remove_dt_device() is called without taking the dt_host_lock. Fix the issue by taking and releasing the lock properly. Fixes: 7e5c4a8b86f1 ("xen/arm: Implement device tree node removal functionalities") Signed-off-by: Henry Wang --- v1.1: - Move the unlock position before the check of rc. --- xen/common/dt-overlay.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xen/common/dt-overlay.c b/xen/common/dt-overlay.c index 1b197381f6..ab8f43aea2 100644 --- a/xen/common/dt-overlay.c +++ b/xen/common/dt-overlay.c @@ -381,7 +381,9 @@ static int remove_node_resources(struct dt_device_node *device_node) { if ( dt_device_is_protected(device_node) ) { + write_lock(_host_lock); Looking at the code, we are not modifying the device_node, so shouldn't this be a read_lock()? Hmm yes, however after seeing your comment... That said, even though either fix your issue, I am not entirely convinced this is the correct position for the lock. From my understanding, dt_host_lock is meant to ensure that the DT node will not disappear behind your back. So in theory, shouldn't the lock be taken as soon as you get hold of device_node? ...here. I believe you made a point here so I think I will just move the write_lock(_host_lock) as soon as getting overlay_node, i.e. on top of the call to remove_descendant_nodes_resources(). Therefore we can solve the assertion issue of this patch together. Kind regards, Henry Cheers,
Re: [PATCH 2/3] xen/arm, tools: Add a new HVM_PARAM_MAGIC_BASE_PFN key in HVMOP
Hi Stefano, On 5/3/2024 2:08 AM, Stefano Stabellini wrote: On Fri, 26 Apr 2024, Henry Wang wrote: For use cases such as Dom0less PV drivers, a mechanism to communicate Dom0less DomU's static data with the runtime control plane (Dom0) is needed. Since on Arm HVMOP is already the existing approach to address such use cases (for example the allocation of HVM_PARAM_CALLBACK_IRQ), add a new HVMOP key HVM_PARAM_MAGIC_BASE_PFN for storing the magic page region base PFN. The value will be set at Dom0less DomU construction time after Dom0less DomU's magic page region has been allocated. To keep consistent, also set the value for HVM_PARAM_MAGIC_BASE_PFN for libxl guests in alloc_magic_pages(). Reported-by: Alec Kwapis Signed-off-by: Henry Wang --- tools/libs/guest/xg_dom_arm.c | 2 ++ xen/arch/arm/dom0less-build.c | 2 ++ xen/arch/arm/hvm.c | 1 + xen/include/public/hvm/params.h | 1 + 4 files changed, 6 insertions(+) diff --git a/tools/libs/guest/xg_dom_arm.c b/tools/libs/guest/xg_dom_arm.c index 8cc7f27dbb..3c08782d1d 100644 --- a/tools/libs/guest/xg_dom_arm.c +++ b/tools/libs/guest/xg_dom_arm.c @@ -74,6 +74,8 @@ static int alloc_magic_pages(struct xc_dom_image *dom) xc_clear_domain_page(dom->xch, dom->guest_domid, base + MEMACCESS_PFN_OFFSET); xc_clear_domain_page(dom->xch, dom->guest_domid, dom->vuart_gfn); +xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_MAGIC_BASE_PFN, +base); xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_PFN, dom->console_pfn); xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_PFN, diff --git a/xen/arch/arm/dom0less-build.c b/xen/arch/arm/dom0less-build.c index 40dc85c759..72187c167d 100644 --- a/xen/arch/arm/dom0less-build.c +++ b/xen/arch/arm/dom0less-build.c @@ -861,6 +861,8 @@ static int __init construct_domU(struct domain *d, free_domheap_pages(magic_pg, get_order_from_pages(NR_MAGIC_PAGES)); return rc; } + +d->arch.hvm.params[HVM_PARAM_MAGIC_BASE_PFN] = gfn_x(gfn); I apologize as I have not read the whole email thread in reply to this patch. Why do we need to introduce a new hvm param instead of just setting HVM_PARAM_CONSOLE_PFN and HVM_PARAM_STORE_PFN directly here? Yeah this is a good question, I aIso thought about this but in the end didn't do that directly because I don't really want to break the current protocol between Linux, Xen and toolstack. In docs/features/dom0less.pandoc, section "PV Drivers", there is a communication protocol saying that Xen should keep the HVM_PARAM_STORE_PFN to ~0ULL until the toolstack sets the HVM_PARAM_STORE_PFN. I am open to change the protocol (changes might be needed in the Linux side too), if it is ok to do that, I can set the HVM params here directly and change the doc accordingly. Kind regards, Henry
[linux-linus test] 185920: tolerable FAIL - PUSHED
flight 185920 linux-linus real [real] flight 185921 linux-linus real-retest [real] http://logs.test-lab.xenproject.org/osstest/logs/185920/ http://logs.test-lab.xenproject.org/osstest/logs/185921/ Failures :-/ but no regressions. Tests which are failing intermittently (not blocking): test-armhf-armhf-xl-qcow2 8 xen-bootfail pass in 185921-retest Tests which did not succeed, but are not blocking: test-armhf-armhf-libvirt 16 saverestore-support-check fail blocked in 185915 test-armhf-armhf-xl-qcow2 14 migrate-support-check fail in 185921 never pass test-armhf-armhf-xl-qcow2 15 saverestore-support-check fail in 185921 never pass test-amd64-amd64-xl-qemut-win7-amd64 19 guest-stopfail like 185915 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 185915 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 185915 test-armhf-armhf-examine 8 reboot fail like 185915 test-amd64-amd64-xl-qemut-ws16-amd64 19 guest-stopfail like 185915 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 185915 test-amd64-amd64-libvirt 15 migrate-support-checkfail never pass test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail never pass test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check fail never pass test-arm64-arm64-xl-credit1 15 migrate-support-checkfail never pass test-arm64-arm64-xl-xsm 15 migrate-support-checkfail never pass test-arm64-arm64-xl-credit1 16 saverestore-support-checkfail never pass test-arm64-arm64-xl-xsm 16 saverestore-support-checkfail never pass test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail never pass test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail never pass test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail never pass test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail never pass test-arm64-arm64-xl-credit2 15 migrate-support-checkfail never pass test-arm64-arm64-xl-credit2 16 saverestore-support-checkfail never pass test-arm64-arm64-xl 15 migrate-support-checkfail never pass test-arm64-arm64-xl 16 saverestore-support-checkfail never pass test-armhf-armhf-xl 15 migrate-support-checkfail never pass test-armhf-armhf-xl 16 saverestore-support-checkfail never pass test-armhf-armhf-xl-rtds 15 migrate-support-checkfail never pass test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail never pass test-amd64-amd64-libvirt-qcow2 14 migrate-support-checkfail never pass test-amd64-amd64-libvirt-raw 14 migrate-support-checkfail never pass test-armhf-armhf-xl-arndale 15 migrate-support-checkfail never pass test-armhf-armhf-xl-arndale 16 saverestore-support-checkfail never pass test-amd64-amd64-libvirt-vhd 14 migrate-support-checkfail never pass test-armhf-armhf-xl-credit1 15 migrate-support-checkfail never pass test-armhf-armhf-xl-credit1 16 saverestore-support-checkfail never pass test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail never pass test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail never pass test-arm64-arm64-xl-vhd 14 migrate-support-checkfail never pass test-arm64-arm64-xl-vhd 15 saverestore-support-checkfail never pass test-armhf-armhf-xl-credit2 15 migrate-support-checkfail never pass test-armhf-armhf-xl-credit2 16 saverestore-support-checkfail never pass test-armhf-armhf-libvirt 15 migrate-support-checkfail never pass test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail never pass test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail never pass test-armhf-armhf-libvirt-vhd 14 migrate-support-checkfail never pass test-armhf-armhf-libvirt-vhd 15 saverestore-support-checkfail never pass test-armhf-armhf-xl-raw 14 migrate-support-checkfail never pass test-armhf-armhf-xl-raw 15 saverestore-support-checkfail never pass version targeted for testing: linuxb9158815de525572333d0499a681459f6b075f28 baseline version: linux7367539ad4b0f8f9b396baf02110962333719a48 Last test of basis 185915 2024-05-04 01:14:15 Z2 days Testing same since 185920 2024-05-05 17:44:03 Z0 days1 attempts People who touched revisions under test: Adam Skladowski Alan Stern Alexander Usyskin Amit Sunil Dhamne Badhri Jagan Sridharan Bjorn Andersson Chad Wagner Chris Wulff Christoph Hellwig Conor Dooley Daniele Ceraolo Spurio Diego Roversi Dmitry Torokhov Frank Oltmanns Greg Kroah-Hartman Guenter Roeck
[PATCH v3 2/3] Update Xen's features.h header
Update it to get XENFEAT_dm_msix_all_writes for the next patch. Signed-off-by: Marek Marczykowski-Górecki --- include/hw/xen/interface/features.h | 17 + 1 file changed, 17 insertions(+) diff --git a/include/hw/xen/interface/features.h b/include/hw/xen/interface/features.h index d2a9175..8801930 100644 --- a/include/hw/xen/interface/features.h +++ b/include/hw/xen/interface/features.h @@ -111,6 +111,23 @@ #define XENFEAT_not_direct_mapped 16 #define XENFEAT_direct_mapped 17 +/* + * Signal whether the domain is able to use the following hypercalls: + * + * VCPUOP_register_runstate_phys_area + * VCPUOP_register_vcpu_time_phys_area + */ +#define XENFEAT_runstate_phys_area18 +#define XENFEAT_vcpu_time_phys_area 19 + +/* + * If set, Xen will passthrough all MSI-X vector ctrl writes to device model, + * not only those unmasking an entry. This allows device model to properly keep + * track of the MSI-X table without having to read it from the device behind + * Xen's backs. This information is relevant only for device models. + */ +#define XENFEAT_dm_msix_all_writes20 + #define XENFEAT_NR_SUBMAPS 1 #endif /* __XEN_PUBLIC_FEATURES_H__ */ -- git-series 0.9.1
[PATCH v3 1/3] hw/xen/xen_pt: Save back data only for declared registers
Call pci_default_write_config() only after resolving any handlers from XenPTRegInfo structures, and only with a value updated with those handlers. This is important for two reasons: 1. XenPTRegInfo has ro_mask which needs to be enforced - Xen-specific hooks do that on their own (especially xen_pt_*_reg_write()). 2. Not setting value early allows hooks to see the old value too. If it would be only about the first point, setting PCIDevice.wmask would probably be sufficient, but given the second point, change those writes. Relevant handlers already save data back to the emulated registers space, call the pci_default_write_config() only for its side effects. Signed-off-by: Marek Marczykowski-Górecki --- v3: - use emulated register value for pci_default_write_config() call, not the one for writting back to the hardware - greatly simplify the patch by calling pci_default_write_config() on the whole value v2: - rewrite commit message, previous one was very misleading - fix loop saving register values - fix int overflow when calculating write mask --- hw/xen/xen_pt.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c index 3635d1b..5f12d3c 100644 --- a/hw/xen/xen_pt.c +++ b/hw/xen/xen_pt.c @@ -311,7 +311,6 @@ static void xen_pt_pci_write_config(PCIDevice *d, uint32_t addr, } memory_region_transaction_begin(); -pci_default_write_config(d, addr, val, len); /* adjust the read and write value to appropriate CFC-CFF window */ read_val <<= (addr & 3) << 3; @@ -397,6 +396,12 @@ static void xen_pt_pci_write_config(PCIDevice *d, uint32_t addr, /* need to shift back before passing them to xen_host_pci_set_block. */ val >>= (addr & 3) << 3; +/* Call default handler for its side effects only, with value already + * written by specific handlers. */ +pci_default_write_config(d, addr, + pci_default_read_config(d, addr, len), + len); + memory_region_transaction_commit(); out: -- git-series 0.9.1
[PATCH v3 3/3] Do not access /dev/mem in MSI-X PCI passthrough on Xen
The /dev/mem is used for two purposes: - reading PCI_MSIX_ENTRY_CTRL_MASKBIT - reading Pending Bit Array (PBA) The first one was originally done because when Xen did not send all vector ctrl writes to the device model, so QEMU might have outdated old register value. If Xen is new enough, this has been changed, so QEMU can now use its cached value of the register instead. Detect the "new enough" based on XENFEAT_dm_msix_all_writes bit in XENVER_get_features. The Pending Bit Array (PBA) handling is for the case where it lives on the same page as the MSI-X table itself. Xen has been extended to handle this case too (as well as other registers that may live on those pages), so QEMU handling is not necessary anymore. Additionally, reading from /dev/mem is trapped and emulated by Xen, so QEMU doesn't see real values anyway. And if it did, this method is prone to race conditions. Removing /dev/mem access is useful to work within stubdomain (avoids emulated reads and potential races), and necessary when dom0 kernel runs in lockdown mode (where /dev/mem is unavailable at all). Signed-off-by: Marek Marczykowski-Górecki --- Changes in v2: - Make change conditional on new Xen version (tested via XENFEAT_dm_msix_all_writes) - add few comments --- hw/xen/xen_pt_msi.c | 94 -- 1 file changed, 59 insertions(+), 35 deletions(-) diff --git a/hw/xen/xen_pt_msi.c b/hw/xen/xen_pt_msi.c index 09cca4e..836cc9c 100644 --- a/hw/xen/xen_pt_msi.c +++ b/hw/xen/xen_pt_msi.c @@ -460,15 +460,23 @@ static void pci_msix_write(void *opaque, hwaddr addr, entry->updated = true; } else if (msix->enabled && entry->updated && !(val & PCI_MSIX_ENTRY_CTRL_MASKBIT)) { -const volatile uint32_t *vec_ctrl; - /* - * If Xen intercepts the mask bit access, entry->vec_ctrl may not be - * up-to-date. Read from hardware directly. + * Reading mask bit from hardware directly is needed on older Xen only. */ -vec_ctrl = s->msix->phys_iomem_base + entry_nr * PCI_MSIX_ENTRY_SIZE -+ PCI_MSIX_ENTRY_VECTOR_CTRL; -xen_pt_msix_update_one(s, entry_nr, *vec_ctrl); +if (s->msix->phys_iomem_base) { +/* Memory mapped registers */ +const volatile uint32_t *vec_ctrl; + +/* + * If Xen intercepts the mask bit access, entry->vec_ctrl may not be + * up-to-date. Read from hardware directly. + */ +vec_ctrl = s->msix->phys_iomem_base + entry_nr * PCI_MSIX_ENTRY_SIZE ++ PCI_MSIX_ENTRY_VECTOR_CTRL; +xen_pt_msix_update_one(s, entry_nr, *vec_ctrl); +} else { +xen_pt_msix_update_one(s, entry_nr, entry->latch(VECTOR_CTRL)); +} } set_entry_value(entry, offset, val); @@ -493,7 +501,12 @@ static uint64_t pci_msix_read(void *opaque, hwaddr addr, return get_entry_value(>msix_entry[entry_nr], offset); } else { /* Pending Bit Array (PBA) */ -return *(uint32_t *)(msix->phys_iomem_base + addr); +if (s->msix->phys_iomem_base) { +return *(uint32_t *)(msix->phys_iomem_base + addr); +} +XEN_PT_LOG(>dev, "reading PBA, addr 0x%lx, offset 0x%lx\n", + addr, addr - msix->total_entries * PCI_MSIX_ENTRY_SIZE); +return 0x; } } @@ -528,8 +541,8 @@ int xen_pt_msix_init(XenPCIPassthroughState *s, uint32_t base) uint32_t table_off = 0; int i, total_entries, bar_index; XenHostPCIDevice *hd = >real_device; +xen_feature_info_t xc_version_info = { 0 }; PCIDevice *d = >dev; -int fd = -1; XenPTMSIX *msix = NULL; int rc = 0; @@ -543,6 +556,10 @@ int xen_pt_msix_init(XenPCIPassthroughState *s, uint32_t base) return -1; } +if (xc_version(xen_xc, XENVER_get_features, _version_info) < 0) { +return -1; +} + rc = xen_host_pci_get_word(hd, base + PCI_MSIX_FLAGS, ); if (rc) { XEN_PT_ERR(d, "Failed to read PCI_MSIX_FLAGS field\n"); @@ -576,33 +593,40 @@ int xen_pt_msix_init(XenPCIPassthroughState *s, uint32_t base) msix->table_base = s->real_device.io_regions[bar_index].base_addr; XEN_PT_LOG(d, "get MSI-X table BAR base 0x%"PRIx64"\n", msix->table_base); -fd = open("/dev/mem", O_RDWR); -if (fd == -1) { -rc = -errno; -XEN_PT_ERR(d, "Can't open /dev/mem: %s\n", strerror(errno)); -goto error_out; -} -XEN_PT_LOG(d, "table_off = 0x%x, total_entries = %d\n", - table_off, total_entries); -msix->table_offset_adjust = table_off & 0x0fff; -msix->phys_iomem_base = -mmap(NULL, - total_entries * PCI_MSIX_ENTRY_SIZE + msix->table_offset_adjust, - PROT_READ, - MAP_SHARED | MAP_LOCKED, - fd, - msix->table_base + table_off - msix->table_offset_adjust); -close(fd); -if
Re: [PATCH for-4.19] ppc/riscv: fix arch_acquire_resource_check()
Hi Roger, On 4/30/24 10:34 AM, Roger Pau Monne wrote: > None of the implementations support set_foreign_p2m_entry() yet, neither they > have a p2m walk in domain_relinquish_resources() in order to remove the > foreign > mappings from the p2m and thus drop the extra refcounts. > > Adjust the arch helpers to return false and introduce a comment that clearly > states it is not only taking extra refcounts that's needed, but also dropping > them on domain teardown. > > Fixes: 4988704e00d8 ('xen/riscv: introduce p2m.h') > Fixes: 4a2f68f90930 ('xen/ppc: Define minimal stub headers required for full > build') > Signed-off-by: Roger Pau Monné > --- This makes sense to me. This stub implementation was definitely an oversight on my part. Acked-by: Shawn Anastasio Thanks, Shawn
[xen-unstable test] 185919: tolerable FAIL
flight 185919 xen-unstable real [real] http://logs.test-lab.xenproject.org/osstest/logs/185919/ Failures :-/ but no regressions. Tests which are failing intermittently (not blocking): test-armhf-armhf-xl-arndale 8 xen-boot fail pass in 185917 test-amd64-amd64-xl-qemut-debianhvm-i386-xsm 12 debian-hvm-install fail pass in 185917 Tests which did not succeed, but are not blocking: test-armhf-armhf-xl-arndale 15 migrate-support-check fail in 185917 never pass test-armhf-armhf-xl-arndale 16 saverestore-support-check fail in 185917 never pass test-amd64-amd64-xl-qemut-win7-amd64 19 guest-stopfail like 185917 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 185917 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 185917 test-armhf-armhf-libvirt 16 saverestore-support-checkfail like 185917 test-armhf-armhf-xl-credit2 8 xen-boot fail like 185917 test-amd64-amd64-xl-qemut-ws16-amd64 19 guest-stopfail like 185917 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 185917 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail never pass test-amd64-amd64-libvirt 15 migrate-support-checkfail never pass test-arm64-arm64-xl-xsm 15 migrate-support-checkfail never pass test-arm64-arm64-xl-xsm 16 saverestore-support-checkfail never pass test-arm64-arm64-xl 15 migrate-support-checkfail never pass test-arm64-arm64-xl 16 saverestore-support-checkfail never pass test-arm64-arm64-xl-credit1 15 migrate-support-checkfail never pass test-arm64-arm64-xl-credit1 16 saverestore-support-checkfail never pass test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check fail never pass test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail never pass test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail never pass test-arm64-arm64-xl-credit2 15 migrate-support-checkfail never pass test-arm64-arm64-xl-credit2 16 saverestore-support-checkfail never pass test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail never pass test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail never pass test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail never pass test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail never pass test-amd64-amd64-libvirt-qcow2 14 migrate-support-checkfail never pass test-amd64-amd64-libvirt-raw 14 migrate-support-checkfail never pass test-armhf-armhf-xl 15 migrate-support-checkfail never pass test-armhf-armhf-xl 16 saverestore-support-checkfail never pass test-amd64-amd64-libvirt-vhd 14 migrate-support-checkfail never pass test-armhf-armhf-xl-credit1 15 migrate-support-checkfail never pass test-armhf-armhf-xl-credit1 16 saverestore-support-checkfail never pass test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail never pass test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail never pass test-arm64-arm64-xl-vhd 14 migrate-support-checkfail never pass test-arm64-arm64-xl-vhd 15 saverestore-support-checkfail never pass test-armhf-armhf-xl-qcow214 migrate-support-checkfail never pass test-armhf-armhf-xl-qcow215 saverestore-support-checkfail never pass test-armhf-armhf-libvirt 15 migrate-support-checkfail never pass test-armhf-armhf-xl-rtds 15 migrate-support-checkfail never pass test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail never pass test-armhf-armhf-xl-raw 14 migrate-support-checkfail never pass test-armhf-armhf-xl-raw 15 saverestore-support-checkfail never pass test-armhf-armhf-libvirt-vhd 14 migrate-support-checkfail never pass test-armhf-armhf-libvirt-vhd 15 saverestore-support-checkfail never pass version targeted for testing: xen f95cd010cbf0914154a0c2775c979d9158b1a3cb baseline version: xen f95cd010cbf0914154a0c2775c979d9158b1a3cb Last test of basis 185919 2024-05-05 01:57:20 Z0 days Testing same since (not found) 0 attempts jobs: build-amd64-xsm pass build-arm64-xsm pass build-i386-xsm pass build-amd64-xtf pass build-amd64 pass build-arm64 pass build-armhf pass build-i386 pass