Re: [PATCH 07/12] s390x/pci: enable for load/store intepretation
On 12/15/21 2:44 AM, Pierre Morel wrote: On 12/7/21 22:04, Matthew Rosato wrote: Use the associated vfio feature ioctl to enable interpretation for devices when requested. As part of this process, we must use the host function handle rather than a QEMU-generated one -- this is provided as part of the ioctl payload. Signed-off-by: Matthew Rosato --- hw/s390x/s390-pci-bus.c | 69 +++- hw/s390x/s390-pci-inst.c | 63 - hw/s390x/s390-pci-vfio.c | 55 + include/hw/s390x/s390-pci-bus.h | 1 + include/hw/s390x/s390-pci-vfio.h | 15 +++ 5 files changed, 201 insertions(+), 2 deletions(-) diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 01b58ebc70..451bd32d92 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -971,12 +971,57 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr) } } +static int s390_pci_interp_plug(S390pciState *s, S390PCIBusDevice *pbdev) +{ + uint32_t idx; + int rc; + + rc = s390_pci_probe_interp(pbdev); + if (rc) { + return rc; + } + + rc = s390_pci_update_passthrough_fh(pbdev); + if (rc) { + return rc; + } + + /* + * The host device is in an enabled state, but the device must + * begin as disabled for the guest so mask off the enable bit + * from the passthrough handle. + */ I think you should explain why the device must be seen disabled for the guest. AFAIU it is because we need the guest to issue a CLP_ENABLE for us to activate interpretation. This is due to the choice of activate/deactivate interpretation during device enable/disable. Not completely. While it is true that we need the guest to issue the CLP_ENABLE to activate interpretation with the way this is currently implemented, existing code also starts all plugged devices with a QEMU internal state of pbdev->state = ZPCI_FS_DISABLED; and a disabled pbdev->fh, thus expecting a subsequent CLP ENABLE from the guest when/if it decides to use the device. If we were to set the handle to an enabled state here, we must also udpate the pbdev state, introducing a difference in how we present intercept/emulated devices vs interpreted devices during plug. I don't think we want to do that -- but I can improve this comment here to try and better explain that all devices begin plugged as disabled to the guest Just curious: why not activate/deactivate interpretation on plug/unplug? I think it would be possible to do so (while still showing a disabled handle initially), but my thinking is that tying these actions to guest CLP disable/enable will also be useful later when looking at trying to better-handle error events from the guest e.g. hot reset.
Re: [PATCH 07/12] s390x/pci: enable for load/store intepretation
On 12/7/21 22:04, Matthew Rosato wrote: Use the associated vfio feature ioctl to enable interpretation for devices when requested. As part of this process, we must use the host function handle rather than a QEMU-generated one -- this is provided as part of the ioctl payload. Signed-off-by: Matthew Rosato --- hw/s390x/s390-pci-bus.c | 69 +++- hw/s390x/s390-pci-inst.c | 63 - hw/s390x/s390-pci-vfio.c | 55 + include/hw/s390x/s390-pci-bus.h | 1 + include/hw/s390x/s390-pci-vfio.h | 15 +++ 5 files changed, 201 insertions(+), 2 deletions(-) diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 01b58ebc70..451bd32d92 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -971,12 +971,57 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr) } } +static int s390_pci_interp_plug(S390pciState *s, S390PCIBusDevice *pbdev) +{ +uint32_t idx; +int rc; + +rc = s390_pci_probe_interp(pbdev); +if (rc) { +return rc; +} + +rc = s390_pci_update_passthrough_fh(pbdev); +if (rc) { +return rc; +} + +/* + * The host device is in an enabled state, but the device must + * begin as disabled for the guest so mask off the enable bit + * from the passthrough handle. + */ I think you should explain why the device must be seen disabled for the guest. AFAIU it is because we need the guest to issue a CLP_ENABLE for us to activate interpretation. This is due to the choice of activate/deactivate interpretation during device enable/disable. Just curious: why not activate/deactivate interpretation on plug/unplug? +pbdev->fh &= ~FH_MASK_ENABLE; + +/* Next, see if the idx is already in-use */ +idx = pbdev->fh & FH_MASK_INDEX; +if (pbdev->idx != idx) { +if (s390_pci_find_dev_by_idx(s, idx)) { +return -EINVAL; +} +/* + * Update the idx entry with the passed through idx + * If the relinquised idx is lower than next_idx, use it + * to replace next_idx + */ +g_hash_table_remove(s->zpci_table, >idx); +if (idx < s->next_idx) { +s->next_idx = idx; +} +pbdev->idx = idx; +g_hash_table_insert(s->zpci_table, >idx, pbdev); +} + +return 0; +} + static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev); PCIDevice *pdev = NULL; S390PCIBusDevice *pbdev = NULL; +int rc; if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { PCIBridge *pb = PCI_BRIDGE(dev); @@ -1022,12 +1067,33 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, set_pbdev_info(pbdev); if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { -pbdev->fh |= FH_SHM_VFIO; +/* + * By default, interpretation is always requested; if the available + * facilities indicate it is not available, fallback to the + * intercept model. + */ +if (pbdev->interp && !s390_has_feat(S390_FEAT_ZPCI_INTERP)) { +DPRINTF("zPCI interpretation facilities missing.\n"); +pbdev->interp = false; +} +if (pbdev->interp) { +rc = s390_pci_interp_plug(s, pbdev); +if (rc) { +error_setg(errp, "zpci interp plug failed: %d", rc); +return; +} +} pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); /* Fill in CLP information passed via the vfio region */ s390_pci_get_clp_info(pbdev); +if (!pbdev->interp) { +/* Do vfio passthrough but intercept for I/O */ +pbdev->fh |= FH_SHM_VFIO; +} } else { pbdev->fh |= FH_SHM_EMUL; +/* Always intercept emulated devices */ +pbdev->interp = false; } if (s390_pci_msix_init(pbdev)) { @@ -1360,6 +1426,7 @@ static Property s390_pci_device_properties[] = { DEFINE_PROP_UINT16("uid", S390PCIBusDevice, uid, UID_UNDEFINED), DEFINE_PROP_S390_PCI_FID("fid", S390PCIBusDevice, fid), DEFINE_PROP_STRING("target", S390PCIBusDevice, target), +DEFINE_PROP_BOOL("interp", S390PCIBusDevice, interp, true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 0cef7fbace..ba4017474e 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -18,6 +18,7 @@ #include "sysemu/hw_accel.h" #include "hw/s390x/s390-pci-inst.h" #include "hw/s390x/s390-pci-bus.h" +#include "hw/s390x/s390-pci-vfio.h"
Re: [PATCH 07/12] s390x/pci: enable for load/store intepretation
On 07/12/2021 22.04, Matthew Rosato wrote: Use the associated vfio feature ioctl to enable interpretation for devices when requested. As part of this process, we must use the host function handle rather than a QEMU-generated one -- this is provided as part of the ioctl payload. Signed-off-by: Matthew Rosato --- hw/s390x/s390-pci-bus.c | 69 +++- hw/s390x/s390-pci-inst.c | 63 - hw/s390x/s390-pci-vfio.c | 55 + include/hw/s390x/s390-pci-bus.h | 1 + include/hw/s390x/s390-pci-vfio.h | 15 +++ 5 files changed, 201 insertions(+), 2 deletions(-) diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 01b58ebc70..451bd32d92 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -971,12 +971,57 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr) } } +static int s390_pci_interp_plug(S390pciState *s, S390PCIBusDevice *pbdev) +{ +uint32_t idx; +int rc; + +rc = s390_pci_probe_interp(pbdev); +if (rc) { +return rc; +} + +rc = s390_pci_update_passthrough_fh(pbdev); +if (rc) { +return rc; +} + +/* + * The host device is in an enabled state, but the device must + * begin as disabled for the guest so mask off the enable bit + * from the passthrough handle. + */ +pbdev->fh &= ~FH_MASK_ENABLE; + +/* Next, see if the idx is already in-use */ +idx = pbdev->fh & FH_MASK_INDEX; +if (pbdev->idx != idx) { +if (s390_pci_find_dev_by_idx(s, idx)) { +return -EINVAL; +} +/* + * Update the idx entry with the passed through idx + * If the relinquised idx is lower than next_idx, use it s/relinquised/relinquished/ + * to replace next_idx + */ +g_hash_table_remove(s->zpci_table, >idx); +if (idx < s->next_idx) { +s->next_idx = idx; +} +pbdev->idx = idx; +g_hash_table_insert(s->zpci_table, >idx, pbdev); +} + +return 0; +} [...] diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c index 6f80a47e29..78093aaac7 100644 --- a/hw/s390x/s390-pci-vfio.c +++ b/hw/s390x/s390-pci-vfio.c @@ -97,6 +97,61 @@ void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) } } +int s390_pci_probe_interp(S390PCIBusDevice *pbdev) +{ +VFIOPCIDevice *vdev = container_of(pbdev->pdev, VFIOPCIDevice, pdev); +struct vfio_device_feature feat = { +.argsz = sizeof(struct vfio_device_feature), +.flags = VFIO_DEVICE_FEATURE_PROBE + VFIO_DEVICE_FEATURE_ZPCI_INTERP +}; + +return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, ); +} + +int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable) +{ +VFIOPCIDevice *vdev = container_of(pbdev->pdev, VFIOPCIDevice, pdev); +g_autofree struct vfio_device_feature *feat; IIRC there have been compiler versions that complain if a g_autofree variable is initialized at the point of declaration, so you might need to add the "= g_malloc0(size)" here already. +struct vfio_device_zpci_interp *data; +int size; + +size = sizeof(*feat) + sizeof(*data); +feat = g_malloc0(size); +feat->argsz = size; +feat->flags = VFIO_DEVICE_FEATURE_SET + VFIO_DEVICE_FEATURE_ZPCI_INTERP; + +data = (struct vfio_device_zpci_interp *)>data; +if (enable) { +data->flags = VFIO_DEVICE_ZPCI_FLAG_INTERP; +} else { +data->flags = 0; +} + +return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, feat); +} + +int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev) +{ +VFIOPCIDevice *vdev = container_of(pbdev->pdev, VFIOPCIDevice, pdev); +g_autofree struct vfio_device_feature *feat; dito +struct vfio_device_zpci_interp *data; +int size, rc; + +size = sizeof(*feat) + sizeof(*data); +feat = g_malloc0(size); +feat->argsz = size; +feat->flags = VFIO_DEVICE_FEATURE_GET + VFIO_DEVICE_FEATURE_ZPCI_INTERP; + +rc = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, feat); +if (rc) { +return rc; +} + +data = (struct vfio_device_zpci_interp *)>data; +pbdev->fh = data->fh; +return 0; +} Thomas
[PATCH 07/12] s390x/pci: enable for load/store intepretation
Use the associated vfio feature ioctl to enable interpretation for devices when requested. As part of this process, we must use the host function handle rather than a QEMU-generated one -- this is provided as part of the ioctl payload. Signed-off-by: Matthew Rosato --- hw/s390x/s390-pci-bus.c | 69 +++- hw/s390x/s390-pci-inst.c | 63 - hw/s390x/s390-pci-vfio.c | 55 + include/hw/s390x/s390-pci-bus.h | 1 + include/hw/s390x/s390-pci-vfio.h | 15 +++ 5 files changed, 201 insertions(+), 2 deletions(-) diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 01b58ebc70..451bd32d92 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -971,12 +971,57 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr) } } +static int s390_pci_interp_plug(S390pciState *s, S390PCIBusDevice *pbdev) +{ +uint32_t idx; +int rc; + +rc = s390_pci_probe_interp(pbdev); +if (rc) { +return rc; +} + +rc = s390_pci_update_passthrough_fh(pbdev); +if (rc) { +return rc; +} + +/* + * The host device is in an enabled state, but the device must + * begin as disabled for the guest so mask off the enable bit + * from the passthrough handle. + */ +pbdev->fh &= ~FH_MASK_ENABLE; + +/* Next, see if the idx is already in-use */ +idx = pbdev->fh & FH_MASK_INDEX; +if (pbdev->idx != idx) { +if (s390_pci_find_dev_by_idx(s, idx)) { +return -EINVAL; +} +/* + * Update the idx entry with the passed through idx + * If the relinquised idx is lower than next_idx, use it + * to replace next_idx + */ +g_hash_table_remove(s->zpci_table, >idx); +if (idx < s->next_idx) { +s->next_idx = idx; +} +pbdev->idx = idx; +g_hash_table_insert(s->zpci_table, >idx, pbdev); +} + +return 0; +} + static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev); PCIDevice *pdev = NULL; S390PCIBusDevice *pbdev = NULL; +int rc; if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { PCIBridge *pb = PCI_BRIDGE(dev); @@ -1022,12 +1067,33 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, set_pbdev_info(pbdev); if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { -pbdev->fh |= FH_SHM_VFIO; +/* + * By default, interpretation is always requested; if the available + * facilities indicate it is not available, fallback to the + * intercept model. + */ +if (pbdev->interp && !s390_has_feat(S390_FEAT_ZPCI_INTERP)) { +DPRINTF("zPCI interpretation facilities missing.\n"); +pbdev->interp = false; +} +if (pbdev->interp) { +rc = s390_pci_interp_plug(s, pbdev); +if (rc) { +error_setg(errp, "zpci interp plug failed: %d", rc); +return; +} +} pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); /* Fill in CLP information passed via the vfio region */ s390_pci_get_clp_info(pbdev); +if (!pbdev->interp) { +/* Do vfio passthrough but intercept for I/O */ +pbdev->fh |= FH_SHM_VFIO; +} } else { pbdev->fh |= FH_SHM_EMUL; +/* Always intercept emulated devices */ +pbdev->interp = false; } if (s390_pci_msix_init(pbdev)) { @@ -1360,6 +1426,7 @@ static Property s390_pci_device_properties[] = { DEFINE_PROP_UINT16("uid", S390PCIBusDevice, uid, UID_UNDEFINED), DEFINE_PROP_S390_PCI_FID("fid", S390PCIBusDevice, fid), DEFINE_PROP_STRING("target", S390PCIBusDevice, target), +DEFINE_PROP_BOOL("interp", S390PCIBusDevice, interp, true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 0cef7fbace..ba4017474e 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -18,6 +18,7 @@ #include "sysemu/hw_accel.h" #include "hw/s390x/s390-pci-inst.h" #include "hw/s390x/s390-pci-bus.h" +#include "hw/s390x/s390-pci-vfio.h" #include "hw/s390x/tod.h" #ifndef DEBUG_S390PCI_INST @@ -156,6 +157,47 @@ out: return rc; } +static int clp_enable_interp(S390PCIBusDevice *pbdev) +{ +int rc; + +rc = s390_pci_set_interp(pbdev, true); +if (rc) { +DPRINTF("Failed to enable interpretation\n"); +return rc; +} +rc = s390_pci_update_passthrough_fh(pbdev); +if (rc) { +DPRINTF("Failed to update passthrough fh\n"); +