Hi Eric/ Yi, [Cc: Alex]
> -----Original Message----- > From: Yi Liu <[email protected]> > Sent: 09 December 2025 11:17 > To: [email protected]; Shameer Kolothum > <[email protected]>; [email protected]; qemu- > [email protected] > Cc: [email protected]; Jason Gunthorpe <[email protected]>; Nicolin > Chen <[email protected]>; [email protected]; [email protected]; > Nathan Chen <[email protected]>; Matt Ochs <[email protected]>; > [email protected]; [email protected]; > [email protected]; [email protected]; > [email protected]; [email protected]; Krishnakant Jaju > <[email protected]> > Subject: Re: [PATCH v6 32/33] vfio: Synthesize vPASID capability to VM > > External email: Use caution opening links or attachments > > > On 2025/12/9 17:51, Eric Auger wrote: > > Hi Shameer, > > On 11/20/25 2:22 PM, Shameer Kolothum wrote: > >> From: Yi Liu <[email protected]> > >> > >> If user wants to expose PASID capability in vIOMMU, then VFIO would also > >> need to report the PASID cap for this device if the underlying hardware > >> supports it as well. > >> > >> As a start, this chooses to put the vPASID cap in the last 8 bytes of the > >> vconfig space. This is a choice in the good hope of no conflict with any > >> existing cap or hidden registers. For the devices that has hidden > >> registers, > >> user should figure out a proper offset for the vPASID cap. This may require > >> an option for user to config it. Here we leave it as a future extension. > >> There are more discussions on the mechanism of finding the proper offset. > >> > >> > https://lore.kernel.org/kvm/BN9PR11MB5276318969A212AD0649C7BE8C > [email protected]/ > >> > >> Since we add a check to ensure the vIOMMU supports PASID, only devices > >> under those vIOMMUs can synthesize the vPASID capability. This gives > >> users control over which devices expose vPASID. > >> > >> Signed-off-by: Yi Liu <[email protected]> > >> Tested-by: Zhangfei Gao <[email protected]> > >> Reviewed-by: Jonathan Cameron <[email protected]> > >> Signed-off-by: Shameer Kolothum <[email protected]> > >> --- > >> hw/vfio/pci.c | 38 ++++++++++++++++++++++++++++++++++++++ > >> include/hw/iommu.h | 1 + > >> 2 files changed, 39 insertions(+) > >> > >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > >> index 8b8bc5a421..e11e39d667 100644 > >> --- a/hw/vfio/pci.c > >> +++ b/hw/vfio/pci.c > >> @@ -24,6 +24,7 @@ > >> #include <sys/ioctl.h> > >> > >> #include "hw/hw.h" > >> +#include "hw/iommu.h" > >> #include "hw/pci/msi.h" > >> #include "hw/pci/msix.h" > >> #include "hw/pci/pci_bridge.h" > >> @@ -2500,7 +2501,12 @@ static int > vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos) > >> > >> static void vfio_add_ext_cap(VFIOPCIDevice *vdev) > >> { > >> + HostIOMMUDevice *hiod = vdev->vbasedev.hiod; > >> + HostIOMMUDeviceClass *hiodc = > HOST_IOMMU_DEVICE_GET_CLASS(hiod); > >> PCIDevice *pdev = PCI_DEVICE(vdev); > >> + uint64_t max_pasid_log2 = 0; > >> + bool pasid_cap_added = false; > >> + uint64_t hw_caps; > >> uint32_t header; > >> uint16_t cap_id, next, size; > >> uint8_t cap_ver; > >> @@ -2578,12 +2584,44 @@ static void vfio_add_ext_cap(VFIOPCIDevice > *vdev) > >> pcie_add_capability(pdev, cap_id, cap_ver, next, size); > >> } > >> break; > >> + /* > >> + * VFIO kernel does not expose the PASID CAP today. We may > synthesize > >> + * one later through IOMMUFD APIs. If VFIO ever starts exposing > >> it, > >> + * record its presence here so we do not create a duplicate CAP. > >> + */ > >> + case PCI_EXT_CAP_ID_PASID: > >> + pasid_cap_added = true; > >> + /* fallthrough */ > >> default: > >> pcie_add_capability(pdev, cap_id, cap_ver, next, size); > >> } > >> > >> } > >> > >> +#ifdef CONFIG_IOMMUFD > >> + /* Try to retrieve PASID CAP through IOMMUFD APIs */ > >> + if (!pasid_cap_added && hiodc && hiodc->get_cap) { > >> + hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_GENERIC_HW, > &hw_caps, NULL); > >> + hiodc->get_cap(hiod, > HOST_IOMMU_DEVICE_CAP_MAX_PASID_LOG2, > >> + &max_pasid_log2, NULL); > >> + } > >> + > >> + /* > >> + * If supported, adds the PASID capability in the end of the PCIe > >> config > >> + * space. TODO: Add option for enabling pasid at a safe offset. > >> + */ > >> + if (max_pasid_log2 && (pci_device_get_viommu_flags(pdev) & > >> + VIOMMU_FLAG_PASID_SUPPORTED)) { > >> + bool exec_perm = (hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC); > >> + bool priv_mod = (hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV); > >> + > >> + pcie_pasid_init(pdev, PCIE_CONFIG_SPACE_SIZE - > PCI_EXT_CAP_PASID_SIZEOF, > >> + max_pasid_log2, exec_perm, priv_mod); > >> + /* PASID capability is fully emulated by QEMU */ > >> + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8); > >> + } > >> +#endif > >> + > >> /* Cleanup chain head ID if necessary */ > >> if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { > >> pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); > >> diff --git a/include/hw/iommu.h b/include/hw/iommu.h > >> index 9b8bb94fc2..9635770bee 100644 > >> --- a/include/hw/iommu.h > >> +++ b/include/hw/iommu.h > >> @@ -20,6 +20,7 @@ > >> enum viommu_flags { > >> /* vIOMMU needs nesting parent HWPT to create nested HWPT */ > >> VIOMMU_FLAG_WANT_NESTING_PARENT = BIT_ULL(0), > >> + VIOMMU_FLAG_PASID_SUPPORTED = BIT_ULL(1), > >> }; > >> > >> #endif /* HW_IOMMU_H */ > > Besides the fact the offset is arbitrarily chosen so that this is the > > last cap of the vconfig space, the code looks good to me. > > So > > Reviewed-by: Eric Auger <[email protected]> > > > > Just wondering whether we couldn't add some generic pcie code that > > parses the extended cap linked list to check the offset range is not > > used by another cap before allowing the insertion at a given offset? > > This wouldn't prevent a subsequent addition from failing but at least we > > would know if there is some collision.this could be added later on though. > > > > You're absolutely right. My approach of using the last 8 bytes was a > shortcut to avoid implementing proper capability parsing logic > (importing pci_regs.h and maintaining a cap_id-to-cap_size mapping > table), and it simplified PASID capability detection by only examining > the last 8bytes by a simple dump :(. However, this approach is not > good as we cannot guarantee that the last 8bytes are unused by any > device. > > Let's just implement the logic to walk the linked list of ext_caps to > find an appropriate offset for our use case. I had a go at this. Based on my understanding, even if we walk the PCIe extended capability linked list, we still can't easily determine the size occupied by the last capability as the extended capability header does not encode a length, it only provides the "next" pointer, and for the last entry next == 0. Given that, I tried the following approach, -locate the last extended capability (using the existing helper), -reserve a fixed window (512 bytes) for that final capability, -and synthesize PASID at the end of PCIe config space (0xff8) only if it looks like there is enough room. Maybe I am missing something here.. Please let me know if there is a better way to address this. Thanks, Shameer diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index b302de6419..89178f2b7e 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -1005,8 +1005,8 @@ bool pcie_cap_is_arifwd_enabled(const PCIDevice *dev) */ /* Passing a cap_id value > 0xffff will return 0 and put end of list in prev */ -static uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id, - uint16_t *prev_p) +uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id, + uint16_t *prev_p) { uint16_t prev = 0; uint16_t next; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 8b8bc5a421..e8354e8b8d 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -24,6 +24,7 @@ #include <sys/ioctl.h> #include "hw/hw.h" +#include "hw/iommu.h" #include "hw/pci/msi.h" #include "hw/pci/msix.h" #include "hw/pci/pci_bridge.h" @@ -2498,9 +2499,72 @@ static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos) return 0; } +/* + * Try to retrieve PASID capability information via IOMMUFD APIs and, + * if supported, synthesize a PASID PCIe extended capability for the + * VFIO device. + * + * The PASID capability is placed at the end of the PCIe extended + * configuration space. Determining the exact size of the last + * existing extended capability is non trivial, as PCIe extended + * capabilities do not generically encode their total size and + * vendor defined extensions are permitted. + * + * For now, reserve a fixed window (512 bytes) for the last extended + * capability and only insert PASID if sufficient space remains. + */ +static void vfio_pci_synthesize_pasid_cap(VFIOPCIDevice *vdev) +{ + HostIOMMUDevice *hiod = vdev->vbasedev.hiod; + HostIOMMUDeviceClass *hiodc; + HostIOMMUDevicePasidInfo pasid_info; + PCIDevice *pdev = PCI_DEVICE(vdev); + uint16_t last = 0; + uint16_t pasid_offset; + + if (vdev->vbasedev.mdev) { + return; + } + + hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod); + if (!hiodc || !hiodc->get_pasid_info || + !hiodc->get_pasid_info(hiod, &pasid_info) || + !(pci_device_get_viommu_flags(pdev) & VIOMMU_FLAG_PASID_SUPPORTED)) { + return; + } + + /* + * Locate the last PCIe extended capability present in the device + * configuration space. + */ + pcie_find_capability_list(pdev, 0x1ffff, &last); + + /* + * Reserve space at the end of PCIe configuration space for PASID. + * If the last extended capability appears too close to the end, + * refuse to insert PASID. + */ + if (last + 512 > PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF) { + warn_report("vfio: no space to synthesize PASID extended capability"); + return; + } + + pasid_offset = PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF; + + pcie_pasid_init(pdev, pasid_offset, + pasid_info.max_pasid_log2, + pasid_info.exec_perm, + pasid_info.priv_mod); + + /* PASID capability is fully emulated by QEMU */ + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, + PCI_EXT_CAP_PASID_SIZEOF); +} + static void vfio_add_ext_cap(VFIOPCIDevice *vdev) { PCIDevice *pdev = PCI_DEVICE(vdev); + bool pasid_cap_added = false; uint32_t header; uint16_t cap_id, next, size; uint8_t cap_ver; @@ -2562,6 +2626,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) * accesses, exact size doesn't seem worthwhile. */ size = vfio_ext_cap_max_size(config, next); + printf("%s: Shameer: cap_id 0x%x size 0x%x\n",__func__, cap_id, size); /* Use emulated next pointer to allow dropping extended caps */ pci_long_test_and_set_mask(vdev->emulated_config_bits + next, @@ -2578,12 +2643,24 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) pcie_add_capability(pdev, cap_id, cap_ver, next, size); } break; + /* + * VFIO kernel does not expose the PASID CAP today. We may synthesize + * one later through IOMMUFD APIs. If VFIO ever starts exposing it, + * record its presence here so we do not create a duplicate CAP. + */ + case PCI_EXT_CAP_ID_PASID: + pasid_cap_added = true; + /* fallthrough */ default: pcie_add_capability(pdev, cap_id, cap_ver, next, size); } } + if (!pasid_cap_added) { + vfio_pci_synthesize_pasid_cap(vdev); + } + /* Cleanup chain head ID if necessary */ if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h index 42cebcd033..1a477b05b2 100644 --- a/include/hw/pci/pcie.h +++ b/include/hw/pci/pcie.h @@ -130,6 +130,8 @@ bool pcie_cap_is_arifwd_enabled(const PCIDevice *dev); /* PCI express extended capability helper functions */ uint16_t pcie_find_capability(PCIDevice *dev, uint16_t cap_id); +uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id, + uint16_t *prev_p); void pcie_add_capability(PCIDevice *dev, uint16_t cap_id, uint8_t cap_ver, uint16_t offset, uint16_t size); @@ -138,6 +140,7 @@ void pcie_sync_bridge_lnk(PCIDevice *dev); void pcie_acs_init(PCIDevice *dev, uint16_t offset); void pcie_acs_reset(PCIDevice *dev); +uint16_t pcie_get_ext_cap_next_offset(PCIDevice *pdev, uint16_t cap_size); void pcie_ari_init(PCIDevice *dev, uint16_t offset); void pcie_dev_ser_num_init(PCIDevice *dev, uint16_t offset, uint64_t ser_num); void pcie_ats_init(PCIDevice *dev, uint16_t offset, bool aligned);
