Hi Shameer,
On 1/5/26 5:33 PM, Shameer Kolothum wrote:
> Hi Eric/ Yi,
>
> [Cc: Alex]
>
>> -----Original Message-----
>> From: Yi Liu <[email protected]>
>> Sent: 09 December 2025 11:17
>> To: [email protected]; Shameer Kolothum
>> <[email protected]>; [email protected]; qemu-
>> [email protected]
>> Cc: [email protected]; Jason Gunthorpe <[email protected]>; Nicolin
>> Chen <[email protected]>; [email protected]; [email protected];
>> Nathan Chen <[email protected]>; Matt Ochs <[email protected]>;
>> [email protected]; [email protected];
>> [email protected]; [email protected];
>> [email protected]; [email protected]; Krishnakant Jaju
>> <[email protected]>
>> Subject: Re: [PATCH v6 32/33] vfio: Synthesize vPASID capability to VM
>>
>> External email: Use caution opening links or attachments
>>
>>
>> On 2025/12/9 17:51, Eric Auger wrote:
>>> Hi Shameer,
>>> On 11/20/25 2:22 PM, Shameer Kolothum wrote:
>>>> From: Yi Liu <[email protected]>
>>>>
>>>> If user wants to expose PASID capability in vIOMMU, then VFIO would also
>>>> need to report the PASID cap for this device if the underlying hardware
>>>> supports it as well.
>>>>
>>>> As a start, this chooses to put the vPASID cap in the last 8 bytes of the
>>>> vconfig space. This is a choice in the good hope of no conflict with any
>>>> existing cap or hidden registers. For the devices that has hidden
>>>> registers,
>>>> user should figure out a proper offset for the vPASID cap. This may require
>>>> an option for user to config it. Here we leave it as a future extension.
>>>> There are more discussions on the mechanism of finding the proper offset.
>>>>
>>>>
>> https://lore.kernel.org/kvm/BN9PR11MB5276318969A212AD0649C7BE8C
>> [email protected]/
>>>> Since we add a check to ensure the vIOMMU supports PASID, only devices
>>>> under those vIOMMUs can synthesize the vPASID capability. This gives
>>>> users control over which devices expose vPASID.
>>>>
>>>> Signed-off-by: Yi Liu <[email protected]>
>>>> Tested-by: Zhangfei Gao <[email protected]>
>>>> Reviewed-by: Jonathan Cameron <[email protected]>
>>>> Signed-off-by: Shameer Kolothum <[email protected]>
>>>> ---
>>>> hw/vfio/pci.c | 38 ++++++++++++++++++++++++++++++++++++++
>>>> include/hw/iommu.h | 1 +
>>>> 2 files changed, 39 insertions(+)
>>>>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index 8b8bc5a421..e11e39d667 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -24,6 +24,7 @@
>>>> #include <sys/ioctl.h>
>>>>
>>>> #include "hw/hw.h"
>>>> +#include "hw/iommu.h"
>>>> #include "hw/pci/msi.h"
>>>> #include "hw/pci/msix.h"
>>>> #include "hw/pci/pci_bridge.h"
>>>> @@ -2500,7 +2501,12 @@ static int
>> vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
>>>> static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
>>>> {
>>>> + HostIOMMUDevice *hiod = vdev->vbasedev.hiod;
>>>> + HostIOMMUDeviceClass *hiodc =
>> HOST_IOMMU_DEVICE_GET_CLASS(hiod);
>>>> PCIDevice *pdev = PCI_DEVICE(vdev);
>>>> + uint64_t max_pasid_log2 = 0;
>>>> + bool pasid_cap_added = false;
>>>> + uint64_t hw_caps;
>>>> uint32_t header;
>>>> uint16_t cap_id, next, size;
>>>> uint8_t cap_ver;
>>>> @@ -2578,12 +2584,44 @@ static void vfio_add_ext_cap(VFIOPCIDevice
>> *vdev)
>>>> pcie_add_capability(pdev, cap_id, cap_ver, next, size);
>>>> }
>>>> break;
>>>> + /*
>>>> + * VFIO kernel does not expose the PASID CAP today. We may
>> synthesize
>>>> + * one later through IOMMUFD APIs. If VFIO ever starts exposing
>>>> it,
>>>> + * record its presence here so we do not create a duplicate CAP.
>>>> + */
>>>> + case PCI_EXT_CAP_ID_PASID:
>>>> + pasid_cap_added = true;
>>>> + /* fallthrough */
>>>> default:
>>>> pcie_add_capability(pdev, cap_id, cap_ver, next, size);
>>>> }
>>>>
>>>> }
>>>>
>>>> +#ifdef CONFIG_IOMMUFD
>>>> + /* Try to retrieve PASID CAP through IOMMUFD APIs */
>>>> + if (!pasid_cap_added && hiodc && hiodc->get_cap) {
>>>> + hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_GENERIC_HW,
>> &hw_caps, NULL);
>>>> + hiodc->get_cap(hiod,
>> HOST_IOMMU_DEVICE_CAP_MAX_PASID_LOG2,
>>>> + &max_pasid_log2, NULL);
>>>> + }
>>>> +
>>>> + /*
>>>> + * If supported, adds the PASID capability in the end of the PCIe
>>>> config
>>>> + * space. TODO: Add option for enabling pasid at a safe offset.
>>>> + */
>>>> + if (max_pasid_log2 && (pci_device_get_viommu_flags(pdev) &
>>>> + VIOMMU_FLAG_PASID_SUPPORTED)) {
>>>> + bool exec_perm = (hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC);
>>>> + bool priv_mod = (hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV);
>>>> +
>>>> + pcie_pasid_init(pdev, PCIE_CONFIG_SPACE_SIZE -
>> PCI_EXT_CAP_PASID_SIZEOF,
>>>> + max_pasid_log2, exec_perm, priv_mod);
>>>> + /* PASID capability is fully emulated by QEMU */
>>>> + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8);
>>>> + }
>>>> +#endif
>>>> +
>>>> /* Cleanup chain head ID if necessary */
>>>> if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
>>>> pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
>>>> diff --git a/include/hw/iommu.h b/include/hw/iommu.h
>>>> index 9b8bb94fc2..9635770bee 100644
>>>> --- a/include/hw/iommu.h
>>>> +++ b/include/hw/iommu.h
>>>> @@ -20,6 +20,7 @@
>>>> enum viommu_flags {
>>>> /* vIOMMU needs nesting parent HWPT to create nested HWPT */
>>>> VIOMMU_FLAG_WANT_NESTING_PARENT = BIT_ULL(0),
>>>> + VIOMMU_FLAG_PASID_SUPPORTED = BIT_ULL(1),
>>>> };
>>>>
>>>> #endif /* HW_IOMMU_H */
>>> Besides the fact the offset is arbitrarily chosen so that this is the
>>> last cap of the vconfig space, the code looks good to me.
>>> So
>>> Reviewed-by: Eric Auger <[email protected]>
>>>
>>> Just wondering whether we couldn't add some generic pcie code that
>>> parses the extended cap linked list to check the offset range is not
>>> used by another cap before allowing the insertion at a given offset?
>>> This wouldn't prevent a subsequent addition from failing but at least we
>>> would know if there is some collision.this could be added later on though.
>>>
>> You're absolutely right. My approach of using the last 8 bytes was a
>> shortcut to avoid implementing proper capability parsing logic
>> (importing pci_regs.h and maintaining a cap_id-to-cap_size mapping
>> table), and it simplified PASID capability detection by only examining
>> the last 8bytes by a simple dump :(. However, this approach is not
>> good as we cannot guarantee that the last 8bytes are unused by any
>> device.
>>
>> Let's just implement the logic to walk the linked list of ext_caps to
>> find an appropriate offset for our use case.
> I had a go at this. Based on my understanding, even if we walk the PCIe
> extended capability linked list, we still can't easily determine the size
> occupied by the last capability as the extended capability header does not
> encode a length, it only provides the "next" pointer, and for the last entry
> next == 0.
If my understanding is correct when walking the linked list, you can
enumerate the start index and the PCIe extended Capability variable size
which is made of fix header size + register block variable size which
depends on the capability ID). After that we shall be able to allocate a
slot within holes or at least check that adding the new prop at the end
of the 4kB is safe, no?. What do I miss?
Thanks
Eric
>
> Given that, I tried the following approach,
>
> -locate the last extended capability (using the existing helper),
> -reserve a fixed window (512 bytes) for that final capability,
> -and synthesize PASID at the end of PCIe config space (0xff8) only if it
> looks like there is enough room.
>
> Maybe I am missing something here.. Please let me know if there is a
> better way to address this.
>
> Thanks,
> Shameer
>
> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index b302de6419..89178f2b7e 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -1005,8 +1005,8 @@ bool pcie_cap_is_arifwd_enabled(const PCIDevice *dev)
> */
>
> /* Passing a cap_id value > 0xffff will return 0 and put end of list in prev
> */
> -static uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id,
> - uint16_t *prev_p)
> +uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id,
> + uint16_t *prev_p)
> {
> uint16_t prev = 0;
> uint16_t next;
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 8b8bc5a421..e8354e8b8d 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -24,6 +24,7 @@
> #include <sys/ioctl.h>
>
> #include "hw/hw.h"
> +#include "hw/iommu.h"
> #include "hw/pci/msi.h"
> #include "hw/pci/msix.h"
> #include "hw/pci/pci_bridge.h"
> @@ -2498,9 +2499,72 @@ static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev,
> uint16_t pos)
> return 0;
> }
>
> +/*
> + * Try to retrieve PASID capability information via IOMMUFD APIs and,
> + * if supported, synthesize a PASID PCIe extended capability for the
> + * VFIO device.
> + *
> + * The PASID capability is placed at the end of the PCIe extended
> + * configuration space. Determining the exact size of the last
> + * existing extended capability is non trivial, as PCIe extended
> + * capabilities do not generically encode their total size and
> + * vendor defined extensions are permitted.
> + *
> + * For now, reserve a fixed window (512 bytes) for the last extended
> + * capability and only insert PASID if sufficient space remains.
> + */
> +static void vfio_pci_synthesize_pasid_cap(VFIOPCIDevice *vdev)
> +{
> + HostIOMMUDevice *hiod = vdev->vbasedev.hiod;
> + HostIOMMUDeviceClass *hiodc;
> + HostIOMMUDevicePasidInfo pasid_info;
> + PCIDevice *pdev = PCI_DEVICE(vdev);
> + uint16_t last = 0;
> + uint16_t pasid_offset;
> +
> + if (vdev->vbasedev.mdev) {
> + return;
> + }
> +
> + hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
> + if (!hiodc || !hiodc->get_pasid_info ||
> + !hiodc->get_pasid_info(hiod, &pasid_info) ||
> + !(pci_device_get_viommu_flags(pdev) & VIOMMU_FLAG_PASID_SUPPORTED)) {
> + return;
> + }
> +
> + /*
> + * Locate the last PCIe extended capability present in the device
> + * configuration space.
> + */
> + pcie_find_capability_list(pdev, 0x1ffff, &last);
> +
> + /*
> + * Reserve space at the end of PCIe configuration space for PASID.
> + * If the last extended capability appears too close to the end,
> + * refuse to insert PASID.
> + */
> + if (last + 512 > PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF) {
> + warn_report("vfio: no space to synthesize PASID extended
> capability");
> + return;
> + }
> +
> + pasid_offset = PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF;
> +
> + pcie_pasid_init(pdev, pasid_offset,
> + pasid_info.max_pasid_log2,
> + pasid_info.exec_perm,
> + pasid_info.priv_mod);
> +
> + /* PASID capability is fully emulated by QEMU */
> + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff,
> + PCI_EXT_CAP_PASID_SIZEOF);
> +}
> +
> static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
> {
> PCIDevice *pdev = PCI_DEVICE(vdev);
> + bool pasid_cap_added = false;
> uint32_t header;
> uint16_t cap_id, next, size;
> uint8_t cap_ver;
> @@ -2562,6 +2626,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
> * accesses, exact size doesn't seem worthwhile.
> */
> size = vfio_ext_cap_max_size(config, next);
> + printf("%s: Shameer: cap_id 0x%x size 0x%x\n",__func__, cap_id,
> size);
>
> /* Use emulated next pointer to allow dropping extended caps */
> pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
> @@ -2578,12 +2643,24 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
> pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> }
> break;
> + /*
> + * VFIO kernel does not expose the PASID CAP today. We may synthesize
> + * one later through IOMMUFD APIs. If VFIO ever starts exposing it,
> + * record its presence here so we do not create a duplicate CAP.
> + */
> + case PCI_EXT_CAP_ID_PASID:
> + pasid_cap_added = true;
> + /* fallthrough */
> default:
> pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> }
>
> }
>
> + if (!pasid_cap_added) {
> + vfio_pci_synthesize_pasid_cap(vdev);
> + }
> +
> /* Cleanup chain head ID if necessary */
> if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
> pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
> diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
> index 42cebcd033..1a477b05b2 100644
> --- a/include/hw/pci/pcie.h
> +++ b/include/hw/pci/pcie.h
> @@ -130,6 +130,8 @@ bool pcie_cap_is_arifwd_enabled(const PCIDevice *dev);
>
> /* PCI express extended capability helper functions */
> uint16_t pcie_find_capability(PCIDevice *dev, uint16_t cap_id);
> +uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id,
> + uint16_t *prev_p);
> void pcie_add_capability(PCIDevice *dev,
> uint16_t cap_id, uint8_t cap_ver,
> uint16_t offset, uint16_t size);
> @@ -138,6 +140,7 @@ void pcie_sync_bridge_lnk(PCIDevice *dev);
> void pcie_acs_init(PCIDevice *dev, uint16_t offset);
> void pcie_acs_reset(PCIDevice *dev);
>
> +uint16_t pcie_get_ext_cap_next_offset(PCIDevice *pdev, uint16_t cap_size);
> void pcie_ari_init(PCIDevice *dev, uint16_t offset);
> void pcie_dev_ser_num_init(PCIDevice *dev, uint16_t offset, uint64_t
> ser_num);
> void pcie_ats_init(PCIDevice *dev, uint16_t offset, bool aligned);
>
>