Hi Shameer,

On 1/5/26 5:33 PM, Shameer Kolothum wrote:
> Hi Eric/ Yi,
>
> [Cc: Alex]
>
>> -----Original Message-----
>> From: Yi Liu <[email protected]>
>> Sent: 09 December 2025 11:17
>> To: [email protected]; Shameer Kolothum
>> <[email protected]>; [email protected]; qemu-
>> [email protected]
>> Cc: [email protected]; Jason Gunthorpe <[email protected]>; Nicolin
>> Chen <[email protected]>; [email protected]; [email protected];
>> Nathan Chen <[email protected]>; Matt Ochs <[email protected]>;
>> [email protected]; [email protected];
>> [email protected]; [email protected];
>> [email protected]; [email protected]; Krishnakant Jaju
>> <[email protected]>
>> Subject: Re: [PATCH v6 32/33] vfio: Synthesize vPASID capability to VM
>>
>> External email: Use caution opening links or attachments
>>
>>
>> On 2025/12/9 17:51, Eric Auger wrote:
>>> Hi Shameer,
>>> On 11/20/25 2:22 PM, Shameer Kolothum wrote:
>>>> From: Yi Liu <[email protected]>
>>>>
>>>> If user wants to expose PASID capability in vIOMMU, then VFIO would also
>>>> need to report the PASID cap for this device if the underlying hardware
>>>> supports it as well.
>>>>
>>>> As a start, this chooses to put the vPASID cap in the last 8 bytes of the
>>>> vconfig space. This is a choice in the good hope of no conflict with any
>>>> existing cap or hidden registers. For the devices that has hidden 
>>>> registers,
>>>> user should figure out a proper offset for the vPASID cap. This may require
>>>> an option for user to config it. Here we leave it as a future extension.
>>>> There are more discussions on the mechanism of finding the proper offset.
>>>>
>>>>
>> https://lore.kernel.org/kvm/BN9PR11MB5276318969A212AD0649C7BE8C
>> [email protected]/
>>>> Since we add a check to ensure the vIOMMU supports PASID, only devices
>>>> under those vIOMMUs can synthesize the vPASID capability. This gives
>>>> users control over which devices expose vPASID.
>>>>
>>>> Signed-off-by: Yi Liu <[email protected]>
>>>> Tested-by: Zhangfei Gao <[email protected]>
>>>> Reviewed-by: Jonathan Cameron <[email protected]>
>>>> Signed-off-by: Shameer Kolothum <[email protected]>
>>>> ---
>>>>   hw/vfio/pci.c      | 38 ++++++++++++++++++++++++++++++++++++++
>>>>   include/hw/iommu.h |  1 +
>>>>   2 files changed, 39 insertions(+)
>>>>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index 8b8bc5a421..e11e39d667 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -24,6 +24,7 @@
>>>>   #include <sys/ioctl.h>
>>>>
>>>>   #include "hw/hw.h"
>>>> +#include "hw/iommu.h"
>>>>   #include "hw/pci/msi.h"
>>>>   #include "hw/pci/msix.h"
>>>>   #include "hw/pci/pci_bridge.h"
>>>> @@ -2500,7 +2501,12 @@ static int
>> vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
>>>>   static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
>>>>   {
>>>> +    HostIOMMUDevice *hiod = vdev->vbasedev.hiod;
>>>> +    HostIOMMUDeviceClass *hiodc =
>> HOST_IOMMU_DEVICE_GET_CLASS(hiod);
>>>>       PCIDevice *pdev = PCI_DEVICE(vdev);
>>>> +    uint64_t max_pasid_log2 = 0;
>>>> +    bool pasid_cap_added = false;
>>>> +    uint64_t hw_caps;
>>>>       uint32_t header;
>>>>       uint16_t cap_id, next, size;
>>>>       uint8_t cap_ver;
>>>> @@ -2578,12 +2584,44 @@ static void vfio_add_ext_cap(VFIOPCIDevice
>> *vdev)
>>>>                   pcie_add_capability(pdev, cap_id, cap_ver, next, size);
>>>>               }
>>>>               break;
>>>> +        /*
>>>> +         * VFIO kernel does not expose the PASID CAP today. We may
>> synthesize
>>>> +         * one later through IOMMUFD APIs. If VFIO ever starts exposing 
>>>> it,
>>>> +         * record its presence here so we do not create a duplicate CAP.
>>>> +         */
>>>> +        case PCI_EXT_CAP_ID_PASID:
>>>> +             pasid_cap_added = true;
>>>> +             /* fallthrough */
>>>>           default:
>>>>               pcie_add_capability(pdev, cap_id, cap_ver, next, size);
>>>>           }
>>>>
>>>>       }
>>>>
>>>> +#ifdef CONFIG_IOMMUFD
>>>> +    /* Try to retrieve PASID CAP through IOMMUFD APIs */
>>>> +    if (!pasid_cap_added && hiodc && hiodc->get_cap) {
>>>> +        hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_GENERIC_HW,
>> &hw_caps, NULL);
>>>> +        hiodc->get_cap(hiod,
>> HOST_IOMMU_DEVICE_CAP_MAX_PASID_LOG2,
>>>> +                       &max_pasid_log2, NULL);
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * If supported, adds the PASID capability in the end of the PCIe 
>>>> config
>>>> +     * space. TODO: Add option for enabling pasid at a safe offset.
>>>> +     */
>>>> +    if (max_pasid_log2 && (pci_device_get_viommu_flags(pdev) &
>>>> +                           VIOMMU_FLAG_PASID_SUPPORTED)) {
>>>> +        bool exec_perm = (hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC);
>>>> +        bool priv_mod = (hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV);
>>>> +
>>>> +        pcie_pasid_init(pdev, PCIE_CONFIG_SPACE_SIZE -
>> PCI_EXT_CAP_PASID_SIZEOF,
>>>> +                        max_pasid_log2, exec_perm, priv_mod);
>>>> +        /* PASID capability is fully emulated by QEMU */
>>>> +        memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8);
>>>> +    }
>>>> +#endif
>>>> +
>>>>       /* Cleanup chain head ID if necessary */
>>>>       if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
>>>>           pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
>>>> diff --git a/include/hw/iommu.h b/include/hw/iommu.h
>>>> index 9b8bb94fc2..9635770bee 100644
>>>> --- a/include/hw/iommu.h
>>>> +++ b/include/hw/iommu.h
>>>> @@ -20,6 +20,7 @@
>>>>   enum viommu_flags {
>>>>       /* vIOMMU needs nesting parent HWPT to create nested HWPT */
>>>>       VIOMMU_FLAG_WANT_NESTING_PARENT = BIT_ULL(0),
>>>> +    VIOMMU_FLAG_PASID_SUPPORTED = BIT_ULL(1),
>>>>   };
>>>>
>>>>   #endif /* HW_IOMMU_H */
>>> Besides the fact the offset is arbitrarily chosen so that this is the
>>> last cap of the vconfig space, the code looks good to me.
>>> So
>>> Reviewed-by: Eric Auger <[email protected]>
>>>
>>> Just wondering whether we couldn't add some generic pcie code that
>>> parses the extended cap linked list to check the offset range is not
>>> used by another cap before allowing the insertion at a given offset?
>>> This wouldn't prevent a subsequent addition from failing but at least we
>>> would know if there is some collision.this could be added later on though.
>>>
>> You're absolutely right. My approach of using the last 8 bytes was a
>> shortcut to avoid implementing proper capability parsing logic
>> (importing pci_regs.h and maintaining a cap_id-to-cap_size mapping
>> table), and it simplified PASID capability detection by only examining
>> the last 8bytes by a simple dump :(. However, this approach is not
>> good as we cannot guarantee that the last 8bytes are unused by any
>> device.
>>
>> Let's just implement the logic to walk the linked list of ext_caps to
>> find an appropriate offset for our use case.
> I had a go at this. Based on my understanding, even if we walk the PCIe
> extended capability linked list, we still can't easily determine the size
> occupied by the last capability as the extended capability header does not
> encode a length, it only provides the "next" pointer, and for the last entry
> next == 0.
If my understanding is correct when walking the linked list, you can
enumerate the start index and the PCIe extended Capability variable size
which is made of fix header size + register block variable size which
depends on the capability ID). After that we shall be able to allocate a
slot within holes or at least check that adding the new prop at the end
of the 4kB is safe, no?. What do I miss?

Thanks

Eric
>
> Given that, I tried the following approach,
>
> -locate the last extended capability (using the existing helper),
> -reserve a fixed window (512 bytes) for that final capability,
> -and synthesize PASID at the end of PCIe config space (0xff8) only if it
> looks like there is enough room.
>
> Maybe I am missing something here.. Please let me know if there is a
> better way to address this.
>
> Thanks,
> Shameer
>
> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index b302de6419..89178f2b7e 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -1005,8 +1005,8 @@ bool pcie_cap_is_arifwd_enabled(const PCIDevice *dev)
>   */
>
>  /* Passing a cap_id value > 0xffff will return 0 and put end of list in prev 
> */
> -static uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id,
> -                                          uint16_t *prev_p)
> +uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id,
> +                                   uint16_t *prev_p)
>  {
>      uint16_t prev = 0;
>      uint16_t next;
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 8b8bc5a421..e8354e8b8d 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -24,6 +24,7 @@
>  #include <sys/ioctl.h>
>
>  #include "hw/hw.h"
> +#include "hw/iommu.h"
>  #include "hw/pci/msi.h"
>  #include "hw/pci/msix.h"
>  #include "hw/pci/pci_bridge.h"
> @@ -2498,9 +2499,72 @@ static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, 
> uint16_t pos)
>      return 0;
>  }
>
> +/*
> + * Try to retrieve PASID capability information via IOMMUFD APIs and,
> + * if supported, synthesize a PASID PCIe extended capability for the
> + * VFIO device.
> + *
> + * The PASID capability is placed at the end of the PCIe extended
> + * configuration space. Determining the exact size of the last
> + * existing extended capability is non trivial, as PCIe extended
> + * capabilities do not generically encode their total size and
> + * vendor defined extensions are permitted.
> + *
> + * For now, reserve a fixed window (512 bytes) for the last extended
> + * capability and only insert  PASID if sufficient space remains.
> + */
> +static void vfio_pci_synthesize_pasid_cap(VFIOPCIDevice *vdev)
> +{
> +    HostIOMMUDevice *hiod = vdev->vbasedev.hiod;
> +    HostIOMMUDeviceClass *hiodc;
> +    HostIOMMUDevicePasidInfo pasid_info;
> +    PCIDevice *pdev = PCI_DEVICE(vdev);
> +    uint16_t last = 0;
> +    uint16_t pasid_offset;
> +
> +    if (vdev->vbasedev.mdev) {
> +        return;
> +    }
> +
> +    hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
> +    if (!hiodc || !hiodc->get_pasid_info ||
> +        !hiodc->get_pasid_info(hiod, &pasid_info) ||
> +        !(pci_device_get_viommu_flags(pdev) & VIOMMU_FLAG_PASID_SUPPORTED)) {
> +        return;
> +    }
> +
> +    /*
> +     * Locate the last PCIe extended capability present in the device
> +     * configuration space.
> +     */
> +    pcie_find_capability_list(pdev, 0x1ffff, &last);
> +
> +    /*
> +     * Reserve space at the end of PCIe configuration space for PASID.
> +     * If the last extended capability appears too close to the end,
> +     * refuse to insert PASID.
> +     */
> +    if (last + 512 > PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF) {
> +        warn_report("vfio: no space to synthesize PASID extended 
> capability");
> +        return;
> +    }
> +
> +    pasid_offset = PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF;
> +
> +    pcie_pasid_init(pdev, pasid_offset,
> +                    pasid_info.max_pasid_log2,
> +                    pasid_info.exec_perm,
> +                    pasid_info.priv_mod);
> +
> +    /* PASID capability is fully emulated by QEMU */
> +    memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff,
> +           PCI_EXT_CAP_PASID_SIZEOF);
> +}
> +
>  static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
>  {
>      PCIDevice *pdev = PCI_DEVICE(vdev);
> +    bool pasid_cap_added = false;
>      uint32_t header;
>      uint16_t cap_id, next, size;
>      uint8_t cap_ver;
> @@ -2562,6 +2626,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
>           * accesses, exact size doesn't seem worthwhile.
>           */
>          size = vfio_ext_cap_max_size(config, next);
> +        printf("%s: Shameer: cap_id 0x%x size 0x%x\n",__func__, cap_id, 
> size);
>
>          /* Use emulated next pointer to allow dropping extended caps */
>          pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
> @@ -2578,12 +2643,24 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
>                  pcie_add_capability(pdev, cap_id, cap_ver, next, size);
>              }
>              break;
> +        /*
> +         * VFIO kernel does not expose the PASID CAP today. We may synthesize
> +         * one later through IOMMUFD APIs. If VFIO ever starts exposing it,
> +         * record its presence here so we do not create a duplicate CAP.
> +         */
> +        case PCI_EXT_CAP_ID_PASID:
> +             pasid_cap_added = true;
> +             /* fallthrough */
>          default:
>              pcie_add_capability(pdev, cap_id, cap_ver, next, size);
>          }
>
>      }
>
> +    if (!pasid_cap_added) {
> +        vfio_pci_synthesize_pasid_cap(vdev);
> +    }
> +
>      /* Cleanup chain head ID if necessary */
>      if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
>          pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
> diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
> index 42cebcd033..1a477b05b2 100644
> --- a/include/hw/pci/pcie.h
> +++ b/include/hw/pci/pcie.h
> @@ -130,6 +130,8 @@ bool pcie_cap_is_arifwd_enabled(const PCIDevice *dev);
>
>  /* PCI express extended capability helper functions */
>  uint16_t pcie_find_capability(PCIDevice *dev, uint16_t cap_id);
> +uint16_t pcie_find_capability_list(PCIDevice *dev, uint32_t cap_id,
> +                                   uint16_t *prev_p);
>  void pcie_add_capability(PCIDevice *dev,
>                           uint16_t cap_id, uint8_t cap_ver,
>                           uint16_t offset, uint16_t size);
> @@ -138,6 +140,7 @@ void pcie_sync_bridge_lnk(PCIDevice *dev);
>  void pcie_acs_init(PCIDevice *dev, uint16_t offset);
>  void pcie_acs_reset(PCIDevice *dev);
>
> +uint16_t pcie_get_ext_cap_next_offset(PCIDevice *pdev, uint16_t cap_size);
>  void pcie_ari_init(PCIDevice *dev, uint16_t offset);
>  void pcie_dev_ser_num_init(PCIDevice *dev, uint16_t offset, uint64_t 
> ser_num);
>  void pcie_ats_init(PCIDevice *dev, uint16_t offset, bool aligned);
>
>


Reply via email to