Hi Zhenzhong,
On 6/20/25 9:18 AM, Zhenzhong Duan wrote:
I would suggest: Handle PASID entry removal and update instead of verbing.
> This adds an new entry VTDPASIDCacheEntry in VTDAddressSpace to cache the
> pasid entry and track PASID usage and future PASID tagged DMA address
> translation support in vIOMMU.
>
> VTDAddressSpace of PCI_NO_PASID is allocated when device is plugged and
> never freed. For other pasid, VTDAddressSpace instance is created/destroyed
> per the guest pasid entry set up/destroy for passthrough devices. While for
> emulated devices, VTDAddressSpace instance is created in the PASID tagged DMA
> translation and be destroyed per guest PASID cache invalidation. This focuses
s/be detroyed/destroyed
> on the PASID cache management for passthrough devices as there is no PASID
> capable emulated devices yet.
if you don't handle the emulated device case, may be omit talking about
them here.
>
> When guest modifies a PASID entry, QEMU will capture the guest pasid selective
> pasid cache invalidation, allocate or remove a VTDAddressSpace instance per
> the
> invalidation reasons:
>
> a) a present pasid entry moved to non-present
> b) a present pasid entry to be a present entry
> c) a non-present pasid entry moved to present
>
> This handles a) and b), following patch will handle c).
This -> This patch
>
> vIOMMU emulator could figure out the reason by fetching latest guest pasid
> entry
> and compare it with the PASID cache.
>
> Signed-off-by: Yi Liu <[email protected]>
> Signed-off-by: Yi Sun <[email protected]>
> Signed-off-by: Zhenzhong Duan <[email protected]>
> ---
> hw/i386/intel_iommu_internal.h | 27 ++++
> include/hw/i386/intel_iommu.h | 6 +
> hw/i386/intel_iommu.c | 265 +++++++++++++++++++++++++++++++--
> hw/i386/trace-events | 3 +
> 4 files changed, 291 insertions(+), 10 deletions(-)
>
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index 18bc22fc72..01c881ed4d 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -315,6 +315,7 @@ typedef enum VTDFaultReason {
> * request while disabled */
> VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */
>
> + VTD_FR_RTADDR_INV_TTM = 0x31, /* Invalid TTM in RTADDR */
> /* PASID directory entry access failure */
> VTD_FR_PASID_DIR_ACCESS_ERR = 0x50,
> /* The Present(P) field of pasid directory entry is 0 */
> @@ -492,6 +493,15 @@ typedef union VTDInvDesc VTDInvDesc;
> #define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000f1c0ULL
> #define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL
>
Adding some basic comments for the uncultured reader like me would help
doing the link with the vtd spec:
PASID-cache Invalidate Descriptor (pc_inv_dsc) fields
> +#define VTD_INV_DESC_PASIDC_G (3ULL << 4)
> +#define VTD_INV_DESC_PASIDC_PASID(val) (((val) >> 32) & 0xfffffULL)
> +#define VTD_INV_DESC_PASIDC_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK)
> +#define VTD_INV_DESC_PASIDC_RSVD_VAL0 0xfff000000000f1c0ULL
> +
> +#define VTD_INV_DESC_PASIDC_DSI (0ULL << 4)
> +#define VTD_INV_DESC_PASIDC_PASID_SI (1ULL << 4)
> +#define VTD_INV_DESC_PASIDC_GLOBAL (3ULL << 4)
as those are values for the granularity field using
VTD_INV_DESC_PASIDC_G_* look relevant to me.
I think you would gain in readability if you adopt extract32/64 syntax
like in hw/arm/smmuv3-internal.h
Looks more readable to me.
> +
> /* Information about page-selective IOTLB invalidate */
> struct VTDIOTLBPageInvInfo {
> uint16_t domain_id;
> @@ -552,6 +562,22 @@ typedef struct VTDRootEntry VTDRootEntry;
> #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL | ~VTD_HAW_MASK(aw))
> #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL
>
> +typedef enum VTDPCInvType {
> + /* pasid cache invalidation rely on guest PASID entry */
> + VTD_PASID_CACHE_GLOBAL_INV, /* pasid cache global invalidation */
> + VTD_PASID_CACHE_DOMSI, /* pasid cache domain selective invalidation
> */
> + VTD_PASID_CACHE_PASIDSI, /* pasid cache pasid selective invalidation
> */
> +} VTDPCInvType;
> +
> +typedef struct VTDPASIDCacheInfo {
> + VTDPCInvType type;
> + uint16_t domain_id;
did?
> + uint32_t pasid;
> + PCIBus *bus;
> + uint16_t devfn;
> + bool error_happened;
wouldn't it make more sense to store an Error * directly so that you can
fill it whenever there is an error occuring.
Something like VFIOContainerBase::error
> +} VTDPASIDCacheInfo;
> +
> /* PASID Table Related Definitions */
> #define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL)
> #define VTD_PASID_TABLE_BASE_ADDR_MASK (~0xfffULL)
> @@ -563,6 +589,7 @@ typedef struct VTDRootEntry VTDRootEntry;
> #define VTD_PASID_TABLE_BITS_MASK (0x3fULL)
> #define VTD_PASID_TABLE_INDEX(pasid) ((pasid) & VTD_PASID_TABLE_BITS_MASK)
> #define VTD_PASID_ENTRY_FPD (1ULL << 1) /* Fault Processing
> Disable */
> +#define VTD_PASID_TBL_ENTRY_NUM (1ULL << 6)
>
> /* PASID Granular Translation Type Mask */
> #define VTD_PASID_ENTRY_P 1ULL
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 50f9b27a45..fbc9da903a 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -95,6 +95,11 @@ struct VTDPASIDEntry {
> uint64_t val[8];
> };
>
> +typedef struct VTDPASIDCacheEntry {
> + struct VTDPASIDEntry pasid_entry;
> + bool cache_filled;
> +} VTDPASIDCacheEntry;
> +
> struct VTDAddressSpace {
> PCIBus *bus;
> uint8_t devfn;
> @@ -107,6 +112,7 @@ struct VTDAddressSpace {
> MemoryRegion iommu_ir_fault; /* Interrupt region for catching fault */
> IntelIOMMUState *iommu_state;
> VTDContextCacheEntry context_cache_entry;
> + VTDPASIDCacheEntry pasid_cache_entry;
> QLIST_ENTRY(VTDAddressSpace) next;
> /* Superset of notifier flags that this address space has */
> IOMMUNotifierFlag notifier_flags;
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 8948b8370f..1db581d14a 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -826,11 +826,24 @@ static inline bool vtd_pe_type_check(IntelIOMMUState
> *s, VTDPASIDEntry *pe)
> }
> }
>
> +static inline uint16_t vtd_pe_get_did(VTDPASIDEntry *pe)
> +{
> + return VTD_SM_PASID_ENTRY_DID((pe)->val[1]);
see my previous comment about extract*
> +}
> +
> static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
> {
> return pdire->val & 1;
> }
>
> +static inline void pasid_cache_info_set_error(VTDPASIDCacheInfo *pc_info)
not sure whether this helper is really needed
> +{
> + if (pc_info->error_happened) {
Besides why not simply overwriting unconditionnally?
> + return;
> + }
> + pc_info->error_happened = true;
> +}
> +
> /**
> * Caller of this function should check present bit if wants
> * to use pdir entry for further usage except for fpd bit check.
> @@ -3103,6 +3116,241 @@ static bool vtd_process_piotlb_desc(IntelIOMMUState
> *s,
> return true;
> }
>
> +static inline int vtd_dev_get_pe_from_pasid(VTDAddressSpace *vtd_as,
> + uint32_t pasid, VTDPASIDEntry
> *pe)
> +{
> + IntelIOMMUState *s = vtd_as->iommu_state;
> + VTDContextEntry ce;
> + int ret;
> +
> + if (!s->root_scalable) {
> + return -VTD_FR_RTADDR_INV_TTM;
> + }
> +
> + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn,
> + &ce);
> + if (ret) {
> + return ret;
> + }
> +
> + return vtd_ce_get_pasid_entry(s, &ce, pe, pasid);
> +}
> +
> +static bool vtd_pasid_entry_compare(VTDPASIDEntry *p1, VTDPASIDEntry *p2)
> +{
> + return !memcmp(p1, p2, sizeof(*p1));
> +}
> +
> +/*
> + * This function fills in the pasid entry in &vtd_as. Caller
> + * of this function should hold iommu_lock.
> + */
> +static int vtd_fill_pe_in_cache(IntelIOMMUState *s, VTDAddressSpace *vtd_as,
> + VTDPASIDEntry *pe)
> +{
> + VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
> +
> + if (vtd_pasid_entry_compare(pe, &pc_entry->pasid_entry)) {
> + /* No need to go further as cached pasid entry is latest */
> + return 0;
> + }
> +
> + pc_entry->pasid_entry = *pe;
> + pc_entry->cache_filled = true;
> +
> + /*
> + * TODO: send pasid bind to host for passthru devices
> + */
> +
> + return 0;
> +}
> +
> +/*
> + * This function is used to update or clear cached pasid entry in vtd_as
> + * instances. Caller of this function should hold iommu_lock.
> + */
> +static gboolean vtd_flush_pasid(gpointer key, gpointer value,
> + gpointer user_data)
> +{
> + VTDPASIDCacheInfo *pc_info = user_data;
> + VTDAddressSpace *vtd_as = value;
> + IntelIOMMUState *s = vtd_as->iommu_state;
> + VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
> + VTDPASIDEntry pe;
> + uint16_t did;
> + uint32_t pasid;
> + int ret;
> +
> + if (!pc_entry->cache_filled) {
> + return false;
> + }
> + did = vtd_pe_get_did(&pc_entry->pasid_entry);
> +
> + if (vtd_as_to_iommu_pasid_locked(vtd_as, &pasid)) {
> + goto remove;
> + }
> +
> + switch (pc_info->type) {
> + case VTD_PASID_CACHE_PASIDSI:
> + if (pc_info->pasid != pasid) {
> + return false;
> + }
> + /* Fall through */
> + case VTD_PASID_CACHE_DOMSI:
> + if (pc_info->domain_id != did) {
> + return false;
> + }
> + /* Fall through */
> + case VTD_PASID_CACHE_GLOBAL_INV:
> + break;
> + default:
> + error_report("invalid pc_info->type");
> + abort();
> + }
> +
> + /*
> + * pasid cache invalidation may indicate a present pasid
> + * entry to present pasid entry modification. To cover such
> + * case, vIOMMU emulator needs to fetch latest guest pasid
> + * entry and check cached pasid entry, then update pasid
> + * cache and send pasid bind/unbind to host properly.
if you don't do that in that patch I would put that in a subsequent
patch. Focus on the PASID cache in this patch. See my subsequent comment
> + */
> + ret = vtd_dev_get_pe_from_pasid(vtd_as, pasid, &pe);
> + if (ret) {
> + /*
> + * No valid pasid entry in guest memory. e.g. pasid entry
> + * was modified to be either all-zero or non-present. Either
> + * case means existing pasid cache should be removed.
> + */
> + goto remove;
> + }
> +
> + if (vtd_fill_pe_in_cache(s, vtd_as, &pe)) {
> + pasid_cache_info_set_error(pc_info);
> + }
> + return false;
> +
> +remove:
> + /*
> + * TODO: send pasid unbind to host for passthru devices
> + */
> + pc_entry->cache_filled = false;
> +
> + /*
> + * Don't remove address space of PCI_NO_PASID which is created by PCI
> + * sub-system.
> + */
> + if (vtd_as->pasid == PCI_NO_PASID) {
> + return false;
> + }
> + return true;
> +}
> +
> +/*
> + * This function syncs the pasid bindings between guest and host.
> + * It includes updating the pasid cache in vIOMMU and updating the
> + * pasid bindings per guest's latest pasid entry presence.
> + */
> +static void vtd_pasid_cache_sync(IntelIOMMUState *s,
> + VTDPASIDCacheInfo *pc_info)
> +{
> + if (!s->flts || !s->root_scalable || !s->dmar_enabled) {
you don't update error_happened. Is that OK?
> + return;
> + }
> +
> + /*
> + * Regards to a pasid cache invalidation, e.g. a PSI.
> + * it could be either cases of below:
> + * a) a present pasid entry moved to non-present
> + * b) a present pasid entry to be a present entry
> + * c) a non-present pasid entry moved to present
> + *
> + * Different invalidation granularity may affect different device
> + * scope and pasid scope. But for each invalidation granularity,
> + * it needs to do two steps to sync host and guest pasid binding.
> + *
> + * Here is the handling of a PSI:
> + * 1) loop all the existing vtd_as instances to update them
> + * according to the latest guest pasid entry in pasid table.
> + * this will make sure affected existing vtd_as instances
> + * cached the latest pasid entries. Also, during the loop, the
> + * host should be notified if needed. e.g. pasid unbind or pasid
> + * update. Should be able to cover case a) and case b).
> + *
> + * 2) loop all devices to cover case c)
> + * - For devices which are backed by HostIOMMUDeviceIOMMUFD instances,
> + * we loop them and check if guest pasid entry exists. If yes,
> + * it is case c), we update the pasid cache and also notify
> + * host.
> + * - For devices which are not backed by HostIOMMUDeviceIOMMUFD,
> + * it is not necessary to create pasid cache at this phase since
> + * it could be created when vIOMMU does DMA address translation.
> + * This is not yet implemented since there is no emulated
> + * pasid-capable devices today. If we have such devices in
> + * future, the pasid cache shall be created there.
> + * Other granularity follow the same steps, just with different scope
I would put all the stuff related to interactions with host in a
subsequent patch. This patch could concentrate on the IOMMU PASID cache
only. And then you would add the extra complexity of syncs with the
host. I think it would simplify the review.
> + *
> + */
> +
> + vtd_iommu_lock(s);
> + /*
> + * Step 1: loop all the existing vtd_as instances for pasid unbind and
> + * update.
> + */
> + g_hash_table_foreach_remove(s->vtd_address_spaces, vtd_flush_pasid,
> + pc_info);
> + vtd_iommu_unlock(s);
> +
> + /* TODO: Step 2: loop all the existing vtd_hiod instances for pasid
> bind. */
> +}
> +
> +static bool vtd_process_pasid_desc(IntelIOMMUState *s,
> + VTDInvDesc *inv_desc)
> +{
> + uint16_t domain_id;
> + uint32_t pasid;
> + VTDPASIDCacheInfo pc_info = {};
> + uint64_t mask[4] = {VTD_INV_DESC_PASIDC_RSVD_VAL0, VTD_INV_DESC_ALL_ONE,
> + VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE};
> +
> + if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, true,
> + __func__, "pasid cache inv")) {
> + return false;
> + }
> +
> + domain_id = VTD_INV_DESC_PASIDC_DID(inv_desc->val[0]);
> + pasid = VTD_INV_DESC_PASIDC_PASID(inv_desc->val[0]);
> +
> + switch (inv_desc->val[0] & VTD_INV_DESC_PASIDC_G) {
> + case VTD_INV_DESC_PASIDC_DSI:
> + trace_vtd_pasid_cache_dsi(domain_id);
> + pc_info.type = VTD_PASID_CACHE_DOMSI;
> + pc_info.domain_id = domain_id;
> + break;
> +
> + case VTD_INV_DESC_PASIDC_PASID_SI:
> + /* PASID selective implies a DID selective */
> + trace_vtd_pasid_cache_psi(domain_id, pasid);
> + pc_info.type = VTD_PASID_CACHE_PASIDSI;
> + pc_info.domain_id = domain_id;
> + pc_info.pasid = pasid;
> + break;
> +
> + case VTD_INV_DESC_PASIDC_GLOBAL:
> + trace_vtd_pasid_cache_gsi();
> + pc_info.type = VTD_PASID_CACHE_GLOBAL_INV;
> + break;
> +
> + default:
> + error_report_once("invalid-inv-granu-in-pc_inv_desc hi: 0x%" PRIx64
Make it end-user understandable? invalid granulrity field in PASID-cache
invalidate descriptor.
> + " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
> + return false;
> + }
> +
> + vtd_pasid_cache_sync(s, &pc_info);
> + return !pc_info.error_happened ? true : false;
> +}
> +
> static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
> VTDInvDesc *inv_desc)
> {
> @@ -3264,6 +3512,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
> }
> break;
>
> + case VTD_INV_DESC_PC:
> + trace_vtd_inv_desc("pasid-cache", inv_desc.val[1], inv_desc.val[0]);
> + if (!vtd_process_pasid_desc(s, &inv_desc)) {
> + return false;
> + }
> + break;
> +
> case VTD_INV_DESC_PIOTLB:
> trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]);
> if (!vtd_process_piotlb_desc(s, &inv_desc)) {
> @@ -3299,16 +3554,6 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
> }
> break;
>
> - /*
> - * TODO: the entity of below two cases will be implemented in future
> series.
> - * To make guest (which integrates scalable mode support patch set in
> - * iommu driver) work, just return true is enough so far.
> - */
> - case VTD_INV_DESC_PC:
> - if (s->scalable_mode) {
> - break;
> - }
> - /* fallthrough */
> default:
> error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64
> " (unknown type)", __func__, inv_desc.hi,
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index ac9e1a10aa..ae5bbfcdc0 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -24,6 +24,9 @@ vtd_inv_qi_head(uint16_t head) "read head %d"
> vtd_inv_qi_tail(uint16_t head) "write tail %d"
> vtd_inv_qi_fetch(void) ""
> vtd_context_cache_reset(void) ""
> +vtd_pasid_cache_gsi(void) ""
> +vtd_pasid_cache_dsi(uint16_t domain) "Domain selective PC invalidation
> domain 0x%"PRIx16
Would use pasid cache instead of PC
> +vtd_pasid_cache_psi(uint16_t domain, uint32_t pasid) "PASID selective PC
> invalidation domain 0x%"PRIx16" pasid 0x%"PRIx32
> vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
> vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8"
> devfn %"PRIu8" not present"
> vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t
> domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64"
> domain 0x%"PRIx16
Thanks
Eric