On Wed, 2026-05-27 at 01:46 -0400, Zhenzhong Duan wrote: > Caution: External email. Do not open attachments or click links, unless this > email comes from a known sender and you know the content is safe. > > > Structure VTDAddressSpace includes some elements suitable for emulated > device and passthrough device without PASID, e.g., address space, > different memory regions, etc, it is also protected by vtd iommu lock, > all these are useless and become a burden for passthrough device with > PASID. > > When there are lots of PASIDs used in one device, the AS and MRs are > all registered to memory core and impact the whole system performance. > > So instead of using VTDAddressSpace to cache pasid entry for each pasid > of a passthrough device, we define a light weight structure > VTDAccelPASIDCacheEntry with only necessary elements for each pasid. We > will use this struct as a parameter to conduct binding/unbinding to > nested hwpt and to record the current bound nested hwpt. It's also > designed to support IOMMU_NO_PASID. > > VTDAccelPASIDCacheEntry is designed to only be used in intel_iommu_accel.c,
For consistency with the next line: s@intel_iommu_accel.c@hw/i386/intel_iommu_accel.c@ > similarly VTDPASIDCacheEntry should only be used in hw/i386/intel_iommu.c > > When guest creates new PASID entries, QEMU will capture the pc_inv_dsc > (pasid cache invalidation) request, walk through each pasid in each > passthrough device for valid pasid entries, create a new > VTDAccelPASIDCacheEntry if not existing yet. > > IOMMU_NO_PASID of passthrough device still need to register MRs in case > guest does not operate in scalable mode. So for IOMMU_NO_PASID, we have > both VTDPASIDCacheEntry and VTDAccelPASIDCacheEntry. > > Co-developed-by: Yi Liu <[[email protected]](mailto:[email protected])> > Signed-off-by: Yi Liu <[[email protected]](mailto:[email protected])> > Signed-off-by: Zhenzhong Duan > <[[email protected]](mailto:[email protected])> > Tested-by: Xudong Hao <[[email protected]](mailto:[email protected])> > --- > hw/i386/intel_iommu_accel.h | 13 +++ > hw/i386/intel_iommu_internal.h | 8 ++ > hw/i386/intel_iommu.c | 3 + > hw/i386/intel_iommu_accel.c | 156 +++++++++++++++++++++++++++++++++ > 4 files changed, 180 insertions(+) > > diff --git a/hw/i386/intel_iommu_accel.h b/hw/i386/intel_iommu_accel.h > index e5f0b077b4..c9b1823745 100644 > --- a/hw/i386/intel_iommu_accel.h > +++ b/hw/i386/intel_iommu_accel.h > @@ -12,6 +12,13 @@ > #define HW_I386_INTEL_IOMMU_ACCEL_H > #include CONFIG_DEVICES > > +typedef struct VTDAccelPASIDCacheEntry { > + VTDHostIOMMUDevice *vtd_hiod; > + VTDPASIDEntry pasid_entry; > + uint32_t pasid; > + QLIST_ENTRY(VTDAccelPASIDCacheEntry) next; > +} VTDAccelPASIDCacheEntry; > + > #ifdef CONFIG_VTD_ACCEL > bool vtd_check_hiod_accel(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hiod, > Error **errp); > @@ -20,6 +27,7 @@ bool vtd_propagate_guest_pasid(VTDAddressSpace *vtd_as, > Error **errp); > void vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s, uint16_t > domain_id, > uint32_t pasid, hwaddr addr, > uint64_t npages, bool ih); > +void 
vtd_accel_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo > *pc_info); > void vtd_iommu_ops_update_accel(PCIIOMMUOps *ops); > #else > static inline bool vtd_check_hiod_accel(IntelIOMMUState *s, > @@ -49,6 +57,11 @@ static inline void > vtd_flush_host_piotlb_all_locked(IntelIOMMUState *s, > { > } > > +static inline void vtd_accel_pasid_cache_sync(IntelIOMMUState *s, > + VTDPASIDCacheInfo *pc_info) > +{ > +} > + > static inline void vtd_iommu_ops_update_accel(PCIIOMMUOps *ops) > { > } > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h > index ee59df09f8..9307825017 100644 > --- a/hw/i386/intel_iommu_internal.h > +++ b/hw/i386/intel_iommu_internal.h > @@ -615,6 +615,7 @@ typedef struct VTDRootEntry VTDRootEntry; > #define VTD_CTX_ENTRY_LEGACY_SIZE 16 > #define VTD_CTX_ENTRY_SCALABLE_SIZE 32 > > +#define VTD_SM_CONTEXT_ENTRY_PDTS(x) extract64((x)->val[0], 9, 3) > #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL | ~VTD_HAW_MASK(aw)) > #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL > #define VTD_SM_CONTEXT_ENTRY_PRE 0x10ULL > @@ -645,6 +646,7 @@ typedef struct VTDPIOTLBInvInfo { > #define VTD_PASID_DIR_BITS_MASK (0x3fffULL) > #define VTD_PASID_DIR_INDEX(pasid) (((pasid) >> 6) & > VTD_PASID_DIR_BITS_MASK) > #define VTD_PASID_DIR_FPD (1ULL << 1) /* Fault Processing > Disable */ > +#define VTD_PASID_TABLE_ENTRY_NUM (1ULL << 6) > #define VTD_PASID_TABLE_BITS_MASK (0x3fULL) > #define VTD_PASID_TABLE_INDEX(pasid) ((pasid) & VTD_PASID_TABLE_BITS_MASK) > #define VTD_PASID_ENTRY_FPD (1ULL << 1) /* Fault Processing > Disable */ > @@ -710,6 +712,7 @@ typedef struct VTDHostIOMMUDevice { > PCIBus *bus; > uint8_t devfn; > HostIOMMUDevice *hiod; > + QLIST_HEAD(, VTDAccelPASIDCacheEntry) pasid_cache_list; > } VTDHostIOMMUDevice; > > /* > @@ -767,6 +770,11 @@ static inline int vtd_pasid_entry_compare(VTDPASIDEntry > *p1, VTDPASIDEntry *p2) > return memcmp(p1, p2, sizeof(*p1)); > } > > +static inline uint32_t 
vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce) > +{ > + return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce) + 7); > +} > + > int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base, uint32_t pasid, > > VTDPASIDDirEntry *pdire); > int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s, uint32_t pasid, > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 2deea8fbe2..94a6a740f1 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -3181,6 +3181,8 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s, > VTDPASIDCacheInfo *pc_info) > g_hash_table_foreach(s->vtd_address_spaces, vtd_pasid_cache_sync_locked, > > pc_info); > vtd_iommu_unlock(s); > + > + vtd_accel_pasid_cache_sync(s, pc_info); > } > > static void vtd_replay_pasid_bindings_all(IntelIOMMUState *s) > @@ -4751,6 +4753,7 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void > *opaque, int devfn, > vtd_hiod->devfn = (uint8_t)devfn; > vtd_hiod->iommu_state = s; > vtd_hiod->hiod = hiod; > + QLIST_INIT(&vtd_hiod->pasid_cache_list); > > if (!vtd_check_hiod(s, vtd_hiod, errp)) { > g_free(vtd_hiod); > diff --git a/hw/i386/intel_iommu_accel.c b/hw/i386/intel_iommu_accel.c > index 10bdbba632..a66d63b4c8 100644 > --- a/hw/i386/intel_iommu_accel.c > +++ b/hw/i386/intel_iommu_accel.c > @@ -259,6 +259,162 @@ void vtd_flush_host_piotlb_all_locked(IntelIOMMUState > *s, uint16_t domain_id, > vtd_flush_host_piotlb_locked, &piotlb_info); > } > > +static void vtd_accel_fill_pc(VTDHostIOMMUDevice *vtd_hiod, uint32_t pasid, > + VTDPASIDEntry *pe) > +{ > + VTDAccelPASIDCacheEntry *vtd_pce; > + > + QLIST_FOREACH(vtd_pce, &vtd_hiod->pasid_cache_list, next) { > + if (vtd_pce->pasid == pasid) { > + if (vtd_pasid_entry_compare(pe, &vtd_pce->pasid_entry)) { > + vtd_pce->pasid_entry = *pe; > + } > + return; > + } > + } > + > + vtd_pce = g_malloc0(sizeof(VTDAccelPASIDCacheEntry)); > + vtd_pce->vtd_hiod = vtd_hiod; > + vtd_pce->pasid = pasid; > + vtd_pce->pasid_entry = *pe; > + 
QLIST_INSERT_HEAD(&vtd_hiod->pasid_cache_list, vtd_pce, next); > +} > + > +/* > + * This function walks over PASID range within [start, end) in a single > + * PASID table for entries matching @info type/did, then create > + * VTDAccelPASIDCacheEntry if not exist yet. > + */ > +static void vtd_sm_pasid_table_walk_one(VTDHostIOMMUDevice *vtd_hiod, > + dma_addr_t pt_base, int start, int > end, > + VTDPASIDCacheInfo *info) > +{ > + IntelIOMMUState *s = vtd_hiod->iommu_state; > + VTDPASIDEntry pe; > + int pasid; > + > + for (pasid = start; pasid < end; pasid++) { > + if (vtd_get_pe_in_pasid_leaf_table(s, pasid, pt_base, &pe) || > + !vtd_pe_present(&pe)) { > + continue; > + } > + > + if ((info->type == VTD_INV_DESC_PASIDC_G_DSI || > + info->type == VTD_INV_DESC_PASIDC_G_PASID_SI) && > + (info->did != VTD_SM_PASID_ENTRY_DID(&pe))) { > + /* > + * VTD_PASID_CACHE_DOMSI and VTD_PASID_CACHE_PASIDSI > + * requires domain id check. If domain id check fail, > + * go to next pasid. > + */ > + continue; > + } > + > + vtd_accel_fill_pc(vtd_hiod, pasid, &pe); > + } > +} > + > +/* > + * In VT-d scalable mode translation, PASID dir + PASID table is used. > + * This function aims at looping over a range of PASIDs in the given > + * two level table to identify the pasid config in guest. > + */ > +static void vtd_sm_pasid_table_walk(VTDHostIOMMUDevice *vtd_hiod, > + dma_addr_t pdt_base, int start, int end, > > + VTDPASIDCacheInfo *info) > +{ > + VTDPASIDDirEntry pdire; > + int pasid = start; > + int pasid_next; > + dma_addr_t pt_base; > + > + while (pasid < end) { > + pasid_next = (pasid + VTD_PASID_TABLE_ENTRY_NUM) & > + ~(VTD_PASID_TABLE_ENTRY_NUM - 1); > + pasid_next = pasid_next < end ? 
pasid_next : end; > + > + if (!vtd_get_pdire_from_pdir_table(pdt_base, pasid, &pdire) > + && vtd_pdire_present(&pdire)) { > + pt_base = pdire.val & VTD_PASID_TABLE_BASE_ADDR_MASK; > + vtd_sm_pasid_table_walk_one(vtd_hiod, pt_base, pasid, > pasid_next, > + info); > + } > + pasid = pasid_next; > + } > +} > + > +static void vtd_accel_replay_pasid_bind_for_dev(VTDHostIOMMUDevice > *vtd_hiod, > + int start, int end, > + VTDPASIDCacheInfo *pc_info) > +{ > + IntelIOMMUState *s = vtd_hiod->iommu_state; > + VTDContextEntry ce; > + int dev_max_pasid = 1 << vtd_hiod->hiod->caps.max_pasid_log2; > + > + if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_hiod->bus), > + vtd_hiod->devfn, &ce)) { > + VTDPASIDCacheInfo walk_info = *pc_info; > + uint32_t ce_max_pasid = vtd_sm_ce_get_pdt_entry_num(&ce) * > + VTD_PASID_TABLE_ENTRY_NUM; > + > + end = MIN(end, MIN(dev_max_pasid, ce_max_pasid)); > + > + vtd_sm_pasid_table_walk(vtd_hiod, VTD_CE_GET_PASID_DIR_TABLE(&ce), > + start, end, &walk_info); > + } > +} > + > +/* > + * This function replays the guest pasid bindings by walking the two level > + * guest PASID table. For each valid pasid entry, it creates an entry > + * VTDAccelPASIDCacheEntry dynamically if not exist yet. This entry holds > + * info specific to a pasid > + */ > +void vtd_accel_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo > *pc_info) > +{ > + int start = IOMMU_NO_PASID, end = 1 << s->pasid; > + VTDHostIOMMUDevice *vtd_hiod; > + GHashTableIter hiod_it; > + > + if (!s->fsts) { > + return; > + } > + > + switch (pc_info->type) { > + case VTD_INV_DESC_PASIDC_G_PASID_SI: > + start = pc_info->pasid; > + end = pc_info->pasid + 1; > + /* fall through */ > + case VTD_INV_DESC_PASIDC_G_DSI: > + /* > + * loop all assigned devices, do domain id check in > + * vtd_sm_pasid_table_walk_one() after get pasid entry. 
> + */ > + break; > + case VTD_INV_DESC_PASIDC_G_GLOBAL: > + /* loop all assigned devices */ > + break; > + default: > + g_assert_not_reached(); > + } > + > + /* > + * Loop all the vtd_hiod instances to sync the "pasid cache" per the > + * guest pasid configuration. > + * > + * VTD translation callback never accesses vtd_hiod and its > corresponding > + * cached pasid entry, so no iommu lock needed here. > + */ > + g_hash_table_iter_init(&hiod_it, s->vtd_host_iommu_dev); > + while (g_hash_table_iter_next(&hiod_it, NULL, (void **)&vtd_hiod)) { > + if (!object_dynamic_cast(OBJECT(vtd_hiod->hiod), > + TYPE_HOST_IOMMU_DEVICE_IOMMUFD)) { > + continue; > + } > + vtd_accel_replay_pasid_bind_for_dev(vtd_hiod, start, end, pc_info); > + } > +} > + > static uint64_t vtd_get_host_iommu_quirks(uint32_t type, > void *caps, uint32_t size) > { > -- > 2.47.3 >
