Hi Kunkun, On 4/14/21 3:45 AM, Kunkun Jiang wrote: > On 2021/4/13 20:57, Auger Eric wrote: >> Hi Kunkun, >> >> On 4/13/21 2:10 PM, Kunkun Jiang wrote: >>> Hi Eric, >>> >>> On 2021/4/11 20:08, Eric Auger wrote: >>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as >>>> there is no "caching" mode and we do not trap on map. >>>> >>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM >>>> through the host single stage. >>>> >>>> With nested mode, we need to setup the stage 2 and the stage 1 >>>> separately. This patch introduces a prereg_listener to setup >>>> the stage 2 mapping. >>>> >>>> The stage 1 mapping, owned by the guest, is passed to the host >>>> when the guest invalidates the stage 1 configuration, through >>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations >>>> are cascaded downto the host through another IOMMU MR UNMAP >>>> notifier. >>>> >>>> Signed-off-by: Eric Auger <eric.au...@redhat.com> >>>> >>>> --- >>>> >>>> v7 -> v8: >>>> - properly handle new IOMMUTLBEntry fields and especially >>>> propagate DOMAIN and PASID based invalidations >>>> >>>> v6 -> v7: >>>> - remove PASID based invalidation >>>> >>>> v5 -> v6: >>>> - add error_report_err() >>>> - remove the abort in case of nested stage case >>>> >>>> v4 -> v5: >>>> - use VFIO_IOMMU_SET_PASID_TABLE >>>> - use PCIPASIDOps for config notification >>>> >>>> v3 -> v4: >>>> - use iommu_inv_pasid_info for ASID invalidation >>>> >>>> v2 -> v3: >>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE >>>> - new user API >>>> - handle leaf >>>> >>>> v1 -> v2: >>>> - adapt to uapi changes >>>> - pass the asid >>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier >>>> --- >>>> hw/vfio/common.c | 139 >>>> +++++++++++++++++++++++++++++++++++++++++-- >>>> hw/vfio/pci.c | 21 +++++++ >>>> hw/vfio/trace-events | 2 + >>>> 3 files changed, 157 insertions(+), 5 deletions(-) >>>> >>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >>>> index 0cd7ef2139..e369d451e7 100644 >>>> --- a/hw/vfio/common.c >>>> +++ b/hw/vfio/common.c >>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry >>>> *iotlb, void **vaddr, >>>> return true; >>>> } >>>> +/* Propagate a guest IOTLB invalidation to the host (nested >>>> mode) */ >>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry >>>> *iotlb) >>>> +{ >>>> + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); >>>> + struct vfio_iommu_type1_cache_invalidate ustruct = {}; >>>> + VFIOContainer *container = giommu->container; >>>> + int ret; >>>> + >>>> + assert(iotlb->perm == IOMMU_NONE); >>>> + >>>> + ustruct.argsz = sizeof(ustruct); >>>> + ustruct.flags = 0; >>>> + ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info); >>>> + ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1; >>>> + ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB; >>>> + >>>> + switch (iotlb->granularity) { >>>> + case IOMMU_INV_GRAN_DOMAIN: >>>> + ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN; >>>> + break; >>>> + case IOMMU_INV_GRAN_PASID: >>>> + { >>>> + struct iommu_inv_pasid_info *pasid_info; >>>> + int archid = -1; >>>> + >>>> + pasid_info = &ustruct.info.granu.pasid_info; >>>> + ustruct.info.granularity = IOMMU_INV_GRANU_PASID; >>>> + if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) { >>>> + pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID; >>>> + archid = iotlb->arch_id; >>>> + } >>>> + pasid_info->archid = archid; >>>> + trace_vfio_iommu_asid_inv_iotlb(archid); >>>> + break; >>>> + } >>>> + case IOMMU_INV_GRAN_ADDR: >>>> + { >>>> + hwaddr start = iotlb->iova + giommu->iommu_offset; >>>> + struct iommu_inv_addr_info *addr_info; >>>> + size_t size = iotlb->addr_mask + 1; >>>> + int archid = -1; >>>> + >>>> + addr_info = &ustruct.info.granu.addr_info; >>>> + ustruct.info.granularity = IOMMU_INV_GRANU_ADDR; >>>> + if (iotlb->leaf) { >>>> + addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF; >>>> + } >>>> + if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) { >>>> + addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID; >>>> + archid = iotlb->arch_id; >>>> + } >>>> + addr_info->archid = archid; >>>> + addr_info->addr = start; >>>> + addr_info->granule_size = size; >>>> + addr_info->nb_granules = 1; >>>> + trace_vfio_iommu_addr_inv_iotlb(archid, start, size, >>>> + 1, iotlb->leaf); >>>> + break; >>>> + } >>> Should we pass a size to host kernel here, even if vSMMU doesn't >>> support >>> RIL or guest kernel doesn't use RIL? >>> >>> It will cause TLBI issue in this scenario: Guest kernel issues a >>> TLBI cmd >>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu passed >>> the iova and size (4K) to host kernel. Finally, host kernel issues a >>> TLBI cmd >>> with "range" (4K) which can not invalidate the TLB entry of 2M huge >>> page. >>> (pSMMU supports RIL) >> In that case the guest will loop over all 4K images belonging to the 2M >> huge page and invalidate each of them. This should turn into qemu >> notifications for each 4kB page, no? This is totally inefficient, hence > The guest will not loop over all 4K images belonging to the 2M huge page. > The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page. The > gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule": > > iommu_iotlb_gather_add_page > iommu_iotlb_sync > domain->ops->iotlb_sync > arm_smmu_iotlb_sync > arm_smmu_tlb_inv_range_domain > __arm_smmu_tlb_inv_range > > In the above mentioned scenario, guest kernel will issue a TLBI cmd only > with > "iova" (tg = 0).
I am busy fixing this case. Could you share your guest use case. It is DPDK? In the positive I would be interested in getting your DPDK setup/commands. Thank you in advance Eric > > Thanks, > Kunkun Jiang >> the support of RIL on guest side and QEMU device. >> >> What do I miss? >> >> Thanks >> >> Eric >>> Thanks, >>> Kunkun Jiang >>>> + } >>>> + >>>> + ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct); >>>> + if (ret) { >>>> + error_report("%p: failed to invalidate CACHE (%d)", >>>> container, ret); >>>> + } >>>> +} >>>> + >>>> static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry >>>> *iotlb) >>>> { >>>> VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); >>>> @@ -776,6 +843,35 @@ static void >>>> vfio_dma_unmap_ram_section(VFIOContainer *container, >>>> } >>>> } >>>> +static void vfio_prereg_listener_region_add(MemoryListener >>>> *listener, >>>> + MemoryRegionSection >>>> *section) >>>> +{ >>>> + VFIOContainer *container = >>>> + container_of(listener, VFIOContainer, prereg_listener); >>>> + Error *err = NULL; >>>> + >>>> + if (!memory_region_is_ram(section->mr)) { >>>> + return; >>>> + } >>>> + >>>> + vfio_dma_map_ram_section(container, section, &err); >>>> + if (err) { >>>> + error_report_err(err); >>>> + } >>>> +} >>>> +static void vfio_prereg_listener_region_del(MemoryListener *listener, >>>> + MemoryRegionSection *section) >>>> +{ >>>> + VFIOContainer *container = >>>> + container_of(listener, VFIOContainer, prereg_listener); >>>> + >>>> + if (!memory_region_is_ram(section->mr)) { >>>> + return; >>>> + } >>>> + >>>> + vfio_dma_unmap_ram_section(container, section); >>>> +} >>>> + >>>> static void vfio_listener_region_add(MemoryListener *listener, >>>> MemoryRegionSection *section) >>>> { >>>> @@ -879,9 +975,10 @@ static void >>>> vfio_listener_region_add(MemoryListener *listener, >>>> memory_region_ref(section->mr); >>>> if (memory_region_is_iommu(section->mr)) { >>>> + IOMMUNotify notify; >>>> VFIOGuestIOMMU *giommu; >>>> IOMMUMemoryRegion *iommu_mr = >>>> IOMMU_MEMORY_REGION(section->mr); >>>> - int iommu_idx; >>>> + int iommu_idx, flags; >>>> trace_vfio_listener_region_add_iommu(iova, end); >>>> /* >>>> @@ -900,8 +997,18 @@ static void >>>> vfio_listener_region_add(MemoryListener *listener, >>>> llend = int128_sub(llend, int128_one()); >>>> iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, >>>> >>>> MEMTXATTRS_UNSPECIFIED); >>>> - iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, >>>> - IOMMU_NOTIFIER_IOTLB_EVENTS, >>>> + >>>> + if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { >>>> + /* IOTLB unmap notifier to propagate guest IOTLB >>>> invalidations */ >>>> + flags = IOMMU_NOTIFIER_UNMAP; >>>> + notify = vfio_iommu_unmap_notify; >>>> + } else { >>>> + /* MAP/UNMAP IOTLB notifier */ >>>> + flags = IOMMU_NOTIFIER_IOTLB_EVENTS; >>>> + notify = vfio_iommu_map_notify; >>>> + } >>>> + >>>> + iommu_notifier_init(&giommu->n, notify, flags, >>>> section->offset_within_region, >>>> int128_get64(llend), >>>> iommu_idx); >>>> @@ -921,7 +1028,9 @@ static void >>>> vfio_listener_region_add(MemoryListener *listener, >>>> goto fail; >>>> } >>>> QLIST_INSERT_HEAD(&container->giommu_list, giommu, >>>> giommu_next); >>>> - memory_region_iommu_replay(giommu->iommu, &giommu->n); >>>> + if (flags & IOMMU_NOTIFIER_MAP) { >>>> + memory_region_iommu_replay(giommu->iommu, &giommu->n); >>>> + } >>>> return; >>>> } >>>> @@ -1205,10 +1314,16 @@ static const MemoryListener >>>> vfio_memory_listener = { >>>> .log_sync = vfio_listener_log_sync, >>>> }; >>>> +static MemoryListener vfio_memory_prereg_listener = { >>>> + .region_add = vfio_prereg_listener_region_add, >>>> + .region_del = vfio_prereg_listener_region_del, >>>> +}; >>>> + >>>> static void vfio_listener_release(VFIOContainer *container) >>>> { >>>> memory_listener_unregister(&container->listener); >>>> - if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { >>>> + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || >>>> + container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { >>>> memory_listener_unregister(&container->prereg_listener); >>>> } >>>> } >>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup >>>> *group, AddressSpace *as, >>>> vfio_get_iommu_info_migration(container, info); >>>> } >>>> g_free(info); >>>> + >>>> + if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { >>>> + container->prereg_listener = vfio_memory_prereg_listener; >>>> + memory_listener_register(&container->prereg_listener, >>>> + &address_space_memory); >>>> + if (container->error) { >>>> + >>>> memory_listener_unregister(&container->prereg_listener); >>>> + ret = -1; >>>> + error_propagate_prepend(errp, container->error, >>>> + "RAM memory listener >>>> initialization failed " >>>> + "for container"); >>>> + goto free_container_exit; >>>> + } >>>> + } >>>> break; >>>> } >>>> case VFIO_SPAPR_TCE_v2_IOMMU: >>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >>>> index 5c65aa0a98..cad7deec71 100644 >>>> --- a/hw/vfio/pci.c >>>> +++ b/hw/vfio/pci.c >>>> @@ -2773,6 +2773,25 @@ static void >>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev) >>>> vdev->req_enabled = false; >>>> } >>>> +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn, >>>> + IOMMUConfig *config) >>>> +{ >>>> + PCIDevice *pdev = bus->devices[devfn]; >>>> + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); >>>> + VFIOContainer *container = vdev->vbasedev.group->container; >>>> + struct vfio_iommu_type1_set_pasid_table info; >>>> + >>>> + info.argsz = sizeof(info); >>>> + info.flags = VFIO_PASID_TABLE_FLAG_SET; >>>> + memcpy(&info.config, &config->pasid_cfg, >>>> sizeof(config->pasid_cfg)); >>>> + >>>> + return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info); >>>> +} >>>> + >>>> +static PCIPASIDOps vfio_pci_pasid_ops = { >>>> + .set_pasid_table = vfio_iommu_set_pasid_table, >>>> +}; >>>> + >>>> static void vfio_realize(PCIDevice *pdev, Error **errp) >>>> { >>>> VFIOPCIDevice *vdev = VFIO_PCI(pdev); >>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error >>>> **errp) >>>> vfio_register_req_notifier(vdev); >>>> vfio_setup_resetfn_quirk(vdev); >>>> + pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops); >>>> + >>>> return; >>>> out_deregister: >>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events >>>> index 936d29d150..43696afc15 100644 >>>> --- a/hw/vfio/trace-events >>>> +++ b/hw/vfio/trace-events >>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name, >>>> int index, int nr_areas) "Devic >>>> vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned >>>> long end) "sparse entry %d [0x%lx - 0x%lx]" >>>> vfio_get_dev_region(const char *name, int index, uint32_t type, >>>> uint32_t subtype) "%s index %d, %08x/%0x8" >>>> vfio_dma_unmap_overflow_workaround(void) "" >>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, >>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, >>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" >>>> leaf=%d" >>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d" >>>> # platform.c >>>> vfio_platform_base_device_init(char *name, int groupid) "%s belongs >>>> to group #%d" >>> >>> >> . > >