Hi Kunkun, On 4/15/21 4:03 AM, Kunkun Jiang wrote: > Hi Eric, > > On 2021/4/14 16:05, Auger Eric wrote: >> Hi Kunkun, >> >> On 4/14/21 3:45 AM, Kunkun Jiang wrote: >>> On 2021/4/13 20:57, Auger Eric wrote: >>>> Hi Kunkun, >>>> >>>> On 4/13/21 2:10 PM, Kunkun Jiang wrote: >>>>> Hi Eric, >>>>> >>>>> On 2021/4/11 20:08, Eric Auger wrote: >>>>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as >>>>>> there is no "caching" mode and we do not trap on map. >>>>>> >>>>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM >>>>>> through the host single stage. >>>>>> >>>>>> With nested mode, we need to setup the stage 2 and the stage 1 >>>>>> separately. This patch introduces a prereg_listener to setup >>>>>> the stage 2 mapping. >>>>>> >>>>>> The stage 1 mapping, owned by the guest, is passed to the host >>>>>> when the guest invalidates the stage 1 configuration, through >>>>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations >>>>>> are cascaded downto the host through another IOMMU MR UNMAP >>>>>> notifier. >>>>>> >>>>>> Signed-off-by: Eric Auger <eric.au...@redhat.com> >>>>>> >>>>>> --- >>>>>> >>>>>> v7 -> v8: >>>>>> - properly handle new IOMMUTLBEntry fields and especially >>>>>> propagate DOMAIN and PASID based invalidations >>>>>> >>>>>> v6 -> v7: >>>>>> - remove PASID based invalidation >>>>>> >>>>>> v5 -> v6: >>>>>> - add error_report_err() >>>>>> - remove the abort in case of nested stage case >>>>>> >>>>>> v4 -> v5: >>>>>> - use VFIO_IOMMU_SET_PASID_TABLE >>>>>> - use PCIPASIDOps for config notification >>>>>> >>>>>> v3 -> v4: >>>>>> - use iommu_inv_pasid_info for ASID invalidation >>>>>> >>>>>> v2 -> v3: >>>>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE >>>>>> - new user API >>>>>> - handle leaf >>>>>> >>>>>> v1 -> v2: >>>>>> - adapt to uapi changes >>>>>> - pass the asid >>>>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier >>>>>> --- >>>>>> hw/vfio/common.c | 139 >>>>>> +++++++++++++++++++++++++++++++++++++++++-- >>>>>> hw/vfio/pci.c | 21 +++++++ >>>>>> hw/vfio/trace-events | 2 + >>>>>> 3 files changed, 157 insertions(+), 5 deletions(-) >>>>>> >>>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >>>>>> index 0cd7ef2139..e369d451e7 100644 >>>>>> --- a/hw/vfio/common.c >>>>>> +++ b/hw/vfio/common.c >>>>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry >>>>>> *iotlb, void **vaddr, >>>>>> return true; >>>>>> } >>>>>> +/* Propagate a guest IOTLB invalidation to the host (nested >>>>>> mode) */ >>>>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry >>>>>> *iotlb) >>>>>> +{ >>>>>> + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); >>>>>> + struct vfio_iommu_type1_cache_invalidate ustruct = {}; >>>>>> + VFIOContainer *container = giommu->container; >>>>>> + int ret; >>>>>> + >>>>>> + assert(iotlb->perm == IOMMU_NONE); >>>>>> + >>>>>> + ustruct.argsz = sizeof(ustruct); >>>>>> + ustruct.flags = 0; >>>>>> + ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info); >>>>>> + ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1; >>>>>> + ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB; >>>>>> + >>>>>> + switch (iotlb->granularity) { >>>>>> + case IOMMU_INV_GRAN_DOMAIN: >>>>>> + ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN; >>>>>> + break; >>>>>> + case IOMMU_INV_GRAN_PASID: >>>>>> + { >>>>>> + struct iommu_inv_pasid_info *pasid_info; >>>>>> + int archid = -1; >>>>>> + >>>>>> + pasid_info = &ustruct.info.granu.pasid_info; >>>>>> + ustruct.info.granularity = IOMMU_INV_GRANU_PASID; >>>>>> + if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) { >>>>>> + pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID; >>>>>> + archid = iotlb->arch_id; >>>>>> + } >>>>>> + pasid_info->archid = archid; >>>>>> + trace_vfio_iommu_asid_inv_iotlb(archid); >>>>>> + break; >>>>>> + } >>>>>> + case IOMMU_INV_GRAN_ADDR: >>>>>> + { >>>>>> + hwaddr start = iotlb->iova + giommu->iommu_offset; >>>>>> + struct iommu_inv_addr_info *addr_info; >>>>>> + size_t size = iotlb->addr_mask + 1; >>>>>> + int archid = -1; >>>>>> + >>>>>> + addr_info = &ustruct.info.granu.addr_info; >>>>>> + ustruct.info.granularity = IOMMU_INV_GRANU_ADDR; >>>>>> + if (iotlb->leaf) { >>>>>> + addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF; >>>>>> + } >>>>>> + if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) { >>>>>> + addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID; >>>>>> + archid = iotlb->arch_id; >>>>>> + } >>>>>> + addr_info->archid = archid; >>>>>> + addr_info->addr = start; >>>>>> + addr_info->granule_size = size; >>>>>> + addr_info->nb_granules = 1; >>>>>> + trace_vfio_iommu_addr_inv_iotlb(archid, start, size, >>>>>> + 1, iotlb->leaf); >>>>>> + break; >>>>>> + } >>>>> Should we pass a size to host kernel here, even if vSMMU doesn't >>>>> support >>>>> RIL or guest kernel doesn't use RIL? >>>>> >>>>> It will cause TLBI issue in this scenario: Guest kernel issues a >>>>> TLBI cmd >>>>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu >>>>> passed >>>>> the iova and size (4K) to host kernel. Finally, host kernel issues a >>>>> TLBI cmd >>>>> with "range" (4K) which can not invalidate the TLB entry of 2M huge >>>>> page. >>>>> (pSMMU supports RIL) >>>> In that case the guest will loop over all 4K images belonging to the 2M >>>> huge page and invalidate each of them. This should turn into qemu >>>> notifications for each 4kB page, no? This is totally inefficient, hence >>> The guest will not loop over all 4K images belonging to the 2M huge >>> page. >>> The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page. >>> The >>> gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule": >>> >>> iommu_iotlb_gather_add_page >>> iommu_iotlb_sync >>> domain->ops->iotlb_sync >>> arm_smmu_iotlb_sync >>> arm_smmu_tlb_inv_range_domain >>> __arm_smmu_tlb_inv_range >>> >>> In the above mentioned scenario, guest kernel will issue a TLBI cmd only >>> with >>> "iova" (tg = 0). >> OK I get it now. In that case I think I should do a TG=0 invalidation >> on host even if the host does support RIL. Does that sound correct? > Yeah, that's what I thought. >> The trouble is the uapi struct does not convey such info explicitly. >> Maybe I should use nb_granules = 0 to detect that case. > It is troublesome to correct this issue. Using nb_granules = 0 may be > a good choice. >> I think for this use case RIL should be supported by the guest. Thoughts? > Yes. If guest supports RIL, the scenario mentioned above does not exist.
After further study I really wonder if it is worth supporting the case where the guest does not use range inval. Passing a nb_granules = 0 (~ size) would be OK to perform the TG=0 range invalidation on the host. however host arm_smmu_tlb_inv_range_domain() then calls arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size) which needs a size. We would need to trap guest CMD_ATC_INV and use different code paths on host smmu driver to cascade the various cache invalidations. At the moment, without maintainer giodance, I am a bit reluctant to add this extra complexity. So I would be inclined to fail in QEMU if we detect TG=0 is being used by the guest. Recent guest kernels would be a prerequisite for this use case. What do you think? Thanks Eric > > Thanks, > Kunkun Jiang >> >> Thanks >> >> Eric >>> Thanks, >>> Kunkun Jiang >>>> the support of RIL on guest side and QEMU device. >>>> >>>> What do I miss? >>>> >>>> Thanks >>>> >>>> Eric >>>>> Thanks, >>>>> Kunkun Jiang >>>>>> + } >>>>>> + >>>>>> + ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, >>>>>> &ustruct); >>>>>> + if (ret) { >>>>>> + error_report("%p: failed to invalidate CACHE (%d)", >>>>>> container, ret); >>>>>> + } >>>>>> +} >>>>>> + >>>>>> static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry >>>>>> *iotlb) >>>>>> { >>>>>> VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); >>>>>> @@ -776,6 +843,35 @@ static void >>>>>> vfio_dma_unmap_ram_section(VFIOContainer *container, >>>>>> } >>>>>> } >>>>>> +static void vfio_prereg_listener_region_add(MemoryListener >>>>>> *listener, >>>>>> + MemoryRegionSection >>>>>> *section) >>>>>> +{ >>>>>> + VFIOContainer *container = >>>>>> + container_of(listener, VFIOContainer, prereg_listener); >>>>>> + Error *err = NULL; >>>>>> + >>>>>> + if (!memory_region_is_ram(section->mr)) { >>>>>> + return; >>>>>> + } >>>>>> + >>>>>> + vfio_dma_map_ram_section(container, section, &err); >>>>>> + if (err) { >>>>>> + error_report_err(err); >>>>>> + } >>>>>> +} >>>>>> +static void vfio_prereg_listener_region_del(MemoryListener >>>>>> *listener, >>>>>> + MemoryRegionSection *section) >>>>>> +{ >>>>>> + VFIOContainer *container = >>>>>> + container_of(listener, VFIOContainer, prereg_listener); >>>>>> + >>>>>> + if (!memory_region_is_ram(section->mr)) { >>>>>> + return; >>>>>> + } >>>>>> + >>>>>> + vfio_dma_unmap_ram_section(container, section); >>>>>> +} >>>>>> + >>>>>> static void vfio_listener_region_add(MemoryListener *listener, >>>>>> MemoryRegionSection >>>>>> *section) >>>>>> { >>>>>> @@ -879,9 +975,10 @@ static void >>>>>> vfio_listener_region_add(MemoryListener *listener, >>>>>> memory_region_ref(section->mr); >>>>>> if (memory_region_is_iommu(section->mr)) { >>>>>> + IOMMUNotify notify; >>>>>> VFIOGuestIOMMU *giommu; >>>>>> IOMMUMemoryRegion *iommu_mr = >>>>>> IOMMU_MEMORY_REGION(section->mr); >>>>>> - int iommu_idx; >>>>>> + int iommu_idx, flags; >>>>>> trace_vfio_listener_region_add_iommu(iova, end); >>>>>> /* >>>>>> @@ -900,8 +997,18 @@ static void >>>>>> vfio_listener_region_add(MemoryListener *listener, >>>>>> llend = int128_sub(llend, int128_one()); >>>>>> iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, >>>>>> >>>>>> MEMTXATTRS_UNSPECIFIED); >>>>>> - iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, >>>>>> - IOMMU_NOTIFIER_IOTLB_EVENTS, >>>>>> + >>>>>> + if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { >>>>>> + /* IOTLB unmap notifier to propagate guest IOTLB >>>>>> invalidations */ >>>>>> + flags = IOMMU_NOTIFIER_UNMAP; >>>>>> + notify = vfio_iommu_unmap_notify; >>>>>> + } else { >>>>>> + /* MAP/UNMAP IOTLB notifier */ >>>>>> + flags = IOMMU_NOTIFIER_IOTLB_EVENTS; >>>>>> + notify = vfio_iommu_map_notify; >>>>>> + } >>>>>> + >>>>>> + iommu_notifier_init(&giommu->n, notify, flags, >>>>>> section->offset_within_region, >>>>>> int128_get64(llend), >>>>>> iommu_idx); >>>>>> @@ -921,7 +1028,9 @@ static void >>>>>> vfio_listener_region_add(MemoryListener *listener, >>>>>> goto fail; >>>>>> } >>>>>> QLIST_INSERT_HEAD(&container->giommu_list, giommu, >>>>>> giommu_next); >>>>>> - memory_region_iommu_replay(giommu->iommu, &giommu->n); >>>>>> + if (flags & IOMMU_NOTIFIER_MAP) { >>>>>> + memory_region_iommu_replay(giommu->iommu, &giommu->n); >>>>>> + } >>>>>> return; >>>>>> } >>>>>> @@ -1205,10 +1314,16 @@ static const MemoryListener >>>>>> vfio_memory_listener = { >>>>>> .log_sync = vfio_listener_log_sync, >>>>>> }; >>>>>> +static MemoryListener vfio_memory_prereg_listener = { >>>>>> + .region_add = vfio_prereg_listener_region_add, >>>>>> + .region_del = vfio_prereg_listener_region_del, >>>>>> +}; >>>>>> + >>>>>> static void vfio_listener_release(VFIOContainer *container) >>>>>> { >>>>>> memory_listener_unregister(&container->listener); >>>>>> - if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { >>>>>> + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || >>>>>> + container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { >>>>>> memory_listener_unregister(&container->prereg_listener); >>>>>> } >>>>>> } >>>>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup >>>>>> *group, AddressSpace *as, >>>>>> vfio_get_iommu_info_migration(container, info); >>>>>> } >>>>>> g_free(info); >>>>>> + >>>>>> + if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { >>>>>> + container->prereg_listener = >>>>>> vfio_memory_prereg_listener; >>>>>> + memory_listener_register(&container->prereg_listener, >>>>>> + &address_space_memory); >>>>>> + if (container->error) { >>>>>> + >>>>>> memory_listener_unregister(&container->prereg_listener); >>>>>> + ret = -1; >>>>>> + error_propagate_prepend(errp, container->error, >>>>>> + "RAM memory listener >>>>>> initialization failed " >>>>>> + "for container"); >>>>>> + goto free_container_exit; >>>>>> + } >>>>>> + } >>>>>> break; >>>>>> } >>>>>> case VFIO_SPAPR_TCE_v2_IOMMU: >>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >>>>>> index 5c65aa0a98..cad7deec71 100644 >>>>>> --- a/hw/vfio/pci.c >>>>>> +++ b/hw/vfio/pci.c >>>>>> @@ -2773,6 +2773,25 @@ static void >>>>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev) >>>>>> vdev->req_enabled = false; >>>>>> } >>>>>> +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t >>>>>> devfn, >>>>>> + IOMMUConfig *config) >>>>>> +{ >>>>>> + PCIDevice *pdev = bus->devices[devfn]; >>>>>> + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); >>>>>> + VFIOContainer *container = vdev->vbasedev.group->container; >>>>>> + struct vfio_iommu_type1_set_pasid_table info; >>>>>> + >>>>>> + info.argsz = sizeof(info); >>>>>> + info.flags = VFIO_PASID_TABLE_FLAG_SET; >>>>>> + memcpy(&info.config, &config->pasid_cfg, >>>>>> sizeof(config->pasid_cfg)); >>>>>> + >>>>>> + return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info); >>>>>> +} >>>>>> + >>>>>> +static PCIPASIDOps vfio_pci_pasid_ops = { >>>>>> + .set_pasid_table = vfio_iommu_set_pasid_table, >>>>>> +}; >>>>>> + >>>>>> static void vfio_realize(PCIDevice *pdev, Error **errp) >>>>>> { >>>>>> VFIOPCIDevice *vdev = VFIO_PCI(pdev); >>>>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error >>>>>> **errp) >>>>>> vfio_register_req_notifier(vdev); >>>>>> vfio_setup_resetfn_quirk(vdev); >>>>>> + pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops); >>>>>> + >>>>>> return; >>>>>> out_deregister: >>>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events >>>>>> index 936d29d150..43696afc15 100644 >>>>>> --- a/hw/vfio/trace-events >>>>>> +++ b/hw/vfio/trace-events >>>>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name, >>>>>> int index, int nr_areas) "Devic >>>>>> vfio_region_sparse_mmap_entry(int i, unsigned long start, >>>>>> unsigned >>>>>> long end) "sparse entry %d [0x%lx - 0x%lx]" >>>>>> vfio_get_dev_region(const char *name, int index, uint32_t type, >>>>>> uint32_t subtype) "%s index %d, %08x/%0x8" >>>>>> vfio_dma_unmap_overflow_workaround(void) "" >>>>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, >>>>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, >>>>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" >>>>>> leaf=%d" >>>>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate >>>>>> asid=%d" >>>>>> # platform.c >>>>>> vfio_platform_base_device_init(char *name, int groupid) "%s >>>>>> belongs >>>>>> to group #%d" >>>>> >>>> . >>> >> . > >