This captures the guest PASID table entry modifications and
propagates the changes to host to attach a hwpt with type determined
per guest IOMMU mdoe and PGTT configuration.

When PGTT is Pass-through(100b), the hwpt on host side is a stage-2
page table(GPA->HPA). When PGTT is First-stage Translation only(001b),
vIOMMU reuse hwpt(GPA->HPA) provided by VFIO as nested parent to
construct nested page table.

When guest decides to use legacy mode then vIOMMU switches the MRs of
the device's AS, hence the IOAS created by VFIO container would be
switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
switched to IOMMU MR. So it is able to support shadowing the guest IO
page table.

Co-Authored-by: Yi Liu <yi.l....@intel.com>
Signed-off-by: Yi Liu <yi.l....@intel.com>
Signed-off-by: Yi Sun <yi.y....@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com>
---
 hw/i386/intel_iommu.c          | 221 ++++++++++++++++++++++++++++++++-
 hw/i386/intel_iommu_internal.h |  14 ++-
 hw/i386/trace-events           |   3 +
 include/hw/i386/intel_iommu.h  |   1 +
 4 files changed, 233 insertions(+), 6 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 030862fb2f..c35673eb58 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -20,6 +20,7 @@
  */
 
 #include "qemu/osdep.h"
+#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 #include "qapi/error.h"
@@ -41,6 +42,9 @@
 #include "migration/vmstate.h"
 #include "trace.h"
 #include "system/iommufd.h"
+#ifdef CONFIG_IOMMUFD
+#include <linux/iommufd.h>
+#endif
 
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
@@ -50,10 +54,9 @@
 
 /* pe operations */
 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
-#define VTD_PE_GET_FL_LEVEL(pe) \
-    (4 + (((pe)->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM))
 #define VTD_PE_GET_SL_LEVEL(pe) \
     (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
+#define VTD_PE_GET_FL_LEVEL(pe) (VTD_SM_PASID_ENTRY_FSPM(pe) + 4)
 
 /*
  * PCI bus number (or SID) is not reliable since the device is usaully
@@ -834,6 +837,31 @@ static inline uint32_t 
vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
     return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce) + 7);
 }
 
+static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
+{
+    return pe->val[2] & VTD_SM_PASID_ENTRY_FSPTPTR;
+}
+
+/*
+ * Stage-1 IOVA address width: 48 bits for 4-level paging(FSPM=00)
+ *                             57 bits for 5-level paging(FSPM=01)
+ */
+static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
+{
+    return 48 + VTD_SM_PASID_ENTRY_FSPM(pe) * 9;
+}
+
+static inline bool vtd_pe_pgtt_is_pt(VTDPASIDEntry *pe)
+{
+    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_PT);
+}
+
+/* check if pgtt is first stage translation */
+static inline bool vtd_pe_pgtt_is_flt(VTDPASIDEntry *pe)
+{
+    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_FLT);
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
     return pdire->val & 1;
@@ -1131,7 +1159,7 @@ static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState 
*s,
     if (s->root_scalable) {
         vtd_ce_get_pasid_entry(s, ce, &pe, pasid);
         if (s->flts) {
-            return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+            return pe.val[2] & VTD_SM_PASID_ENTRY_FSPTPTR;
         } else {
             return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
         }
@@ -1766,7 +1794,7 @@ static bool vtd_dev_pt_enabled(IntelIOMMUState *s, 
VTDContextEntry *ce,
              */
             return false;
         }
-        return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
+        return vtd_pe_pgtt_is_pt(&pe);
     }
 
     return (vtd_ce_get_type(ce) == VTD_CONTEXT_TT_PASS_THROUGH);
@@ -2428,6 +2456,178 @@ static void 
vtd_context_global_invalidate(IntelIOMMUState *s)
     vtd_iommu_replay_all(s);
 }
 
+#ifdef CONFIG_IOMMUFD
+static void vtd_init_s1_hwpt_data(struct iommu_hwpt_vtd_s1 *vtd,
+                                  VTDPASIDEntry *pe)
+{
+    memset(vtd, 0, sizeof(*vtd));
+
+    vtd->flags =  (VTD_SM_PASID_ENTRY_SRE_BIT(pe) ? IOMMU_VTD_S1_SRE : 0) |
+                  (VTD_SM_PASID_ENTRY_WPE_BIT(pe) ? IOMMU_VTD_S1_WPE : 0) |
+                  (VTD_SM_PASID_ENTRY_EAFE_BIT(pe) ? IOMMU_VTD_S1_EAFE : 0);
+    vtd->addr_width = vtd_pe_get_fl_aw(pe);
+    vtd->pgtbl_addr = (uint64_t)vtd_pe_get_flpt_base(pe);
+}
+
+static int vtd_create_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+                              VTDPASIDEntry *pe, uint32_t *s1_hwpt,
+                              Error **errp)
+{
+    struct iommu_hwpt_vtd_s1 vtd;
+
+    vtd_init_s1_hwpt_data(&vtd, pe);
+
+    return !iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
+                                       idev->hwpt_id, 0, 
IOMMU_HWPT_DATA_VTD_S1,
+                                       sizeof(vtd), &vtd, s1_hwpt, errp);
+}
+
+static void vtd_destroy_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+                                VTDAddressSpace *vtd_as)
+{
+    if (!vtd_as->s1_hwpt) {
+        return;
+    }
+    iommufd_backend_free_id(idev->iommufd, vtd_as->s1_hwpt);
+    vtd_as->s1_hwpt = 0;
+}
+
+static int vtd_device_attach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
+                                     VTDAddressSpace *vtd_as, Error **errp)
+{
+    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+    VTDPASIDEntry *pe = &vtd_as->pasid_cache_entry.pasid_entry;
+    uint32_t hwpt_id;
+    int ret;
+
+    /*
+     * We can get here only if flts=on, the supported PGTT is FLT and PT.
+     * Catch invalid PGTT when processing invalidation request to avoid
+     * attaching to wrong hwpt.
+     */
+    if (!vtd_pe_pgtt_is_flt(pe) && !vtd_pe_pgtt_is_pt(pe)) {
+        error_setg(errp, "Invalid PGTT type");
+        return -EINVAL;
+    }
+
+    if (vtd_pe_pgtt_is_flt(pe)) {
+        /* Should fail if the FLPT base is 0 */
+        if (!vtd_pe_get_flpt_base(pe)) {
+            error_setg(errp, "FLPT base is 0");
+            return -EINVAL;
+        }
+
+        if (vtd_create_s1_hwpt(idev, pe, &hwpt_id, errp)) {
+            return -EINVAL;
+        }
+    } else {
+        hwpt_id = idev->hwpt_id;
+    }
+
+    ret = !host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp);
+    trace_vtd_device_attach_hwpt(idev->devid, vtd_as->pasid, hwpt_id, ret);
+    if (!ret) {
+        vtd_destroy_s1_hwpt(idev, vtd_as);
+        if (vtd_pe_pgtt_is_flt(pe)) {
+            vtd_as->s1_hwpt = hwpt_id;
+        }
+    } else if (vtd_pe_pgtt_is_flt(pe)) {
+        iommufd_backend_free_id(idev->iommufd, hwpt_id);
+    }
+
+    return ret;
+}
+
+static int vtd_device_detach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
+                                     VTDAddressSpace *vtd_as, Error **errp)
+{
+    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+    uint32_t pasid = vtd_as->pasid;
+    int ret;
+
+    if (vtd_hiod->iommu_state->dmar_enabled) {
+        ret = !host_iommu_device_iommufd_detach_hwpt(idev, errp);
+        trace_vtd_device_detach_hwpt(idev->devid, pasid, ret);
+    } else {
+        ret = !host_iommu_device_iommufd_attach_hwpt(idev, idev->hwpt_id, 
errp);
+        trace_vtd_device_reattach_def_hwpt(idev->devid, pasid, idev->hwpt_id,
+                                           ret);
+    }
+
+    if (!ret) {
+        vtd_destroy_s1_hwpt(idev, vtd_as);
+    }
+
+    return ret;
+}
+
+static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, VTDPASIDOp op,
+                                Error **errp)
+{
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    VTDHostIOMMUDevice *vtd_hiod = vtd_find_hiod_iommufd(s, vtd_as);
+    int ret;
+
+    if (!vtd_hiod) {
+        /* No need to go further, e.g. for emulated device */
+        return 0;
+    }
+
+    if (vtd_as->pasid != PCI_NO_PASID) {
+        error_setg(errp, "Non-rid_pasid %d not supported yet", vtd_as->pasid);
+        return -EINVAL;
+    }
+
+    switch (op) {
+    case VTD_PASID_UPDATE:
+    case VTD_PASID_BIND:
+    {
+        ret = vtd_device_attach_iommufd(vtd_hiod, vtd_as, errp);
+        break;
+    }
+    case VTD_PASID_UNBIND:
+    {
+        ret = vtd_device_detach_iommufd(vtd_hiod, vtd_as, errp);
+        break;
+    }
+    default:
+        error_setg(errp, "Unknown VTDPASIDOp!!!");
+        break;
+    }
+
+    return ret;
+}
+#else
+static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, VTDPASIDOp op,
+                                Error **errp)
+{
+    return 0;
+}
+#endif
+
+static int vtd_bind_guest_pasid_report_err(VTDAddressSpace *vtd_as,
+                                           VTDPASIDOp op)
+{
+    Error *local_err = NULL;
+    int ret;
+
+    /*
+     * vIOMMU calls into kernel to do BIND/UNBIND, the failure reason
+     * can be kernel, QEMU bug or invalid guest config. None of them
+     * should be reported to guest in PASID cache invalidation
+     * processing path. But at least, we can report it to QEMU console.
+     *
+     * TODO: for invalid guest config, DMA translation fault will be
+     * caught by host and passed to QEMU to inject to guest in future.
+     */
+    ret = vtd_bind_guest_pasid(vtd_as, op, &local_err);
+    if (ret) {
+        error_report_err(local_err);
+    }
+
+    return ret;
+}
+
 /* Do a context-cache device-selective invalidation.
  * @func_mask: FM field after shifting
  */
@@ -3234,10 +3434,20 @@ static gboolean vtd_flush_pasid_locked(gpointer key, 
gpointer value,
     /* No need to update if cached pasid entry is latest */
     if (!vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
         pc_entry->pasid_entry = pe;
+        if (vtd_bind_guest_pasid_report_err(vtd_as, VTD_PASID_UPDATE)) {
+            /*
+             * In case update binding fails, tear down existing binding to
+             * catch invalid pasid entry config during DMA translation.
+             */
+            goto remove;
+        }
     }
     return false;
 
 remove:
+    if (vtd_bind_guest_pasid_report_err(vtd_as, VTD_PASID_UNBIND)) {
+        return false;
+    }
     pc_entry->valid = false;
 
     /*
@@ -3317,6 +3527,9 @@ static void vtd_sm_pasid_table_walk_one(IntelIOMMUState 
*s,
             if (!pc_entry->valid) {
                 pc_entry->pasid_entry = pe;
                 pc_entry->valid = true;
+                if (vtd_bind_guest_pasid_report_err(vtd_as, VTD_PASID_BIND)) {
+                    pc_entry->valid = false;
+                }
             }
         }
         pasid++;
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 887f93bac9..c87e59554d 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -563,6 +563,12 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
 
+typedef enum VTDPASIDOp {
+    VTD_PASID_BIND,
+    VTD_PASID_UPDATE,
+    VTD_PASID_UNBIND,
+} VTDPASIDOp;
+
 typedef enum VTDPCInvType {
     /* VTD spec defined PASID cache invalidation type */
     VTD_PASID_CACHE_DOMSI = VTD_INV_DESC_PASIDC_G_DSI,
@@ -611,8 +617,12 @@ typedef struct VTDPASIDCacheInfo {
 #define VTD_SM_PASID_ENTRY_AW          7ULL /* Adjusted guest-address-width */
 #define VTD_SM_PASID_ENTRY_DID(x)      extract64((x)->val[1], 0, 16)
 
-#define VTD_SM_PASID_ENTRY_FLPM          3ULL
-#define VTD_SM_PASID_ENTRY_FLPTPTR       (~0xfffULL)
+#define VTD_SM_PASID_ENTRY_FSPTPTR       (~0xfffULL)
+#define VTD_SM_PASID_ENTRY_SRE_BIT(x)    extract64((x)->val[2], 0, 1)
+/* 00: 4-level paging, 01: 5-level paging, 10-11: Reserved */
+#define VTD_SM_PASID_ENTRY_FSPM(x)       extract64((x)->val[2], 2, 2)
+#define VTD_SM_PASID_ENTRY_WPE_BIT(x)    extract64((x)->val[2], 4, 1)
+#define VTD_SM_PASID_ENTRY_EAFE_BIT(x)   extract64((x)->val[2], 7, 1)
 
 /* First Level Paging Structure */
 /* Masks for First Level Paging Entry */
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index c8a936eb46..1c31b9a873 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -73,6 +73,9 @@ vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
 vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 
0x%"PRIx16" index %d vec %d (should be: %d)"
 vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 
0x%"PRIx16" index %d trigger %d (should be: %d)"
 vtd_reset_exit(void) ""
+vtd_device_attach_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, int 
ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
+vtd_device_detach_hwpt(uint32_t dev_id, uint32_t pasid, int ret) "dev_id %d 
pasid %d ret: %d"
+vtd_device_reattach_def_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t 
hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
 
 # amd_iommu.c
 amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 
0x%"PRIx64" +  offset 0x%"PRIx32
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 0e3826f6f0..2affab36b2 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -104,6 +104,7 @@ struct VTDAddressSpace {
     PCIBus *bus;
     uint8_t devfn;
     uint32_t pasid;
+    uint32_t s1_hwpt;
     AddressSpace as;
     IOMMUMemoryRegion iommu;
     MemoryRegion root;          /* The root container of the device */
-- 
2.47.1


Reply via email to