[PATCH 2/2] vfio/ccw: Don't initialize HOST_IOMMU_DEVICE with mdev

2024-07-22 Thread Zhenzhong Duan
mdevs aren't "physical" devices, and asking for backing IOMMU info on them
fails the entire provisioning of the guest. Fix that by setting
vbasedev->mdev to true so that HostIOMMUDevice initialization is skipped
in the presence of mdevs.

Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() 
handler")
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/ccw.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 1f8e1272c7..70934b01d5 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -675,6 +675,9 @@ static void vfio_ccw_instance_init(Object *obj)
 VFIOCCWDevice *vcdev = VFIO_CCW(obj);
 VFIODevice *vbasedev = >vdev;
 
+/* CCW device is mdev type device */
+vbasedev->mdev = true;
+
 /*
  * All vfio-ccw devices are believed to operate in a way compatible with
  * discarding of memory in RAM blocks, ie. pages pinned in the host are
-- 
2.34.1




[PATCH 1/2] vfio/ap: Don't initialize HOST_IOMMU_DEVICE with mdev

2024-07-22 Thread Zhenzhong Duan
mdevs aren't "physical" devices, and asking for backing IOMMU info on them
fails the entire provisioning of the guest. Fix that by setting
vbasedev->mdev to true so that HostIOMMUDevice initialization is skipped
in the presence of mdevs.

Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() 
handler")
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/ap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 0c4354e3e7..391bfb72ca 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -230,6 +230,9 @@ static void vfio_ap_instance_init(Object *obj)
  */
 vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, _ap_ops,
  DEVICE(vapdev), true);
+
+/* AP device is mdev type device */
+vbasedev->mdev = true;
 }
 
 #ifdef CONFIG_IOMMUFD
-- 
2.34.1




[PATCH 0/2] Don't initialize HOST_IOMMU_DEVICE with mdev

2024-07-22 Thread Zhenzhong Duan
This fixes a potential issue where HOST_IOMMU_DEVICE initialization fails
for mdev devices.
The reason is that an mdev isn't a physical device and doesn't support
IOMMU_GET_HW_INFO.

I believe ap/ccw devices are all of mdev type and therefore need the same fix.

This series depends on a patch from Joao which fixes the same for vfio-pci.
See https://lists.gnu.org/archive/html/qemu-devel/2024-07/msg04612.html

Not tested, as I have no ap/ccw environment, but the build test passes.

Thanks
Zhenzhong

Zhenzhong Duan (2):
  vfio/ap: Don't initialize HOST_IOMMU_DEVICE with mdev
  vfio/ccw: Don't initialize HOST_IOMMU_DEVICE with mdev

 hw/vfio/ap.c  | 3 +++
 hw/vfio/ccw.c | 3 +++
 2 files changed, 6 insertions(+)

-- 
2.34.1




[PATCH v1 16/17] intel_iommu: Modify x-scalable-mode to be string option

2024-07-18 Thread Zhenzhong Duan
From: Yi Liu 

Intel VT-d 3.0 introduces scalable mode, which comes with a number of
capabilities related to scalable mode translation, so many combinations
are possible. This vIOMMU implementation simplifies things for the user by
providing typical combinations, which can be selected with the
"x-scalable-mode" option. The usage is as below:

"-device intel-iommu,x-scalable-mode=["legacy"|"modern"|"off"]"

 - "legacy": gives support for stage-2 page table
 - "modern": gives support for stage-1 page table
 - "off": no scalable mode support
 - if not configured: no scalable mode support; if configured with an
   invalid value, an error is reported

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/i386/intel_iommu.c | 24 +++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 48134bda11..650641544c 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -263,6 +263,7 @@ struct IntelIOMMUState {
 
 bool caching_mode;  /* RO - is cap CM enabled? */
 bool scalable_mode; /* RO - is Scalable Mode supported? */
+char *scalable_mode_str;/* RO - admin's Scalable Mode config */
 bool scalable_modern;   /* RO - is modern SM supported? */
 bool snoop_control; /* RO - is SNP filed supported? */
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2804c3628a..14d05fce1d 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3770,7 +3770,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
   VTD_HOST_AW_AUTO),
 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
-DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
+DEFINE_PROP_STRING("x-scalable-mode", IntelIOMMUState, scalable_mode_str),
 DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
 DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false),
 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
@@ -4686,6 +4686,28 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 }
 }
 
+if (s->scalable_mode_str &&
+(strcmp(s->scalable_mode_str, "off") &&
+ strcmp(s->scalable_mode_str, "modern") &&
+ strcmp(s->scalable_mode_str, "legacy"))) {
+error_setg(errp, "Invalid x-scalable-mode config,"
+ "Please use \"modern\", \"legacy\" or \"off\"");
+return false;
+}
+
+if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "legacy")) {
+s->scalable_mode = true;
+s->scalable_modern = false;
+} else if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "modern")) {
+s->scalable_mode = true;
+s->scalable_modern = true;
+} else {
+s->scalable_mode = false;
+s->scalable_modern = false;
+}
+
 if (s->aw_bits == VTD_HOST_AW_AUTO) {
 if (s->scalable_modern) {
 s->aw_bits = VTD_HOST_AW_48BIT;
-- 
2.34.1




[PATCH v1 17/17] tests/qtest: Add intel-iommu test

2024-07-18 Thread Zhenzhong Duan
Add the framework to test the intel-iommu device.

Currently it only tests the correctness of the cap/ecap bits in scalable
modern mode, and the consistency of the cap/ecap bits before and after
a system reset.

Signed-off-by: Zhenzhong Duan 
---
 MAINTAINERS|  1 +
 include/hw/i386/intel_iommu.h  |  1 +
 tests/qtest/intel-iommu-test.c | 71 ++
 tests/qtest/meson.build|  1 +
 4 files changed, 74 insertions(+)
 create mode 100644 tests/qtest/intel-iommu-test.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 7d9811458c..ec765bf3d3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3662,6 +3662,7 @@ S: Supported
 F: hw/i386/intel_iommu.c
 F: hw/i386/intel_iommu_internal.h
 F: include/hw/i386/intel_iommu.h
+F: tests/qtest/intel-iommu-test.c
 
 AMD-Vi Emulation
 S: Orphan
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 650641544c..b1848dbec6 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -47,6 +47,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(IntelIOMMUState, 
INTEL_IOMMU_DEVICE)
 #define VTD_HOST_AW_48BIT   48
 #define VTD_HOST_AW_AUTO0xff
 #define VTD_HAW_MASK(aw)((1ULL << (aw)) - 1)
+#define VTD_MGAW_FROM_CAP(cap)  ((cap >> 16) & 0x3fULL)
 
 #define DMAR_REPORT_F_INTR  (1)
 
diff --git a/tests/qtest/intel-iommu-test.c b/tests/qtest/intel-iommu-test.c
new file mode 100644
index 00..8e07034f6f
--- /dev/null
+++ b/tests/qtest/intel-iommu-test.c
@@ -0,0 +1,71 @@
+/*
+ * QTest testcase for intel-iommu
+ *
+ * Copyright (c) 2024 Intel, Inc.
+ *
+ * Author: Zhenzhong Duan 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "libqtest.h"
+#include "hw/i386/intel_iommu_internal.h"
+
+#define CAP_MODERN_FIXED1(VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | \
+  VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS)
+#define ECAP_MODERN_FIXED1   (VTD_ECAP_QI |  VTD_ECAP_IRO | VTD_ECAP_MHMV | \
+  VTD_ECAP_SMTS | VTD_ECAP_FLTS)
+
+static inline uint32_t vtd_reg_readl(QTestState *s, uint64_t offset)
+{
+return qtest_readl(s, Q35_HOST_BRIDGE_IOMMU_ADDR + offset);
+}
+
+static inline uint64_t vtd_reg_readq(QTestState *s, uint64_t offset)
+{
+return qtest_readq(s, Q35_HOST_BRIDGE_IOMMU_ADDR + offset);
+}
+
+static void test_intel_iommu_modern(void)
+{
+uint8_t init_csr[DMAR_REG_SIZE]; /* register values */
+uint8_t post_reset_csr[DMAR_REG_SIZE]; /* register values */
+uint64_t cap, ecap, tmp;
+QTestState *s;
+
+s = qtest_init("-M q35 -device intel-iommu,x-scalable-mode=modern");
+
+cap = vtd_reg_readq(s, DMAR_CAP_REG);
+g_assert((cap & CAP_MODERN_FIXED1) == CAP_MODERN_FIXED1);
+
+tmp = cap & VTD_CAP_SAGAW_MASK;
+g_assert(tmp == (VTD_CAP_SAGAW_39bit | VTD_CAP_SAGAW_48bit));
+
+tmp = VTD_MGAW_FROM_CAP(cap);
+g_assert(tmp == VTD_HOST_AW_48BIT - 1);
+
+ecap = vtd_reg_readq(s, DMAR_ECAP_REG);
+g_assert((ecap & ECAP_MODERN_FIXED1) == ECAP_MODERN_FIXED1);
+g_assert(ecap & VTD_ECAP_IR);
+
+qtest_memread(s, Q35_HOST_BRIDGE_IOMMU_ADDR, init_csr, DMAR_REG_SIZE);
+
+qobject_unref(qtest_qmp(s, "{ 'execute': 'system_reset' }"));
+qtest_qmp_eventwait(s, "RESET");
+
+qtest_memread(s, Q35_HOST_BRIDGE_IOMMU_ADDR, post_reset_csr, 
DMAR_REG_SIZE);
+/* Ensure registers are consistent after hard reset */
+g_assert(!memcmp(init_csr, post_reset_csr, DMAR_REG_SIZE));
+
+qtest_quit(s);
+}
+
+int main(int argc, char **argv)
+{
+g_test_init(, , NULL);
+qtest_add_func("/q35/intel-iommu/modern", test_intel_iommu_modern);
+
+return g_test_run();
+}
diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index 6508bfb1a2..20d05d471b 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -79,6 +79,7 @@ qtests_i386 = \
   (config_all_devices.has_key('CONFIG_SB16') ? ['fuzz-sb16-test'] : []) +  
 \
   (config_all_devices.has_key('CONFIG_SDHCI_PCI') ? ['fuzz-sdcard-test'] : []) 
+\
   (config_all_devices.has_key('CONFIG_ESP_PCI') ? ['am53c974-test'] : []) +
 \
+  (config_all_devices.has_key('CONFIG_VTD') ? ['intel-iommu-test'] : []) + 
\
   (host_os != 'windows' and
\
config_all_devices.has_key('CONFIG_ACPI_ERST') ? ['erst-test'] : []) +  
 \
   (config_all_devices.has_key('CONFIG_PCIE_PORT') and  
 \
-- 
2.34.1




[PATCH v1 07/17] intel_iommu: Check if the input address is canonical

2024-07-18 Thread Zhenzhong Duan
From: Clément Mathieu--Drif 

First stage translation must fail if the address to translate is
not canonical.

Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  2 ++
 hw/i386/intel_iommu.c  | 21 +
 2 files changed, 23 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 36fcc6bb5e..168185b850 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -322,6 +322,8 @@ typedef enum VTDFaultReason {
 VTD_FR_PASID_ENTRY_P = 0x59,
 VTD_FR_PASID_TABLE_ENTRY_INV = 0x5b,  /*Invalid PASID table entry */
 
+VTD_FR_FS_NON_CANONICAL = 0x80, /* SNG.1 : Address for FS not canonical.*/
+
 /* Output address in the interrupt address range for scalable mode */
 VTD_FR_SM_INTERRUPT_ADDR = 0x87,
 VTD_FR_MAX, /* Guard */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 287741b687..495a41cf80 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1824,6 +1824,7 @@ static const bool vtd_qualified_faults[] = {
 [VTD_FR_PASID_ENTRY_P] = true,
 [VTD_FR_PASID_TABLE_ENTRY_INV] = true,
 [VTD_FR_SM_INTERRUPT_ADDR] = true,
+[VTD_FR_FS_NON_CANONICAL] = true,
 [VTD_FR_MAX] = false,
 };
 
@@ -1927,6 +1928,20 @@ static inline bool vtd_flpte_present(uint64_t flpte)
 return !!(flpte & VTD_FL_P);
 }
 
+/* Return true if IOVA is canonical, otherwise false. */
+static bool vtd_iova_fl_check_canonical(IntelIOMMUState *s, uint64_t iova,
+VTDContextEntry *ce, uint32_t pasid)
+{
+uint64_t iova_limit = vtd_iova_limit(s, ce, s->aw_bits, pasid);
+uint64_t upper_bits_mask = ~(iova_limit - 1);
+uint64_t upper_bits = iova & upper_bits_mask;
+bool msb = ((iova & (iova_limit >> 1)) != 0);
+return !(
+ (!msb && (upper_bits != 0)) ||
+ (msb && (upper_bits != upper_bits_mask))
+);
+}
+
 /*
  * Given the @iova, get relevant @flptep. @flpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
@@ -1942,6 +1957,12 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, 
VTDContextEntry *ce,
 uint32_t offset;
 uint64_t flpte;
 
+if (!vtd_iova_fl_check_canonical(s, iova, ce, pasid)) {
+error_report_once("%s: detected non canonical IOVA (iova=0x%" PRIx64 
","
+  "pasid=0x%" PRIx32 ")", __func__, iova, pasid);
+return -VTD_FR_FS_NON_CANONICAL;
+}
+
 while (true) {
 offset = vtd_iova_level_offset(iova, level);
 flpte = vtd_get_pte(addr, offset);
-- 
2.34.1




[PATCH v1 06/17] intel_iommu: Implement stage-1 translation

2024-07-18 Thread Zhenzhong Duan
From: Yi Liu 

This adds stage-1 page table walking to support stage-1 only
translation in scalable modern mode.

Signed-off-by: Yi Liu 
Co-developed-by: Clément Mathieu--Drif 
Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  26 ++
 hw/i386/intel_iommu.c  | 146 -
 2 files changed, 168 insertions(+), 4 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 1875c2ddd6..36fcc6bb5e 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -440,6 +440,24 @@ typedef union VTDInvDesc VTDInvDesc;
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
 
+/* Rsvd field masks for fpte */
+#define VTD_FS_UPPER_IGNORED 0xfff0ULL
+#define VTD_FPTE_PAGE_L1_RSVD_MASK(aw) (~(VTD_HAW_MASK(aw)) & \
+   (~VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L2_RSVD_MASK(aw) (~(VTD_HAW_MASK(aw)) & \
+   (~VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L3_RSVD_MASK(aw) (~(VTD_HAW_MASK(aw)) & \
+   (~VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L3_FS1GP_RSVD_MASK(aw) ((0x3fffe000ULL | \
+~(VTD_HAW_MASK(aw))) \
+  & (~VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L2_FS2MP_RSVD_MASK(aw) ((0x1fe000ULL | \
+~(VTD_HAW_MASK(aw))) \
+  & (~VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L4_RSVD_MASK(aw) ((0x80ULL | \
+~(VTD_HAW_MASK(aw))) \
+& (~VTD_FS_UPPER_IGNORED))
+
 #define VTD_INV_DESC_PIOTLB_ALL_IN_PASID  (2ULL << 4)
 #define VTD_INV_DESC_PIOTLB_PSI_IN_PASID  (3ULL << 4)
 
@@ -533,6 +551,14 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SM_PASID_ENTRY_AW  7ULL /* Adjusted guest-address-width */
 #define VTD_SM_PASID_ENTRY_DID(val)((val) & VTD_DOMAIN_ID_MASK)
 
+#define VTD_SM_PASID_ENTRY_FLPM  3ULL
+#define VTD_SM_PASID_ENTRY_FLPTPTR   (~0xfffULL)
+
+/* First Level Paging Structure */
+/* Masks for First Level Paging Entry */
+#define VTD_FL_P1ULL
+#define VTD_FL_RW_MASK  (1ULL << 1)
+
 /* Second Level Page Translation Pointer*/
 #define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL)
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 94f6532935..287741b687 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -48,6 +48,8 @@
 
 /* pe operations */
 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
+#define VTD_PE_GET_FL_LEVEL(pe) \
+(4 + (((pe)->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM))
 #define VTD_PE_GET_SL_LEVEL(pe) \
 (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
 
@@ -755,6 +757,11 @@ static inline bool 
vtd_is_sl_level_supported(IntelIOMMUState *s, uint32_t level)
(1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
 }
 
+static inline bool vtd_is_fl_level_supported(IntelIOMMUState *s, uint32_t 
level)
+{
+return level == VTD_PML4_LEVEL;
+}
+
 /* Return true if check passed, otherwise false */
 static inline bool vtd_pe_type_check(IntelIOMMUState *s, VTDPASIDEntry *pe)
 {
@@ -841,6 +848,11 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
 return -VTD_FR_PASID_TABLE_ENTRY_INV;
 }
 
+if (pgtt == VTD_SM_PASID_ENTRY_FLT &&
+!vtd_is_fl_level_supported(s, VTD_PE_GET_FL_LEVEL(pe))) {
+return -VTD_FR_PASID_TABLE_ENTRY_INV;
+}
+
 return 0;
 }
 
@@ -976,7 +988,11 @@ static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
 
 if (s->root_scalable) {
 vtd_ce_get_rid2pasid_entry(s, ce, , pasid);
-return VTD_PE_GET_SL_LEVEL();
+if (s->scalable_modern) {
+return VTD_PE_GET_FL_LEVEL();
+} else {
+return VTD_PE_GET_SL_LEVEL();
+}
 }
 
 return vtd_ce_get_level(ce);
@@ -1063,7 +1079,11 @@ static dma_addr_t 
vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
 
 if (s->root_scalable) {
 vtd_ce_get_rid2pasid_entry(s, ce, , pasid);
-return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
+if (s->scalable_modern) {
+return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+} else {
+return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
+}
 }
 
 return vtd_ce_get_slpt_base(ce);
@@ -1865,6 +1885,104 @@ out:
 trace_vtd_pt_enable_fast_path(source_id, success);
 }
 
+/*
+ * Rsvd field masks for fpte:
+ * vtd_fpte_rsvd 4k pages
+ * vtd_fpte_rsvd_large larg

[PATCH v1 15/17] intel_iommu: Set default aw_bits to 48 in scalable modern mode

2024-07-18 Thread Zhenzhong Duan
According to VTD spec, stage-1 page table could support 4-level and
5-level paging.

However, emulation of 5-level paging translation is not supported yet.
That means the only supported value for aw_bits is 48.

So default aw_bits to 48 in scalable modern mode. In other cases,
it still defaults to 39 for compatibility.

Add a check to ensure user specified value is 48 in modern mode
for now.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  2 +-
 hw/i386/intel_iommu.c | 16 +++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index b843d069cc..48134bda11 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -45,7 +45,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(IntelIOMMUState, 
INTEL_IOMMU_DEVICE)
 #define DMAR_REG_SIZE   0x230
 #define VTD_HOST_AW_39BIT   39
 #define VTD_HOST_AW_48BIT   48
-#define VTD_HOST_ADDRESS_WIDTH  VTD_HOST_AW_39BIT
+#define VTD_HOST_AW_AUTO0xff
 #define VTD_HAW_MASK(aw)((1ULL << (aw)) - 1)
 
 #define DMAR_REPORT_F_INTR  (1)
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index c0116497b1..2804c3628a 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3768,7 +3768,7 @@ static Property vtd_properties[] = {
 ON_OFF_AUTO_AUTO),
 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
-  VTD_HOST_ADDRESS_WIDTH),
+  VTD_HOST_AW_AUTO),
 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
 DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
 DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
@@ -4686,6 +4686,14 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 }
 }
 
+if (s->aw_bits == VTD_HOST_AW_AUTO) {
+if (s->scalable_modern) {
+s->aw_bits = VTD_HOST_AW_48BIT;
+} else {
+s->aw_bits = VTD_HOST_AW_39BIT;
+}
+}
+
 if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
 (s->aw_bits != VTD_HOST_AW_48BIT) &&
 !s->scalable_modern) {
@@ -4694,6 +4702,12 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 return false;
 }
 
+if ((s->aw_bits != VTD_HOST_AW_48BIT) && s->scalable_modern) {
+error_setg(errp, "Supported values for aw-bits are: %d",
+   VTD_HOST_AW_48BIT);
+return false;
+}
+
 if (s->scalable_mode && !s->dma_drain) {
 error_setg(errp, "Need to set dma_drain for scalable mode");
 return false;
-- 
2.34.1




[PATCH v1 13/17] intel_iommu: Add support for PASID-based device IOTLB invalidation

2024-07-18 Thread Zhenzhong Duan
From: Clément Mathieu--Drif 

Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h | 11 
 hw/i386/intel_iommu.c  | 50 ++
 2 files changed, 61 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 7dd8176e86..ed358aa763 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -377,6 +377,7 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_WAIT   0x5 /* Invalidation Wait Descriptor */
 #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */
 #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */
+#define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/
 #define VTD_INV_DESC_NONE   0   /* Not an Invalidate Descriptor */
 
 /* Masks for Invalidation Wait Descriptor*/
@@ -420,6 +421,16 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI 0xffeULL
 #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffe0fff8
 
+/* Mask for PASID Device IOTLB Invalidate Descriptor */
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_ADDR(val) ((val) & \
+   0xf000ULL)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_SIZE(val) ((val >> 11) & 0x1)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_GLOBAL(val) ((val) & 0x1)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_SID(val) (((val) >> 16) & 0xULL)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_PASID(val) ((val >> 32) & 0xfULL)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_HI 0x7feULL
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_LO 0xfff0f000ULL
+
 /* Rsvd field masks for spte */
 #define VTD_SPTE_SNP 0x800ULL
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a17ce2b1f1..8b66d6cfa5 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3016,6 +3016,49 @@ static void do_invalidate_device_tlb(VTDAddressSpace 
*vtd_dev_as,
 memory_region_notify_iommu(_dev_as->iommu, 0, event);
 }
 
+static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s,
+   VTDInvDesc *inv_desc)
+{
+uint16_t sid;
+VTDAddressSpace *vtd_dev_as;
+bool size;
+bool global;
+hwaddr addr;
+uint32_t pasid;
+
+if ((inv_desc->hi & VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_HI) ||
+ (inv_desc->lo & VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_LO)) {
+error_report_once("%s: invalid pasid-based dev iotlb inv desc:"
+  "hi=%"PRIx64 "(reserved nonzero)",
+  __func__, inv_desc->hi);
+return false;
+}
+
+global = VTD_INV_DESC_PASID_DEVICE_IOTLB_GLOBAL(inv_desc->hi);
+size = VTD_INV_DESC_PASID_DEVICE_IOTLB_SIZE(inv_desc->hi);
+addr = VTD_INV_DESC_PASID_DEVICE_IOTLB_ADDR(inv_desc->hi);
+sid = VTD_INV_DESC_PASID_DEVICE_IOTLB_SID(inv_desc->lo);
+if (global) {
+QLIST_FOREACH(vtd_dev_as, >vtd_as_with_notifiers, next) {
+if ((vtd_dev_as->pasid != PCI_NO_PASID) &&
+(PCI_BUILD_BDF(pci_bus_num(vtd_dev_as->bus),
+   vtd_dev_as->devfn) == sid)) {
+do_invalidate_device_tlb(vtd_dev_as, size, addr);
+}
+}
+} else {
+pasid = VTD_INV_DESC_PASID_DEVICE_IOTLB_PASID(inv_desc->lo);
+vtd_dev_as = vtd_get_as_by_sid_and_pasid(s, sid, pasid);
+if (!vtd_dev_as) {
+return true;
+}
+
+do_invalidate_device_tlb(vtd_dev_as, size, addr);
+}
+
+return true;
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
   VTDInvDesc *inv_desc)
 {
@@ -3110,6 +3153,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 }
 break;
 
+case VTD_INV_DESC_DEV_PIOTLB:
+trace_vtd_inv_desc("device-piotlb", inv_desc.hi, inv_desc.lo);
+if (!vtd_process_device_piotlb_desc(s, _desc)) {
+return false;
+}
+break;
+
 case VTD_INV_DESC_DEVICE:
 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
 if (!vtd_process_device_iotlb_desc(s, _desc)) {
-- 
2.34.1




[PATCH v1 02/17] intel_iommu: Make pasid entry type check accurate

2024-07-18 Thread Zhenzhong Duan
When the guest configures Nested Translation (011b) or First-stage Translation
only (001b), the type check incorrectly passes.

Fail the type check in those cases, as their emulation isn't supported yet.

Fixes: fb43cf739e1 ("intel_iommu: scalable mode emulation")
Suggested-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index e65f5b29a5..1cff8b00ae 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -759,20 +759,16 @@ static inline bool vtd_pe_type_check(X86IOMMUState 
*x86_iommu,
  VTDPASIDEntry *pe)
 {
 switch (VTD_PE_GET_TYPE(pe)) {
-case VTD_SM_PASID_ENTRY_FLT:
 case VTD_SM_PASID_ENTRY_SLT:
-case VTD_SM_PASID_ENTRY_NESTED:
-break;
+return true;
 case VTD_SM_PASID_ENTRY_PT:
-if (!x86_iommu->pt_supported) {
-return false;
-}
-break;
+return x86_iommu->pt_supported;
+case VTD_SM_PASID_ENTRY_FLT:
+case VTD_SM_PASID_ENTRY_NESTED:
 default:
 /* Unknown type */
 return false;
 }
-return true;
 }
 
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
-- 
2.34.1




[PATCH v1 09/17] intel_iommu: Flush stage-1 cache in iotlb invalidation

2024-07-18 Thread Zhenzhong Duan
According to spec, Page-Selective-within-Domain Invalidation (11b):

1. IOTLB entries caching second-stage mappings (PGTT=010b) or pass-through
(PGTT=100b) mappings associated with the specified domain-id and the
input-address range are invalidated.
2. IOTLB entries caching first-stage (PGTT=001b) or nested (PGTT=011b)
mapping associated with specified domain-id are invalidated.

So, per the spec definition, the Page-Selective-within-Domain Invalidation
needs to flush first-stage and nested cached IOTLB entries as well.

We don't support nested yet, and pass-through mappings are never cached,
so the IOTLB cache only contains first-stage and second-stage mappings.

Add a tag pgtt in VTDIOTLBEntry to mark PGTT type of the mapping and
invalidate entries based on PGTT type.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/i386/intel_iommu.c | 27 +--
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index fe9057c50d..b843d069cc 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -155,6 +155,7 @@ struct VTDIOTLBEntry {
 uint64_t pte;
 uint64_t mask;
 uint8_t access_flags;
+uint8_t pgtt;
 };
 
 /* VT-d Source-ID Qualifier types */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 210df32f01..8d47e5ba78 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -305,9 +305,21 @@ static gboolean vtd_hash_remove_by_page(gpointer key, 
gpointer value,
 VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
 uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
 uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
-return (entry->domain_id == info->domain_id) &&
-(((entry->gfn & info->mask) == gfn) ||
- (entry->gfn == gfn_tlb));
+
+if (entry->domain_id != info->domain_id) {
+return false;
+}
+
+/*
+ * According to spec, IOTLB entries caching first-stage (PGTT=001b) or
+ * nested (PGTT=011b) mapping associated with specified domain-id are
+ * invalidated. Nested isn't supported yet, so only need to check 001b.
+ */
+if (entry->pgtt == VTD_SM_PASID_ENTRY_FLT) {
+return true;
+}
+
+return (entry->gfn & info->mask) == gfn || entry->gfn == gfn_tlb;
 }
 
 /* Reset all the gen of VTDAddressSpace to zero and set the gen of
@@ -382,7 +394,7 @@ out:
 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
  uint16_t domain_id, hwaddr addr, uint64_t pte,
  uint8_t access_flags, uint32_t level,
- uint32_t pasid)
+ uint32_t pasid, uint8_t pgtt)
 {
 VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
 struct vtd_iotlb_key *key = g_malloc(sizeof(*key));
@@ -400,6 +412,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t 
source_id,
 entry->access_flags = access_flags;
 entry->mask = vtd_pt_level_page_mask(level);
 entry->pasid = pasid;
+entry->pgtt = pgtt;
 
 key->gfn = gfn;
 key->sid = source_id;
@@ -2071,7 +2084,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace 
*vtd_as, PCIBus *bus,
 bool is_fpd_set = false;
 bool reads = true;
 bool writes = true;
-uint8_t access_flags;
+uint8_t access_flags, pgtt;
 bool rid2pasid = (pasid == PCI_NO_PASID) && s->root_scalable;
 VTDIOTLBEntry *iotlb_entry;
 
@@ -2179,9 +2192,11 @@ static bool vtd_do_iommu_translate(VTDAddressSpace 
*vtd_as, PCIBus *bus,
 if (s->scalable_modern && s->root_scalable) {
 ret_fr = vtd_iova_to_flpte(s, , addr, is_write, , ,
, , s->aw_bits, pasid);
+pgtt = VTD_SM_PASID_ENTRY_FLT;
 } else {
 ret_fr = vtd_iova_to_slpte(s, , addr, is_write, , ,
, , s->aw_bits, pasid);
+pgtt = VTD_SM_PASID_ENTRY_SLT;
 }
 if (ret_fr) {
 vtd_report_fault(s, -ret_fr, is_fpd_set, source_id,
@@ -2192,7 +2207,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace 
*vtd_as, PCIBus *bus,
 page_mask = vtd_pt_level_page_mask(level);
 access_flags = IOMMU_ACCESS_FLAG(reads, writes);
 vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, , pasid),
- addr, pte, access_flags, level, pasid);
+ addr, pte, access_flags, level, pasid, pgtt);
 out:
 vtd_iommu_unlock(s);
 entry->iova = addr & page_mask;
-- 
2.34.1




[PATCH v1 10/17] intel_iommu: Process PASID-based iotlb invalidation

2024-07-18 Thread Zhenzhong Duan
PASID-based IOTLB (piotlb) is used when walking the Intel
VT-d stage-1 page table.

This emulates the stage-1 page table iotlb invalidation requested
by a PASID-based IOTLB Invalidate Descriptor (P_IOTLB).

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  3 +++
 hw/i386/intel_iommu.c  | 45 ++
 2 files changed, 48 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index cf0f176e06..7dd8176e86 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -470,6 +470,9 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_PIOTLB_PASID(val)(((val) >> 32) & 0xfULL)
 #define VTD_INV_DESC_PIOTLB_DID(val)  (((val) >> 16) & \
  VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PIOTLB_ADDR(val) ((val) & ~0xfffULL)
+#define VTD_INV_DESC_PIOTLB_AM(val)   ((val) & 0x3fULL)
+#define VTD_INV_DESC_PIOTLB_IH(val)   (((val) >> 6) & 0x1)
 
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 8d47e5ba78..8ebb6dbd7d 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -322,6 +322,28 @@ static gboolean vtd_hash_remove_by_page(gpointer key, 
gpointer value,
 return (entry->gfn & info->mask) == gfn || entry->gfn == gfn_tlb;
 }
 
+static gboolean vtd_hash_remove_by_page_piotlb(gpointer key, gpointer value,
+   gpointer user_data)
+{
+VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
+VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
+uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
+uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
+
+/*
+ * According to spec, PASID-based-IOTLB Invalidation in page granularity
+ * doesn't invalidate IOTLB entries caching second-stage (PGTT=010b)
+ * or pass-through (PGTT=100b) mappings. Nested isn't supported yet,
+ * so only need to check first-stage (PGTT=001b) mappings.
+ */
+if (entry->pgtt != VTD_SM_PASID_ENTRY_FLT) {
+return false;
+}
+
+return entry->domain_id == info->domain_id && entry->pasid == info->pasid 
&&
+   ((entry->gfn & info->mask) == gfn || entry->gfn == gfn_tlb);
+}
+
 /* Reset all the gen of VTDAddressSpace to zero and set the gen of
  * IntelIOMMUState to 1.  Must be called with IOMMU lock held.
  */
@@ -2886,11 +2908,30 @@ static void vtd_piotlb_pasid_invalidate(IntelIOMMUState 
*s,
 }
 }
 
+static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
+   uint32_t pasid, hwaddr addr, uint8_t am,
+   bool ih)
+{
+VTDIOTLBPageInvInfo info;
+
+info.domain_id = domain_id;
+info.pasid = pasid;
+info.addr = addr;
+info.mask = ~((1 << am) - 1);
+
+vtd_iommu_lock(s);
+g_hash_table_foreach_remove(s->iotlb,
+vtd_hash_remove_by_page_piotlb, );
+vtd_iommu_unlock(s);
+}
+
 static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
 VTDInvDesc *inv_desc)
 {
 uint16_t domain_id;
 uint32_t pasid;
+uint8_t am;
+hwaddr addr;
 
 if ((inv_desc->val[0] & VTD_INV_DESC_PIOTLB_RSVD_VAL0) ||
 (inv_desc->val[1] & VTD_INV_DESC_PIOTLB_RSVD_VAL1)) {
@@ -2907,6 +2948,10 @@ static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
 break;
 
 case VTD_INV_DESC_PIOTLB_PSI_IN_PASID:
+am = VTD_INV_DESC_PIOTLB_AM(inv_desc->val[1]);
+addr = (hwaddr) VTD_INV_DESC_PIOTLB_ADDR(inv_desc->val[1]);
+vtd_piotlb_page_invalidate(s, domain_id, pasid, addr, am,
+   VTD_INV_DESC_PIOTLB_IH(inv_desc->val[1]));
 break;
 
 default:
-- 
2.34.1




[PATCH v1 05/17] intel_iommu: Rename slpte to pte

2024-07-18 Thread Zhenzhong Duan
From: Yi Liu 

Because we will support both FST (a.k.a. FLT) and SST (a.k.a. SLT) translation,
rename variables and functions from slpte to pte whenever possible.

Some of them are SST only; those are renamed with an sl_ prefix.

Signed-off-by: Yi Liu 
Co-developed-by: Clément Mathieu--Drif 
Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  24 +++---
 include/hw/i386/intel_iommu.h  |   2 +-
 hw/i386/intel_iommu.c  | 129 +
 3 files changed, 78 insertions(+), 77 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index f71fc91234..1875c2ddd6 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -536,24 +536,24 @@ typedef struct VTDRootEntry VTDRootEntry;
 /* Second Level Page Translation Pointer*/
 #define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL)
 
-/* Paging Structure common */
-#define VTD_SL_PT_PAGE_SIZE_MASK(1ULL << 7)
-/* Bits to decide the offset for each level */
-#define VTD_SL_LEVEL_BITS   9
-
 /* Second Level Paging Structure */
-#define VTD_SL_PML4_LEVEL   4
-#define VTD_SL_PDP_LEVEL3
-#define VTD_SL_PD_LEVEL 2
-#define VTD_SL_PT_LEVEL 1
-#define VTD_SL_PT_ENTRY_NR  512
-
 /* Masks for Second Level Paging Entry */
 #define VTD_SL_RW_MASK  3ULL
 #define VTD_SL_R1ULL
 #define VTD_SL_W(1ULL << 1)
-#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
 #define VTD_SL_IGN_COM  0xbff0ULL
 #define VTD_SL_TM   (1ULL << 62)
 
+/* Common for both First Level and Second Level */
+#define VTD_PML4_LEVEL   4
+#define VTD_PDP_LEVEL3
+#define VTD_PD_LEVEL 2
+#define VTD_PT_LEVEL 1
+#define VTD_PT_ENTRY_NR  512
+#define VTD_PT_PAGE_SIZE_MASK(1ULL << 7)
+#define VTD_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
+
+/* Bits to decide the offset for each level */
+#define VTD_LEVEL_BITS   9
+
 #endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 788ed42477..fe9057c50d 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -152,7 +152,7 @@ struct VTDIOTLBEntry {
 uint64_t gfn;
 uint16_t domain_id;
 uint32_t pasid;
-uint64_t slpte;
+uint64_t pte;
 uint64_t mask;
 uint8_t access_flags;
 };
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 075a27adac..94f6532935 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -48,7 +48,8 @@
 
 /* pe operations */
 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
-#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & 
VTD_SM_PASID_ENTRY_AW))
+#define VTD_PE_GET_SL_LEVEL(pe) \
+(2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
 
 /*
  * PCI bus number (or SID) is not reliable since the device is usaully
@@ -284,15 +285,15 @@ static gboolean vtd_hash_remove_by_domain(gpointer key, 
gpointer value,
 }
 
 /* The shift of an addr for a certain level of paging structure */
-static inline uint32_t vtd_slpt_level_shift(uint32_t level)
+static inline uint32_t vtd_pt_level_shift(uint32_t level)
 {
 assert(level != 0);
-return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
+return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_LEVEL_BITS;
 }
 
-static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
+static inline uint64_t vtd_pt_level_page_mask(uint32_t level)
 {
-return ~((1ULL << vtd_slpt_level_shift(level)) - 1);
+return ~((1ULL << vtd_pt_level_shift(level)) - 1);
 }
 
 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
@@ -349,7 +350,7 @@ static void vtd_reset_caches(IntelIOMMUState *s)
 
 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
 {
-return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
+return (addr & vtd_pt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
 }
 
 /* Must be called with IOMMU lock held */
@@ -360,7 +361,7 @@ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, 
uint16_t source_id,
 VTDIOTLBEntry *entry;
 int level;
 
-for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
+for (level = VTD_PT_LEVEL; level < VTD_PML4_LEVEL; level++) {
 key.gfn = vtd_get_iotlb_gfn(addr, level);
 key.level = level;
 key.sid = source_id;
@@ -377,7 +378,7 @@ out:
 
 /* Must be with IOMMU lock held */
 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
- uint16_t domain_id, hwaddr addr, uint64_t slpte,
+ uint16_t domain_id, hwaddr addr, uint64_t pte,
  uint8_t access_

[PATCH v1 03/17] intel_iommu: Add a placeholder variable for scalable modern mode

2024-07-18 Thread Zhenzhong Duan
Add a new element scalable_modern in IntelIOMMUState to mark scalable
modern mode; this element will eventually be exposed as an intel_iommu
property.

For now, it's only a placeholder used for cap/ecap initialization,
compatibility checks, and blocking host device passthrough until nesting
is supported.

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  2 ++
 include/hw/i386/intel_iommu.h  |  1 +
 hw/i386/intel_iommu.c  | 34 +++---
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index c0ca7b372f..4e0331caba 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -195,6 +195,7 @@
 #define VTD_ECAP_PASID  (1ULL << 40)
 #define VTD_ECAP_SMTS   (1ULL << 43)
 #define VTD_ECAP_SLTS   (1ULL << 46)
+#define VTD_ECAP_FLTS   (1ULL << 47)
 
 /* CAP_REG */
 /* (offset >> 4) << 24 */
@@ -211,6 +212,7 @@
 #define VTD_CAP_SLLPS   ((1ULL << 34) | (1ULL << 35))
 #define VTD_CAP_DRAIN_WRITE (1ULL << 54)
 #define VTD_CAP_DRAIN_READ  (1ULL << 55)
+#define VTD_CAP_FS1GP   (1ULL << 56)
 #define VTD_CAP_DRAIN   (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE)
 #define VTD_CAP_CM  (1ULL << 7)
 #define VTD_PASID_ID_SHIFT  20
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 1eb05c29fc..788ed42477 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -262,6 +262,7 @@ struct IntelIOMMUState {
 
 bool caching_mode;  /* RO - is cap CM enabled? */
 bool scalable_mode; /* RO - is Scalable Mode supported? */
+bool scalable_modern;   /* RO - is modern SM supported? */
 bool snoop_control; /* RO - is SNP filed supported? */
 
 dma_addr_t root;/* Current root table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1cff8b00ae..40cbd4a0f4 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -755,16 +755,20 @@ static inline bool vtd_is_level_supported(IntelIOMMUState 
*s, uint32_t level)
 }
 
 /* Return true if check passed, otherwise false */
-static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
- VTDPASIDEntry *pe)
+static inline bool vtd_pe_type_check(IntelIOMMUState *s, VTDPASIDEntry *pe)
 {
+X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
 switch (VTD_PE_GET_TYPE(pe)) {
+case VTD_SM_PASID_ENTRY_FLT:
+return s->scalable_modern;
 case VTD_SM_PASID_ENTRY_SLT:
-return true;
+return !s->scalable_modern;
+case VTD_SM_PASID_ENTRY_NESTED:
+/* Not support NESTED page table type yet */
+return false;
 case VTD_SM_PASID_ENTRY_PT:
 return x86_iommu->pt_supported;
-case VTD_SM_PASID_ENTRY_FLT:
-case VTD_SM_PASID_ENTRY_NESTED:
 default:
 /* Unknown type */
 return false;
@@ -813,7 +817,6 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
 uint8_t pgtt;
 uint32_t index;
 dma_addr_t entry_size;
-X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 
 index = VTD_PASID_TABLE_INDEX(pasid);
 entry_size = VTD_PASID_ENTRY_SIZE;
@@ -827,7 +830,7 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
 }
 
 /* Do translation type check */
-if (!vtd_pe_type_check(x86_iommu, pe)) {
+if (!vtd_pe_type_check(s, pe)) {
 return -VTD_FR_PASID_TABLE_ENTRY_INV;
 }
 
@@ -3861,7 +3864,13 @@ static bool vtd_check_hiod(IntelIOMMUState *s, 
HostIOMMUDevice *hiod,
 return false;
 }
 
-return true;
+if (!s->scalable_modern) {
+/* All checks requested by VTD non-modern mode pass */
+return true;
+}
+
+error_setg(errp, "host device is unsupported in scalable modern mode yet");
+return false;
 }
 
 static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
@@ -4084,7 +4093,10 @@ static void vtd_cap_init(IntelIOMMUState *s)
 }
 
 /* TODO: read cap/ecap from host to decide which cap to be exposed. */
-if (s->scalable_mode) {
+if (s->scalable_modern) {
+s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_FLTS;
+s->cap |= VTD_CAP_FS1GP;
+} else if (s->scalable_mode) {
 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
 }
 
@@ -4251,9 +4263,9 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 }
 }
 
-/* Currently only address widths supported are 39 and 48 bits */
 if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
-(s->aw_bits != VTD_HOST_AW_48BIT)) {
+(s->aw_bits != VTD_HOST_AW_48BIT) &&
+!s->scala

[PATCH v1 11/17] intel_iommu: Extract device IOTLB invalidation logic

2024-07-18 Thread Zhenzhong Duan
From: Clément Mathieu--Drif 

This piece of code can be shared by both IOTLB invalidation and
PASID-based IOTLB invalidation.

No functional changes intended.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 57 +--
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 8ebb6dbd7d..4d5a457f92 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2975,13 +2975,43 @@ static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
 return true;
 }
 
+static void do_invalidate_device_tlb(VTDAddressSpace *vtd_dev_as,
+ bool size, hwaddr addr)
+{
+/*
+ * According to ATS spec table 2.4:
+ * S = 0, bits 15:12 =  range size: 4K
+ * S = 1, bits 15:12 = xxx0 range size: 8K
+ * S = 1, bits 15:12 = xx01 range size: 16K
+ * S = 1, bits 15:12 = x011 range size: 32K
+ * S = 1, bits 15:12 = 0111 range size: 64K
+ * ...
+ */
+
+IOMMUTLBEvent event;
+uint64_t sz;
+
+if (size) {
+sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
+addr &= ~(sz - 1);
+} else {
+sz = VTD_PAGE_SIZE;
+}
+
+event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP;
+event.entry.target_as = _dev_as->as;
+event.entry.addr_mask = sz - 1;
+event.entry.iova = addr;
+event.entry.perm = IOMMU_NONE;
+event.entry.translated_addr = 0;
+memory_region_notify_iommu(_dev_as->iommu, 0, event);
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
   VTDInvDesc *inv_desc)
 {
 VTDAddressSpace *vtd_dev_as;
-IOMMUTLBEvent event;
 hwaddr addr;
-uint64_t sz;
 uint16_t sid;
 bool size;
 
@@ -3006,28 +3036,7 @@ static bool 
vtd_process_device_iotlb_desc(IntelIOMMUState *s,
 goto done;
 }
 
-/* According to ATS spec table 2.4:
- * S = 0, bits 15:12 =  range size: 4K
- * S = 1, bits 15:12 = xxx0 range size: 8K
- * S = 1, bits 15:12 = xx01 range size: 16K
- * S = 1, bits 15:12 = x011 range size: 32K
- * S = 1, bits 15:12 = 0111 range size: 64K
- * ...
- */
-if (size) {
-sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
-addr &= ~(sz - 1);
-} else {
-sz = VTD_PAGE_SIZE;
-}
-
-event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP;
-event.entry.target_as = _dev_as->as;
-event.entry.addr_mask = sz - 1;
-event.entry.iova = addr;
-event.entry.perm = IOMMU_NONE;
-event.entry.translated_addr = 0;
-memory_region_notify_iommu(_dev_as->iommu, 0, event);
+do_invalidate_device_tlb(vtd_dev_as, size, addr);
 
 done:
 return true;
-- 
2.34.1




[PATCH v1 12/17] intel_iommu: Add an internal API to find an address space with PASID

2024-07-18 Thread Zhenzhong Duan
From: Clément Mathieu--Drif 

This will be used to implement the device IOTLB invalidation

Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 39 ---
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4d5a457f92..a17ce2b1f1 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -70,6 +70,11 @@ struct vtd_hiod_key {
 uint8_t devfn;
 };
 
+struct vtd_as_raw_key {
+uint16_t sid;
+uint32_t pasid;
+};
+
 struct vtd_iotlb_key {
 uint64_t gfn;
 uint32_t pasid;
@@ -1878,29 +1883,33 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
 return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
 }
 
-static gboolean vtd_find_as_by_sid(gpointer key, gpointer value,
-   gpointer user_data)
+static gboolean vtd_find_as_by_sid_and_pasid(gpointer key, gpointer value,
+ gpointer user_data)
 {
 struct vtd_as_key *as_key = (struct vtd_as_key *)key;
-uint16_t target_sid = *(uint16_t *)user_data;
+struct vtd_as_raw_key target = *(struct vtd_as_raw_key *)user_data;
 uint16_t sid = PCI_BUILD_BDF(pci_bus_num(as_key->bus), as_key->devfn);
-return sid == target_sid;
+
+return (as_key->pasid == target.pasid) &&
+   (sid == target.sid);
 }
 
-static VTDAddressSpace *vtd_get_as_by_sid(IntelIOMMUState *s, uint16_t sid)
+static VTDAddressSpace *vtd_get_as_by_sid_and_pasid(IntelIOMMUState *s,
+uint16_t sid,
+uint32_t pasid)
 {
-uint8_t bus_num = PCI_BUS_NUM(sid);
-VTDAddressSpace *vtd_as = s->vtd_as_cache[bus_num];
-
-if (vtd_as &&
-(sid == PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn))) {
-return vtd_as;
-}
+struct vtd_as_raw_key key = {
+.sid = sid,
+.pasid = pasid
+};
 
-vtd_as = g_hash_table_find(s->vtd_address_spaces, vtd_find_as_by_sid, 
);
-s->vtd_as_cache[bus_num] = vtd_as;
+return g_hash_table_find(s->vtd_address_spaces,
+ vtd_find_as_by_sid_and_pasid, );
+}
 
-return vtd_as;
+static VTDAddressSpace *vtd_get_as_by_sid(IntelIOMMUState *s, uint16_t sid)
+{
+return vtd_get_as_by_sid_and_pasid(s, sid, PCI_NO_PASID);
 }
 
 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
-- 
2.34.1




[PATCH v1 14/17] intel_iommu: piotlb invalidation should notify unmap

2024-07-18 Thread Zhenzhong Duan
This is used by some emulated devices which cache address
translation results. When a piotlb invalidation is issued in the guest,
those caches should be refreshed.

Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 35 ++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 8b66d6cfa5..c0116497b1 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2910,7 +2910,7 @@ static void vtd_piotlb_pasid_invalidate(IntelIOMMUState 
*s,
 continue;
 }
 
-if (!s->scalable_modern) {
+if (!s->scalable_modern || !vtd_as_has_map_notifier(vtd_as)) {
 vtd_address_space_sync(vtd_as);
 }
 }
@@ -2922,6 +2922,9 @@ static void vtd_piotlb_page_invalidate(IntelIOMMUState 
*s, uint16_t domain_id,
bool ih)
 {
 VTDIOTLBPageInvInfo info;
+VTDAddressSpace *vtd_as;
+VTDContextEntry ce;
+hwaddr size = (1 << am) * VTD_PAGE_SIZE;
 
 info.domain_id = domain_id;
 info.pasid = pasid;
@@ -2932,6 +2935,36 @@ static void vtd_piotlb_page_invalidate(IntelIOMMUState 
*s, uint16_t domain_id,
 g_hash_table_foreach_remove(s->iotlb,
 vtd_hash_remove_by_page_piotlb, );
 vtd_iommu_unlock(s);
+
+QLIST_FOREACH(vtd_as, >vtd_as_with_notifiers, next) {
+if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+  vtd_as->devfn, ) &&
+domain_id == vtd_get_domain_id(s, , vtd_as->pasid)) {
+uint32_t rid2pasid = VTD_CE_GET_RID2PASID();
+IOMMUTLBEvent event;
+
+if ((vtd_as->pasid != PCI_NO_PASID || pasid != rid2pasid) &&
+vtd_as->pasid != pasid) {
+continue;
+}
+
+/*
+ * Page-Selective-within-PASID PASID-based-IOTLB Invalidation
+ * does not flush stage-2 entries. See spec section 6.5.2.4
+ */
+if (!s->scalable_modern) {
+continue;
+}
+
+event.type = IOMMU_NOTIFIER_UNMAP;
+event.entry.target_as = _space_memory;
+event.entry.iova = addr;
+event.entry.perm = IOMMU_NONE;
+event.entry.addr_mask = size - 1;
+event.entry.translated_addr = 0;
+memory_region_notify_iommu(_as->iommu, 0, event);
+}
+}
 }
 
 static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
-- 
2.34.1




[PATCH v1 08/17] intel_iommu: Set accessed and dirty bits during first stage translation

2024-07-18 Thread Zhenzhong Duan
From: Clément Mathieu--Drif 

Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  3 +++
 hw/i386/intel_iommu.c  | 24 
 2 files changed, 27 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 168185b850..cf0f176e06 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -326,6 +326,7 @@ typedef enum VTDFaultReason {
 
 /* Output address in the interrupt address range for scalable mode */
 VTD_FR_SM_INTERRUPT_ADDR = 0x87,
+VTD_FR_FS_BIT_UPDATE_FAILED = 0x91, /* SFS.10 */
 VTD_FR_MAX, /* Guard */
 } VTDFaultReason;
 
@@ -560,6 +561,8 @@ typedef struct VTDRootEntry VTDRootEntry;
 /* Masks for First Level Paging Entry */
 #define VTD_FL_P1ULL
 #define VTD_FL_RW_MASK  (1ULL << 1)
+#define VTD_FL_A0x20
+#define VTD_FL_D0x40
 
 /* Second Level Page Translation Pointer*/
 #define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL)
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 495a41cf80..210df32f01 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1825,6 +1825,7 @@ static const bool vtd_qualified_faults[] = {
 [VTD_FR_PASID_TABLE_ENTRY_INV] = true,
 [VTD_FR_SM_INTERRUPT_ADDR] = true,
 [VTD_FR_FS_NON_CANONICAL] = true,
+[VTD_FR_FS_BIT_UPDATE_FAILED] = true,
 [VTD_FR_MAX] = false,
 };
 
@@ -1942,6 +1943,20 @@ static bool vtd_iova_fl_check_canonical(IntelIOMMUState 
*s, uint64_t iova,
 );
 }
 
+static MemTxResult vtd_set_flag_in_pte(dma_addr_t base_addr, uint32_t index,
+   uint64_t pte, uint64_t flag)
+{
+if (pte & flag) {
+return MEMTX_OK;
+}
+pte |= flag;
+pte = cpu_to_le64(pte);
+return dma_memory_write(_space_memory,
+base_addr + index * sizeof(pte),
+, sizeof(pte),
+MEMTXATTRS_UNSPECIFIED);
+}
+
 /*
  * Given the @iova, get relevant @flptep. @flpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
@@ -1993,7 +2008,16 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, 
VTDContextEntry *ce,
 return -VTD_FR_PAGING_ENTRY_RSVD;
 }
 
+if (vtd_set_flag_in_pte(addr, offset, flpte, VTD_FL_A) != MEMTX_OK) {
+return -VTD_FR_FS_BIT_UPDATE_FAILED;
+}
+
 if (vtd_is_last_pte(flpte, level)) {
+if (is_write &&
+(vtd_set_flag_in_pte(addr, offset, flpte, VTD_FL_D) !=
+MEMTX_OK)) 
{
+return -VTD_FR_FS_BIT_UPDATE_FAILED;
+}
 *flptep = flpte;
 *flpte_level = level;
 return 0;
-- 
2.34.1




[PATCH v1 01/17] intel_iommu: Use the latest fault reasons defined by spec

2024-07-18 Thread Zhenzhong Duan
From: Yu Zhang 

Spec revision 3.0 or above defines more detailed fault reasons for
scalable mode. So introduce them into emulation code, see spec
section 7.1.2 for details.

Note that the spec revision has no relation to the VERSION register; the guest
kernel should not use that register to judge which features are
supported. Instead, the cap/ecap bits should be checked.

Signed-off-by: Yu Zhang 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  9 -
 hw/i386/intel_iommu.c  | 25 -
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index f8cf99bddf..c0ca7b372f 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -311,7 +311,14 @@ typedef enum VTDFaultReason {
   * request while disabled */
 VTD_FR_IR_SID_ERR = 0x26,   /* Invalid Source-ID */
 
-VTD_FR_PASID_TABLE_INV = 0x58,  /*Invalid PASID table entry */
+/* PASID directory entry access failure */
+VTD_FR_PASID_DIR_ACCESS_ERR = 0x50,
+/* The Present(P) field of pasid directory entry is 0 */
+VTD_FR_PASID_DIR_ENTRY_P = 0x51,
+VTD_FR_PASID_TABLE_ACCESS_ERR = 0x58, /* PASID table entry access failure 
*/
+/* The Present(P) field of pasid table entry is 0 */
+VTD_FR_PASID_ENTRY_P = 0x59,
+VTD_FR_PASID_TABLE_ENTRY_INV = 0x5b,  /*Invalid PASID table entry */
 
 /* Output address in the interrupt address range for scalable mode */
 VTD_FR_SM_INTERRUPT_ADDR = 0x87,
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 37c21a0aec..e65f5b29a5 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -796,7 +796,7 @@ static int vtd_get_pdire_from_pdir_table(dma_addr_t 
pasid_dir_base,
 addr = pasid_dir_base + index * entry_size;
 if (dma_memory_read(_space_memory, addr,
 pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_DIR_ACCESS_ERR;
 }
 
 pdire->val = le64_to_cpu(pdire->val);
@@ -814,6 +814,7 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
   dma_addr_t addr,
   VTDPASIDEntry *pe)
 {
+uint8_t pgtt;
 uint32_t index;
 dma_addr_t entry_size;
 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
@@ -823,7 +824,7 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
 addr = addr + index * entry_size;
 if (dma_memory_read(_space_memory, addr,
 pe, entry_size, MEMTXATTRS_UNSPECIFIED)) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_TABLE_ACCESS_ERR;
 }
 for (size_t i = 0; i < ARRAY_SIZE(pe->val); i++) {
 pe->val[i] = le64_to_cpu(pe->val[i]);
@@ -831,11 +832,13 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
 
 /* Do translation type check */
 if (!vtd_pe_type_check(x86_iommu, pe)) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_TABLE_ENTRY_INV;
 }
 
-if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
-return -VTD_FR_PASID_TABLE_INV;
+pgtt = VTD_PE_GET_TYPE(pe);
+if (pgtt == VTD_SM_PASID_ENTRY_SLT &&
+!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
+return -VTD_FR_PASID_TABLE_ENTRY_INV;
 }
 
 return 0;
@@ -876,7 +879,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
 }
 
 if (!vtd_pdire_present()) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_DIR_ENTRY_P;
 }
 
 ret = vtd_get_pe_from_pdire(s, pasid, , pe);
@@ -885,7 +888,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
 }
 
 if (!vtd_pe_present(pe)) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_ENTRY_P;
 }
 
 return 0;
@@ -938,7 +941,7 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
 }
 
 if (!vtd_pdire_present()) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_DIR_ENTRY_P;
 }
 
 /*
@@ -1795,7 +1798,11 @@ static const bool vtd_qualified_faults[] = {
 [VTD_FR_ROOT_ENTRY_RSVD] = false,
 [VTD_FR_PAGING_ENTRY_RSVD] = true,
 [VTD_FR_CONTEXT_ENTRY_TT] = true,
-[VTD_FR_PASID_TABLE_INV] = false,
+[VTD_FR_PASID_DIR_ACCESS_ERR] = false,
+[VTD_FR_PASID_DIR_ENTRY_P] = true,
+[VTD_FR_PASID_TABLE_ACCESS_ERR] = false,
+[VTD_FR_PASID_ENTRY_P] = true,
+[VTD_FR_PASID_TABLE_ENTRY_INV] = true,
 [VTD_FR_SM_INTERRUPT_ADDR] = true,
 [VTD_FR_MAX] = false,
 };
-- 
2.34.1




[PATCH v1 00/17] intel_iommu: Enable stage-1 translation for emulated device

2024-07-18 Thread Zhenzhong Duan
Hi,

Per Jason Wang's suggestion, iommufd nesting series[1] is split into
"Enable stage-1 translation for emulated device" series and
"Enable stage-1 translation for passthrough device" series.

This series enables stage-1 translation support for emulated device
in intel iommu which we called "modern" mode.

PATCH1-5:  Some preparing work before support stage-1 translation
PATCH6-8:  Implement stage-1 translation for emulated device
PATCH9-14: Emulate iotlb invalidation of stage-1 mapping
PATCH15:   Set default aw_bits to 48 in scalable modern mode
PATCH16:   Introduce "modern" mode to distinguish with legacy mode
PATCH17:   Add qtest

Note that spec revision 3.4 renames "First-level" to "First-stage" and
"Second-level" to "Second-stage". But scalable mode was added
before that change, so we keep the old flavor, using First-level/fl/Second-level/sl
in code, but use stage-1/stage-2 in commit logs.
Keep in mind that First-level/fl/stage-1 all have the same meaning, and
the same goes for Second-level/sl/stage-2.

Qemu code can be found at [2]

[1] https://lists.gnu.org/archive/html/qemu-devel/2024-01/msg02740.html
[2] https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_stage1_emu_v1

Thanks
Zhenzhong

Changelog:
v1:
- define VTD_HOST_AW_AUTO (Clement)
- passing pgtt as a parameter to vtd_update_iotlb (Clement)
- prefix sl_/fl_ to second/first level specific functions (Clement)
- pick reserved bit check from Clement, add his Co-developed-by
- Update test without using libqtest-single.h (Thomas)

rfcv2:
- split from nesting series (Jason)
- merged some commits from Clement
- add qtest (jason)

Clément Mathieu--Drif (5):
  intel_iommu: Check if the input address is canonical
  intel_iommu: Set accessed and dirty bits during first stage
translation
  intel_iommu: Extract device IOTLB invalidation logic
  intel_iommu: Add an internal API to find an address space with PASID
  intel_iommu: Add support for PASID-based device IOTLB invalidation

Yi Liu (3):
  intel_iommu: Rename slpte to pte
  intel_iommu: Implement stage-1 translation
  intel_iommu: Modify x-scalable-mode to be string option

Yu Zhang (1):
  intel_iommu: Use the latest fault reasons defined by spec

Zhenzhong Duan (8):
  intel_iommu: Make pasid entry type check accurate
  intel_iommu: Add a placeholder variable for scalable modern mode
  intel_iommu: Flush stage-2 cache in PADID-selective PASID-based iotlb
invalidation
  intel_iommu: Flush stage-1 cache in iotlb invalidation
  intel_iommu: Process PASID-based iotlb invalidation
  intel_iommu: piotlb invalidation should notify unmap
  intel_iommu: Set default aw_bits to 48 in scalable modren mode
  tests/qtest: Add intel-iommu test

 MAINTAINERS|   1 +
 hw/i386/intel_iommu_internal.h |  90 +++-
 include/hw/i386/intel_iommu.h  |   8 +-
 hw/i386/intel_iommu.c  | 742 +++--
 tests/qtest/intel-iommu-test.c |  71 
 tests/qtest/meson.build|   1 +
 6 files changed, 764 insertions(+), 149 deletions(-)
 create mode 100644 tests/qtest/intel-iommu-test.c

-- 
2.34.1




[PATCH v1 04/17] intel_iommu: Flush stage-2 cache in PADID-selective PASID-based iotlb invalidation

2024-07-18 Thread Zhenzhong Duan
Per spec 6.5.2.4, PASID-selective PASID-based iotlb invalidation will
flush stage-2 iotlb entries with matching domain id and pasid.

With scalable modern mode introduced, guest could send PASID-selective
PASID-based iotlb invalidation to flush both stage-1 and stage-2 entries.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h | 10 +
 hw/i386/intel_iommu.c  | 78 ++
 2 files changed, 88 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 4e0331caba..f71fc91234 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -440,6 +440,16 @@ typedef union VTDInvDesc VTDInvDesc;
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
 
+#define VTD_INV_DESC_PIOTLB_ALL_IN_PASID  (2ULL << 4)
+#define VTD_INV_DESC_PIOTLB_PSI_IN_PASID  (3ULL << 4)
+
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff0ffc0ULL
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL
+
+#define VTD_INV_DESC_PIOTLB_PASID(val)(((val) >> 32) & 0xfULL)
+#define VTD_INV_DESC_PIOTLB_DID(val)  (((val) >> 16) & \
+ VTD_DOMAIN_ID_MASK)
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 40cbd4a0f4..075a27adac 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2659,6 +2659,80 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, 
VTDInvDesc *inv_desc)
 return true;
 }
 
+static gboolean vtd_hash_remove_by_pasid(gpointer key, gpointer value,
+ gpointer user_data)
+{
+VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
+VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
+
+return ((entry->domain_id == info->domain_id) &&
+(entry->pasid == info->pasid));
+}
+
+static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
+uint16_t domain_id, uint32_t pasid)
+{
+VTDIOTLBPageInvInfo info;
+VTDAddressSpace *vtd_as;
+VTDContextEntry ce;
+
+info.domain_id = domain_id;
+info.pasid = pasid;
+
+vtd_iommu_lock(s);
+g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_pasid,
+);
+vtd_iommu_unlock(s);
+
+QLIST_FOREACH(vtd_as, >vtd_as_with_notifiers, next) {
+if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+  vtd_as->devfn, ) &&
+domain_id == vtd_get_domain_id(s, , vtd_as->pasid)) {
+uint32_t rid2pasid = VTD_CE_GET_RID2PASID();
+
+if ((vtd_as->pasid != PCI_NO_PASID || pasid != rid2pasid) &&
+vtd_as->pasid != pasid) {
+continue;
+}
+
+if (!s->scalable_modern) {
+vtd_address_space_sync(vtd_as);
+}
+}
+}
+}
+
+static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
+VTDInvDesc *inv_desc)
+{
+uint16_t domain_id;
+uint32_t pasid;
+
+if ((inv_desc->val[0] & VTD_INV_DESC_PIOTLB_RSVD_VAL0) ||
+(inv_desc->val[1] & VTD_INV_DESC_PIOTLB_RSVD_VAL1)) {
+error_report_once("non-zero-field-in-piotlb_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+domain_id = VTD_INV_DESC_PIOTLB_DID(inv_desc->val[0]);
+pasid = VTD_INV_DESC_PIOTLB_PASID(inv_desc->val[0]);
+switch (inv_desc->val[0] & VTD_INV_DESC_IOTLB_G) {
+case VTD_INV_DESC_PIOTLB_ALL_IN_PASID:
+vtd_piotlb_pasid_invalidate(s, domain_id, pasid);
+break;
+
+case VTD_INV_DESC_PIOTLB_PSI_IN_PASID:
+break;
+
+default:
+error_report_once("Invalid granularity in P-IOTLB desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+return true;
+}
+
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
  VTDInvDesc *inv_desc)
 {
@@ -2769,6 +2843,10 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 break;
 
 case VTD_INV_DESC_PIOTLB:
+trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]);
+if (!vtd_process_piotlb_desc(s, _desc)) {
+return false;
+}
 break;
 
 case VTD_INV_DESC_WAIT:
-- 
2.34.1




[PATCH] RAMBlock: use return value of ram_block_discard_require() as errno

2024-07-16 Thread Zhenzhong Duan
When ram_block_discard_require() fails, errno is passed to error_setg_errno().
It's a stale value or 0 which is unrelated to ram_block_discard_require().

As ram_block_discard_require() already returns -EBUSY in failure case,
use it as errno for error_setg_errno().

Fixes: 852f0048f3ea ("make guest_memfd require uncoordinated discard")
Signed-off-by: Zhenzhong Duan 
---
 system/physmem.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/system/physmem.c b/system/physmem.c
index 2154432cb6..9a3b3a7636 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1845,11 +1845,14 @@ static void ram_block_add(RAMBlock *new_block, Error 
**errp)
 }
 
 if (new_block->flags & RAM_GUEST_MEMFD) {
+int ret;
+
 assert(kvm_enabled());
 assert(new_block->guest_memfd < 0);
 
-if (ram_block_discard_require(true) < 0) {
-error_setg_errno(errp, errno,
+ret = ram_block_discard_require(true);
+if (ret < 0) {
+error_setg_errno(errp, -ret,
  "cannot set up private guest memory: discard 
currently blocked");
 error_append_hint(errp, "Are you using assigned devices?\n");
 goto out_free;
-- 
2.34.1




[PATCH v2 1/2] vfio/display: Fix potential memleak of edid info

2024-06-30 Thread Zhenzhong Duan
EDID related device region info is leaked in vfio_display_edid_init()
error path and VFIODisplay destroying path.

Fixes: 08479114b0de ("vfio/display: add edid support.")
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/display.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/vfio/display.c b/hw/vfio/display.c
index 661e921616..9c57fd3888 100644
--- a/hw/vfio/display.c
+++ b/hw/vfio/display.c
@@ -171,7 +171,9 @@ static void vfio_display_edid_init(VFIOPCIDevice *vdev)
 
 err:
 trace_vfio_display_edid_write_error();
+g_free(dpy->edid_info);
 g_free(dpy->edid_regs);
+dpy->edid_info = NULL;
 dpy->edid_regs = NULL;
 return;
 }
@@ -182,6 +184,7 @@ static void vfio_display_edid_exit(VFIODisplay *dpy)
 return;
 }
 
+g_free(dpy->edid_info);
 g_free(dpy->edid_regs);
 g_free(dpy->edid_blob);
 timer_free(dpy->edid_link_timer);
-- 
2.34.1




[PATCH v2 0/2] Misc fixes on vfio display

2024-06-30 Thread Zhenzhong Duan
Hi,

This is trying to address an issue Cédric found.
See https://www.mail-archive.com/qemu-devel@nongnu.org/msg1043142.html
While looking into it, also found a potential memory leak.

I'm sorry that I couldn't find a way to test this fix, because it looks like
a GFX card is needed. Any ideas on how to test it, or help with testing, would
be much appreciated.

Thanks
Zhenzhong

v2:
- set dpy->edid_info to NULL in vfio_display_edid_init() err path (Marc-André)
- remove a wrongly added g_free(*info) in vfio_get_dev_region_info() 
(Marc-André)
- add R-B on patch2


Zhenzhong Duan (2):
  vfio/display: Fix potential memleak of edid info
  vfio/display: Fix vfio_display_edid_init() error path

 hw/vfio/display.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

-- 
2.34.1




[PATCH v2 2/2] vfio/display: Fix vfio_display_edid_init() error path

2024-06-30 Thread Zhenzhong Duan
vfio_display_edid_init() can fail for many reasons and return silently.
It would be good to report the error.

An old mdev driver may not support the vfio edid region, so in this case we
allow initialization to proceed without edid.

vfio_display_edid_update() isn't changed because it can be called at
runtime when UI changes (i.e. window resize).

Fixes: 08479114b0de ("vfio/display: add edid support.")
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Marc-André Lureau 
---
 hw/vfio/display.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/display.c b/hw/vfio/display.c
index 9c57fd3888..ea87830fe0 100644
--- a/hw/vfio/display.c
+++ b/hw/vfio/display.c
@@ -124,7 +124,7 @@ static void vfio_display_edid_ui_info(void *opaque, 
uint32_t idx,
 }
 }
 
-static void vfio_display_edid_init(VFIOPCIDevice *vdev)
+static bool vfio_display_edid_init(VFIOPCIDevice *vdev, Error **errp)
 {
 VFIODisplay *dpy = vdev->dpy;
 int fd = vdev->vbasedev.fd;
@@ -135,7 +135,8 @@ static void vfio_display_edid_init(VFIOPCIDevice *vdev)
VFIO_REGION_SUBTYPE_GFX_EDID,
&dpy->edid_info);
 if (ret) {
-return;
+/* Failed to get GFX edid info, allow to go through without edid. */
+return true;
 }
 
 trace_vfio_display_edid_available();
@@ -167,15 +168,16 @@ static void vfio_display_edid_init(VFIOPCIDevice *vdev)
 vfio_display_edid_link_up, vdev);
 
 vfio_display_edid_update(vdev, true, 0, 0);
-return;
+return true;
 
 err:
+error_setg(errp, "vfio: failed to read GFX edid field");
 trace_vfio_display_edid_write_error();
 g_free(dpy->edid_info);
 g_free(dpy->edid_regs);
 dpy->edid_info = NULL;
 dpy->edid_regs = NULL;
-return;
+return false;
 }
 
 static void vfio_display_edid_exit(VFIODisplay *dpy)
@@ -368,8 +370,7 @@ static bool vfio_display_dmabuf_init(VFIOPCIDevice *vdev, 
Error **errp)
 return false;
 }
 }
-vfio_display_edid_init(vdev);
-return true;
+return vfio_display_edid_init(vdev, errp);
 }
 
 static void vfio_display_dmabuf_exit(VFIODisplay *dpy)
-- 
2.34.1




[PATCH 0/2] Misc fixes on vfio display

2024-06-28 Thread Zhenzhong Duan
Hi,

This is trying to address an issue Cédric found.
See https://www.mail-archive.com/qemu-devel@nongnu.org/msg1043142.html
While looking into it, also found a potential memory leak.

I'm sorry that I couldn't find a way to test this fix, because it looks like
a GFX card is needed. Any ideas on how to test it, or help with testing, would
be much appreciated.

Thanks
Zhenzhong

Zhenzhong Duan (2):
  vfio/display: Fix potential memleak of edid info
  vfio/display: Fix vfio_display_edid_init() error path

 hw/vfio/display.c | 15 +--
 hw/vfio/helpers.c |  1 +
 2 files changed, 10 insertions(+), 6 deletions(-)

-- 
2.34.1




[PATCH 1/2] vfio/display: Fix potential memleak of edid info

2024-06-28 Thread Zhenzhong Duan
EDID related device region info is leaked in three paths:
1. In vfio_get_dev_region_info(), when edid info isn't found, the last
device region info is leaked.
2. In vfio_display_edid_init() error path, edid info is leaked.
3. In VFIODisplay destroying path, edid info is leaked.

Fixes: 08479114b0de ("vfio/display: add edid support.")
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/display.c | 2 ++
 hw/vfio/helpers.c | 1 +
 2 files changed, 3 insertions(+)

diff --git a/hw/vfio/display.c b/hw/vfio/display.c
index 661e921616..5926bd6628 100644
--- a/hw/vfio/display.c
+++ b/hw/vfio/display.c
@@ -171,6 +171,7 @@ static void vfio_display_edid_init(VFIOPCIDevice *vdev)
 
 err:
 trace_vfio_display_edid_write_error();
+g_free(dpy->edid_info);
 g_free(dpy->edid_regs);
 dpy->edid_regs = NULL;
 return;
@@ -182,6 +183,7 @@ static void vfio_display_edid_exit(VFIODisplay *dpy)
 return;
 }
 
+g_free(dpy->edid_info);
 g_free(dpy->edid_regs);
 g_free(dpy->edid_blob);
 timer_free(dpy->edid_link_timer);
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index b14edd46ed..3dd32b26a4 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -586,6 +586,7 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t 
type,
 g_free(*info);
 }
 
+g_free(*info);
 *info = NULL;
 return -ENODEV;
 }
-- 
2.34.1




[PATCH 2/2] vfio/display: Fix vfio_display_edid_init() error path

2024-06-28 Thread Zhenzhong Duan
vfio_display_edid_init() can fail for many reasons and return silently.
It would be good to report the error.

An old mdev driver may not support the vfio edid region, so in this case we
allow initialization to proceed without edid.

vfio_display_edid_update() isn't changed because it can be called at
runtime when UI changes (i.e. window resize).

Fixes: 08479114b0de ("vfio/display: add edid support.")
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/display.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/display.c b/hw/vfio/display.c
index 5926bd6628..462845ce69 100644
--- a/hw/vfio/display.c
+++ b/hw/vfio/display.c
@@ -124,7 +124,7 @@ static void vfio_display_edid_ui_info(void *opaque, 
uint32_t idx,
 }
 }
 
-static void vfio_display_edid_init(VFIOPCIDevice *vdev)
+static bool vfio_display_edid_init(VFIOPCIDevice *vdev, Error **errp)
 {
 VFIODisplay *dpy = vdev->dpy;
 int fd = vdev->vbasedev.fd;
@@ -135,7 +135,8 @@ static void vfio_display_edid_init(VFIOPCIDevice *vdev)
VFIO_REGION_SUBTYPE_GFX_EDID,
&dpy->edid_info);
 if (ret) {
-return;
+/* Failed to get GFX edid info, allow to go through without edid. */
+return true;
 }
 
 trace_vfio_display_edid_available();
@@ -167,14 +168,15 @@ static void vfio_display_edid_init(VFIOPCIDevice *vdev)
 vfio_display_edid_link_up, vdev);
 
 vfio_display_edid_update(vdev, true, 0, 0);
-return;
+return true;
 
 err:
+error_setg(errp, "vfio: failed to read GFX edid field");
 trace_vfio_display_edid_write_error();
 g_free(dpy->edid_info);
 g_free(dpy->edid_regs);
 dpy->edid_regs = NULL;
-return;
+return false;
 }
 
 static void vfio_display_edid_exit(VFIODisplay *dpy)
@@ -367,8 +369,7 @@ static bool vfio_display_dmabuf_init(VFIOPCIDevice *vdev, 
Error **errp)
 return false;
 }
 }
-vfio_display_edid_init(vdev);
-return true;
+return vfio_display_edid_init(vdev, errp);
 }
 
 static void vfio_display_dmabuf_exit(VFIODisplay *dpy)
-- 
2.34.1




[PATCH v7 12/17] hw/pci: Introduce helper function pci_device_get_iommu_bus_devfn()

2024-06-05 Thread Zhenzhong Duan
Extract out pci_device_get_iommu_bus_devfn() from
pci_device_iommu_address_space() to facilitate
implementation of pci_device_[set|unset]_iommu_device()
in following patch.

No functional change intended.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Eric Auger 
---
 hw/pci/pci.c | 48 +---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 324c1302d2..02a4bb2af6 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2648,11 +2648,27 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+/*
+ * Get IOMMU root bus, aliased bus and devfn of a PCI device
+ *
+ * IOMMU root bus is needed by all call sites to call into iommu_ops.
+ * For call sites which don't need aliased BDF, passing NULL to
+ * aliased_[bus|devfn] is allowed.
+ *
+ * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device.
+ *
+ * @aliased_bus: return aliased #PCIBus of the PCI device, optional.
+ *
+ * @aliased_devfn: return aliased devfn of the PCI device, optional.
+ */
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+   PCIBus **piommu_bus,
+   PCIBus **aliased_bus,
+   int *aliased_devfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
-uint8_t devfn = dev->devfn;
+int devfn = dev->devfn;
 
 while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
 PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
@@ -2693,7 +2709,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+assert(iommu_bus);
+
+if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) {
+iommu_bus = NULL;
+}
+
+*piommu_bus = iommu_bus;
+
+if (aliased_bus) {
+*aliased_bus = bus;
+}
+
+if (aliased_devfn) {
+*aliased_devfn = devfn;
+}
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+PCIBus *iommu_bus;
+int devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
+if (iommu_bus) {
 return iommu_bus->iommu_ops->get_address_space(bus,
  iommu_bus->iommu_opaque, devfn);
 }
-- 
2.34.1




[PATCH v7 10/17] backends/iommufd: Implement HostIOMMUDeviceClass::get_cap() handler

2024-06-05 Thread Zhenzhong Duan
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 backends/iommufd.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/backends/iommufd.c b/backends/iommufd.c
index c7e969d6f7..84fefbc9ee 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -230,6 +230,28 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, 
uint32_t devid,
 return true;
 }
 
+static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp)
+{
+HostIOMMUDeviceCaps *caps = &hiod->caps;
+
+switch (cap) {
+case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE:
+return caps->type;
+case HOST_IOMMU_DEVICE_CAP_AW_BITS:
+return caps->aw_bits;
+default:
+error_setg(errp, "%s: unsupported capability %x", hiod->name, cap);
+return -EINVAL;
+}
+}
+
+static void hiod_iommufd_class_init(ObjectClass *oc, void *data)
+{
+HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hioc->get_cap = hiod_iommufd_get_cap;
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_IOMMUFD_BACKEND,
@@ -246,6 +268,7 @@ static const TypeInfo types[] = {
 }, {
 .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
 .parent = TYPE_HOST_IOMMU_DEVICE,
+.class_init = hiod_iommufd_class_init,
 .abstract = true,
 }
 };
-- 
2.34.1




[PATCH v7 17/17] intel_iommu: Check compatibility with host IOMMU capabilities

2024-06-05 Thread Zhenzhong Duan
If the check fails, the host device (either a VFIO or a VDPA device) is not
compatible with the current vIOMMU config and should not be passed to the
guest.

Only aw_bits is checked for now; we don't care about other caps
before scalable modern mode is introduced.

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 07e897ad7a..f592082444 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3837,6 +3837,30 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static bool vtd_check_hiod(IntelIOMMUState *s, HostIOMMUDevice *hiod,
+   Error **errp)
+{
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
+int ret;
+
+if (!hiodc->get_cap) {
+error_setg(errp, ".get_cap() not implemented");
+return false;
+}
+
+/* Common checks */
+ret = hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_AW_BITS, errp);
+if (ret < 0) {
+return false;
+}
+if (s->aw_bits > ret) {
+error_setg(errp, "aw-bits %d > host aw-bits %d", s->aw_bits, ret);
+return false;
+}
+
+return true;
+}
+
 static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
  HostIOMMUDevice *hiod, Error **errp)
 {
@@ -3857,6 +3881,11 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 return false;
 }
 
+if (!vtd_check_hiod(s, hiod, errp)) {
+vtd_iommu_unlock(s);
+return false;
+}
+
 new_key = g_malloc(sizeof(*new_key));
 new_key->bus = bus;
 new_key->devfn = devfn;
-- 
2.34.1




[PATCH v7 11/17] vfio: Create host IOMMU device instance

2024-06-05 Thread Zhenzhong Duan
Create host IOMMU device instance in vfio_attach_device() and call
.realize() to initialize it further.

Introduce attribute VFIOIOMMUClass::hiod_typename and initialize
it based on the VFIO backend type. It will facilitate HostIOMMUDevice
creation in vfio_attach_device().

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h |  1 +
 include/hw/vfio/vfio-container-base.h |  3 +++
 hw/vfio/common.c  | 16 +++-
 hw/vfio/container.c   |  2 ++
 hw/vfio/iommufd.c |  2 ++
 5 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 105b8b7e80..776de8064f 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -127,6 +127,7 @@ typedef struct VFIODevice {
 OnOffAuto pre_copy_dirty_page_tracking;
 bool dirty_pages_supported;
 bool dirty_tracking;
+HostIOMMUDevice *hiod;
 int devid;
 IOMMUFDBackend *iommufd;
 } VFIODevice;
diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 2776481fc9..442c0dfc4c 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -109,6 +109,9 @@ DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, 
TYPE_VFIO_IOMMU)
 struct VFIOIOMMUClass {
 InterfaceClass parent_class;
 
+/* Properties */
+const char *hiod_typename;
+
 /* basic feature */
 bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
 int (*dma_map)(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index f9619a1dfb..f20a7b5bba 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1528,6 +1528,7 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev,
 {
 const VFIOIOMMUClass *ops =
 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
+HostIOMMUDevice *hiod;
 
 if (vbasedev->iommufd) {
 ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
@@ -1535,7 +1536,19 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev,
 
 assert(ops);
 
-return ops->attach_device(name, vbasedev, as, errp);
+if (!ops->attach_device(name, vbasedev, as, errp)) {
+return false;
+}
+
+hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
+if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) {
+object_unref(hiod);
+ops->detach_device(vbasedev);
+return false;
+}
+vbasedev->hiod = hiod;
+
+return true;
 }
 
 void vfio_detach_device(VFIODevice *vbasedev)
@@ -1543,5 +1556,6 @@ void vfio_detach_device(VFIODevice *vbasedev)
 if (!vbasedev->bcontainer) {
 return;
 }
+object_unref(vbasedev->hiod);
 vbasedev->bcontainer->ops->detach_device(vbasedev);
 }
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 99beeba422..26e6f7fb4f 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1126,6 +1126,8 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
 
+vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO;
+
 vioc->setup = vfio_legacy_setup;
 vioc->dma_map = vfio_legacy_dma_map;
 vioc->dma_unmap = vfio_legacy_dma_unmap;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 1674c61227..409ed3dcc9 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -612,6 +612,8 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
 
+vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO;
+
 vioc->dma_map = iommufd_cdev_map;
 vioc->dma_unmap = iommufd_cdev_unmap;
 vioc->attach_device = iommufd_cdev_attach;
-- 
2.34.1




[PATCH v7 04/17] backends/iommufd: Introduce TYPE_HOST_IOMMU_DEVICE_IOMMUFD[_VFIO] devices

2024-06-05 Thread Zhenzhong Duan
TYPE_HOST_IOMMU_DEVICE_IOMMUFD represents a host IOMMU device under
the iommufd backend. It is abstract, because it is going to be derived
into VFIO- or VDPA-typed devices.

It will have its own .get_cap() implementation.

TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO is a sub-class of
TYPE_HOST_IOMMU_DEVICE_IOMMUFD and represents a VFIO-typed host IOMMU
device under the iommufd backend. It will be created during VFIO device
attaching and passed to the vIOMMU.

It will have its own .realize() implementation.

Opportunistically, add missed header to include/sysemu/iommufd.h.

Suggested-by: Cédric Le Goater 
Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h |  3 +++
 include/sysemu/iommufd.h  | 16 
 backends/iommufd.c| 35 ++-
 hw/vfio/iommufd.c |  5 -
 4 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 75b167979a..56d1717211 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -32,6 +32,7 @@
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
 #include "sysemu/host_iommu_device.h"
+#include "sysemu/iommufd.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -173,6 +174,8 @@ typedef struct VFIOGroup {
 } VFIOGroup;
 
 #define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE 
"-legacy-vfio"
+#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \
+TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio"
 
 typedef struct VFIODMABuf {
 QemuDmaBuf *buf;
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 293bfbe967..f6e6d6e1f9 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -1,9 +1,23 @@
+/*
+ * iommufd container backend declaration
+ *
+ * Copyright (C) 2024 Intel Corporation.
+ * Copyright Red Hat, Inc. 2024
+ *
+ * Authors: Yi Liu 
+ *  Eric Auger 
+ *  Zhenzhong Duan 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
 #ifndef SYSEMU_IOMMUFD_H
 #define SYSEMU_IOMMUFD_H
 
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
 OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
@@ -33,4 +47,6 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+
+#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index c506afbdac..012f18d8d8 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -208,23 +208,24 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, 
uint32_t ioas_id,
 return ret;
 }
 
-static const TypeInfo iommufd_backend_info = {
-.name = TYPE_IOMMUFD_BACKEND,
-.parent = TYPE_OBJECT,
-.instance_size = sizeof(IOMMUFDBackend),
-.instance_init = iommufd_backend_init,
-.instance_finalize = iommufd_backend_finalize,
-.class_size = sizeof(IOMMUFDBackendClass),
-.class_init = iommufd_backend_class_init,
-.interfaces = (InterfaceInfo[]) {
-{ TYPE_USER_CREATABLE },
-{ }
+static const TypeInfo types[] = {
+{
+.name = TYPE_IOMMUFD_BACKEND,
+.parent = TYPE_OBJECT,
+.instance_size = sizeof(IOMMUFDBackend),
+.instance_init = iommufd_backend_init,
+.instance_finalize = iommufd_backend_finalize,
+.class_size = sizeof(IOMMUFDBackendClass),
+.class_init = iommufd_backend_class_init,
+.interfaces = (InterfaceInfo[]) {
+{ TYPE_USER_CREATABLE },
+{ }
+}
+}, {
+.name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+.abstract = true,
 }
 };
 
-static void register_types(void)
-{
-type_register_static(&iommufd_backend_info);
-}
-
-type_init(register_types);
+DEFINE_TYPES(types)
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 554f9a6292..e4a507d55c 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -624,7 +624,10 @@ static const TypeInfo types[] = {
 .name = TYPE_VFIO_IOMMU_IOMMUFD,
 .parent = TYPE_VFIO_IOMMU,
 .class_init = vfio_iommu_iommufd_class_init,
-},
+}, {
+.name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO,
+.parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
+}
 };
 
 DEFINE_TYPES(types)
-- 
2.34.1




[PATCH v7 14/17] vfio/pci: Pass HostIOMMUDevice to vIOMMU

2024-06-05 Thread Zhenzhong Duan
With HostIOMMUDevice passed, vIOMMU can check compatibility with host
IOMMU, call into IOMMUFD specific methods, etc.

Originally-by: Yi Liu 
Signed-off-by: Nicolin Chen 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Eric Auger 
---
 hw/vfio/pci.c | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 74a79bdf61..d8a76c1ee0 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3121,10 +3121,15 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 vfio_bars_register(vdev);
 
-if (!vfio_add_capabilities(vdev, errp)) {
+if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
+error_prepend(errp, "Failed to set iommu_device: ");
 goto out_teardown;
 }
 
+if (!vfio_add_capabilities(vdev, errp)) {
+goto out_unset_idev;
+}
+
 if (vdev->vga) {
 vfio_vga_quirk_setup(vdev);
 }
@@ -3141,7 +3146,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 error_setg(errp,
"cannot support IGD OpRegion feature on hotplugged "
"device");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_get_dev_region_info(vbasedev,
@@ -3150,11 +3155,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 if (ret) {
 error_setg_errno(errp, -ret,
  "does not support requested IGD OpRegion 
feature");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 if (!vfio_pci_igd_opregion_init(vdev, opregion, errp)) {
-goto out_teardown;
+goto out_unset_idev;
 }
 }
 
@@ -3238,6 +3243,8 @@ out_deregister:
 if (vdev->intx.mmap_timer) {
 timer_free(vdev->intx.mmap_timer);
 }
+out_unset_idev:
+pci_device_unset_iommu_device(pdev);
 out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
@@ -3266,6 +3273,7 @@ static void vfio_instance_finalize(Object *obj)
 static void vfio_exitfn(PCIDevice *pdev)
 {
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIODevice *vbasedev = &vdev->vbasedev;
 
 vfio_unregister_req_notifier(vdev);
 vfio_unregister_err_notifier(vdev);
@@ -3280,7 +3288,8 @@ static void vfio_exitfn(PCIDevice *pdev)
 vfio_teardown_msi(vdev);
 vfio_pci_disable_rp_atomics(vdev);
 vfio_bars_exit(vdev);
-vfio_migration_exit(&vdev->vbasedev);
+vfio_migration_exit(vbasedev);
+pci_device_unset_iommu_device(pdev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)
-- 
2.34.1




[PATCH v7 00/17] Add a host IOMMU device abstraction to check with vIOMMU

2024-06-05 Thread Zhenzhong Duan
 device and its sub-classes (Cédric)
- move host IOMMU device creation in attach_device() (Cédric)
- refine pci_device_set/unset_iommu_device doc further (Eric)
- define host IOMMU info format of different backend
- implement get_host_iommu_info() for different backend (Cédric)
- drop cap/ecap update logic (MST)
- check aw-bits from get_host_iommu_info() in legacy mode

v1:
- use HostIOMMUDevice handle instead of union in VFIODevice (Eric)
- change host_iommu_device_init to host_iommu_device_create
- allocate HostIOMMUDevice in host_iommu_device_create callback
  and set the VFIODevice base_hdev handle (Eric)
- refine pci_device_set/unset_iommu_device doc (Eric)
- use HostIOMMUDevice handle instead of union in VTDHostIOMMUDevice (Eric)
- convert HostIOMMUDevice to sub object pointer in vtd_check_hdev

rfcv2:
- introduce common abstract HostIOMMUDevice and sub struct for different BEs 
(Eric, Cédric)
- remove iommufd_device.[ch] (Cédric)
- remove duplicate iommufd/devid define from VFIODevice (Eric)
- drop the p in aliased_pbus and aliased_pdevfn (Eric)
- assert devfn and iommu_bus in pci_device_get_iommu_bus_devfn (Cédric, Eric)
- use errp in iommufd_device_get_info (Eric)
- split and simplify cap/ecap check/sync code in intel_iommu.c (Cédric)
- move VTDHostIOMMUDevice declaration to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B

Yi Liu (2):
  hw/pci: Introduce pci_device_[set|unset]_iommu_device()
  intel_iommu: Implement [set|unset]_iommu_device() callbacks

Zhenzhong Duan (15):
  backends: Introduce HostIOMMUDevice abstract
  backends/host_iommu_device: Introduce HostIOMMUDeviceCaps
  vfio/container: Introduce TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO device
  backends/iommufd: Introduce TYPE_HOST_IOMMU_DEVICE_IOMMUFD[_VFIO]
devices
  range: Introduce range_get_last_bit()
  vfio/container: Implement HostIOMMUDeviceClass::realize() handler
  backends/iommufd: Introduce helper function
iommufd_backend_get_device_info()
  vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler
  vfio/container: Implement HostIOMMUDeviceClass::get_cap() handler
  backends/iommufd: Implement HostIOMMUDeviceClass::get_cap() handler
  vfio: Create host IOMMU device instance
  hw/pci: Introduce helper function pci_device_get_iommu_bus_devfn()
  vfio/pci: Pass HostIOMMUDevice to vIOMMU
  intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap
  intel_iommu: Check compatibility with host IOMMU capabilities

 MAINTAINERS   |   2 +
 include/hw/i386/intel_iommu.h |   2 +
 include/hw/pci/pci.h  |  38 -
 include/hw/vfio/vfio-common.h |   8 +
 include/hw/vfio/vfio-container-base.h |   3 +
 include/qemu/range.h  |  11 ++
 include/sysemu/host_iommu_device.h|  91 
 include/sysemu/iommufd.h  |  19 +++
 backends/host_iommu_device.c  |  33 +
 backends/iommufd.c|  76 --
 hw/i386/intel_iommu.c | 203 --
 hw/pci/pci.c  |  75 +-
 hw/vfio/common.c  |  16 +-
 hw/vfio/container.c   |  41 +-
 hw/vfio/helpers.c |  17 +++
 hw/vfio/iommufd.c |  37 -
 hw/vfio/pci.c |  19 ++-
 backends/meson.build  |   1 +
 18 files changed, 623 insertions(+), 69 deletions(-)
 create mode 100644 include/sysemu/host_iommu_device.h
 create mode 100644 backends/host_iommu_device.c

-- 
2.34.1




[PATCH v7 03/17] vfio/container: Introduce TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO device

2024-06-05 Thread Zhenzhong Duan
TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO represents a host IOMMU device under
VFIO legacy container backend.

It will have its own realize implementation.

Suggested-by: Eric Auger 
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 3 +++
 hw/vfio/container.c   | 5 -
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 4cb1ab8645..75b167979a 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@
 #endif
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/host_iommu_device.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -171,6 +172,8 @@ typedef struct VFIOGroup {
 bool ram_block_discard_allowed;
 } VFIOGroup;
 
+#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE 
"-legacy-vfio"
+
 typedef struct VFIODMABuf {
 QemuDmaBuf *buf;
 uint32_t pos_x, pos_y, pos_updates;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 096cc97258..c4fca2dfca 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1141,7 +1141,10 @@ static const TypeInfo types[] = {
 .name = TYPE_VFIO_IOMMU_LEGACY,
 .parent = TYPE_VFIO_IOMMU,
 .class_init = vfio_iommu_legacy_class_init,
-},
+}, {
+.name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+}
 };
 
 DEFINE_TYPES(types)
-- 
2.34.1




[PATCH v7 08/17] vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler

2024-06-05 Thread Zhenzhong Duan
The handler calls iommufd_backend_get_device_info() to get host IOMMU
related information and translates it into HostIOMMUDeviceCaps,
which can then be queried with .get_cap().

For aw_bits, use the same approach as the legacy backend: call
vfio_device_get_aw_bits(), which is common to different vendor
IOMMUs.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/iommufd.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index e4a507d55c..1674c61227 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -619,6 +619,35 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset;
 };
 
+static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
+  Error **errp)
+{
+VFIODevice *vdev = opaque;
+HostIOMMUDeviceCaps *caps = &hiod->caps;
+enum iommu_hw_info_type type;
+union {
+struct iommu_hw_info_vtd vtd;
+} data;
+
+if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid,
+ &type, &data, sizeof(data), errp)) {
+return false;
+}
+
+hiod->name = g_strdup(vdev->name);
+caps->type = type;
+caps->aw_bits = vfio_device_get_aw_bits(vdev);
+
+return true;
+}
+
+static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data)
+{
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hiodc->realize = hiod_iommufd_vfio_realize;
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_VFIO_IOMMU_IOMMUFD,
@@ -627,6 +656,7 @@ static const TypeInfo types[] = {
 }, {
 .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO,
 .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
+.class_init = hiod_iommufd_vfio_class_init,
 }
 };
 
-- 
2.34.1




[PATCH v7 13/17] hw/pci: Introduce pci_device_[set|unset]_iommu_device()

2024-06-05 Thread Zhenzhong Duan
From: Yi Liu 

pci_device_[set|unset]_iommu_device() call pci_device_get_iommu_bus_devfn()
to get iommu_bus->iommu_ops and call [set|unset]_iommu_device callback to
set/unset HostIOMMUDevice for a given PCI device.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Eric Auger 
---
 include/hw/pci/pci.h | 38 +-
 hw/pci/pci.c | 27 +++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index eaa3fc99d8..eb26cac810 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -3,6 +3,7 @@
 
 #include "exec/memory.h"
 #include "sysemu/dma.h"
+#include "sysemu/host_iommu_device.h"
 
 /* PCI includes legacy ISA access.  */
 #include "hw/isa/isa.h"
@@ -383,10 +384,45 @@ typedef struct PCIIOMMUOps {
  *
  * @devfn: device and function number
  */
-   AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+/**
+ * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU
+ *
+ * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
+ * retrieve host information from the associated HostIOMMUDevice.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ *
+ * @dev: the #HostIOMMUDevice to attach.
+ *
+ * @errp: pass an Error out only when return false
+ *
+ * Returns: true if HostIOMMUDevice is attached or else false with errp 
set.
+ */
+bool (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn,
+ HostIOMMUDevice *dev, Error **errp);
+/**
+ * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU
+ *
+ * Optional callback.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ */
+void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
 } PCIIOMMUOps;
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
+ Error **errp);
+void pci_device_unset_iommu_device(PCIDevice *dev);
 
 /**
  * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 02a4bb2af6..c8a8aab306 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2742,6 +2742,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 return _space_memory;
 }
 
+bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
+ Error **errp)
+{
+PCIBus *iommu_bus;
+
+/* set_iommu_device requires device's direct BDF instead of aliased BDF */
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL);
+if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) {
+return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev),
+  iommu_bus->iommu_opaque,
+  dev->devfn, hiod, errp);
+}
+return true;
+}
+
+void pci_device_unset_iommu_device(PCIDevice *dev)
+{
+PCIBus *iommu_bus;
+
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL);
+if (iommu_bus && iommu_bus->iommu_ops->unset_iommu_device) {
+return iommu_bus->iommu_ops->unset_iommu_device(pci_get_bus(dev),
+iommu_bus->iommu_opaque,
+dev->devfn);
+}
+}
+
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
 {
 /*
-- 
2.34.1




[PATCH v7 15/17] intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap

2024-06-05 Thread Zhenzhong Duan
Extract the cap/ecap initialization into vtd_cap_init() to make the code
cleaner.

No functional change intended.

Reviewed-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 93 ---
 1 file changed, 51 insertions(+), 42 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index cc8e59674e..519063c8f8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3934,30 +3934,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion 
*iommu_mr, IOMMUNotifier *n)
 return;
 }
 
-/* Do the initialization. It will also be called when reset, so pay
- * attention when adding new initialization stuff.
- */
-static void vtd_init(IntelIOMMUState *s)
+static void vtd_cap_init(IntelIOMMUState *s)
 {
 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 
-memset(s->csr, 0, DMAR_REG_SIZE);
-memset(s->wmask, 0, DMAR_REG_SIZE);
-memset(s->w1cmask, 0, DMAR_REG_SIZE);
-memset(s->womask, 0, DMAR_REG_SIZE);
-
-s->root = 0;
-s->root_scalable = false;
-s->dmar_enabled = false;
-s->intr_enabled = false;
-s->iq_head = 0;
-s->iq_tail = 0;
-s->iq = 0;
-s->iq_size = 0;
-s->qi_enabled = false;
-s->iq_last_desc_type = VTD_INV_DESC_NONE;
-s->iq_dw = false;
-s->next_frcd_reg = 0;
 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
  VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
  VTD_CAP_MGAW(s->aw_bits);
@@ -3974,27 +3954,6 @@ static void vtd_init(IntelIOMMUState *s)
 }
 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
-/*
- * Rsvd field masks for spte
- */
-vtd_spte_rsvd[0] = ~0ULL;
-vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
-  x86_iommu->dt_supported);
-vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
-
-vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-
-if (s->scalable_mode || s->snoop_control) {
-vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
-}
-
 if (x86_iommu_ir_supported(x86_iommu)) {
 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
 if (s->intr_eim == ON_OFF_AUTO_ON) {
@@ -4027,6 +3986,56 @@ static void vtd_init(IntelIOMMUState *s)
 if (s->pasid) {
 s->ecap |= VTD_ECAP_PASID;
 }
+}
+
+/*
+ * Do the initialization. It will also be called when reset, so pay
+ * attention when adding new initialization stuff.
+ */
+static void vtd_init(IntelIOMMUState *s)
+{
+X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+memset(s->csr, 0, DMAR_REG_SIZE);
+memset(s->wmask, 0, DMAR_REG_SIZE);
+memset(s->w1cmask, 0, DMAR_REG_SIZE);
+memset(s->womask, 0, DMAR_REG_SIZE);
+
+s->root = 0;
+s->root_scalable = false;
+s->dmar_enabled = false;
+s->intr_enabled = false;
+s->iq_head = 0;
+s->iq_tail = 0;
+s->iq = 0;
+s->iq_size = 0;
+s->qi_enabled = false;
+s->iq_last_desc_type = VTD_INV_DESC_NONE;
+s->iq_dw = false;
+s->next_frcd_reg = 0;
+
+vtd_cap_init(s);
+
+/*
+ * Rsvd field masks for spte
+ */
+vtd_spte_rsvd[0] = ~0ULL;
+vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
+  x86_iommu->dt_supported);
+vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+
+vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+
+if (s->scalable_mode || s->snoop_control) {
+vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
+}
 
 vtd_reset_caches(s);
 
-- 
2.34.1




[PATCH v7 09/17] vfio/container: Implement HostIOMMUDeviceClass::get_cap() handler

2024-06-05 Thread Zhenzhong Duan
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 2f62c13214..99beeba422 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1147,11 +1147,26 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice 
*hiod, void *opaque,
 return true;
 }
 
+static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap,
+Error **errp)
+{
+HostIOMMUDeviceCaps *caps = &hiod->caps;
+
+switch (cap) {
+case HOST_IOMMU_DEVICE_CAP_AW_BITS:
+return caps->aw_bits;
+default:
+error_setg(errp, "%s: unsupported capability %x", hiod->name, cap);
+return -EINVAL;
+}
+}
+
 static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
 {
 HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
 
 hioc->realize = hiod_legacy_vfio_realize;
+hioc->get_cap = hiod_legacy_vfio_get_cap;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1




[PATCH v7 07/17] backends/iommufd: Introduce helper function iommufd_backend_get_device_info()

2024-06-05 Thread Zhenzhong Duan
Introduce a helper function iommufd_backend_get_device_info() to get
host IOMMU related information through iommufd uAPI.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  3 +++
 backends/iommufd.c   | 22 ++
 2 files changed, 25 insertions(+)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index f6e6d6e1f9..9edfec6045 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -47,6 +47,9 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+ uint32_t *type, void *data, uint32_t len,
+ Error **errp);
 
 #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 012f18d8d8..c7e969d6f7 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -208,6 +208,28 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t 
ioas_id,
 return ret;
 }
 
+bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+ uint32_t *type, void *data, uint32_t len,
+ Error **errp)
+{
+struct iommu_hw_info info = {
+.size = sizeof(info),
+.dev_id = devid,
+.data_len = len,
+.data_uptr = (uintptr_t)data,
+};
+
+if (ioctl(be->fd, IOMMU_GET_HW_INFO, &info)) {
+error_setg_errno(errp, errno, "Failed to get hardware info");
+return false;
+}
+
+g_assert(type);
+*type = info.out_data_type;
+
+return true;
+}
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_IOMMUFD_BACKEND,
-- 
2.34.1




[PATCH v7 16/17] intel_iommu: Implement [set|unset]_iommu_device() callbacks

2024-06-05 Thread Zhenzhong Duan
From: Yi Liu 

Implement [set|unset]_iommu_device() callbacks in Intel vIOMMU.
In the set callback, we take a reference to the HostIOMMUDevice and
store it in a hash table indexed by PCI BDF.

Note that this BDF index is the device's real BDF, not the aliased one,
so it differs from the index used for VTDAddressSpace. Multiple assigned
devices can sit under the same virtual IOMMU group and share the same
VTDAddressSpace, but each of them has its own HostIOMMUDevice.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  2 +
 hw/i386/intel_iommu.c | 81 +++
 2 files changed, 83 insertions(+)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 7fa0a695c8..1eb05c29fc 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -292,6 +292,8 @@ struct IntelIOMMUState {
 /* list of registered notifiers */
 QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers;
 
+GHashTable *vtd_host_iommu_dev; /* HostIOMMUDevice */
+
 /* interrupt remapping */
 bool intr_enabled;  /* Whether guest enabled IR */
 dma_addr_t intr_root;   /* Interrupt remapping table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 519063c8f8..07e897ad7a 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -61,6 +61,12 @@ struct vtd_as_key {
 uint32_t pasid;
 };
 
+/* bus/devfn is PCI device's real BDF not the aliased one */
+struct vtd_hiod_key {
+PCIBus *bus;
+uint8_t devfn;
+};
+
 struct vtd_iotlb_key {
 uint64_t gfn;
 uint32_t pasid;
@@ -250,6 +256,25 @@ static guint vtd_as_hash(gconstpointer v)
 return (guint)(value << 8 | key->devfn);
 }
 
+/* Same implementation as vtd_as_hash() */
+static guint vtd_hiod_hash(gconstpointer v)
+{
+return vtd_as_hash(v);
+}
+
+static gboolean vtd_hiod_equal(gconstpointer v1, gconstpointer v2)
+{
+const struct vtd_hiod_key *key1 = v1;
+const struct vtd_hiod_key *key2 = v2;
+
+return (key1->bus == key2->bus) && (key1->devfn == key2->devfn);
+}
+
+static void vtd_hiod_destroy(gpointer v)
+{
+object_unref(v);
+}
+
 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
   gpointer user_data)
 {
@@ -3812,6 +3837,58 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
+ HostIOMMUDevice *hiod, Error **errp)
+{
+IntelIOMMUState *s = opaque;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+struct vtd_as_key *new_key;
+
+assert(hiod);
+
+vtd_iommu_lock(s);
+
+if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) {
+error_setg(errp, "Host IOMMU device already exist");
+vtd_iommu_unlock(s);
+return false;
+}
+
+new_key = g_malloc(sizeof(*new_key));
+new_key->bus = bus;
+new_key->devfn = devfn;
+
+object_ref(hiod);
+g_hash_table_insert(s->vtd_host_iommu_dev, new_key, hiod);
+
+vtd_iommu_unlock(s);
+
+return true;
+}
+
+static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn)
+{
+IntelIOMMUState *s = opaque;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+
+vtd_iommu_lock(s);
+
+if (!g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) {
+vtd_iommu_unlock(s);
+return;
+}
+
+g_hash_table_remove(s->vtd_host_iommu_dev, &key);
+
+vtd_iommu_unlock(s);
+}
+
 /* Unmap the whole range in the notifier's scope. */
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
 {
@@ -4116,6 +4193,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
+.set_iommu_device = vtd_dev_set_iommu_device,
+.unset_iommu_device = vtd_dev_unset_iommu_device,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
@@ -4235,6 +4314,8 @@ static void vtd_realize(DeviceState *dev, Error **errp)
  g_free, g_free);
 s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal,
   g_free, g_free);
+s->vtd_host_iommu_dev = g_hash_table_new_full(vtd_hiod_hash, vtd_hiod_equal,
+  g_free, vtd_hiod_destroy);
 vtd_init(s);
 pci_setup_iommu(bus, &vtd_iommu_ops, dev);
 /* Pseudo address space under root PCI bus. */
-- 
2.34.1




[PATCH v7 06/17] vfio/container: Implement HostIOMMUDeviceClass::realize() handler

2024-06-05 Thread Zhenzhong Duan
The realize function populates the capabilities. For now only the
aw_bits capability is computed for the legacy backend.

Introduce a helper function vfio_device_get_aw_bits() which calls
range_get_last_bit() to get the host aw_bits; the result is packaged in
HostIOMMUDeviceCaps for query with .get_cap(). This helper will
also be used by the iommufd backend.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h |  1 +
 hw/vfio/container.c   | 19 +++
 hw/vfio/helpers.c | 17 +
 3 files changed, 37 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 56d1717211..105b8b7e80 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -289,4 +289,5 @@ bool vfio_device_get_name(VFIODevice *vbasedev, Error 
**errp);
 void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
 void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
   DeviceState *dev, bool ram_discard);
+int vfio_device_get_aw_bits(VFIODevice *vdev);
 #endif /* HW_VFIO_VFIO_COMMON_H */
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index c4fca2dfca..2f62c13214 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1136,6 +1136,24 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
 };
 
+static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
+ Error **errp)
+{
+VFIODevice *vdev = opaque;
+
+hiod->name = g_strdup(vdev->name);
+hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev);
+
+return true;
+}
+
+static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
+{
+HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hioc->realize = hiod_legacy_vfio_realize;
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_VFIO_IOMMU_LEGACY,
@@ -1144,6 +1162,7 @@ static const TypeInfo types[] = {
 }, {
 .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
 .parent = TYPE_HOST_IOMMU_DEVICE,
+.class_init = hiod_legacy_vfio_class_init,
 }
 };
 
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 27ea26aa48..b14edd46ed 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -658,3 +658,20 @@ void vfio_device_init(VFIODevice *vbasedev, int type, 
VFIODeviceOps *ops,
 
 vbasedev->ram_block_discard_allowed = ram_discard;
 }
+
+int vfio_device_get_aw_bits(VFIODevice *vdev)
+{
+/*
+ * iova_ranges is a sorted list. For old kernels that support
+ * VFIO but not support query of iova ranges, iova_ranges is NULL,
+ * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned.
+ */
+GList *l = g_list_last(vdev->bcontainer->iova_ranges);
+
+if (l) {
+Range *range = l->data;
+return range_get_last_bit(range) + 1;
+}
+
+return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX;
+}
-- 
2.34.1




[PATCH v7 05/17] range: Introduce range_get_last_bit()

2024-06-05 Thread Zhenzhong Duan
This helper gets the position of the highest set bit of the range's
upper bound.

If the range is empty or the upper bound is zero, -1 is returned.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Eric Auger 
---
 include/qemu/range.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/include/qemu/range.h b/include/qemu/range.h
index 205e1da76d..4ce694a398 100644
--- a/include/qemu/range.h
+++ b/include/qemu/range.h
@@ -20,6 +20,8 @@
 #ifndef QEMU_RANGE_H
 #define QEMU_RANGE_H
 
+#include "qemu/bitops.h"
+
 /*
  * Operations on 64 bit address ranges.
  * Notes:
@@ -217,6 +219,15 @@ static inline int ranges_overlap(uint64_t first1, uint64_t 
len1,
 return !(last2 < first1 || last1 < first2);
 }
 
+/* Get highest non-zero bit position of a range */
+static inline int range_get_last_bit(Range *range)
+{
+if (range_is_empty(range)) {
+return -1;
+}
+return 63 - clz64(range->upb);
+}
+
 /*
  * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap.
  * Both @a and @b must not be empty.
-- 
2.34.1




[PATCH v7 01/17] backends: Introduce HostIOMMUDevice abstract

2024-06-05 Thread Zhenzhong Duan
A HostIOMMUDevice is an abstraction for an assigned device that is protected
by a physical IOMMU (aka host IOMMU). The userspace interaction with this
physical IOMMU can be done either through the VFIO IOMMU type 1 legacy
backend or the new iommufd backend. The assigned device can be a VFIO device
or a VDPA device. The HostIOMMUDevice is needed to interact with the host
IOMMU that protects the assigned device. It is especially useful when the
device is also protected by a virtual IOMMU, as the latter uses the translation
services of the physical IOMMU and is constrained by it. In that context the
HostIOMMUDevice can be passed to the virtual IOMMU to collect physical IOMMU
capabilities such as the supported address width. In the future, the virtual
IOMMU will use the HostIOMMUDevice to program the guest page tables in the
first translation stage of the physical IOMMU.

Introduce .realize() to initialize HostIOMMUDevice further after instance init.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 MAINTAINERS|  2 ++
 include/sysemu/host_iommu_device.h | 53 ++
 backends/host_iommu_device.c   | 33 +++
 backends/meson.build   |  1 +
 4 files changed, 89 insertions(+)
 create mode 100644 include/sysemu/host_iommu_device.h
 create mode 100644 backends/host_iommu_device.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 448dc951c5..1cf2b25beb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2196,6 +2196,8 @@ M: Zhenzhong Duan 
 S: Supported
 F: backends/iommufd.c
 F: include/sysemu/iommufd.h
+F: backends/host_iommu_device.c
+F: include/sysemu/host_iommu_device.h
 F: include/qemu/chardev_open.h
 F: util/chardev_open.c
 F: docs/devel/vfio-iommufd.rst
diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
new file mode 100644
index 00..db47a16189
--- /dev/null
+++ b/include/sysemu/host_iommu_device.h
@@ -0,0 +1,53 @@
+/*
+ * Host IOMMU device abstract declaration
+ *
+ * Copyright (C) 2024 Intel Corporation.
+ *
+ * Authors: Zhenzhong Duan 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef HOST_IOMMU_DEVICE_H
+#define HOST_IOMMU_DEVICE_H
+
+#include "qom/object.h"
+#include "qapi/error.h"
+
+#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
+OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE)
+
+struct HostIOMMUDevice {
+Object parent_obj;
+
+char *name;
+};
+
+/**
+ * struct HostIOMMUDeviceClass - The base class for all host IOMMU devices.
+ *
+ * Different types of host devices (e.g., VFIO or VDPA device) or devices
+ * with different backend (e.g., VFIO legacy container or IOMMUFD backend)
+ * will have different implementations of the HostIOMMUDeviceClass.
+ */
+struct HostIOMMUDeviceClass {
+ObjectClass parent_class;
+
+/**
+ * @realize: initialize host IOMMU device instance further.
+ *
+ * Mandatory callback.
+ *
+ * @hiod: pointer to a host IOMMU device instance.
+ *
+ * @opaque: pointer to agent device of this host IOMMU device,
+ *  e.g., VFIO base device or VDPA device.
+ *
+ * @errp: pass an Error out when realize fails.
+ *
+ * Returns: true on success, false on failure.
+ */
+bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp);
+};
+#endif
diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c
new file mode 100644
index 00..8f2dda1beb
--- /dev/null
+++ b/backends/host_iommu_device.c
@@ -0,0 +1,33 @@
+/*
+ * Host IOMMU device abstract
+ *
+ * Copyright (C) 2024 Intel Corporation.
+ *
+ * Authors: Zhenzhong Duan 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/host_iommu_device.h"
+
+OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice,
+host_iommu_device,
+HOST_IOMMU_DEVICE,
+OBJECT)
+
+static void host_iommu_device_class_init(ObjectClass *oc, void *data)
+{
+}
+
+static void host_iommu_device_init(Object *obj)
+{
+}
+
+static void host_iommu_device_finalize(Object *obj)
+{
+HostIOMMUDevice *hiod = HOST_IOMMU_DEVICE(obj);
+
+g_free(hiod->name);
+}
diff --git a/backends/meson.build b/backends/meson.build
index 8b2b111497..106312f0c8 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -16,6 +16,7 @@ if host_os != 'windows'
 endif
 if host_os == 'linux'
   system_ss.add(files('hostmem-memfd.c'))
+  system_ss.add(files('host_iommu_device.c'))
 endif
 if keyutils.found()
 system_ss.add(keyutils, files('cryptodev-lkcf.c'))
-- 
2.34.1




[PATCH v7 02/17] backends/host_iommu_device: Introduce HostIOMMUDeviceCaps

2024-06-05 Thread Zhenzhong Duan
HostIOMMUDeviceCaps's elements map to the host IOMMU's capabilities.
Different platform IOMMU can support different elements.

Currently there are only two elements, type and aw_bits: type indicates
the host platform IOMMU type, e.g., Intel VT-d, ARM SMMU, etc.; aw_bits
indicates the host IOMMU address width.

Introduce .get_cap() handler to check if HOST_IOMMU_DEVICE_CAP_XXX
is supported.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/host_iommu_device.h | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
index db47a16189..a57873958b 100644
--- a/include/sysemu/host_iommu_device.h
+++ b/include/sysemu/host_iommu_device.h
@@ -15,6 +15,18 @@
 #include "qom/object.h"
 #include "qapi/error.h"
 
+/**
+ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities.
+ *
+ * @type: host platform IOMMU type.
+ *
+ * @aw_bits: host IOMMU address width. 0xff if no limitation.
+ */
+typedef struct HostIOMMUDeviceCaps {
+uint32_t type;
+uint8_t aw_bits;
+} HostIOMMUDeviceCaps;
+
 #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
 OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE)
 
@@ -22,6 +34,7 @@ struct HostIOMMUDevice {
 Object parent_obj;
 
 char *name;
+HostIOMMUDeviceCaps caps;
 };
 
 /**
@@ -49,5 +62,30 @@ struct HostIOMMUDeviceClass {
  * Returns: true on success, false on failure.
  */
 bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp);
+/**
+ * @get_cap: check if a host IOMMU device capability is supported.
+ *
+ * Optional callback, if not implemented, hint not supporting query
+ * of @cap.
+ *
+ * @hiod: pointer to a host IOMMU device instance.
+ *
+ * @cap: capability to check.
+ *
+ * @errp: pass an Error out when fails to query capability.
+ *
+ * Returns: <0 on failure, 0 if a @cap is unsupported, or else
+ * 1 or some positive value for some special @cap,
+ * i.e., HOST_IOMMU_DEVICE_CAP_AW_BITS.
+ */
+int (*get_cap)(HostIOMMUDevice *hiod, int cap, Error **errp);
 };
+
+/*
+ * Host IOMMU device capability list.
+ */
+#define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE0
+#define HOST_IOMMU_DEVICE_CAP_AW_BITS   1
+
+#define HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX   64
 #endif
-- 
2.34.1




[PATCH v6 18/19] intel_iommu: Implement [set|unset]_iommu_device() callbacks

2024-06-03 Thread Zhenzhong Duan
From: Yi Liu 

Implement [set|unset]_iommu_device() callbacks in Intel vIOMMU.
In the set callback, a new VTDHostIOMMUDevice structure, which holds
a reference to the HostIOMMUDevice, is stored in a hash table
indexed by PCI BDF.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  9 
 include/hw/i386/intel_iommu.h  |  2 +
 hw/i386/intel_iommu.c  | 76 ++
 3 files changed, 87 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index f8cf99bddf..b800d62ca0 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -28,6 +28,7 @@
 #ifndef HW_I386_INTEL_IOMMU_INTERNAL_H
 #define HW_I386_INTEL_IOMMU_INTERNAL_H
 #include "hw/i386/intel_iommu.h"
+#include "sysemu/host_iommu_device.h"
 
 /*
  * Intel IOMMU register specification
@@ -537,4 +538,12 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SL_IGN_COM  0xbff0ULL
 #define VTD_SL_TM   (1ULL << 62)
 
+
+typedef struct VTDHostIOMMUDevice {
+IntelIOMMUState *iommu_state;
+PCIBus *bus;
+uint8_t devfn;
+HostIOMMUDevice *dev;
+QLIST_ENTRY(VTDHostIOMMUDevice) next;
+} VTDHostIOMMUDevice;
 #endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 7d694b0813..2bbde41e45 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -293,6 +293,8 @@ struct IntelIOMMUState {
 /* list of registered notifiers */
 QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers;
 
+GHashTable *vtd_host_iommu_dev; /* VTDHostIOMMUDevice */
+
 /* interrupt remapping */
 bool intr_enabled;  /* Whether guest enabled IR */
 dma_addr_t intr_root;   /* Interrupt remapping table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 519063c8f8..747c988bc4 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -237,6 +237,13 @@ static gboolean vtd_as_equal(gconstpointer v1, 
gconstpointer v2)
(key1->pasid == key2->pasid);
 }
 
+static gboolean vtd_as_idev_equal(gconstpointer v1, gconstpointer v2)
+{
+const struct vtd_as_key *key1 = v1;
+const struct vtd_as_key *key2 = v2;
+
+return (key1->bus == key2->bus) && (key1->devfn == key2->devfn);
+}
 /*
  * Note that we use pointer to PCIBus as the key, so hashing/shifting
  * based on the pointer value is intended. Note that we deal with
@@ -3812,6 +3819,70 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
+ HostIOMMUDevice *hiod, Error **errp)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+struct vtd_as_key *new_key;
+
+assert(hiod);
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+
+if (vtd_hdev) {
+error_setg(errp, "IOMMUFD device already exist");
+vtd_iommu_unlock(s);
+return false;
+}
+
+vtd_hdev = g_malloc0(sizeof(VTDHostIOMMUDevice));
+vtd_hdev->bus = bus;
+vtd_hdev->devfn = (uint8_t)devfn;
+vtd_hdev->iommu_state = s;
+vtd_hdev->dev = hiod;
+
+new_key = g_malloc(sizeof(*new_key));
+new_key->bus = bus;
+new_key->devfn = devfn;
+
+object_ref(hiod);
+g_hash_table_insert(s->vtd_host_iommu_dev, new_key, vtd_hdev);
+
+vtd_iommu_unlock(s);
+
+return true;
+}
+
+static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+if (!vtd_hdev) {
+vtd_iommu_unlock(s);
+return;
+}
+
+g_hash_table_remove(s->vtd_host_iommu_dev, &key);
+object_unref(vtd_hdev->dev);
+
+vtd_iommu_unlock(s);
+}
+
 /* Unmap the whole range in the notifier's scope. */
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
 {
@@ -4116,6 +4187,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
+.set_iommu_device = vtd_dev_set_iommu_device,
+.unset_iommu_device = vtd_dev_unset_iommu_device,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
@@ -4235,6 +4308,9 @@ static void vtd_realize(DeviceState *dev, Error **errp)
  g_free, g_free);
 s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal,

[PATCH v6 05/19] backends/host_iommu_device: Introduce HostIOMMUDeviceCaps

2024-06-03 Thread Zhenzhong Duan
HostIOMMUDeviceCaps's elements map to the host IOMMU's capabilities.
Different platform IOMMU can support different elements.

Currently there are only two elements, type and aw_bits: type indicates
the host platform IOMMU type, e.g., Intel VT-d, ARM SMMU, etc.; aw_bits
indicates the host IOMMU address width.

Introduce .get_cap() handler to check if HOST_IOMMU_DEVICE_CAP_XXX
is supported.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/host_iommu_device.h | 37 ++
 1 file changed, 37 insertions(+)

diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
index 2b58a94d62..d47d1034b1 100644
--- a/include/sysemu/host_iommu_device.h
+++ b/include/sysemu/host_iommu_device.h
@@ -15,11 +15,25 @@
 #include "qom/object.h"
 #include "qapi/error.h"
 
+/**
+ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities.
+ *
+ * @type: host platform IOMMU type.
+ *
+ * @aw_bits: host IOMMU address width. 0xff if no limitation.
+ */
+typedef struct HostIOMMUDeviceCaps {
+uint32_t type;
+uint8_t aw_bits;
+} HostIOMMUDeviceCaps;
+
 #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
 OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE)
 
 struct HostIOMMUDevice {
 Object parent_obj;
+
+HostIOMMUDeviceCaps caps;
 };
 
 /**
@@ -47,5 +61,28 @@ struct HostIOMMUDeviceClass {
  * Returns: true on success, false on failure.
  */
 bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp);
+/**
+ * @get_cap: check if a host IOMMU device capability is supported.
+ *
+ * Optional callback, if not implemented, hint not supporting query
+ * of @cap.
+ *
+ * @hiod: pointer to a host IOMMU device instance.
+ *
+ * @cap: capability to check.
+ *
+ * @errp: pass an Error out when fails to query capability.
+ *
+ * Returns: <0 on failure, 0 if a @cap is unsupported, or else
+ * 1 or some positive value for some special @cap,
+ * i.e., HOST_IOMMU_DEVICE_CAP_AW_BITS.
+ */
+int (*get_cap)(HostIOMMUDevice *hiod, int cap, Error **errp);
 };
+
+/*
+ * Host IOMMU device capability list.
+ */
+#define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE0
+#define HOST_IOMMU_DEVICE_CAP_AW_BITS   1
 #endif
-- 
2.34.1




[PATCH v6 01/19] backends: Introduce HostIOMMUDevice abstract

2024-06-03 Thread Zhenzhong Duan
Introduce HostIOMMUDevice as an abstraction of host IOMMU device.

Introduce .realize() to initialize HostIOMMUDevice further after
instance init.

Introduce a Kconfig symbol CONFIG_HOST_IOMMU_DEVICE to build it for
VFIO now, and for VDPA in the future.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 MAINTAINERS|  2 ++
 include/sysemu/host_iommu_device.h | 51 ++
 backends/host_iommu_device.c   | 30 ++
 backends/Kconfig   |  5 +++
 backends/meson.build   |  1 +
 5 files changed, 89 insertions(+)
 create mode 100644 include/sysemu/host_iommu_device.h
 create mode 100644 backends/host_iommu_device.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 448dc951c5..1cf2b25beb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2196,6 +2196,8 @@ M: Zhenzhong Duan 
 S: Supported
 F: backends/iommufd.c
 F: include/sysemu/iommufd.h
+F: backends/host_iommu_device.c
+F: include/sysemu/host_iommu_device.h
 F: include/qemu/chardev_open.h
 F: util/chardev_open.c
 F: docs/devel/vfio-iommufd.rst
diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
new file mode 100644
index 00..2b58a94d62
--- /dev/null
+++ b/include/sysemu/host_iommu_device.h
@@ -0,0 +1,51 @@
+/*
+ * Host IOMMU device abstract declaration
+ *
+ * Copyright (C) 2024 Intel Corporation.
+ *
+ * Authors: Zhenzhong Duan 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef HOST_IOMMU_DEVICE_H
+#define HOST_IOMMU_DEVICE_H
+
+#include "qom/object.h"
+#include "qapi/error.h"
+
+#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
+OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE)
+
+struct HostIOMMUDevice {
+Object parent_obj;
+};
+
+/**
+ * struct HostIOMMUDeviceClass - The base class for all host IOMMU devices.
+ *
+ * Different type of host devices (e.g., VFIO or VDPA device) or devices
+ * with different backend (e.g., VFIO legacy container or IOMMUFD backend)
+ * can have different sub-classes.
+ */
+struct HostIOMMUDeviceClass {
+ObjectClass parent_class;
+
+/**
+ * @realize: initialize host IOMMU device instance further.
+ *
+ * Mandatory callback.
+ *
+ * @hiod: pointer to a host IOMMU device instance.
+ *
+ * @opaque: pointer to agent device of this host IOMMU device,
+ *  i.e., for VFIO, pointer to VFIODevice
+ *
+ * @errp: pass an Error out when realize fails.
+ *
+ * Returns: true on success, false on failure.
+ */
+bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp);
+};
+#endif
diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c
new file mode 100644
index 00..41f2fdce20
--- /dev/null
+++ b/backends/host_iommu_device.c
@@ -0,0 +1,30 @@
+/*
+ * Host IOMMU device abstract
+ *
+ * Copyright (C) 2024 Intel Corporation.
+ *
+ * Authors: Zhenzhong Duan 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/host_iommu_device.h"
+
+OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice,
+host_iommu_device,
+HOST_IOMMU_DEVICE,
+OBJECT)
+
+static void host_iommu_device_class_init(ObjectClass *oc, void *data)
+{
+}
+
+static void host_iommu_device_init(Object *obj)
+{
+}
+
+static void host_iommu_device_finalize(Object *obj)
+{
+}
diff --git a/backends/Kconfig b/backends/Kconfig
index 2cb23f62fa..34ab29e994 100644
--- a/backends/Kconfig
+++ b/backends/Kconfig
@@ -3,3 +3,8 @@ source tpm/Kconfig
 config IOMMUFD
 bool
 depends on VFIO
+
+config HOST_IOMMU_DEVICE
+bool
+default y
+depends on VFIO
diff --git a/backends/meson.build b/backends/meson.build
index 8b2b111497..2e975d641e 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -25,6 +25,7 @@ if have_vhost_user
 endif
 system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: 
files('cryptodev-vhost.c'))
 system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
+system_ss.add(when: 'CONFIG_HOST_IOMMU_DEVICE', if_true: 
files('host_iommu_device.c'))
 if have_vhost_user_crypto
   system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: 
files('cryptodev-vhost-user.c'))
 endif
-- 
2.34.1




[PATCH v6 07/19] vfio/container: Implement HostIOMMUDeviceClass::realize() handler

2024-06-03 Thread Zhenzhong Duan
Utilize range_get_last_bit() to get host IOMMU address width and
package it in HostIOMMUDeviceCaps for query with .get_cap().

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index c4fca2dfca..48800fe92f 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1136,6 +1136,31 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
 };
 
+static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
+ Error **errp)
+{
+VFIODevice *vdev = opaque;
+/* iova_ranges is a sorted list */
+GList *l = g_list_last(vdev->bcontainer->iova_ranges);
+
+/* Only support query HOST_IOMMU_DEVICE_CAP_AW_BITS with legacy backend */
+if (l) {
+Range *range = l->data;
+hiod->caps.aw_bits = range_get_last_bit(range) + 1;
+} else {
+hiod->caps.aw_bits = 0xff;
+}
+
+return true;
+}
+
+static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
+{
+HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hioc->realize = hiod_legacy_vfio_realize;
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_VFIO_IOMMU_LEGACY,
@@ -1144,6 +1169,7 @@ static const TypeInfo types[] = {
 }, {
 .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
 .parent = TYPE_HOST_IOMMU_DEVICE,
+.class_init = hiod_legacy_vfio_class_init,
 }
 };
 
-- 
2.34.1




[PATCH v6 19/19] intel_iommu: Check compatibility with host IOMMU capabilities

2024-06-03 Thread Zhenzhong Duan
If check fails, host device (either VFIO or VDPA device) is not
compatible with current vIOMMU config and should not be passed to
guest.

Only aw_bits is checked for now; we don't care about other capabilities
until scalable modern mode is introduced.

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 747c988bc4..d8202a77dd 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3819,6 +3819,30 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static bool vtd_check_hdev(IntelIOMMUState *s, HostIOMMUDevice *hiod,
+   Error **errp)
+{
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
+int ret;
+
+if (!hiodc->get_cap) {
+error_setg(errp, ".get_cap() not implemented");
+return false;
+}
+
+/* Common checks */
+ret = hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_AW_BITS, errp);
+if (ret < 0) {
+return false;
+}
+if (s->aw_bits > ret) {
+error_setg(errp, "aw-bits %d > host aw-bits %d", s->aw_bits, ret);
+return false;
+}
+
+return true;
+}
+
 static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
  HostIOMMUDevice *hiod, Error **errp)
 {
@@ -3842,6 +3866,11 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 return false;
 }
 
+if (!vtd_check_hdev(s, hiod, errp)) {
+vtd_iommu_unlock(s);
+return false;
+}
+
 vtd_hdev = g_malloc0(sizeof(VTDHostIOMMUDevice));
 vtd_hdev->bus = bus;
 vtd_hdev->devfn = (uint8_t)devfn;
-- 
2.34.1




[PATCH v6 17/19] intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap

2024-06-03 Thread Zhenzhong Duan
Extract cap/ecap initialization in vtd_cap_init() to make code
cleaner.

No functional change intended.

Reviewed-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 93 ---
 1 file changed, 51 insertions(+), 42 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index cc8e59674e..519063c8f8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3934,30 +3934,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion 
*iommu_mr, IOMMUNotifier *n)
 return;
 }
 
-/* Do the initialization. It will also be called when reset, so pay
- * attention when adding new initialization stuff.
- */
-static void vtd_init(IntelIOMMUState *s)
+static void vtd_cap_init(IntelIOMMUState *s)
 {
 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 
-memset(s->csr, 0, DMAR_REG_SIZE);
-memset(s->wmask, 0, DMAR_REG_SIZE);
-memset(s->w1cmask, 0, DMAR_REG_SIZE);
-memset(s->womask, 0, DMAR_REG_SIZE);
-
-s->root = 0;
-s->root_scalable = false;
-s->dmar_enabled = false;
-s->intr_enabled = false;
-s->iq_head = 0;
-s->iq_tail = 0;
-s->iq = 0;
-s->iq_size = 0;
-s->qi_enabled = false;
-s->iq_last_desc_type = VTD_INV_DESC_NONE;
-s->iq_dw = false;
-s->next_frcd_reg = 0;
 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
  VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
  VTD_CAP_MGAW(s->aw_bits);
@@ -3974,27 +3954,6 @@ static void vtd_init(IntelIOMMUState *s)
 }
 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
-/*
- * Rsvd field masks for spte
- */
-vtd_spte_rsvd[0] = ~0ULL;
-vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
-  x86_iommu->dt_supported);
-vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
-
-vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-
-if (s->scalable_mode || s->snoop_control) {
-vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
-}
-
 if (x86_iommu_ir_supported(x86_iommu)) {
 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
 if (s->intr_eim == ON_OFF_AUTO_ON) {
@@ -4027,6 +3986,56 @@ static void vtd_init(IntelIOMMUState *s)
 if (s->pasid) {
 s->ecap |= VTD_ECAP_PASID;
 }
+}
+
+/*
+ * Do the initialization. It will also be called when reset, so pay
+ * attention when adding new initialization stuff.
+ */
+static void vtd_init(IntelIOMMUState *s)
+{
+X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+memset(s->csr, 0, DMAR_REG_SIZE);
+memset(s->wmask, 0, DMAR_REG_SIZE);
+memset(s->w1cmask, 0, DMAR_REG_SIZE);
+memset(s->womask, 0, DMAR_REG_SIZE);
+
+s->root = 0;
+s->root_scalable = false;
+s->dmar_enabled = false;
+s->intr_enabled = false;
+s->iq_head = 0;
+s->iq_tail = 0;
+s->iq = 0;
+s->iq_size = 0;
+s->qi_enabled = false;
+s->iq_last_desc_type = VTD_INV_DESC_NONE;
+s->iq_dw = false;
+s->next_frcd_reg = 0;
+
+vtd_cap_init(s);
+
+/*
+ * Rsvd field masks for spte
+ */
+vtd_spte_rsvd[0] = ~0ULL;
+vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
+  x86_iommu->dt_supported);
+vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+
+vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+
+if (s->scalable_mode || s->snoop_control) {
+vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
+}
 
 vtd_reset_caches(s);
 
-- 
2.34.1




[PATCH v6 13/19] vfio: Create host IOMMU device instance

2024-06-03 Thread Zhenzhong Duan
Create host IOMMU device instance in vfio_attach_device() and call
.realize() to initialize it further.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h |  1 +
 hw/vfio/common.c  | 16 +++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 56d1717211..c0851e83bb 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -127,6 +127,7 @@ typedef struct VFIODevice {
 OnOffAuto pre_copy_dirty_page_tracking;
 bool dirty_pages_supported;
 bool dirty_tracking;
+HostIOMMUDevice *hiod;
 int devid;
 IOMMUFDBackend *iommufd;
 } VFIODevice;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index f9619a1dfb..f20a7b5bba 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1528,6 +1528,7 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev,
 {
 const VFIOIOMMUClass *ops =
 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
+HostIOMMUDevice *hiod;
 
 if (vbasedev->iommufd) {
 ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
@@ -1535,7 +1536,19 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev,
 
 assert(ops);
 
-return ops->attach_device(name, vbasedev, as, errp);
+if (!ops->attach_device(name, vbasedev, as, errp)) {
+return false;
+}
+
+hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
+if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) {
+object_unref(hiod);
+ops->detach_device(vbasedev);
+return false;
+}
+vbasedev->hiod = hiod;
+
+return true;
 }
 
 void vfio_detach_device(VFIODevice *vbasedev)
@@ -1543,5 +1556,6 @@ void vfio_detach_device(VFIODevice *vbasedev)
 if (!vbasedev->bcontainer) {
 return;
 }
+object_unref(vbasedev->hiod);
 vbasedev->bcontainer->ops->detach_device(vbasedev);
 }
-- 
2.34.1




[PATCH v6 14/19] hw/pci: Introduce helper function pci_device_get_iommu_bus_devfn()

2024-06-03 Thread Zhenzhong Duan
Extract out pci_device_get_iommu_bus_devfn() from
pci_device_iommu_address_space() to facilitate
implementation of pci_device_[set|unset]_iommu_device()
in following patch.

No functional change intended.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
 hw/pci/pci.c | 48 +---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 324c1302d2..02a4bb2af6 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2648,11 +2648,27 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+/*
+ * Get IOMMU root bus, aliased bus and devfn of a PCI device
+ *
+ * IOMMU root bus is needed by all call sites to call into iommu_ops.
+ * For call sites which don't need aliased BDF, passing NULL to
+ * aliased_[bus|devfn] is allowed.
+ *
+ * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device.
+ *
+ * @aliased_bus: return aliased #PCIBus of the PCI device, optional.
+ *
+ * @aliased_devfn: return aliased devfn of the PCI device, optional.
+ */
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+   PCIBus **piommu_bus,
+   PCIBus **aliased_bus,
+   int *aliased_devfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
-uint8_t devfn = dev->devfn;
+int devfn = dev->devfn;
 
 while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
 PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
@@ -2693,7 +2709,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+assert(iommu_bus);
+
+if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) {
+iommu_bus = NULL;
+}
+
+*piommu_bus = iommu_bus;
+
+if (aliased_bus) {
+*aliased_bus = bus;
+}
+
+if (aliased_devfn) {
+*aliased_devfn = devfn;
+}
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+PCIBus *iommu_bus;
+int devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
+if (iommu_bus) {
 return iommu_bus->iommu_ops->get_address_space(bus,
  iommu_bus->iommu_opaque, devfn);
 }
-- 
2.34.1




[PATCH v6 10/19] vfio/container: Implement HostIOMMUDeviceClass::get_cap() handler

2024-06-03 Thread Zhenzhong Duan
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 48800fe92f..a46c275a88 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1154,11 +1154,26 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice 
*hiod, void *opaque,
 return true;
 }
 
+static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap,
+Error **errp)
+{
+HostIOMMUDeviceCaps *caps = &hiod->caps;
+
+switch (cap) {
+case HOST_IOMMU_DEVICE_CAP_AW_BITS:
+return caps->aw_bits;
+default:
+error_setg(errp, "Not support get cap %x", cap);
+return -EINVAL;
+}
+}
+
 static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
 {
 HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
 
 hioc->realize = hiod_legacy_vfio_realize;
+hioc->get_cap = hiod_legacy_vfio_get_cap;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1




[PATCH v6 03/19] backends/iommufd: Introduce abstract TYPE_HOST_IOMMU_DEVICE_IOMMUFD device

2024-06-03 Thread Zhenzhong Duan
TYPE_HOST_IOMMU_DEVICE_IOMMUFD represents a host IOMMU device under
iommufd backend.

It will have its own .get_cap() implementation.

Opportunistically, add missed header to include/sysemu/iommufd.h.

Suggested-by: Cédric Le Goater 
Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h | 16 
 backends/iommufd.c   | 35 ++-
 2 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 293bfbe967..f6e6d6e1f9 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -1,9 +1,23 @@
+/*
+ * iommufd container backend declaration
+ *
+ * Copyright (C) 2024 Intel Corporation.
+ * Copyright Red Hat, Inc. 2024
+ *
+ * Authors: Yi Liu 
+ *  Eric Auger 
+ *  Zhenzhong Duan 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
 #ifndef SYSEMU_IOMMUFD_H
 #define SYSEMU_IOMMUFD_H
 
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
 OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
@@ -33,4 +47,6 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+
+#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index c506afbdac..012f18d8d8 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -208,23 +208,24 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, 
uint32_t ioas_id,
 return ret;
 }
 
-static const TypeInfo iommufd_backend_info = {
-.name = TYPE_IOMMUFD_BACKEND,
-.parent = TYPE_OBJECT,
-.instance_size = sizeof(IOMMUFDBackend),
-.instance_init = iommufd_backend_init,
-.instance_finalize = iommufd_backend_finalize,
-.class_size = sizeof(IOMMUFDBackendClass),
-.class_init = iommufd_backend_class_init,
-.interfaces = (InterfaceInfo[]) {
-{ TYPE_USER_CREATABLE },
-{ }
+static const TypeInfo types[] = {
+{
+.name = TYPE_IOMMUFD_BACKEND,
+.parent = TYPE_OBJECT,
+.instance_size = sizeof(IOMMUFDBackend),
+.instance_init = iommufd_backend_init,
+.instance_finalize = iommufd_backend_finalize,
+.class_size = sizeof(IOMMUFDBackendClass),
+.class_init = iommufd_backend_class_init,
+.interfaces = (InterfaceInfo[]) {
+{ TYPE_USER_CREATABLE },
+{ }
+}
+}, {
+.name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+.abstract = true,
 }
 };
 
-static void register_types(void)
-{
-type_register_static(&iommufd_backend_info);
-}
-
-type_init(register_types);
+DEFINE_TYPES(types)
-- 
2.34.1




[PATCH v6 04/19] vfio/iommufd: Introduce TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO device

2024-06-03 Thread Zhenzhong Duan
TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO represents a host IOMMU device under
VFIO iommufd backend. It will be created during VFIO device attaching
and passed to vIOMMU.

It will have its own .realize() implementation.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 3 +++
 hw/vfio/iommufd.c | 5 -
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 75b167979a..56d1717211 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -32,6 +32,7 @@
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
 #include "sysemu/host_iommu_device.h"
+#include "sysemu/iommufd.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -173,6 +174,8 @@ typedef struct VFIOGroup {
 } VFIOGroup;
 
 #define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE 
"-legacy-vfio"
+#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \
+TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio"
 
 typedef struct VFIODMABuf {
 QemuDmaBuf *buf;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 554f9a6292..e4a507d55c 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -624,7 +624,10 @@ static const TypeInfo types[] = {
 .name = TYPE_VFIO_IOMMU_IOMMUFD,
 .parent = TYPE_VFIO_IOMMU,
 .class_init = vfio_iommu_iommufd_class_init,
-},
+}, {
+.name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO,
+.parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
+}
 };
 
 DEFINE_TYPES(types)
-- 
2.34.1




[PATCH v6 16/19] vfio/pci: Pass HostIOMMUDevice to vIOMMU

2024-06-03 Thread Zhenzhong Duan
With HostIOMMUDevice passed, vIOMMU can check compatibility with host
IOMMU, call into IOMMUFD specific methods, etc.

Originally-by: Yi Liu 
Signed-off-by: Nicolin Chen 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 74a79bdf61..d8a76c1ee0 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3121,10 +3121,15 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 vfio_bars_register(vdev);
 
-if (!vfio_add_capabilities(vdev, errp)) {
+if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
+error_prepend(errp, "Failed to set iommu_device: ");
 goto out_teardown;
 }
 
+if (!vfio_add_capabilities(vdev, errp)) {
+goto out_unset_idev;
+}
+
 if (vdev->vga) {
 vfio_vga_quirk_setup(vdev);
 }
@@ -3141,7 +3146,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 error_setg(errp,
"cannot support IGD OpRegion feature on hotplugged "
"device");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_get_dev_region_info(vbasedev,
@@ -3150,11 +3155,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 if (ret) {
 error_setg_errno(errp, -ret,
  "does not support requested IGD OpRegion 
feature");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 if (!vfio_pci_igd_opregion_init(vdev, opregion, errp)) {
-goto out_teardown;
+goto out_unset_idev;
 }
 }
 
@@ -3238,6 +3243,8 @@ out_deregister:
 if (vdev->intx.mmap_timer) {
 timer_free(vdev->intx.mmap_timer);
 }
+out_unset_idev:
+pci_device_unset_iommu_device(pdev);
 out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
@@ -3266,6 +3273,7 @@ static void vfio_instance_finalize(Object *obj)
 static void vfio_exitfn(PCIDevice *pdev)
 {
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIODevice *vbasedev = &vdev->vbasedev;
 
 vfio_unregister_req_notifier(vdev);
 vfio_unregister_err_notifier(vdev);
@@ -3280,7 +3288,8 @@ static void vfio_exitfn(PCIDevice *pdev)
 vfio_teardown_msi(vdev);
 vfio_pci_disable_rp_atomics(vdev);
 vfio_bars_exit(vdev);
-vfio_migration_exit(&vdev->vbasedev);
+vfio_migration_exit(vbasedev);
+pci_device_unset_iommu_device(pdev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)
-- 
2.34.1




[PATCH v6 15/19] hw/pci: Introduce pci_device_[set|unset]_iommu_device()

2024-06-03 Thread Zhenzhong Duan
From: Yi Liu 

pci_device_[set|unset]_iommu_device() call pci_device_get_iommu_bus_devfn()
to get iommu_bus->iommu_ops and call [set|unset]_iommu_device callback to
set/unset HostIOMMUDevice for a given PCI device.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/pci/pci.h | 38 +-
 hw/pci/pci.c | 27 +++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index eaa3fc99d8..c84cc9b99a 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -3,6 +3,7 @@
 
 #include "exec/memory.h"
 #include "sysemu/dma.h"
+#include "sysemu/host_iommu_device.h"
 
 /* PCI includes legacy ISA access.  */
 #include "hw/isa/isa.h"
@@ -383,10 +384,45 @@ typedef struct PCIIOMMUOps {
  *
  * @devfn: device and function number
  */
-   AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+/**
+ * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU
+ *
+ * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
+ * retrieve host information from the associated HostIOMMUDevice.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ *
+ * @dev: the data structure representing host IOMMU device.
+ *
+ * @errp: pass an Error out only when return false
+ *
+ * Returns: true if HostIOMMUDevice is attached or else false with errp 
set.
+ */
+bool (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn,
+ HostIOMMUDevice *dev, Error **errp);
+/**
+ * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU
+ *
+ * Optional callback.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ */
+void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
 } PCIIOMMUOps;
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
+ Error **errp);
+void pci_device_unset_iommu_device(PCIDevice *dev);
 
 /**
  * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 02a4bb2af6..c8a8aab306 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2742,6 +2742,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 return &address_space_memory;
 }
 
+bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
+ Error **errp)
+{
+PCIBus *iommu_bus;
+
+/* set_iommu_device requires device's direct BDF instead of aliased BDF */
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL);
+if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) {
+return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev),
+  iommu_bus->iommu_opaque,
+  dev->devfn, hiod, errp);
+}
+return true;
+}
+
+void pci_device_unset_iommu_device(PCIDevice *dev)
+{
+PCIBus *iommu_bus;
+
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL);
+if (iommu_bus && iommu_bus->iommu_ops->unset_iommu_device) {
+return iommu_bus->iommu_ops->unset_iommu_device(pci_get_bus(dev),
+
iommu_bus->iommu_opaque,
+dev->devfn);
+}
+}
+
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
 {
 /*
-- 
2.34.1




[PATCH v6 08/19] backends/iommufd: Introduce helper function iommufd_backend_get_device_info()

2024-06-03 Thread Zhenzhong Duan
Introduce a helper function iommufd_backend_get_device_info() to get
host IOMMU related information through iommufd uAPI.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  3 +++
 backends/iommufd.c   | 22 ++
 2 files changed, 25 insertions(+)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index f6e6d6e1f9..9edfec6045 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -47,6 +47,9 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+ uint32_t *type, void *data, uint32_t len,
+ Error **errp);
 
 #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 012f18d8d8..c7e969d6f7 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -208,6 +208,28 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t 
ioas_id,
 return ret;
 }
 
+bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+ uint32_t *type, void *data, uint32_t len,
+ Error **errp)
+{
+struct iommu_hw_info info = {
+.size = sizeof(info),
+.dev_id = devid,
+.data_len = len,
+.data_uptr = (uintptr_t)data,
+};
+
+if (ioctl(be->fd, IOMMU_GET_HW_INFO, &info)) {
+error_setg_errno(errp, errno, "Failed to get hardware info");
+return false;
+}
+
+g_assert(type);
+*type = info.out_data_type;
+
+return true;
+}
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_IOMMUFD_BACKEND,
-- 
2.34.1




[PATCH v6 06/19] range: Introduce range_get_last_bit()

2024-06-03 Thread Zhenzhong Duan
This helper gets the position of the highest set bit of the upper bound.

If the range is empty or upper bound is zero, -1 is returned.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/qemu/range.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/include/qemu/range.h b/include/qemu/range.h
index 205e1da76d..4ce694a398 100644
--- a/include/qemu/range.h
+++ b/include/qemu/range.h
@@ -20,6 +20,8 @@
 #ifndef QEMU_RANGE_H
 #define QEMU_RANGE_H
 
+#include "qemu/bitops.h"
+
 /*
  * Operations on 64 bit address ranges.
  * Notes:
@@ -217,6 +219,15 @@ static inline int ranges_overlap(uint64_t first1, uint64_t 
len1,
 return !(last2 < first1 || last1 < first2);
 }
 
+/* Get highest non-zero bit position of a range */
+static inline int range_get_last_bit(Range *range)
+{
+if (range_is_empty(range)) {
+return -1;
+}
+return 63 - clz64(range->upb);
+}
+
 /*
  * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap.
  * Both @a and @b must not be empty.
-- 
2.34.1




[PATCH v6 11/19] backends/iommufd: Implement HostIOMMUDeviceClass::get_cap() handler

2024-06-03 Thread Zhenzhong Duan
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 backends/iommufd.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/backends/iommufd.c b/backends/iommufd.c
index c7e969d6f7..f2f7a762a0 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -230,6 +230,28 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, 
uint32_t devid,
 return true;
 }
 
+static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp)
+{
+HostIOMMUDeviceCaps *caps = &hiod->caps;
+
+switch (cap) {
+case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE:
+return caps->type;
+case HOST_IOMMU_DEVICE_CAP_AW_BITS:
+return caps->aw_bits;
+default:
+error_setg(errp, "Not support get cap %x", cap);
+return -EINVAL;
+}
+}
+
+static void hiod_iommufd_class_init(ObjectClass *oc, void *data)
+{
+HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hioc->get_cap = hiod_iommufd_get_cap;
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_IOMMUFD_BACKEND,
@@ -246,6 +268,7 @@ static const TypeInfo types[] = {
 }, {
 .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
 .parent = TYPE_HOST_IOMMU_DEVICE,
+.class_init = hiod_iommufd_class_init,
 .abstract = true,
 }
 };
-- 
2.34.1




[PATCH v6 09/19] vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler

2024-06-03 Thread Zhenzhong Duan
It calls iommufd_backend_get_device_info() to get host IOMMU
related information and translate it into HostIOMMUDeviceCaps
for query with .get_cap().

Introduce macro VTD_MGAW_FROM_CAP to get MGAW, which equals
(aw_bits - 1).

Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/vfio/iommufd.c | 37 +++
 2 files changed, 38 insertions(+)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 7fa0a695c8..7d694b0813 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -47,6 +47,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(IntelIOMMUState, 
INTEL_IOMMU_DEVICE)
 #define VTD_HOST_AW_48BIT   48
 #define VTD_HOST_ADDRESS_WIDTH  VTD_HOST_AW_39BIT
 #define VTD_HAW_MASK(aw)((1ULL << (aw)) - 1)
+#define VTD_MGAW_FROM_CAP(cap)  ((cap >> 16) & 0x3fULL)
 
 #define DMAR_REPORT_F_INTR  (1)
 
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index e4a507d55c..9d2e95e20e 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -25,6 +25,7 @@
 #include "qemu/cutils.h"
 #include "qemu/chardev_open.h"
 #include "pci.h"
+#include "hw/i386/intel_iommu_internal.h"
 
 static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly)
@@ -619,6 +620,41 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset;
 };
 
+static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
+  Error **errp)
+{
+VFIODevice *vdev = opaque;
+HostIOMMUDeviceCaps *caps = &hiod->caps;
+enum iommu_hw_info_type type;
+union {
+struct iommu_hw_info_vtd vtd;
+} data;
+
+if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid,
+ &type, &data, sizeof(data), errp)) {
+return false;
+}
+
+caps->type = type;
+
+switch (type) {
+case IOMMU_HW_INFO_TYPE_INTEL_VTD:
+caps->aw_bits = VTD_MGAW_FROM_CAP(data.vtd.cap_reg) + 1;
+break;
+case IOMMU_HW_INFO_TYPE_NONE:
+break;
+}
+
+return true;
+}
+
+static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data)
+{
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hiodc->realize = hiod_iommufd_vfio_realize;
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_VFIO_IOMMU_IOMMUFD,
@@ -627,6 +663,7 @@ static const TypeInfo types[] = {
 }, {
 .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO,
 .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
+.class_init = hiod_iommufd_vfio_class_init,
 }
 };
 
-- 
2.34.1




[PATCH v6 00/19] Add a host IOMMU device abstraction to check with vIOMMU

2024-06-03 Thread Zhenzhong Duan
 to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B

Yi Liu (2):
  hw/pci: Introduce pci_device_[set|unset]_iommu_device()
  intel_iommu: Implement [set|unset]_iommu_device() callbacks

Zhenzhong Duan (17):
  backends: Introduce HostIOMMUDevice abstract
  vfio/container: Introduce TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO device
  backends/iommufd: Introduce abstract TYPE_HOST_IOMMU_DEVICE_IOMMUFD
device
  vfio/iommufd: Introduce TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO device
  backends/host_iommu_device: Introduce HostIOMMUDeviceCaps
  range: Introduce range_get_last_bit()
  vfio/container: Implement HostIOMMUDeviceClass::realize() handler
  backends/iommufd: Introduce helper function
iommufd_backend_get_device_info()
  vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler
  vfio/container: Implement HostIOMMUDeviceClass::get_cap() handler
  backends/iommufd: Implement HostIOMMUDeviceClass::get_cap() handler
  vfio: Introduce VFIOIOMMUClass::hiod_typename attribute
  vfio: Create host IOMMU device instance
  hw/pci: Introduce helper function pci_device_get_iommu_bus_devfn()
  vfio/pci: Pass HostIOMMUDevice to vIOMMU
  intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap
  intel_iommu: Check compatibility with host IOMMU capabilities

 MAINTAINERS   |   2 +
 hw/i386/intel_iommu_internal.h|   9 ++
 include/hw/i386/intel_iommu.h |   3 +
 include/hw/pci/pci.h  |  38 -
 include/hw/vfio/vfio-common.h |   7 +
 include/hw/vfio/vfio-container-base.h |   3 +
 include/qemu/range.h  |  11 ++
 include/sysemu/host_iommu_device.h|  88 
 include/sysemu/iommufd.h  |  19 +++
 backends/host_iommu_device.c  |  30 
 backends/iommufd.c|  76 --
 hw/i386/intel_iommu.c | 198 --
 hw/pci/pci.c  |  75 +-
 hw/vfio/common.c  |  16 ++-
 hw/vfio/container.c   |  48 ++-
 hw/vfio/iommufd.c |  44 +-
 hw/vfio/pci.c |  19 ++-
 backends/Kconfig  |   5 +
 backends/meson.build  |   1 +
 19 files changed, 623 insertions(+), 69 deletions(-)
 create mode 100644 include/sysemu/host_iommu_device.h
 create mode 100644 backends/host_iommu_device.c

-- 
2.34.1




[PATCH v6 12/19] vfio: Introduce VFIOIOMMUClass::hiod_typename attribute

2024-06-03 Thread Zhenzhong Duan
Initialize attribute VFIOIOMMUClass::hiod_typename based on
VFIO backend type.

This attribute will facilitate HostIOMMUDevice creation in
vfio_attach_device().

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-container-base.h | 3 +++
 hw/vfio/container.c   | 2 ++
 hw/vfio/iommufd.c | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 2776481fc9..442c0dfc4c 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -109,6 +109,9 @@ DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, 
TYPE_VFIO_IOMMU)
 struct VFIOIOMMUClass {
 InterfaceClass parent_class;
 
+/* Properties */
+const char *hiod_typename;
+
 /* basic feature */
 bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
 int (*dma_map)(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index a46c275a88..a830426647 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1126,6 +1126,8 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
 
+vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO;
+
 vioc->setup = vfio_legacy_setup;
 vioc->dma_map = vfio_legacy_dma_map;
 vioc->dma_unmap = vfio_legacy_dma_unmap;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 9d2e95e20e..8fd8d52bc2 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -613,6 +613,8 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
 
+vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO;
+
 vioc->dma_map = iommufd_cdev_map;
 vioc->dma_unmap = iommufd_cdev_unmap;
 vioc->attach_device = iommufd_cdev_attach;
-- 
2.34.1




[PATCH v6 02/19] vfio/container: Introduce TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO device

2024-06-03 Thread Zhenzhong Duan
TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO represents a host IOMMU device under
VFIO legacy container backend.

It will have its own realize implementation.

Suggested-by: Eric Auger 
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 3 +++
 hw/vfio/container.c   | 5 -
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 4cb1ab8645..75b167979a 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@
 #endif
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/host_iommu_device.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -171,6 +172,8 @@ typedef struct VFIOGroup {
 bool ram_block_discard_allowed;
 } VFIOGroup;
 
+#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE 
"-legacy-vfio"
+
 typedef struct VFIODMABuf {
 QemuDmaBuf *buf;
 uint32_t pos_x, pos_y, pos_updates;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 096cc97258..c4fca2dfca 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1141,7 +1141,10 @@ static const TypeInfo types[] = {
 .name = TYPE_VFIO_IOMMU_LEGACY,
 .parent = TYPE_VFIO_IOMMU,
 .class_init = vfio_iommu_legacy_class_init,
-},
+}, {
+.name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+}
 };
 
 DEFINE_TYPES(types)
-- 
2.34.1




[PATCH rfcv2 13/17] intel_iommu: add support for PASID-based device IOTLB invalidation

2024-05-22 Thread Zhenzhong Duan
From: Clément Mathieu--Drif 

Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h | 11 
 hw/i386/intel_iommu.c  | 50 ++
 2 files changed, 61 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 8a375d038a..5831aa4d82 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -378,6 +378,7 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_WAIT   0x5 /* Invalidation Wait Descriptor */
 #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */
 #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */
+#define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/
 #define VTD_INV_DESC_NONE   0   /* Not an Invalidate Descriptor */
 
 /* Masks for Invalidation Wait Descriptor*/
@@ -421,6 +422,16 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI 0xffeULL
 #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffe0fff8
 
+/* Mask for PASID Device IOTLB Invalidate Descriptor */
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_ADDR(val) ((val) & \
+   0xf000ULL)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_SIZE(val) ((val >> 11) & 0x1)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_GLOBAL(val) ((val) & 0x1)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_SID(val) (((val) >> 16) & 0xULL)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_PASID(val) ((val >> 32) & 0xfULL)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_HI 0x7feULL
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_LO 0xfff0f000ULL
+
 /* Rsvd field masks for spte */
 #define VTD_SPTE_SNP 0x800ULL
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7ae8df2f49..de4e8afcf9 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2996,6 +2996,49 @@ static void do_invalidate_device_tlb(VTDAddressSpace 
*vtd_dev_as,
 memory_region_notify_iommu(_dev_as->iommu, 0, event);
 }
 
+static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s,
+   VTDInvDesc *inv_desc)
+{
+uint16_t sid;
+VTDAddressSpace *vtd_dev_as;
+bool size;
+bool global;
+hwaddr addr;
+uint32_t pasid;
+
+if ((inv_desc->hi & VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_HI) ||
+ (inv_desc->lo & VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_LO)) {
+error_report_once("%s: invalid pasid-based dev iotlb inv desc:"
+  "hi=%"PRIx64 "(reserved nonzero)",
+  __func__, inv_desc->hi);
+return false;
+}
+
+global = VTD_INV_DESC_PASID_DEVICE_IOTLB_GLOBAL(inv_desc->hi);
+size = VTD_INV_DESC_PASID_DEVICE_IOTLB_SIZE(inv_desc->hi);
+addr = VTD_INV_DESC_PASID_DEVICE_IOTLB_ADDR(inv_desc->hi);
+sid = VTD_INV_DESC_PASID_DEVICE_IOTLB_SID(inv_desc->lo);
+if (global) {
+QLIST_FOREACH(vtd_dev_as, >vtd_as_with_notifiers, next) {
+if ((vtd_dev_as->pasid != PCI_NO_PASID) &&
+(PCI_BUILD_BDF(pci_bus_num(vtd_dev_as->bus),
+   vtd_dev_as->devfn) == sid)) {
+do_invalidate_device_tlb(vtd_dev_as, size, addr);
+}
+}
+} else {
+pasid = VTD_INV_DESC_PASID_DEVICE_IOTLB_PASID(inv_desc->lo);
+vtd_dev_as = vtd_get_as_by_sid_and_pasid(s, sid, pasid);
+if (!vtd_dev_as) {
+return true;
+}
+
+do_invalidate_device_tlb(vtd_dev_as, size, addr);
+}
+
+return true;
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
   VTDInvDesc *inv_desc)
 {
@@ -3090,6 +3133,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 }
 break;
 
+case VTD_INV_DESC_DEV_PIOTLB:
+trace_vtd_inv_desc("device-piotlb", inv_desc.hi, inv_desc.lo);
+if (!vtd_process_device_piotlb_desc(s, _desc)) {
+return false;
+}
+break;
+
 case VTD_INV_DESC_DEVICE:
 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
 if (!vtd_process_device_iotlb_desc(s, _desc)) {
-- 
2.34.1




[PATCH rfcv2 10/17] intel_iommu: Process PASID-based iotlb invalidation

2024-05-22 Thread Zhenzhong Duan
PASID-based iotlb (piotlb) is used when walking the Intel
VT-d stage-1 page table.

This emulates the stage-1 page table iotlb invalidation requested
by a PASID-based IOTLB Invalidate Descriptor (P_IOTLB).

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  3 +++
 hw/i386/intel_iommu.c  | 45 ++
 2 files changed, 48 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index c0a94af820..8a375d038a 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -453,6 +453,9 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_PIOTLB_PASID(val)(((val) >> 32) & 0xfULL)
 #define VTD_INV_DESC_PIOTLB_DID(val)  (((val) >> 16) & \
  VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PIOTLB_ADDR(val) ((val) & ~0xfffULL)
+#define VTD_INV_DESC_PIOTLB_AM(val)   ((val) & 0x3fULL)
+#define VTD_INV_DESC_PIOTLB_IH(val)   (((val) >> 6) & 0x1)
 
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 0078bad9d4..f6c429ae4c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -304,6 +304,28 @@ static gboolean vtd_hash_remove_by_page(gpointer key, 
gpointer value,
 return (entry->gfn & info->mask) == gfn || entry->gfn == gfn_tlb;
 }
 
+static gboolean vtd_hash_remove_by_page_piotlb(gpointer key, gpointer value,
+   gpointer user_data)
+{
+VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
+VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
+uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
+uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
+
+/*
+ * According to spec, PASID-based-IOTLB Invalidation in page granularity
+ * doesn't invalidate IOTLB entries caching second-stage (PGTT=010b)
+ * or pass-through (PGTT=100b) mappings. Nested isn't supported yet,
+ * so only need to check first-stage (PGTT=001b) mappings.
+ */
+if (entry->pgtt != VTD_SM_PASID_ENTRY_FLT) {
+return false;
+}
+
+return entry->domain_id == info->domain_id && entry->pasid == info->pasid 
&&
+   ((entry->gfn & info->mask) == gfn || entry->gfn == gfn_tlb);
+}
+
 /* Reset all the gen of VTDAddressSpace to zero and set the gen of
  * IntelIOMMUState to 1.  Must be called with IOMMU lock held.
  */
@@ -2866,11 +2888,30 @@ static void vtd_piotlb_pasid_invalidate(IntelIOMMUState 
*s,
 }
 }
 
+static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
+   uint32_t pasid, hwaddr addr, uint8_t am,
+   bool ih)
+{
+VTDIOTLBPageInvInfo info;
+
+info.domain_id = domain_id;
+info.pasid = pasid;
+info.addr = addr;
+info.mask = ~((1 << am) - 1);
+
+vtd_iommu_lock(s);
+g_hash_table_foreach_remove(s->iotlb,
+vtd_hash_remove_by_page_piotlb, );
+vtd_iommu_unlock(s);
+}
+
 static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
 VTDInvDesc *inv_desc)
 {
 uint16_t domain_id;
 uint32_t pasid;
+uint8_t am;
+hwaddr addr;
 
 if ((inv_desc->val[0] & VTD_INV_DESC_PIOTLB_RSVD_VAL0) ||
 (inv_desc->val[1] & VTD_INV_DESC_PIOTLB_RSVD_VAL1)) {
@@ -2887,6 +2928,10 @@ static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
 break;
 
 case VTD_INV_DESC_PIOTLB_PSI_IN_PASID:
+am = VTD_INV_DESC_PIOTLB_AM(inv_desc->val[1]);
+addr = (hwaddr) VTD_INV_DESC_PIOTLB_ADDR(inv_desc->val[1]);
+vtd_piotlb_page_invalidate(s, domain_id, pasid, addr, am,
+   VTD_INV_DESC_PIOTLB_IH(inv_desc->val[1]));
 break;
 
 default:
-- 
2.34.1




[PATCH rfcv2 11/17] intel_iommu: Extract device IOTLB invalidation logic

2024-05-22 Thread Zhenzhong Duan
From: Clément Mathieu--Drif 

This piece of code can be shared by both IOTLB invalidation and
PASID-based IOTLB invalidation.

No functional changes intended.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 57 +--
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index f6c429ae4c..3c14fd85cc 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2955,13 +2955,43 @@ static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
 return true;
 }
 
+static void do_invalidate_device_tlb(VTDAddressSpace *vtd_dev_as,
+ bool size, hwaddr addr)
+{
+/*
+ * According to ATS spec table 2.4:
+ * S = 0, bits 15:12 =  range size: 4K
+ * S = 1, bits 15:12 = xxx0 range size: 8K
+ * S = 1, bits 15:12 = xx01 range size: 16K
+ * S = 1, bits 15:12 = x011 range size: 32K
+ * S = 1, bits 15:12 = 0111 range size: 64K
+ * ...
+ */
+
+IOMMUTLBEvent event;
+uint64_t sz;
+
+if (size) {
+sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
+addr &= ~(sz - 1);
+} else {
+sz = VTD_PAGE_SIZE;
+}
+
+event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP;
+event.entry.target_as = _dev_as->as;
+event.entry.addr_mask = sz - 1;
+event.entry.iova = addr;
+event.entry.perm = IOMMU_NONE;
+event.entry.translated_addr = 0;
+memory_region_notify_iommu(_dev_as->iommu, 0, event);
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
   VTDInvDesc *inv_desc)
 {
 VTDAddressSpace *vtd_dev_as;
-IOMMUTLBEvent event;
 hwaddr addr;
-uint64_t sz;
 uint16_t sid;
 bool size;
 
@@ -2986,28 +3016,7 @@ static bool 
vtd_process_device_iotlb_desc(IntelIOMMUState *s,
 goto done;
 }
 
-/* According to ATS spec table 2.4:
- * S = 0, bits 15:12 =  range size: 4K
- * S = 1, bits 15:12 = xxx0 range size: 8K
- * S = 1, bits 15:12 = xx01 range size: 16K
- * S = 1, bits 15:12 = x011 range size: 32K
- * S = 1, bits 15:12 = 0111 range size: 64K
- * ...
- */
-if (size) {
-sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
-addr &= ~(sz - 1);
-} else {
-sz = VTD_PAGE_SIZE;
-}
-
-event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP;
-event.entry.target_as = _dev_as->as;
-event.entry.addr_mask = sz - 1;
-event.entry.iova = addr;
-event.entry.perm = IOMMU_NONE;
-event.entry.translated_addr = 0;
-memory_region_notify_iommu(_dev_as->iommu, 0, event);
+do_invalidate_device_tlb(vtd_dev_as, size, addr);
 
 done:
 return true;
-- 
2.34.1




[PATCH rfcv2 05/17] intel_iommu: Rename slpte to pte

2024-05-22 Thread Zhenzhong Duan
From: Yi Liu 

Because we will support both FST(a.k.a, FLT) and SST(a.k.a, SLT) translation,
rename slpte to pte to make it generic.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  3 ++-
 include/hw/i386/intel_iommu.h  |  2 +-
 hw/i386/intel_iommu.c  | 39 +-
 3 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index b0d9b1f986..0e240d6d54 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -553,10 +553,11 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SL_RW_MASK  3ULL
 #define VTD_SL_R1ULL
 #define VTD_SL_W(1ULL << 1)
-#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
 #define VTD_SL_IGN_COM  0xbff0ULL
 #define VTD_SL_TM   (1ULL << 62)
 
+/* Common for both First Level and Second Level */
+#define VTD_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
 
 typedef struct VTDHostIOMMUDevice {
 IntelIOMMUState *iommu_state;
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 9ba9c45015..011f374883 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -153,7 +153,7 @@ struct VTDIOTLBEntry {
 uint64_t gfn;
 uint16_t domain_id;
 uint32_t pasid;
-uint64_t slpte;
+uint64_t pte;
 uint64_t mask;
 uint8_t access_flags;
 };
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index ed95b5ba2e..544e8f0e40 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -360,7 +360,7 @@ out:
 
 /* Must be with IOMMU lock held */
 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
- uint16_t domain_id, hwaddr addr, uint64_t slpte,
+ uint16_t domain_id, hwaddr addr, uint64_t pte,
  uint8_t access_flags, uint32_t level,
  uint32_t pasid)
 {
@@ -368,7 +368,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t 
source_id,
 struct vtd_iotlb_key *key = g_malloc(sizeof(*key));
 uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
 
-trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
+trace_vtd_iotlb_page_update(source_id, addr, pte, domain_id);
 if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
 trace_vtd_iotlb_reset("iotlb exceeds size limit");
 vtd_reset_iotlb_locked(s);
@@ -376,7 +376,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t 
source_id,
 
 entry->gfn = gfn;
 entry->domain_id = domain_id;
-entry->slpte = slpte;
+entry->pte = pte;
 entry->access_flags = access_flags;
 entry->mask = vtd_slpt_level_page_mask(level);
 entry->pasid = pasid;
@@ -693,9 +693,9 @@ static inline dma_addr_t 
vtd_ce_get_slpt_base(VTDContextEntry *ce)
 return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
 }
 
-static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
+static inline uint64_t vtd_get_pte_addr(uint64_t slpte, uint8_t aw)
 {
-return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
+return slpte & VTD_PT_BASE_ADDR_MASK(aw);
 }
 
 /* Whether the pte indicates the address of the page frame */
@@ -1152,11 +1152,11 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, 
VTDContextEntry *ce,
 *slpte_level = level;
 break;
 }
-addr = vtd_get_slpte_addr(slpte, aw_bits);
+addr = vtd_get_pte_addr(slpte, aw_bits);
 level--;
 }
 
-xlat = vtd_get_slpte_addr(*slptep, aw_bits);
+xlat = vtd_get_pte_addr(*slptep, aw_bits);
 size = ~vtd_slpt_level_page_mask(level) + 1;
 
 /*
@@ -1343,7 +1343,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t 
start,
  * This is a valid PDE (or even bigger than PDE).  We need
  * to walk one further level.
  */
-ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
+ret = vtd_page_walk_level(vtd_get_pte_addr(slpte, info->aw),
   iova, MIN(iova_next, end), level - 1,
   read_cur, write_cur, info);
 } else {
@@ -1360,7 +1360,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t 
start,
 event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
 event.entry.addr_mask = ~subpage_mask;
 /* NOTE: this is only meaningful if entry_valid == true */
-event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
+event.entry.translated_addr = vtd_get_pte_addr(slpte, info->aw);
 event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP :
  

[PATCH rfcv2 15/17] intel_iommu: Set default aw_bits to 48 in scalable modern mode

2024-05-22 Thread Zhenzhong Duan
According to VTD spec, stage-1 page table could support 4-level and
5-level paging.

However, emulation of 5-level paging translation is not supported yet.
That means the only supported value for aw_bits is 48.

So default aw_bits to 48 in scalable modern mode. In other cases,
it still defaults to 39 for compatibility.

Add a check to ensure user specified value is 48 in modern mode
for now.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index e07daaba99..a4c241ea96 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3748,7 +3748,7 @@ static Property vtd_properties[] = {
 ON_OFF_AUTO_AUTO),
 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
-  VTD_HOST_ADDRESS_WIDTH),
+  0xff),
 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
 DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
 DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
@@ -4663,6 +4663,14 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 }
 }
 
+if (s->aw_bits == 0xff) {
+if (s->scalable_modern) {
+s->aw_bits = VTD_HOST_AW_48BIT;
+} else {
+s->aw_bits = VTD_HOST_AW_39BIT;
+}
+}
+
 if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
 (s->aw_bits != VTD_HOST_AW_48BIT) &&
 !s->scalable_modern) {
@@ -4671,6 +4679,12 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 return false;
 }
 
+if ((s->aw_bits != VTD_HOST_AW_48BIT) && s->scalable_modern) {
+error_setg(errp, "Supported values for aw-bits are: %d",
+   VTD_HOST_AW_48BIT);
+return false;
+}
+
 if (s->scalable_mode && !s->dma_drain) {
 error_setg(errp, "Need to set dma_drain for scalable mode");
 return false;
-- 
2.34.1




[PATCH rfcv2 16/17] intel_iommu: Modify x-scalable-mode to be string option

2024-05-22 Thread Zhenzhong Duan
From: Yi Liu 

Intel VT-d 3.0 introduces scalable mode, and it has a bunch of capabilities
related to scalable mode translation, thus there are multiple combinations.
This vIOMMU implementation simplifies that for the user by providing
typical combinations. The user can configure it with the "x-scalable-mode"
option. The usage is as below:

"-device intel-iommu,x-scalable-mode=["legacy"|"modern"|"off"]"

 - "legacy": gives support for stage-2 page table
 - "modern": gives support for stage-1 page table
 - "off": no scalable mode support
 -  if not configured, there is no scalable mode support; if improperly
configured, an error is thrown

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/i386/intel_iommu.c | 24 +++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index b0d5b5a5be..dd032b1081 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -264,6 +264,7 @@ struct IntelIOMMUState {
 
 bool caching_mode;  /* RO - is cap CM enabled? */
 bool scalable_mode; /* RO - is Scalable Mode supported? */
+char *scalable_mode_str;/* RO - admin's Scalable Mode config */
 bool scalable_modern;   /* RO - is modern SM supported? */
 bool snoop_control; /* RO - is SNP filed supported? */
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a4c241ea96..1bd91fcf4c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3750,7 +3750,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
   0xff),
 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
-DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
+DEFINE_PROP_STRING("x-scalable-mode", IntelIOMMUState, scalable_mode_str),
 DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
 DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false),
 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
@@ -4663,6 +4663,28 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 }
 }
 
+if (s->scalable_mode_str &&
+(strcmp(s->scalable_mode_str, "off") &&
+ strcmp(s->scalable_mode_str, "modern") &&
+ strcmp(s->scalable_mode_str, "legacy"))) {
+error_setg(errp, "Invalid x-scalable-mode config,"
+ "Please use \"modern\", \"legacy\" or \"off\"");
+return false;
+}
+
+if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "legacy")) {
+s->scalable_mode = true;
+s->scalable_modern = false;
+} else if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "modern")) {
+s->scalable_mode = true;
+s->scalable_modern = true;
+} else {
+s->scalable_mode = false;
+s->scalable_modern = false;
+}
+
 if (s->aw_bits == 0xff) {
 if (s->scalable_modern) {
 s->aw_bits = VTD_HOST_AW_48BIT;
-- 
2.34.1




[PATCH rfcv2 01/17] intel_iommu: Update version to 3.0 and add the latest fault reasons

2024-05-22 Thread Zhenzhong Duan
From: Yu Zhang 

The scalable mode was introduced in VTD spec 3.0, now that
the scalable mode is already supported, bump version to 3.0.

In spec 3.0 some more detailed fault reasons are defined
for scalable mode. So introduce them into emulation code,
see spec section 7.1.2 for details.

Guest kernel should use the version for informational purpose
not feature check, cap/ecap bits should be checked instead.
So this change will not impact migration.

Signed-off-by: Yu Zhang 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  9 -
 hw/i386/intel_iommu.c  | 27 +--
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index b800d62ca0..955bc24787 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -312,7 +312,14 @@ typedef enum VTDFaultReason {
   * request while disabled */
 VTD_FR_IR_SID_ERR = 0x26,   /* Invalid Source-ID */
 
-VTD_FR_PASID_TABLE_INV = 0x58,  /*Invalid PASID table entry */
+/* PASID directory entry access failure */
+VTD_FR_PASID_DIR_ACCESS_ERR = 0x50,
+/* The Present(P) field of pasid directory entry is 0 */
+VTD_FR_PASID_DIR_ENTRY_P = 0x51,
+VTD_FR_PASID_TABLE_ACCESS_ERR = 0x58, /* PASID table entry access failure 
*/
+/* The Present(P) field of pasid table entry is 0 */
+VTD_FR_PASID_ENTRY_P = 0x59,
+VTD_FR_PASID_TABLE_ENTRY_INV = 0x5b,  /*Invalid PASID table entry */
 
 /* Output address in the interrupt address range for scalable mode */
 VTD_FR_SM_INTERRUPT_ADDR = 0x87,
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 07bfd4f99e..d85aaf4bb8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -779,7 +779,7 @@ static int vtd_get_pdire_from_pdir_table(dma_addr_t 
pasid_dir_base,
 addr = pasid_dir_base + index * entry_size;
 if (dma_memory_read(_space_memory, addr,
 pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_DIR_ACCESS_ERR;
 }
 
 pdire->val = le64_to_cpu(pdire->val);
@@ -797,6 +797,7 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
   dma_addr_t addr,
   VTDPASIDEntry *pe)
 {
+uint8_t pgtt;
 uint32_t index;
 dma_addr_t entry_size;
 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
@@ -806,7 +807,7 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
 addr = addr + index * entry_size;
 if (dma_memory_read(_space_memory, addr,
 pe, entry_size, MEMTXATTRS_UNSPECIFIED)) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_TABLE_ACCESS_ERR;
 }
 for (size_t i = 0; i < ARRAY_SIZE(pe->val); i++) {
 pe->val[i] = le64_to_cpu(pe->val[i]);
@@ -814,11 +815,13 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
 
 /* Do translation type check */
 if (!vtd_pe_type_check(x86_iommu, pe)) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_TABLE_ENTRY_INV;
 }
 
-if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
-return -VTD_FR_PASID_TABLE_INV;
+pgtt = VTD_PE_GET_TYPE(pe);
+if (pgtt == VTD_SM_PASID_ENTRY_SLT &&
+!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
+return -VTD_FR_PASID_TABLE_ENTRY_INV;
 }
 
 return 0;
@@ -859,7 +862,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
 }
 
 if (!vtd_pdire_present()) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_DIR_ENTRY_P;
 }
 
 ret = vtd_get_pe_from_pdire(s, pasid, , pe);
@@ -868,7 +871,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
 }
 
 if (!vtd_pe_present(pe)) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_ENTRY_P;
 }
 
 return 0;
@@ -921,7 +924,7 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
 }
 
 if (!vtd_pdire_present()) {
-return -VTD_FR_PASID_TABLE_INV;
+return -VTD_FR_PASID_DIR_ENTRY_P;
 }
 
 /*
@@ -1778,7 +1781,11 @@ static const bool vtd_qualified_faults[] = {
 [VTD_FR_ROOT_ENTRY_RSVD] = false,
 [VTD_FR_PAGING_ENTRY_RSVD] = true,
 [VTD_FR_CONTEXT_ENTRY_TT] = true,
-[VTD_FR_PASID_TABLE_INV] = false,
+[VTD_FR_PASID_DIR_ACCESS_ERR] = false,
+[VTD_FR_PASID_DIR_ENTRY_P] = true,
+[VTD_FR_PASID_TABLE_ACCESS_ERR] = false,
+[VTD_FR_PASID_ENTRY_P] = true,
+[VTD_FR_PASID_TABLE_ENTRY_INV] = true,
 [VTD_FR_SM_INTERRUPT_ADDR] = true,
 [VTD_FR_MAX] = false,
 };
@@ -4138,7 +4145,7 @@ static void vtd_init(IntelIOMMUState *s)
 vtd_reset_caches(s);
 
 /* Define registers with default values and bit semantics */
-vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 

[PATCH rfcv2 04/17] intel_iommu: Flush stage-2 cache in PASID-selective PASID-based iotlb invalidation

2024-05-22 Thread Zhenzhong Duan
Per spec 6.5.2.4, PASID-selective PASID-based iotlb invalidation will
flush stage-2 iotlb entries with matching domain id and pasid.

With scalable modern mode introduced, the guest could send a PASID-selective
PASID-based iotlb invalidation to flush both stage-1 and stage-2 entries.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h | 10 +
 hw/i386/intel_iommu.c  | 78 ++
 2 files changed, 88 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 75aea80942..b0d9b1f986 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -441,6 +441,16 @@ typedef union VTDInvDesc VTDInvDesc;
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
 
+#define VTD_INV_DESC_PIOTLB_ALL_IN_PASID  (2ULL << 4)
+#define VTD_INV_DESC_PIOTLB_PSI_IN_PASID  (3ULL << 4)
+
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff0ffc0ULL
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL
+
+#define VTD_INV_DESC_PIOTLB_PASID(val)(((val) >> 32) & 0xfULL)
+#define VTD_INV_DESC_PIOTLB_DID(val)  (((val) >> 16) & \
+ VTD_DOMAIN_ID_MASK)
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 6d1d94ada3..ed95b5ba2e 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2642,6 +2642,80 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, 
VTDInvDesc *inv_desc)
 return true;
 }
 
+static gboolean vtd_hash_remove_by_pasid(gpointer key, gpointer value,
+ gpointer user_data)
+{
+VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
+VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
+
+return ((entry->domain_id == info->domain_id) &&
+(entry->pasid == info->pasid));
+}
+
+static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
+uint16_t domain_id, uint32_t pasid)
+{
+VTDIOTLBPageInvInfo info;
+VTDAddressSpace *vtd_as;
+VTDContextEntry ce;
+
+info.domain_id = domain_id;
+info.pasid = pasid;
+
+vtd_iommu_lock(s);
+g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_pasid,
+);
+vtd_iommu_unlock(s);
+
+QLIST_FOREACH(vtd_as, >vtd_as_with_notifiers, next) {
+if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+  vtd_as->devfn, ) &&
+domain_id == vtd_get_domain_id(s, , vtd_as->pasid)) {
+uint32_t rid2pasid = VTD_CE_GET_RID2PASID();
+
+if ((vtd_as->pasid != PCI_NO_PASID || pasid != rid2pasid) &&
+vtd_as->pasid != pasid) {
+continue;
+}
+
+if (!s->scalable_modern) {
+vtd_address_space_sync(vtd_as);
+}
+}
+}
+}
+
+static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
+VTDInvDesc *inv_desc)
+{
+uint16_t domain_id;
+uint32_t pasid;
+
+if ((inv_desc->val[0] & VTD_INV_DESC_PIOTLB_RSVD_VAL0) ||
+(inv_desc->val[1] & VTD_INV_DESC_PIOTLB_RSVD_VAL1)) {
+error_report_once("non-zero-field-in-piotlb_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+domain_id = VTD_INV_DESC_PIOTLB_DID(inv_desc->val[0]);
+pasid = VTD_INV_DESC_PIOTLB_PASID(inv_desc->val[0]);
+switch (inv_desc->val[0] & VTD_INV_DESC_IOTLB_G) {
+case VTD_INV_DESC_PIOTLB_ALL_IN_PASID:
+vtd_piotlb_pasid_invalidate(s, domain_id, pasid);
+break;
+
+case VTD_INV_DESC_PIOTLB_PSI_IN_PASID:
+break;
+
+default:
+error_report_once("Invalid granularity in P-IOTLB desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+return true;
+}
+
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
  VTDInvDesc *inv_desc)
 {
@@ -2752,6 +2826,10 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 break;
 
 case VTD_INV_DESC_PIOTLB:
+trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]);
+if (!vtd_process_piotlb_desc(s, _desc)) {
+return false;
+}
 break;
 
 case VTD_INV_DESC_WAIT:
-- 
2.34.1




[PATCH rfcv2 00/17] intel_iommu: Enable stage-1 translation for emulated device

2024-05-22 Thread Zhenzhong Duan
Hi,

Per Jason Wang's suggestion, iommufd nesting series[1] is split into
"Enable stage-1 translation for emulated device" series and
"Enable stage-1 translation for passthrough device" series.

This series enables stage-1 translation support for emulated device
in intel iommu which we called "modern" mode.

PATCH1-5:  Some preparatory work before supporting stage-1 translation
PATCH6-8:  Implement stage-1 translation for emulated device
PATCH9-14: Emulate iotlb invalidation of stage-1 mapping
PATCH15:   Set default aw_bits to 48 in scalable modern mode
PATCH16:   Introduce "modern" mode to distinguish with legacy mode
PATCH17:   Add qtest

Qemu code can be found at [2]

[1] https://lists.gnu.org/archive/html/qemu-devel/2024-01/msg02740.html
[2] https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_stage1_emu_rfcv2

Thanks
Zhenzhong

Changelog:
v2:
- split from nesting series (Jason)
- merged some commits from Clement
- add qtest (jason)

Clément Mathieu--Drif (5):
  intel_iommu: check if the input address is canonical
  intel_iommu: set accessed and dirty bits during first stage
translation
  intel_iommu: Extract device IOTLB invalidation logic
  intel_iommu: add an internal API to find an address space with PASID
  intel_iommu: add support for PASID-based device IOTLB invalidation

Yi Liu (3):
  intel_iommu: Rename slpte to pte
  intel_iommu: Implement stage-1 translation
  intel_iommu: Modify x-scalable-mode to be string option

Yu Zhang (1):
  intel_iommu: Update version to 3.0 and add the latest fault reasons

Zhenzhong Duan (8):
  intel_iommu: Make pasid entry type check accurate
  intel_iommu: Add a placeholder variable for scalable modern mode
  intel_iommu: Flush stage-2 cache in PASID-selective PASID-based iotlb
invalidation
  intel_iommu: Flush stage-1 cache in iotlb invalidation
  intel_iommu: Process PASID-based iotlb invalidation
  intel_iommu: piotlb invalidation should notify unmap
  intel_iommu: Set default aw_bits to 48 in scalable modern mode
  tests/qtest: Add intel-iommu test

 MAINTAINERS|   1 +
 hw/i386/intel_iommu_internal.h |  60 +++-
 include/hw/i386/intel_iommu.h  |   5 +-
 hw/i386/intel_iommu.c  | 639 -
 tests/qtest/intel-iommu-test.c |  63 
 tests/qtest/meson.build|   1 +
 6 files changed, 676 insertions(+), 93 deletions(-)
 create mode 100644 tests/qtest/intel-iommu-test.c

-- 
2.34.1




[PATCH rfcv2 17/17] tests/qtest: Add intel-iommu test

2024-05-22 Thread Zhenzhong Duan
Add the framework to test the intel-iommu device.

Currently it only tests the correctness of the cap/ecap bits in
scalable modern mode, and the consistency of the cap/ecap bits
before and after system reset.

Signed-off-by: Zhenzhong Duan 
---
 MAINTAINERS|  1 +
 tests/qtest/intel-iommu-test.c | 63 ++
 tests/qtest/meson.build|  1 +
 3 files changed, 65 insertions(+)
 create mode 100644 tests/qtest/intel-iommu-test.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 5dab60bd04..f1ef6128c8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3656,6 +3656,7 @@ S: Supported
 F: hw/i386/intel_iommu.c
 F: hw/i386/intel_iommu_internal.h
 F: include/hw/i386/intel_iommu.h
+F: tests/qtest/intel-iommu-test.c
 
 AMD-Vi Emulation
 S: Orphan
diff --git a/tests/qtest/intel-iommu-test.c b/tests/qtest/intel-iommu-test.c
new file mode 100644
index 00..e1273bce14
--- /dev/null
+++ b/tests/qtest/intel-iommu-test.c
@@ -0,0 +1,63 @@
+/*
+ * QTest testcase for intel-iommu
+ *
+ * Copyright (c) 2024 Intel, Inc.
+ *
+ * Author: Zhenzhong Duan 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "libqtest-single.h"
+#include "hw/i386/intel_iommu_internal.h"
+
+#define vtd_reg_readl(offset)(readq(Q35_HOST_BRIDGE_IOMMU_ADDR + offset))
+#define CAP_MODERN_FIXED1(VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | \
+  VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS)
+#define ECAP_MODERN_FIXED1   (VTD_ECAP_QI |  VTD_ECAP_IRO | VTD_ECAP_MHMV | \
+  VTD_ECAP_SMTS | VTD_ECAP_FLTS)
+
+static void test_intel_iommu_modern(void)
+{
+uint8_t init_csr[DMAR_REG_SIZE]; /* register values */
+uint8_t post_reset_csr[DMAR_REG_SIZE]; /* register values */
+uint64_t cap, ecap, tmp;
+
+qtest_start("-M q35 -device intel-iommu,x-scalable-mode=modern");
+
+g_assert(vtd_reg_readl(DMAR_VER_REG) == 0x30);
+
+cap = vtd_reg_readl(DMAR_CAP_REG);
+g_assert((cap & CAP_MODERN_FIXED1) == CAP_MODERN_FIXED1);
+
+tmp = cap & VTD_CAP_SAGAW_MASK;
+g_assert(tmp == (VTD_CAP_SAGAW_39bit | VTD_CAP_SAGAW_48bit));
+
+tmp = VTD_MGAW_FROM_CAP(cap);
+g_assert(tmp == VTD_HOST_AW_48BIT - 1);
+
+ecap = vtd_reg_readl(DMAR_ECAP_REG);
+g_assert((ecap & ECAP_MODERN_FIXED1) == ECAP_MODERN_FIXED1);
+g_assert(ecap & VTD_ECAP_IR);
+
+memread(Q35_HOST_BRIDGE_IOMMU_ADDR, init_csr, DMAR_REG_SIZE);
+
+qobject_unref(qmp("{ 'execute': 'system_reset' }"));
+qmp_eventwait("RESET");
+
+memread(Q35_HOST_BRIDGE_IOMMU_ADDR, post_reset_csr, DMAR_REG_SIZE);
+/* Ensure registers are consistent after hard reset */
+g_assert(!memcmp(init_csr, post_reset_csr, DMAR_REG_SIZE));
+
+qtest_end();
+}
+
+int main(int argc, char **argv)
+{
+g_test_init(, , NULL);
+qtest_add_func("/q35/intel-iommu/modern", test_intel_iommu_modern);
+
+return g_test_run();
+}
diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index 6f2f594ace..09106739d2 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -79,6 +79,7 @@ qtests_i386 = \
   (config_all_devices.has_key('CONFIG_SB16') ? ['fuzz-sb16-test'] : []) +  
 \
   (config_all_devices.has_key('CONFIG_SDHCI_PCI') ? ['fuzz-sdcard-test'] : []) 
+\
   (config_all_devices.has_key('CONFIG_ESP_PCI') ? ['am53c974-test'] : []) +
 \
+  (config_all_devices.has_key('CONFIG_VTD') ? ['intel-iommu-test'] : []) + 
\
   (host_os != 'windows' and
\
config_all_devices.has_key('CONFIG_ACPI_ERST') ? ['erst-test'] : []) +  
 \
   (config_all_devices.has_key('CONFIG_PCIE_PORT') and  
 \
-- 
2.34.1




[PATCH rfcv2 08/17] intel_iommu: set accessed and dirty bits during first stage translation

2024-05-22 Thread Zhenzhong Duan
From: Clément Mathieu--Drif 

Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  3 +++
 hw/i386/intel_iommu.c  | 25 +
 2 files changed, 28 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index b6820dbca3..c0a94af820 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -327,6 +327,7 @@ typedef enum VTDFaultReason {
 
 /* Output address in the interrupt address range for scalable mode */
 VTD_FR_SM_INTERRUPT_ADDR = 0x87,
+VTD_FR_FS_BIT_UPDATE_FAILED = 0x91, /* SFS.10 */
 VTD_FR_MAX, /* Guard */
 } VTDFaultReason;
 
@@ -547,6 +548,8 @@ typedef struct VTDRootEntry VTDRootEntry;
 /* First Level Paging Structure */
 #define VTD_FL_PT_LEVEL 1
 #define VTD_FL_PT_ENTRY_NR  512
+#define VTD_FL_PTE_A0x20
+#define VTD_FL_PTE_D0x40
 
 /* Masks for First Level Paging Entry */
 #define VTD_FL_RW_MASK  (1ULL << 1)
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1ea030bfbe..0801112e2e 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1802,6 +1802,7 @@ static const bool vtd_qualified_faults[] = {
 [VTD_FR_PASID_TABLE_ENTRY_INV] = true,
 [VTD_FR_SM_INTERRUPT_ADDR] = true,
 [VTD_FR_FS_NON_CANONICAL] = true,
+[VTD_FR_FS_BIT_UPDATE_FAILED] = true,
 [VTD_FR_MAX] = false,
 };
 
@@ -1927,6 +1928,20 @@ static bool vtd_iova_fl_check_canonical(IntelIOMMUState 
*s, uint64_t iova,
 );
 }
 
+static MemTxResult vtd_set_flag_in_pte(dma_addr_t base_addr, uint32_t index,
+   uint64_t pte, uint64_t flag)
+{
+if (pte & flag) {
+return MEMTX_OK;
+}
+pte |= flag;
+pte = cpu_to_le64(pte);
+return dma_memory_write(_space_memory,
+base_addr + index * sizeof(pte),
+, sizeof(pte),
+MEMTXATTRS_UNSPECIFIED);
+}
+
 /*
  * Given the @iova, get relevant @flptep. @flpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
@@ -1972,7 +1987,17 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, 
VTDContextEntry *ce,
 return -VTD_FR_WRITE;
 }
 
+if (vtd_set_flag_in_pte(addr, offset, flpte, VTD_FL_PTE_A)
+!= MEMTX_OK) {
+return -VTD_FR_FS_BIT_UPDATE_FAILED;
+}
+
 if (vtd_is_last_flpte(flpte, level)) {
+if (is_write &&
+(vtd_set_flag_in_pte(addr, offset, flpte, VTD_FL_PTE_D) !=
+MEMTX_OK)) 
{
+return -VTD_FR_FS_BIT_UPDATE_FAILED;
+}
 *flptep = flpte;
 *flpte_level = level;
 return 0;
-- 
2.34.1




[PATCH rfcv2 14/17] intel_iommu: piotlb invalidation should notify unmap

2024-05-22 Thread Zhenzhong Duan
This is used by some emulated devices which cache address
translation results. When a piotlb invalidation is issued in the
guest, those caches should be refreshed.

Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 35 ++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index de4e8afcf9..e07daaba99 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2890,7 +2890,7 @@ static void vtd_piotlb_pasid_invalidate(IntelIOMMUState 
*s,
 continue;
 }
 
-if (!s->scalable_modern) {
+if (!s->scalable_modern || !vtd_as_has_map_notifier(vtd_as)) {
 vtd_address_space_sync(vtd_as);
 }
 }
@@ -2902,6 +2902,9 @@ static void vtd_piotlb_page_invalidate(IntelIOMMUState 
*s, uint16_t domain_id,
bool ih)
 {
 VTDIOTLBPageInvInfo info;
+VTDAddressSpace *vtd_as;
+VTDContextEntry ce;
+hwaddr size = (1 << am) * VTD_PAGE_SIZE;
 
 info.domain_id = domain_id;
 info.pasid = pasid;
@@ -2912,6 +2915,36 @@ static void vtd_piotlb_page_invalidate(IntelIOMMUState 
*s, uint16_t domain_id,
 g_hash_table_foreach_remove(s->iotlb,
 vtd_hash_remove_by_page_piotlb, );
 vtd_iommu_unlock(s);
+
+QLIST_FOREACH(vtd_as, >vtd_as_with_notifiers, next) {
+if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+  vtd_as->devfn, ) &&
+domain_id == vtd_get_domain_id(s, , vtd_as->pasid)) {
+uint32_t rid2pasid = VTD_CE_GET_RID2PASID();
+IOMMUTLBEvent event;
+
+if ((vtd_as->pasid != PCI_NO_PASID || pasid != rid2pasid) &&
+vtd_as->pasid != pasid) {
+continue;
+}
+
+/*
+ * Page-Selective-within-PASID PASID-based-IOTLB Invalidation
+ * does not flush stage-2 entries. See spec section 6.5.2.4
+ */
+if (!s->scalable_modern) {
+continue;
+}
+
+event.type = IOMMU_NOTIFIER_UNMAP;
+event.entry.target_as = _space_memory;
+event.entry.iova = addr;
+event.entry.perm = IOMMU_NONE;
+event.entry.addr_mask = size - 1;
+event.entry.translated_addr = 0;
+memory_region_notify_iommu(_as->iommu, 0, event);
+}
+}
 }
 
 static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
-- 
2.34.1




[PATCH rfcv2 06/17] intel_iommu: Implement stage-1 translation

2024-05-22 Thread Zhenzhong Duan
From: Yi Liu 

This adds stage-1 page table walking to support stage-1 only
translation in scalable modern mode.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  17 +
 hw/i386/intel_iommu.c  | 128 +++--
 2 files changed, 141 insertions(+), 4 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 0e240d6d54..abfdbd5f65 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -534,6 +534,23 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SM_PASID_ENTRY_AW  7ULL /* Adjusted guest-address-width */
 #define VTD_SM_PASID_ENTRY_DID(val)((val) & VTD_DOMAIN_ID_MASK)
 
+#define VTD_SM_PASID_ENTRY_FLPM  3ULL
+#define VTD_SM_PASID_ENTRY_FLPTPTR   (~0xfffULL)
+
+/* Paging Structure common */
+#define VTD_FL_PT_PAGE_SIZE_MASK(1ULL << 7)
+/* Bits to decide the offset for each level */
+#define VTD_FL_LEVEL_BITS   9
+
+/* First Level Paging Structure */
+#define VTD_FL_PT_LEVEL 1
+#define VTD_FL_PT_ENTRY_NR  512
+
+/* Masks for First Level Paging Entry */
+#define VTD_FL_RW_MASK  (1ULL << 1)
+#define VTD_FL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
+#define VTD_PASID_ENTRY_FPD (1ULL << 1) /* Fault Processing Disable */
+
 /* Second Level Page Translation Pointer*/
 #define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL)
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 544e8f0e40..cf29809bc1 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -50,6 +50,8 @@
 /* pe operations */
 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
 #define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & 
VTD_SM_PASID_ENTRY_AW))
+#define VTD_PE_GET_FLPT_LEVEL(pe) \
+(4 + (((pe)->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM))
 
 /*
  * PCI bus number (or SID) is not reliable since the device is usaully
@@ -823,6 +825,11 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
 return -VTD_FR_PASID_TABLE_ENTRY_INV;
 }
 
+if (pgtt == VTD_SM_PASID_ENTRY_FLT &&
+VTD_PE_GET_FLPT_LEVEL(pe) != 4) {
+return -VTD_FR_PASID_TABLE_ENTRY_INV;
+}
+
 return 0;
 }
 
@@ -958,7 +965,11 @@ static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
 
 if (s->root_scalable) {
 vtd_ce_get_rid2pasid_entry(s, ce, , pasid);
-return VTD_PE_GET_LEVEL();
+if (s->scalable_modern) {
+return VTD_PE_GET_FLPT_LEVEL();
+} else {
+return VTD_PE_GET_LEVEL();
+}
 }
 
 return vtd_ce_get_level(ce);
@@ -1045,7 +1056,11 @@ static dma_addr_t 
vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
 
 if (s->root_scalable) {
 vtd_ce_get_rid2pasid_entry(s, ce, , pasid);
-return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
+if (s->scalable_modern) {
+return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+} else {
+return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
+}
 }
 
 return vtd_ce_get_slpt_base(ce);
@@ -1847,6 +1862,106 @@ out:
 trace_vtd_pt_enable_fast_path(source_id, success);
 }
 
+/* The shift of an addr for a certain level of paging structure */
+static inline uint32_t vtd_flpt_level_shift(uint32_t level)
+{
+assert(level != 0);
+return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_FL_LEVEL_BITS;
+}
+
+/*
+ * Given an iova and the level of paging structure, return the offset
+ * of current level.
+ */
+static inline uint32_t vtd_iova_fl_level_offset(uint64_t iova, uint32_t level)
+{
+return (iova >> vtd_flpt_level_shift(level)) &
+((1ULL << VTD_FL_LEVEL_BITS) - 1);
+}
+
+/* Get the content of a flpte located in @base_addr[@index] */
+static uint64_t vtd_get_flpte(dma_addr_t base_addr, uint32_t index)
+{
+uint64_t flpte;
+
+assert(index < VTD_FL_PT_ENTRY_NR);
+
+if (dma_memory_read(_space_memory,
+base_addr + index * sizeof(flpte), ,
+sizeof(flpte), MEMTXATTRS_UNSPECIFIED)) {
+flpte = (uint64_t)-1;
+return flpte;
+}
+flpte = le64_to_cpu(flpte);
+return flpte;
+}
+
+static inline bool vtd_flpte_present(uint64_t flpte)
+{
+return !!(flpte & 0x1);
+}
+
+/* Whether the pte indicates the address of the page frame */
+static inline bool vtd_is_last_flpte(uint64_t flpte, uint32_t level)
+{
+return level == VTD_FL_PT_LEVEL || (flpte & VTD_FL_PT_PAGE_SIZE_MASK);
+}
+
+static inline uint64_t vtd_get_flpte_addr(uint64_t flpte, uint8_t aw)
+{
+return flpte & VTD_FL_PT_BASE_ADDR_MASK(aw);
+}
+
+/*
+ * Given the @iova, get relevant @flptep. @flpte_level will be the last level
+ * of the translation, can be used for deciding the size of large page.
+ *

[PATCH rfcv2 12/17] intel_iommu: add an internal API to find an address space with PASID

2024-05-22 Thread Zhenzhong Duan
From: Clément Mathieu--Drif 

This will be used to implement the device IOTLB invalidation

Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 39 ---
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 3c14fd85cc..7ae8df2f49 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -64,6 +64,11 @@ struct vtd_as_key {
 uint32_t pasid;
 };
 
+struct vtd_as_raw_key {
+uint16_t sid;
+uint32_t pasid;
+};
+
 struct vtd_iotlb_key {
 uint64_t gfn;
 uint32_t pasid;
@@ -1856,29 +1861,33 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
 return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
 }
 
-static gboolean vtd_find_as_by_sid(gpointer key, gpointer value,
-   gpointer user_data)
+static gboolean vtd_find_as_by_sid_and_pasid(gpointer key, gpointer value,
+ gpointer user_data)
 {
 struct vtd_as_key *as_key = (struct vtd_as_key *)key;
-uint16_t target_sid = *(uint16_t *)user_data;
+struct vtd_as_raw_key target = *(struct vtd_as_raw_key *)user_data;
 uint16_t sid = PCI_BUILD_BDF(pci_bus_num(as_key->bus), as_key->devfn);
-return sid == target_sid;
+
+return (as_key->pasid == target.pasid) &&
+   (sid == target.sid);
 }
 
-static VTDAddressSpace *vtd_get_as_by_sid(IntelIOMMUState *s, uint16_t sid)
+static VTDAddressSpace *vtd_get_as_by_sid_and_pasid(IntelIOMMUState *s,
+uint16_t sid,
+uint32_t pasid)
 {
-uint8_t bus_num = PCI_BUS_NUM(sid);
-VTDAddressSpace *vtd_as = s->vtd_as_cache[bus_num];
-
-if (vtd_as &&
-(sid == PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn))) {
-return vtd_as;
-}
+struct vtd_as_raw_key key = {
+.sid = sid,
+.pasid = pasid
+};
 
-vtd_as = g_hash_table_find(s->vtd_address_spaces, vtd_find_as_by_sid, 
);
-s->vtd_as_cache[bus_num] = vtd_as;
+return g_hash_table_find(s->vtd_address_spaces,
+ vtd_find_as_by_sid_and_pasid, );
+}
 
-return vtd_as;
+static VTDAddressSpace *vtd_get_as_by_sid(IntelIOMMUState *s, uint16_t sid)
+{
+return vtd_get_as_by_sid_and_pasid(s, sid, PCI_NO_PASID);
 }
 
 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
-- 
2.34.1




[PATCH rfcv2 09/17] intel_iommu: Flush stage-1 cache in iotlb invalidation

2024-05-22 Thread Zhenzhong Duan
According to spec, Page-Selective-within-Domain Invalidation (11b):

1. IOTLB entries caching second-stage mappings (PGTT=010b) or pass-through
(PGTT=100b) mappings associated with the specified domain-id and the
input-address range are invalidated.
2. IOTLB entries caching first-stage (PGTT=001b) or nested (PGTT=011b)
mapping associated with specified domain-id are invalidated.

So per spec definition the Page-Selective-within-Domain Invalidation
needs to flush first-stage and nested cached IOTLB entries as well.

We don't support nested yet and pass-through mappings are never cached,
so the iotlb cache only contains first-stage and second-stage mappings.

Add a tag pgtt in VTDIOTLBEntry to mark PGTT type of the mapping and
invalidate entries based on PGTT type.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/i386/intel_iommu.c | 20 +---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 011f374883..b0d5b5a5be 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -156,6 +156,7 @@ struct VTDIOTLBEntry {
 uint64_t pte;
 uint64_t mask;
 uint8_t access_flags;
+uint8_t pgtt;
 };
 
 /* VT-d Source-ID Qualifier types */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 0801112e2e..0078bad9d4 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -287,9 +287,21 @@ static gboolean vtd_hash_remove_by_page(gpointer key, 
gpointer value,
 VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
 uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
 uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
-return (entry->domain_id == info->domain_id) &&
-(((entry->gfn & info->mask) == gfn) ||
- (entry->gfn == gfn_tlb));
+
+if (entry->domain_id != info->domain_id) {
+return false;
+}
+
+/*
+ * According to spec, IOTLB entries caching first-stage (PGTT=001b) or
+ * nested (PGTT=011b) mapping associated with specified domain-id are
+ * invalidated. Nested isn't supported yet, so only need to check 001b.
+ */
+if (entry->pgtt == VTD_SM_PASID_ENTRY_FLT) {
+return true;
+}
+
+return (entry->gfn & info->mask) == gfn || entry->gfn == gfn_tlb;
 }
 
 /* Reset all the gen of VTDAddressSpace to zero and set the gen of
@@ -382,6 +394,8 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t 
source_id,
 entry->access_flags = access_flags;
 entry->mask = vtd_slpt_level_page_mask(level);
 entry->pasid = pasid;
+entry->pgtt = s->scalable_modern ? VTD_SM_PASID_ENTRY_FLT
+ : VTD_SM_PASID_ENTRY_SLT;
 
 key->gfn = gfn;
 key->sid = source_id;
-- 
2.34.1




[PATCH rfcv2 03/17] intel_iommu: Add a placeholder variable for scalable modern mode

2024-05-22 Thread Zhenzhong Duan
Add a new element scalable_modern in IntelIOMMUState to mark scalable
modern mode; this element will eventually be exposed as an intel_iommu
property.

For now, it's only a placeholder, used for cap/ecap initialization,
compatibility checks and blocking host device passthrough until nesting
is supported.

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  2 ++
 include/hw/i386/intel_iommu.h  |  1 +
 hw/i386/intel_iommu.c  | 37 --
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 955bc24787..75aea80942 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -196,6 +196,7 @@
 #define VTD_ECAP_PASID  (1ULL << 40)
 #define VTD_ECAP_SMTS   (1ULL << 43)
 #define VTD_ECAP_SLTS   (1ULL << 46)
+#define VTD_ECAP_FLTS   (1ULL << 47)
 
 /* CAP_REG */
 /* (offset >> 4) << 24 */
@@ -212,6 +213,7 @@
 #define VTD_CAP_SLLPS   ((1ULL << 34) | (1ULL << 35))
 #define VTD_CAP_DRAIN_WRITE (1ULL << 54)
 #define VTD_CAP_DRAIN_READ  (1ULL << 55)
+#define VTD_CAP_FS1GP   (1ULL << 56)
 #define VTD_CAP_DRAIN   (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE)
 #define VTD_CAP_CM  (1ULL << 7)
 #define VTD_PASID_ID_SHIFT  20
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 2bbde41e45..9ba9c45015 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -263,6 +263,7 @@ struct IntelIOMMUState {
 
 bool caching_mode;  /* RO - is cap CM enabled? */
 bool scalable_mode; /* RO - is Scalable Mode supported? */
+bool scalable_modern;   /* RO - is modern SM supported? */
 bool snoop_control; /* RO - is SNP filed supported? */
 
 dma_addr_t root;/* Current root table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 348e3a441e..6d1d94ada3 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -738,16 +738,20 @@ static inline bool vtd_is_level_supported(IntelIOMMUState 
*s, uint32_t level)
 }
 
 /* Return true if check passed, otherwise false */
-static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
- VTDPASIDEntry *pe)
+static inline bool vtd_pe_type_check(IntelIOMMUState *s, VTDPASIDEntry *pe)
 {
+X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
 switch (VTD_PE_GET_TYPE(pe)) {
+case VTD_SM_PASID_ENTRY_FLT:
+return s->scalable_modern;
 case VTD_SM_PASID_ENTRY_SLT:
-return true;
+return !s->scalable_modern;
+case VTD_SM_PASID_ENTRY_NESTED:
+/* Not support NESTED page table type yet */
+return false;
 case VTD_SM_PASID_ENTRY_PT:
 return x86_iommu->pt_supported;
-case VTD_SM_PASID_ENTRY_FLT:
-case VTD_SM_PASID_ENTRY_NESTED:
 default:
 /* Unknown type */
 return false;
@@ -796,7 +800,6 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
 uint8_t pgtt;
 uint32_t index;
 dma_addr_t entry_size;
-X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 
 index = VTD_PASID_TABLE_INDEX(pasid);
 entry_size = VTD_PASID_ENTRY_SIZE;
@@ -810,7 +813,7 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState 
*s,
 }
 
 /* Do translation type check */
-if (!vtd_pe_type_check(x86_iommu, pe)) {
+if (!vtd_pe_type_check(s, pe)) {
 return -VTD_FR_PASID_TABLE_ENTRY_INV;
 }
 
@@ -3839,8 +3842,17 @@ static bool vtd_check_hdev(IntelIOMMUState *s, 
VTDHostIOMMUDevice *vtd_hdev,
 error_setg(errp, "aw-bits %d > host aw-bits %d", s->aw_bits, ret);
 return false;
 }
-#endif
+
+if (!s->scalable_modern) {
+/* All checks requested by VTD non-modern mode pass */
+return true;
+}
+
+error_setg(errp, "host device is unsupported in scalable modern mode yet");
+return false;
+#else
 return true;
+#endif
 }
 
 static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
@@ -4076,7 +4088,10 @@ static void vtd_cap_init(IntelIOMMUState *s)
 }
 
 /* TODO: read cap/ecap from host to decide which cap to be exposed. */
-if (s->scalable_mode) {
+if (s->scalable_modern) {
+s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_FLTS;
+s->cap |= VTD_CAP_FS1GP;
+} else if (s->scalable_mode) {
 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
 }
 
@@ -4243,9 +4258,9 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 }
 }
 
-/* Currently only address widths supported are 39 and 48 bits */
 if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
-

[PATCH rfcv2 07/17] intel_iommu: check if the input address is canonical

2024-05-22 Thread Zhenzhong Duan
From: Clément Mathieu--Drif 

First stage translation must fail if the address to translate is
not canonical.

Signed-off-by: Clément Mathieu--Drif 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  2 ++
 hw/i386/intel_iommu.c  | 21 +
 2 files changed, 23 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index abfdbd5f65..b6820dbca3 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -323,6 +323,8 @@ typedef enum VTDFaultReason {
 VTD_FR_PASID_ENTRY_P = 0x59,
 VTD_FR_PASID_TABLE_ENTRY_INV = 0x5b,  /*Invalid PASID table entry */
 
+VTD_FR_FS_NON_CANONICAL = 0x80, /* SNG.1 : Address for FS not canonical.*/
+
 /* Output address in the interrupt address range for scalable mode */
 VTD_FR_SM_INTERRUPT_ADDR = 0x87,
 VTD_FR_MAX, /* Guard */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index cf29809bc1..1ea030bfbe 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1801,6 +1801,7 @@ static const bool vtd_qualified_faults[] = {
 [VTD_FR_PASID_ENTRY_P] = true,
 [VTD_FR_PASID_TABLE_ENTRY_INV] = true,
 [VTD_FR_SM_INTERRUPT_ADDR] = true,
+[VTD_FR_FS_NON_CANONICAL] = true,
 [VTD_FR_MAX] = false,
 };
 
@@ -1912,6 +1913,20 @@ static inline uint64_t vtd_get_flpte_addr(uint64_t 
flpte, uint8_t aw)
 return flpte & VTD_FL_PT_BASE_ADDR_MASK(aw);
 }
 
+/* Return true if IOVA is canonical, otherwise false. */
+static bool vtd_iova_fl_check_canonical(IntelIOMMUState *s, uint64_t iova,
+VTDContextEntry *ce, uint32_t pasid)
+{
+uint64_t iova_limit = vtd_iova_limit(s, ce, s->aw_bits, pasid);
+uint64_t upper_bits_mask = ~(iova_limit - 1);
+uint64_t upper_bits = iova & upper_bits_mask;
+bool msb = ((iova & (iova_limit >> 1)) != 0);
+return !(
+ (!msb && (upper_bits != 0)) ||
+ (msb && (upper_bits != upper_bits_mask))
+);
+}
+
 /*
  * Given the @iova, get relevant @flptep. @flpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
@@ -1927,6 +1942,12 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, 
VTDContextEntry *ce,
 uint32_t offset;
 uint64_t flpte;
 
+if (!vtd_iova_fl_check_canonical(s, iova, ce, pasid)) {
+error_report_once("%s: detected non canonical IOVA (iova=0x%" PRIx64 
","
+  "pasid=0x%" PRIx32 ")", __func__, iova, pasid);
+return -VTD_FR_FS_NON_CANONICAL;
+}
+
 while (true) {
 offset = vtd_iova_fl_level_offset(iova, level);
 flpte = vtd_get_flpte(addr, offset);
-- 
2.34.1




[PATCH rfcv2 02/17] intel_iommu: Make pasid entry type check accurate

2024-05-22 Thread Zhenzhong Duan
When the guest configures Nested Translation (011b) or First-stage Translation
only (001b), the type check passes inaccurately.

Fail the type check in those cases as their emulation isn't supported yet.

Fixes: fb43cf739e1 ("intel_iommu: scalable mode emulation")
Suggested-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index d85aaf4bb8..348e3a441e 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -742,20 +742,16 @@ static inline bool vtd_pe_type_check(X86IOMMUState 
*x86_iommu,
  VTDPASIDEntry *pe)
 {
 switch (VTD_PE_GET_TYPE(pe)) {
-case VTD_SM_PASID_ENTRY_FLT:
 case VTD_SM_PASID_ENTRY_SLT:
-case VTD_SM_PASID_ENTRY_NESTED:
-break;
+return true;
 case VTD_SM_PASID_ENTRY_PT:
-if (!x86_iommu->pt_supported) {
-return false;
-}
-break;
+return x86_iommu->pt_supported;
+case VTD_SM_PASID_ENTRY_FLT:
+case VTD_SM_PASID_ENTRY_NESTED:
 default:
 /* Unknown type */
 return false;
 }
-return true;
 }
 
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
-- 
2.34.1




[PATCH v2 18/20] vfio/igd: Use g_autofree in vfio_probe_igd_bar4_quirk()

2024-05-21 Thread Zhenzhong Duan
Pointer opregion, host and lpc are allocated and freed in
vfio_probe_igd_bar4_quirk(). Use g_autofree to automatically
free them.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/igd.c | 27 ---
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c
index 1e79202f2b..d320d032a7 100644
--- a/hw/vfio/igd.c
+++ b/hw/vfio/igd.c
@@ -368,7 +368,9 @@ static const MemoryRegionOps vfio_igd_index_quirk = {
 void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 {
 g_autofree struct vfio_region_info *rom = NULL;
-struct vfio_region_info *opregion = NULL, *host = NULL, *lpc = NULL;
+g_autofree struct vfio_region_info *opregion = NULL;
+g_autofree struct vfio_region_info *host = NULL;
+g_autofree struct vfio_region_info *lpc = NULL;
 VFIOQuirk *quirk;
 VFIOIGDQuirk *igd;
 PCIDevice *lpc_bridge;
@@ -426,7 +428,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 if ((ret || !rom->size) && !vdev->pdev.romfile) {
 error_report("IGD device %s has no ROM, legacy mode disabled",
  vdev->vbasedev.name);
-goto out;
+return;
 }
 
 /*
@@ -437,7 +439,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 error_report("IGD device %s hotplugged, ROM disabled, "
  "legacy mode disabled", vdev->vbasedev.name);
 vdev->rom_read_failed = true;
-goto out;
+return;
 }
 
 /*
@@ -450,7 +452,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 if (ret) {
 error_report("IGD device %s does not support OpRegion access,"
  "legacy mode disabled", vdev->vbasedev.name);
-goto out;
+return;
 }
 
 ret = vfio_get_dev_region_info(>vbasedev,
@@ -459,7 +461,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 if (ret) {
 error_report("IGD device %s does not support host bridge access,"
  "legacy mode disabled", vdev->vbasedev.name);
-goto out;
+return;
 }
 
 ret = vfio_get_dev_region_info(>vbasedev,
@@ -468,7 +470,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 if (ret) {
 error_report("IGD device %s does not support LPC bridge access,"
  "legacy mode disabled", vdev->vbasedev.name);
-goto out;
+return;
 }
 
 gmch = vfio_pci_read_config(>pdev, IGD_GMCH, 4);
@@ -482,7 +484,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 error_report("IGD device %s failed to enable VGA access, "
  "legacy mode disabled", vdev->vbasedev.name);
-goto out;
+return;
 }
 
 /* Create our LPC/ISA bridge */
@@ -490,7 +492,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 if (ret) {
 error_report("IGD device %s failed to create LPC bridge, "
  "legacy mode disabled", vdev->vbasedev.name);
-goto out;
+return;
 }
 
 /* Stuff some host values into the VM PCI host bridge */
@@ -498,14 +500,14 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int 
nr)
 if (ret) {
 error_report("IGD device %s failed to modify host bridge, "
  "legacy mode disabled", vdev->vbasedev.name);
-goto out;
+return;
 }
 
 /* Setup OpRegion access */
 if (!vfio_pci_igd_opregion_init(vdev, opregion, )) {
 error_append_hint(, "IGD legacy mode disabled\n");
 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
-goto out;
+return;
 }
 
 /* Setup our quirk to munge GTT addresses to the VM allocated buffer */
@@ -607,9 +609,4 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 }
 
 trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, ggms_mb + gms_mb);
-
-out:
-g_free(opregion);
-g_free(host);
-g_free(lpc);
 }
-- 
2.34.1




[PATCH v2 16/20] vfio/pci-quirks: Make vfio_add_*_cap() return bool

2024-05-21 Thread Zhenzhong Duan
This is to follow the coding standard in qapi/error.h to return bool
for bool-valued functions.

This includes the functions below:
vfio_add_virt_caps()
vfio_add_nv_gpudirect_cap()
vfio_add_vmd_shadow_cap()

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 hw/vfio/pci.h|  2 +-
 hw/vfio/pci-quirks.c | 42 +++---
 hw/vfio/pci.c|  3 +--
 3 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index f158681072..bf67df2fbc 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -212,7 +212,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr);
 void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr);
 void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr);
 void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev);
-int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp);
+bool vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp);
 void vfio_quirk_reset(VFIOPCIDevice *vdev);
 VFIOQuirk *vfio_quirk_alloc(int nr_mem);
 void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr);
diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index ca27917159..39dae72497 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -1536,7 +1536,7 @@ static bool is_valid_std_cap_offset(uint8_t pos)
 pos <= (PCI_CFG_SPACE_SIZE - PCI_CAP_SIZEOF));
 }
 
-static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
+static bool vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
 {
 ERRP_GUARD();
 PCIDevice *pdev = >pdev;
@@ -1545,18 +1545,18 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice 
*vdev, Error **errp)
 uint8_t tmp;
 
 if (vdev->nv_gpudirect_clique == 0xFF) {
-return 0;
+return true;
 }
 
 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
 error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
-return -EINVAL;
+return false;
 }
 
 if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
 PCI_BASE_CLASS_DISPLAY) {
 error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
-return -EINVAL;
+return false;
 }
 
 /*
@@ -1572,7 +1572,7 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, 
Error **errp)
 vdev->config_offset + PCI_CAPABILITY_LIST);
 if (ret != 1 || !is_valid_std_cap_offset(tmp)) {
 error_setg(errp, "NVIDIA GPUDirect Clique ID: error getting cap list");
-return -EINVAL;
+return false;
 }
 
 do {
@@ -1590,13 +1590,13 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice 
*vdev, Error **errp)
 pos = 0xD4;
 } else {
 error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid config space");
-return -EINVAL;
+return false;
 }
 
 ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
 if (ret < 0) {
 error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
-return ret;
+return false;
 }
 
 memset(vdev->emulated_config_bits + pos, 0xFF, 8);
@@ -1608,7 +1608,7 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, 
Error **errp)
 pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
 pci_set_byte(pdev->config + pos, 0);
 
-return 0;
+return true;
 }
 
 /*
@@ -1629,7 +1629,7 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, 
Error **errp)
  */
 #define VMD_SHADOW_CAP_VER 1
 #define VMD_SHADOW_CAP_LEN 24
-static int vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp)
+static bool vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp)
 {
 ERRP_GUARD();
 uint8_t membar_phys[16];
@@ -1639,7 +1639,7 @@ static int vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, 
Error **errp)
   vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x467F) ||
   vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x4C3D) ||
   vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x9A0B))) {
-return 0;
+return true;
 }
 
 ret = pread(vdev->vbasedev.fd, membar_phys, 16,
@@ -1647,14 +1647,14 @@ static int vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, 
Error **errp)
 if (ret != 16) {
 error_report("VMD %s cannot read MEMBARs (%d)",
  vdev->vbasedev.name, ret);
-return -EFAULT;
+return false;
 }
 
 ret = pci_add_capability(>pdev, PCI_CAP_ID_VNDR, pos,
  VMD_SHADOW_CAP_LEN, errp);
 if (ret < 0) {
 error_prepend(errp, "Failed to add VMD MEMBAR Shadow cap: ");
-return ret;
+return false;
 }
 
 memset(vdev->emulated_config_bits + pos, 0xFF, VMD_SHADOW_CAP_LEN);
@@ -1664,22 +1664,18 @@ static int vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, 
Error **errp)
 pci_set_long(vdev->pdev.c

[PATCH v2 14/20] vfio/pci: Use g_autofree for vfio_region_info pointer

2024-05-21 Thread Zhenzhong Duan
Pointer opregion is freed after vfio_pci_igd_opregion_init().
Use 'g_autofree' to avoid the g_free() calls.

Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 hw/vfio/pci.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index c3323912dd..8379d2284a 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3143,7 +3143,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 if (!vdev->igd_opregion &&
 vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) {
-struct vfio_region_info *opregion;
+g_autofree struct vfio_region_info *opregion = NULL;
 
 if (vdev->pdev.qdev.hotplugged) {
 error_setg(errp,
@@ -3162,7 +3162,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 }
 
 ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
-g_free(opregion);
 if (ret) {
 goto out_teardown;
 }
-- 
2.34.1




[PATCH v2 09/20] vfio/pci: Make vfio_pci_relocate_msix() and vfio_msix_early_setup() return a bool

2024-05-21 Thread Zhenzhong Duan
Since vfio_pci_relocate_msix() and vfio_msix_early_setup() take
an 'Error **' argument, best practices suggest returning a bool.
See the qapi/error.h Rules section.

While at it, pass errp directly to vfio_msix_early_setup() to avoid
calling error_propagate().

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 33 -
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 12fb534d79..4fb5fd0c9f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1450,13 +1450,13 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice 
*vdev)
 }
 }
 
-static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
+static bool vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
 {
 int target_bar = -1;
 size_t msix_sz;
 
 if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
-return;
+return true;
 }
 
 /* The actual minimum size of MSI-X structures */
@@ -1479,7 +1479,7 @@ static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, 
Error **errp)
 if (target_bar < 0) {
 error_setg(errp, "No automatic MSI-X relocation available for "
"device %04x:%04x", vdev->vendor_id, vdev->device_id);
-return;
+return false;
 }
 } else {
 target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
@@ -1489,7 +1489,7 @@ static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, 
Error **errp)
 if (vdev->bars[target_bar].ioport) {
 error_setg(errp, "Invalid MSI-X relocation BAR %d, "
"I/O port BAR", target_bar);
-return;
+return false;
 }
 
 /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
@@ -1497,7 +1497,7 @@ static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, 
Error **errp)
  target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
 error_setg(errp, "Invalid MSI-X relocation BAR %d, "
"consumed by 64-bit BAR %d", target_bar, target_bar - 1);
-return;
+return false;
 }
 
 /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
@@ -1505,7 +1505,7 @@ static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, 
Error **errp)
 !vdev->bars[target_bar].mem64) {
 error_setg(errp, "Invalid MSI-X relocation BAR %d, "
"no space to extend 32-bit BAR", target_bar);
-return;
+return false;
 }
 
 /*
@@ -1540,6 +1540,7 @@ static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, 
Error **errp)
 
 trace_vfio_msix_relo(vdev->vbasedev.name,
  vdev->msix->table_bar, vdev->msix->table_offset);
+return true;
 }
 
 /*
@@ -1550,7 +1551,7 @@ static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, 
Error **errp)
  * need to first look for where the MSI-X table lives.  So we
  * unfortunately split MSI-X setup across two functions.
  */
-static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
+static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
 {
 uint8_t pos;
 uint16_t ctrl;
@@ -1562,25 +1563,25 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 
 pos = pci_find_capability(>pdev, PCI_CAP_ID_MSIX);
 if (!pos) {
-return;
+return true;
 }
 
 if (pread(fd, , sizeof(ctrl),
   vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
 error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
-return;
+return false;
 }
 
 if (pread(fd, , sizeof(table),
   vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
 error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
-return;
+return false;
 }
 
 if (pread(fd, , sizeof(pba),
   vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
 error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
-return;
+return false;
 }
 
 ctrl = le16_to_cpu(ctrl);
@@ -1598,7 +1599,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 if (ret < 0) {
 error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
 g_free(msix);
-return;
+return false;
 }
 
 msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
@@ -1630,7 +1631,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 error_setg(errp, "hardware reports invalid configuration, "
"MSIX PBA outside of specified BAR");
 g_free(msix);
-return;
+return false;
 }
 }
 
@@ -1641,7 +1642,7 @@ static

[PATCH v2 07/20] vfio/ccw: Make vfio_ccw_get_region() return a bool

2024-05-21 Thread Zhenzhong Duan
Since vfio_ccw_get_region() takes an 'Error **' argument,
best practices suggest returning a bool. See the qapi/error.h
Rules section.

Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 hw/vfio/ccw.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 8850ca17c8..2600e62e37 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -474,7 +474,7 @@ static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice 
*vcdev,
 event_notifier_cleanup(notifier);
 }
 
-static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
+static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
 {
 VFIODevice *vdev = >vdev;
 struct vfio_region_info *info;
@@ -483,7 +483,7 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error 
**errp)
 /* Sanity check device */
 if (!(vdev->flags & VFIO_DEVICE_FLAGS_CCW)) {
 error_setg(errp, "vfio: Um, this isn't a vfio-ccw device");
-return;
+return false;
 }
 
 /*
@@ -493,13 +493,13 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, 
Error **errp)
 if (vdev->num_regions < VFIO_CCW_CONFIG_REGION_INDEX + 1) {
 error_setg(errp, "vfio: too few regions (%u), expected at least %u",
vdev->num_regions, VFIO_CCW_CONFIG_REGION_INDEX + 1);
-return;
+return false;
 }
 
 ret = vfio_get_region_info(vdev, VFIO_CCW_CONFIG_REGION_INDEX, );
 if (ret) {
 error_setg_errno(errp, -ret, "vfio: Error getting config info");
-return;
+return false;
 }
 
 vcdev->io_region_size = info->size;
@@ -553,7 +553,7 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error 
**errp)
 g_free(info);
 }
 
-return;
+return true;
 
 out_err:
 g_free(vcdev->crw_region);
@@ -561,7 +561,7 @@ out_err:
 g_free(vcdev->async_cmd_region);
 g_free(vcdev->io_region);
 g_free(info);
-return;
+return false;
 }
 
 static void vfio_ccw_put_region(VFIOCCWDevice *vcdev)
@@ -597,8 +597,7 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
 goto out_attach_dev_err;
 }
 
-vfio_ccw_get_region(vcdev, );
-if (err) {
+if (!vfio_ccw_get_region(vcdev, )) {
 goto out_region_err;
 }
 
-- 
2.34.1




[PATCH v2 00/20] VFIO: misc cleanups part2

2024-05-21 Thread Zhenzhong Duan
Hi

This is the last round of cleanup series to change functions in hw/vfio/
to return bool when the error is passed through errp parameter.

The first round is at 
https://lists.gnu.org/archive/html/qemu-devel/2024-05/msg01147.html

I see Cédric is also working on some migration cleanup, so I didn't
touch migration.c, but all other files in hw/vfio/ are cleaned up now.

Patch 1 and patch 20 are fixes; all the others are cleanup patches.

Test done on x86 platform:
vfio device hotplug/unplug with different backend
reboot

This series is rebased to https://github.com/legoater/qemu/tree/vfio-next

Thanks
Zhenzhong

Changelog:
v2:
- add patch17 to use g_autofree in all call sites of vfio_get_region_info()
(Cédric)
- add patch18 to use g_autofree in vfio_probe_igd_bar4_quirk()
- add patch19 to drop local err in vfio_ccw_realize() (Cédric)
- add patch20 to fix a bug I just found
- add R-B

Zhenzhong Duan (20):
  vfio/display: Fix error path in call site of ramfb_setup()
  vfio/display: Make vfio_display_*() return bool
  vfio/helpers: Use g_autofree in vfio_set_irq_signaling()
  vfio/helpers: Make vfio_set_irq_signaling() return bool
  vfio/helpers: Make vfio_device_get_name() return bool
  vfio/platform: Make vfio_populate_device() and vfio_base_device_init()
return bool
  vfio/ccw: Make vfio_ccw_get_region() return a bool
  vfio/pci: Make vfio_intx_enable_kvm() return a bool
  vfio/pci: Make vfio_pci_relocate_msix() and vfio_msix_early_setup()
return a bool
  vfio/pci: Make vfio_populate_device() return a bool
  vfio/pci: Make vfio_intx_enable() return bool
  vfio/pci: Make vfio_populate_vga() return bool
  vfio/pci: Make capability related functions return bool
  vfio/pci: Use g_autofree for vfio_region_info pointer
  vfio/pci-quirks: Make vfio_pci_igd_opregion_init() return bool
  vfio/pci-quirks: Make vfio_add_*_cap() return bool
  vfio: Use g_autofree in all call site of vfio_get_region_info()
  vfio/igd: Use g_autofree in vfio_probe_igd_bar4_quirk()
  vfio/ccw: Drop local @err in vfio_ccw_realize()
  vfio/ccw: Fix the missed unrealize() call in error path

 hw/vfio/pci.h |  12 +-
 include/hw/vfio/vfio-common.h |   6 +-
 hw/vfio/ap.c  |  10 +-
 hw/vfio/ccw.c |  47 ---
 hw/vfio/display.c |  22 +--
 hw/vfio/helpers.c |  36 ++---
 hw/vfio/igd.c |  35 +++--
 hw/vfio/pci-quirks.c  |  50 ---
 hw/vfio/pci.c | 243 --
 hw/vfio/platform.c|  61 -
 10 files changed, 241 insertions(+), 281 deletions(-)

-- 
2.34.1




[PATCH v2 13/20] vfio/pci: Make capability related functions return bool

2024-05-21 Thread Zhenzhong Duan
The functions operating on capability don't have a consistent return style.

The functions below follow the bool-valued function style:
vfio_msi_setup()
vfio_msix_setup()
vfio_add_std_cap()
vfio_add_capabilities()

The two below are integer-valued functions:
vfio_add_vendor_specific_cap()
vfio_setup_pcie_cap()

But the returned integer is only used to check for success or failure.
Change them all to return bool so that all capability related
functions follow the coding standard in qapi/error.h to return
bool.

Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 hw/vfio/pci.c | 77 ---
 1 file changed, 36 insertions(+), 41 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ab8f74299e..c3323912dd 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1339,7 +1339,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
 }
 }
 
-static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
+static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
 {
 uint16_t ctrl;
 bool msi_64bit, msi_maskbit;
@@ -1349,7 +1349,7 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, 
Error **errp)
 if (pread(vdev->vbasedev.fd, , sizeof(ctrl),
   vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
 error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
-return -errno;
+return false;
 }
 ctrl = le16_to_cpu(ctrl);
 
@@ -1362,14 +1362,14 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, 
Error **errp)
 ret = msi_init(>pdev, pos, entries, msi_64bit, msi_maskbit, );
 if (ret < 0) {
 if (ret == -ENOTSUP) {
-return 0;
+return true;
 }
 error_propagate_prepend(errp, err, "msi_init failed: ");
-return ret;
+return false;
 }
 vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
 
-return 0;
+return true;
 }
 
 static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
@@ -1644,7 +1644,7 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 return vfio_pci_relocate_msix(vdev, errp);
 }
 
-static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
+static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
 {
 int ret;
 Error *err = NULL;
@@ -1660,11 +1660,11 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int 
pos, Error **errp)
 if (ret < 0) {
 if (ret == -ENOTSUP) {
 warn_report_err(err);
-return 0;
+return true;
 }
 
 error_propagate(errp, err);
-return ret;
+return false;
 }
 
 /*
@@ -1698,7 +1698,7 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, 
Error **errp)
 memory_region_set_enabled(>pdev.msix_table_mmio, false);
 }
 
-return 0;
+return true;
 }
 
 static void vfio_teardown_msi(VFIOPCIDevice *vdev)
@@ -1977,8 +1977,8 @@ static void vfio_pci_disable_rp_atomics(VFIOPCIDevice 
*vdev)
 }
 }
 
-static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
-   Error **errp)
+static bool vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
+Error **errp)
 {
 uint16_t flags;
 uint8_t type;
@@ -1992,7 +1992,7 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int 
pos, uint8_t size,
 
 error_setg(errp, "assignment of PCIe type 0x%x "
"devices is not currently supported", type);
-return -EINVAL;
+return false;
 }
 
 if (!pci_bus_is_express(pci_get_bus(>pdev))) {
@@ -2025,7 +2025,7 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int 
pos, uint8_t size,
 }
 
 if (pci_bus_is_express(bus)) {
-return 0;
+return true;
 }
 
 } else if (pci_bus_is_root(pci_get_bus(>pdev))) {
@@ -2063,7 +2063,7 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int 
pos, uint8_t size,
  * Legacy endpoints don't belong on the root complex.  Windows
  * seems to be happier with devices if we skip the capability.
  */
-return 0;
+return true;
 }
 
 } else {
@@ -2099,12 +2099,12 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int 
pos, uint8_t size,
 pos = pci_add_capability(>pdev, PCI_CAP_ID_EXP, pos, size,
  errp);
 if (pos < 0) {
-return pos;
+return false;
 }
 
 vdev->pdev.exp.exp_cap = pos;
 
-return pos;
+return true;
 }
 
 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
@@ -2137,14 +2137,14 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, 
uint8_t pos)
 }
 }
 
-static int vfio_add_vendor_specific_cap(VFIOPCIDevice *vdev, int pos,
-

[PATCH v2 01/20] vfio/display: Fix error path in call site of ramfb_setup()

2024-05-21 Thread Zhenzhong Duan
vfio_display_dmabuf_init() and vfio_display_region_init() call
ramfb_setup() without checking its return value.

So we may run into a situation where vfio_display_probe() succeeds
but errp is set. This is risky and may lead to an assertion failure in
error_setv().

Cc: Gerd Hoffmann 
Fixes: b290659fc3d ("hw/vfio/display: add ramfb support")
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 hw/vfio/display.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/hw/vfio/display.c b/hw/vfio/display.c
index fe624a6c9b..d28b724102 100644
--- a/hw/vfio/display.c
+++ b/hw/vfio/display.c
@@ -361,6 +361,9 @@ static int vfio_display_dmabuf_init(VFIOPCIDevice *vdev, 
Error **errp)
   vdev);
 if (vdev->enable_ramfb) {
 vdev->dpy->ramfb = ramfb_setup(errp);
+if (!vdev->dpy->ramfb) {
+return -EINVAL;
+}
 }
 vfio_display_edid_init(vdev);
 return 0;
@@ -488,6 +491,9 @@ static int vfio_display_region_init(VFIOPCIDevice *vdev, 
Error **errp)
   vdev);
 if (vdev->enable_ramfb) {
 vdev->dpy->ramfb = ramfb_setup(errp);
+if (!vdev->dpy->ramfb) {
+return -EINVAL;
+}
 }
 return 0;
 }
-- 
2.34.1




[PATCH v2 05/20] vfio/helpers: Make vfio_device_get_name() return bool

2024-05-21 Thread Zhenzhong Duan
This is to follow the coding standard in qapi/error.h to return bool
for bool-valued functions.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-common.h | 2 +-
 hw/vfio/ap.c  | 2 +-
 hw/vfio/ccw.c | 2 +-
 hw/vfio/helpers.c | 8 
 hw/vfio/pci.c | 2 +-
 hw/vfio/platform.c| 5 ++---
 6 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b712799caf..4cb1ab8645 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -279,7 +279,7 @@ int vfio_get_dirty_bitmap(const VFIOContainerBase 
*bcontainer, uint64_t iova,
   uint64_t size, ram_addr_t ram_addr, Error **errp);
 
 /* Returns 0 on success, or a negative errno. */
-int vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
+bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
 void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
 void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
   DeviceState *dev, bool ram_discard);
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index d8a9615fee..c12531a788 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -158,7 +158,7 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
 VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
 VFIODevice *vbasedev = >vdev;
 
-if (vfio_device_get_name(vbasedev, errp) < 0) {
+if (!vfio_device_get_name(vbasedev, errp)) {
 return;
 }
 
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 1f578a3c75..8850ca17c8 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -588,7 +588,7 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
 }
 }
 
-if (vfio_device_get_name(vbasedev, errp) < 0) {
+if (!vfio_device_get_name(vbasedev, errp)) {
 return;
 }
 
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 9edbc96688..4b079dc383 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -607,7 +607,7 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, 
uint16_t cap_type)
 return ret;
 }
 
-int vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
+bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
 {
 ERRP_GUARD();
 struct stat st;
@@ -616,7 +616,7 @@ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
 if (stat(vbasedev->sysfsdev, ) < 0) {
 error_setg_errno(errp, errno, "no such host device");
 error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
-return -errno;
+return false;
 }
 /* User may specify a name, e.g: VFIO platform device */
 if (!vbasedev->name) {
@@ -625,7 +625,7 @@ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
 } else {
 if (!vbasedev->iommufd) {
 error_setg(errp, "Use FD passing only with iommufd backend");
-return -EINVAL;
+return false;
 }
 /*
  * Give a name with fd so any function printing out vbasedev->name
@@ -636,7 +636,7 @@ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
 }
 }
 
-return 0;
+return true;
 }
 
 void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 358da4497b..aad012c348 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2999,7 +2999,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 vdev->host.slot, vdev->host.function);
 }
 
-if (vfio_device_get_name(vbasedev, errp) < 0) {
+if (!vfio_device_get_name(vbasedev, errp)) {
 return;
 }
 
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index 3233ca8691..e1a32863d9 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -545,9 +545,8 @@ static int vfio_base_device_init(VFIODevice *vbasedev, 
Error **errp)
  vbasedev->name);
 }
 
-ret = vfio_device_get_name(vbasedev, errp);
-if (ret) {
-return ret;
+if (!vfio_device_get_name(vbasedev, errp)) {
+return -EINVAL;
 }
 
 if (!vfio_attach_device(vbasedev->name, vbasedev,
-- 
2.34.1




[PATCH v2 06/20] vfio/platform: Make vfio_populate_device() and vfio_base_device_init() return bool

2024-05-21 Thread Zhenzhong Duan
This is to follow the coding standard in qapi/error.h to return bool
for bool-valued functions.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 hw/vfio/platform.c | 40 +---
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index e1a32863d9..a85c199c76 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -441,7 +441,7 @@ static int vfio_platform_hot_reset_multi(VFIODevice 
*vbasedev)
  * @errp: error object
  *
  */
-static int vfio_populate_device(VFIODevice *vbasedev, Error **errp)
+static bool vfio_populate_device(VFIODevice *vbasedev, Error **errp)
 {
 VFIOINTp *intp, *tmp;
 int i, ret = -1;
@@ -450,7 +450,7 @@ static int vfio_populate_device(VFIODevice *vbasedev, Error 
**errp)
 
 if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PLATFORM)) {
 error_setg(errp, "this isn't a platform device");
-return ret;
+return false;
 }
 
 vdev->regions = g_new0(VFIORegion *, vbasedev->num_regions);
@@ -487,12 +487,11 @@ static int vfio_populate_device(VFIODevice *vbasedev, 
Error **errp)
 irq.flags);
 intp = vfio_init_intp(vbasedev, irq, errp);
 if (!intp) {
-ret = -1;
 goto irq_err;
 }
 }
 }
-return 0;
+return true;
 irq_err:
 timer_del(vdev->mmap_timer);
 QLIST_FOREACH_SAFE(intp, >intp_list, next, tmp) {
@@ -507,7 +506,7 @@ reg_error:
 g_free(vdev->regions[i]);
 }
 g_free(vdev->regions);
-return ret;
+return false;
 }
 
 /* specialized functions for VFIO Platform devices */
@@ -527,10 +526,8 @@ static VFIODeviceOps vfio_platform_ops = {
  * fd retrieval, resource query.
  * Precondition: the device name must be initialized
  */
-static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
+static bool vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
 {
-int ret;
-
 /* @fd takes precedence over @sysfsdev which takes precedence over @host */
 if (vbasedev->fd < 0 && vbasedev->sysfsdev) {
 g_free(vbasedev->name);
@@ -538,7 +535,7 @@ static int vfio_base_device_init(VFIODevice *vbasedev, 
Error **errp)
 } else if (vbasedev->fd < 0) {
 if (!vbasedev->name || strchr(vbasedev->name, '/')) {
 error_setg(errp, "wrong host device name");
-return -EINVAL;
+return false;
 }
 
 vbasedev->sysfsdev = g_strdup_printf("/sys/bus/platform/devices/%s",
@@ -546,20 +543,20 @@ static int vfio_base_device_init(VFIODevice *vbasedev, 
Error **errp)
 }
 
 if (!vfio_device_get_name(vbasedev, errp)) {
-return -EINVAL;
+return false;
 }
 
 if (!vfio_attach_device(vbasedev->name, vbasedev,
 _space_memory, errp)) {
-return -EINVAL;
+return false;
 }
 
-ret = vfio_populate_device(vbasedev, errp);
-if (ret) {
-vfio_detach_device(vbasedev);
+if (vfio_populate_device(vbasedev, errp)) {
+return true;
 }
 
-return ret;
+vfio_detach_device(vbasedev);
+return false;
 }
 
 /**
@@ -576,7 +573,7 @@ static void vfio_platform_realize(DeviceState *dev, Error 
**errp)
 VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
 SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
 VFIODevice *vbasedev = >vbasedev;
-int i, ret;
+int i;
 
 qemu_mutex_init(>intp_mutex);
 
@@ -584,9 +581,8 @@ static void vfio_platform_realize(DeviceState *dev, Error 
**errp)
 vbasedev->sysfsdev : vbasedev->name,
 vdev->compat);
 
-ret = vfio_base_device_init(vbasedev, errp);
-if (ret) {
-goto out;
+if (!vfio_base_device_init(vbasedev, errp)) {
+goto init_err;
 }
 
 if (!vdev->compat) {
@@ -618,11 +614,9 @@ static void vfio_platform_realize(DeviceState *dev, Error 
**errp)
 }
 sysbus_init_mmio(sbdev, vdev->regions[i]->mem);
 }
-out:
-if (!ret) {
-return;
-}
+return;
 
+init_err:
 if (vdev->vbasedev.name) {
 error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 } else {
-- 
2.34.1




[PATCH v2 02/20] vfio/display: Make vfio_display_*() return bool

2024-05-21 Thread Zhenzhong Duan
This is to follow the coding standard in qapi/error.h to return bool
for bool-valued functions.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 hw/vfio/pci.h |  2 +-
 hw/vfio/display.c | 20 ++--
 hw/vfio/pci.c |  3 +--
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 92cd62d115..a5ac9efd4b 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -232,7 +232,7 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
Error **errp);
 
 void vfio_display_reset(VFIOPCIDevice *vdev);
-int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
+bool vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
 void vfio_display_finalize(VFIOPCIDevice *vdev);
 
 extern const VMStateDescription vfio_display_vmstate;
diff --git a/hw/vfio/display.c b/hw/vfio/display.c
index d28b724102..661e921616 100644
--- a/hw/vfio/display.c
+++ b/hw/vfio/display.c
@@ -348,11 +348,11 @@ static const GraphicHwOps vfio_display_dmabuf_ops = {
 .ui_info= vfio_display_edid_ui_info,
 };
 
-static int vfio_display_dmabuf_init(VFIOPCIDevice *vdev, Error **errp)
+static bool vfio_display_dmabuf_init(VFIOPCIDevice *vdev, Error **errp)
 {
 if (!display_opengl) {
 error_setg(errp, "vfio-display-dmabuf: opengl not available");
-return -1;
+return false;
 }
 
 vdev->dpy = g_new0(VFIODisplay, 1);
@@ -362,11 +362,11 @@ static int vfio_display_dmabuf_init(VFIOPCIDevice *vdev, 
Error **errp)
 if (vdev->enable_ramfb) {
 vdev->dpy->ramfb = ramfb_setup(errp);
 if (!vdev->dpy->ramfb) {
-return -EINVAL;
+return false;
 }
 }
 vfio_display_edid_init(vdev);
-return 0;
+return true;
 }
 
 static void vfio_display_dmabuf_exit(VFIODisplay *dpy)
@@ -483,7 +483,7 @@ static const GraphicHwOps vfio_display_region_ops = {
 .gfx_update = vfio_display_region_update,
 };
 
-static int vfio_display_region_init(VFIOPCIDevice *vdev, Error **errp)
+static bool vfio_display_region_init(VFIOPCIDevice *vdev, Error **errp)
 {
 vdev->dpy = g_new0(VFIODisplay, 1);
 vdev->dpy->con = graphic_console_init(DEVICE(vdev), 0,
@@ -492,10 +492,10 @@ static int vfio_display_region_init(VFIOPCIDevice *vdev, 
Error **errp)
 if (vdev->enable_ramfb) {
 vdev->dpy->ramfb = ramfb_setup(errp);
 if (!vdev->dpy->ramfb) {
-return -EINVAL;
+return false;
 }
 }
-return 0;
+return true;
 }
 
 static void vfio_display_region_exit(VFIODisplay *dpy)
@@ -510,7 +510,7 @@ static void vfio_display_region_exit(VFIODisplay *dpy)
 
 /* -- */
 
-int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp)
+bool vfio_display_probe(VFIOPCIDevice *vdev, Error **errp)
 {
 struct vfio_device_gfx_plane_info probe;
 int ret;
@@ -533,11 +533,11 @@ int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp)
 
 if (vdev->display == ON_OFF_AUTO_AUTO) {
 /* not an error in automatic mode */
-return 0;
+return true;
 }
 
 error_setg(errp, "vfio: device doesn't support any (known) display 
method");
-return -1;
+return false;
 }
 
 void vfio_display_finalize(VFIOPCIDevice *vdev)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index c1adef5cf8..a447013a1d 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3200,8 +3200,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 }
 
 if (vdev->display != ON_OFF_AUTO_OFF) {
-ret = vfio_display_probe(vdev, errp);
-if (ret) {
+if (!vfio_display_probe(vdev, errp)) {
 goto out_deregister;
 }
 }
-- 
2.34.1




[PATCH v2 04/20] vfio/helpers: Make vfio_set_irq_signaling() return bool

2024-05-21 Thread Zhenzhong Duan
This is to follow the coding standard in qapi/error.h to return bool
for bool-valued functions.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-common.h |  4 ++--
 hw/vfio/ap.c  |  8 +++
 hw/vfio/ccw.c |  8 +++
 hw/vfio/helpers.c | 18 ++--
 hw/vfio/pci.c | 40 ++-
 hw/vfio/platform.c| 18 +++-
 6 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b7bb4f5304..b712799caf 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -207,8 +207,8 @@ void vfio_spapr_container_deinit(VFIOContainer *container);
 void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
 void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index);
 void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index);
-int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
-   int action, int fd, Error **errp);
+bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
+int action, int fd, Error **errp);
 void vfio_region_write(void *opaque, hwaddr addr,
uint64_t data, unsigned size);
 uint64_t vfio_region_read(void *opaque,
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index ba653ef70f..d8a9615fee 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -117,8 +117,8 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice 
*vapdev,
 fd = event_notifier_get_fd(notifier);
 qemu_set_fd_handler(fd, fd_read, NULL, vapdev);
 
-if (vfio_set_irq_signaling(vdev, irq, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd,
-   errp)) {
+if (!vfio_set_irq_signaling(vdev, irq, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd,
+errp)) {
 qemu_set_fd_handler(fd, NULL, NULL, vapdev);
 event_notifier_cleanup(notifier);
 }
@@ -141,8 +141,8 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice 
*vapdev,
 return;
 }
 
-if (vfio_set_irq_signaling(>vdev, irq, 0,
-   VFIO_IRQ_SET_ACTION_TRIGGER, -1, )) {
+if (!vfio_set_irq_signaling(>vdev, irq, 0,
+VFIO_IRQ_SET_ACTION_TRIGGER, -1, )) {
 warn_reportf_err(err, VFIO_MSG_PREFIX, vapdev->vdev.name);
 }
 
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 89bb980167..1f578a3c75 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -434,8 +434,8 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice 
*vcdev,
 fd = event_notifier_get_fd(notifier);
 qemu_set_fd_handler(fd, fd_read, NULL, vcdev);
 
-if (vfio_set_irq_signaling(vdev, irq, 0,
-   VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
+if (!vfio_set_irq_signaling(vdev, irq, 0,
+VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
 qemu_set_fd_handler(fd, NULL, NULL, vcdev);
 event_notifier_cleanup(notifier);
 }
@@ -464,8 +464,8 @@ static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice 
*vcdev,
 return;
 }
 
-if (vfio_set_irq_signaling(>vdev, irq, 0,
-   VFIO_IRQ_SET_ACTION_TRIGGER, -1, )) {
+if (!vfio_set_irq_signaling(>vdev, irq, 0,
+VFIO_IRQ_SET_ACTION_TRIGGER, -1, )) {
 warn_reportf_err(err, VFIO_MSG_PREFIX, vcdev->vdev.name);
 }
 
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 1f3bdd9bf0..9edbc96688 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -107,12 +107,12 @@ static const char *index_to_str(VFIODevice *vbasedev, int 
index)
 }
 }
 
-int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
-   int action, int fd, Error **errp)
+bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
+int action, int fd, Error **errp)
 {
 ERRP_GUARD();
 g_autofree struct vfio_irq_set *irq_set = NULL;
-int argsz, ret = 0;
+int argsz;
 const char *name;
 int32_t *pfd;
 
@@ -127,15 +127,11 @@ int vfio_set_irq_signaling(VFIODevice *vbasedev, int 
index, int subindex,
 pfd = (int32_t *)_set->data;
 *pfd = fd;
 
-if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
-ret = -errno;
+if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
+return true;
 }
 
-if (!ret) {
-return 0;
-}
-
-error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");
+error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure");
 
 name = index_to_str(vbasedev, index);
 if (name) {
@@ -146,7 +142,7 @@ int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, 
int subindex,
 error_prepend(er

[PATCH v2 20/20] vfio/ccw: Fix the missed unrealize() call in error path

2024-05-21 Thread Zhenzhong Duan
When getting the device name fails, we should call unrealize() so that
vfio_ccw_realize() is self-contained.

Fixes: 909a6254eda ("vfio/ccw: Make vfio cdev pre-openable by passing a file handle")
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/ccw.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 168c9e5973..161704cd7b 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -589,7 +589,7 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
 }
 
 if (!vfio_device_get_name(vbasedev, errp)) {
-return;
+goto out_unrealize;
 }
 
 if (!vfio_attach_device(cdev->mdevid, vbasedev,
@@ -633,6 +633,7 @@ out_region_err:
 vfio_detach_device(vbasedev);
 out_attach_dev_err:
 g_free(vbasedev->name);
+out_unrealize:
 if (cdc->unrealize) {
 cdc->unrealize(cdev);
 }
-- 
2.34.1




[PATCH v2 19/20] vfio/ccw: Drop local @err in vfio_ccw_realize()

2024-05-21 Thread Zhenzhong Duan
Use @errp to fetch error information directly and drop the local
variable @err.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/ccw.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 2600e62e37..168c9e5973 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -574,17 +574,17 @@ static void vfio_ccw_put_region(VFIOCCWDevice *vcdev)
 
 static void vfio_ccw_realize(DeviceState *dev, Error **errp)
 {
+ERRP_GUARD();
 S390CCWDevice *cdev = S390_CCW_DEVICE(dev);
 VFIOCCWDevice *vcdev = VFIO_CCW(cdev);
 S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
 VFIODevice *vbasedev = >vdev;
-Error *err = NULL;
 
 /* Call the class init function for subchannel. */
 if (cdc->realize) {
-cdc->realize(cdev, vcdev->vdev.sysfsdev, );
-if (err) {
-goto out_err_propagate;
+cdc->realize(cdev, vcdev->vdev.sysfsdev, errp);
+if (*errp) {
+return;
 }
 }
 
@@ -597,27 +597,28 @@ static void vfio_ccw_realize(DeviceState *dev, Error 
**errp)
 goto out_attach_dev_err;
 }
 
-if (!vfio_ccw_get_region(vcdev, )) {
+if (!vfio_ccw_get_region(vcdev, errp)) {
 goto out_region_err;
 }
 
-if (!vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX, )) {
+if (!vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX, errp)) {
 goto out_io_notifier_err;
 }
 
 if (vcdev->crw_region) {
 if (!vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX,
-)) {
+errp)) {
 goto out_irq_notifier_err;
 }
 }
 
-if (!vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_REQ_IRQ_INDEX, )) {
+if (!vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_REQ_IRQ_INDEX, errp)) {
 /*
  * Report this error, but do not make it a failing condition.
  * Lack of this IRQ in the host does not prevent normal operation.
  */
-error_report_err(err);
+error_report_err(*errp);
+*errp = NULL;
 }
 
 return;
@@ -635,8 +636,6 @@ out_attach_dev_err:
 if (cdc->unrealize) {
 cdc->unrealize(cdev);
 }
-out_err_propagate:
-error_propagate(errp, err);
 }
 
 static void vfio_ccw_unrealize(DeviceState *dev)
-- 
2.34.1




[PATCH v2 11/20] vfio/pci: Make vfio_intx_enable() return bool

2024-05-21 Thread Zhenzhong Duan
This is to follow the coding standard in qapi/error.h to return bool
for bool-valued functions.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 hw/vfio/pci.c | 19 ---
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 46d3c61859..7f35cb8a29 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -261,7 +261,7 @@ static void vfio_irqchip_change(Notifier *notify, void 
*data)
 vfio_intx_update(vdev, >intx.route);
 }
 
-static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
+static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
 {
 uint8_t pin = vfio_pci_read_config(>pdev, PCI_INTERRUPT_PIN, 1);
 Error *err = NULL;
@@ -270,7 +270,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
**errp)
 
 
 if (!pin) {
-return 0;
+return true;
 }
 
 vfio_disable_interrupts(vdev);
@@ -292,7 +292,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
**errp)
 ret = event_notifier_init(>intx.interrupt, 0);
 if (ret) {
 error_setg_errno(errp, -ret, "event_notifier_init failed");
-return ret;
+return false;
 }
 fd = event_notifier_get_fd(>intx.interrupt);
 qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
@@ -301,7 +301,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
**errp)
 VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
 qemu_set_fd_handler(fd, NULL, NULL, vdev);
 event_notifier_cleanup(>intx.interrupt);
-return -errno;
+return false;
 }
 
 if (!vfio_intx_enable_kvm(vdev, )) {
@@ -311,7 +311,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
**errp)
 vdev->interrupt = VFIO_INT_INTx;
 
 trace_vfio_intx_enable(vdev->vbasedev.name);
-return 0;
+return true;
 }
 
 static void vfio_intx_disable(VFIOPCIDevice *vdev)
@@ -836,8 +836,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
 vfio_disable_irqindex(>vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 
 vfio_msi_disable_common(vdev);
-vfio_intx_enable(vdev, );
-if (err) {
+if (!vfio_intx_enable(vdev, )) {
 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 }
 
@@ -2450,8 +2449,7 @@ void vfio_pci_post_reset(VFIOPCIDevice *vdev)
 Error *err = NULL;
 int nr;
 
-vfio_intx_enable(vdev, );
-if (err) {
+if (!vfio_intx_enable(vdev, )) {
 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 }
 
@@ -3194,8 +3192,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
  vfio_intx_routing_notifier);
 vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
 kvm_irqchip_add_change_notifier(>irqchip_change_notifier);
-ret = vfio_intx_enable(vdev, errp);
-if (ret) {
+if (!vfio_intx_enable(vdev, errp)) {
 goto out_deregister;
 }
 }
-- 
2.34.1




[PATCH v2 12/20] vfio/pci: Make vfio_populate_vga() return bool

2024-05-21 Thread Zhenzhong Duan
This is to follow the coding standard in qapi/error.h to return bool
for bool-valued functions.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 hw/vfio/pci.h |  2 +-
 hw/vfio/igd.c |  2 +-
 hw/vfio/pci.c | 11 +--
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index a5ac9efd4b..7914f019d5 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -225,7 +225,7 @@ bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const 
char *name);
 int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
 struct vfio_pci_hot_reset_info **info_p);
 
-int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp);
+bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp);
 
 int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
struct vfio_region_info *info,
diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c
index b31ee79c60..ffe57c5954 100644
--- a/hw/vfio/igd.c
+++ b/hw/vfio/igd.c
@@ -478,7 +478,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
  * try to enable it.  Probably shouldn't be using legacy mode without VGA,
  * but also no point in us enabling VGA if disabled in hardware.
  */
-if (!(gmch & 0x2) && !vdev->vga && vfio_populate_vga(vdev, )) {
+if (!(gmch & 0x2) && !vdev->vga && !vfio_populate_vga(vdev, )) {
 error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 error_report("IGD device %s failed to enable VGA access, "
  "legacy mode disabled", vdev->vbasedev.name);
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 7f35cb8a29..ab8f74299e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2670,7 +2670,7 @@ static VFIODeviceOps vfio_pci_ops = {
 .vfio_load_config = vfio_pci_load_config,
 };
 
-int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
+bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
 {
 VFIODevice *vbasedev = >vbasedev;
 struct vfio_region_info *reg_info;
@@ -2681,7 +2681,7 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
 error_setg_errno(errp, -ret,
  "failed getting region info for VGA region index %d",
  VFIO_PCI_VGA_REGION_INDEX);
-return ret;
+return false;
 }
 
 if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
@@ -2691,7 +2691,7 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
(unsigned long)reg_info->flags,
(unsigned long)reg_info->size);
 g_free(reg_info);
-return -EINVAL;
+return false;
 }
 
 vdev->vga = g_new0(VFIOVGA, 1);
@@ -2735,7 +2735,7 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
  >vga->region[QEMU_PCI_VGA_IO_LO].mem,
  >vga->region[QEMU_PCI_VGA_IO_HI].mem);
 
-return 0;
+return true;
 }
 
 static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
@@ -2798,8 +2798,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, 
Error **errp)
 g_free(reg_info);
 
 if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
-ret = vfio_populate_vga(vdev, errp);
-if (ret) {
+if (!vfio_populate_vga(vdev, errp)) {
 error_append_hint(errp, "device does not support "
   "requested feature x-vga\n");
 return false;
-- 
2.34.1




  1   2   3   4   5   6   7   >