From: Zhenzhong Duan <[email protected]>

On a system affected by ERRATA_772415, IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17
is reported by IOMMU_DEVICE_GET_HW_INFO. Due to this erratum, even a read-only
range mapped in the second-stage page table could still be written.

Reference from 4th Gen Intel Xeon Processor Scalable Family Specification
Update, Errata Details, SPR17.
Link: https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/
Backup: https://cdrdv2.intel.com/v1/dl/getContent/772415

The SPR17 details, copied from the link above:
"Problem: When remapping hardware is configured by system software in
scalable mode as Nested (PGTT=011b) and with PWSNP field Set in the
PASID-table-entry, it may Set Accessed bit and Dirty bit (and Extended
Access bit if enabled) in first-stage page-table entries even when
second-stage mappings indicate that corresponding first-stage page-table
is Read-Only.

Implication: Due to this erratum, pages mapped as Read-only in second-stage
page-tables may be modified by remapping hardware Access/Dirty bit updates.

Workaround: None identified. System software enabling nested translations
for a VM should ensure that there are no read-only pages in the
corresponding second-stage mappings."

Introduce a helper vfio_device_get_host_iommu_quirk_bypass_ro to check if
readonly mappings should be bypassed.

Signed-off-by: Zhenzhong Duan <[email protected]>
Link: 
https://lore.kernel.org/qemu-devel/[email protected]
Signed-off-by: Cédric Le Goater <[email protected]>
---
 docs/devel/vfio-iommufd.rst      |  9 +++++++++
 include/hw/vfio/vfio-container.h |  1 +
 include/hw/vfio/vfio-device.h    |  3 +++
 hw/vfio/device.c                 | 14 ++++++++++++++
 hw/vfio/iommufd.c                |  9 ++++++++-
 hw/vfio/listener.c               |  6 ++++--
 6 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/docs/devel/vfio-iommufd.rst b/docs/devel/vfio-iommufd.rst
index 
2d6e60dce1d38f1de136c3d65f3c396aef9e0805..6928b47643b876df51675e7607edca62435de139
 100644
--- a/docs/devel/vfio-iommufd.rst
+++ b/docs/devel/vfio-iommufd.rst
@@ -169,3 +169,12 @@ otherwise below error shows:
 .. code-block:: none
 
     qemu-system-x86_64: -device 
vfio-pci,host=0000:02:00.0,bus=bridge1,iommufd=iommufd0: vfio 0000:02:00.0: 
Failed to set vIOMMU: Host device downstream to a PCI bridge is unsupported 
when x-flts=on
+
+If the host IOMMU has ERRATA_772415_SPR17, running a guest with "intel_iommu=on,sm_off"
+is unsupported; kexec or reboot of the guest from "intel_iommu=on,sm_on" to
+"intel_iommu=on,sm_off" is also unsupported. Configure scalable mode off as
+below if it's not needed by the guest:
+
+.. code-block:: bash
+
+    -device intel-iommu,x-scalable-mode=off
diff --git a/include/hw/vfio/vfio-container.h b/include/hw/vfio/vfio-container.h
index 
9f6e8cedfc9541e84558d74bdb156e4963a68639..a7d5c5ed679a0338937ae02f37140d94720f6f11
 100644
--- a/include/hw/vfio/vfio-container.h
+++ b/include/hw/vfio/vfio-container.h
@@ -52,6 +52,7 @@ struct VFIOContainer {
     QLIST_HEAD(, VFIODevice) device_list;
     GList *iova_ranges;
     NotifierWithReturn cpr_reboot_notifier;
+    bool bypass_ro;
 };
 
 #define TYPE_VFIO_IOMMU "vfio-iommu"
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 
48d00c7bc47a2fd11a522a1ad09b051f16342545..f6f3d0e3786cf85553d75674828391e16f9fa250
 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -268,6 +268,9 @@ void vfio_device_prepare(VFIODevice *vbasedev, 
VFIOContainer *bcontainer,
 void vfio_device_unprepare(VFIODevice *vbasedev);
 
 bool vfio_device_get_viommu_flags_want_nesting(VFIODevice *vbasedev);
+bool vfio_device_get_host_iommu_quirk_bypass_ro(VFIODevice *vbasedev,
+                                                uint32_t type, void *caps,
+                                                uint32_t size);
 
 int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
                                 struct vfio_region_info **info);
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index 
3bab082322633f7cbd4295b4e91717c83fbb48da..086f20f6762a3a86f52bbab840ef67f603850a01
 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -533,6 +533,20 @@ bool vfio_device_get_viommu_flags_want_nesting(VFIODevice 
*vbasedev)
     return false;
 }
 
+bool vfio_device_get_host_iommu_quirk_bypass_ro(VFIODevice *vbasedev,
+                                                uint32_t type, void *caps,
+                                                uint32_t size)
+{
+    VFIOPCIDevice *vdev = vfio_pci_from_vfio_device(vbasedev);
+
+    if (vdev) {
+        return !!(pci_device_get_host_iommu_quirks(PCI_DEVICE(vdev), type,
+                                                   caps, size) &
+                  HOST_IOMMU_QUIRK_NESTING_PARENT_BYPASS_RO);
+    }
+    return false;
+}
+
 /*
  * Traditional ioctl() based io
  */
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 
0bf68620d2c9a7a5e21553b9cc275e627b73327f..2947e1b80f5213d2781a32cb669bf3b66b69a643
 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -351,6 +351,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice 
*vbasedev,
     VFIOContainer *bcontainer = VFIO_IOMMU(container);
     uint32_t type, flags = 0;
     uint64_t hw_caps;
+    VendorCaps caps;
     VFIOIOASHwpt *hwpt;
     uint32_t hwpt_id;
     int ret;
@@ -396,7 +397,8 @@ static bool iommufd_cdev_autodomains_get(VFIODevice 
*vbasedev,
      * instead.
      */
     if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev->devid,
-                                         &type, NULL, 0, &hw_caps, errp)) {
+                                         &type, &caps, sizeof(caps), &hw_caps,
+                                         errp)) {
         return false;
     }
 
@@ -411,6 +413,11 @@ static bool iommufd_cdev_autodomains_get(VFIODevice 
*vbasedev,
      */
     if (vfio_device_get_viommu_flags_want_nesting(vbasedev)) {
         flags |= IOMMU_HWPT_ALLOC_NEST_PARENT;
+
+        if (vfio_device_get_host_iommu_quirk_bypass_ro(vbasedev, type,
+                                                       &caps, sizeof(caps))) {
+            bcontainer->bypass_ro = true;
+        }
     }
 
     if (cpr_is_incoming()) {
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index 
f193468dee30354ea8c07e9bf2d89b4da42ab78a..8ba1cd255d146ab8055ab73c71eac640eafa1bdd
 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -502,7 +502,8 @@ void vfio_container_region_add(VFIOContainer *bcontainer,
     int ret;
     Error *err = NULL;
 
-    if (!vfio_listener_valid_section(section, false, "region_add")) {
+    if (!vfio_listener_valid_section(section, bcontainer->bypass_ro,
+                                     "region_add")) {
         return;
     }
 
@@ -668,7 +669,8 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
     int ret;
     bool try_unmap = true;
 
-    if (!vfio_listener_valid_section(section, false, "region_del")) {
+    if (!vfio_listener_valid_section(section, bcontainer->bypass_ro,
+                                     "region_del")) {
         return;
     }
 
-- 
2.52.0


Reply via email to