From: Manish Honap <[email protected]>

Wire the CXL DPA range up as a VFIO demand-paged region so QEMU can
mmap guest device memory directly. Faults call vmf_insert_pfn() to
insert one PFN at a time rather than mapping the full range upfront.

CXL region lifecycle:
- The CXL memory region is registered with VFIO layer during
  vfio_pci_open_device
- mmap() establishes the VMA with vm_ops but inserts no PTEs
- Each guest page fault calls vfio_cxl_region_page_fault() which
  inserts a single PFN under the memory_lock read side
- On device reset, vfio_cxl_zap_region_locked() sets region_active=false
  and calls unmap_mapping_range() to invalidate all DPA PTEs atomically
  while holding memory_lock for writing
- Faults racing with reset see region_active==false and return
  VM_FAULT_SIGBUS
- vfio_cxl_reactivate_region() restores region_active after successful
  hardware reset

Also integrate the zap/reactivate calls into vfio_pci_ioctl_reset() so
that FLR correctly invalidates DPA mappings and restores them on success.

Co-developed-by: Zhi Wang <[email protected]>
Signed-off-by: Zhi Wang <[email protected]>
Signed-off-by: Manish Honap <[email protected]>
---
 drivers/vfio/pci/cxl/vfio_cxl_core.c | 187 +++++++++++++++++++++++++++
 drivers/vfio/pci/cxl/vfio_cxl_emu.c  |   2 +-
 drivers/vfio/pci/cxl/vfio_cxl_priv.h |   3 +
 drivers/vfio/pci/vfio_pci_core.c     |  11 ++
 drivers/vfio/pci/vfio_pci_priv.h     |   6 +
 5 files changed, 208 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index 30b365b91903..19d3dc205f99 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -435,4 +435,191 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
        vfio_cxl_destroy_cxl_region(cxl);
 }
 
+/*
+ * vfio_cxl_region_vm_fault - demand-fault one page of the DPA region.
+ *
+ * Inserts a single PFN for the faulting address; the VMA carries no
+ * PTEs until touched.  Faults racing a reset observe
+ * region_active == false and return VM_FAULT_SIGBUS.
+ *
+ * NOTE(review): the commit message says the PFN is inserted "under the
+ * memory_lock read side", but no lock is taken here.  A fault that
+ * passes the region_active check just before the zap path clears it
+ * could insert a PFN after the PTE teardown has run, leaving a stale
+ * mapping across reset -- confirm the intended serialization.
+ */
+static vm_fault_t vfio_cxl_region_vm_fault(struct vm_fault *vmf)
+{
+       struct vfio_pci_region *region = vmf->vma->vm_private_data;
+       struct vfio_pci_cxl_state *cxl = region->data;
+       unsigned long pgoff;
+       unsigned long pfn;
+
+       /* Reset/release paths clear this before tearing down mappings. */
+       if (!READ_ONCE(cxl->region_active))
+               return VM_FAULT_SIGBUS;
+
+       /* Page offset within this region (strip the region-index bits). */
+       pgoff = vmf->pgoff &
+               ((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+       if (pgoff >= (cxl->region_size >> PAGE_SHIFT))
+               return VM_FAULT_SIGBUS;
+
+       pfn = PHYS_PFN(cxl->region_hpa) + pgoff;
+
+       return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
+}
+
+static const struct vm_operations_struct vfio_cxl_region_vm_ops = {
+       .fault = vfio_cxl_region_vm_fault,
+};
+
+/*
+ * vfio_cxl_region_mmap - set up a demand-paged mmap of the CXL DPA range.
+ *
+ * Validates the region flags, requested protections and span, then
+ * installs vfio_cxl_region_vm_ops without inserting any PTEs; pages are
+ * populated one PFN at a time by vfio_cxl_region_vm_fault().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev,
+                               struct vfio_pci_region *region,
+                               struct vm_area_struct *vma)
+{
+       struct vfio_pci_cxl_state *cxl = vdev->cxl;
+       u64 req_len, pgoff, end;
+
+       if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+               return -EINVAL;
+
+       if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
+           (vma->vm_flags & VM_READ))
+               return -EPERM;
+
+       if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
+           (vma->vm_flags & VM_WRITE))
+               return -EPERM;
+
+       /* Same mask as vfio_cxl_region_vm_fault(); use 1UL there too. */
+       pgoff = vma->vm_pgoff &
+               ((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+       if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
+           check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
+               return -EOVERFLOW;
+
+       if (end > cxl->region_size)
+               return -EINVAL;
+
+       vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+       vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+                    VM_DONTEXPAND | VM_DONTDUMP);
+
+       vma->vm_ops = &vfio_cxl_region_vm_ops;
+       vma->vm_private_data = region;
+
+       return 0;
+}
+
+/*
+ * vfio_cxl_zap_region_locked - Invalidate all DPA region PTEs.
+ *
+ * Must be called with vdev->memory_lock held for writing.  Sets
+ * region_active=false first so racing faults and region I/O observe the
+ * inactive state, then removes every user PTE so stale HPA translations
+ * cannot outlive the reset.
+ */
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev)
+{
+       struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+       lockdep_assert_held_write(&vdev->memory_lock);
+
+       if (!cxl)
+               return;
+
+       WRITE_ONCE(cxl->region_active, false);
+
+       /*
+        * Actually zap.  Clearing the flag only stops new faults; PTEs
+        * already inserted must be torn down or the guest keeps access
+        * to pre-reset DPA mappings.  The CXL region's device-fd offset
+        * is dynamic (index >= VFIO_PCI_NUM_REGIONS), so unmap the whole
+        * mapping; BAR PTEs were already zapped by
+        * vfio_pci_zap_and_down_write_memory_lock() and re-fault cleanly.
+        */
+       unmap_mapping_range(vdev->vdev.inode->i_mapping, 0, 0, true);
+}
+
+/*
+ * vfio_cxl_reactivate_region - Re-enable DPA region after successful reset.
+ *
+ * Must be called with vdev->memory_lock held for writing.  Re-reads the
+ * HDM decoder state from hardware (FLR cleared it) and sets region_active
+ * so that subsequent I/O to the region is permitted again.
+ *
+ * NOTE(review): when cxl->precommitted is false this function never sets
+ * region_active back to true -- confirm which path re-enables the region
+ * in that case, otherwise guest faults SIGBUS forever after a reset.
+ */
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev)
+{
+       struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+       lockdep_assert_held_write(&vdev->memory_lock);
+
+       if (!cxl)
+               return;
+       /*
+        * Re-initialise the emulated HDM comp_reg_virt[] from hardware.
+        * After FLR the decoder registers read as zero; mirror that in
+        * the emulated state so QEMU sees a clean slate.
+        */
+       vfio_cxl_reinit_comp_regs(cxl);
+
+       /*
+        * Only re-enable the DPA mmap if the hardware has actually
+        * re-committed decoder 0 after FLR.  Read the COMMITTED bit from the
+        * freshly-re-snapshotted comp_reg_virt[] so we check the post-FLR
+        * hardware state, not stale pre-reset state.
+        *
+        * If COMMITTED is 0 (slow firmware re-commit path), leave
+        * region_active=false.  Guest faults will return VM_FAULT_SIGBUS
+        * until the decoder is re-committed and the region is re-enabled.
+        */
+       if (cxl->precommitted && cxl->comp_reg_virt) {
+               /*
+                * Read CTRL via the full CXL.mem-relative index: hdm_reg_offset
+                * (now CXL.mem-relative) plus the within-HDM-block offset.
+                */
+               u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl,
+                                           CXL_HDM_DECODER0_CTRL_OFFSET(0)));
+
+               if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)
+                       WRITE_ONCE(cxl->region_active, true);
+       }
+}
+
+/*
+ * vfio_cxl_region_rw - read/write the DPA region via the kernel mapping.
+ *
+ * Copies between userspace and the memremap'd region_vaddr window.
+ * Returns bytes transferred, 0 for zero count or access past the end,
+ * or a negative errno.
+ *
+ * NOTE(review): region_active/region_vaddr are checked without taking
+ * memory_lock; verify the vfio core serializes region rw against
+ * release/reset, otherwise the window could be torn down mid-copy.
+ */
+static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
+                                 char __user *buf, size_t count, loff_t *ppos,
+                                 bool iswrite)
+{
+       unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+       struct vfio_pci_cxl_state *cxl = core_dev->region[i].data;
+       loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+       if (!count || pos >= cxl->region_size)
+               return 0;
+
+       /*
+        * Guard against access after a failed reset (region_active=false)
+        * or a release race (region_vaddr=NULL).  Either condition means
+        * the memremap'd window is no longer valid; touching it would produce
+        * a Synchronous External Abort.  Return -EIO so the caller gets a
+        * clean error rather than a kernel oops.
+        */
+       if (!READ_ONCE(cxl->region_active) || !cxl->region_vaddr)
+               return -EIO;
+
+       /* Clamp to the remaining bytes of the region. */
+       count = min(count, (size_t)(cxl->region_size - pos));
+
+       if (iswrite) {
+               if (copy_from_user(cxl->region_vaddr + pos, buf, count))
+                       return -EFAULT;
+       } else {
+               if (copy_to_user(buf, cxl->region_vaddr + pos, count))
+                       return -EFAULT;
+       }
+
+       return count;
+}
+
+/*
+ * vfio_cxl_region_release - tear down the DPA region's kernel mapping.
+ *
+ * Called by the vfio core when the region is destroyed.  Deactivates
+ * the region, then unmaps the memremap'd window.
+ */
+static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
+                                   struct vfio_pci_region *region)
+{
+       struct vfio_pci_cxl_state *cxl = region->data;
+
+       /*
+        * Deactivate the region before removing user mappings so that any
+        * fault handler racing the release returns VM_FAULT_SIGBUS rather
+        * than inserting a PFN into an unmapped region.
+        */
+       WRITE_ONCE(cxl->region_active, false);
+
+       if (cxl->region_vaddr) {
+               memunmap(cxl->region_vaddr);
+               cxl->region_vaddr = NULL;
+       }
+}
+
+/* Region callbacks registered with the vfio-pci core for the DPA region. */
+static const struct vfio_pci_regops vfio_cxl_regops = {
+       .rw             = vfio_cxl_region_rw,
+       .mmap           = vfio_cxl_region_mmap,
+       .release        = vfio_cxl_region_release,
+};
+
 MODULE_IMPORT_NS("CXL");
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
index 11195e8c21d7..781328a79b43 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
@@ -33,7 +33,7 @@
  *     +0x1c: (reserved)
  */
 
-static inline __le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off)
+__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off)
 {
        /*
         * hdm_off is a byte offset within the HDM decoder block.
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
index 72a0d7d7e183..3458768445af 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -33,6 +33,7 @@ struct vfio_pci_cxl_state {
        u8                           comp_reg_bar;
        bool                         cache_capable;
        bool                         precommitted;
+       bool                         region_active;
 };
 
 /* Register access sizes */
@@ -96,4 +97,6 @@ int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl,
                               resource_size_t size);
 void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl);
 
+__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off);
+
 #endif /* __LINUX_VFIO_CXL_PRIV_H */
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index b7364178e23d..48e0274c19aa 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1223,6 +1223,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 
        vfio_pci_zap_and_down_write_memory_lock(vdev);
 
+       /* Zap CXL DPA region PTEs before hardware reset clears HDM state */
+       vfio_cxl_zap_region_locked(vdev);
+
        /*
         * This function can be invoked while the power state is non-D0. If
         * pci_try_reset_function() has been called while the power state is
@@ -1236,6 +1239,14 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 
        vfio_pci_dma_buf_move(vdev, true);
        ret = pci_try_reset_function(vdev->pdev);
+
+       /*
+        * Re-enable DPA region if reset succeeded; fault handler will
+        * re-insert PFNs on next access without requiring a new mmap.
+        */
+       if (!ret)
+               vfio_cxl_reactivate_region(vdev);
+
        if (__vfio_pci_memory_enabled(vdev))
                vfio_pci_dma_buf_move(vdev, false);
        up_write(&vdev->memory_lock);
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 1082ba43bafe..726063b6ff70 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -145,6 +145,8 @@ static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
 
 void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev);
 void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev);
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev);
 
 #else
 
@@ -152,6 +154,10 @@ static inline void
 vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { }
 static inline void
 vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { }
 
 #endif /* CONFIG_VFIO_CXL_CORE */
 
-- 
2.25.1


Reply via email to