The CXL Fixed Memory Window (CFMW) is registered with
memory_region_init_io() and has no backing ram_block.  As a result,
address_space_map() on a guest physical address inside the CFMW
takes the bounce-buffer path and is bounded by
DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4 KiB by default for the system
AddressSpace).  Once a Type-3 device is brought online as system RAM
(daxctl online-memory), the kernel happily allocates DMA buffers from
the CFMW range, and any virtio operation whose scatter list exceeds
4 KiB, or that overlaps with another in-flight transfer, fails with:

    qemu-system-riscv64: virtio: bogus descriptor or out of resources

The bug is not RISC-V specific: CFMW registration and the bounce
buffer limit are both arch-agnostic, so any guest that onlines CXL
memory and issues DMA larger than 4 KiB into it is affected.  It
shows up first on RISC-V virt because that is where the rest of
this series enables the daxctl + virtio path end-to-end.

Reproduce on RISC-V virt with cxl=on and a single Type-3 device:

    cxl create-region -m -t ram -d decoder0.0 -w 1 mem0 -s 4G
    daxctl online-memory dax0.0
    free -h    # triggers the error and stalls the guest

Fix it by overlaying a RAM alias of the device's memory backend
(hostvmem / hostpmem) at the committed HDM decoder's HPA range, with
higher priority than the CFMW I/O region.  flatview_translate() then
hits the alias, address_space_map() returns a direct host pointer,
and DMA proceeds without bouncing.  This mirrors the existing QEMU
pattern of PCI BAR and IOMMU MR overlays.  The alias is torn down in
hdm_decoder_uncommit() so that subsequent region tear-down and
re-creation work.

Signed-off-by: Chen Pei <[email protected]>
---
 hw/mem/cxl_type3.c          | 81 +++++++++++++++++++++++++++++++++++++
 include/hw/cxl/cxl_device.h |  4 ++
 2 files changed, 85 insertions(+)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 4739239da3..f962bce66a 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -24,6 +24,7 @@
 #include "qemu/module.h"
 #include "qemu/pmem.h"
 #include "qemu/range.h"
+#include "system/address-spaces.h"
 #include "qemu/rcu.h"
 #include "qemu/guest-random.h"
 #include "system/hostmem.h"
@@ -420,6 +421,11 @@ static void hdm_decoder_commit(CXLType3Dev *ct3d, int which)
     ComponentRegisters *cregs = &ct3d->cxl_cstate.crb;
     uint32_t *cache_mem = cregs->cache_mem_registers;
     uint32_t ctrl;
+    uint32_t low, high;
+    uint64_t decoder_base, decoder_size;
+    MemoryRegion *mr = NULL;
+    uint64_t dpa_offset = 0;
+    char *alias_name;
 
     ctrl = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL + which * hdm_inc);
     /* TODO: Sanity checks that the decoder is possible */
@@ -427,6 +433,73 @@ static void hdm_decoder_commit(CXLType3Dev *ct3d, int which)
     ctrl = FIELD_DP32(ctrl, CXL_HDM_DECODER0_CTRL, COMMITTED, 1);
 
     stl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL + which * hdm_inc, ctrl);
+
+    /*
+     * Create a RAM alias in system memory for the committed decoder range.
+     * This enables direct DMA mapping (address_space_map) for devices like
+     * virtio that need to DMA to/from CXL memory.  Without this, the CFMW
+     * I/O region would require bounce buffering which is limited to 4KB.
+     */
+    low = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_BASE_LO + which * hdm_inc);
+    high = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_BASE_HI + which * hdm_inc);
+    decoder_base = ((uint64_t)high << 32) | (low & 0xf0000000);
+
+    low = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_SIZE_LO + which * hdm_inc);
+    high = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_SIZE_HI + which * hdm_inc);
+    decoder_size = ((uint64_t)high << 32) | (low & 0xf0000000);
+
+    if (!decoder_base || !decoder_size) {
+        return;
+    }
+
+    /* Calculate DPA offset by summing sizes of preceding decoders */
+    for (int i = 0; i < which; i++) {
+        uint32_t prev_low, prev_high;
+        uint64_t prev_size;
+        uint32_t prev_ctrl;
+
+        prev_ctrl = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL +
+                             i * hdm_inc);
+        if (!FIELD_EX32(prev_ctrl, CXL_HDM_DECODER0_CTRL, COMMITTED)) {
+            continue;
+        }
+        prev_low = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_SIZE_LO +
+                            i * hdm_inc);
+        prev_high = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_SIZE_HI +
+                             i * hdm_inc);
+        prev_size = ((uint64_t)prev_high << 32) | (prev_low & 0xf0000000);
+        dpa_offset += prev_size;
+    }
+
+    /* Determine which memory backend to alias */
+    if (ct3d->hostvmem) {
+        MemoryRegion *vmr = host_memory_backend_get_memory(ct3d->hostvmem);
+        uint64_t vmr_size = memory_region_size(vmr);
+
+        if (dpa_offset < vmr_size) {
+            mr = vmr;
+        }
+    }
+    if (!mr && ct3d->hostpmem) {
+        MemoryRegion *pmr = host_memory_backend_get_memory(ct3d->hostpmem);
+        uint64_t vmr_size = ct3d->hostvmem ?
+            memory_region_size(
+                host_memory_backend_get_memory(ct3d->hostvmem)) : 0;
+        mr = pmr;
+        dpa_offset -= vmr_size;
+    }
+
+    if (!mr) {
+        return;
+    }
+
+    alias_name = g_strdup_printf("cxl-hdm%d-ram-alias", which);
+    memory_region_init_alias(&ct3d->hdm_ram_alias[which], OBJECT(ct3d),
+                             alias_name, mr, dpa_offset, decoder_size);
+    memory_region_add_subregion_overlap(get_system_memory(), decoder_base,
+                                        &ct3d->hdm_ram_alias[which], 1);
+    ct3d->hdm_ram_alias_valid[which] = true;
+    g_free(alias_name);
 }
 
 static void hdm_decoder_uncommit(CXLType3Dev *ct3d, int which)
@@ -442,6 +515,14 @@ static void hdm_decoder_uncommit(CXLType3Dev *ct3d, int which)
     ctrl = FIELD_DP32(ctrl, CXL_HDM_DECODER0_CTRL, COMMITTED, 0);
 
     stl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL + which * hdm_inc, ctrl);
+
+    /* Remove the RAM alias if it was added during commit */
+    if (ct3d->hdm_ram_alias_valid[which]) {
+        memory_region_del_subregion(get_system_memory(),
+                                    &ct3d->hdm_ram_alias[which]);
+        object_unparent(OBJECT(&ct3d->hdm_ram_alias[which]));
+        ct3d->hdm_ram_alias_valid[which] = false;
+    }
 }
 
 static int ct3d_qmp_uncor_err_to_cxl(CxlUncorErrorType qmp_err)
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index 393f312217..07deef2e2c 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -714,6 +714,10 @@ struct CXLType3Dev {
     /* State */
     AddressSpace hostvmem_as;
     AddressSpace hostpmem_as;
+
+    /* RAM aliases for HDM decoders - enables direct DMA mapping */
+    MemoryRegion hdm_ram_alias[CXL_HDM_DECODER_COUNT];
+    bool hdm_ram_alias_valid[CXL_HDM_DECODER_COUNT];
     CXLComponentState cxl_cstate;
     CXLDeviceState cxl_dstate;
     CXLCCI cci; /* Primary PCI mailbox CCI */
-- 
2.50.1


Reply via email to