Hi Gavin,

On 12/6/26 13:03, Gavin Shan wrote:
All ram device regions was turned to be indirectly accessible by commit
4a2e242bbb ("memory: Don't use memcpy for ram_device regions"). This leads
to guest hang on compiling 'cuda-samples' as reported by Julia. The guest
is started by the following command lines, with a GH100 GPU card.

    host$ lspci | grep GH100
    0009:01:00.0 3D controller: NVIDIA Corporation GH100 [GH200 120GB / 480GB] 
(rev a1)
    host$ /home/sandbox/gavin/qemu.main/build/qemu-system-aarch64            \
          -machine virt,gic-version=host,ras=on,highmem-mmio-size=4T         \
          -accel kvm -cpu host -smp cpus=48 -m size=8G                       \
          -drive file=/home/gavin/sandbox/images/disk.qcow2,if=none,id=d0    \
          -device virtio-blk-pci,id=vb0,bus=pcie.0,drive=d0,num-queues=4     \
          -device vfio-pci-nohotplug,host=0009:01:00.0,bus=pcie.1.0
            :
    guest$ cd cuda-samples/build
    guest$ make -j 20 clean
    guest$ make -j 20
            :
    [ 54%] Linking CUDA executable graphMemoryNodes
    [ 54%] Built target graphMemoryNodes
    <no more output afterwards, guest becomes frozen here>

    guest$ qemu-system-aarch64: virtio: bogus descriptor or out of resources
    [  555.814025] virtio_blk virtio0: [vda] new size: 268435456 512-byte 
logical blocks (137 GB/128 GiB)

When the GPU's driver (NVidia open driver) is loaded on guest bootup,
the memory blocks residing in the PCI BAR#4 can be presented to the
guest through memory hot-add. The page cache can be allocated from the
hot added memory blocks when cuda-samples is being compiled. Afterwards,
the page cache is sent to QEMU's virtio-blk device as part of the DMA
request, the bounce buffer has to be used to accomodate the request as
the corresponding memory region (MemoryRegion) is a RAM DEVICE region
and indirectly accessible in qemu. However, the max bounce bufer size
is only 4096 bytes by default. We're running out of that space quickly.

   QEMU
   ====
   virtio_blk_handle_output
     virtio_blk_handle_vq
       virtio_blk_get_request
         virtqueue_pop
           virtqueue_split_pop
             virtqueue_map_desc
               address_space_map
                 memory_access_is_direct         # Return false
                   memory_region_supports_direct_access

   (qemu) info mtree
   memory-region: pci_bridge_pci
     0000000000000000-ffffffffffffffff (prio 0, container): pci_bridge_pci
       0000042000000000-0000043fffffffff (prio 1, i/o): 0009:01:00.0 base BAR 4
         0000042000000000-0000043fffffffff (prio 0, i/o): 0009:01:00.0 BAR 4
           0000042000000000-000004379fffffff (prio 0, ramd): 0009:01:00.0 BAR 4 
mmaps[0]

This replaces mem{cpy, move} with __builtin_mem{cpy, move} in the memory
accessors to ram device memory region, preparatory work to make ram device
region directly accessible and bypass the bounce buffer in the DMA path
in next patch.

Reported-by: Julia Graham <[email protected]>
Suggested-by: Michael S. Tsirkin <[email protected]>
Suggested-by: Peter Xu <[email protected]>
Signed-off-by: Gavin Shan <[email protected]>
---
  hw/remote/vfio-user-obj.c |  4 ++--
  include/system/memory.h   | 42 ++++++++++++++++++++++++++++++++++++++-
  system/physmem.c          |  8 ++++----
  3 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 87fa7b6572..fe6f661fe2 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -375,9 +375,9 @@ static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, 
hwaddr offset,
          ram_ptr = memory_region_get_ram_ptr(mr);
if (is_write) {
-            memcpy((ram_ptr + offset), buf, size);
+            address_space_memcpy(ram_ptr + offset, buf, size);
          } else {
-            memcpy(buf, (ram_ptr + offset), size);
+            address_space_memcpy(buf, ram_ptr + offset, size);
          }
return 0;
diff --git a/include/system/memory.h b/include/system/memory.h
index 1417132f6d..6bb2e13eea 100644
--- a/include/system/memory.h
+++ b/include/system/memory.h
@@ -2938,6 +2938,46 @@ static inline bool memory_access_is_direct(const 
MemoryRegion *mr,
      return true;
  }
+static inline void address_space_memcpy(void *dest, const void *src, size_t n)

'address_space_' prefix for something that doesn't use neither
AddressSpace nor MemoryRegion is odd.

Maybe prefix 'qemu_ram_' or 'qemu_ram_ptr_' instead? (since the
address is returned by memory_region_get_ram_ptr)
Add the definitions in "system/ramblock.h" with that declaration?

+{
+    switch (n) {
+    case 1:
+        __builtin_memcpy(dest, src, 1);
+        break;
+    case 2:
+        __builtin_memcpy(dest, src, 2);
+        break;
+    case 4:
+        __builtin_memcpy(dest, src, 4);
+        break;
+    case 8:
+        __builtin_memcpy(dest, src, 8);
+        break;
+    default:
+        __builtin_memcpy(dest, src, n);
+    }
+}
+
+static inline void address_space_memmove(void *dest, const void *src, size_t n)
+{
+    switch (n) {
+    case 1:
+        __builtin_memmove(dest, src, 1);
+        break;
+    case 2:
+        __builtin_memmove(dest, src, 2);
+        break;
+    case 4:
+        __builtin_memmove(dest, src, 4);
+        break;
+    case 8:
+        __builtin_memmove(dest, src, 8);
+        break;
+    default:
+        __builtin_memmove(dest, src, n);
+    }
+}
+
  /**
   * address_space_read: read from an address space.
   *
@@ -2970,7 +3010,7 @@ MemTxResult address_space_read(AddressSpace *as, hwaddr 
addr,
              mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
              if (len == l && memory_access_is_direct(mr, false, attrs)) {
                  ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
-                memcpy(buf, ptr, len);
+                __builtin_memcpy(buf, ptr, len);
              } else {
                  result = flatview_read_continue(fv, addr, attrs, buf, len,
                                                  addr1, l, mr);
diff --git a/system/physmem.c b/system/physmem.c
index 7bcbf87573..5f46a9d676 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -3272,7 +3272,7 @@ static MemTxResult 
flatview_write_continue_step(MemTxAttrs attrs,
          uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
                                                 false, true);
- memmove(ram_ptr, buf, *l);
+        address_space_memmove(ram_ptr, buf, *l);
          invalidate_and_set_dirty(mr, mr_addr, *l);
return MEMTX_OK;
@@ -3365,7 +3365,7 @@ static MemTxResult flatview_read_continue_step(MemTxAttrs 
attrs, uint8_t *buf,
          uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
                                                 false, false);
- memcpy(buf, ram_ptr, *l);
+        address_space_memcpy(buf, ram_ptr, *l);
return MEMTX_OK;
      }
@@ -3503,8 +3503,8 @@ MemTxResult address_space_write_rom(AddressSpace *as, 
hwaddr addr,
              l = memory_access_size(mr, l, addr1);
          } else {
              /* ROM/RAM case */
-            void *ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
-            memcpy(ram_ptr, buf, l);
+            address_space_memcpy(qemu_map_ram_ptr(mr->ram_block, addr1),
+                                 buf, l);
              invalidate_and_set_dirty(mr, addr1, l);
          }
          len -= l;


Reply via email to