Hi Gavin,
On 12/6/26 13:03, Gavin Shan wrote:
All ram device regions were made indirectly accessible by commit
4a2e242bbb ("memory: Don't use memcpy for ram_device regions"). This leads
to guest hang on compiling 'cuda-samples' as reported by Julia. The guest
is started by the following command lines, with a GH100 GPU card.
host$ lspci | grep GH100
0009:01:00.0 3D controller: NVIDIA Corporation GH100 [GH200 120GB / 480GB]
(rev a1)
host$ /home/sandbox/gavin/qemu.main/build/qemu-system-aarch64 \
-machine virt,gic-version=host,ras=on,highmem-mmio-size=4T \
-accel kvm -cpu host -smp cpus=48 -m size=8G \
-drive file=/home/gavin/sandbox/images/disk.qcow2,if=none,id=d0 \
-device virtio-blk-pci,id=vb0,bus=pcie.0,drive=d0,num-queues=4 \
-device vfio-pci-nohotplug,host=0009:01:00.0,bus=pcie.1.0
:
guest$ cd cuda-samples/build
guest$ make -j 20 clean
guest$ make -j 20
:
[ 54%] Linking CUDA executable graphMemoryNodes
[ 54%] Built target graphMemoryNodes
<no more output afterwards, guest becomes frozen here>
guest$ qemu-system-aarch64: virtio: bogus descriptor or out of resources
[ 555.814025] virtio_blk virtio0: [vda] new size: 268435456 512-byte
logical blocks (137 GB/128 GiB)
When the GPU's driver (NVidia open driver) is loaded on guest bootup,
the memory blocks residing in the PCI BAR#4 can be presented to the
guest through memory hot-add. The page cache can be allocated from the
hot added memory blocks when cuda-samples is being compiled. Afterwards,
the page cache is sent to QEMU's virtio-blk device as part of the DMA
request. The bounce buffer has to be used to accommodate the request as
the corresponding memory region (MemoryRegion) is a RAM DEVICE region
and indirectly accessible in QEMU. However, the max bounce buffer size
is only 4096 bytes by default. We're running out of that space quickly.
QEMU
====
virtio_blk_handle_output
virtio_blk_handle_vq
virtio_blk_get_request
virtqueue_pop
virtqueue_split_pop
virtqueue_map_desc
address_space_map
memory_access_is_direct # Return false
memory_region_supports_direct_access
(qemu) info mtree
memory-region: pci_bridge_pci
0000000000000000-ffffffffffffffff (prio 0, container): pci_bridge_pci
0000042000000000-0000043fffffffff (prio 1, i/o): 0009:01:00.0 base BAR 4
0000042000000000-0000043fffffffff (prio 0, i/o): 0009:01:00.0 BAR 4
0000042000000000-000004379fffffff (prio 0, ramd): 0009:01:00.0 BAR 4
mmaps[0]
This replaces mem{cpy, move} with __builtin_mem{cpy, move} in the memory
accessors for ram device memory regions, as preparatory work to make ram
device regions directly accessible and bypass the bounce buffer in the DMA
path in the next patch.
Reported-by: Julia Graham <[email protected]>
Suggested-by: Michael S. Tsirkin <[email protected]>
Suggested-by: Peter Xu <[email protected]>
Signed-off-by: Gavin Shan <[email protected]>
---
hw/remote/vfio-user-obj.c | 4 ++--
include/system/memory.h | 42 ++++++++++++++++++++++++++++++++++++++-
system/physmem.c | 8 ++++----
3 files changed, 47 insertions(+), 7 deletions(-)
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 87fa7b6572..fe6f661fe2 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -375,9 +375,9 @@ static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf,
hwaddr offset,
ram_ptr = memory_region_get_ram_ptr(mr);
if (is_write) {
- memcpy((ram_ptr + offset), buf, size);
+ address_space_memcpy(ram_ptr + offset, buf, size);
} else {
- memcpy(buf, (ram_ptr + offset), size);
+ address_space_memcpy(buf, ram_ptr + offset, size);
}
return 0;
diff --git a/include/system/memory.h b/include/system/memory.h
index 1417132f6d..6bb2e13eea 100644
--- a/include/system/memory.h
+++ b/include/system/memory.h
@@ -2938,6 +2938,46 @@ static inline bool memory_access_is_direct(const
MemoryRegion *mr,
return true;
}
+static inline void address_space_memcpy(void *dest, const void *src, size_t n)
An 'address_space_' prefix for something that uses neither
AddressSpace nor MemoryRegion is odd.
Maybe prefix 'qemu_ram_' or 'qemu_ram_ptr_' instead? (since the
address is returned by memory_region_get_ram_ptr)
Add the definitions in "system/ramblock.h" with that declaration?
+{
+ switch (n) {
+ case 1:
+ __builtin_memcpy(dest, src, 1);
+ break;
+ case 2:
+ __builtin_memcpy(dest, src, 2);
+ break;
+ case 4:
+ __builtin_memcpy(dest, src, 4);
+ break;
+ case 8:
+ __builtin_memcpy(dest, src, 8);
+ break;
+ default:
+ __builtin_memcpy(dest, src, n);
+ }
+}
+
+static inline void address_space_memmove(void *dest, const void *src, size_t n)
+{
+ switch (n) {
+ case 1:
+ __builtin_memmove(dest, src, 1);
+ break;
+ case 2:
+ __builtin_memmove(dest, src, 2);
+ break;
+ case 4:
+ __builtin_memmove(dest, src, 4);
+ break;
+ case 8:
+ __builtin_memmove(dest, src, 8);
+ break;
+ default:
+ __builtin_memmove(dest, src, n);
+ }
+}
+
/**
* address_space_read: read from an address space.
*
@@ -2970,7 +3010,7 @@ MemTxResult address_space_read(AddressSpace *as, hwaddr
addr,
mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
if (len == l && memory_access_is_direct(mr, false, attrs)) {
ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
- memcpy(buf, ptr, len);
+ __builtin_memcpy(buf, ptr, len);
} else {
result = flatview_read_continue(fv, addr, attrs, buf, len,
addr1, l, mr);
diff --git a/system/physmem.c b/system/physmem.c
index 7bcbf87573..5f46a9d676 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -3272,7 +3272,7 @@ static MemTxResult
flatview_write_continue_step(MemTxAttrs attrs,
uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
false, true);
- memmove(ram_ptr, buf, *l);
+ address_space_memmove(ram_ptr, buf, *l);
invalidate_and_set_dirty(mr, mr_addr, *l);
return MEMTX_OK;
@@ -3365,7 +3365,7 @@ static MemTxResult flatview_read_continue_step(MemTxAttrs
attrs, uint8_t *buf,
uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
false, false);
- memcpy(buf, ram_ptr, *l);
+ address_space_memcpy(buf, ram_ptr, *l);
return MEMTX_OK;
}
@@ -3503,8 +3503,8 @@ MemTxResult address_space_write_rom(AddressSpace *as,
hwaddr addr,
l = memory_access_size(mr, l, addr1);
} else {
/* ROM/RAM case */
- void *ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
- memcpy(ram_ptr, buf, l);
+ address_space_memcpy(qemu_map_ram_ptr(mr->ram_block, addr1),
+ buf, l);
invalidate_and_set_dirty(mr, addr1, l);
}
len -= l;