On Fri, Jun 12, 2026 at 09:03:06PM +1000, Gavin Shan wrote:
> All ram device regions were made indirectly accessible by commit
> 4a2e242bbb ("memory: Don't use memcpy for ram_device regions"). This leads
> to guest hang on compiling 'cuda-samples' as reported by Julia. The guest
> is started by the following command lines, with a GH100 GPU card.
>
> host$ lspci | grep GH100
> 0009:01:00.0 3D controller: NVIDIA Corporation GH100 [GH200 120GB / 480GB]
> (rev a1)
> host$ /home/sandbox/gavin/qemu.main/build/qemu-system-aarch64 \
> -machine virt,gic-version=host,ras=on,highmem-mmio-size=4T \
> -accel kvm -cpu host -smp cpus=48 -m size=8G \
> -drive file=/home/gavin/sandbox/images/disk.qcow2,if=none,id=d0 \
> -device virtio-blk-pci,id=vb0,bus=pcie.0,drive=d0,num-queues=4 \
> -device vfio-pci-nohotplug,host=0009:01:00.0,bus=pcie.1.0
> :
> guest$ cd cuda-samples/build
> guest$ make -j 20 clean
> guest$ make -j 20
> :
> [ 54%] Linking CUDA executable graphMemoryNodes
> [ 54%] Built target graphMemoryNodes
> <no more output afterwards, guest becomes frozen here>
>
> guest$ qemu-system-aarch64: virtio: bogus descriptor or out of resources
> [ 555.814025] virtio_blk virtio0: [vda] new size: 268435456 512-byte
> logical blocks (137 GB/128 GiB)
>
> When the GPU's driver (NVidia open driver) is loaded on guest bootup,
> the memory blocks residing in the PCI BAR#4 can be presented to the
> guest through memory hot-add. The page cache can be allocated from the
> hot added memory blocks when cuda-samples is being compiled. Afterwards,
> the page cache is sent to QEMU's virtio-blk device as part of the DMA
> request, the bounce buffer has to be used to accommodate the request as
> the corresponding memory region (MemoryRegion) is a RAM DEVICE region
> and indirectly accessible in qemu. However, the max bounce buffer size
> is only 4096 bytes by default. We're running out of that space quickly.
>
> QEMU
> ====
> virtio_blk_handle_output
> virtio_blk_handle_vq
> virtio_blk_get_request
> virtqueue_pop
> virtqueue_split_pop
> virtqueue_map_desc
> address_space_map
> memory_access_is_direct # Return false
> memory_region_supports_direct_access
>
> (qemu) info mtree
> memory-region: pci_bridge_pci
> 0000000000000000-ffffffffffffffff (prio 0, container): pci_bridge_pci
> 0000042000000000-0000043fffffffff (prio 1, i/o): 0009:01:00.0 base BAR 4
> 0000042000000000-0000043fffffffff (prio 0, i/o): 0009:01:00.0 BAR 4
> 0000042000000000-000004379fffffff (prio 0, ramd): 0009:01:00.0 BAR
> 4 mmaps[0]
>
> This replaces mem{cpy, move} with __builtin_mem{cpy, move} in the memory
> accessors to ram device memory region, preparatory work to make ram device
> region directly accessible and bypass the bounce buffer in the DMA path
> in the next patch.
>
> Reported-by: Julia Graham <[email protected]>
> Suggested-by: Michael S. Tsirkin <[email protected]>
> Suggested-by: Peter Xu <[email protected]>
> Signed-off-by: Gavin Shan <[email protected]>
> ---
> hw/remote/vfio-user-obj.c | 4 ++--
> include/system/memory.h | 42 ++++++++++++++++++++++++++++++++++++++-
> system/physmem.c | 8 ++++----
> 3 files changed, 47 insertions(+), 7 deletions(-)
>
> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> index 87fa7b6572..fe6f661fe2 100644
> --- a/hw/remote/vfio-user-obj.c
> +++ b/hw/remote/vfio-user-obj.c
> @@ -375,9 +375,9 @@ static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t
> *buf, hwaddr offset,
> ram_ptr = memory_region_get_ram_ptr(mr);
>
> if (is_write) {
> - memcpy((ram_ptr + offset), buf, size);
> + address_space_memcpy(ram_ptr + offset, buf, size);
> } else {
> - memcpy(buf, (ram_ptr + offset), size);
> + address_space_memcpy(buf, ram_ptr + offset, size);
> }
>
> return 0;
> diff --git a/include/system/memory.h b/include/system/memory.h
> index 1417132f6d..6bb2e13eea 100644
> --- a/include/system/memory.h
> +++ b/include/system/memory.h
> @@ -2938,6 +2938,46 @@ static inline bool memory_access_is_direct(const
> MemoryRegion *mr,
> return true;
> }
>
> +static inline void address_space_memcpy(void *dest, const void *src, size_t
> n)
> +{
> + switch (n) {
> + case 1:
> + __builtin_memcpy(dest, src, 1);
> + break;
> + case 2:
> + __builtin_memcpy(dest, src, 2);
> + break;
> + case 4:
> + __builtin_memcpy(dest, src, 4);
> + break;
> + case 8:
> + __builtin_memcpy(dest, src, 8);
> + break;
> + default:
> + __builtin_memcpy(dest, src, n);
> + }
> +}
> +
> +static inline void address_space_memmove(void *dest, const void *src, size_t
> n)
> +{
> + switch (n) {
> + case 1:
> + __builtin_memmove(dest, src, 1);
> + break;
> + case 2:
> + __builtin_memmove(dest, src, 2);
> + break;
> + case 4:
> + __builtin_memmove(dest, src, 4);
> + break;
> + case 8:
> + __builtin_memmove(dest, src, 8);
> + break;
> + default:
> + __builtin_memmove(dest, src, n);
> + }
> +}
> +
> /**
> * address_space_read: read from an address space.
> *
The variable-length (default) case should probably use the regular
memcpy/memmove - there is no reason to bypass fortification for these.
> @@ -2970,7 +3010,7 @@ MemTxResult address_space_read(AddressSpace *as, hwaddr
> addr,
> mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
> if (len == l && memory_access_is_direct(mr, false, attrs)) {
> ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
> - memcpy(buf, ptr, len);
> + __builtin_memcpy(buf, ptr, len);
> } else {
> result = flatview_read_continue(fv, addr, attrs, buf, len,
> addr1, l, mr);
> diff --git a/system/physmem.c b/system/physmem.c
> index 7bcbf87573..5f46a9d676 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -3272,7 +3272,7 @@ static MemTxResult
> flatview_write_continue_step(MemTxAttrs attrs,
> uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
> false, true);
>
> - memmove(ram_ptr, buf, *l);
> + address_space_memmove(ram_ptr, buf, *l);
> invalidate_and_set_dirty(mr, mr_addr, *l);
>
> return MEMTX_OK;
> @@ -3365,7 +3365,7 @@ static MemTxResult
> flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf,
> uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
> false, false);
>
> - memcpy(buf, ram_ptr, *l);
> + address_space_memcpy(buf, ram_ptr, *l);
>
> return MEMTX_OK;
> }
> @@ -3503,8 +3503,8 @@ MemTxResult address_space_write_rom(AddressSpace *as,
> hwaddr addr,
> l = memory_access_size(mr, l, addr1);
> } else {
> /* ROM/RAM case */
> - void *ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
> - memcpy(ram_ptr, buf, l);
> + address_space_memcpy(qemu_map_ram_ptr(mr->ram_block, addr1),
> + buf, l);
> invalidate_and_set_dirty(mr, addr1, l);
> }
> len -= l;
> --
> 2.54.0