> On Sep 20, 2023, at 4:06 AM, Mattias Nissler <mniss...@rivosinc.com> wrote:
>
> Wire up support for DMA for the case where the vfio-user client does not
> provide mmap()-able file descriptors, but DMA requests must be performed
> via the VFIO-user protocol. This installs an indirect memory region,
> which already works for pci_dma_{read,write}, and pci_dma_map works
> thanks to the existing DMA bounce buffering support.
>
> Note that while simple scenarios work with this patch, there's a known
> race condition in libvfio-user that will mess up the communication
> channel. See https://github.com/nutanix/libvfio-user/issues/279 for
> details as well as a proposed fix.
>
> Signed-off-by: Mattias Nissler <mniss...@rivosinc.com>
> ---
> hw/remote/trace-events | 2 +
> hw/remote/vfio-user-obj.c | 84 +++++++++++++++++++++++++++++++++++----
> 2 files changed, 79 insertions(+), 7 deletions(-)
>
> diff --git a/hw/remote/trace-events b/hw/remote/trace-events
> index 0d1b7d56a5..358a68fb34 100644
> --- a/hw/remote/trace-events
> +++ b/hw/remote/trace-events
> @@ -9,6 +9,8 @@ vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x -> 0x%x"
> vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x <- 0x%x"
> vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes"
> vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
> +vfu_dma_read(uint64_t gpa, size_t len) "vfu: DMA read 0x%"PRIx64", %zu bytes"
> +vfu_dma_write(uint64_t gpa, size_t len) "vfu: DMA write 0x%"PRIx64", %zu bytes"
> vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
> vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
> vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> index 8b10c32a3c..6a561f7969 100644
> --- a/hw/remote/vfio-user-obj.c
> +++ b/hw/remote/vfio-user-obj.c
> @@ -300,6 +300,63 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
> return count;
> }
>
> +static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
> + unsigned size, MemTxAttrs attrs)
> +{
> + MemoryRegion *region = opaque;
> + vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
> + uint8_t buf[sizeof(uint64_t)];
> +
> + trace_vfu_dma_read(region->addr + addr, size);
> +
> + g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
> + vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> + if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
> + vfu_sgl_read(vfu_ctx, sg, 1, buf) != 0) {
> + return MEMTX_ERROR;
> + }
> +
> + *val = ldn_he_p(buf, size);
> +
> + return MEMTX_OK;
> +}
> +
> +static MemTxResult vfu_dma_write(void *opaque, hwaddr addr, uint64_t val,
> + unsigned size, MemTxAttrs attrs)
> +{
> + MemoryRegion *region = opaque;
> + vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
> + uint8_t buf[sizeof(uint64_t)];
> +
> + trace_vfu_dma_write(region->addr + addr, size);
> +
> + stn_he_p(buf, size, val);
> +
> + g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
> + vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> + if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_WRITE) < 0 ||
> + vfu_sgl_write(vfu_ctx, sg, 1, buf) != 0) {
> + return MEMTX_ERROR;
> + }
> +
> + return MEMTX_OK;
> +}
> +
> +static const MemoryRegionOps vfu_dma_ops = {
> + .read_with_attrs = vfu_dma_read,
> + .write_with_attrs = vfu_dma_write,
> + .endianness = DEVICE_HOST_ENDIAN,
> + .valid = {
> + .min_access_size = 1,
> + .max_access_size = 8,
> + .unaligned = true,
> + },
> + .impl = {
> + .min_access_size = 1,
> + .max_access_size = 8,
> + },
> +};
> +
> static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
> {
> VfuObject *o = vfu_get_private(vfu_ctx);
> @@ -308,17 +365,30 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
> g_autofree char *name = NULL;
> struct iovec *iov = &info->iova;
>
> - if (!info->vaddr) {
> - return;
> - }
> -
> name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
> - (uint64_t)info->vaddr);
> + (uint64_t)iov->iov_base);
>
> subregion = g_new0(MemoryRegion, 1);
>
> - memory_region_init_ram_ptr(subregion, NULL, name,
> - iov->iov_len, info->vaddr);
> + if (info->vaddr) {
> + memory_region_init_ram_ptr(subregion, OBJECT(o), name,
> + iov->iov_len, info->vaddr);
> + } else {
> + /*
> + * Note that I/O regions' MemoryRegionOps handle accesses of at most 8
> + * bytes at a time, and larger accesses are broken down. However,
> + * many/most DMA accesses are larger than 8 bytes and VFIO-user can
> + * handle large DMA accesses just fine, thus this size restriction
> + * unnecessarily hurts performance, in particular given that each
> + * access causes a round trip on the VFIO-user socket.
> + *
> + * TODO: Investigate how to plumb larger accesses through memory
> + * regions, possibly by amending MemoryRegionOps or by creating a new
> + * memory region type.
> + */
> + memory_region_init_io(subregion, OBJECT(o), &vfu_dma_ops, subregion,
> + name, iov->iov_len);
Hi Mattias,

We should update dma_unregister() so that it also removes this subregion.
At present, dma_unregister() removes only the RAM region, not this I/O region.
--
Jag
> + }
>
> dma_as = pci_device_iommu_address_space(o->pci_dev);
>
> --
> 2.34.1
>