On Thu, Sep 14, 2023 at 8:49 PM Stefan Hajnoczi <stefa...@redhat.com> wrote:
>
> On Thu, Sep 07, 2023 at 06:04:07AM -0700, Mattias Nissler wrote:
> > When DMA memory can't be directly accessed, as is the case when
> > running the device model in a separate process without shareable DMA
> > file descriptors, bounce buffering is used.
> >
> > It is not uncommon for device models to request mapping of several DMA
> > regions at the same time. Examples include:
> >  * net devices, e.g. when transmitting a packet that is split across
> >    several TX descriptors (observed with igb)
> >  * USB host controllers, when handling a packet with multiple data TRBs
> >    (observed with xhci)
> >
> > Previously, qemu only provided a single bounce buffer per AddressSpace
> > and would fail DMA map requests while the buffer was already in use. In
> > turn, this would cause DMA failures that ultimately manifest as hardware
> > errors from the guest perspective.
> >
> > This change allocates DMA bounce buffers dynamically instead of
> > supporting only a single buffer. Thus, multiple DMA mappings work
> > correctly also when RAM can't be mmap()-ed.
> >
> > The total bounce buffer allocation size is limited individually for each
> > AddressSpace. The default limit is 4096 bytes, matching the previous
> > maximum buffer size. A new x-max-bounce-buffer-size parameter is
> > provided to configure the limit for PCI devices.
> >
> > Signed-off-by: Mattias Nissler <mniss...@rivosinc.com>
> > ---
> >  hw/pci/pci.c                |  8 ++++
> >  include/exec/memory.h       | 14 ++----
> >  include/hw/pci/pci_device.h |  3 ++
> >  softmmu/memory.c            |  3 +-
> >  softmmu/physmem.c           | 94 +++++++++++++++++++++++++------------
> >  5 files changed, 80 insertions(+), 42 deletions(-)
> >
> > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > index 881d774fb6..8c4541b394 100644
> > --- a/hw/pci/pci.c
> > +++ b/hw/pci/pci.c
> > @@ -85,6 +85,8 @@ static Property pci_props[] = {
> >                      QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
> >      DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
> >                      QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
> > +    DEFINE_PROP_SIZE("x-max-bounce-buffer-size", PCIDevice,
> > +                     max_bounce_buffer_size, 4096),
> >      DEFINE_PROP_END_OF_LIST()
> >  };
> >
> > @@ -1208,6 +1210,8 @@ static PCIDevice *do_pci_register_device(PCIDevice 
> > *pci_dev,
> >                         "bus master container", UINT64_MAX);
> >      address_space_init(&pci_dev->bus_master_as,
> >                         &pci_dev->bus_master_container_region, 
> > pci_dev->name);
> > +    pci_dev->bus_master_as.max_bounce_buffer_size =
> > +        pci_dev->max_bounce_buffer_size;
> >
> >      if (phase_check(PHASE_MACHINE_READY)) {
> >          pci_init_bus_master(pci_dev);
> > @@ -2664,6 +2668,10 @@ static void pci_device_class_init(ObjectClass 
> > *klass, void *data)
> >      k->unrealize = pci_qdev_unrealize;
> >      k->bus_type = TYPE_PCI_BUS;
> >      device_class_set_props(k, pci_props);
> > +    object_class_property_set_description(
> > +        klass, "x-max-bounce-buffer-size",
> > +        "Maximum buffer size allocated for bounce buffers used for mapped "
> > +        "access to indirect DMA memory");
> >  }
> >
> >  static void pci_device_class_base_init(ObjectClass *klass, void *data)
> > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > index 7d68936157..5577542b5e 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -1081,14 +1081,6 @@ typedef struct AddressSpaceMapClient {
> >      QLIST_ENTRY(AddressSpaceMapClient) link;
> >  } AddressSpaceMapClient;
> >
> > -typedef struct {
> > -    MemoryRegion *mr;
> > -    void *buffer;
> > -    hwaddr addr;
> > -    hwaddr len;
> > -    bool in_use;
> > -} BounceBuffer;
> > -
> >  /**
> >   * struct AddressSpace: describes a mapping of addresses to #MemoryRegion 
> > objects
> >   */
> > @@ -1106,8 +1098,10 @@ struct AddressSpace {
> >      QTAILQ_HEAD(, MemoryListener) listeners;
> >      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> >
> > -    /* Bounce buffer to use for this address space. */
> > -    BounceBuffer bounce;
> > +    /* Maximum DMA bounce buffer size used for indirect memory map 
> > requests */
> > +    uint64_t max_bounce_buffer_size;
> > +    /* Total size of bounce buffers currently allocated, atomically 
> > accessed */
> > +    uint64_t bounce_buffer_size;
> >      /* List of callbacks to invoke when buffers free up */
> >      QemuMutex map_client_list_lock;
> >      QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
> > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> > index d3dd0f64b2..f4027c5379 100644
> > --- a/include/hw/pci/pci_device.h
> > +++ b/include/hw/pci/pci_device.h
> > @@ -160,6 +160,9 @@ struct PCIDevice {
> >      /* ID of standby device in net_failover pair */
> >      char *failover_pair_id;
> >      uint32_t acpi_index;
> > +
> > +    /* Maximum DMA bounce buffer size used for indirect memory map 
> > requests */
> > +    uint64_t max_bounce_buffer_size;
> >  };
> >
> >  static inline int pci_intx(PCIDevice *pci_dev)
> > diff --git a/softmmu/memory.c b/softmmu/memory.c
> > index 5c9622c3d6..e02799359c 100644
> > --- a/softmmu/memory.c
> > +++ b/softmmu/memory.c
> > @@ -3105,7 +3105,8 @@ void address_space_init(AddressSpace *as, 
> > MemoryRegion *root, const char *name)
> >      as->ioeventfds = NULL;
> >      QTAILQ_INIT(&as->listeners);
> >      QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
> > -    as->bounce.in_use = false;
> > +    as->max_bounce_buffer_size = 4096;
> > +    as->bounce_buffer_size = 0;
> >      qemu_mutex_init(&as->map_client_list_lock);
> >      QLIST_INIT(&as->map_client_list);
> >      as->name = g_strdup(name ? name : "anonymous");
> > diff --git a/softmmu/physmem.c b/softmmu/physmem.c
> > index f40cc564b8..e3d1cf5fba 100644
> > --- a/softmmu/physmem.c
> > +++ b/softmmu/physmem.c
> > @@ -2926,6 +2926,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
> >                                       NULL, len, FLUSH_CACHE);
> >  }
> >
> > +/*
> > + * A magic value stored in the first 8 bytes of the bounce buffer struct. 
> > Used
> > + * to detect illegal pointers passed to address_space_unmap.
> > + */
> > +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
> > +
> > +typedef struct {
> > +    uint64_t magic;
> > +    MemoryRegion *mr;
> > +    hwaddr addr;
> > +    size_t len;
> > +    uint8_t buffer[];
> > +} BounceBuffer;
> > +
> >  static void
> >  address_space_unregister_map_client_do(AddressSpaceMapClient *client)
> >  {
> > @@ -2953,7 +2967,7 @@ void address_space_register_map_client(AddressSpace 
> > *as, QEMUBH *bh)
> >      QLIST_INSERT_HEAD(&as->map_client_list, client, link);
> >      /* Write map_client_list before reading bounce_buffer_size.  */
> >      smp_mb();
> > -    if (!qatomic_read(&as->bounce.in_use)) {
> > +    if (qatomic_read(&as->bounce_buffer_size) < 
> > as->max_bounce_buffer_size) {
> >          address_space_notify_map_clients_locked(as);
> >      }
> >      qemu_mutex_unlock(&as->map_client_list_lock);
> > @@ -3081,31 +3095,36 @@ void *address_space_map(AddressSpace *as,
> >      RCU_READ_LOCK_GUARD();
> >      fv = address_space_to_flatview(as);
> >      mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
> > +    memory_region_ref(mr);
> >
> >      if (!memory_access_is_direct(mr, is_write)) {
> > -        if (qatomic_xchg(&as->bounce.in_use, true)) {
> > +        size_t size = qatomic_add_fetch(&as->bounce_buffer_size, l);
> > +        if (size > as->max_bounce_buffer_size) {
> > +            size_t excess = size - as->max_bounce_buffer_size;
> > +            l -= excess;
> > +            qatomic_sub(&as->bounce_buffer_size, excess);
>
> I think two threads can race here. as->bounce_buffer_size will be
> corrupted (smaller than it should be) and l will be wrong as well. A
> cmpxchg loop would solve the race.

Ah, thanks for pointing this out. I think the case you have in mind is this:
1. Thread A bumps the size to be larger than the max
2. Thread B bumps the size further
3. Thread B now computes an incorrect excess and sutracts more than it added.

I should be good if I ensure that each thread will only subtract what
it has previously added to enforce the invariant that additions and
subtractions for each map/unmap pair cancel each other out. Let me
know if there's a reason to still prefer the cmpxchg loop.

>
> > +        }
> > +
> > +        if (l == 0) {
> >              *plen = 0;
> >              return NULL;
> >          }
> > -        /* Avoid unbounded allocations */
> > -        l = MIN(l, TARGET_PAGE_SIZE);
> > -        as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
> > -        as->bounce.addr = addr;
> > -        as->bounce.len = l;
> >
> > -        memory_region_ref(mr);
> > -        as->bounce.mr = mr;
> > +        BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
> > +        bounce->magic = BOUNCE_BUFFER_MAGIC;
> > +        bounce->mr = mr;
> > +        bounce->addr = addr;
> > +        bounce->len = l;
> > +
> >          if (!is_write) {
> >              flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
> > -                          as->bounce.buffer, l);
> > +                          bounce->buffer, l);
> >          }
> >
> >          *plen = l;
> > -        return as->bounce.buffer;
> > +        return bounce->buffer;
> >      }
> >
> > -
> > -    memory_region_ref(mr);
> >      *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
> >                                          l, is_write, attrs);
> >      fuzz_dma_read_cb(addr, *plen, mr);
> > @@ -3119,31 +3138,44 @@ void *address_space_map(AddressSpace *as,
> >  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
> >                           bool is_write, hwaddr access_len)
> >  {
> > -    if (buffer != as->bounce.buffer) {
> > -        MemoryRegion *mr;
> > -        ram_addr_t addr1;
> > +    MemoryRegion *mr;
> > +    ram_addr_t addr1;
> > +
> > +    mr = memory_region_from_host(buffer, &addr1);
> > +    if (mr == NULL) {
> > +        BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
> > +        if (bounce->magic != BOUNCE_BUFFER_MAGIC) {
> > +            error_report(
> > +                "Unmap request for %p, which neither corresponds to a 
> > memory "
> > +                "region, nor looks like a bounce buffer, ignoring!",
> > +                buffer);
> > +            return;
> > +        }
> >
> > -        mr = memory_region_from_host(buffer, &addr1);
> > -        assert(mr != NULL);
> >          if (is_write) {
> > -            invalidate_and_set_dirty(mr, addr1, access_len);
> > +            address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
> > +                                bounce->buffer, access_len);
> >          }
> > -        if (xen_enabled()) {
> > -            xen_invalidate_map_cache_entry(buffer);
> > +
> > +        memory_region_unref(bounce->mr);
> > +        uint64_t previous_buffer_size =
> > +            qatomic_fetch_sub(&as->bounce_buffer_size, bounce->len);
> > +        if (previous_buffer_size == as->max_bounce_buffer_size) {
> > +            /* Write bounce_buffer_size before reading map_client_list. */
> > +            smp_mb();
> > +            address_space_notify_map_clients(as);
> >          }
> > -        memory_region_unref(mr);
> > +        bounce->magic = ~BOUNCE_BUFFER_MAGIC;
> > +        g_free(bounce);
> >          return;
> >      }
> > +
> > +    if (xen_enabled()) {
> > +        xen_invalidate_map_cache_entry(buffer);
> > +    }
> >      if (is_write) {
> > -        address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
> > -                            as->bounce.buffer, access_len);
> > -    }
> > -    qemu_vfree(as->bounce.buffer);
> > -    as->bounce.buffer = NULL;
> > -    memory_region_unref(as->bounce.mr);
> > -    /* Clear in_use before reading map_client_list.  */
> > -    qatomic_set_mb(&as->bounce.in_use, false);
> > -    address_space_notify_map_clients(as);
> > +        invalidate_and_set_dirty(mr, addr1, access_len);
> > +    }
> >  }
> >
> >  void *cpu_physical_memory_map(hwaddr addr,
> > --
> > 2.34.1
> >

Reply via email to