On Mon, Feb 28, 2022 at 8:37 AM Jason Wang <jasow...@redhat.com> wrote: > > > 在 2022/2/27 下午9:41, Eugenio Pérez 写道: > > Use translations added in VhostIOVATree in SVQ. > > > > Only introduce usage here, not allocation and deallocation. As with > > previous patches, we use the dead code paths of shadow_vqs_enabled to > > avoid commiting too many changes at once. These are impossible to take > > at the moment. > > > > Signed-off-by: Eugenio Pérez <epere...@redhat.com> > > --- > > hw/virtio/vhost-shadow-virtqueue.h | 6 +- > > include/hw/virtio/vhost-vdpa.h | 3 + > > hw/virtio/vhost-shadow-virtqueue.c | 76 ++++++++++++++++- > > hw/virtio/vhost-vdpa.c | 128 ++++++++++++++++++++++++----- > > 4 files changed, 187 insertions(+), 26 deletions(-) > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.h > > b/hw/virtio/vhost-shadow-virtqueue.h > > index 04c67685fd..b2f722d101 100644 > > --- a/hw/virtio/vhost-shadow-virtqueue.h > > +++ b/hw/virtio/vhost-shadow-virtqueue.h > > @@ -13,6 +13,7 @@ > > #include "qemu/event_notifier.h" > > #include "hw/virtio/virtio.h" > > #include "standard-headers/linux/vhost_types.h" > > +#include "hw/virtio/vhost-iova-tree.h" > > > > /* Shadow virtqueue to relay notifications */ > > typedef struct VhostShadowVirtqueue { > > @@ -43,6 +44,9 @@ typedef struct VhostShadowVirtqueue { > > /* Virtio device */ > > VirtIODevice *vdev; > > > > + /* IOVA mapping */ > > + VhostIOVATree *iova_tree; > > + > > /* Map for use the guest's descriptors */ > > VirtQueueElement **ring_id_maps; > > > > @@ -78,7 +82,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, > > VirtIODevice *vdev, > > VirtQueue *vq); > > void vhost_svq_stop(VhostShadowVirtqueue *svq); > > > > -VhostShadowVirtqueue *vhost_svq_new(void); > > +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree); > > > > void vhost_svq_free(gpointer vq); > > G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); > > diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h > > index 009a9f3b6b..ee8e939ad0 100644 > > --- a/include/hw/virtio/vhost-vdpa.h > > +++ b/include/hw/virtio/vhost-vdpa.h > > @@ -14,6 +14,7 @@ > > > > #include <gmodule.h> > > > > +#include "hw/virtio/vhost-iova-tree.h" > > #include "hw/virtio/virtio.h" > > #include "standard-headers/linux/vhost_types.h" > > > > @@ -30,6 +31,8 @@ typedef struct vhost_vdpa { > > MemoryListener listener; > > struct vhost_vdpa_iova_range iova_range; > > bool shadow_vqs_enabled; > > + /* IOVA mapping used by the Shadow Virtqueue */ > > + VhostIOVATree *iova_tree; > > GPtrArray *shadow_vqs; > > struct vhost_dev *dev; > > VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c > > b/hw/virtio/vhost-shadow-virtqueue.c > > index a38d313755..7e073773d1 100644 > > --- a/hw/virtio/vhost-shadow-virtqueue.c > > +++ b/hw/virtio/vhost-shadow-virtqueue.c > > @@ -11,6 +11,7 @@ > > #include "hw/virtio/vhost-shadow-virtqueue.h" > > > > #include "qemu/error-report.h" > > +#include "qemu/log.h" > > #include "qemu/main-loop.h" > > #include "qemu/log.h" > > #include "linux-headers/linux/vhost.h" > > @@ -84,7 +85,58 @@ static void > > vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable) > > } > > } > > > > +/** > > + * Translate addresses between the qemu's virtual address and the SVQ IOVA > > + * > > + * @svq Shadow VirtQueue > > + * @vaddr Translated IOVA addresses > > + * @iovec Source qemu's VA addresses > > + * @num Length of iovec and minimum length of vaddr > > + */ > > +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, > > + void **addrs, const struct iovec > > *iovec, > > + size_t num) > > +{ > > + if (num == 0) { > > + return true; > > + } > > + > > + for (size_t i = 0; i < num; ++i) { > > + DMAMap needle = { > > + .translated_addr = (hwaddr)iovec[i].iov_base, > > + .size = iovec[i].iov_len, > > + }; > > + size_t off; > > + > > + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, > > &needle); > > + /* > > + * Map cannot be NULL since iova map contains all guest space and > > + * qemu already has a physical address mapped > > + */ > > + if (unlikely(!map)) { > > + qemu_log_mask(LOG_GUEST_ERROR, > > + "Invalid address 0x%"HWADDR_PRIx" given by > > guest", > > + needle.translated_addr); > > + return false; > > + } > > + > > + off = needle.translated_addr - map->translated_addr; > > + addrs[i] = (void *)(map->iova + off); > > + > > + if (unlikely(int128_gt(int128_add(needle.translated_addr, > > + iovec[i].iov_len), > > + map->translated_addr + map->size))) { > > + qemu_log_mask(LOG_GUEST_ERROR, > > + "Guest buffer expands over iova range"); > > + return false; > > + } > > + } > > + > > + return true; > > +} > > + > > static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, > > + void * const *vaddr_sg, > > > Nit: it looks to me we are not passing vaddr but iova here, so it might > be better to use "sg"? >
Sure, this is a leftover of pre-IOVA translations. I see better to write as you say. > > > const struct iovec *iovec, > > size_t num, bool more_descs, bool > > write) > > { > > @@ -103,7 +155,7 @@ static void > > vhost_vring_write_descs(VhostShadowVirtqueue *svq, > > } else { > > descs[i].flags = flags; > > } > > - descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); > > + descs[i].addr = cpu_to_le64((hwaddr)vaddr_sg[n]); > > descs[i].len = cpu_to_le32(iovec[n].iov_len); > > > > last = i; > > @@ -119,6 +171,8 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue > > *svq, > > { > > unsigned avail_idx; > > vring_avail_t *avail = svq->vring.avail; > > + bool ok; > > + g_autofree void **sgs = g_new(void *, MAX(elem->out_num, > > elem->in_num)); > > > > *head = svq->free_head; > > > > @@ -129,9 +183,20 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue > > *svq, > > return false; > > } > > > > - vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, > > + ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num); > > + if (unlikely(!ok)) { > > + return false; > > + } > > + vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, > > elem->in_num > 0, false); > > - vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); > > + > > + > > + ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num); > > + if (unlikely(!ok)) { > > + return false; > > + } > > + > > + vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, > > true); > > > > /* > > * Put the entry in the available array (but don't update avail->idx > > until > > @@ -514,11 +579,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) > > * Creates vhost shadow virtqueue, and instructs the vhost device to use > > the > > * shadow methods and file descriptors. > > * > > + * @iova_tree Tree to perform descriptors translations > > + * > > * Returns the new virtqueue or NULL. > > * > > * In case of error, reason is reported through error_report. > > */ > > -VhostShadowVirtqueue *vhost_svq_new(void) > > +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) > > { > > g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, > > 1); > > int r; > > @@ -539,6 +606,7 @@ VhostShadowVirtqueue *vhost_svq_new(void) > > > > event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); > > event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); > > + svq->iova_tree = iova_tree; > > return g_steal_pointer(&svq); > > > > err_init_hdev_call: > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c > > index 435b9c2e9e..56f9f125cd 100644 > > --- a/hw/virtio/vhost-vdpa.c > > +++ b/hw/virtio/vhost-vdpa.c > > @@ -209,6 +209,21 @@ static void > > vhost_vdpa_listener_region_add(MemoryListener *listener, > > vaddr, section->readonly); > > > > llsize = int128_sub(llend, int128_make64(iova)); > > + if (v->shadow_vqs_enabled) { > > + DMAMap mem_region = { > > + .translated_addr = (hwaddr)vaddr, > > + .size = int128_get64(llsize) - 1, > > + .perm = IOMMU_ACCESS_FLAG(true, section->readonly), > > + }; > > + > > + int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region); > > + if (unlikely(r != IOVA_OK)) { > > + error_report("Can't allocate a mapping (%d)", r); > > + goto fail; > > + } > > + > > + iova = mem_region.iova; > > + } > > > > vhost_vdpa_iotlb_batch_begin_once(v); > > ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize), > > @@ -261,6 +276,20 @@ static void > > vhost_vdpa_listener_region_del(MemoryListener *listener, > > > > llsize = int128_sub(llend, int128_make64(iova)); > > > > + if (v->shadow_vqs_enabled) { > > + const DMAMap *result; > > + const void *vaddr = memory_region_get_ram_ptr(section->mr) + > > + section->offset_within_region + > > + (iova - section->offset_within_address_space); > > + DMAMap mem_region = { > > + .translated_addr = (hwaddr)vaddr, > > + .size = int128_get64(llsize) - 1, > > + }; > > + > > + result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region); > > + iova = result->iova; > > + vhost_iova_tree_remove(v->iova_tree, &mem_region); > > + } > > vhost_vdpa_iotlb_batch_begin_once(v); > > ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize)); > > if (ret) { > > @@ -383,7 +412,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, > > struct vhost_vdpa *v, > > > > shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); > > for (unsigned n = 0; n < hdev->nvqs; ++n) { > > - g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); > > + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree); > > > > if (unlikely(!svq)) { > > error_setg(errp, "Cannot create svq %u", n); > > @@ -834,37 +863,78 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev > > *dev, > > /** > > * Unmap a SVQ area in the device > > */ > > -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, > > - hwaddr size) > > +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, > > + const DMAMap *needle) > > { > > + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle); > > + hwaddr size; > > int r; > > > > - size = ROUND_UP(size, qemu_real_host_page_size); > > - r = vhost_vdpa_dma_unmap(v, iova, size); > > + if (unlikely(!result)) { > > + error_report("Unable to find SVQ address to unmap"); > > + return false; > > + } > > + > > + size = ROUND_UP(result->size, qemu_real_host_page_size); > > + r = vhost_vdpa_dma_unmap(v, result->iova, size); > > return r == 0; > > } > > > > static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, > > const VhostShadowVirtqueue *svq) > > { > > + DMAMap needle; > > struct vhost_vdpa *v = dev->opaque; > > struct vhost_vring_addr svq_addr; > > - size_t device_size = vhost_svq_device_area_size(svq); > > - size_t driver_size = vhost_svq_driver_area_size(svq); > > bool ok; > > > > vhost_svq_get_vring_addr(svq, &svq_addr); > > > > - ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, > > driver_size); > > + needle = (DMAMap) { > > + .translated_addr = svq_addr.desc_user_addr, > > + }; > > > Let's simply initialize the member to zero during start of this function > then we can use needle->transalted_addr = XXX here. > Sure > > > + ok = vhost_vdpa_svq_unmap_ring(v, &needle); > > if (unlikely(!ok)) { > > return false; > > } > > > > - return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, > > device_size); > > + needle = (DMAMap) { > > + .translated_addr = svq_addr.used_user_addr, > > + }; > > + return vhost_vdpa_svq_unmap_ring(v, &needle); > > +} > > + > > +/** > > + * Map the SVQ area in the device > > + * > > + * @v Vhost-vdpa device > > + * @needle The area to search iova > > + * @errorp Error pointer > > + */ > > +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, > > + Error **errp) > > +{ > > + int r; > > + > > + r = vhost_iova_tree_map_alloc(v->iova_tree, needle); > > + if (unlikely(r != IOVA_OK)) { > > + error_setg(errp, "Cannot allocate iova (%d)", r); > > + return false; > > + } > > + > > + r = vhost_vdpa_dma_map(v, needle->iova, needle->size, > > + (void *)needle->translated_addr, > > + !(needle->perm & IOMMU_ACCESS_FLAG(0, 1))); > > > Let's simply use needle->perm == IOMMU_RO here? > The motivation to use this way is to be more resilient to the future. For example, if a new flag is added. But I'm totally ok with comparing with IOMMU_RO, I see that scenario unlikely at the moment. > > > + if (unlikely(r != 0)) { > > + error_setg_errno(errp, -r, "Cannot map region to device"); > > + vhost_iova_tree_remove(v->iova_tree, needle); > > + } > > + > > + return r == 0; > > } > > > > /** > > - * Map shadow virtqueue rings in device > > + * Map the shadow virtqueue rings in the device > > * > > * @dev The vhost device > > * @svq The shadow virtqueue > > @@ -876,28 +946,44 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev > > *dev, > > struct vhost_vring_addr *addr, > > Error **errp) > > { > > + DMAMap device_region, driver_region; > > + struct vhost_vring_addr svq_addr; > > struct vhost_vdpa *v = dev->opaque; > > size_t device_size = vhost_svq_device_area_size(svq); > > size_t driver_size = vhost_svq_driver_area_size(svq); > > - int r; > > + size_t avail_offset; > > + bool ok; > > > > ERRP_GUARD(); > > - vhost_svq_get_vring_addr(svq, addr); > > + vhost_svq_get_vring_addr(svq, &svq_addr); > > > > - r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, > > - (void *)addr->desc_user_addr, true); > > - if (unlikely(r != 0)) { > > - error_setg_errno(errp, -r, "Cannot create vq driver region: "); > > + driver_region = (DMAMap) { > > + .translated_addr = svq_addr.desc_user_addr, > > + .size = driver_size - 1, > > > Any reason for the "-1" here? I see several places do things like that, > it's probably hint of wrong API somehwere. > The "problem" is the api mismatch between _end and _last, to include the last member in the size or not. IOVA tree needs to use _end so we can allocate the last page in case of available range ending in (uint64_t)-1 [1]. But If we change vhost_svq_{device,driver}_area_size to make it inclusive, we need to use "+1" in calls like qemu_memalign and memset at vhost_svq_start. Probably in more places too QEMU's emulated Intel iommu code solves it using the address mask as the size, something that does not fit 100% with vhost devices since they can allocate an arbitrary address of arbitrary size when using vIOMMU. It's not a problem for vhost-vdpa at this moment since we make sure we expose unaligned and whole pages with vrings, but I feel it would only be to move the problem somewhere else. Thanks! [1] There are alternatives: to use Int128_t, etc. But I think it's better not to change that in this patch series. > Thanks > > > > + .perm = IOMMU_RO, > > + }; > > + ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); > > + if (unlikely(!ok)) { > > + error_prepend(errp, "Cannot create vq driver region: "); > > return false; > > } > > + addr->desc_user_addr = driver_region.iova; > > + avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr; > > + addr->avail_user_addr = driver_region.iova + avail_offset; > > > > - r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, > > - (void *)addr->used_user_addr, false); > > - if (unlikely(r != 0)) { > > - error_setg_errno(errp, -r, "Cannot create vq device region: "); > > + device_region = (DMAMap) { > > + .translated_addr = svq_addr.used_user_addr, > > + .size = device_size - 1, > > + .perm = IOMMU_RW, > > + }; > > + ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); > > + if (unlikely(!ok)) { > > + error_prepend(errp, "Cannot create vq device region: "); > > + vhost_vdpa_svq_unmap_ring(v, &driver_region); > > } > > + addr->used_user_addr = device_region.iova; > > > > - return r == 0; > > + return ok; > > } > > > > static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, >