On Wed, Feb 23, 2022 at 3:01 AM Eugenio Perez Martin <epere...@redhat.com> wrote: > > On Tue, Feb 8, 2022 at 9:11 AM Jason Wang <jasow...@redhat.com> wrote: > > > > > > 在 2022/2/2 上午1:08, Eugenio Perez Martin 写道: > > > On Sun, Jan 30, 2022 at 5:43 AM Jason Wang <jasow...@redhat.com> wrote: > > >> > > >> 在 2022/1/22 上午4:27, Eugenio Pérez 写道: > > >>> Initial version of shadow virtqueue that actually forward buffers. There > > >>> is no iommu support at the moment, and that will be addressed in future > > >>> patches of this series. Since all vhost-vdpa devices use forced IOMMU, > > >>> this means that SVQ is not usable at this point of the series on any > > >>> device. > > >>> > > >>> For simplicity it only supports modern devices, that expects vring > > >>> in little endian, with split ring and no event idx or indirect > > >>> descriptors. Support for them will not be added in this series. > > >>> > > >>> It reuses the VirtQueue code for the device part. The driver part is > > >>> based on Linux's virtio_ring driver, but with stripped functionality > > >>> and optimizations so it's easier to review. > > >>> > > >>> However, forwarding buffers have some particular pieces: One of the most > > >>> unexpected ones is that a guest's buffer can expand through more than > > >>> one descriptor in SVQ. While this is handled gracefully by qemu's > > >>> emulated virtio devices, it may cause unexpected SVQ queue full. This > > >>> patch also solves it by checking for this condition at both guest's > > >>> kicks and device's calls. The code may be more elegant in the future if > > >>> SVQ code runs in its own iocontext. > > >>> > > >>> Signed-off-by: Eugenio Pérez <epere...@redhat.com> > > >>> --- > > >>> hw/virtio/vhost-shadow-virtqueue.h | 2 + > > >>> hw/virtio/vhost-shadow-virtqueue.c | 365 > > >>> ++++++++++++++++++++++++++++- > > >>> hw/virtio/vhost-vdpa.c | 111 ++++++++- > > >>> 3 files changed, 462 insertions(+), 16 deletions(-) > > >>> > > >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h > > >>> b/hw/virtio/vhost-shadow-virtqueue.h > > >>> index 39aef5ffdf..19c934af49 100644 > > >>> --- a/hw/virtio/vhost-shadow-virtqueue.h > > >>> +++ b/hw/virtio/vhost-shadow-virtqueue.h > > >>> @@ -33,6 +33,8 @@ uint16_t vhost_svq_get_num(const VhostShadowVirtqueue > > >>> *svq); > > >>> size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); > > >>> size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); > > >>> > > >>> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, > > >>> + VirtQueue *vq); > > >>> void vhost_svq_stop(VhostShadowVirtqueue *svq); > > >>> > > >>> VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize); > > >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c > > >>> b/hw/virtio/vhost-shadow-virtqueue.c > > >>> index 7c168075d7..a1a404f68f 100644 > > >>> --- a/hw/virtio/vhost-shadow-virtqueue.c > > >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c > > >>> @@ -9,6 +9,8 @@ > > >>> > > >>> #include "qemu/osdep.h" > > >>> #include "hw/virtio/vhost-shadow-virtqueue.h" > > >>> +#include "hw/virtio/vhost.h" > > >>> +#include "hw/virtio/virtio-access.h" > > >>> #include "standard-headers/linux/vhost_types.h" > > >>> > > >>> #include "qemu/error-report.h" > > >>> @@ -36,6 +38,33 @@ typedef struct VhostShadowVirtqueue { > > >>> > > >>> /* Guest's call notifier, where SVQ calls guest. 
*/ > > >>> EventNotifier svq_call; > > >>> + > > >>> + /* Virtio queue shadowing */ > > >>> + VirtQueue *vq; > > >>> + > > >>> + /* Virtio device */ > > >>> + VirtIODevice *vdev; > > >>> + > > >>> + /* Map for returning guest's descriptors */ > > >>> + VirtQueueElement **ring_id_maps; > > >>> + > > >>> + /* Next VirtQueue element that guest made available */ > > >>> + VirtQueueElement *next_guest_avail_elem; > > >>> + > > >>> + /* Next head to expose to device */ > > >>> + uint16_t avail_idx_shadow; > > >>> + > > >>> + /* Next free descriptor */ > > >>> + uint16_t free_head; > > >>> + > > >>> + /* Last seen used idx */ > > >>> + uint16_t shadow_used_idx; > > >>> + > > >>> + /* Next head to consume from device */ > > >>> + uint16_t last_used_idx; > > >>> + > > >>> + /* Cache for the exposed notification flag */ > > >>> + bool notification; > > >>> } VhostShadowVirtqueue; > > >>> > > >>> #define INVALID_SVQ_KICK_FD -1 > > >>> @@ -148,30 +177,294 @@ bool vhost_svq_ack_guest_features(uint64_t > > >>> dev_features, > > >>> return true; > > >>> } > > >>> > > >>> -/* Forward guest notifications */ > > >>> -static void vhost_handle_guest_kick(EventNotifier *n) > > >>> +/** > > >>> + * Number of descriptors that SVQ can make available from the guest. > > >>> + * > > >>> + * @svq The svq > > >>> + */ > > >>> +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue > > >>> *svq) > > >>> { > > >>> - VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > > >>> - svq_kick); > > >>> + return svq->vring.num - (svq->avail_idx_shadow - > > >>> svq->shadow_used_idx); > > >>> +} > > >>> + > > >>> +static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool > > >>> enable) > > >>> +{ > > >>> + uint16_t notification_flag; > > >>> > > >>> - if (unlikely(!event_notifier_test_and_clear(n))) { > > >>> + if (svq->notification == enable) { > > >>> + return; > > >>> + } > > >>> + > > >>> + notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); > > >>> + > > >>> + svq->notification = enable; > > >>> + if (enable) { > > >>> + svq->vring.avail->flags &= ~notification_flag; > > >>> + } else { > > >>> + svq->vring.avail->flags |= notification_flag; > > >>> + } > > >>> +} > > >>> + > > >>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, > > >>> + const struct iovec *iovec, > > >>> + size_t num, bool more_descs, bool > > >>> write) > > >>> +{ > > >>> + uint16_t i = svq->free_head, last = svq->free_head; > > >>> + unsigned n; > > >>> + uint16_t flags = write ? 
cpu_to_le16(VRING_DESC_F_WRITE) : 0; > > >>> + vring_desc_t *descs = svq->vring.desc; > > >>> + > > >>> + if (num == 0) { > > >>> + return; > > >>> + } > > >>> + > > >>> + for (n = 0; n < num; n++) { > > >>> + if (more_descs || (n + 1 < num)) { > > >>> + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); > > >>> + } else { > > >>> + descs[i].flags = flags; > > >>> + } > > >>> + descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); > > >>> + descs[i].len = cpu_to_le32(iovec[n].iov_len); > > >>> + > > >>> + last = i; > > >>> + i = cpu_to_le16(descs[i].next); > > >>> + } > > >>> + > > >>> + svq->free_head = le16_to_cpu(descs[last].next); > > >>> +} > > >>> + > > >>> +static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq, > > >>> + VirtQueueElement *elem) > > >>> +{ > > >>> + int head; > > >>> + unsigned avail_idx; > > >>> + vring_avail_t *avail = svq->vring.avail; > > >>> + > > >>> + head = svq->free_head; > > >>> + > > >>> + /* We need some descriptors here */ > > >>> + assert(elem->out_num || elem->in_num); > > >> > > >> Looks like this could be triggered by guest, we need fail instead assert > > >> here. > > >> > > > My understanding was that virtqueue_pop already sanitized that case, > > > but I'm not able to find where now. I will recheck and, in case it's > > > not, I will move to a failure. > > > > > >>> + > > >>> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, > > >>> + elem->in_num > 0, false); > > >>> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, > > >>> true); > > >>> + > > >>> + /* > > >>> + * Put entry in available array (but don't update avail->idx until > > >>> they > > >>> + * do sync). > > >>> + */ > > >>> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); > > >>> + avail->ring[avail_idx] = cpu_to_le16(head); > > >>> + svq->avail_idx_shadow++; > > >>> + > > >>> + /* Update avail index after the descriptor is wrote */ > > >>> + smp_wmb(); > > >>> + avail->idx = cpu_to_le16(svq->avail_idx_shadow); > > >>> + > > >>> + return head; > > >>> +} > > >>> + > > >>> +static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement > > >>> *elem) > > >>> +{ > > >>> + unsigned qemu_head = vhost_svq_add_split(svq, elem); > > >>> + > > >>> + svq->ring_id_maps[qemu_head] = elem; > > >>> +} > > >>> + > > >>> +static void vhost_svq_kick(VhostShadowVirtqueue *svq) > > >>> +{ > > >>> + /* We need to expose available array entries before checking used > > >>> flags */ > > >>> + smp_mb(); > > >>> + if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) { > > >>> return; > > >>> } > > >>> > > >>> event_notifier_set(&svq->hdev_kick); > > >>> } > > >>> > > >>> -/* Forward vhost notifications */ > > >>> +/** > > >>> + * Forward available buffers. > > >>> + * > > >>> + * @svq Shadow VirtQueue > > >>> + * > > >>> + * Note that this function does not guarantee that all guest's > > >>> available > > >>> + * buffers are available to the device in SVQ avail ring. The guest > > >>> may have > > >>> + * exposed a GPA / GIOVA congiuous buffer, but it may not be > > >>> contiguous in qemu > > >>> + * vaddr. > > >>> + * > > >>> + * If that happens, guest's kick notifications will be disabled until > > >>> device > > >>> + * makes some buffers used. 
> > >>> + */ > > >>> +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) > > >>> +{ > > >>> + /* Clear event notifier */ > > >>> + event_notifier_test_and_clear(&svq->svq_kick); > > >>> + > > >>> + /* Make available as many buffers as possible */ > > >>> + do { > > >>> + if (virtio_queue_get_notification(svq->vq)) { > > >>> + virtio_queue_set_notification(svq->vq, false); > > >> > > >> This looks like an optimization the should belong to > > >> virtio_queue_set_notification() itself. > > >> > > > Sure we can move. > > > > > >>> + } > > >>> + > > >>> + while (true) { > > >>> + VirtQueueElement *elem; > > >>> + > > >>> + if (svq->next_guest_avail_elem) { > > >>> + elem = g_steal_pointer(&svq->next_guest_avail_elem); > > >>> + } else { > > >>> + elem = virtqueue_pop(svq->vq, sizeof(*elem)); > > >>> + } > > >>> + > > >>> + if (!elem) { > > >>> + break; > > >>> + } > > >>> + > > >>> + if (elem->out_num + elem->in_num > > > >>> + vhost_svq_available_slots(svq)) { > > >>> + /* > > >>> + * This condition is possible since a contiguous > > >>> buffer in GPA > > >>> + * does not imply a contiguous buffer in qemu's VA > > >>> + * scatter-gather segments. If that happen, the buffer > > >>> exposed > > >>> + * to the device needs to be a chain of descriptors at > > >>> this > > >>> + * moment. > > >>> + * > > >>> + * SVQ cannot hold more available buffers if we are > > >>> here: > > >>> + * queue the current guest descriptor and ignore > > >>> further kicks > > >>> + * until some elements are used. > > >>> + */ > > >>> + svq->next_guest_avail_elem = elem; > > >>> + return; > > >>> + } > > >>> + > > >>> + vhost_svq_add(svq, elem); > > >>> + vhost_svq_kick(svq); > > >>> + } > > >>> + > > >>> + virtio_queue_set_notification(svq->vq, true); > > >>> + } while (!virtio_queue_empty(svq->vq)); > > >>> +} > > >>> + > > >>> +/** > > >>> + * Handle guest's kick. > > >>> + * > > >>> + * @n guest kick event notifier, the one that guest set to notify svq. 
> > >>> + */ > > >>> +static void vhost_handle_guest_kick_notifier(EventNotifier *n) > > >>> +{ > > >>> + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > > >>> + svq_kick); > > >>> + vhost_handle_guest_kick(svq); > > >>> +} > > >>> + > > >>> +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) > > >>> +{ > > >>> + if (svq->last_used_idx != svq->shadow_used_idx) { > > >>> + return true; > > >>> + } > > >>> + > > >>> + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); > > >>> + > > >>> + return svq->last_used_idx != svq->shadow_used_idx; > > >>> +} > > >>> + > > >>> +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq) > > >>> +{ > > >>> + vring_desc_t *descs = svq->vring.desc; > > >>> + const vring_used_t *used = svq->vring.used; > > >>> + vring_used_elem_t used_elem; > > >>> + uint16_t last_used; > > >>> + > > >>> + if (!vhost_svq_more_used(svq)) { > > >>> + return NULL; > > >>> + } > > >>> + > > >>> + /* Only get used array entries after they have been exposed by dev > > >>> */ > > >>> + smp_rmb(); > > >>> + last_used = svq->last_used_idx & (svq->vring.num - 1); > > >>> + used_elem.id = le32_to_cpu(used->ring[last_used].id); > > >>> + used_elem.len = le32_to_cpu(used->ring[last_used].len); > > >>> + > > >>> + svq->last_used_idx++; > > >>> + if (unlikely(used_elem.id >= svq->vring.num)) { > > >>> + error_report("Device %s says index %u is used", > > >>> svq->vdev->name, > > >>> + used_elem.id); > > >>> + return NULL; > > >>> + } > > >>> + > > >>> + if (unlikely(!svq->ring_id_maps[used_elem.id])) { > > >>> + error_report( > > >>> + "Device %s says index %u is used, but it was not > > >>> available", > > >>> + svq->vdev->name, used_elem.id); > > >>> + return NULL; > > >>> + } > > >>> + > > >>> + descs[used_elem.id].next = svq->free_head; > > >>> + svq->free_head = used_elem.id; > > >>> + > > >>> + svq->ring_id_maps[used_elem.id]->len = used_elem.len; > > >>> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); > > >>> +} > > >>> + > > >>> +static void vhost_svq_flush(VhostShadowVirtqueue *svq, > > >>> + bool check_for_avail_queue) > > >>> +{ > > >>> + VirtQueue *vq = svq->vq; > > >>> + > > >>> + /* Make as many buffers as possible used. */ > > >>> + do { > > >>> + unsigned i = 0; > > >>> + > > >>> + vhost_svq_set_notification(svq, false); > > >>> + while (true) { > > >>> + g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq); > > >>> + if (!elem) { > > >>> + break; > > >>> + } > > >>> + > > >>> + if (unlikely(i >= svq->vring.num)) { > > >>> + virtio_error(svq->vdev, > > >>> + "More than %u used buffers obtained in a %u > > >>> size SVQ", > > >>> + i, svq->vring.num); > > >>> + virtqueue_fill(vq, elem, elem->len, i); > > >>> + virtqueue_flush(vq, i); > > >> > > >> Let's simply use virtqueue_push() here? > > >> > > > virtqueue_push support to fill and flush only one element, instead of > > > batch. I'm fine with either but I think the less updates to the used > > > idx, the better. > > > > > > Fine. > > > > > > > > > >>> + i = 0; > > >> > > >> Do we need to bail out here? > > >> > > > Yes I guess we can simply return. 
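
Right. Just to spell out what I had in mind there: keep the fill/flush of what was already collected and simply bail out instead of resetting the counter, roughly (untested sketch on top of this patch, same helpers the patch already uses):

            if (unlikely(i >= svq->vring.num)) {
                virtio_error(svq->vdev,
                             "More than %u used buffers obtained in a %u size SVQ",
                             i, svq->vring.num);
                virtqueue_fill(vq, elem, elem->len, i);
                virtqueue_flush(vq, i);
                return;
            }

That is, the only delta is turning the "i = 0;" into a "return;": once virtio_error() has marked the device broken there is nothing useful left to do in this flush.
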
> > > > > >>> + } > > >>> + virtqueue_fill(vq, elem, elem->len, i++); > > >>> + } > > >>> + > > >>> + virtqueue_flush(vq, i); > > >>> + event_notifier_set(&svq->svq_call); > > >>> + > > >>> + if (check_for_avail_queue && svq->next_guest_avail_elem) { > > >>> + /* > > >>> + * Avail ring was full when vhost_svq_flush was called, so > > >>> it's a > > >>> + * good moment to make more descriptors available if > > >>> possible > > >>> + */ > > >>> + vhost_handle_guest_kick(svq); > > >> > > >> Is there better to have a similar check as vhost_handle_guest_kick() did? > > >> > > >> if (elem->out_num + elem->in_num > > > >> vhost_svq_available_slots(svq)) { > > >> > > > It will be duplicated when we call vhost_handle_guest_kick, won't it? > > > > > > Right, I mis-read the code. > > > > > > > > > >>> + } > > >>> + > > >>> + vhost_svq_set_notification(svq, true); > > >> > > >> A mb() is needed here? Otherwise we may lost a call here (where > > >> vhost_svq_more_used() is run before vhost_svq_set_notification()). > > >> > > > I'm confused here then, I thought you said this is just a hint so > > > there was no need? [1]. I think the memory barrier is needed too. > > > > > > Yes, it's a hint but: > > > > 1) When we disable the notification, consider the notification disable > > is just a hint, device can still raise an interrupt, so the ordering is > > meaningless and a memory barrier is not necessary (the > > vhost_svq_set_notification(svq, false)) > > > > 2) When we enable the notification, though it's a hint, the device can > > choose to implement it by enabling the interrupt, in this case, the > > notification enable should be done before checking the used. Otherwise, > > the checking of more used might be done before enable the notification: > > > > 1) driver check more used > > 2) device add more used but no notification > > 3) driver enable the notification then we lost a notification here > > > > That was my understanding too. So the right way is to only add the > memory barrier in case 2), when setting the flag, right?
Yes. > > > > > >>> + } while (vhost_svq_more_used(svq)); > > >>> +} > > >>> + > > >>> +/** > > >>> + * Forward used buffers. > > >>> + * > > >>> + * @n hdev call event notifier, the one that device set to notify svq. > > >>> + * > > >>> + * Note that we are not making any buffers available in the loop, > > >>> there is no > > >>> + * way that it runs more than virtqueue size times. > > >>> + */ > > >>> static void vhost_svq_handle_call(EventNotifier *n) > > >>> { > > >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, > > >>> hdev_call); > > >>> > > >>> - if (unlikely(!event_notifier_test_and_clear(n))) { > > >>> - return; > > >>> - } > > >>> + /* Clear event notifier */ > > >>> + event_notifier_test_and_clear(n); > > >> > > >> Any reason that we remove the above check? > > >> > > > This comes from the previous versions, where this made sure we missed > > > no used buffers in the process of switching to SVQ mode. > > > > > > I'm not sure I get here. Even if for the switching, it should be more > > safe the handle the flush unconditionally? > > > > Yes, I also think it's better to forward and kick/call unconditionally. > > Thanks! Ok. Thanks > > > Thanks > > > > > > > > > > If we enable SVQ from the beginning I think we can rely on getting all > > > the device's used buffer notifications, so let me think a little bit > > > and I can move to check the eventfd. > > > > > >>> - event_notifier_set(&svq->svq_call); > > >>> + vhost_svq_flush(svq, true); > > >>> } > > >>> > > >>> /** > > >>> @@ -258,13 +551,38 @@ void > > >>> vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) > > >>> * need to explicitely check for them. > > >>> */ > > >>> event_notifier_init_fd(&svq->svq_kick, svq_kick_fd); > > >>> - event_notifier_set_handler(&svq->svq_kick, > > >>> vhost_handle_guest_kick); > > >>> + event_notifier_set_handler(&svq->svq_kick, > > >>> + vhost_handle_guest_kick_notifier); > > >>> > > >>> if (!check_old || event_notifier_test_and_clear(&tmp)) { > > >>> event_notifier_set(&svq->hdev_kick); > > >>> } > > >>> } > > >>> > > >>> +/** > > >>> + * Start shadow virtqueue operation. > > >>> + * > > >>> + * @svq Shadow Virtqueue > > >>> + * @vdev VirtIO device > > >>> + * @vq Virtqueue to shadow > > >>> + */ > > >>> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, > > >>> + VirtQueue *vq) > > >>> +{ > > >>> + svq->next_guest_avail_elem = NULL; > > >>> + svq->avail_idx_shadow = 0; > > >>> + svq->shadow_used_idx = 0; > > >>> + svq->last_used_idx = 0; > > >>> + svq->vdev = vdev; > > >>> + svq->vq = vq; > > >>> + > > >>> + memset(svq->vring.avail, 0, sizeof(*svq->vring.avail)); > > >>> + memset(svq->vring.used, 0, sizeof(*svq->vring.avail)); > > >>> + for (unsigned i = 0; i < svq->vring.num - 1; i++) { > > >>> + svq->vring.desc[i].next = cpu_to_le16(i + 1); > > >>> + } > > >>> +} > > >>> + > > >>> /** > > >>> * Stop shadow virtqueue operation. 
> > >>> * @svq Shadow Virtqueue > > >>> @@ -272,6 +590,28 @@ void > > >>> vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) > > >>> void vhost_svq_stop(VhostShadowVirtqueue *svq) > > >>> { > > >>> event_notifier_set_handler(&svq->svq_kick, NULL); > > >>> + g_autofree VirtQueueElement *next_avail_elem = NULL; > > >>> + > > >>> + if (!svq->vq) { > > >>> + return; > > >>> + } > > >>> + > > >>> + /* Send all pending used descriptors to guest */ > > >>> + vhost_svq_flush(svq, false); > > >>> + > > >>> + for (unsigned i = 0; i < svq->vring.num; ++i) { > > >>> + g_autofree VirtQueueElement *elem = NULL; > > >>> + elem = g_steal_pointer(&svq->ring_id_maps[i]); > > >>> + if (elem) { > > >>> + virtqueue_detach_element(svq->vq, elem, elem->len); > > >>> + } > > >>> + } > > >>> + > > >>> + next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem); > > >>> + if (next_avail_elem) { > > >>> + virtqueue_detach_element(svq->vq, next_avail_elem, > > >>> + next_avail_elem->len); > > >>> + } > > >>> } > > >>> > > >>> /** > > >>> @@ -316,7 +656,7 @@ VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize) > > >>> memset(svq->vring.desc, 0, driver_size); > > >>> svq->vring.used = qemu_memalign(qemu_real_host_page_size, > > >>> device_size); > > >>> memset(svq->vring.used, 0, device_size); > > >>> - > > >>> + svq->ring_id_maps = g_new0(VirtQueueElement *, qsize); > > >>> event_notifier_set_handler(&svq->hdev_call, > > >>> vhost_svq_handle_call); > > >>> return g_steal_pointer(&svq); > > >>> > > >>> @@ -335,6 +675,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq) > > >>> event_notifier_cleanup(&vq->hdev_kick); > > >>> event_notifier_set_handler(&vq->hdev_call, NULL); > > >>> event_notifier_cleanup(&vq->hdev_call); > > >>> + g_free(vq->ring_id_maps); > > >>> qemu_vfree(vq->vring.desc); > > >>> qemu_vfree(vq->vring.used); > > >>> g_free(vq); > > >>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c > > >>> index 53e14bafa0..0e5c00ed7e 100644 > > >>> --- a/hw/virtio/vhost-vdpa.c > > >>> +++ b/hw/virtio/vhost-vdpa.c > > >>> @@ -752,9 +752,9 @@ static int vhost_vdpa_set_vring_call(struct > > >>> vhost_dev *dev, > > >>> * Note that this function does not rewind kick file descriptor if > > >>> cannot set > > >>> * call one. 
> > >>> */ > > >>> -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, > > >>> - VhostShadowVirtqueue *svq, > > >>> - unsigned idx) > > >>> +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, > > >>> + VhostShadowVirtqueue *svq, > > >>> + unsigned idx) > > >>> { > > >>> struct vhost_vring_file file = { > > >>> .index = dev->vq_index + idx, > > >>> @@ -767,7 +767,7 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev > > >>> *dev, > > >>> r = vhost_vdpa_set_vring_dev_kick(dev, &file); > > >>> if (unlikely(r != 0)) { > > >>> error_report("Can't set device kick fd (%d)", -r); > > >>> - return false; > > >>> + return r; > > >>> } > > >>> > > >>> event_notifier = vhost_svq_get_svq_call_notifier(svq); > > >>> @@ -777,6 +777,99 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev > > >>> *dev, > > >>> error_report("Can't set device call fd (%d)", -r); > > >>> } > > >>> > > >>> + return r; > > >>> +} > > >>> + > > >>> +/** > > >>> + * Unmap SVQ area in the device > > >>> + */ > > >>> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr > > >>> iova, > > >>> + hwaddr size) > > >>> +{ > > >>> + int r; > > >>> + > > >>> + size = ROUND_UP(size, qemu_real_host_page_size); > > >>> + r = vhost_vdpa_dma_unmap(v, iova, size); > > >>> + return r == 0; > > >>> +} > > >>> + > > >>> +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, > > >>> + const VhostShadowVirtqueue *svq) > > >>> +{ > > >>> + struct vhost_vdpa *v = dev->opaque; > > >>> + struct vhost_vring_addr svq_addr; > > >>> + size_t device_size = vhost_svq_device_area_size(svq); > > >>> + size_t driver_size = vhost_svq_driver_area_size(svq); > > >>> + bool ok; > > >>> + > > >>> + vhost_svq_get_vring_addr(svq, &svq_addr); > > >>> + > > >>> + ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, > > >>> driver_size); > > >>> + if (unlikely(!ok)) { > > >>> + return false; > > >>> + } > > >>> + > > >>> + return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, > > >>> device_size); > > >>> +} > > >>> + > > >>> +/** > > >>> + * Map shadow virtqueue rings in device > > >>> + * > > >>> + * @dev The vhost device > > >>> + * @svq The shadow virtqueue > > >>> + */ > > >>> +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, > > >>> + const VhostShadowVirtqueue *svq) > > >>> +{ > > >>> + struct vhost_vdpa *v = dev->opaque; > > >>> + struct vhost_vring_addr svq_addr; > > >>> + size_t device_size = vhost_svq_device_area_size(svq); > > >>> + size_t driver_size = vhost_svq_driver_area_size(svq); > > >>> + int r; > > >>> + > > >>> + vhost_svq_get_vring_addr(svq, &svq_addr); > > >>> + > > >>> + r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size, > > >>> + (void *)svq_addr.desc_user_addr, true); > > >>> + if (unlikely(r != 0)) { > > >>> + return false; > > >>> + } > > >>> + > > >>> + r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size, > > >>> + (void *)svq_addr.used_user_addr, false); > > >> > > >> Do we need unmap the driver area if we fail here? > > >> > > > Yes, this used to trust in unmap them at the disabling of SVQ. Now I > > > think we need to unmap as you say. > > > > > > Thanks! 
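
To make that concrete: since vhost_vdpa_svq_unmap_ring() is already introduced by this patch, the error path can just roll back the mapping done a few lines above, something like (untested sketch):

    r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
                           (void *)svq_addr.used_user_addr, false);
    if (unlikely(r != 0)) {
        /* Undo the driver area mapping so it does not leak */
        vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
        return false;
    }

    return true;
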
> > > > > > [1] > > > https://lists.linuxfoundation.org/pipermail/virtualization/2021-March/053322.html > > > > > >> Thanks > > >> > > >> > > >>> + return r == 0; > > >>> +} > > >>> + > > >>> +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, > > >>> + VhostShadowVirtqueue *svq, > > >>> + unsigned idx) > > >>> +{ > > >>> + uint16_t vq_index = dev->vq_index + idx; > > >>> + struct vhost_vring_state s = { > > >>> + .index = vq_index, > > >>> + }; > > >>> + int r; > > >>> + bool ok; > > >>> + > > >>> + r = vhost_vdpa_set_dev_vring_base(dev, &s); > > >>> + if (unlikely(r)) { > > >>> + error_report("Can't set vring base (%d)", r); > > >>> + return false; > > >>> + } > > >>> + > > >>> + s.num = vhost_svq_get_num(svq); > > >>> + r = vhost_vdpa_set_dev_vring_num(dev, &s); > > >>> + if (unlikely(r)) { > > >>> + error_report("Can't set vring num (%d)", r); > > >>> + return false; > > >>> + } > > >>> + > > >>> + ok = vhost_vdpa_svq_map_rings(dev, svq); > > >>> + if (unlikely(!ok)) { > > >>> + return false; > > >>> + } > > >>> + > > >>> + r = vhost_vdpa_svq_set_fds(dev, svq, idx); > > >>> return r == 0; > > >>> } > > >>> > > >>> @@ -788,14 +881,24 @@ static int vhost_vdpa_dev_start(struct vhost_dev > > >>> *dev, bool started) > > >>> if (started) { > > >>> vhost_vdpa_host_notifiers_init(dev); > > >>> for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { > > >>> + VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index > > >>> + i); > > >>> VhostShadowVirtqueue *svq = > > >>> g_ptr_array_index(v->shadow_vqs, i); > > >>> bool ok = vhost_vdpa_svq_setup(dev, svq, i); > > >>> if (unlikely(!ok)) { > > >>> return -1; > > >>> } > > >>> + vhost_svq_start(svq, dev->vdev, vq); > > >>> } > > >>> vhost_vdpa_set_vring_ready(dev); > > >>> } else { > > >>> + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { > > >>> + VhostShadowVirtqueue *svq = > > >>> g_ptr_array_index(v->shadow_vqs, > > >>> + i); > > >>> + bool ok = vhost_vdpa_svq_unmap_rings(dev, svq); > > >>> + if (unlikely(!ok)) { > > >>> + return -1; > > >>> + } > > >>> + } > > >>> vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); > > >>> } > > >>> > > >
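
One follow-up on the barrier in 2), so it does not get lost in the next version: what I would expect is that only the enable path of vhost_svq_set_notification() gets a full barrier, so the flag write is ordered before the re-check of the used ring in vhost_svq_flush(). Roughly (untested sketch on top of this patch):

static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
{
    uint16_t notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);

    if (svq->notification == enable) {
        return;
    }

    svq->notification = enable;
    if (enable) {
        svq->vring.avail->flags &= ~notification_flag;
        /*
         * Expose the flag before the caller re-checks the used ring,
         * otherwise a buffer used in between can be missed (case 2).
         */
        smp_mb();
    } else {
        svq->vring.avail->flags |= notification_flag;
    }
}

The disable path can stay as it is: the flag is only a hint and the device may still notify us, so ordering does not matter there.
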
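
And on the assert(elem->out_num || elem->in_num): if it turns out virtqueue_pop() does not already reject an empty chain, I'd rather see the element dropped with an error than an abort, since the condition would be guest-triggerable. Something along these lines in vhost_handle_guest_kick() (hypothetical sketch, not needed if virtqueue_pop() already guarantees a non-empty element):

            if (unlikely(elem->out_num == 0 && elem->in_num == 0)) {
                /* Guest-controlled input must not abort qemu */
                virtio_error(svq->vdev,
                             "Empty descriptor chain made available by guest");
                virtqueue_detach_element(svq->vq, elem, 0);
                g_free(elem);
                return;
            }
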