[PATCH v3] block: Use LVM tools for LV block device truncation

2024-03-15 Thread Alexander Ivanov
If a block device is an LVM logical volume we can resize it using
standard LVM tools.

Add a helper to detect if a device is a DM device. In raw_co_truncate()
check if the block device is DM and resize it executing lvresize.

Signed-off-by: Alexander Ivanov 
---
 block/file-posix.c | 61 ++
 1 file changed, 61 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index 35684f7e21..af17a43fe9 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2642,6 +2642,38 @@ raw_regular_truncate(BlockDriverState *bs, int fd, 
int64_t offset,
 return raw_thread_pool_submit(handle_aiocb_truncate, &acb);
 }
 
+static bool device_is_dm(struct stat *st)
+{
+unsigned int maj, maj2;
+char line[32], devname[16];
+bool ret = false;
+FILE *f;
+
+if (!S_ISBLK(st->st_mode)) {
+return false;
+}
+
+f = fopen("/proc/devices", "r");
+if (!f) {
+return false;
+}
+
+maj = major(st->st_rdev);
+
+while (fgets(line, sizeof(line), f)) {
+if (sscanf(line, "%u %15s", &maj2, devname) != 2) {
+continue;
+}
+if (strcmp(devname, "device-mapper") == 0) {
+ret = (maj == maj2);
+break;
+}
+}
+
+fclose(f);
+return ret;
+}
+
 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
 bool exact, PreallocMode prealloc,
 BdrvRequestFlags flags, Error **errp)
@@ -2670,6 +2702,35 @@ static int coroutine_fn raw_co_truncate(BlockDriverState 
*bs, int64_t offset,
 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
 int64_t cur_length = raw_getlength(bs);
 
+/*
+ * Try to resize an LVM device using LVM tools.
+ */
+if (device_is_dm(&st) && offset > 0) {
+int spawn_flags = G_SPAWN_SEARCH_PATH | G_SPAWN_STDOUT_TO_DEV_NULL;
+int status;
+bool success;
+char *err;
+GError *gerr = NULL, *gerr_exit = NULL;
+g_autofree char *size_str = g_strdup_printf("%" PRId64 "B", 
offset);
+const char *cmd[] = {"lvresize", "-f", "-L",
+ size_str, bs->filename, NULL};
+
+success = g_spawn_sync(NULL, (gchar **)cmd, NULL, spawn_flags,
+   NULL, NULL, NULL, &err, &status, &gerr);
+
+if (success && g_spawn_check_exit_status(status, &gerr_exit)) {
+return 0;
+}
+
+if (success) {
+error_setg(errp, "%s: %s", gerr_exit->message, err);
+} else {
+error_setg(errp, "lvresize execution error: %s", 
gerr->message);
+}
+
+return -EINVAL;
+}
+
 if (offset != cur_length && exact) {
 error_setg(errp, "Cannot resize device files");
 return -ENOTSUP;
-- 
2.40.1




Re: [PATCH v3] block: Use LVM tools for LV block device truncation

2024-03-15 Thread Daniel P . Berrangé
On Fri, Mar 15, 2024 at 09:58:38AM +0100, Alexander Ivanov wrote:
> If a block device is an LVM logical volume we can resize it using
> standard LVM tools.
> 
> Add a helper to detect if a device is a DM device. In raw_co_truncate()
> check if the block device is DM and resize it executing lvresize.
> 
> Signed-off-by: Alexander Ivanov 
> ---
>  block/file-posix.c | 61 ++
>  1 file changed, 61 insertions(+)

Reviewed-by: Daniel P. Berrangé 


> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 35684f7e21..af17a43fe9 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -2642,6 +2642,38 @@ raw_regular_truncate(BlockDriverState *bs, int fd, 
> int64_t offset,
>  return raw_thread_pool_submit(handle_aiocb_truncate, &acb);
>  }
>  
> +static bool device_is_dm(struct stat *st)
> +{
> +unsigned int maj, maj2;
> +char line[32], devname[16];
> +bool ret = false;
> +FILE *f;
> +
> +if (!S_ISBLK(st->st_mode)) {
> +return false;
> +}
> +
> +f = fopen("/proc/devices", "r");
> +if (!f) {
> +return false;
> +}
> +
> +maj = major(st->st_rdev);
> +
> +while (fgets(line, sizeof(line), f)) {
> +if (sscanf(line, "%u %15s", &maj2, devname) != 2) {
> +continue;
> +}
> +if (strcmp(devname, "device-mapper") == 0) {
> +ret = (maj == maj2);
> +break;
> +}
> +}
> +
> +fclose(f);
> +return ret;
> +}
> +
>  static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
>  bool exact, PreallocMode prealloc,
>  BdrvRequestFlags flags, Error **errp)
> @@ -2670,6 +2702,35 @@ static int coroutine_fn 
> raw_co_truncate(BlockDriverState *bs, int64_t offset,
>  if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
>  int64_t cur_length = raw_getlength(bs);
>  
> +/*
> + * Try to resize an LVM device using LVM tools.
> + */
> +if (device_is_dm(&st) && offset > 0) {
> +int spawn_flags = G_SPAWN_SEARCH_PATH | 
> G_SPAWN_STDOUT_TO_DEV_NULL;
> +int status;
> +bool success;
> +char *err;
> +GError *gerr = NULL, *gerr_exit = NULL;
> +g_autofree char *size_str = g_strdup_printf("%" PRId64 "B", 
> offset);
> +const char *cmd[] = {"lvresize", "-f", "-L",
> + size_str, bs->filename, NULL};
> +
> +success = g_spawn_sync(NULL, (gchar **)cmd, NULL, spawn_flags,
> +   NULL, NULL, NULL, &err, &status, &gerr);
> +
> +if (success && g_spawn_check_exit_status(status, &gerr_exit)) {
> +return 0;
> +}
> +
> +if (success) {
> +error_setg(errp, "%s: %s", gerr_exit->message, err);
> +} else {
> +error_setg(errp, "lvresize execution error: %s", 
> gerr->message);
> +}
> +
> +return -EINVAL;
> +}
> +
>  if (offset != cur_length && exact) {
>  error_setg(errp, "Cannot resize device files");
>  return -ENOTSUP;
> -- 
> 2.40.1
> 
> 

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v2 1/6] virtio/virtio-pci: Handle extra notification data

2024-03-15 Thread Eugenio Perez Martin
On Thu, Mar 14, 2024 at 9:24 PM Jonah Palmer  wrote:
>
>
>
> On 3/14/24 3:05 PM, Eugenio Perez Martin wrote:
> > On Thu, Mar 14, 2024 at 5:06 PM Jonah Palmer  
> > wrote:
> >>
> >>
> >>
> >> On 3/14/24 10:55 AM, Eugenio Perez Martin wrote:
> >>> On Thu, Mar 14, 2024 at 1:16 PM Jonah Palmer  
> >>> wrote:
> 
> 
> 
>  On 3/13/24 11:01 PM, Jason Wang wrote:
> > On Wed, Mar 13, 2024 at 7:55 PM Jonah Palmer  
> > wrote:
> >>
> >> Add support to virtio-pci devices for handling the extra data sent
> >> from the driver to the device when the VIRTIO_F_NOTIFICATION_DATA
> >> transport feature has been negotiated.
> >>
> >> The extra data that's passed to the virtio-pci device when this
> >> feature is enabled varies depending on the device's virtqueue
> >> layout.
> >>
> >> In a split virtqueue layout, this data includes:
> >> - upper 16 bits: shadow_avail_idx
> >> - lower 16 bits: virtqueue index
> >>
> >> In a packed virtqueue layout, this data includes:
> >> - upper 16 bits: 1-bit wrap counter & 15-bit shadow_avail_idx
> >> - lower 16 bits: virtqueue index
> >>
> >> Tested-by: Lei Yang 
> >> Reviewed-by: Eugenio Pérez 
> >> Signed-off-by: Jonah Palmer 
> >> ---
> >> hw/virtio/virtio-pci.c | 10 +++---
> >> hw/virtio/virtio.c | 18 ++
> >> include/hw/virtio/virtio.h |  1 +
> >> 3 files changed, 26 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> >> index cb6940fc0e..0f5c3c3b2f 100644
> >> --- a/hw/virtio/virtio-pci.c
> >> +++ b/hw/virtio/virtio-pci.c
> >> @@ -384,7 +384,7 @@ static void virtio_ioport_write(void *opaque, 
> >> uint32_t addr, uint32_t val)
> >> {
> >> VirtIOPCIProxy *proxy = opaque;
> >> VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
> >> -uint16_t vector;
> >> +uint16_t vector, vq_idx;
> >> hwaddr pa;
> >>
> >> switch (addr) {
> >> @@ -408,8 +408,12 @@ static void virtio_ioport_write(void *opaque, 
> >> uint32_t addr, uint32_t val)
> >> vdev->queue_sel = val;
> >> break;
> >> case VIRTIO_PCI_QUEUE_NOTIFY:
> >> -if (val < VIRTIO_QUEUE_MAX) {
> >> -virtio_queue_notify(vdev, val);
> >> +vq_idx = val;
> >> +if (vq_idx < VIRTIO_QUEUE_MAX) {
> >> +if (virtio_vdev_has_feature(vdev, 
> >> VIRTIO_F_NOTIFICATION_DATA)) {
> >> +virtio_queue_set_shadow_avail_data(vdev, val);
> >> +}
> >> +virtio_queue_notify(vdev, vq_idx);
> >> }
> >> break;
> >> case VIRTIO_PCI_STATUS:
> >> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> >> index d229755eae..bcb9e09df0 100644
> >> --- a/hw/virtio/virtio.c
> >> +++ b/hw/virtio/virtio.c
> >> @@ -2255,6 +2255,24 @@ void virtio_queue_set_align(VirtIODevice *vdev, 
> >> int n, int align)
> >> }
> >> }
> >>
> >> +void virtio_queue_set_shadow_avail_data(VirtIODevice *vdev, uint32_t 
> >> data)
> >>>
> >>> Maybe I didn't explain well, but I think it is better to pass directly
> >>> idx to a VirtQueue *. That way only the caller needs to check for a
> >>> valid vq idx, and (my understanding is) the virtio.c interface is
> >>> migrating to VirtQueue * use anyway.
> >>>
> >>
> >> Oh, are you saying to just pass in a VirtQueue *vq instead of
> >> VirtIODevice *vdev and get rid of the vq->vring.desc check in the function?
> >>
> >
> > No, that needs to be kept. I meant the access to vdev->vq[i] without
> > checking for a valid i.
> >
>
> Ahh okay I see what you mean. But I thought the following was checking
> for a valid VQ index:
>
> if (vq_idx < VIRTIO_QUEUE_MAX)
>

Right, but then the (potentially multiple) callers are responsible to
check for that. If we accept a VirtQueue *, it is assumed it is valid
already.

> Of course the virtio device may not have up to VIRTIO_QUEUE_MAX
> virtqueues, so maybe we should be checking for validity like this?
>
> if (vdev->vq[i].vring.num == 0)
>

Actually yes, if you're going to send a new version I think checking
against num is better. Good find!

> Or was there something else you had in mind? Apologies for the confusion.
>

No worries, virtio.c is full of checks like that :).

Thanks!

> > You can get the VirtQueue in the caller with virtio_get_queue. Which
> > also does not check for a valid index, but that way is clearer the
> > caller needs to check it.
> >
>
> Roger, I'll use this instead for clarity.
>
> > As a side note, the check for desc != 0 is widespread in QEMU but the
> > driver may use 0 address for desc, so it's not 100% valid. But to
> > change that now requires a deeper change out of the scope of this
>

Re: [PATCH v3] blockcommit: Reopen base image as RO after abort

2024-03-15 Thread Alexander Ivanov




On 2/28/24 17:48, Vladimir Sementsov-Ogievskiy wrote:

On 09.02.24 15:29, Alexander Ivanov wrote:

Could you please review the patch?


Sorry for long delay.

Honestly, I don't like refcnt in block-driver. It violate 
incapsulation, refcnt is interal thing of common block layer. And 
actually, you can't make any assumptions from value of refcnt, as you 
don't know which additional parents were created and why, and when 
they are going unref their children.
Hmmm... Maybe I can just exclude refcnt check from the condition, can't 
I. If BDS will be removed it doesn't matter if we make it RO. What do 
you think?


What was wrong with v2?

My bad, it seems, I didn't send v2 before I decided to change the patch.




On 1/30/24 10:14, Alexander Ivanov wrote:
If a blockcommit is aborted the base image remains in RW mode, that 
leads

to a fail of subsequent live migration.

How to reproduce:
   $ virsh snapshot-create-as vm snp1 --disk-only

   *** write something to the disk inside the guest ***

   $ virsh blockcommit vm vda --active --shallow && virsh blockjob 
vm vda --abort

   $ lsof /vzt/vm.qcow2
   COMMAND  PID USER   FD   TYPE DEVICE   SIZE/OFF NODE NAME
   qemu-syst 433203 root   45u   REG  253,0 1724776448  133 
/vzt/vm.qcow2

   $ cat /proc/433203/fdinfo/45
   pos:    0
   flags:  02140002 < The last 2 means RW mode

If the base image is in RW mode at the end of blockcommit and was in RO
mode before blockcommit, check if src BDS has refcnt > 1. If so, the 
BDS
will not be removed after blockcommit, and we should make the base 
image

RO. Otherwise check recursively if there is a parent BDS of src BDS and
reopen the base BDS in RO in this case.

Signed-off-by: Alexander Ivanov 
---
  block/mirror.c | 38 --
  1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 5145eb53e1..52a7fee75e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -93,6 +93,7 @@ typedef struct MirrorBlockJob {
  int64_t active_write_bytes_in_flight;
  bool prepared;
  bool in_drain;
+    bool base_ro;
  } MirrorBlockJob;
  typedef struct MirrorBDSOpaque {
@@ -652,6 +653,32 @@ static void coroutine_fn 
mirror_wait_for_all_io(MirrorBlockJob *s)

  }
  }
+/*
+ * Check recursively if there is a parent BDS referenced more than
+ * min_refcnt times. This argument is needed because at the first
+ * call there is a bds referenced in blockcommit.
+ */
+static bool bdrv_chain_has_significant_parent(BlockDriverState *bs)
+{
+    BdrvChild *parent;
+    BlockDriverState *parent_bs;
+
+    QLIST_FOREACH(parent, &bs->parents, next) {
+    if (!(parent->klass->parent_is_bds)) {
+    continue;
+    }
+    parent_bs = parent->opaque;
+    if (parent_bs->drv && !parent_bs->drv->is_filter) {
+    return true;
+    }
+    if (bdrv_chain_has_significant_parent(parent_bs)) {
+    return true;
+    }
+    }
+
+    return false;
+}
+
  /**
   * mirror_exit_common: handle both abort() and prepare() cases.
   * for .prepare, returns 0 on success and -errno on failure.
@@ -793,6 +820,11 @@ static int mirror_exit_common(Job *job)
  bdrv_drained_end(target_bs);
  bdrv_unref(target_bs);
+    if (s->base_ro && !bdrv_is_read_only(target_bs) &&
+    (src->refcnt > 1 || bdrv_chain_has_significant_parent(src))) {
+    bdrv_reopen_set_read_only(target_bs, true, NULL);
+    }
+
  bs_opaque->job = NULL;
  bdrv_drained_end(src);
@@ -1715,6 +1747,7 @@ static BlockJob *mirror_start_job(
   bool is_none_mode, BlockDriverState 
*base,
   bool auto_complete, const char 
*filter_node_name,
   bool is_mirror, MirrorCopyMode 
copy_mode,

+ bool base_ro,
   Error **errp)
  {
  MirrorBlockJob *s;
@@ -1798,6 +1831,7 @@ static BlockJob *mirror_start_job(
  bdrv_unref(mirror_top_bs);
  s->mirror_top_bs = mirror_top_bs;
+    s->base_ro = base_ro;
  /* No resize for the target either; while the mirror is still 
running, a
   * consistent read isn't necessarily possible. We could 
possibly allow
@@ -2027,7 +2061,7 @@ void mirror_start(const char *job_id, 
BlockDriverState *bs,
   speed, granularity, buf_size, backing_mode, 
zero_target,
   on_source_error, on_target_error, unmap, 
NULL, NULL,

   &mirror_job_driver, is_none_mode, base, false,
- filter_node_name, true, copy_mode, errp);
+ filter_node_name, true, copy_mode, false, errp);
  }
  BlockJob *commit_active_start(const char *job_id, BlockDriverState 
*bs,
@@ -2056,7 +2090,7 @@ BlockJob *commit_active_start(const char 
*job_id, BlockDriverState *bs,

   on_error, on_error, true, cb, opaque,
   &commit_active_job_driver, false, base, 
auto_complete,
   

[PATCH 0/9] tests/qemu-iotests: Fix running with "check -ssh -qcow2"

2024-03-15 Thread Thomas Huth
I recently wanted to check for some changes that I did to the URI handling
in the block layer code, but I had to discover that a lot of iotests only
work with the raw file format when using a protocol that is not "file",
i.e. "./check -ssh -qcow2" shows a lot of failures.
While some tests could be fixed to work with the "ssh" protocol, too,
many other tests seem to be written for the "file" protocol only and
thus have to be marked accordingly.

After applying these patches, there is still one failure left in test 181
where I'm unsure whether it's a real bug or whether this test should also
simply be marked to work with the "file" protocol only. Suggestions are
welcome!

Thomas Huth (9):
  tests/qemu-iotests: Fix test 033 for running with non-file protocols
  tests/qemu-iotests: Restrict test 066 to the 'file' protocol
  tests/qemu-iotests: Restrict test 114 to the 'file' protocol
  tests/qemu-iotests: Restrict test 130 to the 'file' protocol
  tests/qemu-iotests: Restrict test 134 and 158 to the 'file' protocol
  tests/qemu-iotests: Restrict test 156 to the 'file' protocol
  tests/qemu-iotests: Restrict tests that use --image-opts to the 'file'
protocol
  tests/qemu-iotests: Fix some tests that use --image-opts for other
protocols
  tests/qemu-iotests: Restrict tests using "--blockdev file" to the file
protocol

 tests/qemu-iotests/033| 6 +++---
 tests/qemu-iotests/066| 2 +-
 tests/qemu-iotests/114| 2 +-
 tests/qemu-iotests/130| 2 +-
 tests/qemu-iotests/134| 2 +-
 tests/qemu-iotests/156| 2 +-
 tests/qemu-iotests/158| 2 +-
 tests/qemu-iotests/188| 2 +-
 tests/qemu-iotests/189| 2 +-
 tests/qemu-iotests/198| 2 +-
 tests/qemu-iotests/263| 6 --
 tests/qemu-iotests/284| 7 +++
 tests/qemu-iotests/tests/detect-zeroes-registered-buf | 4 +++-
 tests/qemu-iotests/tests/qcow2-internal-snapshots | 2 +-
 tests/qemu-iotests/tests/qsd-jobs | 2 +-
 15 files changed, 24 insertions(+), 21 deletions(-)

-- 
2.44.0




[PATCH 6/9] tests/qemu-iotests: Restrict test 156 to the 'file' protocol

2024-03-15 Thread Thomas Huth
The test fails completely when you try to use it with a different
protocol, e.g. with "./check -ssh -qcow2 156".
The test uses some hand-crafted JSON statements which cannot work with other
protocols, thus let's change this test to only support the 'file' protocol.

Signed-off-by: Thomas Huth 
---
 tests/qemu-iotests/156 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/156 b/tests/qemu-iotests/156
index a9540bd80d..97c2d86ce5 100755
--- a/tests/qemu-iotests/156
+++ b/tests/qemu-iotests/156
@@ -50,7 +50,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.qemu
 
 _supported_fmt qcow2 qed
-_supported_proto generic
+_supported_proto file
 # Copying files around with cp does not work with external data files
 _unsupported_imgopts data_file
 
-- 
2.44.0




[PATCH 7/9] tests/qemu-iotests: Restrict tests that use --image-opts to the 'file' protocol

2024-03-15 Thread Thomas Huth
These tests 188, 189 and 198 use qemu-io with --image-opts with additional
hard-coded parameters for the file protocol, so they cannot work for other
protocols. Thus we have to limit these tests to the file protocol only.

Signed-off-by: Thomas Huth 
---
 tests/qemu-iotests/188 | 2 +-
 tests/qemu-iotests/189 | 2 +-
 tests/qemu-iotests/198 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/188 b/tests/qemu-iotests/188
index ce087d1873..2950b1dc31 100755
--- a/tests/qemu-iotests/188
+++ b/tests/qemu-iotests/188
@@ -38,7 +38,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.filter
 
 _supported_fmt qcow2
-_supported_proto generic
+_supported_proto file
 _supported_os Linux
 _require_working_luks
 
diff --git a/tests/qemu-iotests/189 b/tests/qemu-iotests/189
index 801494c6b9..008f73b07d 100755
--- a/tests/qemu-iotests/189
+++ b/tests/qemu-iotests/189
@@ -38,7 +38,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.filter
 
 _supported_fmt qcow2
-_supported_proto generic
+_supported_proto file
 _supported_os Linux
 _require_working_luks
 
diff --git a/tests/qemu-iotests/198 b/tests/qemu-iotests/198
index 1c93dea1f7..6ddeffddd2 100755
--- a/tests/qemu-iotests/198
+++ b/tests/qemu-iotests/198
@@ -38,7 +38,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.filter
 
 _supported_fmt qcow2
-_supported_proto generic
+_supported_proto file
 _supported_os Linux
 _require_working_luks
 
-- 
2.44.0




[PATCH 2/9] tests/qemu-iotests: Restrict test 066 to the 'file' protocol

2024-03-15 Thread Thomas Huth
The hand-crafted json statement in this test only works if the test
is run with the "file" protocol, so mark this test accordingly.

Signed-off-by: Thomas Huth 
---
 tests/qemu-iotests/066 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/066 b/tests/qemu-iotests/066
index cf63144cb9..336d8565dd 100755
--- a/tests/qemu-iotests/066
+++ b/tests/qemu-iotests/066
@@ -39,7 +39,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 
 # This tests qcow2-specific low-level functionality
 _supported_fmt qcow2
-_supported_proto generic
+_supported_proto file
 # We need zero clusters and snapshots
 # (TODO: Consider splitting the snapshot part into a separate test
 #file, so this one runs with refcount_bits=1 and data_file)
-- 
2.44.0




[PATCH 4/9] tests/qemu-iotests: Restrict test 130 to the 'file' protocol

2024-03-15 Thread Thomas Huth
Using "-drive ...,backing.file.filename=..." only works with the
file protocol, but not with URIs, so mark this test accordingly.

Signed-off-by: Thomas Huth 
---
 tests/qemu-iotests/130 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/130 b/tests/qemu-iotests/130
index 7257f09677..7af85d20a8 100755
--- a/tests/qemu-iotests/130
+++ b/tests/qemu-iotests/130
@@ -42,7 +42,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.qemu
 
 _supported_fmt qcow2
-_supported_proto generic
+_supported_proto file
 _supported_os Linux
 # We are going to use lazy-refcounts
 _unsupported_imgopts 'compat=0.10'
-- 
2.44.0




[PATCH 1/9] tests/qemu-iotests: Fix test 033 for running with non-file protocols

2024-03-15 Thread Thomas Huth
When running iotest 033 with the ssh protocol, it fails with:

 033   fail   [14:48:31] [14:48:41]   10.2soutput mismatch
 --- /.../tests/qemu-iotests/033.out
 +++ /.../tests/qemu-iotests/scratch/qcow2-ssh-033/033.out.bad
 @@ -174,6 +174,7 @@
  512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
  wrote 512/512 bytes at offset 2097152
  512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +qemu-io: warning: Failed to truncate the tail of the image: ssh driver does 
not support shrinking files
  read 512/512 bytes at offset 0
  512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

We already check for the qcow2 format here, so let's simply also
add a check for the protocol here, too, to only test the truncation
with the file protocol.

Signed-off-by: Thomas Huth 
---
 tests/qemu-iotests/033 | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/033 b/tests/qemu-iotests/033
index da9133c44b..4bc7a071bd 100755
--- a/tests/qemu-iotests/033
+++ b/tests/qemu-iotests/033
@@ -123,9 +123,9 @@ do_test 512 "write -P 1 0 0x200" "$TEST_IMG" | 
_filter_qemu_io
 # next L2 table
 do_test 512 "write -P 1 $L2_COVERAGE 0x200" "$TEST_IMG" | _filter_qemu_io
 
-# only interested in qcow2 here; also other formats might respond with
-#  "not supported" error message
-if [ $IMGFMT = "qcow2" ]; then
+# only interested in qcow2 with file protocol here; also other formats
+# might respond with "not supported" error message
+if [ $IMGFMT = "qcow2" ] && [ $IMGPROTO = "file" ]; then
 do_test 512 "truncate $L2_COVERAGE" "$TEST_IMG" | _filter_qemu_io
 fi
 
-- 
2.44.0




[PATCH 8/9] tests/qemu-iotests: Fix some tests that use --image-opts for other protocols

2024-03-15 Thread Thomas Huth
Tests 263, 284 and detect-zeroes-registered-buf use qemu-io
with --image-opts so we have to enforce IMGOPTSSYNTAX=true here
to get $TEST_IMG in shape for other protocols than "file".

Signed-off-by: Thomas Huth 
---
 tests/qemu-iotests/263| 6 --
 tests/qemu-iotests/284| 7 +++
 tests/qemu-iotests/tests/detect-zeroes-registered-buf | 4 +++-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tests/qemu-iotests/263 b/tests/qemu-iotests/263
index ec09b41405..44fdada0d6 100755
--- a/tests/qemu-iotests/263
+++ b/tests/qemu-iotests/263
@@ -34,6 +34,8 @@ _cleanup()
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
+IMGOPTSSYNTAX=true
+
 # get standard environment, filters and checks
 . ./common.rc
 . ./common.filter
@@ -73,7 +75,7 @@ echo "testing LUKS qcow2 encryption"
 echo
 
 _make_test_img --object $SECRET -o 
"encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10,cluster_size=64K"
 $size
-_run_test "driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG"
+_run_test "$TEST_IMG,encrypt.key-secret=sec0"
 _cleanup_test_img
 
 echo
@@ -82,7 +84,7 @@ echo
 
 
 _make_test_img --object $SECRET -o 
"encrypt.format=aes,encrypt.key-secret=sec0,cluster_size=64K" $size
-_run_test "driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG"
+_run_test "$TEST_IMG,encrypt.key-secret=sec0"
 _cleanup_test_img
 
 
diff --git a/tests/qemu-iotests/284 b/tests/qemu-iotests/284
index 5a82639e7f..722267486d 100755
--- a/tests/qemu-iotests/284
+++ b/tests/qemu-iotests/284
@@ -33,6 +33,8 @@ _cleanup()
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
+IMGOPTSSYNTAX=true
+
 # get standard environment, filters and checks
 . ./common.rc
 . ./common.filter
@@ -47,14 +49,12 @@ size=1M
 
 SECRET="secret,id=sec0,data=astrochicken"
 
-IMGSPEC="driver=$IMGFMT,file.filename=$TEST_IMG,encrypt.key-secret=sec0"
 QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
 
 _run_test()
 {
-IMGOPTSSYNTAX=true
 OLD_TEST_IMG="$TEST_IMG"
-
TEST_IMG="driver=$IMGFMT,file.filename=$TEST_IMG,encrypt.key-secret=sec0"
+TEST_IMG="$TEST_IMG,encrypt.key-secret=sec0"
 QEMU_IMG_EXTRA_ARGS="--image-opts --object $SECRET"
 
 echo
@@ -78,7 +78,6 @@ _run_test()
 
 TEST_IMG="$OLD_TEST_IMG"
 QEMU_IMG_EXTRA_ARGS=
-IMGOPTSSYNTAX=
 }
 
 
diff --git a/tests/qemu-iotests/tests/detect-zeroes-registered-buf 
b/tests/qemu-iotests/tests/detect-zeroes-registered-buf
index edb5f2cee5..5eaf34e5a6 100755
--- a/tests/qemu-iotests/tests/detect-zeroes-registered-buf
+++ b/tests/qemu-iotests/tests/detect-zeroes-registered-buf
@@ -36,6 +36,8 @@ _cleanup()
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
+IMGOPTSSYNTAX=true
+
 # get standard environment, filters and checks
 cd ..
 . ./common.rc
@@ -46,7 +48,7 @@ _supported_proto generic
 
 size=128M
 _make_test_img $size
-IMGSPEC="driver=$IMGFMT,file.filename=$TEST_IMG,discard=unmap,detect-zeroes=unmap"
+IMGSPEC="$TEST_IMG,discard=unmap,detect-zeroes=unmap"
 
 echo
 echo "== writing zero buffer to image =="
-- 
2.44.0




[PATCH 3/9] tests/qemu-iotests: Restrict test 114 to the 'file' protocol

2024-03-15 Thread Thomas Huth
iotest 114 uses "truncate" and the qcow2.py script on the destination file,
which both cannot deal with URIs. Thus this test needs the "file" protocol,
otherwise it fails with an error message like this:

 truncate: cannot open 
'ssh://127.0.0.1/tmp/qemu-build/tests/qemu-iotests/scratch/qcow2-ssh-114/t.qcow2.orig'
  for writing: No such file or directory

Thus mark this test for "file protocol only" accordingly.

Signed-off-by: Thomas Huth 
---
 tests/qemu-iotests/114 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/114 b/tests/qemu-iotests/114
index de6fd327ee..dccc71008b 100755
--- a/tests/qemu-iotests/114
+++ b/tests/qemu-iotests/114
@@ -38,7 +38,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.filter
 
 _supported_fmt qcow2
-_supported_proto generic
+_supported_proto file
 # At least OpenBSD doesn't seem to have truncate
 _supported_os Linux
 # qcow2.py does not work too well with external data files
-- 
2.44.0




[PATCH 9/9] tests/qemu-iotests: Restrict tests using "--blockdev file" to the file protocol

2024-03-15 Thread Thomas Huth
Tests that use "--blockdev" with the "file" driver cannot work with
other protocols, so we should mark them accordingly.

Signed-off-by: Thomas Huth 
---
 tests/qemu-iotests/tests/qcow2-internal-snapshots | 2 +-
 tests/qemu-iotests/tests/qsd-jobs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/tests/qcow2-internal-snapshots 
b/tests/qemu-iotests/tests/qcow2-internal-snapshots
index 36523aba06..9f83aa8903 100755
--- a/tests/qemu-iotests/tests/qcow2-internal-snapshots
+++ b/tests/qemu-iotests/tests/qcow2-internal-snapshots
@@ -39,7 +39,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 
 # This tests qcow2-specific low-level functionality
 _supported_fmt qcow2
-_supported_proto generic
+_supported_proto file
 # Internal snapshots are (currently) impossible with refcount_bits=1,
 # and generally impossible with external data files
 _unsupported_imgopts 'compat=0.10' 'refcount_bits=1[^0-9]' data_file
diff --git a/tests/qemu-iotests/tests/qsd-jobs 
b/tests/qemu-iotests/tests/qsd-jobs
index 510bf0a9dc..9b843af631 100755
--- a/tests/qemu-iotests/tests/qsd-jobs
+++ b/tests/qemu-iotests/tests/qsd-jobs
@@ -40,7 +40,7 @@ cd ..
 . ./common.filter
 
 _supported_fmt qcow2
-_supported_proto generic
+_supported_proto file
 
 size=128M
 
-- 
2.44.0




[PATCH 5/9] tests/qemu-iotests: Restrict test 134 and 158 to the 'file' protocol

2024-03-15 Thread Thomas Huth
Commit b25b387fa592 updated the iotests 134 and 158 to use the --image-opts
parameter for qemu-io with file protocol related options, but forgot to
update the _supported_proto line accordingly. So let's do that now.

Fixes: b25b387fa5 ("qcow2: convert QCow2 to use QCryptoBlock for encryption")
Signed-off-by: Thomas Huth 
---
 tests/qemu-iotests/134 | 2 +-
 tests/qemu-iotests/158 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/134 b/tests/qemu-iotests/134
index ded153c0b9..b2c3c03f08 100755
--- a/tests/qemu-iotests/134
+++ b/tests/qemu-iotests/134
@@ -38,7 +38,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.filter
 
 _supported_fmt qcow qcow2
-_supported_proto generic
+_supported_proto file
 
 
 size=128M
diff --git a/tests/qemu-iotests/158 b/tests/qemu-iotests/158
index a95878e4ce..3a9ad7eed0 100755
--- a/tests/qemu-iotests/158
+++ b/tests/qemu-iotests/158
@@ -38,7 +38,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.filter
 
 _supported_fmt qcow qcow2
-_supported_proto generic
+_supported_proto file
 
 
 size=128M
-- 
2.44.0




Re: [PATCH v2 3/6] qdev-monitor: add option to report GenericError from find_device_state

2024-03-15 Thread Markus Armbruster
Sorry for the late answer.

Vladimir Sementsov-Ogievskiy  writes:

> On 07.03.24 12:46, Markus Armbruster wrote:

[...]

>> I appreciate the attempt to curb the spread of DeviceNotFound errors.
>> Two issues:
>>
>> * Copy-pasting find_device_state() with a false argument is an easy
>>   error to make.
>>
>> * Most uses of find_device_state() are via blk_by_qdev_id() and
>>   qmp_get_blk().  Any new uses of qemu_get_blk() will still produce
>>   DeviceNotFound.
>>
>> Hmm.
>
> Hmm. Right. Wait a bit, I can make the change stricter.
>
> Could you still explain (or give a link), why and when we decided to use only 
> GenericError? I think, having different "error-codes" is a good thing, why we 
> are trying to get rid of it?

We actually got rid of most of them years ago :)

But you deserve a more complete answer.

QMP initially specified the following error response[1]:

2.4.2 error
---

The error response is issued when the command execution could not be
completed because of an error condition.

The format is:

{ "error": { "class": json-string, "data": json-value }, "id": json-value }

 Where,

- The "class" member contains the error class name (eg. 
"ServiceUnavailable")
- The "data" member contains specific error data and is defined in a
  per-command basis, it will be an empty json-object if the error has no 
data
- The "id" member contains the transaction identification associated with
  the command execution (if issued by the Client)

Note the structure of @data depends on @class.  We documented a
command's possible error classes (well, we tried), but never bothered to
document the @data it comes with.

Documentation deficits aside, this is looks quite expressive.  There are
issues, though:

1. Formatting errors in human-readable form is bothersome, and creates a
   tight coupling between QMP server and client.

   Consider:

{"class": "DeviceNotFound", "data": {"device": "ide1-cd0"}}

   To format this in human-readable form, you need to know the error.

   The server does.  Fine print: it has a table mapping JSON templates
   to human-readable error message templates.

   The client needs to duplicate this somehow.  If it receives an error
   it doesn't know, all it can do is barf (possibly dressed up) JSON at
   the human.  To avoid that, clients need to track the server closely:
   tight coupling.

2. Errors have notational overhead, which leads to bad errors.

   To create a new error, you have to edit two source files (not
   counting clients).  Strong incentive to reuse existing errors.  Even
   when they don't quite fit.  When a name provided by the user couldn't
   be resolved, reusing DeviceNotFound is easier than creating a new
   error that is more precise.

3. The human-readable error message is hidden from the programmer's
   view, which leads to bad error messages.

   At the point in the source where the error is created, we see
   something like QERR_DEVICE_NOT_FOUND, name.  To see the
   human-readable message, we have to look up macro
   QERR_DEVICE_NOT_FOUND's error message template in the table, or
   actually test (*gasp*) the error.  People generally do neither, and
   bad error messages proliferate.

4. Too little gain for the pain

   Clients rarely want to handle different errors differently.  More
   often than not, all they do with @class and @data is log them.  Only
   occasionally do they switch on @class, and basically never on @data.

It me took a considerable fight to get the protocol amended to include a
human-readable message[2].  This addressed issue 1.

Over the next two years or so, issues 2. to 4. slowly sank in.  We
eventually tired of the complexity, ripped out @data, and dumbed down
all error classes to GenericError, except for the few clients actually
cared for[3].  We also mandated that new errors avoid the QERR_ macros.

Eliminating the existing QERR_ macros has been slow.  We're down to 13
in master, with patches deleting 7 on the list.

This has served us reasonably well.

Questions?


[1] Commit f544d174dfc
QMP: Introduce specification
Dec 2009

[2] Commit 77e595e7c61q
QMP: add human-readable description to error response
Dec 2009

[3] Commit de253f14912
qmp: switch to the new error format on the wire
Aug 2012




Re: [PATCH v2 3/6] qdev-monitor: add option to report GenericError from find_device_state

2024-03-15 Thread Vladimir Sementsov-Ogievskiy

On 15.03.24 15:51, Markus Armbruster wrote:

Sorry for the late answer.

Vladimir Sementsov-Ogievskiy  writes:


On 07.03.24 12:46, Markus Armbruster wrote:


[...]


I appreciate the attempt to curb the spread of DeviceNotFound errors.
Two issues:

* Copy-pasting find_device_state() with a false argument is an easy
   error to make.

* Most uses of find_device_state() are via blk_by_qdev_id() and
   qmp_get_blk().  Any new uses of qemu_get_blk() will still produce
   DeviceNotFound.

Hmm.


Hmm. Right. Wait a bit, I can make the change stricter.

Could you still explain (or give a link), why and when we decided to use only 
GenericError? I think, having different "error-codes" is a good thing, why we 
are trying to get rid of it?


We actually got rid of most of them years ago :)

But you deserve a more complete answer.

QMP initially specified the following error response[1]:

 2.4.2 error
 ---
 
 The error response is issued when the command execution could not be

 completed because of an error condition.
 
 The format is:
 
 { "error": { "class": json-string, "data": json-value }, "id": json-value }
 
  Where,
 
 - The "class" member contains the error class name (eg. "ServiceUnavailable")

 - The "data" member contains specific error data and is defined in a
   per-command basis, it will be an empty json-object if the error has no 
data
 - The "id" member contains the transaction identification associated with
   the command execution (if issued by the Client)

Note the structure of @data depends on @class.  We documented a
command's possible error classes (well, we tried), but never bothered to
document the @data it comes with.

Documentation deficits aside, this is looks quite expressive.  There are
issues, though:

1. Formatting errors in human-readable form is bothersome, and creates a
tight coupling between QMP server and client.

Consider:

 {"class": "DeviceNotFound", "data": {"device": "ide1-cd0"}}

To format this in human-readable form, you need to know the error.

The server does.  Fine print: it has a table mapping JSON templates
to human-readable error message templates.

The client needs to duplicate this somehow.  If it receives an error
it doesn't know, all it can do is barf (possibly dressed up) JSON at
the human.  To avoid that, clients need to track the server closely:
tight coupling.

2. Errors have notational overhead, which leads to bad errors.

To create a new error, you have to edit two source files (not
counting clients).  Strong incentive to reuse existing errors.  Even
when they don't quite fit.  When a name provided by the user couldn't
be resolved, reusing DeviceNotFound is easier than creating a new
error that is more precise.

3. The human-readable error message is hidden from the programmer's
view, which leads to bad error messages.

At the point in the source where the error is created, we see
something like QERR_DEVICE_NOT_FOUND, name.  To see the
human-readable message, we have to look up macro
QERR_DEVICE_NOT_FOUND's error message template in the table, or
actually test (*gasp*) the error.  People generally do neither, and
bad error messages proliferate.

4. Too little gain for the pain

Clients rarely want to handle different errors differently.  More
often than not, all they do with @class and @data is log them.  Only
occasionally do they switch on @class, and basically never on @data.

It me took a considerable fight to get the protocol amended to include a
human-readable message[2].  This addressed issue 1.

Over the next two years or so, issues 2. to 4. slowly sank in.  We
eventually tired of the complexity, ripped out @data, and dumbed down
all error classes to GenericError, except for the few clients actually
cared for[3].  We also mandated that new errors avoid the QERR_ macros.

Eliminating the existing QERR_ macros has been slow.  We're down to 13
in master, with patches deleting 7 on the list.

This has served us reasonably well.

Questions?


[1] Commit f544d174dfc
 QMP: Introduce specification
 Dec 2009

[2] Commit 77e595e7c61q
 QMP: add human-readable description to error response
 Dec 2009

[3] Commit de253f14912
 qmp: switch to the new error format on the wire
 Aug 2012



Thanks for full-picture story! Now I'm all for it.

--
Best regards,
Vladimir




Re: [PATCH v3] blockcommit: Reopen base image as RO after abort

2024-03-15 Thread Vladimir Sementsov-Ogievskiy

On 15.03.24 12:55, Alexander Ivanov wrote:



On 2/28/24 17:48, Vladimir Sementsov-Ogievskiy wrote:

On 09.02.24 15:29, Alexander Ivanov wrote:

Could you please review the patch?


Sorry for long delay.

Honestly, I don't like refcnt in block-driver. It violate incapsulation, refcnt 
is interal thing of common block layer. And actually, you can't make any 
assumptions from value of refcnt, as you don't know which additional parents 
were created and why, and when they are going unref their children.

Hmmm... Maybe I can just exclude refcnt check from the condition, can't I. If 
BDS will be removed it doesn't matter if we make it RO. What do you think?


Sounds good. I even don't see, why you need bdrv_chain_has_significant_parent() 
check. We just roll-back ro->rw transition on failure case, isn't just always 
correct thing to do?



What was wrong with v2?

My bad, it seems, I didn't send v2 before I decided to change the patch.


Hmm, somehow, I don't have it in my mailbox, but here it is: 
https://patchew.org/QEMU/20240109093128.157460-1-alexander.iva...@virtuozzo.com/


===

More: in commit message you say about failure case. And it seems OK to roll-back 
ro->rw transition on failure, if we did it. But mirror_exit_common() called on 
success path too. I think, on success patch, we should do any additional 
reopenings?

--
Best regards,
Vladimir




[PATCH for-9.0 v2] vdpa-dev: Fix initialisation order to restore VDUSE compatibility

2024-03-15 Thread Kevin Wolf
VDUSE requires that virtqueues are first enabled before the DRIVER_OK
status flag is set; with the current API of the kernel module, it is
impossible to enable the opposite order in our block export code because
userspace is not notified when a virtqueue is enabled.

This requirement also mathces the normal initialisation order as done by
the generic vhost code in QEMU. However, commit 6c482547 accidentally
changed the order for vdpa-dev and broke access to VDUSE devices with
this.

This changes vdpa-dev to use the normal order again and use the standard
vhost callback .vhost_set_vring_enable for this. VDUSE devices can be
used with vdpa-dev again after this fix.

vhost_net intentionally avoided enabling the vrings for vdpa and does
this manually later while it does enable them for other vhost backends.
Reflect this in the vhost_net code and return early for vdpa, so that
the behaviour doesn't change for this device.

Cc: qemu-sta...@nongnu.org
Fixes: 6c4825476a4351530bcac17abab72295b75ffe98
Signed-off-by: Kevin Wolf 
---
v2:
- Actually make use of the @enable parameter
- Change vhost_net to preserve the current behaviour

 hw/net/vhost_net.c | 10 ++
 hw/virtio/vdpa-dev.c   |  5 +
 hw/virtio/vhost-vdpa.c | 27 +--
 hw/virtio/vhost.c  |  8 +++-
 4 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index e8e1661646..fd1a93701a 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -541,6 +541,16 @@ int vhost_set_vring_enable(NetClientState *nc, int enable)
 VHostNetState *net = get_vhost_net(nc);
 const VhostOps *vhost_ops = net->dev.vhost_ops;
 
+/*
+ * vhost-vdpa network devices need to enable dataplane virtqueues after
+ * DRIVER_OK, so they can recover device state before starting dataplane.
+ * Because of that, we don't enable virtqueues here and leave it to
+ * net/vhost-vdpa.c.
+ */
+if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
+return 0;
+}
+
 nc->vring_enable = enable;
 
 if (vhost_ops && vhost_ops->vhost_set_vring_enable) {
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index eb9ecea83b..13e87f06f6 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -253,14 +253,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, 
Error **errp)
 
 s->dev.acked_features = vdev->guest_features;
 
-ret = vhost_dev_start(&s->dev, vdev, false);
+ret = vhost_dev_start(&s->dev, vdev, true);
 if (ret < 0) {
 error_setg_errno(errp, -ret, "Error starting vhost");
 goto err_guest_notifiers;
 }
-for (i = 0; i < s->dev.nvqs; ++i) {
-vhost_vdpa_set_vring_ready(&s->vdpa, i);
-}
 s->started = true;
 
 /*
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index ddae494ca8..401afac2f5 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -886,12 +886,13 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, 
int idx)
 return idx;
 }
 
-int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx,
+   int enable)
 {
 struct vhost_dev *dev = v->dev;
 struct vhost_vring_state state = {
 .index = idx,
-.num = 1,
+.num = enable,
 };
 int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
 
@@ -899,6 +900,27 @@ int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, 
unsigned idx)
 return r;
 }
 
+static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable)
+{
+struct vhost_vdpa *v = dev->opaque;
+unsigned int i;
+int ret;
+
+for (i = 0; i < dev->nvqs; ++i) {
+ret = vhost_vdpa_set_vring_enable_one(v, i, enable);
+if (ret < 0) {
+return ret;
+}
+}
+
+return 0;
+}
+
+int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+{
+return vhost_vdpa_set_vring_enable_one(v, idx, 1);
+}
+
 static int vhost_vdpa_set_config_call(struct vhost_dev *dev,
int fd)
 {
@@ -1514,6 +1536,7 @@ const VhostOps vdpa_ops = {
 .vhost_set_features = vhost_vdpa_set_features,
 .vhost_reset_device = vhost_vdpa_reset_device,
 .vhost_get_vq_index = vhost_vdpa_get_vq_index,
+.vhost_set_vring_enable = vhost_vdpa_set_vring_enable,
 .vhost_get_config  = vhost_vdpa_get_config,
 .vhost_set_config = vhost_vdpa_set_config,
 .vhost_requires_shm_log = NULL,
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 2c9ac79468..decfb85184 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1984,7 +1984,13 @@ static int vhost_dev_set_vring_enable(struct vhost_dev 
*hdev, int enable)
 return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
 }
 
-/* Host notifiers must be enabled at this point. */
+/*
+ * Host notifiers must be

Re: [PATCH for-9.0 v2] vdpa-dev: Fix initialisation order to restore VDUSE compatibility

2024-03-15 Thread Stefano Garzarella

On Fri, Mar 15, 2024 at 03:03:31PM +0100, Kevin Wolf wrote:

VDUSE requires that virtqueues are first enabled before the DRIVER_OK
status flag is set; with the current API of the kernel module, it is
impossible to enable the opposite order in our block export code because
userspace is not notified when a virtqueue is enabled.

This requirement also mathces the normal initialisation order as done by
the generic vhost code in QEMU. However, commit 6c482547 accidentally
changed the order for vdpa-dev and broke access to VDUSE devices with
this.

This changes vdpa-dev to use the normal order again and use the standard
vhost callback .vhost_set_vring_enable for this. VDUSE devices can be
used with vdpa-dev again after this fix.

vhost_net intentionally avoided enabling the vrings for vdpa and does
this manually later while it does enable them for other vhost backends.
Reflect this in the vhost_net code and return early for vdpa, so that
the behaviour doesn't change for this device.

Cc: qemu-sta...@nongnu.org
Fixes: 6c4825476a4351530bcac17abab72295b75ffe98
Signed-off-by: Kevin Wolf 
---
v2:
- Actually make use of the @enable parameter
- Change vhost_net to preserve the current behaviour

hw/net/vhost_net.c | 10 ++
hw/virtio/vdpa-dev.c   |  5 +
hw/virtio/vhost-vdpa.c | 27 +--
hw/virtio/vhost.c  |  8 +++-
4 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index e8e1661646..fd1a93701a 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -541,6 +541,16 @@ int vhost_set_vring_enable(NetClientState *nc, int enable)
VHostNetState *net = get_vhost_net(nc);
const VhostOps *vhost_ops = net->dev.vhost_ops;

+/*
+ * vhost-vdpa network devices need to enable dataplane virtqueues after
+ * DRIVER_OK, so they can recover device state before starting dataplane.
+ * Because of that, we don't enable virtqueues here and leave it to
+ * net/vhost-vdpa.c.
+ */
+if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
+return 0;
+}
+
nc->vring_enable = enable;

if (vhost_ops && vhost_ops->vhost_set_vring_enable) {
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index eb9ecea83b..13e87f06f6 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -253,14 +253,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, 
Error **errp)

s->dev.acked_features = vdev->guest_features;

-ret = vhost_dev_start(&s->dev, vdev, false);
+ret = vhost_dev_start(&s->dev, vdev, true);
if (ret < 0) {
error_setg_errno(errp, -ret, "Error starting vhost");
goto err_guest_notifiers;
}
-for (i = 0; i < s->dev.nvqs; ++i) {
-vhost_vdpa_set_vring_ready(&s->vdpa, i);
-}
s->started = true;

/*
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index ddae494ca8..401afac2f5 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -886,12 +886,13 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, 
int idx)
return idx;
}

-int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx,
+   int enable)
{
struct vhost_dev *dev = v->dev;
struct vhost_vring_state state = {
.index = idx,
-.num = 1,
+.num = enable,
};
int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);



After this line we now have:

  trace_vhost_vdpa_set_vring_ready(dev, idx, r);

Should we rename it or move it to the new function?

If we rename it, we should trace also `enable`.

@@ -899,6 +900,27 @@ int vhost_vdpa_set_vring_ready(struct vhost_vdpa 
*v, unsigned idx)

return r;
}

+static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable)
+{
+struct vhost_vdpa *v = dev->opaque;
+unsigned int i;
+int ret;
+
+for (i = 0; i < dev->nvqs; ++i) {
+ret = vhost_vdpa_set_vring_enable_one(v, i, enable);
+if (ret < 0) {
+return ret;
+}
+}
+
+return 0;
+}
+
+int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+{
+return vhost_vdpa_set_vring_enable_one(v, idx, 1);
+}
+
static int vhost_vdpa_set_config_call(struct vhost_dev *dev,
   int fd)
{
@@ -1514,6 +1536,7 @@ const VhostOps vdpa_ops = {
.vhost_set_features = vhost_vdpa_set_features,
.vhost_reset_device = vhost_vdpa_reset_device,
.vhost_get_vq_index = vhost_vdpa_get_vq_index,
+.vhost_set_vring_enable = vhost_vdpa_set_vring_enable,
.vhost_get_config  = vhost_vdpa_get_config,
.vhost_set_config = vhost_vdpa_set_config,
.vhost_requires_shm_log = NULL,
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 2c9ac79468..decfb85184 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1984,7 +1984,13 @@ static int vhost_dev_s

Re: [PATCH for-9.0 v2] vdpa-dev: Fix initialisation order to restore VDUSE compatibility

2024-03-15 Thread Kevin Wolf
Am 15.03.2024 um 16:07 hat Stefano Garzarella geschrieben:
> On Fri, Mar 15, 2024 at 03:03:31PM +0100, Kevin Wolf wrote:
> > VDUSE requires that virtqueues are first enabled before the DRIVER_OK
> > status flag is set; with the current API of the kernel module, it is
> > impossible to enable the opposite order in our block export code because
> > userspace is not notified when a virtqueue is enabled.
> > 
> > This requirement also mathces the normal initialisation order as done by
> > the generic vhost code in QEMU. However, commit 6c482547 accidentally
> > changed the order for vdpa-dev and broke access to VDUSE devices with
> > this.
> > 
> > This changes vdpa-dev to use the normal order again and use the standard
> > vhost callback .vhost_set_vring_enable for this. VDUSE devices can be
> > used with vdpa-dev again after this fix.
> > 
> > vhost_net intentionally avoided enabling the vrings for vdpa and does
> > this manually later while it does enable them for other vhost backends.
> > Reflect this in the vhost_net code and return early for vdpa, so that
> > the behaviour doesn't change for this device.
> > 
> > Cc: qemu-sta...@nongnu.org
> > Fixes: 6c4825476a4351530bcac17abab72295b75ffe98
> > Signed-off-by: Kevin Wolf 
> > ---
> > v2:
> > - Actually make use of the @enable parameter
> > - Change vhost_net to preserve the current behaviour
> > 
> > hw/net/vhost_net.c | 10 ++
> > hw/virtio/vdpa-dev.c   |  5 +
> > hw/virtio/vhost-vdpa.c | 27 +--
> > hw/virtio/vhost.c  |  8 +++-
> > 4 files changed, 43 insertions(+), 7 deletions(-)
> > 
> > diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
> > index e8e1661646..fd1a93701a 100644
> > --- a/hw/net/vhost_net.c
> > +++ b/hw/net/vhost_net.c
> > @@ -541,6 +541,16 @@ int vhost_set_vring_enable(NetClientState *nc, int 
> > enable)
> > VHostNetState *net = get_vhost_net(nc);
> > const VhostOps *vhost_ops = net->dev.vhost_ops;
> > 
> > +/*
> > + * vhost-vdpa network devices need to enable dataplane virtqueues after
> > + * DRIVER_OK, so they can recover device state before starting 
> > dataplane.
> > + * Because of that, we don't enable virtqueues here and leave it to
> > + * net/vhost-vdpa.c.
> > + */
> > +if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
> > +return 0;
> > +}
> > +
> > nc->vring_enable = enable;
> > 
> > if (vhost_ops && vhost_ops->vhost_set_vring_enable) {
> > diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
> > index eb9ecea83b..13e87f06f6 100644
> > --- a/hw/virtio/vdpa-dev.c
> > +++ b/hw/virtio/vdpa-dev.c
> > @@ -253,14 +253,11 @@ static int vhost_vdpa_device_start(VirtIODevice 
> > *vdev, Error **errp)
> > 
> > s->dev.acked_features = vdev->guest_features;
> > 
> > -ret = vhost_dev_start(&s->dev, vdev, false);
> > +ret = vhost_dev_start(&s->dev, vdev, true);
> > if (ret < 0) {
> > error_setg_errno(errp, -ret, "Error starting vhost");
> > goto err_guest_notifiers;
> > }
> > -for (i = 0; i < s->dev.nvqs; ++i) {
> > -vhost_vdpa_set_vring_ready(&s->vdpa, i);
> > -}
> > s->started = true;
> > 
> > /*
> > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > index ddae494ca8..401afac2f5 100644
> > --- a/hw/virtio/vhost-vdpa.c
> > +++ b/hw/virtio/vhost-vdpa.c
> > @@ -886,12 +886,13 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev 
> > *dev, int idx)
> > return idx;
> > }
> > 
> > -int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
> > +static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned 
> > idx,
> > +   int enable)
> > {
> > struct vhost_dev *dev = v->dev;
> > struct vhost_vring_state state = {
> > .index = idx,
> > -.num = 1,
> > +.num = enable,
> > };
> > int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
> > 
> 
> After this line we now have:
> 
>   trace_vhost_vdpa_set_vring_ready(dev, idx, r);
> 
> Should we rename it or move it to the new function?
> 
> If we rename it, we should trace also `enable`.

I think renaming is better so that we cover all code paths that enable a
vring. I'll change this and send a v3.

> > @@ -899,6 +900,27 @@ int vhost_vdpa_set_vring_ready(struct vhost_vdpa
> > *v, unsigned idx)
> > return r;
> > }
> > 
> > +static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable)
> > +{
> > +struct vhost_vdpa *v = dev->opaque;
> > +unsigned int i;
> > +int ret;
> > +
> > +for (i = 0; i < dev->nvqs; ++i) {
> > +ret = vhost_vdpa_set_vring_enable_one(v, i, enable);
> > +if (ret < 0) {
> > +return ret;
> > +}
> > +}
> > +
> > +return 0;
> > +}
> > +
> > +int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
> > +{
> > +return vhost_vdpa_set_vring_enable_one(v, idx, 1);
> > +}
> > +
> > static int vhost_vdp

[PATCH for-9.0 v3] vdpa-dev: Fix initialisation order to restore VDUSE compatibility

2024-03-15 Thread Kevin Wolf
VDUSE requires that virtqueues are first enabled before the DRIVER_OK
status flag is set; with the current API of the kernel module, it is
impossible to enable the opposite order in our block export code because
userspace is not notified when a virtqueue is enabled.

This requirement also mathces the normal initialisation order as done by
the generic vhost code in QEMU. However, commit 6c482547 accidentally
changed the order for vdpa-dev and broke access to VDUSE devices with
this.

This changes vdpa-dev to use the normal order again and use the standard
vhost callback .vhost_set_vring_enable for this. VDUSE devices can be
used with vdpa-dev again after this fix.

vhost_net intentionally avoided enabling the vrings for vdpa and does
this manually later while it does enable them for other vhost backends.
Reflect this in the vhost_net code and return early for vdpa, so that
the behaviour doesn't change for this device.

Cc: qemu-sta...@nongnu.org
Fixes: 6c4825476a4351530bcac17abab72295b75ffe98
Signed-off-by: Kevin Wolf 
---
v2:
- Actually make use of the @enable parameter
- Change vhost_net to preserve the current behaviour

v3:
- Updated trace point [Stefano]
- Fixed typo in comment [Stefano]

 hw/net/vhost_net.c | 10 ++
 hw/virtio/vdpa-dev.c   |  5 +
 hw/virtio/vhost-vdpa.c | 29 ++---
 hw/virtio/vhost.c  |  8 +++-
 hw/virtio/trace-events |  2 +-
 5 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index e8e1661646..fd1a93701a 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -541,6 +541,16 @@ int vhost_set_vring_enable(NetClientState *nc, int enable)
 VHostNetState *net = get_vhost_net(nc);
 const VhostOps *vhost_ops = net->dev.vhost_ops;
 
+/*
+ * vhost-vdpa network devices need to enable dataplane virtqueues after
+ * DRIVER_OK, so they can recover device state before starting dataplane.
+ * Because of that, we don't enable virtqueues here and leave it to
+ * net/vhost-vdpa.c.
+ */
+if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
+return 0;
+}
+
 nc->vring_enable = enable;
 
 if (vhost_ops && vhost_ops->vhost_set_vring_enable) {
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index eb9ecea83b..13e87f06f6 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -253,14 +253,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, 
Error **errp)
 
 s->dev.acked_features = vdev->guest_features;
 
-ret = vhost_dev_start(&s->dev, vdev, false);
+ret = vhost_dev_start(&s->dev, vdev, true);
 if (ret < 0) {
 error_setg_errno(errp, -ret, "Error starting vhost");
 goto err_guest_notifiers;
 }
-for (i = 0; i < s->dev.nvqs; ++i) {
-vhost_vdpa_set_vring_ready(&s->vdpa, i);
-}
 s->started = true;
 
 /*
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index ddae494ca8..31453466af 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -886,19 +886,41 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, 
int idx)
 return idx;
 }
 
-int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx,
+   int enable)
 {
 struct vhost_dev *dev = v->dev;
 struct vhost_vring_state state = {
 .index = idx,
-.num = 1,
+.num = enable,
 };
 int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
 
-trace_vhost_vdpa_set_vring_ready(dev, idx, r);
+trace_vhost_vdpa_set_vring_enable_one(dev, idx, enable, r);
 return r;
 }
 
+static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable)
+{
+struct vhost_vdpa *v = dev->opaque;
+unsigned int i;
+int ret;
+
+for (i = 0; i < dev->nvqs; ++i) {
+ret = vhost_vdpa_set_vring_enable_one(v, i, enable);
+if (ret < 0) {
+return ret;
+}
+}
+
+return 0;
+}
+
+int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+{
+return vhost_vdpa_set_vring_enable_one(v, idx, 1);
+}
+
 static int vhost_vdpa_set_config_call(struct vhost_dev *dev,
int fd)
 {
@@ -1514,6 +1536,7 @@ const VhostOps vdpa_ops = {
 .vhost_set_features = vhost_vdpa_set_features,
 .vhost_reset_device = vhost_vdpa_reset_device,
 .vhost_get_vq_index = vhost_vdpa_get_vq_index,
+.vhost_set_vring_enable = vhost_vdpa_set_vring_enable,
 .vhost_get_config  = vhost_vdpa_get_config,
 .vhost_set_config = vhost_vdpa_set_config,
 .vhost_requires_shm_log = NULL,
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 2c9ac79468..a66186 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1984,7 +1984,13 @@ static int vhost_dev_set_vring_enable(struct vhost_dev 
*hdev, int enable)
 return hdev->vhos

[PATCH v3 for 9.1 5/6] vhost/vhost-user: Add VIRTIO_F_NOTIFICATION_DATA to vhost feature bits

2024-03-15 Thread Jonah Palmer
Add support for the VIRTIO_F_NOTIFICATION_DATA feature across a variety
of vhost devices.

The inclusion of VIRTIO_F_NOTIFICATION_DATA in the feature bits arrays
for these devices ensures that the backend is capable of offering and
providing support for this feature, and that it can be disabled if the
backend does not support it.

Tested-by: Lei Yang 
Reviewed-by: Eugenio Pérez 
Signed-off-by: Jonah Palmer 
---
 hw/block/vhost-user-blk.c| 1 +
 hw/net/vhost_net.c   | 2 ++
 hw/scsi/vhost-scsi.c | 1 +
 hw/scsi/vhost-user-scsi.c| 1 +
 hw/virtio/vhost-user-fs.c| 2 +-
 hw/virtio/vhost-user-vsock.c | 1 +
 net/vhost-vdpa.c | 1 +
 7 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 6a856ad51a..983c0657da 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -51,6 +51,7 @@ static const int user_feature_bits[] = {
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_NOTIFICATION_DATA,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index e8e1661646..bb1f975b39 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -48,6 +48,7 @@ static const int kernel_feature_bits[] = {
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_NOTIFICATION_DATA,
 VIRTIO_NET_F_HASH_REPORT,
 VHOST_INVALID_FEATURE_BIT
 };
@@ -55,6 +56,7 @@ static const int kernel_feature_bits[] = {
 /* Features supported by others. */
 static const int user_feature_bits[] = {
 VIRTIO_F_NOTIFY_ON_EMPTY,
+VIRTIO_F_NOTIFICATION_DATA,
 VIRTIO_RING_F_INDIRECT_DESC,
 VIRTIO_RING_F_EVENT_IDX,
 
diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index ae26bc19a4..3d5fe0994d 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -38,6 +38,7 @@ static const int kernel_feature_bits[] = {
 VIRTIO_RING_F_EVENT_IDX,
 VIRTIO_SCSI_F_HOTPLUG,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_NOTIFICATION_DATA,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
index a63b1f4948..0b050805a8 100644
--- a/hw/scsi/vhost-user-scsi.c
+++ b/hw/scsi/vhost-user-scsi.c
@@ -36,6 +36,7 @@ static const int user_feature_bits[] = {
 VIRTIO_RING_F_EVENT_IDX,
 VIRTIO_SCSI_F_HOTPLUG,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_NOTIFICATION_DATA,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index cca2cd41be..ae48cc1c96 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -33,7 +33,7 @@ static const int user_feature_bits[] = {
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_RESET,
-
+VIRTIO_F_NOTIFICATION_DATA,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c
index 9431b9792c..802b44a07d 100644
--- a/hw/virtio/vhost-user-vsock.c
+++ b/hw/virtio/vhost-user-vsock.c
@@ -21,6 +21,7 @@ static const int user_feature_bits[] = {
 VIRTIO_RING_F_INDIRECT_DESC,
 VIRTIO_RING_F_EVENT_IDX,
 VIRTIO_F_NOTIFY_ON_EMPTY,
+VIRTIO_F_NOTIFICATION_DATA,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 2a9ddb4552..5583ce5279 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -61,6 +61,7 @@ const int vdpa_feature_bits[] = {
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_RING_RESET,
 VIRTIO_F_VERSION_1,
+VIRTIO_F_NOTIFICATION_DATA,
 VIRTIO_NET_F_CSUM,
 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
 VIRTIO_NET_F_CTRL_MAC_ADDR,
-- 
2.39.3




[PATCH v3 for 9.1 3/6] virtio-mmio: Handle extra notification data

2024-03-15 Thread Jonah Palmer
Add support to virtio-mmio devices for handling the extra data sent from
the driver to the device when the VIRTIO_F_NOTIFICATION_DATA transport
feature has been negotiated.

The extra data that's passed to the virtio-mmio device when this feature
is enabled varies depending on the device's virtqueue layout.

The data passed to the virtio-mmio device is in the same format as the
data passed to virtio-pci devices.

Signed-off-by: Jonah Palmer 
---
 hw/virtio/virtio-mmio.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c
index 22f9fbcf5a..003c363f0b 100644
--- a/hw/virtio/virtio-mmio.c
+++ b/hw/virtio/virtio-mmio.c
@@ -248,6 +248,7 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, 
uint64_t value,
 {
 VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque;
 VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+uint16_t vq_idx;
 
 trace_virtio_mmio_write_offset(offset, value);
 
@@ -407,8 +408,13 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, 
uint64_t value,
 }
 break;
 case VIRTIO_MMIO_QUEUE_NOTIFY:
-if (value < VIRTIO_QUEUE_MAX) {
-virtio_queue_notify(vdev, value);
+vq_idx = value;
+if (vq_idx < VIRTIO_QUEUE_MAX && virtio_queue_get_num(vdev, vq_idx)) {
+if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA)) {
+virtio_queue_set_shadow_avail_idx(virtio_get_queue(vdev, 
vq_idx),
+  (value >> 16) & 0x);
+}
+virtio_queue_notify(vdev, vq_idx);
 }
 break;
 case VIRTIO_MMIO_INTERRUPT_ACK:
-- 
2.39.3




[PATCH v3 for 9.1 4/6] virtio-ccw: Handle extra notification data

2024-03-15 Thread Jonah Palmer
Add support to virtio-ccw devices for handling the extra data sent from
the driver to the device when the VIRTIO_F_NOTIFICATION_DATA transport
feature has been negotiated.

The extra data that's passed to the virtio-ccw device when this feature
is enabled varies depending on the device's virtqueue layout.

That data passed to the virtio-ccw device is in the same format as the
data passed to virtio-pci devices.

Signed-off-by: Jonah Palmer 
---
 hw/s390x/s390-virtio-ccw.c | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index b1dcb3857f..b550adfc68 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -140,9 +140,11 @@ static void subsystem_reset(void)
 static int virtio_ccw_hcall_notify(const uint64_t *args)
 {
 uint64_t subch_id = args[0];
-uint64_t queue = args[1];
+uint64_t data = args[1];
 SubchDev *sch;
+VirtIODevice *vdev;
 int cssid, ssid, schid, m;
+uint16_t vq_idx = data;
 
 if (ioinst_disassemble_sch_ident(subch_id, &m, &cssid, &ssid, &schid)) {
 return -EINVAL;
@@ -151,12 +153,19 @@ static int virtio_ccw_hcall_notify(const uint64_t *args)
 if (!sch || !css_subch_visible(sch)) {
 return -EINVAL;
 }
-if (queue >= VIRTIO_QUEUE_MAX) {
+
+vdev = virtio_ccw_get_vdev(sch);
+if (vq_idx >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, vq_idx)) {
 return -EINVAL;
 }
-virtio_queue_notify(virtio_ccw_get_vdev(sch), queue);
-return 0;
 
+if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA)) {
+virtio_queue_set_shadow_avail_idx(virtio_get_queue(vdev, vq_idx),
+  (data >> 16) & 0x);
+}
+
+virtio_queue_notify(vdev, vq_idx);
+return 0;
 }
 
 static int virtio_ccw_hcall_early_printk(const uint64_t *args)
-- 
2.39.3




[PATCH v3 for 9.1 2/6] virtio: Prevent creation of device using notification-data with ioeventfd

2024-03-15 Thread Jonah Palmer
Prevent the realization of a virtio device that attempts to use the
VIRTIO_F_NOTIFICATION_DATA transport feature without disabling
ioeventfd.

Due to ioeventfd not being able to carry the extra data associated with
this feature, having both enabled is a functional mismatch and therefore
Qemu should not continue the device's realization process.

Although the device does not yet know if the feature will be
successfully negotiated, many devices using this feature wont actually
work without this extra data and would fail FEATURES_OK anyway.

If ioeventfd is able to work with the extra notification data in the
future, this compatibility check can be removed.

Signed-off-by: Jonah Palmer 
---
 hw/virtio/virtio.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 463426ca92..f9cb8d1e5c 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2971,6 +2971,20 @@ int virtio_set_features(VirtIODevice *vdev, uint64_t val)
 return ret;
 }
 
+static void virtio_device_check_notification_compatibility(VirtIODevice *vdev,
+   Error **errp)
+{
+VirtioBusState *bus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev)));
+VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus);
+DeviceState *proxy = DEVICE(BUS(bus)->parent);
+
+if (virtio_host_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA) &&
+k->ioeventfd_enabled(proxy)) {
+error_setg(errp,
+   "notification_data=on without ioeventfd=off is not 
supported");
+}
+}
+
 size_t virtio_get_config_size(const VirtIOConfigSizeParams *params,
   uint64_t host_features)
 {
@@ -3731,6 +3745,14 @@ static void virtio_device_realize(DeviceState *dev, 
Error **errp)
 }
 }
 
+/* Devices should not use both ioeventfd and notification data feature */
+virtio_device_check_notification_compatibility(vdev, &err);
+if (err != NULL) {
+error_propagate(errp, err);
+vdc->unrealize(dev);
+return;
+}
+
 virtio_bus_device_plugged(vdev, &err);
 if (err != NULL) {
 error_propagate(errp, err);
-- 
2.39.3




[PATCH v3 for 9.1 1/6] virtio/virtio-pci: Handle extra notification data

2024-03-15 Thread Jonah Palmer
Add support to virtio-pci devices for handling the extra data sent
from the driver to the device when the VIRTIO_F_NOTIFICATION_DATA
transport feature has been negotiated.

The extra data that's passed to the virtio-pci device when this
feature is enabled varies depending on the device's virtqueue
layout.

In a split virtqueue layout, this data includes:
 - upper 16 bits: shadow_avail_idx
 - lower 16 bits: virtqueue index

In a packed virtqueue layout, this data includes:
 - upper 16 bits: 1-bit wrap counter & 15-bit shadow_avail_idx
 - lower 16 bits: virtqueue index

Signed-off-by: Jonah Palmer 
---
 hw/virtio/virtio-pci.c | 11 ---
 hw/virtio/virtio.c | 18 ++
 include/hw/virtio/virtio.h |  2 ++
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index cb6940fc0e..f3e0a08f53 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -384,7 +384,7 @@ static void virtio_ioport_write(void *opaque, uint32_t 
addr, uint32_t val)
 {
 VirtIOPCIProxy *proxy = opaque;
 VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
-uint16_t vector;
+uint16_t vector, vq_idx;
 hwaddr pa;
 
 switch (addr) {
@@ -408,8 +408,13 @@ static void virtio_ioport_write(void *opaque, uint32_t 
addr, uint32_t val)
 vdev->queue_sel = val;
 break;
 case VIRTIO_PCI_QUEUE_NOTIFY:
-if (val < VIRTIO_QUEUE_MAX) {
-virtio_queue_notify(vdev, val);
+vq_idx = val;
+if (vq_idx < VIRTIO_QUEUE_MAX && virtio_queue_get_num(vdev, vq_idx)) {
+if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA)) {
+virtio_queue_set_shadow_avail_idx(virtio_get_queue(vdev, 
vq_idx),
+  val >> 16);
+}
+virtio_queue_notify(vdev, vq_idx);
 }
 break;
 case VIRTIO_PCI_STATUS:
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index d229755eae..463426ca92 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2255,6 +2255,24 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, 
int align)
 }
 }
 
+void virtio_queue_set_shadow_avail_idx(VirtQueue *vq, uint16_t 
shadow_avail_idx)
+{
+if (!vq->vring.desc) {
+return;
+}
+
+/*
+ * 16-bit data for packed VQs include 1-bit wrap counter and
+ * 15-bit shadow_avail_idx.
+ */
+if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+vq->shadow_avail_wrap_counter = (shadow_avail_idx >> 15) & 0x1;
+vq->shadow_avail_idx = shadow_avail_idx & 0x7FFF;
+} else {
+vq->shadow_avail_idx = shadow_avail_idx;
+}
+}
+
 static void virtio_queue_notify_vq(VirtQueue *vq)
 {
 if (vq->vring.desc && vq->handle_output) {
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index c8f72850bc..cdd4f86b61 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -306,6 +306,8 @@ int virtio_queue_ready(VirtQueue *vq);
 
 int virtio_queue_empty(VirtQueue *vq);
 
+void virtio_queue_set_shadow_avail_idx(VirtQueue *vq, uint16_t idx);
+
 /* Host binding interface.  */
 
 uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr);
-- 
2.39.3




[PATCH v3 for 9.1 6/6] virtio: Add VIRTIO_F_NOTIFICATION_DATA property definition

2024-03-15 Thread Jonah Palmer
Extend the virtio device property definitions to include the
VIRTIO_F_NOTIFICATION_DATA feature.

The default state of this feature is disabled, allowing it to be
explicitly enabled where it's supported.

Tested-by: Lei Yang 
Reviewed-by: Eugenio Pérez 
Signed-off-by: Jonah Palmer 
---
 include/hw/virtio/virtio.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index cdd4f86b61..14858c0924 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -370,7 +370,9 @@ typedef struct VirtIORNGConf VirtIORNGConf;
 DEFINE_PROP_BIT64("packed", _state, _field, \
   VIRTIO_F_RING_PACKED, false), \
 DEFINE_PROP_BIT64("queue_reset", _state, _field, \
-  VIRTIO_F_RING_RESET, true)
+  VIRTIO_F_RING_RESET, true), \
+DEFINE_PROP_BIT64("notification_data", _state, _field, \
+  VIRTIO_F_NOTIFICATION_DATA, false)
 
 hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n);
 bool virtio_queue_enabled_legacy(VirtIODevice *vdev, int n);
-- 
2.39.3




[PATCH v3 for 9.1 0/6] virtio, vhost: Add VIRTIO_F_NOTIFICATION_DATA support

2024-03-15 Thread Jonah Palmer
The goal of these patches are to add support to a variety of virtio and
vhost devices for the VIRTIO_F_NOTIFICATION_DATA transport feature. This
feature indicates that a driver will pass extra data (instead of just a
virtqueue's index) when notifying the corresponding device.

The data passed in by the driver when this feature is enabled varies in
format depending on if the device is using a split or packed virtqueue
layout:

 Split VQ
  - Upper 16 bits: shadow_avail_idx
  - Lower 16 bits: virtqueue index

 Packed VQ
  - Upper 16 bits: 1-bit wrap counter & 15-bit shadow_avail_idx
  - Lower 16 bits: virtqueue index

Also, due to the limitations of ioeventfd not being able to carry the
extra provided by the driver, having both VIRTIO_F_NOTIFICATION_DATA
feature and ioeventfd enabled is a functional mismatch. The user must
explicitly disable ioeventfd for the device in the Qemu arguments when
using this feature, else the device will fail to complete realization.

For example, a device must explicitly enable notification_data as well
as disable ioeventfd:

-device virtio-scsi-pci,...,ioeventfd=off,notification_data=on

A significant aspect of this effort has been to maintain compatibility
across different backends. As such, the feature is offered by backend
devices only when supported, with fallback mechanisms where backend
support is absent.

v3: Validate VQ idx via. virtio_queue_get_num() (pci, mmio, ccw)
Rename virtio_queue_set_shadow_avail_data
Only pass in upper 16 bits of 32-bit extra data (was redundant)
Make notification compatibility check function static
Drop tags on patches 1/6, 3/6, and 4/6

v2: Don't disable ioeventfd by default, user must disable it
Drop tags on patch 2/6

Jonah Palmer (6):
  virtio/virtio-pci: Handle extra notification data
  virtio: Prevent creation of device using notification-data with ioeventfd
  virtio-mmio: Handle extra notification data
  virtio-ccw: Handle extra notification data
  vhost/vhost-user: Add VIRTIO_F_NOTIFICATION_DATA to vhost feature bits
  virtio: Add VIRTIO_F_NOTIFICATION_DATA property definition

 hw/block/vhost-user-blk.c|  1 +
 hw/net/vhost_net.c   |  2 ++
 hw/s390x/s390-virtio-ccw.c   | 17 +++
 hw/scsi/vhost-scsi.c |  1 +
 hw/scsi/vhost-user-scsi.c|  1 +
 hw/virtio/vhost-user-fs.c|  2 +-
 hw/virtio/vhost-user-vsock.c |  1 +
 hw/virtio/virtio-mmio.c  | 10 +++--
 hw/virtio/virtio-pci.c   | 11 +++---
 hw/virtio/virtio.c   | 40 
 include/hw/virtio/virtio.h   |  6 +-
 net/vhost-vdpa.c |  1 +
 12 files changed, 82 insertions(+), 11 deletions(-)

-- 
2.39.3




Re: [PATCH for-9.0 v3] vdpa-dev: Fix initialisation order to restore VDUSE compatibility

2024-03-15 Thread Stefano Garzarella

On Fri, Mar 15, 2024 at 04:59:49PM +0100, Kevin Wolf wrote:

VDUSE requires that virtqueues are first enabled before the DRIVER_OK
status flag is set; with the current API of the kernel module, it is
impossible to enable the opposite order in our block export code because
userspace is not notified when a virtqueue is enabled.

This requirement also mathces the normal initialisation order as done by
the generic vhost code in QEMU. However, commit 6c482547 accidentally
changed the order for vdpa-dev and broke access to VDUSE devices with
this.

This changes vdpa-dev to use the normal order again and use the standard
vhost callback .vhost_set_vring_enable for this. VDUSE devices can be
used with vdpa-dev again after this fix.

vhost_net intentionally avoided enabling the vrings for vdpa and does
this manually later while it does enable them for other vhost backends.
Reflect this in the vhost_net code and return early for vdpa, so that
the behaviour doesn't change for this device.

Cc: qemu-sta...@nongnu.org
Fixes: 6c4825476a4351530bcac17abab72295b75ffe98
Signed-off-by: Kevin Wolf 
---
v2:
- Actually make use of the @enable parameter
- Change vhost_net to preserve the current behaviour

v3:
- Updated trace point [Stefano]
- Fixed typo in comment [Stefano]

hw/net/vhost_net.c | 10 ++
hw/virtio/vdpa-dev.c   |  5 +
hw/virtio/vhost-vdpa.c | 29 ++---
hw/virtio/vhost.c  |  8 +++-
hw/virtio/trace-events |  2 +-
5 files changed, 45 insertions(+), 9 deletions(-)


LGTM!

Reviewed-by: Stefano Garzarella 

Thanks,
Stefano



diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index e8e1661646..fd1a93701a 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -541,6 +541,16 @@ int vhost_set_vring_enable(NetClientState *nc, int enable)
VHostNetState *net = get_vhost_net(nc);
const VhostOps *vhost_ops = net->dev.vhost_ops;

+/*
+ * vhost-vdpa network devices need to enable dataplane virtqueues after
+ * DRIVER_OK, so they can recover device state before starting dataplane.
+ * Because of that, we don't enable virtqueues here and leave it to
+ * net/vhost-vdpa.c.
+ */
+if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
+return 0;
+}
+
nc->vring_enable = enable;

if (vhost_ops && vhost_ops->vhost_set_vring_enable) {
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index eb9ecea83b..13e87f06f6 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -253,14 +253,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, 
Error **errp)

s->dev.acked_features = vdev->guest_features;

-ret = vhost_dev_start(&s->dev, vdev, false);
+ret = vhost_dev_start(&s->dev, vdev, true);
if (ret < 0) {
error_setg_errno(errp, -ret, "Error starting vhost");
goto err_guest_notifiers;
}
-for (i = 0; i < s->dev.nvqs; ++i) {
-vhost_vdpa_set_vring_ready(&s->vdpa, i);
-}
s->started = true;

/*
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index ddae494ca8..31453466af 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -886,19 +886,41 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, 
int idx)
return idx;
}

-int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx,
+   int enable)
{
struct vhost_dev *dev = v->dev;
struct vhost_vring_state state = {
.index = idx,
-.num = 1,
+.num = enable,
};
int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);

-trace_vhost_vdpa_set_vring_ready(dev, idx, r);
+trace_vhost_vdpa_set_vring_enable_one(dev, idx, enable, r);
return r;
}

+static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable)
+{
+struct vhost_vdpa *v = dev->opaque;
+unsigned int i;
+int ret;
+
+for (i = 0; i < dev->nvqs; ++i) {
+ret = vhost_vdpa_set_vring_enable_one(v, i, enable);
+if (ret < 0) {
+return ret;
+}
+}
+
+return 0;
+}
+
+int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+{
+return vhost_vdpa_set_vring_enable_one(v, idx, 1);
+}
+
static int vhost_vdpa_set_config_call(struct vhost_dev *dev,
   int fd)
{
@@ -1514,6 +1536,7 @@ const VhostOps vdpa_ops = {
.vhost_set_features = vhost_vdpa_set_features,
.vhost_reset_device = vhost_vdpa_reset_device,
.vhost_get_vq_index = vhost_vdpa_get_vq_index,
+.vhost_set_vring_enable = vhost_vdpa_set_vring_enable,
.vhost_get_config  = vhost_vdpa_get_config,
.vhost_set_config = vhost_vdpa_set_config,
.vhost_requires_shm_log = NULL,
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 2c9ac79468..a66186 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1984,7 +1984,13 @@ static int vhost_d

Re: [PATCH v3 for 9.1 1/6] virtio/virtio-pci: Handle extra notification data

2024-03-15 Thread Eugenio Perez Martin
On Fri, Mar 15, 2024 at 5:57 PM Jonah Palmer  wrote:
>
> Add support to virtio-pci devices for handling the extra data sent
> from the driver to the device when the VIRTIO_F_NOTIFICATION_DATA
> transport feature has been negotiated.
>
> The extra data that's passed to the virtio-pci device when this
> feature is enabled varies depending on the device's virtqueue
> layout.
>
> In a split virtqueue layout, this data includes:
>  - upper 16 bits: shadow_avail_idx
>  - lower 16 bits: virtqueue index
>
> In a packed virtqueue layout, this data includes:
>  - upper 16 bits: 1-bit wrap counter & 15-bit shadow_avail_idx
>  - lower 16 bits: virtqueue index
>

Reviewed-by: Eugenio Pérez 

Thanks!

> Signed-off-by: Jonah Palmer 
> ---
>  hw/virtio/virtio-pci.c | 11 ---
>  hw/virtio/virtio.c | 18 ++
>  include/hw/virtio/virtio.h |  2 ++
>  3 files changed, 28 insertions(+), 3 deletions(-)
>
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index cb6940fc0e..f3e0a08f53 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -384,7 +384,7 @@ static void virtio_ioport_write(void *opaque, uint32_t 
> addr, uint32_t val)
>  {
>  VirtIOPCIProxy *proxy = opaque;
>  VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
> -uint16_t vector;
> +uint16_t vector, vq_idx;
>  hwaddr pa;
>
>  switch (addr) {
> @@ -408,8 +408,13 @@ static void virtio_ioport_write(void *opaque, uint32_t 
> addr, uint32_t val)
>  vdev->queue_sel = val;
>  break;
>  case VIRTIO_PCI_QUEUE_NOTIFY:
> -if (val < VIRTIO_QUEUE_MAX) {
> -virtio_queue_notify(vdev, val);
> +vq_idx = val;
> +if (vq_idx < VIRTIO_QUEUE_MAX && virtio_queue_get_num(vdev, vq_idx)) 
> {
> +if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA)) {
> +virtio_queue_set_shadow_avail_idx(virtio_get_queue(vdev, 
> vq_idx),
> +  val >> 16);
> +}
> +virtio_queue_notify(vdev, vq_idx);
>  }
>  break;
>  case VIRTIO_PCI_STATUS:
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index d229755eae..463426ca92 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -2255,6 +2255,24 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, 
> int align)
>  }
>  }
>
> +void virtio_queue_set_shadow_avail_idx(VirtQueue *vq, uint16_t 
> shadow_avail_idx)
> +{
> +if (!vq->vring.desc) {
> +return;
> +}
> +
> +/*
> + * 16-bit data for packed VQs include 1-bit wrap counter and
> + * 15-bit shadow_avail_idx.
> + */
> +if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
> +vq->shadow_avail_wrap_counter = (shadow_avail_idx >> 15) & 0x1;
> +vq->shadow_avail_idx = shadow_avail_idx & 0x7FFF;
> +} else {
> +vq->shadow_avail_idx = shadow_avail_idx;
> +}
> +}
> +
>  static void virtio_queue_notify_vq(VirtQueue *vq)
>  {
>  if (vq->vring.desc && vq->handle_output) {
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index c8f72850bc..cdd4f86b61 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -306,6 +306,8 @@ int virtio_queue_ready(VirtQueue *vq);
>
>  int virtio_queue_empty(VirtQueue *vq);
>
> +void virtio_queue_set_shadow_avail_idx(VirtQueue *vq, uint16_t idx);
> +
>  /* Host binding interface.  */
>
>  uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr);
> --
> 2.39.3
>