Re: [PATCH 1/5] vhost-user-blk: Don't reconnect during initialisation

2021-04-23 Thread Denis Plotnikov

Reviewed-by: Denis Plotnikov 

On 22.04.2021 20:02, Kevin Wolf wrote:

This is a partial revert of commits 77542d43149 and bc79c87bcde.

Usually, an error during initialisation means that the configuration was
wrong. Reconnecting won't make the error go away, but just turn the
error condition into an endless loop. Avoid this and return errors
again.

Additionally, calling vhost_user_blk_disconnect() from the chardev event
handler could result in use-after-free because none of the
initialisation code expects that the device could just go away in the
middle. So removing the call fixes crashes in several places.

For example, using a num-queues setting that is incompatible with the
backend would result in a crash like this (dereferencing dev->opaque,
which is already NULL):

  #0  0x55d0a4bd in vhost_user_read_cb (source=0x568f4690, 
condition=(G_IO_IN | G_IO_HUP), opaque=0x7fffcbf0) at 
../hw/virtio/vhost-user.c:313
  #1  0x55d950d3 in qio_channel_fd_source_dispatch (source=0x57c3f750, 
callback=0x55d0a478 , user_data=0x7fffcbf0) at 
../io/channel-watch.c:84
  #2  0x77b32a9f in g_main_context_dispatch () at 
/lib64/libglib-2.0.so.0
  #3  0x77b84a98 in g_main_context_iterate.constprop () at 
/lib64/libglib-2.0.so.0
  #4  0x77b32163 in g_main_loop_run () at /lib64/libglib-2.0.so.0
  #5  0x55d0a724 in vhost_user_read (dev=0x57bc62f8, 
msg=0x7fffcc50) at ../hw/virtio/vhost-user.c:402
  #6  0x55d0ee6b in vhost_user_get_config (dev=0x57bc62f8, 
config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost-user.c:2133
  #7  0x55d56d46 in vhost_dev_get_config (hdev=0x57bc62f8, 
config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost.c:1566
  #8  0x55cdd150 in vhost_user_blk_device_realize (dev=0x57bc60b0, 
errp=0x7fffcf90) at ../hw/block/vhost-user-blk.c:510
  #9  0x55d08f6d in virtio_device_realize (dev=0x57bc60b0, 
errp=0x7fffcff0) at ../hw/virtio/virtio.c:3660

Signed-off-by: Kevin Wolf 
---
  hw/block/vhost-user-blk.c | 54 ++-
  1 file changed, 13 insertions(+), 41 deletions(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index f5e9682703..e824b0a759 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -50,6 +50,8 @@ static const int user_feature_bits[] = {
  VHOST_INVALID_FEATURE_BIT
  };
  
+static void vhost_user_blk_event(void *opaque, QEMUChrEvent event);

+
  static void vhost_user_blk_update_config(VirtIODevice *vdev, uint8_t *config)
  {
  VHostUserBlk *s = VHOST_USER_BLK(vdev);
@@ -362,19 +364,6 @@ static void vhost_user_blk_disconnect(DeviceState *dev)
  vhost_dev_cleanup(&s->dev);
  }
  
-static void vhost_user_blk_event(void *opaque, QEMUChrEvent event,

- bool realized);
-
-static void vhost_user_blk_event_realize(void *opaque, QEMUChrEvent event)
-{
-vhost_user_blk_event(opaque, event, false);
-}
-
-static void vhost_user_blk_event_oper(void *opaque, QEMUChrEvent event)
-{
-vhost_user_blk_event(opaque, event, true);
-}
-
  static void vhost_user_blk_chr_closed_bh(void *opaque)
  {
  DeviceState *dev = opaque;
@@ -382,12 +371,11 @@ static void vhost_user_blk_chr_closed_bh(void *opaque)
  VHostUserBlk *s = VHOST_USER_BLK(vdev);
  
  vhost_user_blk_disconnect(dev);

-qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL,
-vhost_user_blk_event_oper, NULL, opaque, NULL, true);
+qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, vhost_user_blk_event,
+ NULL, opaque, NULL, true);
  }
  
-static void vhost_user_blk_event(void *opaque, QEMUChrEvent event,

- bool realized)
+static void vhost_user_blk_event(void *opaque, QEMUChrEvent event)
  {
  DeviceState *dev = opaque;
  VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -401,17 +389,7 @@ static void vhost_user_blk_event(void *opaque, 
QEMUChrEvent event,
  }
  break;
  case CHR_EVENT_CLOSED:
-/*
- * Closing the connection should happen differently on device
- * initialization and operation stages.
- * On initalization, we want to re-start vhost_dev initialization
- * from the very beginning right away when the connection is closed,
- * so we clean up vhost_dev on each connection closing.
- * On operation, we want to postpone vhost_dev cleanup to let the
- * other code perform its own cleanup sequence using vhost_dev data
- * (e.g. vhost_dev_set_log).
- */
-if (realized && !runstate_check(RUN_STATE_SHUTDOWN)) {
+if (!runstate_check(RUN_STATE_SHUTDOWN)) {
  /*
   * A close event may happen during a read/write, but vhost
   * code assumes the vhost_dev remains setup, so delay the
@@ -431,8 +409,6 @@ static void vhost_user_blk_event(void *opaque, QEMUChrE

Re: [PATCH v3 00/36] block: update graph permissions update

2021-04-23 Thread Kevin Wolf
Am 18.03.2021 um 09:04 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 17.03.2021 20:33, Eric Blake wrote:
> > On 3/17/21 10:38 AM, Vladimir Sementsov-Ogievskiy wrote:
> > 
> > > > 6/36 Checking commit 5780b805277e (block: drop ctx argument from
> > > > bdrv_root_attach_child)
> > > > 7/36 Checking commit 68189c099a3a (block: make bdrv_reopen_{prepare,
> > > > commit, abort} private)
> > > > ERROR: Author email address is mangled by the mailing list
> > > > #2:
> > > > Author: Vladimir Sementsov-Ogievskiy via 
> > > > 
> > > 
> > > Who know what is it? Commit message, subject and "From:" header are
> > > clean in the email..
> > 
> > The list mangles mails for setups where DKIM/SCP setups are strict
> > enough that the mail would be rejected by various recipients otherwise.
> > But I have no idea why the mailing list rewrote the headers for that one
> > mail, but not the rest of your series - usually, DKIM setups are
> > persistent enough that it will be an all-or-none conversion to the
> > entire series.
> > 
> > At any rate, a maintainer can manually fix it for one patch, or you can
> > resend (if the mailing list keeps mangling headers, you can add a 'From:
> > ' line in the body of your email that will override the mangled header;
> > but since the list doesn't usually mangle your headers, you may not need
> > to resort to that).
> > 
> 
> I'm looking at 7/36 in my mailbox, and don't see where is it mangled?

My primary copy is good, too, but that seems to be because you CCed me
directly, so it didn't even go through the list. My copy from qemu-devel
seems to be mangled, though. You can look at Patchew's here:

https://patchew.org/QEMU/20210317143529.615584-8-vsement...@virtuozzo.com/mbox

But that you CCed me means it's not a practical problem for this series.

Kevin




Re: [PATCH v2 1/2] block: Add BDRV_O_NO_SHARE for blk_new_open()

2021-04-23 Thread Vladimir Sementsov-Ogievskiy

22.04.2021 19:43, Kevin Wolf wrote:

Normally, blk_new_open() just shares all permissions. This was fine
originally when permissions only protected against uses in the same
process because no other part of the code would actually get to access
the block nodes opened with blk_new_open(). However, since we use it for
file locking now, unsharing permissions becomes desirable.

Add a new BDRV_O_NO_SHARE flag that is used in blk_new_open() to unshare
any permissions that can be unshared.

Signed-off-by: Kevin Wolf 
---
  include/block/block.h |  1 +
  block/block-backend.c | 19 +--
  2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index b3f6e509d4..735db05a39 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -101,6 +101,7 @@ typedef struct HDGeometry {
  uint32_t cylinders;
  } HDGeometry;
  
+#define BDRV_O_NO_SHARE0x0001 /* don't share permissons */

  #define BDRV_O_RDWR0x0002
  #define BDRV_O_RESIZE  0x0004 /* request permission for resizing the node 
*/
  #define BDRV_O_SNAPSHOT0x0008 /* open the file read only and save writes 
in a snapshot */
diff --git a/block/block-backend.c b/block/block-backend.c
index 413af51f3b..61a10ea860 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -398,15 +398,19 @@ BlockBackend *blk_new_open(const char *filename, const 
char *reference,
  BlockBackend *blk;
  BlockDriverState *bs;
  uint64_t perm = 0;
+uint64_t shared = BLK_PERM_ALL;
  
-/* blk_new_open() is mainly used in .bdrv_create implementations and the

- * tools where sharing isn't a concern because the BDS stays private, so we
- * just request permission according to the flags.
+/*
+ * blk_new_open() is mainly used in .bdrv_create implementations and the
+ * tools where sharing isn't a major concern because the BDS stays private
+ * and the file is generally not supposed to be used by a second process,
+ * so we just request permission according to the flags.
   *
   * The exceptions are xen_disk and blockdev_init(); in these cases, the
   * caller of blk_new_open() doesn't make use of the permissions, but they
   * shouldn't hurt either. We can still share everything here because the
- * guest devices will add their own blockers if they can't share. */
+ * guest devices will add their own blockers if they can't share.
+ */


Probably old comment "We can still share everything" is a bit in conflict with 
commit message (and new logic).


  if ((flags & BDRV_O_NO_IO) == 0) {
  perm |= BLK_PERM_CONSISTENT_READ;
  if (flags & BDRV_O_RDWR) {
@@ -416,8 +420,11 @@ BlockBackend *blk_new_open(const char *filename, const 
char *reference,
  if (flags & BDRV_O_RESIZE) {
  perm |= BLK_PERM_RESIZE;
  }
+if (flags & BDRV_O_NO_SHARE) {
+shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
+}
  
-blk = blk_new(qemu_get_aio_context(), perm, BLK_PERM_ALL);

+blk = blk_new(qemu_get_aio_context(), perm, shared);
  bs = bdrv_open(filename, reference, options, flags, errp);
  if (!bs) {
  blk_unref(blk);
@@ -426,7 +433,7 @@ BlockBackend *blk_new_open(const char *filename, const char 
*reference,
  
  blk->root = bdrv_root_attach_child(bs, "root", &child_root,

 BDRV_CHILD_FILTERED | 
BDRV_CHILD_PRIMARY,
-   blk->ctx, perm, BLK_PERM_ALL, blk, 
errp);
+   blk->ctx, perm, shared, blk, errp);
  if (!blk->root) {
  blk_unref(blk);
  return NULL;



Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir



Re: [PATCH v3 06/33] util/async: aio_co_schedule(): support reschedule in same ctx

2021-04-23 Thread Roman Kagan
On Fri, Apr 16, 2021 at 11:08:44AM +0300, Vladimir Sementsov-Ogievskiy wrote:
> With the following patch we want to call wake coroutine from thread.
> And it doesn't work with aio_co_wake:
> Assume we have no iothreads.
> Assume we have a coroutine A, which waits in the yield point for
> external aio_co_wake(), and no progress can be done until it happen.
> Main thread is in blocking aio_poll() (for example, in blk_read()).
> 
> Now, in a separate thread we do aio_co_wake(). It calls  aio_co_enter(),
> which goes through last "else" branch and do aio_context_acquire(ctx).
> 
> Now we have a deadlock, as aio_poll() will not release the context lock
> until some progress is done, and progress can't be done until
> aio_co_wake() wake the coroutine A. And it can't because it wait for
> aio_context_acquire().
> 
> Still, aio_co_schedule() works well in parallel with blocking
> aio_poll(). So we want use it. Let's add a possibility of rescheduling
> coroutine in same ctx where it was yield'ed.
> 
> Fetch co->ctx in same way as it is done in aio_co_wake().
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  include/block/aio.h | 2 +-
>  util/async.c| 8 
>  2 files changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/include/block/aio.h b/include/block/aio.h
> index 5f342267d5..744b695525 100644
> --- a/include/block/aio.h
> +++ b/include/block/aio.h
> @@ -643,7 +643,7 @@ static inline bool aio_node_check(AioContext *ctx, bool 
> is_external)
>  
>  /**
>   * aio_co_schedule:
> - * @ctx: the aio context
> + * @ctx: the aio context, if NULL, the current ctx of @co will be used.
>   * @co: the coroutine
>   *
>   * Start a coroutine on a remote AioContext.
> diff --git a/util/async.c b/util/async.c
> index 674dbefb7c..750be555c6 100644
> --- a/util/async.c
> +++ b/util/async.c
> @@ -545,6 +545,14 @@ fail:
>  
>  void aio_co_schedule(AioContext *ctx, Coroutine *co)
>  {
> +if (!ctx) {
> +/*
> + * Read coroutine before co->ctx.  Matches smp_wmb in
> + * qemu_coroutine_enter.
> + */
> +smp_read_barrier_depends();
> +ctx = qatomic_read(&co->ctx);
> +}

I'd rather not extend this interface, but add a new one on top.  And
document how it's different from aio_co_wake().

Roman.

>  trace_aio_co_schedule(ctx, co);
>  const char *scheduled = qatomic_cmpxchg(&co->scheduled, NULL,
> __func__);
> -- 
> 2.29.2
> 



Re: [PATCH for-6.0 v2 0/2] hw/block/nvme: fix msix uninit

2021-04-23 Thread Klaus Jensen

On Apr 23 07:21, Klaus Jensen wrote:

From: Klaus Jensen 

First patch fixes a regression where msix is not correctly uninit'ed
when an nvme device is hotplugged with device_del. When viewed in
conjunction with the commit that introduced the bug (commit
1901b4967c3f), I think the fix looks relatively obvious.

Second patch disables hotplugging for nvme controllers that are
connected to subsystems since the way namespaces are connected to the
nvme controller bus is messed up by removing the device. This bug causes
a segfault but is *not* a regression and is related to an experimental
feature.

v2:
 - remove memory subregion as well
 - add (possible) patch to disable hotplugging on subsystem connected
   controllers

Klaus Jensen (2):
 hw/block/nvme: fix invalid msix exclusive uninit
 hw/block/nvme: disable hotplugging for subsystem-linked controllers

hw/block/nvme.c | 7 ++-
1 file changed, 6 insertions(+), 1 deletion(-)

--
2.31.1




Peter,

I know you have a lot of crap on your plate right now, so for the 
record, yes, this is a regression, but not release critical, right?


I am not aware of anyone depending on this unplugging functionality 
(which according to Bug 1925496 is and have always been flaky) in 
production. Basically, as far as I know, all known uses of this device 
are for development and/or testing.


signature.asc
Description: PGP signature


Re: [PATCH for-6.0 v2 0/2] hw/block/nvme: fix msix uninit

2021-04-23 Thread Peter Maydell
On Fri, 23 Apr 2021 at 11:38, Klaus Jensen  wrote:
>
> On Apr 23 07:21, Klaus Jensen wrote:
> >From: Klaus Jensen 
> >
> >First patch fixes a regression where msix is not correctly uninit'ed
> >when an nvme device is hotplugged with device_del. When viewed in
> >conjunction with the commit that introduced the bug (commit
> >1901b4967c3f), I think the fix looks relatively obvious.
> >
> >Second patch disables hotplugging for nvme controllers that are
> >connected to subsystems since the way namespaces are connected to the
> >nvme controller bus is messed up by removing the device. This bug causes
> >a segfault but is *not* a regression and is related to an experimental
> >feature.

> I know you have a lot of crap on your plate right now, so for the
> record, yes, this is a regression, but not release critical, right?
>
> I am not aware of anyone depending on this unplugging functionality
> (which according to Bug 1925496 is and have always been flaky) in
> production. Basically, as far as I know, all known uses of this device
> are for development and/or testing.

Thanks for the update. We probably are going to have to roll an rc5
for the network-interface hotplug crash, but it sounds from your
description (especially if hotunplug has always been broken) as if
we could safely leave theses fixes until 6.1.

thanks
-- PMM



Re: [PATCH v3 01/36] tests/test-bdrv-graph-mod: add test_parallel_exclusive_write

2021-04-23 Thread Kevin Wolf
Am 17.03.2021 um 15:34 hat Vladimir Sementsov-Ogievskiy geschrieben:
> Add the test that shows that concept of ignore_children is incomplete.
> Actually, when we want to update something, ignoring permission of some
> existing BdrvChild, we should ignore also the propagated effect of this
> child to the other children. But that's not done. Better approach
> (update permissions on already updated graph) will be implemented
> later.
> 
> Now the test fails, so it's added with -d argument to not break make
> check.
> 
> Test fails with
> 
>  "Conflicts with use by fl1 as 'backing', which does not allow 'write' on 
> base"
> 
> because when updating permissions we can ignore original top->fl1
> BdrvChild. But we don't ignore exclusive write permission in fl1->base
> BdrvChild, which is propagated. Correct thing to do is make graph
> change first and then do permission update from the top node.
> 
> To run test do
> 
>   ./test-bdrv-graph-mod -d -p /bdrv-graph-mod/parallel-exclusive-write
> 
> from /tests.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  tests/unit/test-bdrv-graph-mod.c | 70 +++-
>  1 file changed, 69 insertions(+), 1 deletion(-)
> 
> diff --git a/tests/unit/test-bdrv-graph-mod.c 
> b/tests/unit/test-bdrv-graph-mod.c
> index c4f7d16039..4e4e83674a 100644
> --- a/tests/unit/test-bdrv-graph-mod.c
> +++ b/tests/unit/test-bdrv-graph-mod.c
> @@ -1,7 +1,7 @@
>  /*
>   * Block node graph modifications tests
>   *
> - * Copyright (c) 2019 Virtuozzo International GmbH. All rights reserved.
> + * Copyright (c) 2019-2021 Virtuozzo International GmbH. All rights reserved.
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -44,6 +44,21 @@ static BlockDriver bdrv_no_perm = {
>  .bdrv_child_perm = no_perm_default_perms,
>  };
>  
> +static void exclusive_write_perms(BlockDriverState *bs, BdrvChild *c,
> +  BdrvChildRole role,
> +  BlockReopenQueue *reopen_queue,
> +  uint64_t perm, uint64_t shared,
> +  uint64_t *nperm, uint64_t *nshared)
> +{
> +*nperm = BLK_PERM_WRITE;
> +*nshared = BLK_PERM_ALL & ~BLK_PERM_WRITE;
> +}
> +
> +static BlockDriver bdrv_exclusive_writer = {
> +.format_name = "exclusive-writer",
> +.bdrv_child_perm = exclusive_write_perms,
> +};
> +
>  static BlockDriverState *no_perm_node(const char *name)
>  {
>  return bdrv_new_open_driver(&bdrv_no_perm, name, BDRV_O_RDWR, 
> &error_abort);
> @@ -55,6 +70,12 @@ static BlockDriverState *pass_through_node(const char 
> *name)
>  BDRV_O_RDWR, &error_abort);
>  }
>  
> +static BlockDriverState *exclusive_writer_node(const char *name)
> +{
> +return bdrv_new_open_driver(&bdrv_exclusive_writer, name,
> +BDRV_O_RDWR, &error_abort);
> +}
> +
>  /*
>   * test_update_perm_tree
>   *
> @@ -185,8 +206,50 @@ static void test_should_update_child(void)
>  blk_unref(root);
>  }
>  
> +/*
> + * test_parallel_exclusive_write
> + *
> + * Check that when we replace node, old permissions of the node being removed
> + * doesn't break the replacement.
> + */
> +static void test_parallel_exclusive_write(void)
> +{
> +BlockDriverState *top = exclusive_writer_node("top");
> +BlockDriverState *base = no_perm_node("base");
> +BlockDriverState *fl1 = pass_through_node("fl1");
> +BlockDriverState *fl2 = pass_through_node("fl2");
> +
> +/*
> + * bdrv_attach_child() eats child bs reference, so we need two @base
> + * references for two filters:
> + */
> +bdrv_ref(base);
> +
> +bdrv_attach_child(top, fl1, "backing", &child_of_bds, BDRV_CHILD_DATA,
> +  &error_abort);
> +bdrv_attach_child(fl1, base, "backing", &child_of_bds, 
> BDRV_CHILD_FILTERED,
> +  &error_abort);
> +bdrv_attach_child(fl2, base, "backing", &child_of_bds, 
> BDRV_CHILD_FILTERED,
> +  &error_abort);
> +
> +bdrv_replace_node(fl1, fl2, &error_abort);
> +
> +bdrv_unref(fl2); /* second reference was created by bdrv_replace_node() 
> */

This line is new and I don't understand it.

Why does bdrv_replace_node() create new references? Shouldn't it just
move all parents of fl2 to fl1, and when the refcount of fl2 drops to
zero, it would be deleted?

If we have to unref manually, is this a bug?

> +bdrv_unref(top);
> +}

Kevin




Re: [PATCH v3 01/36] tests/test-bdrv-graph-mod: add test_parallel_exclusive_write

2021-04-23 Thread Vladimir Sementsov-Ogievskiy

23.04.2021 15:25, Kevin Wolf wrote:

Am 17.03.2021 um 15:34 hat Vladimir Sementsov-Ogievskiy geschrieben:

Add the test that shows that concept of ignore_children is incomplete.
Actually, when we want to update something, ignoring permission of some
existing BdrvChild, we should ignore also the propagated effect of this
child to the other children. But that's not done. Better approach
(update permissions on already updated graph) will be implemented
later.

Now the test fails, so it's added with -d argument to not break make
check.

Test fails with

  "Conflicts with use by fl1 as 'backing', which does not allow 'write' on base"

because when updating permissions we can ignore original top->fl1
BdrvChild. But we don't ignore exclusive write permission in fl1->base
BdrvChild, which is propagated. Correct thing to do is make graph
change first and then do permission update from the top node.

To run test do

   ./test-bdrv-graph-mod -d -p /bdrv-graph-mod/parallel-exclusive-write

from /tests.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  tests/unit/test-bdrv-graph-mod.c | 70 +++-
  1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test-bdrv-graph-mod.c b/tests/unit/test-bdrv-graph-mod.c
index c4f7d16039..4e4e83674a 100644
--- a/tests/unit/test-bdrv-graph-mod.c
+++ b/tests/unit/test-bdrv-graph-mod.c
@@ -1,7 +1,7 @@
  /*
   * Block node graph modifications tests
   *
- * Copyright (c) 2019 Virtuozzo International GmbH. All rights reserved.
+ * Copyright (c) 2019-2021 Virtuozzo International GmbH. All rights reserved.
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -44,6 +44,21 @@ static BlockDriver bdrv_no_perm = {
  .bdrv_child_perm = no_perm_default_perms,
  };
  
+static void exclusive_write_perms(BlockDriverState *bs, BdrvChild *c,

+  BdrvChildRole role,
+  BlockReopenQueue *reopen_queue,
+  uint64_t perm, uint64_t shared,
+  uint64_t *nperm, uint64_t *nshared)
+{
+*nperm = BLK_PERM_WRITE;
+*nshared = BLK_PERM_ALL & ~BLK_PERM_WRITE;
+}
+
+static BlockDriver bdrv_exclusive_writer = {
+.format_name = "exclusive-writer",
+.bdrv_child_perm = exclusive_write_perms,
+};
+
  static BlockDriverState *no_perm_node(const char *name)
  {
  return bdrv_new_open_driver(&bdrv_no_perm, name, BDRV_O_RDWR, 
&error_abort);
@@ -55,6 +70,12 @@ static BlockDriverState *pass_through_node(const char *name)
  BDRV_O_RDWR, &error_abort);
  }
  
+static BlockDriverState *exclusive_writer_node(const char *name)

+{
+return bdrv_new_open_driver(&bdrv_exclusive_writer, name,
+BDRV_O_RDWR, &error_abort);
+}
+
  /*
   * test_update_perm_tree
   *
@@ -185,8 +206,50 @@ static void test_should_update_child(void)
  blk_unref(root);
  }
  
+/*

+ * test_parallel_exclusive_write
+ *
+ * Check that when we replace node, old permissions of the node being removed
+ * doesn't break the replacement.
+ */
+static void test_parallel_exclusive_write(void)
+{
+BlockDriverState *top = exclusive_writer_node("top");
+BlockDriverState *base = no_perm_node("base");
+BlockDriverState *fl1 = pass_through_node("fl1");
+BlockDriverState *fl2 = pass_through_node("fl2");
+
+/*
+ * bdrv_attach_child() eats child bs reference, so we need two @base
+ * references for two filters:
+ */
+bdrv_ref(base);
+
+bdrv_attach_child(top, fl1, "backing", &child_of_bds, BDRV_CHILD_DATA,
+  &error_abort);
+bdrv_attach_child(fl1, base, "backing", &child_of_bds, BDRV_CHILD_FILTERED,
+  &error_abort);
+bdrv_attach_child(fl2, base, "backing", &child_of_bds, BDRV_CHILD_FILTERED,
+  &error_abort);
+
+bdrv_replace_node(fl1, fl2, &error_abort);
+
+bdrv_unref(fl2); /* second reference was created by bdrv_replace_node() */


This line is new and I don't understand it.

Why does bdrv_replace_node() create new references? Shouldn't it just
move all parents of fl2 to fl1, and when the refcount of fl2 drops to
zero, it would be deleted?



fl2 is second argument of bdrv_replace_node, it's @to. So all parents of fl1 
moved to fl2. So, fl2 referenced by top. But our first reference that comes 
from pass_through_node() is still here as well.



--
Best regards,
Vladimir



[PATCH] qapi: deprecate drive-backup

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
Modern way is using blockdev-add + blockdev-backup, which provides a
lot more control on how target is opened.

As example of drive-backup problems consider the following:

User of drive-backup expects that target will be opened in the same
cache and aio mode as source. Corresponding logic is in
drive_backup_prepare(), where we take bs->open_flags of source.

It works rather bad if source was added by blockdev-add. Assume source
is qcow2 image. On blockdev-add we should specify aio and cache options
for file child of qcow2 node. What happens next:

drive_backup_prepare() looks at bs->open_flags of qcow2 source node.
But there no BDRV_O_NOCAHE neither BDRV_O_NATIVE_AIO: BDRV_O_NOCAHE is
places in bs->file->bs->open_flags, and BDRV_O_NATIVE_AIO is nowhere,
as file-posix parse options and simply set s->use_linux_aio.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---

Hi all! I remember, I suggested to deprecate drive-backup some time ago,
and nobody complain.. But that old patch was inside the series with
other more questionable deprecations and it did not landed.

Let's finally deprecate what should be deprecated long ago.

We now faced a problem in our downstream, described in commit message.
In downstream I've fixed it by simply enabling O_DIRECT and linux_aio
unconditionally for drive_backup target. But actually this just shows
that using drive-backup in blockdev era is a bad idea. So let's motivate
everyone (including Virtuozzo of course) to move to new interfaces and
avoid problems with all that outdated option inheritance.

 docs/system/deprecated.rst | 5 +
 qapi/block-core.json   | 5 -
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/docs/system/deprecated.rst b/docs/system/deprecated.rst
index 80cae86252..b6f5766e17 100644
--- a/docs/system/deprecated.rst
+++ b/docs/system/deprecated.rst
@@ -186,6 +186,11 @@ Use the more generic commands ``block-export-add`` and 
``block-export-del``
 instead.  As part of this deprecation, where ``nbd-server-add`` used a
 single ``bitmap``, the new ``block-export-add`` uses a list of ``bitmaps``.
 
+``drive-backup`` (since 6.0)
+
+
+Use ``blockdev-backup`` in pair with ``blockdev-add`` instead.
+
 System accelerators
 ---
 
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 6d227924d0..8e2c6e1622 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1642,6 +1642,9 @@
 # The operation can be stopped before it has completed using the
 # block-job-cancel command.
 #
+# Features:
+# @deprecated: This command is deprecated. Use @blockdev-backup instead.
+#
 # Returns: - nothing on success
 #  - If @device is not a valid block device, GenericError
 #
@@ -1657,7 +1660,7 @@
 #
 ##
 { 'command': 'drive-backup', 'boxed': true,
-  'data': 'DriveBackup' }
+  'data': 'DriveBackup', 'features': ['deprecated'] }
 
 ##
 # @blockdev-backup:
-- 
2.29.2




Re: [PATCH v3 01/36] tests/test-bdrv-graph-mod: add test_parallel_exclusive_write

2021-04-23 Thread Kevin Wolf
Am 23.04.2021 um 14:46 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 23.04.2021 15:25, Kevin Wolf wrote:
> > Am 17.03.2021 um 15:34 hat Vladimir Sementsov-Ogievskiy geschrieben:
> > > Add the test that shows that concept of ignore_children is incomplete.
> > > Actually, when we want to update something, ignoring permission of some
> > > existing BdrvChild, we should ignore also the propagated effect of this
> > > child to the other children. But that's not done. Better approach
> > > (update permissions on already updated graph) will be implemented
> > > later.
> > > 
> > > Now the test fails, so it's added with -d argument to not break make
> > > check.
> > > 
> > > Test fails with
> > > 
> > >   "Conflicts with use by fl1 as 'backing', which does not allow 'write' 
> > > on base"
> > > 
> > > because when updating permissions we can ignore original top->fl1
> > > BdrvChild. But we don't ignore exclusive write permission in fl1->base
> > > BdrvChild, which is propagated. Correct thing to do is make graph
> > > change first and then do permission update from the top node.
> > > 
> > > To run test do
> > > 
> > >./test-bdrv-graph-mod -d -p /bdrv-graph-mod/parallel-exclusive-write
> > > 
> > > from /tests.
> > > 
> > > Signed-off-by: Vladimir Sementsov-Ogievskiy 
> > > ---
> > >   tests/unit/test-bdrv-graph-mod.c | 70 +++-
> > >   1 file changed, 69 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/tests/unit/test-bdrv-graph-mod.c 
> > > b/tests/unit/test-bdrv-graph-mod.c
> > > index c4f7d16039..4e4e83674a 100644
> > > --- a/tests/unit/test-bdrv-graph-mod.c
> > > +++ b/tests/unit/test-bdrv-graph-mod.c
> > > @@ -1,7 +1,7 @@
> > >   /*
> > >* Block node graph modifications tests
> > >*
> > > - * Copyright (c) 2019 Virtuozzo International GmbH. All rights reserved.
> > > + * Copyright (c) 2019-2021 Virtuozzo International GmbH. All rights 
> > > reserved.
> > >*
> > >* This program is free software; you can redistribute it and/or modify
> > >* it under the terms of the GNU General Public License as published by
> > > @@ -44,6 +44,21 @@ static BlockDriver bdrv_no_perm = {
> > >   .bdrv_child_perm = no_perm_default_perms,
> > >   };
> > > +static void exclusive_write_perms(BlockDriverState *bs, BdrvChild *c,
> > > +  BdrvChildRole role,
> > > +  BlockReopenQueue *reopen_queue,
> > > +  uint64_t perm, uint64_t shared,
> > > +  uint64_t *nperm, uint64_t *nshared)
> > > +{
> > > +*nperm = BLK_PERM_WRITE;
> > > +*nshared = BLK_PERM_ALL & ~BLK_PERM_WRITE;
> > > +}
> > > +
> > > +static BlockDriver bdrv_exclusive_writer = {
> > > +.format_name = "exclusive-writer",
> > > +.bdrv_child_perm = exclusive_write_perms,
> > > +};
> > > +
> > >   static BlockDriverState *no_perm_node(const char *name)
> > >   {
> > >   return bdrv_new_open_driver(&bdrv_no_perm, name, BDRV_O_RDWR, 
> > > &error_abort);
> > > @@ -55,6 +70,12 @@ static BlockDriverState *pass_through_node(const char 
> > > *name)
> > >   BDRV_O_RDWR, &error_abort);
> > >   }
> > > +static BlockDriverState *exclusive_writer_node(const char *name)
> > > +{
> > > +return bdrv_new_open_driver(&bdrv_exclusive_writer, name,
> > > +BDRV_O_RDWR, &error_abort);
> > > +}
> > > +
> > >   /*
> > >* test_update_perm_tree
> > >*
> > > @@ -185,8 +206,50 @@ static void test_should_update_child(void)
> > >   blk_unref(root);
> > >   }
> > > +/*
> > > + * test_parallel_exclusive_write
> > > + *
> > > + * Check that when we replace node, old permissions of the node being 
> > > removed
> > > + * doesn't break the replacement.
> > > + */
> > > +static void test_parallel_exclusive_write(void)
> > > +{
> > > +BlockDriverState *top = exclusive_writer_node("top");
> > > +BlockDriverState *base = no_perm_node("base");
> > > +BlockDriverState *fl1 = pass_through_node("fl1");
> > > +BlockDriverState *fl2 = pass_through_node("fl2");
> > > +
> > > +/*
> > > + * bdrv_attach_child() eats child bs reference, so we need two @base
> > > + * references for two filters:
> > > + */
> > > +bdrv_ref(base);
> > > +
> > > +bdrv_attach_child(top, fl1, "backing", &child_of_bds, 
> > > BDRV_CHILD_DATA,
> > > +  &error_abort);
> > > +bdrv_attach_child(fl1, base, "backing", &child_of_bds, 
> > > BDRV_CHILD_FILTERED,
> > > +  &error_abort);
> > > +bdrv_attach_child(fl2, base, "backing", &child_of_bds, 
> > > BDRV_CHILD_FILTERED,
> > > +  &error_abort);
> > > +
> > > +bdrv_replace_node(fl1, fl2, &error_abort);
> > > +
> > > +bdrv_unref(fl2); /* second reference was created by 
> > > bdrv_replace_node() */
> > 
> > This line is new and I don't understand it.
> > 
> > Why does bdrv_replace_node() create new refere

Re: [PATCH v3 01/36] tests/test-bdrv-graph-mod: add test_parallel_exclusive_write

2021-04-23 Thread Vladimir Sementsov-Ogievskiy

23.04.2021 15:59, Kevin Wolf wrote:

Am 23.04.2021 um 14:46 hat Vladimir Sementsov-Ogievskiy geschrieben:

23.04.2021 15:25, Kevin Wolf wrote:

Am 17.03.2021 um 15:34 hat Vladimir Sementsov-Ogievskiy geschrieben:

Add the test that shows that concept of ignore_children is incomplete.
Actually, when we want to update something, ignoring permission of some
existing BdrvChild, we should ignore also the propagated effect of this
child to the other children. But that's not done. Better approach
(update permissions on already updated graph) will be implemented
later.

Now the test fails, so it's added with -d argument to not break make
check.

Test fails with

   "Conflicts with use by fl1 as 'backing', which does not allow 'write' on 
base"

because when updating permissions we can ignore original top->fl1
BdrvChild. But we don't ignore exclusive write permission in fl1->base
BdrvChild, which is propagated. Correct thing to do is make graph
change first and then do permission update from the top node.

To run test do

./test-bdrv-graph-mod -d -p /bdrv-graph-mod/parallel-exclusive-write

from /tests.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
   tests/unit/test-bdrv-graph-mod.c | 70 +++-
   1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test-bdrv-graph-mod.c b/tests/unit/test-bdrv-graph-mod.c
index c4f7d16039..4e4e83674a 100644
--- a/tests/unit/test-bdrv-graph-mod.c
+++ b/tests/unit/test-bdrv-graph-mod.c
@@ -1,7 +1,7 @@
   /*
* Block node graph modifications tests
*
- * Copyright (c) 2019 Virtuozzo International GmbH. All rights reserved.
+ * Copyright (c) 2019-2021 Virtuozzo International GmbH. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -44,6 +44,21 @@ static BlockDriver bdrv_no_perm = {
   .bdrv_child_perm = no_perm_default_perms,
   };
+static void exclusive_write_perms(BlockDriverState *bs, BdrvChild *c,
+  BdrvChildRole role,
+  BlockReopenQueue *reopen_queue,
+  uint64_t perm, uint64_t shared,
+  uint64_t *nperm, uint64_t *nshared)
+{
+*nperm = BLK_PERM_WRITE;
+*nshared = BLK_PERM_ALL & ~BLK_PERM_WRITE;
+}
+
+static BlockDriver bdrv_exclusive_writer = {
+.format_name = "exclusive-writer",
+.bdrv_child_perm = exclusive_write_perms,
+};
+
   static BlockDriverState *no_perm_node(const char *name)
   {
   return bdrv_new_open_driver(&bdrv_no_perm, name, BDRV_O_RDWR, 
&error_abort);
@@ -55,6 +70,12 @@ static BlockDriverState *pass_through_node(const char *name)
   BDRV_O_RDWR, &error_abort);
   }
+static BlockDriverState *exclusive_writer_node(const char *name)
+{
+return bdrv_new_open_driver(&bdrv_exclusive_writer, name,
+BDRV_O_RDWR, &error_abort);
+}
+
   /*
* test_update_perm_tree
*
@@ -185,8 +206,50 @@ static void test_should_update_child(void)
   blk_unref(root);
   }
+/*
+ * test_parallel_exclusive_write
+ *
+ * Check that when we replace node, old permissions of the node being removed
+ * doesn't break the replacement.
+ */
+static void test_parallel_exclusive_write(void)
+{
+BlockDriverState *top = exclusive_writer_node("top");
+BlockDriverState *base = no_perm_node("base");
+BlockDriverState *fl1 = pass_through_node("fl1");
+BlockDriverState *fl2 = pass_through_node("fl2");
+
+/*
+ * bdrv_attach_child() eats child bs reference, so we need two @base
+ * references for two filters:
+ */
+bdrv_ref(base);
+
+bdrv_attach_child(top, fl1, "backing", &child_of_bds, BDRV_CHILD_DATA,
+  &error_abort);
+bdrv_attach_child(fl1, base, "backing", &child_of_bds, BDRV_CHILD_FILTERED,
+  &error_abort);
+bdrv_attach_child(fl2, base, "backing", &child_of_bds, BDRV_CHILD_FILTERED,
+  &error_abort);
+
+bdrv_replace_node(fl1, fl2, &error_abort);
+
+bdrv_unref(fl2); /* second reference was created by bdrv_replace_node() */


This line is new and I don't understand it.

Why does bdrv_replace_node() create new references? Shouldn't it just
move all parents of fl2 to fl1, and when the refcount of fl2 drops to
zero, it would be deleted?


fl2 is second argument of bdrv_replace_node, it's @to. So all parents
of fl1 moved to fl2. So, fl2 referenced by top. But our first
reference that comes from pass_through_node() is still here as well.


Oh, right. I assumed that fl2 was attached to top, but it isn't. So we
indeed still own that reference.

I feel the comment is misleading, it made me think that we unref a
reference that was created by bdrv_replace_node(). What you probably
meant is that bdrv_replace_node() only took an additional reference (by
attaching it to top), but did not t

Re: [PATCH] qapi: deprecate drive-backup

2021-04-23 Thread Daniel P . Berrangé
On Fri, Apr 23, 2021 at 03:59:00PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Modern way is using blockdev-add + blockdev-backup, which provides a
> lot more control on how target is opened.
> 
> As example of drive-backup problems consider the following:
> 
> User of drive-backup expects that target will be opened in the same
> cache and aio mode as source. Corresponding logic is in
> drive_backup_prepare(), where we take bs->open_flags of source.
> 
> It works rather bad if source was added by blockdev-add. Assume source
> is qcow2 image. On blockdev-add we should specify aio and cache options
> for file child of qcow2 node. What happens next:
> 
> drive_backup_prepare() looks at bs->open_flags of qcow2 source node.
> But there no BDRV_O_NOCAHE neither BDRV_O_NATIVE_AIO: BDRV_O_NOCAHE is
> places in bs->file->bs->open_flags, and BDRV_O_NATIVE_AIO is nowhere,
> as file-posix parse options and simply set s->use_linux_aio.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
> 
> Hi all! I remember, I suggested to deprecate drive-backup some time ago,
> and nobody complain.. But that old patch was inside the series with
> other more questionable deprecations and it did not landed.
> 
> Let's finally deprecate what should be deprecated long ago.
> 
> We now faced a problem in our downstream, described in commit message.
> In downstream I've fixed it by simply enabling O_DIRECT and linux_aio
> unconditionally for drive_backup target. But actually this just shows
> that using drive-backup in blockdev era is a bad idea. So let's motivate
> everyone (including Virtuozzo of course) to move to new interfaces and
> avoid problems with all that outdated option inheritance.
> 
>  docs/system/deprecated.rst | 5 +
>  qapi/block-core.json   | 5 -
>  2 files changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/docs/system/deprecated.rst b/docs/system/deprecated.rst
> index 80cae86252..b6f5766e17 100644
> --- a/docs/system/deprecated.rst
> +++ b/docs/system/deprecated.rst
> @@ -186,6 +186,11 @@ Use the more generic commands ``block-export-add`` and 
> ``block-export-del``
>  instead.  As part of this deprecation, where ``nbd-server-add`` used a
>  single ``bitmap``, the new ``block-export-add`` uses a list of ``bitmaps``.
>  
> +``drive-backup`` (since 6.0)

NB, it'll need to be 6.1 by time this patch has a chance to merge

> +
> +
> +Use ``blockdev-backup`` in pair with ``blockdev-add`` instead.
> +
>  System accelerators
>  ---
>  
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 6d227924d0..8e2c6e1622 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -1642,6 +1642,9 @@
>  # The operation can be stopped before it has completed using the
>  # block-job-cancel command.
>  #
> +# Features:
> +# @deprecated: This command is deprecated. Use @blockdev-backup instead.
> +#
>  # Returns: - nothing on success
>  #  - If @device is not a valid block device, GenericError
>  #
> @@ -1657,7 +1660,7 @@
>  #
>  ##
>  { 'command': 'drive-backup', 'boxed': true,
> -  'data': 'DriveBackup' }
> +  'data': 'DriveBackup', 'features': ['deprecated'] }
>  
>  ##
>  # @blockdev-backup:
> -- 
> 2.29.2
> 
> 

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH for-6.0 v2 2/2] hw/block/nvme: disable hotplugging for subsystem-linked controllers

2021-04-23 Thread Peter Maydell
On Fri, 23 Apr 2021 at 06:21, Klaus Jensen  wrote:
>
> From: Klaus Jensen 
>
> If a controller is linked to a subsystem, do not allow it to be
> hotplugged since this will mess up the (possibly shared) namespaces.
>
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 5fe082ec34c5..7606b58a39b9 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -6140,12 +6140,16 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
> *pci_dev)
>
>  static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
>  {
> +DeviceClass *dc;
>  int cntlid;
>
>  if (!n->subsys) {
>  return 0;
>  }
>
> +dc = DEVICE_GET_CLASS(n);
> +dc->hotpluggable = false;
> +
>  cntlid = nvme_subsys_register_ctrl(n, errp);
>  if (cntlid < 0) {
>  return -1;

I'm not sure this is right -- the DeviceClass is the
class struct, which there's only one of for every instance
of the device in the system. So this is saying "if this instance
is linked to a subsystem, don't let any *future* instances ever
be hotpluggable". I'm not even sure if it will do the right
thing for the current device, because this function is called
from the device's realize method, and the device_set_realized()
function does the "forbid if dc->hotpluggable is false" check
before calling the realize method.

Possibly what you want to do here is to call the
device_get_hotplugged() function and just make the realize
method fail with a suitable error if the device is both (a) being
hotplugged and (b) has a subsystem link; but I'm not an expert on
hotplug, so I might be wrong.

thanks
-- PMM



Re: [PATCH] qapi: deprecate drive-backup

2021-04-23 Thread Vladimir Sementsov-Ogievskiy

23.04.2021 16:09, Daniel P. Berrangé wrote:

On Fri, Apr 23, 2021 at 03:59:00PM +0300, Vladimir Sementsov-Ogievskiy wrote:

Modern way is using blockdev-add + blockdev-backup, which provides a
lot more control on how target is opened.

As example of drive-backup problems consider the following:

User of drive-backup expects that target will be opened in the same
cache and aio mode as source. Corresponding logic is in
drive_backup_prepare(), where we take bs->open_flags of source.

It works rather bad if source was added by blockdev-add. Assume source
is qcow2 image. On blockdev-add we should specify aio and cache options
for file child of qcow2 node. What happens next:

drive_backup_prepare() looks at bs->open_flags of qcow2 source node.
But there no BDRV_O_NOCAHE neither BDRV_O_NATIVE_AIO: BDRV_O_NOCAHE is
places in bs->file->bs->open_flags, and BDRV_O_NATIVE_AIO is nowhere,
as file-posix parse options and simply set s->use_linux_aio.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---

Hi all! I remember, I suggested to deprecate drive-backup some time ago,
and nobody complain.. But that old patch was inside the series with
other more questionable deprecations and it did not landed.

Let's finally deprecate what should be deprecated long ago.

We now faced a problem in our downstream, described in commit message.
In downstream I've fixed it by simply enabling O_DIRECT and linux_aio
unconditionally for drive_backup target. But actually this just shows
that using drive-backup in blockdev era is a bad idea. So let's motivate
everyone (including Virtuozzo of course) to move to new interfaces and
avoid problems with all that outdated option inheritance.

  docs/system/deprecated.rst | 5 +
  qapi/block-core.json   | 5 -
  2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/docs/system/deprecated.rst b/docs/system/deprecated.rst
index 80cae86252..b6f5766e17 100644
--- a/docs/system/deprecated.rst
+++ b/docs/system/deprecated.rst
@@ -186,6 +186,11 @@ Use the more generic commands ``block-export-add`` and 
``block-export-del``
  instead.  As part of this deprecation, where ``nbd-server-add`` used a
  single ``bitmap``, the new ``block-export-add`` uses a list of ``bitmaps``.
  
+``drive-backup`` (since 6.0)


NB, it'll need to be 6.1 by time this patch has a chance to merge


Oops yes.


--
Best regards,
Vladimir



Re: [PATCH for-6.0 v2 2/2] hw/block/nvme: disable hotplugging for subsystem-linked controllers

2021-04-23 Thread Klaus Jensen

On Apr 23 14:21, Peter Maydell wrote:

On Fri, 23 Apr 2021 at 06:21, Klaus Jensen  wrote:


From: Klaus Jensen 

If a controller is linked to a subsystem, do not allow it to be
hotplugged since this will mess up the (possibly shared) namespaces.

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 5fe082ec34c5..7606b58a39b9 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -6140,12 +6140,16 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)

 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
 {
+DeviceClass *dc;
 int cntlid;

 if (!n->subsys) {
 return 0;
 }

+dc = DEVICE_GET_CLASS(n);
+dc->hotpluggable = false;
+
 cntlid = nvme_subsys_register_ctrl(n, errp);
 if (cntlid < 0) {
 return -1;


I'm not sure this is right -- the DeviceClass is the
class struct, which there's only one of for every instance
of the device in the system. So this is saying "if this instance
is linked to a subsystem, don't let any *future* instances ever
be hotpluggable". I'm not even sure if it will do the right
thing for the current device, because this function is called
from the device's realize method, and the device_set_realized()
function does the "forbid if dc->hotpluggable is false" check
before calling the realize method.

Possibly what you want to do here is to call the
device_get_hotplugged() function and just make the realize
method fail with a suitable error if the device is both (a) being
hotplugged and (b) has a subsystem link; but I'm not an expert on
hotplug, so I might be wrong.



Thanks Peter, this sounds exactly like what I want. I'll respin!

I have a "full" fix that actually makes the device hotpluggable in the 
context of subsystems, but it is definitely not an -rc thing.


signature.asc
Description: PGP signature


Re: [PATCH for-6.0 v2 2/2] hw/block/nvme: disable hotplugging for subsystem-linked controllers

2021-04-23 Thread Peter Maydell
On Fri, 23 Apr 2021 at 14:25, Klaus Jensen  wrote:
>
> On Apr 23 14:21, Peter Maydell wrote:
> >On Fri, 23 Apr 2021 at 06:21, Klaus Jensen  wrote:
> >>
> >> From: Klaus Jensen 
> >>
> >> If a controller is linked to a subsystem, do not allow it to be
> >> hotplugged since this will mess up the (possibly shared) namespaces.
> >>
> >> Signed-off-by: Klaus Jensen 
> >> ---
> >>  hw/block/nvme.c | 4 
> >>  1 file changed, 4 insertions(+)
> >>
> >> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> >> index 5fe082ec34c5..7606b58a39b9 100644
> >> --- a/hw/block/nvme.c
> >> +++ b/hw/block/nvme.c
> >> @@ -6140,12 +6140,16 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
> >> *pci_dev)
> >>
> >>  static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
> >>  {
> >> +DeviceClass *dc;
> >>  int cntlid;
> >>
> >>  if (!n->subsys) {
> >>  return 0;
> >>  }
> >>
> >> +dc = DEVICE_GET_CLASS(n);
> >> +dc->hotpluggable = false;
> >> +
> >>  cntlid = nvme_subsys_register_ctrl(n, errp);
> >>  if (cntlid < 0) {
> >>  return -1;
> >
> >I'm not sure this is right -- the DeviceClass is the
> >class struct, which there's only one of for every instance
> >of the device in the system. So this is saying "if this instance
> >is linked to a subsystem, don't let any *future* instances ever
> >be hotpluggable". I'm not even sure if it will do the right
> >thing for the current device, because this function is called
> >from the device's realize method, and the device_set_realized()
> >function does the "forbid if dc->hotpluggable is false" check
> >before calling the realize method.
> >
> >Possibly what you want to do here is to call the
> >device_get_hotplugged() function and just make the realize
> >method fail with a suitable error if the device is both (a) being
> >hotplugged and (b) has a subsystem link; but I'm not an expert on
> >hotplug, so I might be wrong.
> >
>
> Thanks Peter, this sounds exactly like what I want. I'll respin!
>
> I have a "full" fix that actually makes the device hotpluggable in the
> context of subsystems, but it is definitely not an -rc thing.

For 6.0 I don't think we should put this in anyway -- it's not
a regression and in any case it sounds like it needs more work...

-- PMM



Re: [PATCH for-6.0 v2 2/2] hw/block/nvme: disable hotplugging for subsystem-linked controllers

2021-04-23 Thread Klaus Jensen

On Apr 23 14:25, Peter Maydell wrote:

On Fri, 23 Apr 2021 at 14:25, Klaus Jensen  wrote:


On Apr 23 14:21, Peter Maydell wrote:
>On Fri, 23 Apr 2021 at 06:21, Klaus Jensen  wrote:
>>
>> From: Klaus Jensen 
>>
>> If a controller is linked to a subsystem, do not allow it to be
>> hotplugged since this will mess up the (possibly shared) namespaces.
>>
>> Signed-off-by: Klaus Jensen 
>> ---
>>  hw/block/nvme.c | 4 
>>  1 file changed, 4 insertions(+)
>>
>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>> index 5fe082ec34c5..7606b58a39b9 100644
>> --- a/hw/block/nvme.c
>> +++ b/hw/block/nvme.c
>> @@ -6140,12 +6140,16 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
>>
>>  static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
>>  {
>> +DeviceClass *dc;
>>  int cntlid;
>>
>>  if (!n->subsys) {
>>  return 0;
>>  }
>>
>> +dc = DEVICE_GET_CLASS(n);
>> +dc->hotpluggable = false;
>> +
>>  cntlid = nvme_subsys_register_ctrl(n, errp);
>>  if (cntlid < 0) {
>>  return -1;
>
>I'm not sure this is right -- the DeviceClass is the
>class struct, which there's only one of for every instance
>of the device in the system. So this is saying "if this instance
>is linked to a subsystem, don't let any *future* instances ever
>be hotpluggable". I'm not even sure if it will do the right
>thing for the current device, because this function is called
>from the device's realize method, and the device_set_realized()
>function does the "forbid if dc->hotpluggable is false" check
>before calling the realize method.
>
>Possibly what you want to do here is to call the
>device_get_hotplugged() function and just make the realize
>method fail with a suitable error if the device is both (a) being
>hotplugged and (b) has a subsystem link; but I'm not an expert on
>hotplug, so I might be wrong.
>

Thanks Peter, this sounds exactly like what I want. I'll respin!

I have a "full" fix that actually makes the device hotpluggable in the
context of subsystems, but it is definitely not an -rc thing.


For 6.0 I don't think we should put this in anyway -- it's not
a regression and in any case it sounds like it needs more work...



Agree, patch 1 is what I would like to see in if an -rc5 is spun.


signature.asc
Description: PGP signature


[PATCH v2] monitor: hmp_qemu_io: acquire aio contex, fix crash

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
Max reported the following bug:

$ ./qemu-img create -f raw src.img 1G
$ ./qemu-img create -f raw dst.img 1G

$ (echo '
   {"execute":"qmp_capabilities"}
   {"execute":"blockdev-mirror",
"arguments":{"job-id":"mirror",
 "device":"source",
 "target":"target",
 "sync":"full",
 "filter-node-name":"mirror-top"}}
'; sleep 3; echo '
   {"execute":"human-monitor-command",
"arguments":{"command-line":
 "qemu-io mirror-top \"write 0 1G\""}}') \
| x86_64-softmmu/qemu-system-x86_64 \
   -qmp stdio \
   -blockdev file,node-name=source,filename=src.img \
   -blockdev file,node-name=target,filename=dst.img \
   -object iothread,id=iothr0 \
   -device virtio-blk,drive=source,iothread=iothr0

crashes:

0  raise () at /usr/lib/libc.so.6
1  abort () at /usr/lib/libc.so.6
2  error_exit
   (err=,
   msg=msg@entry=0x55fbb1634790 <__func__.27> "qemu_mutex_unlock_impl")
   at ../util/qemu-thread-posix.c:37
3  qemu_mutex_unlock_impl
   (mutex=mutex@entry=0x55fbb25ab6e0,
   file=file@entry=0x55fbb1636957 "../util/async.c",
   line=line@entry=650)
   at ../util/qemu-thread-posix.c:109
4  aio_context_release (ctx=ctx@entry=0x55fbb25ab680) at ../util/async.c:650
5  bdrv_do_drained_begin
   (bs=bs@entry=0x55fbb3a87000, recursive=recursive@entry=false,
   parent=parent@entry=0x0,
   ignore_bds_parents=ignore_bds_parents@entry=false,
   poll=poll@entry=true) at ../block/io.c:441
6  bdrv_do_drained_begin
   (poll=true, ignore_bds_parents=false, parent=0x0, recursive=false,
   bs=0x55fbb3a87000) at ../block/io.c:448
7  blk_drain (blk=0x55fbb26c5a00) at ../block/block-backend.c:1718
8  blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:498
9  blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:491
10 hmp_qemu_io (mon=0x7fffaf3fc7d0, qdict=)
   at ../block/monitor/block-hmp-cmds.c:628

man pthread_mutex_unlock
...
EPERM  The  mutex type is PTHREAD_MUTEX_ERRORCHECK or
PTHREAD_MUTEX_RECURSIVE, or the mutex is a robust mutex, and the
current thread does not own the mutex.

So, thread doesn't own the mutex. And we have iothread here.

Next, note that AIO_WAIT_WHILE() documents that ctx must be acquired
exactly once by caller. But where is it acquired in the call stack?
Seems nowhere.

qemuio_command do acquire aio context.. But we need context acquired
around blk_unref() as well and actually around blk_insert_bs() too.

Let's refactor qemuio_command so that it doesn't acquire aio context
but callers do that instead. This way we can cleanly acquire aio
context in hmp_qemu_io() around all three calls.

Reported-by: Max Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---

v2: rewrite, to cleanly acquire aio-context around all needed things in
hmp_qemu_io

 block/monitor/block-hmp-cmds.c | 31 +--
 qemu-io-cmds.c |  8 
 qemu-io.c  | 17 +++--
 3 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index ebf1033f31..3e6670c963 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -557,8 +557,10 @@ void hmp_eject(Monitor *mon, const QDict *qdict)
 
 void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 {
-BlockBackend *blk;
+BlockBackend *blk = NULL;
+BlockDriverState *bs = NULL;
 BlockBackend *local_blk = NULL;
+AioContext *ctx = NULL;
 bool qdev = qdict_get_try_bool(qdict, "qdev", false);
 const char *device = qdict_get_str(qdict, "device");
 const char *command = qdict_get_str(qdict, "command");
@@ -573,20 +575,24 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 } else {
 blk = blk_by_name(device);
 if (!blk) {
-BlockDriverState *bs = bdrv_lookup_bs(NULL, device, &err);
-if (bs) {
-blk = local_blk = blk_new(bdrv_get_aio_context(bs),
-  0, BLK_PERM_ALL);
-ret = blk_insert_bs(blk, bs, &err);
-if (ret < 0) {
-goto fail;
-}
-} else {
+bs = bdrv_lookup_bs(NULL, device, &err);
+if (!bs) {
 goto fail;
 }
 }
 }
 
+ctx = blk ? blk_get_aio_context(blk) : bdrv_get_aio_context(bs);
+aio_context_acquire(ctx);
+
+if (bs) {
+blk = local_blk = blk_new(bdrv_get_aio_context(bs), 0, BLK_PERM_ALL);
+ret = blk_insert_bs(blk, bs, &err);
+if (ret < 0) {
+goto fail;
+}
+}
+
 /*
  * Notably absent: Proper permission management. This is sad, but it seems
  * almost impossible to achieve without changing the semantics and thereby
@@ -616,6 +622,11 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 
 fail:
 blk_unref(local_blk);
+
+if (ctx) {
+aio_context_release(ctx);
+}
+
 hmp_handle_error(mon, er

Re: [PATCH v2 1/2] block: Add BDRV_O_NO_SHARE for blk_new_open()

2021-04-23 Thread Kevin Wolf
Am 23.04.2021 um 11:43 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 22.04.2021 19:43, Kevin Wolf wrote:
> > Normally, blk_new_open() just shares all permissions. This was fine
> > originally when permissions only protected against uses in the same
> > process because no other part of the code would actually get to access
> > the block nodes opened with blk_new_open(). However, since we use it for
> > file locking now, unsharing permissions becomes desirable.
> > 
> > Add a new BDRV_O_NO_SHARE flag that is used in blk_new_open() to unshare
> > any permissions that can be unshared.
> > 
> > Signed-off-by: Kevin Wolf 
> > ---
> >   include/block/block.h |  1 +
> >   block/block-backend.c | 19 +--
> >   2 files changed, 14 insertions(+), 6 deletions(-)
> > 
> > diff --git a/include/block/block.h b/include/block/block.h
> > index b3f6e509d4..735db05a39 100644
> > --- a/include/block/block.h
> > +++ b/include/block/block.h
> > @@ -101,6 +101,7 @@ typedef struct HDGeometry {
> >   uint32_t cylinders;
> >   } HDGeometry;
> > +#define BDRV_O_NO_SHARE0x0001 /* don't share permissons */
> >   #define BDRV_O_RDWR0x0002
> >   #define BDRV_O_RESIZE  0x0004 /* request permission for resizing the 
> > node */
> >   #define BDRV_O_SNAPSHOT0x0008 /* open the file read only and save 
> > writes in a snapshot */
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index 413af51f3b..61a10ea860 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -398,15 +398,19 @@ BlockBackend *blk_new_open(const char *filename, 
> > const char *reference,
> >   BlockBackend *blk;
> >   BlockDriverState *bs;
> >   uint64_t perm = 0;
> > +uint64_t shared = BLK_PERM_ALL;
> > -/* blk_new_open() is mainly used in .bdrv_create implementations and 
> > the
> > - * tools where sharing isn't a concern because the BDS stays private, 
> > so we
> > - * just request permission according to the flags.
> > +/*
> > + * blk_new_open() is mainly used in .bdrv_create implementations and 
> > the
> > + * tools where sharing isn't a major concern because the BDS stays 
> > private
> > + * and the file is generally not supposed to be used by a second 
> > process,
> > + * so we just request permission according to the flags.
> >*
> >* The exceptions are xen_disk and blockdev_init(); in these cases, 
> > the
> >* caller of blk_new_open() doesn't make use of the permissions, but 
> > they
> >* shouldn't hurt either. We can still share everything here because 
> > the
> > - * guest devices will add their own blockers if they can't share. */
> > + * guest devices will add their own blockers if they can't share.
> > + */
> 
> Probably old comment "We can still share everything" is a bit in
> conflict with commit message (and new logic).

I read that paragraph as referring to xen_disk and blockdev_init() only,
which still don't pass BDRV_O_NO_SHARE after this series. The comment
explains why they don't have to pass it.

Kevin




Re: [PATCH v3 04/36] block: bdrv_append(): don't consume reference

2021-04-23 Thread Kevin Wolf
Am 17.03.2021 um 15:34 hat Vladimir Sementsov-Ogievskiy geschrieben:
> We have too much comments for this feature. It seems better just don't
> do it. Most of real users (tests don't count) have to create additional
> reference.
> 
> Drop also comment in external_snapshot_prepare:
>  - bdrv_append doesn't "remove" old bs in common sense, it sounds
>strange
>  - the fact that bdrv_append can fail is obvious from the context
>  - the fact that we must rollback all changes in transaction abort is
>known (it's the direct role of abort)
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 

test_append_greedy_filter() is new since v2 and needs an additional
bdrv_unref() now, too.

Kevin




Re: [PATCH v3 08/36] util: add transactions.c

2021-04-23 Thread Kevin Wolf
Am 17.03.2021 um 15:35 hat Vladimir Sementsov-Ogievskiy geschrieben:
> Add simple transaction API to use in further update of block graph
> operations.
> 
> Supposed usage is:
> 
> - "prepare" is main function of the action and it should make the main
>   effect of the action to be visible for the following actions, keeping
>   possibility of roll-back, saving necessary things in action state,
>   which is prepended to the action list (to do that, prepare func
>   should call tran_add()). So, driver struct doesn't include "prepare"
>   field, as it is supposed to be called directly.
> 
> - commit/rollback is supposed to be called for the list of action
>   states, to commit/rollback all the actions in reverse order
> 
> - When possible "commit" should not make visible effect for other
>   actions, which make possible transparent logical interaction between
>   actions.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  include/qemu/transactions.h | 63 
>  util/transactions.c | 96 +
>  MAINTAINERS |  6 +++
>  util/meson.build|  1 +
>  4 files changed, 166 insertions(+)
>  create mode 100644 include/qemu/transactions.h
>  create mode 100644 util/transactions.c
> 
> diff --git a/include/qemu/transactions.h b/include/qemu/transactions.h
> new file mode 100644
> index 00..e7add9637f
> --- /dev/null
> +++ b/include/qemu/transactions.h
> @@ -0,0 +1,63 @@
> +/*
> + * Simple transactions API
> + *
> + * Copyright (c) 2021 Virtuozzo International GmbH.
> + *
> + * Author:
> + *  Vladimir Sementsov-Ogievskiy 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see .
> + *
> + *
> + * = Generic transaction API =
> + *
> + * The intended usage is the following: you create "prepare" functions, which
> + * represents the actions. They will usually have Transaction* argument, and
> + * call tran_add() to register finalization callbacks. For finalization
> + * callbacks, prepare corresponding TransactionActionDrv structures.
> + *
> + * Than, when you need to make a transaction, create an empty Transaction by

Then

> + * tran_create(), call your "prepare" functions on it, and finally call
> + * tran_abort() or tran_commit() to finalize the transaction by corresponding
> + * finalization actions in reverse order.
> + */
> +
> +#ifndef QEMU_TRANSACTIONS_H
> +#define QEMU_TRANSACTIONS_H
> +
> +#include 
> +
> +typedef struct TransactionActionDrv {
> +void (*abort)(void *opaque);
> +void (*commit)(void *opaque);
> +void (*clean)(void *opaque);
> +} TransactionActionDrv;
> +
> +typedef struct Transaction Transaction;
> +
> +Transaction *tran_new(void);
> +void tran_add(Transaction *tran, TransactionActionDrv *drv, void *opaque);
> +void tran_abort(Transaction *tran);
> +void tran_commit(Transaction *tran);
> +
> +static inline void tran_finalize(Transaction *tran, int ret)
> +{
> +if (ret < 0) {
> +tran_abort(tran);
> +} else {
> +tran_commit(tran);
> +}
> +}
> +
> +#endif /* QEMU_TRANSACTIONS_H */
> diff --git a/util/transactions.c b/util/transactions.c
> new file mode 100644
> index 00..d0bc9a3e73
> --- /dev/null
> +++ b/util/transactions.c
> @@ -0,0 +1,96 @@
> +/*
> + * Simple transactions API
> + *
> + * Copyright (c) 2021 Virtuozzo International GmbH.
> + *
> + * Author:
> + *  Sementsov-Ogievskiy Vladimir 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see .
> + */
> +
> +#include "qemu/osdep.h"
> +
> +#include "qemu/transactions.h"
> +#include "qemu/queue.h"
> +
> +typedef struct TransactionAction {
> +TransactionActionDrv *drv;
> +void *opaque;
> +QSLIST_ENTRY(TransactionAction) entry;

"next" is a bit more conv

Re: [PATCH v2 2/2] qemu-img convert: Unshare write permission for source

2021-04-23 Thread Vladimir Sementsov-Ogievskiy

22.04.2021 19:43, Kevin Wolf wrote:

For a successful conversion of an image, we must make sure that its
content doesn't change during the conversion.

A special case of this is using the same image file both as the source
and as the destination. If both input and output format are raw, the
operation would just be useless work, with other formats it is a sure
way to destroy the image. This will now fail because the image file
can't be opened a second time for the output when opening it for the
input has already acquired file locks to unshare BLK_PERM_WRITE.

Nevertheless, if there is some reason in a special case why it is
actually okay to allow writes to the image while it is being converted,
-U can still be used to force sharing all permissions.

Note that for most image formats, BLK_PERM_WRITE would already be
unshared by the format driver, so this only really makes a difference
for raw source images (but any output format).

Reported-by: Xueqiang Wei 
Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
---
  qemu-img.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qemu-img.c b/qemu-img.c
index babb5573ab..a5993682aa 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -2146,7 +2146,7 @@ static void set_rate_limit(BlockBackend *blk, int64_t 
rate_limit)
  
  static int img_convert(int argc, char **argv)

  {
-int c, bs_i, flags, src_flags = 0;
+int c, bs_i, flags, src_flags = BDRV_O_NO_SHARE;
  const char *fmt = NULL, *out_fmt = NULL, *cache = "unsafe",
 *src_cache = BDRV_DEFAULT_CACHE, *out_baseimg = NULL,
 *out_filename, *out_baseimg_param, *snapshot_name = NULL;



Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir



Re: [PATCH for-6.0 v2 1/2] hw/block/nvme: fix invalid msix exclusive uninit

2021-04-23 Thread Peter Maydell
On Fri, 23 Apr 2021 at 06:21, Klaus Jensen  wrote:
>
> From: Klaus Jensen 
>
> Commit 1901b4967c3f changed the nvme device from using a bar exclusive
> for MSI-x to sharing it on bar0.
>
> Unfortunately, the msix_uninit_exclusive_bar() call remains in
> nvme_exit() which causes havoc when the device is removed with, say,
> device_del. Fix this.
>
> Additionally, a subregion is added but it is not removed on exit which
> causes a reference to linger and the drive to never be unlocked.
>
> Fixes: 1901b4967c3f ("hw/block/nvme: move msix table and pba to BAR 0")
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 624a1431d072..5fe082ec34c5 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -6235,7 +6235,8 @@ static void nvme_exit(PCIDevice *pci_dev)
>  if (n->pmr.dev) {
>  host_memory_backend_set_mapped(n->pmr.dev, false);
>  }
> -msix_uninit_exclusive_bar(pci_dev);
> +msix_uninit(pci_dev, &n->bar0, &n->bar0);
> +memory_region_del_subregion(&n->bar0, &n->iomem);
>  }
>
>  static Property nvme_props[] = {

Looks plausible, but if you want this in rc5 could somebody review
it, please ?

thanks
-- PMM



Re: [PATCH v3 00/36] block: update graph permissions update

2021-04-23 Thread Kevin Wolf
Am 17.03.2021 um 15:34 hat Vladimir Sementsov-Ogievskiy geschrieben:
> Hi all!
> 
> Finally, I finished v3. Phew.
> 
> Missed a soft-freeze. Should we consider it a bugfix? There are bugfixes
> here but they are mostly theoretical. So, up to Kevin, should it go to
> current release or to the next..
> 
> The main point of the series is fixing some permission update problems
> (see patches 01-03 as examples), that in turn makes possible more clean
> inserting and removing of filters (see patch 26 where .active field is
> dropped finally from backup-top filter, we don't need a workaround
> anymore).
> 
> The series brings util/transactions.c (patch 10) and use of it in
> block.c, which allows clean block graph change transactions, with
> possibility of reverting all modifications (movement and removement of
> children, changing aio context, changing permissions) in reverse order
> on failure path.
> 
> The series also helps Alberto's "Allow changing bs->file on reopen"
> which we want to merge prior dropping x- prefix from blockdev-reopen
> command.

With the minor comments I had so far fixed, patches 1-13 are:
Reviewed-by: Kevin Wolf 

I'll continue next week.

Kevin




[PATCH 00/11] qemu-io-cmds: move to coroutine

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
Hi!

I've done this by accident. I decided that it is needed to make a nice
fix for a problem with aio context locking in hmp_qemu_io(). Still, the
problem is fixed without this series, so it's based on on the fix
instead of being preparation for it:

Based-on: <20210423134233.51495-1-vsement...@virtuozzo.com>
  "[PATCH v2] monitor: hmp_qemu_io: acquire aio contex, fix crash"

When I understood this, the series was more than half-done, so I decided
to finish it. I think it's a good refactoring. And moving block-layer
things to coroutine is good in general: we reduce number of polling
loops here and there.

Vladimir Sementsov-Ogievskiy (11):
  block-coroutine-wrapper: allow non bdrv_ prefix
  block-coroutine-wrapper: support BlockBackend first argument
  block/block-gen.h: bind monitor
  block: introduce bdrv_debug_wait_break
  qemu-io-cmds: move qemu-io commands to coroutine
  block: drop unused bdrv_debug_is_suspended()
  block-backend: add _co_ versions of blk_save_vmstate /
blk_load_vmstate
  qemu-io-cmds: refactor read_f(): drop extra helpers and variables
  qemu-io-cmds: refactor write_f(): drop extra helpers and variables
  qemu-io-cmds: drop do_co_readv() and do_co_writev() helpers
  block-backend: drop unused blk_save_vmstate() and blk_load_vmstate()

 block/block-gen.h  |  22 ++-
 include/block/block.h  |   2 +-
 include/block/block_int.h  |   2 +-
 include/qemu-io.h  |   9 +-
 include/sysemu/block-backend.h |   6 +-
 block.c|  10 +-
 block/blkdebug.c   |  17 ++-
 block/block-backend.c  |  21 ++-
 qemu-io-cmds.c | 237 -
 block/meson.build  |   3 +-
 scripts/block-coroutine-wrapper.py |  30 +++-
 11 files changed, 113 insertions(+), 246 deletions(-)

-- 
2.29.2




[PATCH 01/11] block-coroutine-wrapper: allow non bdrv_ prefix

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
We are going to reuse the script to generate a qcow2_ function in
further commit. Prepare the script now.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 scripts/block-coroutine-wrapper.py | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/block-coroutine-wrapper.py 
b/scripts/block-coroutine-wrapper.py
index 0461fd1c45..85dbeb9ecf 100644
--- a/scripts/block-coroutine-wrapper.py
+++ b/scripts/block-coroutine-wrapper.py
@@ -98,12 +98,13 @@ def snake_to_camel(func_name: str) -> str:
 
 
 def gen_wrapper(func: FuncDecl) -> str:
-assert func.name.startswith('bdrv_')
-assert not func.name.startswith('bdrv_co_')
+assert not '_co_' in func.name
 assert func.return_type == 'int'
 assert func.args[0].type in ['BlockDriverState *', 'BdrvChild *']
 
-name = 'bdrv_co_' + func.name[5:]
+subsystem, subname = func.name.split('_', 1)
+
+name = f'{subsystem}_co_{subname}'
 bs = 'bs' if func.args[0].type == 'BlockDriverState *' else 'child->bs'
 struct_name = snake_to_camel(name)
 
-- 
2.29.2




[PATCH 03/11] block/block-gen.h: bind monitor

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
If we have current monitor, let's bind it to wrapper coroutine too.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/block-gen.h | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/block/block-gen.h b/block/block-gen.h
index c1fd3f40de..61f055a8cc 100644
--- a/block/block-gen.h
+++ b/block/block-gen.h
@@ -27,6 +27,7 @@
 #define BLOCK_BLOCK_GEN_H
 
 #include "block/block_int.h"
+#include "monitor/monitor.h"
 
 /* Base structure for argument packing structures */
 typedef struct AioPollCo {
@@ -38,11 +39,20 @@ typedef struct AioPollCo {
 
 static inline int aio_poll_co(AioPollCo *s)
 {
+Monitor *mon = monitor_cur();
 assert(!qemu_in_coroutine());
 
+if (mon) {
+monitor_set_cur(s->co, mon);
+}
+
 aio_co_enter(s->ctx, s->co);
 AIO_WAIT_WHILE(s->ctx, s->in_progress);
 
+if (mon) {
+monitor_set_cur(s->co, NULL);
+}
+
 return s->ret;
 }
 
-- 
2.29.2




[PATCH 06/11] block: drop unused bdrv_debug_is_suspended()

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
Now it's actually substituted by coroutine based
bdrv_debug_wait_break().

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block.h |  1 -
 include/block/block_int.h |  1 -
 block.c   | 13 -
 block/blkdebug.c  |  1 -
 4 files changed, 16 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index e133adf54f..fb1897c1e8 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -647,7 +647,6 @@ int bdrv_debug_breakpoint(BlockDriverState *bs, const char 
*event,
const char *tag);
 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag);
 int bdrv_debug_resume(BlockDriverState *bs, const char *tag);
-bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag);
 void coroutine_fn bdrv_debug_wait_break(BlockDriverState *bs, const char *tag);
 
 /**
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 89e6904fc7..592acc960f 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -501,7 +501,6 @@ struct BlockDriver {
 int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs,
 const char *tag);
 int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
-bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
 void (*bdrv_debug_wait_break)(BlockDriverState *bs, const char *tag);
 
 void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);
diff --git a/block.c b/block.c
index 3ea088b9fb..f026d710b7 100644
--- a/block.c
+++ b/block.c
@@ -5689,19 +5689,6 @@ int bdrv_debug_resume(BlockDriverState *bs, const char 
*tag)
 return -ENOTSUP;
 }
 
-bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
-{
-while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
-bs = bdrv_primary_bs(bs);
-}
-
-if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
-return bs->drv->bdrv_debug_is_suspended(bs, tag);
-}
-
-return false;
-}
-
 void coroutine_fn bdrv_debug_wait_break(BlockDriverState *bs, const char *tag)
 {
 while (bs && bs->drv && !bs->drv->bdrv_debug_wait_break) {
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 10b7c38467..608d1d5bd6 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -1062,7 +1062,6 @@ static BlockDriver bdrv_blkdebug = {
 .bdrv_debug_remove_breakpoint
 = blkdebug_debug_remove_breakpoint,
 .bdrv_debug_resume  = blkdebug_debug_resume,
-.bdrv_debug_is_suspended= blkdebug_debug_is_suspended,
 .bdrv_debug_wait_break  = blkdebug_debug_wait_break,
 
 .strong_runtime_opts= blkdebug_strong_runtime_opts,
-- 
2.29.2




[PATCH 10/11] qemu-io-cmds: drop do_co_readv() and do_co_writev() helpers

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
They don't make much sense. Call blk_co_ functions directly and also
drop some redundant variables.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qemu-io-cmds.c | 38 ++
 1 file changed, 6 insertions(+), 32 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 2f0a27079d..9a0e5322de 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -527,24 +527,6 @@ fail:
 return buf;
 }
 
-static int coroutine_fn do_co_readv(BlockBackend *blk, QEMUIOVector *qiov,
-int64_t offset, int *total)
-{
-int ret = blk_co_preadv(blk, offset, qiov->size, qiov, 0);
-
-*total = qiov->size;
-return ret < 0 ? ret : 1;
-}
-
-static int coroutine_fn do_co_writev(BlockBackend *blk, QEMUIOVector *qiov,
- int64_t offset, int flags, int *total)
-{
-int ret = blk_co_pwritev(blk, offset, qiov->size, qiov, flags);
-
-*total = qiov->size;
-return ret < 0 ? ret : 1;
-}
-
 static void read_help(void)
 {
 printf(
@@ -767,11 +749,10 @@ static int coroutine_fn readv_f(BlockBackend *blk, int 
argc, char **argv)
 {
 struct timespec t1, t2;
 bool Cflag = false, qflag = false, vflag = false;
-int c, cnt, ret;
+int c, ret;
 char *buf;
 int64_t offset;
 /* Some compilers get confused and warn if this is not initialized.  */
-int total = 0;
 int nr_iov;
 QEMUIOVector qiov;
 int pattern = 0;
@@ -821,16 +802,13 @@ static int coroutine_fn readv_f(BlockBackend *blk, int 
argc, char **argv)
 }
 
 clock_gettime(CLOCK_MONOTONIC, &t1);
-ret = do_co_readv(blk, &qiov, offset, &total);
+ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0);
 clock_gettime(CLOCK_MONOTONIC, &t2);
 
 if (ret < 0) {
 printf("readv failed: %s\n", strerror(-ret));
 goto out;
 }
-cnt = ret;
-
-ret = 0;
 
 if (Pflag) {
 void *cmp_buf = g_malloc(qiov.size);
@@ -853,7 +831,7 @@ static int coroutine_fn readv_f(BlockBackend *blk, int 
argc, char **argv)
 
 /* Finally, report back -- -C gives a parsable format */
 t2 = tsub(t2, t1);
-print_report("read", &t2, offset, qiov.size, total, cnt, Cflag);
+print_report("read", &t2, offset, qiov.size, qiov.size, 1, Cflag);
 
 out:
 qemu_iovec_destroy(&qiov);
@@ -1100,11 +1078,10 @@ static int coroutine_fn writev_f(BlockBackend *blk, int 
argc, char **argv)
 struct timespec t1, t2;
 bool Cflag = false, qflag = false;
 int flags = 0;
-int c, cnt, ret;
+int c, ret;
 char *buf;
 int64_t offset;
 /* Some compilers get confused and warn if this is not initialized.  */
-int total = 0;
 int nr_iov;
 int pattern = 0xcd;
 QEMUIOVector qiov;
@@ -1151,16 +1128,13 @@ static int coroutine_fn writev_f(BlockBackend *blk, int 
argc, char **argv)
 }
 
 clock_gettime(CLOCK_MONOTONIC, &t1);
-ret = do_co_writev(blk, &qiov, offset, flags, &total);
+ret = blk_co_pwritev(blk, offset, qiov.size,  &qiov, flags);
 clock_gettime(CLOCK_MONOTONIC, &t2);
 
 if (ret < 0) {
 printf("writev failed: %s\n", strerror(-ret));
 goto out;
 }
-cnt = ret;
-
-ret = 0;
 
 if (qflag) {
 goto out;
@@ -1168,7 +1142,7 @@ static int coroutine_fn writev_f(BlockBackend *blk, int 
argc, char **argv)
 
 /* Finally, report back -- -C gives a parsable format */
 t2 = tsub(t2, t1);
-print_report("wrote", &t2, offset, qiov.size, total, cnt, Cflag);
+print_report("wrote", &t2, offset, qiov.size, qiov.size, 1, Cflag);
 out:
 qemu_iovec_destroy(&qiov);
 qemu_io_free(buf);
-- 
2.29.2




[PATCH 08/11] qemu-io-cmds: refactor read_f(): drop extra helpers and variables

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
Now all commands are in coroutines, so we want to move to _co_
functions. Let's just drop helpers and use blk_co_ functions directly.
Note that count is checked at start of read_f anyway.
Both blk_co_pread and blk_co_load_vmstate returns 0 on success, so we
should not care to set ret to 0 explicitly. Moreover, no caller is
care, is successful ret of qemuio_command positive or not.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qemu-io-cmds.c | 44 ++--
 1 file changed, 6 insertions(+), 38 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index adc9e64c37..bbebecba55 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -527,20 +527,6 @@ fail:
 return buf;
 }
 
-static int do_pread(BlockBackend *blk, char *buf, int64_t offset,
-int64_t bytes, int64_t *total)
-{
-if (bytes > INT_MAX) {
-return -ERANGE;
-}
-
-*total = blk_pread(blk, offset, (uint8_t *)buf, bytes);
-if (*total < 0) {
-return *total;
-}
-return 1;
-}
-
 static int do_pwrite(BlockBackend *blk, char *buf, int64_t offset,
  int64_t bytes, int flags, int64_t *total)
 {
@@ -586,20 +572,6 @@ static int do_write_compressed(BlockBackend *blk, char 
*buf, int64_t offset,
 return 1;
 }
 
-static int do_load_vmstate(BlockBackend *blk, char *buf, int64_t offset,
-   int64_t count, int64_t *total)
-{
-if (count > INT_MAX) {
-return -ERANGE;
-}
-
-*total = blk_load_vmstate(blk, (uint8_t *)buf, offset, count);
-if (*total < 0) {
-return *total;
-}
-return 1;
-}
-
 static int do_save_vmstate(BlockBackend *blk, char *buf, int64_t offset,
int64_t count, int64_t *total)
 {
@@ -667,17 +639,16 @@ static const cmdinfo_t read_cmd = {
 .help   = read_help,
 };
 
-static int read_f(BlockBackend *blk, int argc, char **argv)
+static int coroutine_fn read_f(BlockBackend *blk, int argc, char **argv)
 {
 struct timespec t1, t2;
 bool Cflag = false, qflag = false, vflag = false;
 bool Pflag = false, sflag = false, lflag = false, bflag = false;
-int c, cnt, ret;
-char *buf;
+int c, ret;
+uint8_t *buf;
 int64_t offset;
 int64_t count;
 /* Some compilers get confused and warn if this is not initialized.  */
-int64_t total = 0;
 int pattern = 0;
 int64_t pattern_offset = 0, pattern_count = 0;
 
@@ -780,9 +751,9 @@ static int read_f(BlockBackend *blk, int argc, char **argv)
 
 clock_gettime(CLOCK_MONOTONIC, &t1);
 if (bflag) {
-ret = do_load_vmstate(blk, buf, offset, count, &total);
+ret = blk_co_load_vmstate(blk, buf, offset, count);
 } else {
-ret = do_pread(blk, buf, offset, count, &total);
+ret = blk_co_pread(blk, offset, count, buf, 0);
 }
 clock_gettime(CLOCK_MONOTONIC, &t2);
 
@@ -790,9 +761,6 @@ static int read_f(BlockBackend *blk, int argc, char **argv)
 printf("read failed: %s\n", strerror(-ret));
 goto out;
 }
-cnt = ret;
-
-ret = 0;
 
 if (Pflag) {
 void *cmp_buf = g_malloc(pattern_count);
@@ -816,7 +784,7 @@ static int read_f(BlockBackend *blk, int argc, char **argv)
 
 /* Finally, report back -- -C gives a parsable format */
 t2 = tsub(t2, t1);
-print_report("read", &t2, offset, count, total, cnt, Cflag);
+print_report("read", &t2, offset, count, count, 1, Cflag);
 
 out:
 qemu_io_free(buf);
-- 
2.29.2




[PATCH 09/11] qemu-io-cmds: refactor write_f(): drop extra helpers and variables

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
We are in coroutine context. Let's call blk_co_ functions directly and
drop all these helpers.
Note that count is checked earlier in write_f, so we don't need the
check in helpers.
Also, both blk_co_save_vmstate() and blk_co_pwrite() return 0 on
success, so we should not care to set ret to 0 explicitly. Moreover, no
caller is interested in successful ret of qemuio_command being exactly
zero.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qemu-io-cmds.c | 81 +-
 1 file changed, 8 insertions(+), 73 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index bbebecba55..2f0a27079d 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -527,65 +527,6 @@ fail:
 return buf;
 }
 
-static int do_pwrite(BlockBackend *blk, char *buf, int64_t offset,
- int64_t bytes, int flags, int64_t *total)
-{
-if (bytes > INT_MAX) {
-return -ERANGE;
-}
-
-*total = blk_pwrite(blk, offset, (uint8_t *)buf, bytes, flags);
-if (*total < 0) {
-return *total;
-}
-return 1;
-}
-
-static int coroutine_fn
-do_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
-int64_t bytes, int flags, int64_t *total)
-{
-int ret = blk_co_pwrite_zeroes(blk, offset, bytes, flags);
-if (ret < 0) {
-*total = ret;
-return ret;
-} else {
-*total = bytes;
-return 1;
-}
-}
-
-static int do_write_compressed(BlockBackend *blk, char *buf, int64_t offset,
-   int64_t bytes, int64_t *total)
-{
-int ret;
-
-if (bytes > BDRV_REQUEST_MAX_BYTES) {
-return -ERANGE;
-}
-
-ret = blk_pwrite_compressed(blk, offset, buf, bytes);
-if (ret < 0) {
-return ret;
-}
-*total = bytes;
-return 1;
-}
-
-static int do_save_vmstate(BlockBackend *blk, char *buf, int64_t offset,
-   int64_t count, int64_t *total)
-{
-if (count > INT_MAX) {
-return -ERANGE;
-}
-
-*total = blk_save_vmstate(blk, (uint8_t *)buf, offset, count);
-if (*total < 0) {
-return *total;
-}
-return 1;
-}
-
 static int coroutine_fn do_co_readv(BlockBackend *blk, QEMUIOVector *qiov,
 int64_t offset, int *total)
 {
@@ -945,7 +886,7 @@ static void write_help(void)
 "\n");
 }
 
-static int write_f(BlockBackend *blk, int argc, char **argv);
+static int coroutine_fn write_f(BlockBackend *blk, int argc, char **argv);
 
 static const cmdinfo_t write_cmd = {
 .name   = "write",
@@ -965,12 +906,11 @@ static int coroutine_fn write_f(BlockBackend *blk, int 
argc, char **argv)
 bool Cflag = false, qflag = false, bflag = false;
 bool Pflag = false, zflag = false, cflag = false, sflag = false;
 int flags = 0;
-int c, cnt, ret;
-char *buf = NULL;
+int c, ret;
+uint8_t *buf = NULL;
 int64_t offset;
 int64_t count;
 /* Some compilers get confused and warn if this is not initialized.  */
-int64_t total = 0;
 int pattern = 0xcd;
 const char *file_name = NULL;
 
@@ -981,6 +921,7 @@ static int coroutine_fn write_f(BlockBackend *blk, int 
argc, char **argv)
 break;
 case 'c':
 cflag = true;
+flags |= BDRV_REQ_WRITE_COMPRESSED;
 break;
 case 'C':
 Cflag = true;
@@ -1013,6 +954,7 @@ static int coroutine_fn write_f(BlockBackend *blk, int 
argc, char **argv)
 break;
 case 'z':
 zflag = true;
+flags |= BDRV_REQ_ZERO_WRITE;
 break;
 default:
 qemuio_command_usage(&write_cmd);
@@ -1095,13 +1037,9 @@ static int coroutine_fn write_f(BlockBackend *blk, int 
argc, char **argv)
 
 clock_gettime(CLOCK_MONOTONIC, &t1);
 if (bflag) {
-ret = do_save_vmstate(blk, buf, offset, count, &total);
-} else if (zflag) {
-ret = do_co_pwrite_zeroes(blk, offset, count, flags, &total);
-} else if (cflag) {
-ret = do_write_compressed(blk, buf, offset, count, &total);
+ret = blk_co_save_vmstate(blk, buf, offset, count);
 } else {
-ret = do_pwrite(blk, buf, offset, count, flags, &total);
+ret = blk_co_pwrite(blk, offset, count, buf, flags);
 }
 clock_gettime(CLOCK_MONOTONIC, &t2);
 
@@ -1109,9 +1047,6 @@ static int coroutine_fn write_f(BlockBackend *blk, int 
argc, char **argv)
 printf("write failed: %s\n", strerror(-ret));
 goto out;
 }
-cnt = ret;
-
-ret = 0;
 
 if (qflag) {
 goto out;
@@ -1119,7 +1054,7 @@ static int coroutine_fn write_f(BlockBackend *blk, int 
argc, char **argv)
 
 /* Finally, report back -- -C gives a parsable format */
 t2 = tsub(t2, t1);
-print_report("wrote", &t2, offset, count, total, cnt, Cflag);
+print_report("wrote", &t2, offset, count, count, 1, Cflag);
 
 out:
 if (!zflag) {
-- 
2.29.2




[PATCH 05/11] qemu-io-cmds: move qemu-io commands to coroutine

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
Move qemuio_command to coroutine with all qemu io commands to simplify
the code and avoid extra explicit polling loops.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/qemu-io.h |   9 +++-
 qemu-io-cmds.c| 110 ++
 block/meson.build |   3 +-
 3 files changed, 34 insertions(+), 88 deletions(-)

diff --git a/include/qemu-io.h b/include/qemu-io.h
index 3af513004a..71cca117b9 100644
--- a/include/qemu-io.h
+++ b/include/qemu-io.h
@@ -18,6 +18,7 @@
 #ifndef QEMU_IO_H
 #define QEMU_IO_H
 
+#include "block/block.h"
 
 #define CMD_FLAG_GLOBAL ((int)0x8000) /* don't iterate "args" */
 
@@ -45,7 +46,13 @@ typedef struct cmdinfo {
 
 extern bool qemuio_misalign;
 
-int qemuio_command(BlockBackend *blk, const char *cmd);
+int coroutine_fn qemuio_co_command(BlockBackend *blk, const char *cmd);
+
+/*
+ * Called with aio context of blk acquired. Or with qemu_get_aio_context()
+ * context acquired if no blk is NULL.
+ */
+int generated_co_wrapper qemuio_command(BlockBackend *blk, const char *cmd);
 
 void qemuio_add_command(const cmdinfo_t *ci);
 void qemuio_command_usage(const cmdinfo_t *ci);
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 19149e014d..adc9e64c37 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -555,56 +555,16 @@ static int do_pwrite(BlockBackend *blk, char *buf, 
int64_t offset,
 return 1;
 }
 
-typedef struct {
-BlockBackend *blk;
-int64_t offset;
-int64_t bytes;
-int64_t *total;
-int flags;
-int ret;
-bool done;
-} CoWriteZeroes;
-
-static void coroutine_fn co_pwrite_zeroes_entry(void *opaque)
+static int coroutine_fn
+do_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+int64_t bytes, int flags, int64_t *total)
 {
-CoWriteZeroes *data = opaque;
-
-data->ret = blk_co_pwrite_zeroes(data->blk, data->offset, data->bytes,
- data->flags);
-data->done = true;
-if (data->ret < 0) {
-*data->total = data->ret;
-return;
-}
-
-*data->total = data->bytes;
-}
-
-static int do_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
-   int64_t bytes, int flags, int64_t *total)
-{
-Coroutine *co;
-CoWriteZeroes data = {
-.blk= blk,
-.offset = offset,
-.bytes  = bytes,
-.total  = total,
-.flags  = flags,
-.done   = false,
-};
-
-if (bytes > INT_MAX) {
-return -ERANGE;
-}
-
-co = qemu_coroutine_create(co_pwrite_zeroes_entry, &data);
-bdrv_coroutine_enter(blk_bs(blk), co);
-while (!data.done) {
-aio_poll(blk_get_aio_context(blk), true);
-}
-if (data.ret < 0) {
-return data.ret;
+int ret = blk_co_pwrite_zeroes(blk, offset, bytes, flags);
+if (ret < 0) {
+*total = ret;
+return ret;
 } else {
+*total = bytes;
 return 1;
 }
 }
@@ -654,38 +614,22 @@ static int do_save_vmstate(BlockBackend *blk, char *buf, 
int64_t offset,
 return 1;
 }
 
-#define NOT_DONE 0x7fff
-static void aio_rw_done(void *opaque, int ret)
-{
-*(int *)opaque = ret;
-}
-
-static int do_aio_readv(BlockBackend *blk, QEMUIOVector *qiov,
-int64_t offset, int *total)
+static int coroutine_fn do_co_readv(BlockBackend *blk, QEMUIOVector *qiov,
+int64_t offset, int *total)
 {
-int async_ret = NOT_DONE;
-
-blk_aio_preadv(blk, offset, qiov, 0, aio_rw_done, &async_ret);
-while (async_ret == NOT_DONE) {
-main_loop_wait(false);
-}
+int ret = blk_co_preadv(blk, offset, qiov->size, qiov, 0);
 
 *total = qiov->size;
-return async_ret < 0 ? async_ret : 1;
+return ret < 0 ? ret : 1;
 }
 
-static int do_aio_writev(BlockBackend *blk, QEMUIOVector *qiov,
- int64_t offset, int flags, int *total)
+static int coroutine_fn do_co_writev(BlockBackend *blk, QEMUIOVector *qiov,
+ int64_t offset, int flags, int *total)
 {
-int async_ret = NOT_DONE;
-
-blk_aio_pwritev(blk, offset, qiov, flags, aio_rw_done, &async_ret);
-while (async_ret == NOT_DONE) {
-main_loop_wait(false);
-}
+int ret = blk_co_pwritev(blk, offset, qiov->size, qiov, flags);
 
 *total = qiov->size;
-return async_ret < 0 ? async_ret : 1;
+return ret < 0 ? ret : 1;
 }
 
 static void read_help(void)
@@ -910,7 +854,7 @@ static const cmdinfo_t readv_cmd = {
 .help   = readv_help,
 };
 
-static int readv_f(BlockBackend *blk, int argc, char **argv)
+static int coroutine_fn readv_f(BlockBackend *blk, int argc, char **argv)
 {
 struct timespec t1, t2;
 bool Cflag = false, qflag = false, vflag = false;
@@ -968,7 +912,7 @@ static int readv_f(BlockBackend *blk, int argc, char **argv)
 }
 
 clock_gettime(CLOCK_MONOTONIC, &t1);
-ret = do_aio_readv(blk, &qiov, offset, &total);
+ret = do_co_readv(blk, &qiov, offset, 

[PATCH 02/11] block-coroutine-wrapper: support BlockBackend first argument

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
We'll need to wrap functions with first argument of BlockBackend *
type. For this let's generalize core function and struct to work with
pure AioContext.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/block-gen.h  | 12 ++--
 scripts/block-coroutine-wrapper.py | 23 ++-
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/block/block-gen.h b/block/block-gen.h
index f80cf4897d..c1fd3f40de 100644
--- a/block/block-gen.h
+++ b/block/block-gen.h
@@ -29,19 +29,19 @@
 #include "block/block_int.h"
 
 /* Base structure for argument packing structures */
-typedef struct BdrvPollCo {
-BlockDriverState *bs;
+typedef struct AioPollCo {
+AioContext *ctx;
 bool in_progress;
 int ret;
 Coroutine *co; /* Keep pointer here for debugging */
-} BdrvPollCo;
+} AioPollCo;
 
-static inline int bdrv_poll_co(BdrvPollCo *s)
+static inline int aio_poll_co(AioPollCo *s)
 {
 assert(!qemu_in_coroutine());
 
-bdrv_coroutine_enter(s->bs, s->co);
-BDRV_POLL_WHILE(s->bs, s->in_progress);
+aio_co_enter(s->ctx, s->co);
+AIO_WAIT_WHILE(s->ctx, s->in_progress);
 
 return s->ret;
 }
diff --git a/scripts/block-coroutine-wrapper.py 
b/scripts/block-coroutine-wrapper.py
index 85dbeb9ecf..114a54fcce 100644
--- a/scripts/block-coroutine-wrapper.py
+++ b/scripts/block-coroutine-wrapper.py
@@ -42,6 +42,8 @@ def gen_header():
 #include "qemu/osdep.h"
 #include "block/coroutines.h"
 #include "block/block-gen.h"
+#include "qemu-io.h"
+#include "sysemu/block-backend.h"
 #include "block/block_int.h"\
 """
 
@@ -100,12 +102,23 @@ def snake_to_camel(func_name: str) -> str:
 def gen_wrapper(func: FuncDecl) -> str:
 assert not '_co_' in func.name
 assert func.return_type == 'int'
-assert func.args[0].type in ['BlockDriverState *', 'BdrvChild *']
+assert func.args[0].type in ['BlockDriverState *', 'BdrvChild *',
+ 'BlockBackend *']
 
 subsystem, subname = func.name.split('_', 1)
 
 name = f'{subsystem}_co_{subname}'
-bs = 'bs' if func.args[0].type == 'BlockDriverState *' else 'child->bs'
+
+first_arg_type = func.args[0].type
+if first_arg_type == 'BlockDriverState *':
+ctx = 'bdrv_get_aio_context(bs)'
+elif first_arg_type == 'BdrvChild *':
+ctx = '(child ? bdrv_get_aio_context(child->bs) : ' \
+'qemu_get_aio_context())'
+else:
+assert first_arg_type == 'BlockBackend *'
+ctx = '(blk ? blk_get_aio_context(blk) : qemu_get_aio_context())'
+
 struct_name = snake_to_camel(name)
 
 return f"""\
@@ -114,7 +127,7 @@ def gen_wrapper(func: FuncDecl) -> str:
  */
 
 typedef struct {struct_name} {{
-BdrvPollCo poll_state;
+AioPollCo poll_state;
 { func.gen_block('{decl};') }
 }} {struct_name};
 
@@ -134,7 +147,7 @@ def gen_wrapper(func: FuncDecl) -> str:
 return {name}({ func.gen_list('{name}') });
 }} else {{
 {struct_name} s = {{
-.poll_state.bs = {bs},
+.poll_state.ctx = {ctx},
 .poll_state.in_progress = true,
 
 { func.gen_block('.{name} = {name},') }
@@ -142,7 +155,7 @@ def gen_wrapper(func: FuncDecl) -> str:
 
 s.poll_state.co = qemu_coroutine_create({name}_entry, &s);
 
-return bdrv_poll_co(&s.poll_state);
+return aio_poll_co(&s.poll_state);
 }}
 }}"""
 
-- 
2.29.2




[PATCH 04/11] block: introduce bdrv_debug_wait_break

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
Add a handler to wait for the break to happen in coroutine context. It
will be used in further commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block.h |  1 +
 include/block/block_int.h |  1 +
 block.c   | 11 +++
 block/blkdebug.c  | 16 
 4 files changed, 29 insertions(+)

diff --git a/include/block/block.h b/include/block/block.h
index b3f6e509d4..e133adf54f 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -648,6 +648,7 @@ int bdrv_debug_breakpoint(BlockDriverState *bs, const char 
*event,
 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag);
 int bdrv_debug_resume(BlockDriverState *bs, const char *tag);
 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag);
+void coroutine_fn bdrv_debug_wait_break(BlockDriverState *bs, const char *tag);
 
 /**
  * bdrv_get_aio_context:
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 50af58af75..89e6904fc7 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -502,6 +502,7 @@ struct BlockDriver {
 const char *tag);
 int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
 bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
+void (*bdrv_debug_wait_break)(BlockDriverState *bs, const char *tag);
 
 void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);
 
diff --git a/block.c b/block.c
index 001453105e..3ea088b9fb 100644
--- a/block.c
+++ b/block.c
@@ -5702,6 +5702,17 @@ bool bdrv_debug_is_suspended(BlockDriverState *bs, const 
char *tag)
 return false;
 }
 
+void coroutine_fn bdrv_debug_wait_break(BlockDriverState *bs, const char *tag)
+{
+while (bs && bs->drv && !bs->drv->bdrv_debug_wait_break) {
+bs = bdrv_primary_bs(bs);
+}
+
+if (bs && bs->drv && bs->drv->bdrv_debug_wait_break) {
+bs->drv->bdrv_debug_wait_break(bs, tag);
+}
+}
+
 /* backing_file can either be relative, or absolute, or a protocol.  If it is
  * relative, it must be relative to the chain.  So, passing in bs->filename
  * from a BDS as backing_file should not be done, as that may be relative to
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 2c0b9b0ee8..10b7c38467 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -57,6 +57,7 @@ typedef struct BDRVBlkdebugState {
 QLIST_HEAD(, BlkdebugRule) rules[BLKDBG__MAX];
 QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
 QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs;
+CoQueue break_waiters;
 } BDRVBlkdebugState;
 
 typedef struct BlkdebugAIOCB {
@@ -467,6 +468,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
 int ret;
 uint64_t align;
 
+qemu_co_queue_init(&s->break_waiters);
+
 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
 if (!qemu_opts_absorb_qdict(opts, options, errp)) {
 ret = -EINVAL;
@@ -785,6 +788,8 @@ static void suspend_request(BlockDriverState *bs, 
BlkdebugRule *rule)
 remove_rule(rule);
 QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next);
 
+qemu_co_queue_restart_all(&s->break_waiters);
+
 if (!qtest_enabled()) {
 printf("blkdebug: Suspended request '%s'\n", r.tag);
 }
@@ -922,6 +927,16 @@ static bool blkdebug_debug_is_suspended(BlockDriverState 
*bs, const char *tag)
 return false;
 }
 
+static void coroutine_fn
+blkdebug_debug_wait_break(BlockDriverState *bs, const char *tag)
+{
+BDRVBlkdebugState *s = bs->opaque;
+
+while (!blkdebug_debug_is_suspended(bs, tag)) {
+qemu_co_queue_wait(&s->break_waiters, NULL);
+}
+}
+
 static int64_t blkdebug_getlength(BlockDriverState *bs)
 {
 return bdrv_getlength(bs->file->bs);
@@ -1048,6 +1063,7 @@ static BlockDriver bdrv_blkdebug = {
 = blkdebug_debug_remove_breakpoint,
 .bdrv_debug_resume  = blkdebug_debug_resume,
 .bdrv_debug_is_suspended= blkdebug_debug_is_suspended,
+.bdrv_debug_wait_break  = blkdebug_debug_wait_break,
 
 .strong_runtime_opts= blkdebug_strong_runtime_opts,
 };
-- 
2.29.2




[PATCH 11/11] block-backend: drop unused blk_save_vmstate() and blk_load_vmstate()

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/sysemu/block-backend.h |  3 ---
 block/block-backend.c  | 30 --
 2 files changed, 33 deletions(-)

diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 8676bbde5a..14cc410244 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -242,9 +242,6 @@ int blk_pwrite_compressed(BlockBackend *blk, int64_t 
offset, const void *buf,
 int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
  PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);
 int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes);
-int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
- int64_t pos, int size);
-int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size);
 int blk_co_save_vmstate(BlockBackend *blk, const uint8_t *buf,
 int64_t pos, int size);
 int blk_co_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int 
size);
diff --git a/block/block-backend.c b/block/block-backend.c
index d7f91ce7ad..83aafda791 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2198,36 +2198,6 @@ int blk_truncate(BlockBackend *blk, int64_t offset, bool 
exact,
 return bdrv_truncate(blk->root, offset, exact, prealloc, flags, errp);
 }
 
-int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
- int64_t pos, int size)
-{
-int ret;
-
-if (!blk_is_available(blk)) {
-return -ENOMEDIUM;
-}
-
-ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
-if (ret < 0) {
-return ret;
-}
-
-if (ret == size && !blk->enable_write_cache) {
-ret = bdrv_flush(blk_bs(blk));
-}
-
-return ret < 0 ? ret : size;
-}
-
-int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
-{
-if (!blk_is_available(blk)) {
-return -ENOMEDIUM;
-}
-
-return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
-}
-
 int blk_co_save_vmstate(BlockBackend *blk, const uint8_t *buf,
 int64_t pos, int size)
 {
-- 
2.29.2




[PATCH 07/11] block-backend: add _co_ versions of blk_save_vmstate / blk_load_vmstate

2021-04-23 Thread Vladimir Sementsov-Ogievskiy
To be used in further commit. Don't worry about some duplication with
existing blk_save_vmstate() and blk_load_vmstate(): they will be
removed soon.

Note the difference: new functions returns 0 on success.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/sysemu/block-backend.h |  3 +++
 block/block-backend.c  | 37 ++
 2 files changed, 40 insertions(+)

diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 880e903293..8676bbde5a 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -245,6 +245,9 @@ int blk_pdiscard(BlockBackend *blk, int64_t offset, int 
bytes);
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
  int64_t pos, int size);
 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size);
+int blk_co_save_vmstate(BlockBackend *blk, const uint8_t *buf,
+int64_t pos, int size);
+int blk_co_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int 
size);
 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz);
 int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo);
 BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
diff --git a/block/block-backend.c b/block/block-backend.c
index 413af51f3b..d7f91ce7ad 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -14,6 +14,7 @@
 #include "sysemu/block-backend.h"
 #include "block/block_int.h"
 #include "block/blockjob.h"
+#include "block/coroutines.h"
 #include "block/throttle-groups.h"
 #include "hw/qdev-core.h"
 #include "sysemu/blockdev.h"
@@ -2227,6 +2228,42 @@ int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, 
int64_t pos, int size)
 return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
 }
 
+int blk_co_save_vmstate(BlockBackend *blk, const uint8_t *buf,
+int64_t pos, int size)
+{
+int ret;
+QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
+
+if (!blk_is_available(blk)) {
+return -ENOMEDIUM;
+}
+
+ret = bdrv_co_writev_vmstate(blk_bs(blk), &qiov, pos);
+if (ret < 0) {
+return ret;
+}
+
+if (!blk->enable_write_cache) {
+ret = bdrv_flush(blk_bs(blk));
+}
+
+return ret < 0 ? ret : 0;
+}
+
+int blk_co_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
+{
+int ret;
+QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
+
+if (!blk_is_available(blk)) {
+return -ENOMEDIUM;
+}
+
+ret = bdrv_co_readv_vmstate(blk_bs(blk), &qiov, pos);
+
+return ret < 0 ? ret : 0;
+}
+
 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
 {
 if (!blk_is_available(blk)) {
-- 
2.29.2




Re: [PATCH 03/11] block/block-gen.h: bind monitor

2021-04-23 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> If we have current monitor, let's bind it to wrapper coroutine too.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/block-gen.h | 10 ++
>  1 file changed, 10 insertions(+)
>
> diff --git a/block/block-gen.h b/block/block-gen.h
> index c1fd3f40de..61f055a8cc 100644
> --- a/block/block-gen.h
> +++ b/block/block-gen.h
> @@ -27,6 +27,7 @@
>  #define BLOCK_BLOCK_GEN_H
>  
>  #include "block/block_int.h"
> +#include "monitor/monitor.h"
>  
>  /* Base structure for argument packing structures */
>  typedef struct AioPollCo {
> @@ -38,11 +39,20 @@ typedef struct AioPollCo {
>  
>  static inline int aio_poll_co(AioPollCo *s)
>  {
> +Monitor *mon = monitor_cur();

This gets the currently executing coroutine's monitor from the hash
table.

>  assert(!qemu_in_coroutine());
>  
> +if (mon) {
> +monitor_set_cur(s->co, mon);

This writes it back.  No-op, since the coroutine hasn't changed.  Why?

> +}
> +
>  aio_co_enter(s->ctx, s->co);
>  AIO_WAIT_WHILE(s->ctx, s->in_progress);
>  
> +if (mon) {
> +monitor_set_cur(s->co, NULL);

This removes s->co's monitor from the hash table.  Why?

> +}
> +
>  return s->ret;
>  }