[PATCH 3/3] hw/block/nvme: add id ns metadata cap (mc) enum

2021-04-21 Thread Gollu Appalanaidu
Add the Identify Namespace Metadata Capabilities (MC) enum.

Signed-off-by: Gollu Appalanaidu 
---
 hw/block/nvme-ns.c   | 2 +-
 include/block/nvme.h | 5 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 9065a7ae99..db75302136 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -85,7 +85,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 ds = 31 - clz32(ns->blkconf.logical_block_size);
 ms = ns->params.ms;
 
-id_ns->mc = 0x3;
+id_ns->mc = NVME_ID_NS_MC_EXTENDED | NVME_ID_NS_MC_SEPARATE;
 
 if (ms && ns->params.mset) {
 id_ns->flbas |= NVME_ID_NS_FLBAS_EXTENDEND;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 1d61030756..a3b610ba86 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1344,6 +1344,11 @@ enum NvmeIdNsFlbas {
 NVME_ID_NS_FLBAS_EXTENDEND  = 1 << 4,
 };
 
+enum NvmeIdNsMc {
+NVME_ID_NS_MC_EXTENDED  = 1 << 0,
+NVME_ID_NS_MC_SEPARATE  = 1 << 1,
+};
+
 #define NVME_ID_NS_DPS_TYPE(dps) (dps & NVME_ID_NS_DPS_TYPE_MASK)
 
 typedef struct NvmeDifTuple {
-- 
2.17.1
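
For reference, the two MC bits declared above advertise how the controller can transfer metadata: appended to the data as an extended LBA, or in a separate buffer. A minimal standalone sketch of decoding the field (the enum values are restated here from the patch; main() is purely illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Bit definitions as introduced by the patch above. */
    enum NvmeIdNsMc {
        NVME_ID_NS_MC_EXTENDED  = 1 << 0,
        NVME_ID_NS_MC_SEPARATE  = 1 << 1,
    };

    int main(void)
    {
        uint8_t mc = NVME_ID_NS_MC_EXTENDED | NVME_ID_NS_MC_SEPARATE; /* the old 0x3 */

        printf("metadata as extended LBA: %s\n",
               mc & NVME_ID_NS_MC_EXTENDED ? "yes" : "no");
        printf("metadata in separate buffer: %s\n",
               mc & NVME_ID_NS_MC_SEPARATE ? "yes" : "no");
        return 0;
    }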




[PATCH 2/3] hw/block/nvme: add id ns flbas enum

2021-04-21 Thread Gollu Appalanaidu
Add the Identify Namespace FLBAS-related enum and remove the
NVME_ID_NS_FLBAS_EXTENDED macro; it is used in only one place,
so convert that use to the enum.

Signed-off-by: Gollu Appalanaidu 
---
 hw/block/nvme-ns.c   | 2 +-
 hw/block/nvme-ns.h   | 2 +-
 include/block/nvme.h | 5 -
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index ae56142fcd..9065a7ae99 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -88,7 +88,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 id_ns->mc = 0x3;
 
 if (ms && ns->params.mset) {
-id_ns->flbas |= 0x10;
+id_ns->flbas |= NVME_ID_NS_FLBAS_EXTENDEND;
 }
 
 id_ns->dpc = 0x1f;
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index fb0a41f912..5aa36cd1d2 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -134,7 +134,7 @@ static inline size_t nvme_m2b(NvmeNamespace *ns, uint64_t lba)
 
 static inline bool nvme_ns_ext(NvmeNamespace *ns)
 {
-return !!NVME_ID_NS_FLBAS_EXTENDED(ns->id_ns.flbas);
+return ns->id_ns.flbas & NVME_ID_NS_FLBAS_EXTENDEND;
 }
 
 /* calculate the number of LBAs that the namespace can accomodate */
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 4ac926fbc6..1d61030756 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1321,7 +1321,6 @@ typedef struct QEMU_PACKED NvmeIdNsZoned {
 
 #define NVME_ID_NS_NSFEAT_THIN(nsfeat)  ((nsfeat & 0x1))
 #define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1)
-#define NVME_ID_NS_FLBAS_EXTENDED(flbas)((flbas >> 4) & 0x1)
 #define NVME_ID_NS_FLBAS_INDEX(flbas)   ((flbas & 0xf))
 #define NVME_ID_NS_MC_SEPARATE(mc)  ((mc >> 1) & 0x1)
 #define NVME_ID_NS_MC_EXTENDED(mc)  ((mc & 0x1))
@@ -1341,6 +1340,10 @@ enum NvmeIdNsDps {
 NVME_ID_NS_DPS_FIRST_EIGHT = 8,
 };
 
+enum NvmeIdNsFlbas {
+NVME_ID_NS_FLBAS_EXTENDEND  = 1 << 4,
+};
+
 #define NVME_ID_NS_DPS_TYPE(dps) (dps & NVME_ID_NS_DPS_TYPE_MASK)
 
 typedef struct NvmeDifTuple {
-- 
2.17.1
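
For reference, FLBAS packs the selected LBA format index into bits 3:0 (see the NVME_ID_NS_FLBAS_INDEX() macro kept by the patch) and the extended-metadata flag into bit 4. A minimal standalone sketch of decoding the byte (values restated from the patch; the flbas value in main() is illustrative):

    #include <stdint.h>
    #include <stdio.h>

    enum NvmeIdNsFlbas {
        NVME_ID_NS_FLBAS_EXTENDEND = 1 << 4,   /* spelled as in the patch */
    };

    #define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas) & 0xf)

    int main(void)
    {
        uint8_t flbas = NVME_ID_NS_FLBAS_EXTENDEND | 0x5;

        printf("LBA format index: %u\n", NVME_ID_NS_FLBAS_INDEX(flbas));
        printf("extended metadata: %s\n",
               flbas & NVME_ID_NS_FLBAS_EXTENDEND ? "yes" : "no");
        return 0;
    }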




[PATCH 1/3] hw/block/nvme: fix lbaf formats initialization

2021-04-21 Thread Gollu Appalanaidu
Currently, the LBAF formats are initialized based on the metadata
size, and only if the nvme-ns "ms" parameter has a non-zero value.
Since the Format NVM command is supported, the device parameter "ms"
may not be the right criterion for initializing the supported LBAFs.

Also, make the LBAF array read-only.

Reviewed-by: Klaus Jensen 
Signed-off-by: Gollu Appalanaidu 
---
 hw/block/nvme-ns.c | 51 --
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 7bb618f182..ae56142fcd 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -85,39 +85,32 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 ds = 31 - clz32(ns->blkconf.logical_block_size);
 ms = ns->params.ms;
 
-if (ns->params.ms) {
-id_ns->mc = 0x3;
+id_ns->mc = 0x3;
 
-if (ns->params.mset) {
-id_ns->flbas |= 0x10;
-}
-
-id_ns->dpc = 0x1f;
-id_ns->dps = ((ns->params.pil & 0x1) << 3) | ns->params.pi;
-
-NvmeLBAF lbaf[16] = {
-[0] = { .ds =  9   },
-[1] = { .ds =  9, .ms =  8 },
-[2] = { .ds =  9, .ms = 16 },
-[3] = { .ds =  9, .ms = 64 },
-[4] = { .ds = 12   },
-[5] = { .ds = 12, .ms =  8 },
-[6] = { .ds = 12, .ms = 16 },
-[7] = { .ds = 12, .ms = 64 },
-};
-
-memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
-id_ns->nlbaf = 7;
-} else {
-NvmeLBAF lbaf[16] = {
-[0] = { .ds =  9 },
-[1] = { .ds = 12 },
-};
+if (ms && ns->params.mset) {
+id_ns->flbas |= 0x10;
+}
 
-memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
-id_ns->nlbaf = 1;
+id_ns->dpc = 0x1f;
+id_ns->dps = ns->params.pi;
+if (ns->params.pi && ns->params.pil) {
+id_ns->dps |= NVME_ID_NS_DPS_FIRST_EIGHT;
 }
 
+static const NvmeLBAF lbaf[16] = {
+[0] = { .ds =  9   },
+[1] = { .ds =  9, .ms =  8 },
+[2] = { .ds =  9, .ms = 16 },
+[3] = { .ds =  9, .ms = 64 },
+[4] = { .ds = 12   },
+[5] = { .ds = 12, .ms =  8 },
+[6] = { .ds = 12, .ms = 16 },
+[7] = { .ds = 12, .ms = 64 },
+};
+
+memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
+id_ns->nlbaf = 7;
+
 for (i = 0; i <= id_ns->nlbaf; i++) {
NvmeLBAF *lbaf = &id_ns->lbaf[i];
 if (lbaf->ds == ds) {
-- 
2.17.1
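
As a side note, the ds = 31 - clz32(...) line in the hunk above is the usual count-leading-zeros way of taking log2 of a power-of-two block size, which is then matched against the .ds fields of the LBAF table. A small standalone illustration (clz32() approximated here with the GCC/Clang builtin; it assumes a non-zero argument):

    #include <stdint.h>
    #include <stdio.h>

    /* Same behaviour as QEMU's clz32() for non-zero values. */
    static inline int clz32(uint32_t val)
    {
        return __builtin_clz(val);
    }

    int main(void)
    {
        uint32_t block_sizes[] = { 512, 4096 };

        for (int i = 0; i < 2; i++) {
            /* 512 -> ds = 9 and 4096 -> ds = 12, matching the LBAF table. */
            printf("%u -> ds = %d\n", block_sizes[i],
                   31 - clz32(block_sizes[i]));
        }
        return 0;
    }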




[PATCH 0/3] hw/block/nvme: fixes lbaf formats initialization and adds idns enums

2021-04-21 Thread Gollu Appalanaidu
This series fixes the LBAF formats initialization and adds the
Identify Namespace parameter enums to make the code more readable.

Gollu Appalanaidu (3):
  hw/block/nvme: fix lbaf formats initialization
  hw/block/nvme: add id ns flbas enum
  hw/block/nvme: add id ns metadata cap (mc) enum

 hw/block/nvme-ns.c   | 51 +++-
 hw/block/nvme-ns.h   |  2 +-
 include/block/nvme.h | 10 -
 3 files changed, 32 insertions(+), 31 deletions(-)

-- 
2.17.1




Re: [PATCH v2 7/7] hw/block/pflash_cfi02: Simplify pflash_cfi02_register() prototype

2021-04-21 Thread Richard Henderson

On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote:

The previous commit removed the mapping code from TYPE_PFLASH_CFI02.
pflash_cfi02_register() doesn't use the 'nb_mappings' argument
anymore. Simply remove it.

Reviewed-by: David Gibson 
Signed-off-by: Philippe Mathieu-Daudé 


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 6/7] hw/block/pflash_cfi02: Remove pflash_setup_mappings()

2021-04-21 Thread Richard Henderson

On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote:

All boards calling pflash_cfi02_register() use nb_mappings=1,
which does not do any mapping:

   $ git grep -wl pflash_cfi02_register hw/
   hw/arm/xilinx_zynq.c
   hw/block/pflash_cfi02.c
   hw/lm32/lm32_boards.c
   hw/ppc/ppc405_boards.c
   hw/sh4/r2d.c

We can remove this now unneeded code.

Reviewed-by: David Gibson 
Signed-off-by: Philippe Mathieu-Daudé 


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 5/7] hw/arm/digic: Map flash using memory_region_add_subregion_aliased()

2021-04-21 Thread Richard Henderson

On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote:

Instead of using a device specific feature for mapping the
flash memory multiple times over a wider region, use the
generic memory_region_add_subregion_aliased() helper.

There is no change in the memory layout.

* before:

   $ qemu-system-arm -M canon-a1100 -S -monitor stdio
   QEMU 5.2.90 monitor - type 'help' for more information
   (qemu) info mtree
   address-space: memory
     0000000000000000-ffffffffffffffff (prio 0, i/o): system
       0000000000000000-0000000003ffffff (prio 0, ram): ram
       00000000c0210000-00000000c02100ff (prio 0, i/o): digic-timer
       00000000c0210100-00000000c02101ff (prio 0, i/o): digic-timer
       00000000c0210200-00000000c02102ff (prio 0, i/o): digic-timer
       00000000c0800000-00000000c0800017 (prio 0, i/o): digic-uart
       00000000f8000000-00000000ffffffff (prio 0, i/o): pflash
         00000000f8000000-00000000f83fffff (prio 0, romd): alias pflash-alias @pflash 0000000000000000-00000000003fffff
         00000000f8400000-00000000f87fffff (prio 0, romd): alias pflash-alias @pflash 0000000000000000-00000000003fffff
         00000000f8800000-00000000f8bfffff (prio 0, romd): alias pflash-alias @pflash 0000000000000000-00000000003fffff
         ...
         00000000ff400000-00000000ff7fffff (prio 0, romd): alias pflash-alias @pflash 0000000000000000-00000000003fffff
         00000000ff800000-00000000ffbfffff (prio 0, romd): alias pflash-alias @pflash 0000000000000000-00000000003fffff
         00000000ffc00000-00000000ffffffff (prio 0, romd): alias pflash-alias @pflash 0000000000000000-00000000003fffff

* after:

   (qemu) info mtree
   address-space: memory
     0000000000000000-ffffffffffffffff (prio 0, i/o): system
       0000000000000000-0000000003ffffff (prio 0, ram): ram
       00000000c0210000-00000000c02100ff (prio 0, i/o): digic-timer
       00000000c0210100-00000000c02101ff (prio 0, i/o): digic-timer
       00000000c0210200-00000000c02102ff (prio 0, i/o): digic-timer
       00000000c0800000-00000000c0800017 (prio 0, i/o): digic-uart
       00000000f8000000-00000000ffffffff (prio 0, i/o): masked pflash [span of 4 MiB]
         00000000f8000000-00000000f83fffff (prio 0, romd): alias pflash [#0/32] @pflash 0000000000000000-00000000003fffff
         00000000f8400000-00000000f87fffff (prio 0, romd): alias pflash [#1/32] @pflash 0000000000000000-00000000003fffff
         00000000f8800000-00000000f8bfffff (prio 0, romd): alias pflash [#2/32] @pflash 0000000000000000-00000000003fffff
         ...
         00000000ff400000-00000000ff7fffff (prio 0, romd): alias pflash [#29/32] @pflash 0000000000000000-00000000003fffff
         00000000ff800000-00000000ffbfffff (prio 0, romd): alias pflash [#30/32] @pflash 0000000000000000-00000000003fffff
         00000000ffc00000-00000000ffffffff (prio 0, romd): alias pflash [#31/32] @pflash 0000000000000000-00000000003fffff

Signed-off-by: Philippe Mathieu-Daudé 
---
  hw/arm/digic_boards.c | 8 +---
  hw/arm/Kconfig| 1 +
  2 files changed, 6 insertions(+), 3 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 3/7] hw/arm/musicpal: Map flash using memory_region_add_subregion_aliased()

2021-04-21 Thread Richard Henderson

On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote:

Instead of using a device specific feature for mapping the
flash memory multiple times over a wider region, use the
generic memory_region_add_subregion_aliased() helper.

There is no change in the memory layout:

- before:

   (qemu) info mtree
   00000000fe000000-00000000ffffffff (prio 0, i/o): pflash
     00000000fe000000-00000000fe7fffff (prio 0, romd): alias pflash-alias @musicpal.flash 0000000000000000-00000000007fffff
     00000000fe800000-00000000feffffff (prio 0, romd): alias pflash-alias @musicpal.flash 0000000000000000-00000000007fffff
     00000000ff000000-00000000ff7fffff (prio 0, romd): alias pflash-alias @musicpal.flash 0000000000000000-00000000007fffff
     00000000ff800000-00000000ffffffff (prio 0, romd): alias pflash-alias @musicpal.flash 0000000000000000-00000000007fffff

- after:

   00000000fe000000-00000000ffffffff (prio 0, i/o): masked musicpal.flash [span of 8 MiB]
     00000000fe000000-00000000fe7fffff (prio 0, romd): alias musicpal.flash [#0/4] @musicpal.flash 0000000000000000-00000000007fffff
     00000000fe800000-00000000feffffff (prio 0, romd): alias musicpal.flash [#1/4] @musicpal.flash 0000000000000000-00000000007fffff
     00000000ff000000-00000000ff7fffff (prio 0, romd): alias musicpal.flash [#2/4] @musicpal.flash 0000000000000000-00000000007fffff
     00000000ff800000-00000000ffffffff (prio 0, romd): alias musicpal.flash [#3/4] @musicpal.flash 0000000000000000-00000000007fffff

Flatview is the same:

   (qemu) info mtree -f
   FlatView #0
    AS "memory", root: system
    AS "cpu-memory-0", root: system
    AS "emac-dma", root: system
    Root memory region: system
     00000000fe000000-00000000fe7fffff (prio 0, romd): musicpal.flash
     00000000fe800000-00000000feffffff (prio 0, romd): musicpal.flash
     00000000ff000000-00000000ff7fffff (prio 0, romd): musicpal.flash
     00000000ff800000-00000000ffffffff (prio 0, romd): musicpal.flash

Signed-off-by: Philippe Mathieu-Daudé 


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 4/7] hw/arm/digic: Open-code pflash_cfi02_register() call

2021-04-21 Thread Richard Henderson

On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote:

To be able to manually map the flash region on the main memory
(in the next commit), first expand the pflash_cfi02_register
in place.

Signed-off-by: Philippe Mathieu-Daudé 
---
  hw/arm/digic_boards.c | 27 +--
  1 file changed, 21 insertions(+), 6 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 2/7] hw/arm/musicpal: Open-code pflash_cfi02_register() call

2021-04-21 Thread Richard Henderson

On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote:

To be able to manually map the flash region on the main memory
(in the next commit), first expand the pflash_cfi02_register
in place.

Signed-off-by: Philippe Mathieu-Daudé 
---
  hw/arm/musicpal.c | 27 +--
  1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/hw/arm/musicpal.c b/hw/arm/musicpal.c
index 9cebece2de0..8b58b66f263 100644
--- a/hw/arm/musicpal.c
+++ b/hw/arm/musicpal.c
@@ -10,6 +10,7 @@
   */
  
  #include "qemu/osdep.h"

+#include "qemu/units.h"
  #include "qapi/error.h"
  #include "cpu.h"
  #include "hw/sysbus.h"
@@ -1640,6 +1641,7 @@ static void musicpal_init(MachineState *machine)
  /* Register flash */
  dinfo = drive_get(IF_PFLASH, 0, 0);
  if (dinfo) {
+static const size_t sector_size = 64 * KiB;


Drop the static.  We do not need permanent storage for this.
Otherwise,

Reviewed-by: Richard Henderson 

r~



Re: [RFC PATCH v2 1/7] hw/misc: Add device to help managing aliased memory regions

2021-04-21 Thread Richard Henderson

On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote:

Not really RFC; it's simply that I've still to add the technical
description, but I'd like to know whether there is a possibility this
device would not be accepted (because I missed something) before I
keep working on it. So far it is only used in hobbyist boards.

Cc: Peter Xu
Cc: Paolo Bonzini
---
  include/hw/misc/aliased_region.h |  87 
  hw/misc/aliased_region.c | 132 +++
  MAINTAINERS  |   6 ++
  hw/misc/Kconfig  |   3 +
  hw/misc/meson.build  |   1 +
  5 files changed, 229 insertions(+)
  create mode 100644 include/hw/misc/aliased_region.h
  create mode 100644 hw/misc/aliased_region.c


Looks reasonable to me.



+subregion_bits = 64 - clz64(s->span_size - 1);
+s->mem.count = s->region_size >> subregion_bits;
+assert(s->mem.count > 1);
+subregion_size = 1ULL << subregion_bits;


So... subregion_size = pow2floor(s->span_size)?

Why use a floor-ish computation here and pow2ceil later in
aliased_mr_realize? Why use either floor or ceil as opposed to
asserting that the size is in fact a power of 2? Why must the region
be a power of 2, as opposed to e.g. a multiple of page_size? Or just
the most basic requirement that region_size be a multiple of span_size?
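
For comparison, the two computations differ only when the size is not a power of two; a small standalone sketch (helpers written out here with compiler builtins rather than QEMU's pow2floor()/pow2ceil(), which behave the same way):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t my_pow2floor(uint64_t v)  /* largest 2^n <= v, for v != 0 */
    {
        return UINT64_C(1) << (63 - __builtin_clzll(v));
    }

    static uint64_t my_pow2ceil(uint64_t v)   /* smallest 2^n >= v */
    {
        return v <= 1 ? 1 : UINT64_C(1) << (64 - __builtin_clzll(v - 1));
    }

    int main(void)
    {
        uint64_t sizes[] = { 0x400000, 0x500000 };  /* 4 MiB and 5 MiB */

        for (int i = 0; i < 2; i++) {
            printf("%#llx: floor=%#llx ceil=%#llx\n",
                   (unsigned long long)sizes[i],
                   (unsigned long long)my_pow2floor(sizes[i]),
                   (unsigned long long)my_pow2ceil(sizes[i]));
        }
        return 0;
    }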



r~



Re: [PATCH v3 02/33] block/nbd: fix how state is cleared on nbd_open() failure paths

2021-04-21 Thread Vladimir Sementsov-Ogievskiy

21.04.2021 17:00, Roman Kagan wrote:

On Fri, Apr 16, 2021 at 11:08:40AM +0300, Vladimir Sementsov-Ogievskiy wrote:

We have two "return error" paths in nbd_open() after
nbd_process_options(). Actually we should call nbd_clear_bdrvstate()
on these paths. Interesting that nbd_process_options() calls
nbd_clear_bdrvstate() by itself.

Let's fix leaks and refactor things to be more obvious:

- initialize yank at top of nbd_open()
- move yank cleanup to nbd_clear_bdrvstate()
- refactor nbd_open() so that all failure paths except for
   yank-register go through nbd_clear_bdrvstate()

Signed-off-by: Vladimir Sementsov-Ogievskiy
---
  block/nbd.c | 36 ++--
  1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index 739ae2941f..a407a3814b 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -152,8 +152,12 @@ static void nbd_co_establish_connection_cancel(BlockDriverState *bs,
  static int nbd_client_handshake(BlockDriverState *bs, Error **errp);
  static void nbd_yank(void *opaque);
  
-static void nbd_clear_bdrvstate(BDRVNBDState *s)
+static void nbd_clear_bdrvstate(BlockDriverState *bs)
  {
+BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
+
+yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
+
  object_unref(OBJECT(s->tlscreds));
  qapi_free_SocketAddress(s->saddr);
  s->saddr = NULL;
@@ -2279,9 +2283,6 @@ static int nbd_process_options(BlockDriverState *bs, QDict *options,
  ret = 0;
  
 error:
-if (ret < 0) {
-nbd_clear_bdrvstate(s);
-}
  qemu_opts_del(opts);
  return ret;
  }
@@ -2292,11 +2293,6 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
  int ret;
  BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
  
-ret = nbd_process_options(bs, options, errp);
-if (ret < 0) {
-return ret;
-}
-
  s->bs = bs;
 qemu_co_mutex_init(&s->send_mutex);
 qemu_co_queue_init(&s->free_sema);
@@ -2305,20 +2301,23 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
  return -EEXIST;
  }
  
+ret = nbd_process_options(bs, options, errp);
+if (ret < 0) {
+goto fail;
+}
+
  /*
   * establish TCP connection, return error if it fails
   * TODO: Configurable retry-until-timeout behaviour.
   */
  if (nbd_establish_connection(bs, s->saddr, errp) < 0) {
-yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
-return -ECONNREFUSED;
+ret = -ECONNREFUSED;
+goto fail;
  }
  
  ret = nbd_client_handshake(bs, errp);

Not that this was introduced by this patch, but once you're at it:
AFAICT nbd_client_handshake() calls yank_unregister_instance() on some
error path(s); I assume this needs to go too, otherwise it's called
twice (and asserts).

Roman.



No, nbd_client_handshake() only calls yank_unregister_function(), not instance.

--
Best regards,
Vladimir



Re: [PATCH] monitor: hmp_qemu_io: acquire aio contex, fix crash

2021-04-21 Thread Vladimir Sementsov-Ogievskiy

21.04.2021 22:47, Philippe Mathieu-Daudé wrote:

On 4/21/21 10:32 AM, Vladimir Sementsov-Ogievskiy wrote:

Max reported the following bug:

$ ./qemu-img create -f raw src.img 1G
$ ./qemu-img create -f raw dst.img 1G

$ (echo '
{"execute":"qmp_capabilities"}
{"execute":"blockdev-mirror",
 "arguments":{"job-id":"mirror",
  "device":"source",
  "target":"target",
  "sync":"full",
  "filter-node-name":"mirror-top"}}
'; sleep 3; echo '
{"execute":"human-monitor-command",
 "arguments":{"command-line":
  "qemu-io mirror-top \"write 0 1G\""}}') \
| x86_64-softmmu/qemu-system-x86_64 \
-qmp stdio \
-blockdev file,node-name=source,filename=src.img \
-blockdev file,node-name=target,filename=dst.img \
-object iothread,id=iothr0 \
-device virtio-blk,drive=source,iothread=iothr0

crashes:

0  raise () at /usr/lib/libc.so.6
1  abort () at /usr/lib/libc.so.6
2  error_exit
(err=,
msg=msg@entry=0x55fbb1634790 <__func__.27> "qemu_mutex_unlock_impl")
at ../util/qemu-thread-posix.c:37
3  qemu_mutex_unlock_impl
(mutex=mutex@entry=0x55fbb25ab6e0,
file=file@entry=0x55fbb1636957 "../util/async.c",
line=line@entry=650)
at ../util/qemu-thread-posix.c:109
4  aio_context_release (ctx=ctx@entry=0x55fbb25ab680) at ../util/async.c:650
5  bdrv_do_drained_begin
(bs=bs@entry=0x55fbb3a87000, recursive=recursive@entry=false,
parent=parent@entry=0x0,
ignore_bds_parents=ignore_bds_parents@entry=false,
poll=poll@entry=true) at ../block/io.c:441
6  bdrv_do_drained_begin
(poll=true, ignore_bds_parents=false, parent=0x0, recursive=false,
bs=0x55fbb3a87000) at ../block/io.c:448
7  blk_drain (blk=0x55fbb26c5a00) at ../block/block-backend.c:1718
8  blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:498
9  blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:491
10 hmp_qemu_io (mon=0x7fffaf3fc7d0, qdict=)
at ../block/monitor/block-hmp-cmds.c:628

man pthread_mutex_unlock
...
 EPERM  The  mutex type is PTHREAD_MUTEX_ERRORCHECK or
 PTHREAD_MUTEX_RECURSIVE, or the mutex is a robust mutex, and the
 current thread does not own the mutex.

So, thread doesn't own the mutex. And we have iothread here.

Next, note that AIO_WAIT_WHILE() documents that ctx must be acquired
exactly once by caller. But where is it acquired in the call stack?
Seems nowhere.

qemuio_command() does acquire the aio context. But we need the context
acquired around blk_unref() as well. Let's do it.

Reported-by: Max Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/monitor/block-hmp-cmds.c | 7 +++
  1 file changed, 7 insertions(+)

diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index ebf1033f31..934100d0eb 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -559,6 +559,7 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
  {
  BlockBackend *blk;
  BlockBackend *local_blk = NULL;
+AioContext *ctx;
  bool qdev = qdict_get_try_bool(qdict, "qdev", false);
  const char *device = qdict_get_str(qdict, "device");
  const char *command = qdict_get_str(qdict, "command");
@@ -615,7 +616,13 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
  qemuio_command(blk, command);
  
  fail:

+ctx = blk_get_aio_context(blk);
+aio_context_acquire(ctx);
+
  blk_unref(local_blk);
+
+aio_context_release(ctx);


I dare to mention "code smell" here... Not to your fix, but to
the API. Can't we simplify it somehow? Maybe we can't, I don't
understand it well. But it seems bug prone, and expensive in
human brain resources (either develop, debug or review).


Better would be to move hmp_qemu_io() into a coroutine together with all the
called functions and qemu-io commands... But that's a lot more work.
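
The abort in the backtrace above ultimately comes from unlocking a mutex that the calling thread does not own. A tiny standalone demonstration of that failure mode (plain pthreads rather than QEMU's qemu_mutex wrappers, which abort on the same condition):

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        pthread_mutex_t m;
        pthread_mutexattr_t attr;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
        pthread_mutex_init(&m, &attr);

        /* Unlock without locking first: this thread does not own the mutex. */
        int ret = pthread_mutex_unlock(&m);
        printf("unlock: %s\n", strerror(ret)); /* EPERM; QEMU turns this into abort() */

        pthread_mutex_destroy(&m);
        pthread_mutexattr_destroy(&attr);
        return 0;
    }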


--
Best regards,
Vladimir



Re: [PATCH v2 4/8] block: make before-write notifiers thread-safe

2021-04-21 Thread Vladimir Sementsov-Ogievskiy

22.04.2021 00:23, Vladimir Sementsov-Ogievskiy wrote:

19.04.2021 11:55, Emanuele Giuseppe Esposito wrote:

Reads access the list in RCU style, so be careful to avoid use-after-free
scenarios in the backup block job.  Apart from this, all that's needed
is protecting updates with a mutex.


Note that backup doesn't use write-notifiers now. Probably the best thing to do
is to update the other users to use filters instead of write notifiers, and drop
write notifiers altogether. But that may require more work, so never mind.


But on the other hand... why not? We can go without a filter and still drop
write notifiers. Look at my new patch "[PATCH] block: simplify write-threshold
and drop write notifiers".

--
Best regards,
Vladimir



[PATCH] block: simplify write-threshold and drop write notifiers

2021-04-21 Thread Vladimir Sementsov-Ogievskiy
write-notifiers are used only for write-threshold. New code for such
purposes should create filters.

Let's handle write-threshold simply in generic code and drop write
notifiers altogether.

Also move the part of the write-threshold API that is used only for
testing into the test.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---

I agree that this could be split into 2-3 parts instead of combining
everything into one. But I'm tired now. I can send a v2 if needed, so
consider this an RFC. Or take it as is if you think it's not too
much-in-one.

I also suggest this as a preparation (and partial substitution) for
"[PATCH v2 0/8] Block layer thread-safety, continued"

 include/block/block_int.h | 12 -
 include/block/write-threshold.h   | 24 -
 block.c   |  1 -
 block/io.c| 21 +---
 block/write-threshold.c   | 87 ++-
 tests/unit/test-write-threshold.c | 38 ++
 6 files changed, 54 insertions(+), 129 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 88e4111939..50af58af75 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -957,12 +957,8 @@ struct BlockDriverState {
  */
 int64_t total_sectors;
 
-/* Callback before write request is processed */
-NotifierWithReturnList before_write_notifiers;
-
 /* threshold limit for writes, in bytes. "High water mark". */
 uint64_t write_threshold_offset;
-NotifierWithReturn write_threshold_notifier;
 
 /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex.
  * Reading from the list can be done with either the BQL or the
@@ -1087,14 +1083,6 @@ void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
 bool bdrv_backing_overridden(BlockDriverState *bs);
 
 
-/**
- * bdrv_add_before_write_notifier:
- *
- * Register a callback that is invoked before write requests are processed but
- * after any throttling or waiting for overlapping requests.
- */
-void bdrv_add_before_write_notifier(BlockDriverState *bs,
-NotifierWithReturn *notifier);
 
 /**
  * bdrv_add_aio_context_notifier:
diff --git a/include/block/write-threshold.h b/include/block/write-threshold.h
index c646f267a4..23e1bfc123 100644
--- a/include/block/write-threshold.h
+++ b/include/block/write-threshold.h
@@ -35,28 +35,4 @@ void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes);
  */
 uint64_t bdrv_write_threshold_get(const BlockDriverState *bs);
 
-/*
- * bdrv_write_threshold_is_set
- *
- * Tell if a write threshold is set for a given BDS.
- */
-bool bdrv_write_threshold_is_set(const BlockDriverState *bs);
-
-/*
- * bdrv_write_threshold_exceeded
- *
- * Return the extent of a write request that exceeded the threshold,
- * or zero if the request is below the threshold.
- * Return zero also if the threshold was not set.
- *
- * NOTE: here we assume the following holds for each request this code
- * deals with:
- *
- * assert((req->offset + req->bytes) <= UINT64_MAX)
- *
- * Please not there is *not* an actual C assert().
- */
-uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
-   const BdrvTrackedRequest *req);
-
 #endif
diff --git a/block.c b/block.c
index c5b887cec1..001453105e 100644
--- a/block.c
+++ b/block.c
@@ -381,7 +381,6 @@ BlockDriverState *bdrv_new(void)
 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
 QLIST_INIT(&bs->op_blockers[i]);
 }
-notifier_with_return_list_init(&bs->before_write_notifiers);
 qemu_co_mutex_init(&bs->reqs_lock);
 qemu_mutex_init(&bs->dirty_bitmap_mutex);
 bs->refcnt = 1;
diff --git a/block/io.c b/block/io.c
index ca2dca3007..e0aa775952 100644
--- a/block/io.c
+++ b/block/io.c
@@ -36,6 +36,8 @@
 #include "qemu/main-loop.h"
 #include "sysemu/replay.h"
 
+#include "qapi/qapi-events-block-core.h"
+
 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
 
@@ -1974,6 +1976,8 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
child->perm & BLK_PERM_RESIZE);
 
 switch (req->type) {
+uint64_t write_threshold;
+
 case BDRV_TRACKED_WRITE:
 case BDRV_TRACKED_DISCARD:
 if (flags & BDRV_REQ_WRITE_UNCHANGED) {
@@ -1981,8 +1985,15 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
 } else {
 assert(child->perm & BLK_PERM_WRITE);
 }
-return notifier_with_return_list_notify(&bs->before_write_notifiers,
-req);
+write_threshold = qatomic_read(&bs->write_threshold_offset);
+if (write_threshold > 0 && offset + bytes > write_threshold) {
+qapi_event_send_block_write_threshold(
+bs->node_name,
+offset + bytes - write_threshold,
+write_threshold);
+   
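
The replacement logic in block/io.c above boils down to an atomic load of the threshold plus a range check on the request. A standalone sketch of the same check (a C11 atomic stands in for qatomic_read() here; names and values are illustrative):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint64_t write_threshold_offset;

    /* How far a write [offset, offset + bytes) exceeds the threshold, if at all. */
    static uint64_t threshold_exceeded(uint64_t offset, uint64_t bytes)
    {
        uint64_t threshold = atomic_load(&write_threshold_offset);

        if (threshold > 0 && offset + bytes > threshold) {
            return offset + bytes - threshold;  /* amount over the high-water mark */
        }
        return 0;
    }

    int main(void)
    {
        atomic_store(&write_threshold_offset, 1024);
        printf("exceeded by %llu\n",
               (unsigned long long)threshold_exceeded(1000, 100)); /* prints 76 */
        return 0;
    }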

[PATCH v4 1/2] iotests/231: Update expected deprecation message

2021-04-21 Thread Connor Kuehl
The deprecation message in the expected output has technically been
wrong since the wrong version of a patch was applied to it. Because of
this, the test fails. Correct the expected output so that it passes.

Signed-off-by: Connor Kuehl 
Reviewed-by: Max Reitz 
Reviewed-by: Stefano Garzarella 
---
v3 -> v4:
  * Added Stefano's s-o-b to the commit message

 tests/qemu-iotests/231.out | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/231.out b/tests/qemu-iotests/231.out
index 579ba11c16..747dd221bb 100644
--- a/tests/qemu-iotests/231.out
+++ b/tests/qemu-iotests/231.out
@@ -1,9 +1,7 @@
 QA output created by 231
-qemu-img: RBD options encoded in the filename as keyvalue pairs is deprecated. Future versions may cease to parse these options in the future.
+qemu-img: warning: RBD options encoded in the filename as keyvalue pairs is deprecated
 unable to get monitor info from DNS SRV with service name: ceph-mon
-no monitors specified to connect to.
 qemu-img: Could not open 'json:{'file.driver':'rbd','file.filename':'rbd:rbd/bogus:conf=BOGUS_CONF'}': error connecting: No such file or directory
 unable to get monitor info from DNS SRV with service name: ceph-mon
-no monitors specified to connect to.
 qemu-img: Could not open 'json:{'file.driver':'rbd','file.pool':'rbd','file.image':'bogus','file.conf':'BOGUS_CONF'}': error connecting: No such file or directory
 *** done
-- 
2.30.2




[PATCH v4 0/2] Fix segfault in qemu_rbd_parse_filename

2021-04-21 Thread Connor Kuehl
Connor Kuehl (2):
  iotests/231: Update expected deprecation message
  block/rbd: Add an escape-aware strchr helper

 block/rbd.c| 32 +---
 tests/qemu-iotests/231 |  4 
 tests/qemu-iotests/231.out |  7 ---
 3 files changed, 29 insertions(+), 14 deletions(-)

-- 
2.30.2




[PATCH v4 2/2] block/rbd: Add an escape-aware strchr helper

2021-04-21 Thread Connor Kuehl
Sometimes the parser needs to further split a token it has collected
from the token input stream. Right now, it does a cursory check to see
if the relevant characters appear in the token to determine if it should
break it down further.

However, qemu_rbd_next_tok() will escape characters as it removes tokens
from the token stream and plain strchr() won't. This can make the
initial strchr() check slightly misleading since it implies
qemu_rbd_next_tok() will find the token and split on it, except the
reality is that qemu_rbd_next_tok() will pass over it if it is escaped.

Use a custom strchr to avoid mixing escaped and unescaped string
operations. Furthermore, this code is identical to how
qemu_rbd_next_tok() seeks its next token, so incorporate this custom
strchr into the body of that function to reduce duplication.

Reported-by: Han Han 
Fixes: https://bugzilla.redhat.com/1873913
Signed-off-by: Connor Kuehl 
---
v3 -> v4:
  * Replace qemu_rbd_next_tok() seek loop with qemu_rbd_strchr() since
they're identical

 block/rbd.c| 32 +---
 tests/qemu-iotests/231 |  4 
 tests/qemu-iotests/231.out |  3 +++
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index f098a89c7b..26f64cce7c 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -113,21 +113,31 @@ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
 const char *keypairs, const char *secretid,
 Error **errp);
 
+static char *qemu_rbd_strchr(char *src, char delim)
+{
+char *p;
+
+for (p = src; *p; ++p) {
+if (*p == delim) {
+return p;
+}
+if (*p == '\\' && p[1] != '\0') {
+++p;
+}
+}
+
+return NULL;
+}
+
+
 static char *qemu_rbd_next_tok(char *src, char delim, char **p)
 {
 char *end;
 
 *p = NULL;
 
-for (end = src; *end; ++end) {
-if (*end == delim) {
-break;
-}
-if (*end == '\\' && end[1] != '\0') {
-end++;
-}
-}
-if (*end == delim) {
+end = qemu_rbd_strchr(src, delim);
+if (end) {
 *p = end + 1;
 *end = '\0';
 }
@@ -171,7 +181,7 @@ static void qemu_rbd_parse_filename(const char *filename, QDict *options,
 qemu_rbd_unescape(found_str);
 qdict_put_str(options, "pool", found_str);
 
-if (strchr(p, '@')) {
+if (qemu_rbd_strchr(p, '@')) {
 image_name = qemu_rbd_next_tok(p, '@', &p);
 
 found_str = qemu_rbd_next_tok(p, ':', &p);
@@ -181,7 +191,7 @@ static void qemu_rbd_parse_filename(const char *filename, QDict *options,
 image_name = qemu_rbd_next_tok(p, ':', &p);
 }
 /* Check for namespace in the image_name */
-if (strchr(image_name, '/')) {
+if (qemu_rbd_strchr(image_name, '/')) {
 found_str = qemu_rbd_next_tok(image_name, '/', &image_name);
 qemu_rbd_unescape(found_str);
 qdict_put_str(options, "namespace", found_str);
diff --git a/tests/qemu-iotests/231 b/tests/qemu-iotests/231
index 0f66d0ca36..8e6c6447c1 100755
--- a/tests/qemu-iotests/231
+++ b/tests/qemu-iotests/231
@@ -55,6 +55,10 @@ _filter_conf()
 $QEMU_IMG info "json:{'file.driver':'rbd','file.filename':'rbd:rbd/bogus:conf=${BOGUS_CONF}'}" 2>&1 | _filter_conf
 $QEMU_IMG info "json:{'file.driver':'rbd','file.pool':'rbd','file.image':'bogus','file.conf':'${BOGUS_CONF}'}" 2>&1 | _filter_conf
 
+# Regression test: the qemu-img invocation is expected to fail, but it should
+# not seg fault the parser.
+$QEMU_IMG create "rbd:rbd/aa\/bb:conf=${BOGUS_CONF}" 1M 2>&1 | _filter_conf
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/231.out b/tests/qemu-iotests/231.out
index 747dd221bb..a785a6e859 100644
--- a/tests/qemu-iotests/231.out
+++ b/tests/qemu-iotests/231.out
@@ -4,4 +4,7 @@ unable to get monitor info from DNS SRV with service name: ceph-mon
 qemu-img: Could not open 'json:{'file.driver':'rbd','file.filename':'rbd:rbd/bogus:conf=BOGUS_CONF'}': error connecting: No such file or directory
 unable to get monitor info from DNS SRV with service name: ceph-mon
 qemu-img: Could not open 'json:{'file.driver':'rbd','file.pool':'rbd','file.image':'bogus','file.conf':'BOGUS_CONF'}': error connecting: No such file or directory
+Formatting 'rbd:rbd/aa\/bb:conf=BOGUS_CONF', fmt=raw size=1048576
+unable to get monitor info from DNS SRV with service name: ceph-mon
+qemu-img: rbd:rbd/aa\/bb:conf=BOGUS_CONF: error connecting: No such file or directory
 *** done
-- 
2.30.2
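
To see the behaviour on the regression-test input above, the helper compiles standalone; a short illustration (the function body is copied from the patch, while main() and its input are illustrative):

    #include <stdio.h>

    /* Copied from the patch: skip over characters preceded by a backslash. */
    static char *qemu_rbd_strchr(char *src, char delim)
    {
        char *p;

        for (p = src; *p; ++p) {
            if (*p == delim) {
                return p;
            }
            if (*p == '\\' && p[1] != '\0') {
                ++p;
            }
        }

        return NULL;
    }

    int main(void)
    {
        char image[] = "aa\\/bb/ns";  /* the string aa\/bb/ns */

        /* Plain strchr() would stop at the escaped '/' inside "aa\/bb";
         * the escape-aware version returns the real separator instead. */
        printf("escape-aware match: %s\n", qemu_rbd_strchr(image, '/'));
        return 0;
    }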




Re: [PATCH v2 4/8] block: make before-write notifiers thread-safe

2021-04-21 Thread Vladimir Sementsov-Ogievskiy

19.04.2021 11:55, Emanuele Giuseppe Esposito wrote:

Reads access the list in RCU style, so be careful to avoid use-after-free
scenarios in the backup block job.  Apart from this, all that's needed
is protecting updates with a mutex.


Note that backup doesn't use write-notifiers now. Probably the best thing to do
is to update the other users to use filters instead of write notifiers, and drop
write notifiers altogether. But that may require more work, so never mind.



Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Paolo Bonzini 
Signed-off-by: Emanuele Giuseppe Esposito 
---
  block.c   |  6 +++---
  block/io.c| 12 
  block/write-threshold.c   |  2 +-
  include/block/block_int.h | 37 +
  4 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/block.c b/block.c
index c5b887cec1..f40fb63c75 100644
--- a/block.c
+++ b/block.c
@@ -6434,7 +6434,7 @@ static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
  g_free(ban);
  }
  
-static void bdrv_detach_aio_context(BlockDriverState *bs)
+void bdrv_detach_aio_context(BlockDriverState *bs)


Why has it become public? It's not used in this commit, and this change is
unrelated to the commit message...


  {
  BdrvAioNotifier *baf, *baf_tmp;
  
@@ -6462,8 +6462,8 @@ static void bdrv_detach_aio_context(BlockDriverState *bs)

  bs->aio_context = NULL;
  }
  
-static void bdrv_attach_aio_context(BlockDriverState *bs,
-AioContext *new_context)
+void bdrv_attach_aio_context(BlockDriverState *bs,
+ AioContext *new_context)
  {
  BdrvAioNotifier *ban, *ban_tmp;
  
diff --git a/block/io.c b/block/io.c
index ca2dca3007..8f6af6077b 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3137,10 +3137,22 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, 
QEMUIOVector *qiov)
  return true;
  }
  
+static QemuSpin notifiers_spin_lock;
+
  void bdrv_add_before_write_notifier(BlockDriverState *bs,
  NotifierWithReturn *notifier)
  {
+qemu_spin_lock(&notifiers_spin_lock);
  notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
+qemu_spin_unlock(&notifiers_spin_lock);
+}
+
+void bdrv_remove_before_write_notifier(BlockDriverState *bs,
+   NotifierWithReturn *notifier)
+{
+qemu_spin_lock(&notifiers_spin_lock);
+notifier_with_return_remove(notifier);
+qemu_spin_unlock(&notifiers_spin_lock);
  }
  
  void bdrv_io_plug(BlockDriverState *bs)

diff --git a/block/write-threshold.c b/block/write-threshold.c
index 77c74bdaa7..b87b749b02 100644
--- a/block/write-threshold.c
+++ b/block/write-threshold.c
@@ -32,7 +32,7 @@ bool bdrv_write_threshold_is_set(const BlockDriverState *bs)
  static void write_threshold_disable(BlockDriverState *bs)
  {
  if (bdrv_write_threshold_is_set(bs)) {
-notifier_with_return_remove(&bs->write_threshold_notifier);
+bdrv_remove_before_write_notifier(bs, &bs->write_threshold_notifier);
  bs->write_threshold_offset = 0;
  }
  }
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 88e4111939..a1aad5ad2d 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1089,6 +1089,8 @@ bool bdrv_backing_overridden(BlockDriverState *bs);
  
  /**

   * bdrv_add_before_write_notifier:
+ * @bs: The #BlockDriverState for which to register the notifier
+ * @notifier: The notifier to add
   *
   * Register a callback that is invoked before write requests are processed but
   * after any throttling or waiting for overlapping requests.
@@ -1096,6 +1098,41 @@ bool bdrv_backing_overridden(BlockDriverState *bs);
  void bdrv_add_before_write_notifier(BlockDriverState *bs,
  NotifierWithReturn *notifier);
  
+/**

+ * bdrv_remove_before_write_notifier:
+ * @bs: The #BlockDriverState for which to register the notifier
+ * @notifier: The notifier to add
+ *
+ * Unregister a callback that is invoked before write requests are processed but
+ * after any throttling or waiting for overlapping requests.
+ *
+ * Note that more I/O could be pending on @bs.  You need to wait for
+ * it to finish, for example with bdrv_drain(), before freeing @notifier.
+ */
+void bdrv_remove_before_write_notifier(BlockDriverState *bs,
+   NotifierWithReturn *notifier);
+
+/**
+ * bdrv_detach_aio_context:
+ *
+ * May be called from .bdrv_detach_aio_context() to detach children from the
+ * current #AioContext.  This is only needed by block drivers that manage their
+ * own children.  Both ->file and ->backing are automatically handled and
+ * block drivers should not call this function on them explicitly.
+ */
+void bdrv_detach_aio_context(BlockDriverState *bs);
+
+/**
+ * bdrv_attach_aio_context:
+ *
+ * May be called from .bdrv_attach_aio_context() to attach children to the new
+ * #AioContext.  This is only needed by block drivers that manage their own
+ * 
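 

The locking pattern being added above — list updates guarded by a lock while readers traverse the list RCU-style — reduces to the following shape (a standalone sketch with a pthread spinlock standing in for QemuSpin; the list type is illustrative, and a real RCU reader side would additionally need atomic publication of the head pointer):

    #include <pthread.h>
    #include <stdio.h>

    typedef struct Notifier {
        struct Notifier *next;
        const char *name;
    } Notifier;

    static Notifier *notifiers;               /* singly-linked list head */
    static pthread_spinlock_t notifiers_lock; /* serializes writers only */

    static void notifier_add(Notifier *n)
    {
        pthread_spin_lock(&notifiers_lock);
        n->next = notifiers;
        notifiers = n;  /* publish the new head */
        pthread_spin_unlock(&notifiers_lock);
    }

    int main(void)
    {
        Notifier n = { .name = "write-threshold" };

        pthread_spin_init(&notifiers_lock, PTHREAD_PROCESS_PRIVATE);
        notifier_add(&n);
        printf("first notifier: %s\n", notifiers->name);
        pthread_spin_destroy(&notifiers_lock);
        return 0;
    }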

Re: [PATCH v3 2/2] block/rbd: Add an escape-aware strchr helper

2021-04-21 Thread Connor Kuehl
On 4/21/21 6:04 AM, Stefano Garzarella wrote:
>> +static char *qemu_rbd_strchr(char *src, char delim)
>> +{
>> +char *p;
>> +
>> +for (p = src; *p; ++p) {
>> +if (*p == delim) {
>> +return p;
>> +}
>> +if (*p == '\\' && p[1] != '\0') {
>> +++p;
>> +}
>> +}
>> +
>> +return NULL;
>> +}
>> +
> 
> IIUC this is similar to the code used in qemu_rbd_next_tok().
> To avoid code duplication can we use this new function inside it?

Hi Stefano! Good catch. I think you're right. I've incorporated your
suggestion into my next revision. The only thing I changed was the
if-statement from:

if (end && *end == delim) {

to:

if (end) {

Since qemu_rbd_strchr() will only ever return a non-NULL pointer if it
was able to find 'delim'.

Connor

> 
> I mean something like this (not tested):
> 
> diff --git a/block/rbd.c b/block/rbd.c
> index f098a89c7b..eb6a839362 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -119,15 +119,8 @@ static char *qemu_rbd_next_tok(char *src, char delim, char **p)
>  
>  *p = NULL;
>  
> -for (end = src; *end; ++end) {
> -if (*end == delim) {
> -break;
> -}
> -if (*end == '\\' && end[1] != '\0') {
> -end++;
> -}
> -}
> -if (*end == delim) {
> +end = qemu_rbd_strchr(src, delim);
> +if (end && *end == delim) {
>  *p = end + 1;
>  *end = '\0';
>  }
> 
> 
> The rest LGTM!
> 
> Thanks for fixing this issue,
> Stefano
> 




Re: [PATCH v3 2/3] vhost-user-blk: perform immediate cleanup if disconnect on initialization

2021-04-21 Thread Michael S. Tsirkin
On Wed, Apr 21, 2021 at 07:13:24PM +0300, Denis Plotnikov wrote:
> 
> On 21.04.2021 18:24, Kevin Wolf wrote:
> > Am 25.03.2021 um 16:12 hat Denis Plotnikov geschrieben:
> > > Commit 4bcad76f4c39 ("vhost-user-blk: delay vhost_user_blk_disconnect")
> > > introduced postponing vhost_dev cleanup aiming to eliminate qemu aborts
> > > because of connection problems with vhost-blk daemon.
> > > 
> > > However, it introduces a new problem. Now, any communication errors
> > > during execution of vhost_dev_init() called by 
> > > vhost_user_blk_device_realize()
> > > lead to qemu abort on assert in vhost_dev_get_config().
> > > 
> > > This happens because vhost_user_blk_disconnect() is postponed but
> > > it should have dropped s->connected flag by the time
> > > vhost_user_blk_device_realize() performs a new connection opening.
> > > On the connection opening, vhost_dev initialization in
> > > vhost_user_blk_connect() relies on s->connection flag and
> > > if it's not dropped, it skips vhost_dev initialization and returns
> > > with success. Then, vhost_user_blk_device_realize()'s execution flow
> > > goes to vhost_dev_get_config() where it's aborted on the assert.
> > > 
> > > To fix the problem this patch adds immediate cleanup on device
> > > initialization (in vhost_user_blk_device_realize()) using different
> > > event handlers for initialization and operation introduced in the
> > > previous patch.
> > > On initialization (in vhost_user_blk_device_realize()) we fully
> > > control the initialization process. At that point, nobody can use the
> > > device since it isn't initialized and we don't need to postpone any
> > > cleanups, so we can do cleanup right away when there is a communication
> > > problem with the vhost-blk daemon.
> > > On operation we leave it as is, since the disconnect may happen when
> > > the device is in use, so the device users may want to use vhost_dev's data
> > > to do rollback before vhost_dev is re-initialized (e.g. in 
> > > vhost_dev_set_log()).
> > > 
> > > Signed-off-by: Denis Plotnikov 
> > > Reviewed-by: Raphael Norwitz 
> > I think there is something wrong with this patch.
> > 
> > I'm debugging an error case, specifically num-queues being larger in
> > QEMU than in the vhost-user-blk export. Before this patch, it has just
> > an unfriendly error message:
> > 
> > qemu-system-x86_64: -device 
> > vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4:
> >  Unexpected end-of-file before all data were read
> > qemu-system-x86_64: -device 
> > vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4:
> >  Failed to read msg header. Read 0 instead of 12. Original request 24.
> > qemu-system-x86_64: -device 
> > vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4:
> >  vhost-user-blk: get block config failed
> > qemu-system-x86_64: Failed to set msg fds.
> > qemu-system-x86_64: vhost VQ 0 ring restore failed: -1: Resource 
> > temporarily unavailable (11)
> > 
> > After the patch, it crashes:
> > 
> > #0  0x55d0a4bd in vhost_user_read_cb (source=0x568f4690, 
> > condition=(G_IO_IN | G_IO_HUP), opaque=0x7fffcbf0) at 
> > ../hw/virtio/vhost-user.c:313
> > #1  0x55d950d3 in qio_channel_fd_source_dispatch 
> > (source=0x57c3f750, callback=0x55d0a478 , 
> > user_data=0x7fffcbf0) at ../io/channel-watch.c:84
> > #2  0x77b32a9f in g_main_context_dispatch () at 
> > /lib64/libglib-2.0.so.0
> > #3  0x77b84a98 in g_main_context_iterate.constprop () at 
> > /lib64/libglib-2.0.so.0
> > #4  0x77b32163 in g_main_loop_run () at /lib64/libglib-2.0.so.0
> > #5  0x55d0a724 in vhost_user_read (dev=0x57bc62f8, 
> > msg=0x7fffcc50) at ../hw/virtio/vhost-user.c:402
> > #6  0x55d0ee6b in vhost_user_get_config (dev=0x57bc62f8, 
> > config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost-user.c:2133
> > #7  0x55d56d46 in vhost_dev_get_config (hdev=0x57bc62f8, 
> > config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost.c:1566
> > #8  0x55cdd150 in vhost_user_blk_device_realize 
> > (dev=0x57bc60b0, errp=0x7fffcf90) at 
> > ../hw/block/vhost-user-blk.c:510
> > #9  0x55d08f6d in virtio_device_realize (dev=0x57bc60b0, 
> > errp=0x7fffcff0) at ../hw/virtio/virtio.c:3660
> > 
> > The problem is that vhost_user_read_cb() still accesses dev->opaque even
> > though the device has been cleaned up meanwhile when the connection was
> > closed (the vhost_user_blk_disconnect() added by this patch), so it's
> > NULL now. This problem was actually mentioned in the comment that is
> > removed by this patch.
> > 
> > I tried to fix this by making vhost_user_read() cope with the fact that
> > the device might have been cleaned up meanwhile, but then I'm running
> > into the next set of problems.
> > 
> > The first is that retrying is pointless, the error condition is in 

Re: [PATCH] monitor: hmp_qemu_io: acquire aio contex, fix crash

2021-04-21 Thread Philippe Mathieu-Daudé
On 4/21/21 10:32 AM, Vladimir Sementsov-Ogievskiy wrote:
> Max reported the following bug:
> 
> $ ./qemu-img create -f raw src.img 1G
> $ ./qemu-img create -f raw dst.img 1G
> 
> $ (echo '
>{"execute":"qmp_capabilities"}
>{"execute":"blockdev-mirror",
> "arguments":{"job-id":"mirror",
>  "device":"source",
>  "target":"target",
>  "sync":"full",
>  "filter-node-name":"mirror-top"}}
> '; sleep 3; echo '
>{"execute":"human-monitor-command",
> "arguments":{"command-line":
>  "qemu-io mirror-top \"write 0 1G\""}}') \
> | x86_64-softmmu/qemu-system-x86_64 \
>-qmp stdio \
>-blockdev file,node-name=source,filename=src.img \
>-blockdev file,node-name=target,filename=dst.img \
>-object iothread,id=iothr0 \
>-device virtio-blk,drive=source,iothread=iothr0
> 
> crashes:
> 
> 0  raise () at /usr/lib/libc.so.6
> 1  abort () at /usr/lib/libc.so.6
> 2  error_exit
>(err=,
>msg=msg@entry=0x55fbb1634790 <__func__.27> "qemu_mutex_unlock_impl")
>at ../util/qemu-thread-posix.c:37
> 3  qemu_mutex_unlock_impl
>(mutex=mutex@entry=0x55fbb25ab6e0,
>file=file@entry=0x55fbb1636957 "../util/async.c",
>line=line@entry=650)
>at ../util/qemu-thread-posix.c:109
> 4  aio_context_release (ctx=ctx@entry=0x55fbb25ab680) at ../util/async.c:650
> 5  bdrv_do_drained_begin
>(bs=bs@entry=0x55fbb3a87000, recursive=recursive@entry=false,
>parent=parent@entry=0x0,
>ignore_bds_parents=ignore_bds_parents@entry=false,
>poll=poll@entry=true) at ../block/io.c:441
> 6  bdrv_do_drained_begin
>(poll=true, ignore_bds_parents=false, parent=0x0, recursive=false,
>bs=0x55fbb3a87000) at ../block/io.c:448
> 7  blk_drain (blk=0x55fbb26c5a00) at ../block/block-backend.c:1718
> 8  blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:498
> 9  blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:491
> 10 hmp_qemu_io (mon=0x7fffaf3fc7d0, qdict=)
>at ../block/monitor/block-hmp-cmds.c:628
> 
> man pthread_mutex_unlock
> ...
> EPERM  The  mutex type is PTHREAD_MUTEX_ERRORCHECK or
> PTHREAD_MUTEX_RECURSIVE, or the mutex is a robust mutex, and the
> current thread does not own the mutex.
> 
> So, thread doesn't own the mutex. And we have iothread here.
> 
> Next, note that AIO_WAIT_WHILE() documents that ctx must be acquired
> exactly once by caller. But where is it acquired in the call stack?
> Seems nowhere.
> 
> qemuio_command() does acquire the aio context. But we need the context
> acquired around blk_unref() as well. Let's do it.
> 
> Reported-by: Max Reitz 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/monitor/block-hmp-cmds.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
> index ebf1033f31..934100d0eb 100644
> --- a/block/monitor/block-hmp-cmds.c
> +++ b/block/monitor/block-hmp-cmds.c
> @@ -559,6 +559,7 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
>  {
>  BlockBackend *blk;
>  BlockBackend *local_blk = NULL;
> +AioContext *ctx;
>  bool qdev = qdict_get_try_bool(qdict, "qdev", false);
>  const char *device = qdict_get_str(qdict, "device");
>  const char *command = qdict_get_str(qdict, "command");
> @@ -615,7 +616,13 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
>  qemuio_command(blk, command);
>  
>  fail:
> +ctx = blk_get_aio_context(blk);
> +aio_context_acquire(ctx);
> +
>  blk_unref(local_blk);
> +
> +aio_context_release(ctx);

I dare to mention "code smell" here... Not to your fix, but to
the API. Can't we simplify it somehow? Maybe we can't, I don't
understand it well. But it seems bug prone, and expensive in
human brain resources (either develop, debug or review).

>  hmp_handle_error(mon, err);
>  }
>  
> 




Re: [PATCH 14/14] hw/nvme: move nvme emulation out of hw/block

2021-04-21 Thread Klaus Jensen

On Apr 19 21:28, Klaus Jensen wrote:

From: Klaus Jensen 

With the introduction of the nvme-subsystem device we are really
cluttering up the hw/block directory.

As suggested by Philippe previously, move the nvme emulation to hw/nvme.

Suggested-by: Philippe Mathieu-Daudé 


Hi Philippe,

You originally suggested this. Do you know if an Ack is required from 
anyone "up in the food chain" before being PR'ed to Peter?



Cheers,
Klaus




Re: [PATCH v3 2/3] vhost-user-blk: perform immediate cleanup if disconnect on initialization

2021-04-21 Thread Denis Plotnikov



On 21.04.2021 18:24, Kevin Wolf wrote:

Am 25.03.2021 um 16:12 hat Denis Plotnikov geschrieben:

Commit 4bcad76f4c39 ("vhost-user-blk: delay vhost_user_blk_disconnect")
introduced postponing vhost_dev cleanup aiming to eliminate qemu aborts
because of connection problems with vhost-blk daemon.

However, it introduces a new problem. Now, any communication errors
during execution of vhost_dev_init() called by vhost_user_blk_device_realize()
lead to qemu abort on assert in vhost_dev_get_config().

This happens because vhost_user_blk_disconnect() is postponed but
it should have dropped s->connected flag by the time
vhost_user_blk_device_realize() performs a new connection opening.
On the connection opening, vhost_dev initialization in
vhost_user_blk_connect() relies on s->connection flag and
if it's not dropped, it skips vhost_dev initialization and returns
with success. Then, vhost_user_blk_device_realize()'s execution flow
goes to vhost_dev_get_config() where it's aborted on the assert.

To fix the problem this patch adds immediate cleanup on device
initialization (in vhost_user_blk_device_realize()) using different
event handlers for initialization and operation introduced in the
previous patch.
On initialization (in vhost_user_blk_device_realize()) we fully
control the initialization process. At that point, nobody can use the
device since it isn't initialized and we don't need to postpone any
cleanups, so we can do cleanup right away when there is a communication
problem with the vhost-blk daemon.
On operation we leave it as is, since the disconnect may happen when
the device is in use, so the device users may want to use vhost_dev's data
to do rollback before vhost_dev is re-initialized (e.g. in vhost_dev_set_log()).

Signed-off-by: Denis Plotnikov 
Reviewed-by: Raphael Norwitz 

I think there is something wrong with this patch.

I'm debugging an error case, specifically num-queues being larger in
QEMU than in the vhost-user-blk export. Before this patch, it has just
an unfriendly error message:

qemu-system-x86_64: -device 
vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4:
 Unexpected end-of-file before all data were read
qemu-system-x86_64: -device 
vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4:
 Failed to read msg header. Read 0 instead of 12. Original request 24.
qemu-system-x86_64: -device 
vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4:
 vhost-user-blk: get block config failed
qemu-system-x86_64: Failed to set msg fds.
qemu-system-x86_64: vhost VQ 0 ring restore failed: -1: Resource temporarily 
unavailable (11)

After the patch, it crashes:

#0  0x55d0a4bd in vhost_user_read_cb (source=0x568f4690, 
condition=(G_IO_IN | G_IO_HUP), opaque=0x7fffcbf0) at 
../hw/virtio/vhost-user.c:313
#1  0x55d950d3 in qio_channel_fd_source_dispatch (source=0x57c3f750, 
callback=0x55d0a478 , user_data=0x7fffcbf0) at 
../io/channel-watch.c:84
#2  0x77b32a9f in g_main_context_dispatch () at /lib64/libglib-2.0.so.0
#3  0x77b84a98 in g_main_context_iterate.constprop () at 
/lib64/libglib-2.0.so.0
#4  0x77b32163 in g_main_loop_run () at /lib64/libglib-2.0.so.0
#5  0x55d0a724 in vhost_user_read (dev=0x57bc62f8, 
msg=0x7fffcc50) at ../hw/virtio/vhost-user.c:402
#6  0x55d0ee6b in vhost_user_get_config (dev=0x57bc62f8, 
config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost-user.c:2133
#7  0x55d56d46 in vhost_dev_get_config (hdev=0x57bc62f8, 
config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost.c:1566
#8  0x55cdd150 in vhost_user_blk_device_realize (dev=0x57bc60b0, 
errp=0x7fffcf90) at ../hw/block/vhost-user-blk.c:510
#9  0x55d08f6d in virtio_device_realize (dev=0x57bc60b0, 
errp=0x7fffcff0) at ../hw/virtio/virtio.c:3660

The problem is that vhost_user_read_cb() still accesses dev->opaque even
though the device has been cleaned up meanwhile when the connection was
closed (the vhost_user_blk_disconnect() added by this patch), so it's
NULL now. This problem was actually mentioned in the comment that is
removed by this patch.

I tried to fix this by making vhost_user_read() cope with the fact that
the device might have been cleaned up meanwhile, but then I'm running
into the next set of problems.

The first is that retrying is pointless, the error condition is in the
configuration, it will never change.

The other is that after many repetitions of the same error message, I
got a crash where the device is cleaned up a second time in
vhost_dev_init() and the virtqueues are already NULL.

So it seems to me that erroring out during the initialisation phase
makes a lot more sense than retrying.

Kevin


But without the patch there will be another problem which the patch 
actually addresses.


It seems to me that there is a case when the retrying is 

Re: [PATCH v3 2/3] vhost-user-blk: perform immediate cleanup if disconnect on initialization

2021-04-21 Thread Kevin Wolf
Am 25.03.2021 um 16:12 hat Denis Plotnikov geschrieben:
> Commit 4bcad76f4c39 ("vhost-user-blk: delay vhost_user_blk_disconnect")
> introduced postponing vhost_dev cleanup aiming to eliminate qemu aborts
> because of connection problems with vhost-blk daemon.
> 
> However, it introduces a new problem. Now, any communication errors
> during execution of vhost_dev_init() called by vhost_user_blk_device_realize()
> lead to qemu abort on assert in vhost_dev_get_config().
> 
> This happens because vhost_user_blk_disconnect() is postponed but
> it should have dropped s->connected flag by the time
> vhost_user_blk_device_realize() performs a new connection opening.
> On the connection opening, vhost_dev initialization in
> vhost_user_blk_connect() relies on s->connection flag and
> if it's not dropped, it skips vhost_dev initialization and returns
> with success. Then, vhost_user_blk_device_realize()'s execution flow
> goes to vhost_dev_get_config() where it's aborted on the assert.
> 
> To fix the problem this patch adds immediate cleanup on device
> initialization (in vhost_user_blk_device_realize()) using different
> event handlers for initialization and operation introduced in the
> previous patch.
> On initialization (in vhost_user_blk_device_realize()) we fully
> control the initialization process. At that point, nobody can use the
> device since it isn't initialized and we don't need to postpone any
> cleanups, so we can do cleanup right away when there is a communication
> problem with the vhost-blk daemon.
> On operation we leave it as is, since the disconnect may happen when
> the device is in use, so the device users may want to use vhost_dev's data
> to do rollback before vhost_dev is re-initialized (e.g. in 
> vhost_dev_set_log()).
> 
> Signed-off-by: Denis Plotnikov 
> Reviewed-by: Raphael Norwitz 

I think there is something wrong with this patch.

I'm debugging an error case, specifically num-queues being larger in
> QEMU than in the vhost-user-blk export. Before this patch, it has just
an unfriendly error message:

qemu-system-x86_64: -device 
vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4:
 Unexpected end-of-file before all data were read
qemu-system-x86_64: -device 
vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4:
 Failed to read msg header. Read 0 instead of 12. Original request 24.
qemu-system-x86_64: -device 
vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4:
 vhost-user-blk: get block config failed
qemu-system-x86_64: Failed to set msg fds.
qemu-system-x86_64: vhost VQ 0 ring restore failed: -1: Resource temporarily 
unavailable (11)

After the patch, it crashes:

#0  0x55d0a4bd in vhost_user_read_cb (source=0x568f4690, 
condition=(G_IO_IN | G_IO_HUP), opaque=0x7fffcbf0) at 
../hw/virtio/vhost-user.c:313
#1  0x55d950d3 in qio_channel_fd_source_dispatch 
(source=0x57c3f750, callback=0x55d0a478 , 
user_data=0x7fffcbf0) at ../io/channel-watch.c:84
#2  0x77b32a9f in g_main_context_dispatch () at /lib64/libglib-2.0.so.0
#3  0x77b84a98 in g_main_context_iterate.constprop () at 
/lib64/libglib-2.0.so.0
#4  0x77b32163 in g_main_loop_run () at /lib64/libglib-2.0.so.0
#5  0x55d0a724 in vhost_user_read (dev=0x57bc62f8, 
msg=0x7fffcc50) at ../hw/virtio/vhost-user.c:402
#6  0x55d0ee6b in vhost_user_get_config (dev=0x57bc62f8, 
config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost-user.c:2133
#7  0x55d56d46 in vhost_dev_get_config (hdev=0x57bc62f8, 
config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost.c:1566
#8  0x55cdd150 in vhost_user_blk_device_realize (dev=0x57bc60b0, 
errp=0x7fffcf90) at ../hw/block/vhost-user-blk.c:510
#9  0x55d08f6d in virtio_device_realize (dev=0x57bc60b0, 
errp=0x7fffcff0) at ../hw/virtio/virtio.c:3660

The problem is that vhost_user_read_cb() still accesses dev->opaque even
though the device has been cleaned up meanwhile when the connection was
closed (the vhost_user_blk_disconnect() added by this patch), so it's
NULL now. This problem was actually mentioned in the comment that is
removed by this patch.

I tried to fix this by making vhost_user_read() cope with the fact that
the device might have been cleaned up meanwhile, but then I'm running
into the next set of problems.

The first is that retrying is pointless, the error condition is in the
configuration, it will never change.

The other is that after many repetitions of the same error message, I
got a crash where the device is cleaned up a second time in
vhost_dev_init() and the virtqueues are already NULL.

So it seems to me that erroring out during the initialisation phase
makes a lot more sense than retrying.

Kevin

>  hw/block/vhost-user-blk.c | 48 +++
>  1 file changed, 24 insertions(+), 24 deletions(-)
> 
> 

Re: [PATCH v3 02/33] block/nbd: fix how state is cleared on nbd_open() failure paths

2021-04-21 Thread Roman Kagan
On Fri, Apr 16, 2021 at 11:08:40AM +0300, Vladimir Sementsov-Ogievskiy wrote:
> We have two "return error" paths in nbd_open() after
> nbd_process_options(). Actually we should call nbd_clear_bdrvstate()
> on these paths. Interesting that nbd_process_options() calls
> nbd_clear_bdrvstate() by itself.
> 
> Let's fix leaks and refactor things to be more obvious:
> 
> - initialize yank at the top of nbd_open()
> - move yank cleanup to nbd_clear_bdrvstate()
> - refactor nbd_open() so that all failure paths except for
>   yank-register go through nbd_clear_bdrvstate()
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/nbd.c | 36 ++--
>  1 file changed, 18 insertions(+), 18 deletions(-)
> 
> diff --git a/block/nbd.c b/block/nbd.c
> index 739ae2941f..a407a3814b 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -152,8 +152,12 @@ static void 
> nbd_co_establish_connection_cancel(BlockDriverState *bs,
>  static int nbd_client_handshake(BlockDriverState *bs, Error **errp);
>  static void nbd_yank(void *opaque);
>  
> -static void nbd_clear_bdrvstate(BDRVNBDState *s)
> +static void nbd_clear_bdrvstate(BlockDriverState *bs)
>  {
> +BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
> +
> +yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
> +
>  object_unref(OBJECT(s->tlscreds));
>  qapi_free_SocketAddress(s->saddr);
>  s->saddr = NULL;
> @@ -2279,9 +2283,6 @@ static int nbd_process_options(BlockDriverState *bs, 
> QDict *options,
>  ret = 0;
>  
>   error:
> -if (ret < 0) {
> -nbd_clear_bdrvstate(s);
> -}
>  qemu_opts_del(opts);
>  return ret;
>  }
> @@ -2292,11 +2293,6 @@ static int nbd_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  int ret;
>  BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
>  
> -ret = nbd_process_options(bs, options, errp);
> -if (ret < 0) {
> -return ret;
> -}
> -
>  s->bs = bs;
>  qemu_co_mutex_init(&s->send_mutex);
>  qemu_co_queue_init(&s->free_sema);
> @@ -2305,20 +2301,23 @@ static int nbd_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  return -EEXIST;
>  }
>  
> +ret = nbd_process_options(bs, options, errp);
> +if (ret < 0) {
> +goto fail;
> +}
> +
>  /*
>   * establish TCP connection, return error if it fails
>   * TODO: Configurable retry-until-timeout behaviour.
>   */
>  if (nbd_establish_connection(bs, s->saddr, errp) < 0) {
> -yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
> -return -ECONNREFUSED;
> +ret = -ECONNREFUSED;
> +goto fail;
>  }
>  
>  ret = nbd_client_handshake(bs, errp);

Not that this was introduced by this patch, but once you're at it:
AFAICT nbd_client_handshake() calls yank_unregister_instance() on some
error path(s); I assume this needs to go too, otherwise it's called
twice (and asserts).

Roman.

>  if (ret < 0) {
> -yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
> -nbd_clear_bdrvstate(s);
> -return ret;
> +goto fail;
>  }
>  /* successfully connected */
>  s->state = NBD_CLIENT_CONNECTED;
> @@ -2330,6 +2329,10 @@ static int nbd_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  aio_co_schedule(bdrv_get_aio_context(bs), s->connection_co);
>  
>  return 0;
> +
> +fail:
> +nbd_clear_bdrvstate(bs);
> +return ret;
>  }
>  
>  static int nbd_co_flush(BlockDriverState *bs)
> @@ -2373,11 +2376,8 @@ static void nbd_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  
>  static void nbd_close(BlockDriverState *bs)
>  {
> -BDRVNBDState *s = bs->opaque;
> -
>  nbd_client_close(bs);
> -yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
> -nbd_clear_bdrvstate(s);
> +nbd_clear_bdrvstate(bs);
>  }
>  
>  /*



Re: [PATCH v2 0/8] Block layer thread-safety, continued

2021-04-21 Thread Paolo Bonzini

On 19/04/21 10:55, Emanuele Giuseppe Esposito wrote:

This and the following series of patches are based on Paolo's
v1 patches sent in 2017[*]. They have been ported to the current QEMU
version, but the goal remains the same:
- make the block layer thread-safe (patches 1-5), and
- remove aio_context_acquire/release (patches 6-8).

[*] = https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg01398.html

Signed-off-by: Emanuele Giuseppe Esposito 


This looks good to me, though the commit message of patch 8 needs to be 
rewritten.


Paolo


---
v1 (2017) -> v2 (2021):
- v1 Patch "block-backup: add reqs_lock" has been dropped, because it is
   now completely different from the old version and all functions
   that were affected by it have been moved or deleted.
   It will be replaced by another series that aims to bring thread safety
   to block/block-copy.c
- remaining v1 patches will be integrated in the next series.
- Patch "block: do not acquire AioContext in check_to_replace_node"
   moves part of the logic of check_to_replace_node to the caller,
   so that the function can be included in the aio_context_acquire/release
   block that follows.

Emanuele Giuseppe Esposito (8):
   block: prepare write threshold code for thread safety
   block: protect write threshold QMP commands from concurrent requests
   util: use RCU accessors for notifiers
   block: make before-write notifiers thread-safe
   block: add a few more notes on locking
   block: do not acquire AioContext in check_to_replace_node
   block/replication: do not acquire AioContext
   block: do not take AioContext around reopen

  block.c   | 28 ++--
  block/block-backend.c |  4 ---
  block/io.c| 12 +
  block/mirror.c|  9 ---
  block/replication.c   | 54 +--
  block/write-threshold.c   | 39 ++--
  blockdev.c| 26 +--
  include/block/block.h |  1 +
  include/block/block_int.h | 42 +-
  util/notify.c | 13 +-
  10 files changed, 113 insertions(+), 115 deletions(-)






Re: [PATCH v2 8/8] block: do not take AioContext around reopen

2021-04-21 Thread Paolo Bonzini

On 19/04/21 10:55, Emanuele Giuseppe Esposito wrote:

Reopen needs to handle AioContext carefully due to calling
bdrv_drain_all_begin/end.  By not taking AioContext around calls to
bdrv_reopen_multiple, we can drop the function's release/acquire
pair and the AioContext argument too.


So... I wrote this commit message and I cannot parse it anymore, much 
less relate it to the code in the patch.  This is a problem, but it 
doesn't mean that the patch is wrong.


bdrv_reopen_multiple does not have the AioContext argument anymore. 
It's not doing release/acquire either.  The relevant commit is commit 
1a63a90750 ("block: Keep nodes drained between reopen_queue/multiple", 
2017-12-22).  You're basically cleaning up after that code in the same 
way as patch 7: reopen functions take care of keeping the BDS quiescent, 
so there's nothing to synchronize on.


For the future, the important step you missed was to check your diff 
against the one that you cherry-picked from.  Then you would have 
noticed that 1) it's much smaller 2) one thing that is mentioned in the 
commit message ("drop the function's release/acquire pair and argument") 
is not needed anymore.


Paolo


Signed-off-by: Paolo Bonzini 
Signed-off-by: Emanuele Giuseppe Esposito 
---
  block/block-backend.c |  4 
  block/mirror.c|  9 -
  blockdev.c| 19 ++-
  3 files changed, 6 insertions(+), 26 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 413af51f3b..6fdc698e9e 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2291,20 +2291,16 @@ int blk_commit_all(void)
  BlockBackend *blk = NULL;
  
  while ((blk = blk_all_next(blk)) != NULL) {

-AioContext *aio_context = blk_get_aio_context(blk);
  BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk));
  
-aio_context_acquire(aio_context);

  if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) {
  int ret;
  
  ret = bdrv_commit(unfiltered_bs);

  if (ret < 0) {
-aio_context_release(aio_context);
  return ret;
  }
  }
-aio_context_release(aio_context);
  }
  return 0;
  }
diff --git a/block/mirror.c b/block/mirror.c
index 5a71bd8bbc..43174bbc6b 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -631,7 +631,6 @@ static int mirror_exit_common(Job *job)
  MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
>   BlockJob *bjob = &s->common;
  MirrorBDSOpaque *bs_opaque;
-AioContext *replace_aio_context = NULL;
  BlockDriverState *src;
  BlockDriverState *target_bs;
  BlockDriverState *mirror_top_bs;
@@ -699,11 +698,6 @@ static int mirror_exit_common(Job *job)
  }
  }
  
-if (s->to_replace) {

-replace_aio_context = bdrv_get_aio_context(s->to_replace);
-aio_context_acquire(replace_aio_context);
-}
-
  if (s->should_complete && !abort) {
  BlockDriverState *to_replace = s->to_replace ?: src;
  bool ro = bdrv_is_read_only(to_replace);
@@ -740,9 +734,6 @@ static int mirror_exit_common(Job *job)
  error_free(s->replace_blocker);
  bdrv_unref(s->to_replace);
  }
-if (replace_aio_context) {
-aio_context_release(replace_aio_context);
-}
  g_free(s->replaces);
  bdrv_unref(target_bs);
  
diff --git a/blockdev.c b/blockdev.c

index e901107344..1672ef756e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3469,7 +3469,6 @@ void qmp_change_backing_file(const char *device,
   Error **errp)
  {
  BlockDriverState *bs = NULL;
-AioContext *aio_context;
  BlockDriverState *image_bs = NULL;
  Error *local_err = NULL;
  bool ro;
@@ -3480,37 +3479,34 @@ void qmp_change_backing_file(const char *device,
  return;
  }
  
-aio_context = bdrv_get_aio_context(bs);

-aio_context_acquire(aio_context);
-
>   image_bs = bdrv_lookup_bs(NULL, image_node_name, &local_err);
  if (local_err) {
  error_propagate(errp, local_err);
-goto out;
+return;
  }
  
  if (!image_bs) {

  error_setg(errp, "image file not found");
-goto out;
+return;
  }
  
  if (bdrv_find_base(image_bs) == image_bs) {

  error_setg(errp, "not allowing backing file change on an image "
   "without a backing file");
-goto out;
+return;
  }
  
  /* even though we are not necessarily operating on bs, we need it to

   * determine if block ops are currently prohibited on the chain */
  if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_CHANGE, errp)) {
-goto out;
+return;
  }
  
  /* final sanity check */

  if (!bdrv_chain_contains(bs, image_bs)) {
  error_setg(errp, "'%s' and image file are not in the same chain",
 device);
-goto out;
+return;
 

Re: [RFC PATCH 0/3] block-copy: lock tasks and calls list

2021-04-21 Thread Paolo Bonzini

On 21/04/21 10:53, Vladimir Sementsov-Ogievskiy wrote:


Good point. Emanuele, can you work on ProgressMeter and 
SharedResource? AioTaskPool can also be converted to just use CoQueue 
instead of manually waking up coroutines.




That would be great.

I have one more question in mind:

Is it effective to use CoMutex here? We are protecting only some fast 
manipulations with data, not the io path or something like that. Will a 
simple QemuMutex work better? Even if CoMutex doesn't have any overhead, I 
don't think that, if thread A wants to modify the task list but the mutex 
is held by thread B (for a similar thing), there is a reason for thread A 
to yield and do some other things: it can just wait several moments on the 
mutex while B is modifying the task list.


Indeed even CoQueue primitives count as simple manipulation of data, 
because they unlock/lock the mutex while the coroutine sleeps.  So 
you're right that it would be okay to use QemuMutex as well.


The block copy code that Emanuele has touched so far is all coroutine 
based.  I like using CoMutex when that is the case, because it says 
implicitly "the monitor is not involved".  But we need to see what it 
will be like when the patches are complete.


Rate limiting ends up being called by the monitor, but it will have its 
own QemuMutex so it's fine.  What's left is cancellation and 
block_copy_kick; I think that we can make qemu_co_sleep thread-safe with 
an API similar to Linux's prepare_to_wait, so a QemuMutex wouldn't be 
needed there either.


Paolo




Re: [PATCH v3 2/2] block/rbd: Add an escape-aware strchr helper

2021-04-21 Thread Stefano Garzarella

On Fri, Apr 09, 2021 at 09:38:54AM -0500, Connor Kuehl wrote:

Sometimes the parser needs to further split a token it has collected
from the token input stream. Right now, it does a cursory check to see
if the relevant characters appear in the token to determine if it should
break it down further.

However, qemu_rbd_next_tok() will escape characters as it removes tokens
from the token stream and plain strchr() won't. This can make the
initial strchr() check slightly misleading since it implies
qemu_rbd_next_tok() will find the token and split on it, except the
reality is that qemu_rbd_next_tok() will pass over it if it is escaped.

Use a custom strchr to avoid mixing escaped and unescaped string
operations.

Reported-by: Han Han 
Fixes: https://bugzilla.redhat.com/1873913
Signed-off-by: Connor Kuehl 
---
 v2 -> v3:
   * Update qemu_rbd_strchr to only skip if there's a delimiter AND the
 next character is not the NUL terminator

block/rbd.c| 20 ++--
tests/qemu-iotests/231 |  4 
tests/qemu-iotests/231.out |  3 +++
3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 9071a00e3f..291e3f09e1 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -134,6 +134,22 @@ static char *qemu_rbd_next_tok(char *src, char delim, char 
**p)
return src;
}

+static char *qemu_rbd_strchr(char *src, char delim)
+{
+char *p;
+
+for (p = src; *p; ++p) {
+if (*p == delim) {
+return p;
+}
+if (*p == '\\' && p[1] != '\0') {
+++p;
+}
+}
+
+return NULL;
+}
+


IIUC this is similar to the code used in qemu_rbd_next_tok().
To avoid code duplication, can we use this new function inside it?

I mean something like this (not tested):

diff --git a/block/rbd.c b/block/rbd.c
index f098a89c7b..eb6a839362 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -119,15 +119,8 @@ static char *qemu_rbd_next_tok(char *src, char delim, char 
**p)
 
 *p = NULL;
 
-for (end = src; *end; ++end) {

-if (*end == delim) {
-break;
-}
-if (*end == '\\' && end[1] != '\0') {
-end++;
-}
-}
-if (*end == delim) {
+end = qemu_rbd_strchr(src, delim);
+if (end && *end == delim) {
 *p = end + 1;
 *end = '\0';
 }
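
As a quick sanity check of the helper's behaviour, here is a small
standalone harness (not part of the patch); qemu_rbd_strchr() is copied
verbatim from the patch above:

#include <assert.h>
#include <stdio.h>

static char *qemu_rbd_strchr(char *src, char delim)
{
    char *p;

    for (p = src; *p; ++p) {
        if (*p == delim) {
            return p;
        }
        if (*p == '\\' && p[1] != '\0') {
            ++p;
        }
    }

    return NULL;
}

int main(void)
{
    char a[] = "rbd/image\\:with\\:colons:conf";
    char b[] = "no\\:delimiter\\:anywhere";

    /* The escaped ':' at offsets 10 and 16 are skipped; the real
     * delimiter is the unescaped ':' at offset 23. */
    assert(qemu_rbd_strchr(a, ':') == a + 23);
    assert(qemu_rbd_strchr(b, ':') == NULL);
    printf("ok\n");
    return 0;
}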


The rest LGTM!

Thanks for fixing this issue,
Stefano




Re: [PATCH v3 1/2] iotests/231: Update expected deprecation message

2021-04-21 Thread Stefano Garzarella

On Fri, Apr 09, 2021 at 09:38:53AM -0500, Connor Kuehl wrote:

The deprecation message in the expected output has technically been
wrong since the wrong version of a patch was applied to it. Because of
this, the test fails. Correct the expected output so that it passes.

Signed-off-by: Connor Kuehl 
Reviewed-by: Max Reitz 
---
tests/qemu-iotests/231.out | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)


Reviewed-by: Stefano Garzarella 



diff --git a/tests/qemu-iotests/231.out b/tests/qemu-iotests/231.out
index 579ba11c16..747dd221bb 100644
--- a/tests/qemu-iotests/231.out
+++ b/tests/qemu-iotests/231.out
@@ -1,9 +1,7 @@
QA output created by 231
-qemu-img: RBD options encoded in the filename as keyvalue pairs is deprecated. Future versions may cease to parse these options in the future.
+qemu-img: warning: RBD options encoded in the filename as keyvalue pairs is deprecated
unable to get monitor info from DNS SRV with service name: ceph-mon
-no monitors specified to connect to.
qemu-img: Could not open 'json:{'file.driver':'rbd','file.filename':'rbd:rbd/bogus:conf=BOGUS_CONF'}': error connecting: No such file or directory
unable to get monitor info from DNS SRV with service name: ceph-mon
-no monitors specified to connect to.
qemu-img: Could not open 'json:{'file.driver':'rbd','file.pool':'rbd','file.image':'bogus','file.conf':'BOGUS_CONF'}': error connecting: No such file or directory
*** done
--
2.30.2







Re: [PATCH v3] hw/block/nvme: fix lbaf formats initialization

2021-04-21 Thread Philippe Mathieu-Daudé
On 4/16/21 1:59 PM, Gollu Appalanaidu wrote:
> Currently LBAF formats are being intialized based on metadata
> size if and only if nvme-ns "ms" parameter is non-zero value.
> Since FormatNVM command being supported device parameter "ms"
> may not be the criteria to initialize the supported LBAFs.
> 
> Signed-off-by: Gollu Appalanaidu 
> ---
> -v3: Remove "mset" constraint  check if ms < 8, "mset" can be
>  set even when ms < 8 and non-zero.
> 
> -v2: Addressing review comments (Klaus)
>  Change the current "pi" and "ms" constraint check such that it
>  will throw the error if ms < 8 and if namespace protection info,
>  location and metadata settings are set.
>  Splitting this from compare fix patch series.
> 
>  hw/block/nvme-ns.c | 58 --
>  1 file changed, 25 insertions(+), 33 deletions(-)

> +NvmeLBAF lbaf[16] = {

Unrelated to your change, but better to use a read-only array:

   static const NvmeLBAF lbaf[16] = {

> +[0] = { .ds =  9   },
> +[1] = { .ds =  9, .ms =  8 },
> +[2] = { .ds =  9, .ms = 16 },
> +[3] = { .ds =  9, .ms = 64 },
> +[4] = { .ds = 12   },
> +[5] = { .ds = 12, .ms =  8 },
> +[6] = { .ds = 12, .ms = 16 },
> +[7] = { .ds = 12, .ms = 64 },
> +};
> +
> +memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
> +id_ns->nlbaf = 7;
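
Concretely, the suggestion amounts to something like this sketch
(against the snippet above; static const puts the table in read-only
storage instead of rebuilding it on the stack on each call):

    static const NvmeLBAF lbaf[16] = {
        [0] = { .ds =  9           },
        [1] = { .ds =  9, .ms =  8 },
        [2] = { .ds =  9, .ms = 16 },
        [3] = { .ds =  9, .ms = 64 },
        [4] = { .ds = 12           },
        [5] = { .ds = 12, .ms =  8 },
        [6] = { .ds = 12, .ms = 16 },
        [7] = { .ds = 12, .ms = 64 },
    };

    memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
    id_ns->nlbaf = 7;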




Re: [PATCH v3] hw/block/nvme: fix lbaf formats initialization

2021-04-21 Thread Gollu Appalanaidu

On Tue, Apr 20, 2021 at 09:47:00PM +0200, Klaus Jensen wrote:

On Apr 16 17:29, Gollu Appalanaidu wrote:

Currently LBAF formats are being intialized based on metadata
size if and only if nvme-ns "ms" parameter is non-zero value.
Since FormatNVM command being supported device parameter "ms"
may not be the criteria to initialize the supported LBAFs.

Signed-off-by: Gollu Appalanaidu 
---
-v3: Remove "mset" constraint  check if ms < 8, "mset" can be
set even when ms < 8 and non-zero.

-v2: Addressing review comments (Klaus)
Change the current "pi" and "ms" constraint check such that it
will throw the error if ms < 8 and if namespace protection info,
location and metadata settings are set.
Splitting this from compare fix patch series.

hw/block/nvme-ns.c | 58 --
1 file changed, 25 insertions(+), 33 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 7bb618f182..594b0003cf 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -85,38 +85,28 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
   ds = 31 - clz32(ns->blkconf.logical_block_size);
   ms = ns->params.ms;

-if (ns->params.ms) {
-id_ns->mc = 0x3;
+id_ns->mc = 0x3;

-if (ns->params.mset) {
-id_ns->flbas |= 0x10;
-}
+if (ms && ns->params.mset) {
+id_ns->flbas |= 0x10;
+}

-id_ns->dpc = 0x1f;
-id_ns->dps = ((ns->params.pil & 0x1) << 3) | ns->params.pi;
-
-NvmeLBAF lbaf[16] = {
-[0] = { .ds =  9   },
-[1] = { .ds =  9, .ms =  8 },
-[2] = { .ds =  9, .ms = 16 },
-[3] = { .ds =  9, .ms = 64 },
-[4] = { .ds = 12   },
-[5] = { .ds = 12, .ms =  8 },
-[6] = { .ds = 12, .ms = 16 },
-[7] = { .ds = 12, .ms = 64 },
-};
-
-memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
-id_ns->nlbaf = 7;
-} else {
-NvmeLBAF lbaf[16] = {
-[0] = { .ds =  9 },
-[1] = { .ds = 12 },
-};
+id_ns->dpc = 0x1f;
+id_ns->dps = ((ns->params.pil & 0x1) << 3) | ns->params.pi;

-memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
-id_ns->nlbaf = 1;
-}
+NvmeLBAF lbaf[16] = {
+[0] = { .ds =  9   },
+[1] = { .ds =  9, .ms =  8 },
+[2] = { .ds =  9, .ms = 16 },
+[3] = { .ds =  9, .ms = 64 },
+[4] = { .ds = 12   },
+[5] = { .ds = 12, .ms =  8 },
+[6] = { .ds = 12, .ms = 16 },
+[7] = { .ds = 12, .ms = 64 },
+};
+
+memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
+id_ns->nlbaf = 7;

   for (i = 0; i <= id_ns->nlbaf; i++) {
   NvmeLBAF *lbaf = &id_ns->lbaf[i];


This part LGTM.


@@ -395,10 +385,12 @@ static int nvme_ns_check_constraints(NvmeCtrl *n, 
NvmeNamespace *ns,
   return -1;
   }

-if (ns->params.pi && ns->params.ms < 8) {
-error_setg(errp, "at least 8 bytes of metadata required to enable "
-   "protection information");
-return -1;
+if (ns->params.ms < 8) {
+if (ns->params.pi || ns->params.pil) {
+error_setg(errp, "at least 8 bytes of metadata required to enable "
+"protection information, protection information location");
+return -1;
+}
   }



If you do this additional check, then you should maybe also check that 
pil is only set if pi is. But if pi is not enabled, then the value of 
pil is irrelevant (even though it ends up in FLBAS). In other words, 
if you want to validate all possible parameter configurations, then we 
have a lot more checking to do!
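
For concreteness, that extra check would be something like this sketch 
(and, as argued below, it is probably unwanted):

    if (ns->params.pil && !ns->params.pi) {
        /* pil is meaningless while protection information is off */
        error_setg(errp, "protection information location requires "
                   "protection information to be enabled");
        return -1;
    }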


Currently, the approach taken by the parameter validation code is to 
error out on *invalid* configurations that causes invariants to not 
hold, and I'd prefer that we stick with that to keep the check logic 
as simple as possible.


So, (without this unnecessary check):



Sure, will remove this check and send v4


Reviewed-by: Klaus Jensen 





Re: [RFC PATCH 0/3] block-copy: lock tasks and calls list

2021-04-21 Thread Vladimir Sementsov-Ogievskiy

21.04.2021 11:38, Paolo Bonzini wrote:

On 20/04/21 15:12, Vladimir Sementsov-Ogievskiy wrote:

20.04.2021 13:04, Emanuele Giuseppe Esposito wrote:

This serie of patches continues Paolo's series on making the
block layer thread safe. Add a CoMutex lock for both tasks and
calls list present in block/block-copy.c



I think we need more information about what kind of thread-safety we want. 
Should the whole interface of block-copy be thread safe? Or only part of it? 
What is going to be shared between different threads? Which functions will be 
called concurrently from different threads? This should be documented in 
include/block/block-copy.h.


I guess all of it.  So more state fields should be identified in 
BlockCopyState, especially in_flight_bytes.  ProgressMeter and SharedResource 
should be made thread-safe on their own, just like the patch I posted for 
RateLimit.


What I see here is that some things are protected by the mutex and some 
things are not. What became thread-safe?

For example, in block_copy_dirty_clusters(), we modify task fields without any 
mutex held:

  block_copy_task_shrink doesn't take mutex.
  task->zeroes is set without mutex as well


Agreed, these are bugs in the series.


Still all these accesses are done when task is already added to the list.

Looping in block_copy_common() is not thread safe either.


That one should be mostly safe, because only one coroutine ever writes to all 
fields except cancelled.  cancelled should be accessed with 
qatomic_read/qatomic_set, but there's also the problem that coroutine 
sleep/wake APIs are hard to use in a thread-safe manner (which affects 
block_copy_kick).  This is a different topic and it is something I'm working on,


You also forgot to protect the QLIST_REMOVE() call in block_copy_task_end().

Next, block-copy uses co-shared-resource API, which is not thread-safe (as it 
is directly noted in include/qemu/co-shared-resource.h).

The same goes for the block/aio_task API, which is not thread-safe either.

So, we should bring thread-safety first to these smaller helper APIs.


Good point.  Emanuele, can you work on ProgressMeter and SharedResource? 
AioTaskPool can also be converted to just use CoQueue instead of manually 
waking up coroutines.



That would be great.

I have one more question in mind:

Is it effective to use CoMutex here? We are protecting only some fast 
manipulations with data, not the io path or something like that. Will a 
simple QemuMutex work better? Even if CoMutex doesn't have any overhead, I 
don't think that, if thread A wants to modify the task list but the mutex is 
held by thread B (for a similar thing), there is a reason for thread A to 
yield and do some other things: it can just wait several moments on the 
mutex while B is modifying the task list.

--
Best regards,
Vladimir



Re: [RFC PATCH 0/3] block-copy: lock tasks and calls list

2021-04-21 Thread Paolo Bonzini

On 20/04/21 15:12, Vladimir Sementsov-Ogievskiy wrote:

20.04.2021 13:04, Emanuele Giuseppe Esposito wrote:

This serie of patches continues Paolo's series on making the
block layer thread safe. Add a CoMutex lock for both tasks and
calls list present in block/block-copy.c



I think we need more information about what kind of thread-safety we 
want. Should the whole interface of block-copy be thread safe? Or only 
part of it? What is going to be shared between different threads? Which 
functions will be called concurrently from different threads? This 
should be documented in include/block/block-copy.h.


I guess all of it.  So more state fields should be identified in 
BlockCopyState, especially in_flight_bytes.  ProgressMeter and 
SharedResource should be made thread-safe on their own, just like the 
patch I posted for RateLimit.


What I see here is that some things are protected by the mutex and some 
things are not. What became thread-safe?


For example, in block_copy_dirty_clusters(), we modify task fields 
without any mutex held:


  block_copy_task_shrink doesn't take mutex.
  task->zeroes is set without mutex as well


Agreed, these are bugs in the series.


Still all these accesses are done when task is already added to the list.

Looping in block_copy_common() is not thread safe either.


That one should be mostly safe, because only one coroutine ever writes 
to all fields except cancelled.  cancelled should be accessed with 
qatomic_read/qatomic_set, but there's also the problem that coroutine 
sleep/wake APIs are hard to use in a thread-safe manner (which affects 
block_copy_kick).  This is a different topic and it is something I'm 
working on,



You also forgot to protect the QLIST_REMOVE() call in block_copy_task_end().

Next, block-copy uses co-shared-resource API, which is not thread-safe 
(as it is directly noted in include/qemu/co-shared-resource.h).


The same goes for the block/aio_task API, which is not thread-safe either.

So, we should bring thread-safety first to these smaller helper APIs.


Good point.  Emanuele, can you work on ProgressMeter and SharedResource? 
 AioTaskPool can also be converted to just use CoQueue instead of 
manually waking up coroutines.
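
As a rough illustration of that CoQueue conversion (the pool fields and 
function names here are assumptions, not the current block/aio_task.c 
layout):

#include "qemu/osdep.h"
#include "qemu/coroutine.h"

typedef struct AioTaskPool {
    CoMutex lock;
    CoQueue waiters;          /* replaces the manual wakeup bookkeeping */
    int busy_tasks;
    int max_busy_tasks;
} AioTaskPool;

/* Called by a coroutine that wants to start a new task. */
static void coroutine_fn aio_task_pool_wait_slot(AioTaskPool *pool)
{
    qemu_co_mutex_lock(&pool->lock);
    while (pool->busy_tasks >= pool->max_busy_tasks) {
        /* Drops the lock while sleeping, re-takes it on wakeup. */
        qemu_co_queue_wait(&pool->waiters, &pool->lock);
    }
    pool->busy_tasks++;
    qemu_co_mutex_unlock(&pool->lock);
}

/* Called when a task completes. */
static void coroutine_fn aio_task_pool_put_slot(AioTaskPool *pool)
{
    qemu_co_mutex_lock(&pool->lock);
    pool->busy_tasks--;
    qemu_co_queue_next(&pool->waiters);
    qemu_co_mutex_unlock(&pool->lock);
}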





[PATCH] monitor: hmp_qemu_io: acquire aio contex, fix crash

2021-04-21 Thread Vladimir Sementsov-Ogievskiy
Max reported the following bug:

$ ./qemu-img create -f raw src.img 1G
$ ./qemu-img create -f raw dst.img 1G

$ (echo '
   {"execute":"qmp_capabilities"}
   {"execute":"blockdev-mirror",
"arguments":{"job-id":"mirror",
 "device":"source",
 "target":"target",
 "sync":"full",
 "filter-node-name":"mirror-top"}}
'; sleep 3; echo '
   {"execute":"human-monitor-command",
"arguments":{"command-line":
 "qemu-io mirror-top \"write 0 1G\""}}') \
| x86_64-softmmu/qemu-system-x86_64 \
   -qmp stdio \
   -blockdev file,node-name=source,filename=src.img \
   -blockdev file,node-name=target,filename=dst.img \
   -object iothread,id=iothr0 \
   -device virtio-blk,drive=source,iothread=iothr0

crashes:

0  raise () at /usr/lib/libc.so.6
1  abort () at /usr/lib/libc.so.6
2  error_exit
   (err=<optimized out>,
   msg=msg@entry=0x55fbb1634790 <__func__.27> "qemu_mutex_unlock_impl")
   at ../util/qemu-thread-posix.c:37
3  qemu_mutex_unlock_impl
   (mutex=mutex@entry=0x55fbb25ab6e0,
   file=file@entry=0x55fbb1636957 "../util/async.c",
   line=line@entry=650)
   at ../util/qemu-thread-posix.c:109
4  aio_context_release (ctx=ctx@entry=0x55fbb25ab680) at ../util/async.c:650
5  bdrv_do_drained_begin
   (bs=bs@entry=0x55fbb3a87000, recursive=recursive@entry=false,
   parent=parent@entry=0x0,
   ignore_bds_parents=ignore_bds_parents@entry=false,
   poll=poll@entry=true) at ../block/io.c:441
6  bdrv_do_drained_begin
   (poll=true, ignore_bds_parents=false, parent=0x0, recursive=false,
   bs=0x55fbb3a87000) at ../block/io.c:448
7  blk_drain (blk=0x55fbb26c5a00) at ../block/block-backend.c:1718
8  blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:498
9  blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:491
10 hmp_qemu_io (mon=0x7fffaf3fc7d0, qdict=<optimized out>)
   at ../block/monitor/block-hmp-cmds.c:628

man pthread_mutex_unlock
...
EPERM  The  mutex type is PTHREAD_MUTEX_ERRORCHECK or
PTHREAD_MUTEX_RECURSIVE, or the mutex is a robust mutex, and the
current thread does not own the mutex.

So the thread doesn't own the mutex. And we have an iothread here.

Next, note that AIO_WAIT_WHILE() documents that ctx must be acquired
exactly once by the caller. But where is it acquired in the call stack?
Seemingly nowhere.

qemuio_command() does acquire the aio context, but we need the context
acquired around blk_unref() as well. Let's do it.
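
The wrong-thread unlock is easy to reproduce in isolation. A standalone
demonstration (purely illustrative; it uses an error-checking mutex to
guarantee the EPERM that error_exit() then turns into the abort):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t m;

static void *wrong_unlocker(void *arg)
{
    /* This thread never locked m, so unlock must fail. */
    int ret = pthread_mutex_unlock(&m);
    printf("unlock from non-owner: %d (EPERM is %d)\n", ret, EPERM);
    return NULL;
}

int main(void)
{
    pthread_mutexattr_t attr;
    pthread_t t;

    pthread_mutexattr_init(&attr);
    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
    pthread_mutex_init(&m, &attr);

    pthread_mutex_lock(&m);            /* main thread owns the mutex */
    pthread_create(&t, NULL, wrong_unlocker, NULL);
    pthread_join(t, NULL);
    pthread_mutex_unlock(&m);          /* owner unlock succeeds */
    return 0;
}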

Reported-by: Max Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/monitor/block-hmp-cmds.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index ebf1033f31..934100d0eb 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -559,6 +559,7 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 {
 BlockBackend *blk;
 BlockBackend *local_blk = NULL;
+AioContext *ctx;
 bool qdev = qdict_get_try_bool(qdict, "qdev", false);
 const char *device = qdict_get_str(qdict, "device");
 const char *command = qdict_get_str(qdict, "command");
@@ -615,7 +616,13 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 qemuio_command(blk, command);
 
 fail:
+ctx = blk_get_aio_context(blk);
+aio_context_acquire(ctx);
+
 blk_unref(local_blk);
+
+aio_context_release(ctx);
+
 hmp_handle_error(mon, err);
 }
 
-- 
2.29.2




Re: [PATCH v4 09/23] job: call job_enter from job_pause

2021-04-21 Thread Vladimir Sementsov-Ogievskiy

07.04.2021 14:38, Vladimir Sementsov-Ogievskiy wrote:

07.04.2021 14:19, Max Reitz wrote:

On 16.01.21 22:46, Vladimir Sementsov-Ogievskiy wrote:

If main job coroutine called job_yield (while some background process
is in progress), we should give it a chance to call job_pause_point().
It will be used in backup, when moved on async block-copy.

Note, that job_user_pause is not enough: we want to handle
child_job_drained_begin() as well, which calls job_pause().

Still, if job is already in job_do_yield() in job_pause_point() we
should not enter it.

iotest 109 output is modified: on stop we do bdrv_drain_all() which now
triggers job pause immediately (and pause after ready is standby).

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  job.c  |  3 +++
  tests/qemu-iotests/109.out | 24 
  2 files changed, 27 insertions(+)


While looking into

https://lists.gnu.org/archive/html/qemu-block/2021-04/msg00035.html

I noticed this:

$ ./qemu-img create -f raw src.img 1G
$ ./qemu-img create -f raw dst.img 1G

$ (echo '
    {"execute":"qmp_capabilities"}
    {"execute":"blockdev-mirror",
 "arguments":{"job-id":"mirror",
  "device":"source",
  "target":"target",
  "sync":"full",
  "filter-node-name":"mirror-top"}}
'; sleep 3; echo '
    {"execute":"human-monitor-command",
 "arguments":{"command-line":
  "qemu-io mirror-top \"write 0 1G\""}}') \
| x86_64-softmmu/qemu-system-x86_64 \
    -qmp stdio \
    -blockdev file,node-name=source,filename=src.img \
    -blockdev file,node-name=target,filename=dst.img \
    -object iothread,id=iothr0 \
    -device virtio-blk,drive=source,iothread=iothr0

Before this commit, qemu-io returned an error that there is a permission 
conflict with virtio-blk.  After this commit, there is an abort (“qemu: 
qemu_mutex_unlock_impl: Operation not permitted”):

#0  0x7f8445a4eef5 in raise () at /usr/lib/libc.so.6
#1  0x7f8445a38862 in abort () at /usr/lib/libc.so.6
#2  0x55fbb14a36bf in error_exit
    (err=<optimized out>, msg=msg@entry=0x55fbb1634790 <__func__.27> 
"qemu_mutex_unlock_impl")
    at ../util/qemu-thread-posix.c:37
#3  0x55fbb14a3bc3 in qemu_mutex_unlock_impl
    (mutex=mutex@entry=0x55fbb25ab6e0, file=file@entry=0x55fbb1636957 
"../util/async.c", line=line@entry=650)
    at ../util/qemu-thread-posix.c:109
#4  0x55fbb14b2e75 in aio_context_release (ctx=ctx@entry=0x55fbb25ab680) at 
../util/async.c:650
#5  0x55fbb13d2029 in bdrv_do_drained_begin
    (bs=bs@entry=0x55fbb3a87000, recursive=recursive@entry=false, 
parent=parent@entry=0x0, ignore_bds_parents=ignore_bds_parents@entry=false, 
poll=poll@entry=true) at ../block/io.c:441
#6  0x55fbb13d2192 in bdrv_do_drained_begin
    (poll=true, ignore_bds_parents=false, parent=0x0, recursive=false, 
bs=0x55fbb3a87000) at ../block/io.c:448
#7  0x55fbb13c71a7 in blk_drain (blk=0x55fbb26c5a00) at 
../block/block-backend.c:1718
#8  0x55fbb13c8bbd in blk_unref (blk=0x55fbb26c5a00) at 
../block/block-backend.c:498
#9  blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:491
#10 0x55fbb1024863 in hmp_qemu_io (mon=0x7fffaf3fc7d0, qdict=<optimized out>)
    at ../block/monitor/block-hmp-cmds.c:628

Can you make anything out of this?



Hmm.. Interesting.

man pthread_mutex_unlock
...
     EPERM  The  mutex type is PTHREAD_MUTEX_ERRORCHECK or 
PTHREAD_MUTEX_RECURSIVE, or the mutex is a
  robust mutex, and the current thread does not own the mutex.

So, thread doesn't own the mutex.. We have an iothread here.

AIO_WAIT_WHILE() documents that ctx must be acquired exactly once by the 
caller. But I don't see where it is acquired in the call stack.

The other question is why the permission conflict is lost with the commit. 
Strange. I see that hmp_qemu_io creates blk with perm=0 and 
shared=BLK_PERM_ALL. How could it conflict even before the considered commit?




Sorry, I answered and then forgot about this thread. Now, looking through my 
series, I found this again. It seems that the problem is really the lack of 
aio-context locking around blk_unref(). I'll send a patch now.


--
Best regards,
Vladimir



[PATCH] mirror: stop cancelling in-flight requests on non-force cancel in READY

2021-04-21 Thread Vladimir Sementsov-Ogievskiy
If mirror is READY then the cancel operation does not discard the whole
result of the operation; instead, it's a documented way to get a
point-in-time snapshot of the source disk.

So, we should not cancel any requests if mirror is READY and
force=false. Let's fix that case.

Note that the bug we had before this commit is not critical, as the
only .bdrv_cancel_in_flight implementation is nvd_cancel_in_flight's
counterpart nbd_cancel_in_flight(), which cancels only requests waiting
for reconnection, so it should be a rare case.

Fixes: 521ff8b779b11c394dbdc43f02e158dd99df308a
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block_int.h | 2 +-
 include/qemu/job.h| 2 +-
 block/backup.c| 2 +-
 block/mirror.c| 6 --
 job.c | 2 +-
 tests/qemu-iotests/264| 2 +-
 6 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 88e4111939..584381fdb0 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -357,7 +357,7 @@ struct BlockDriver {
  * of in-flight requests, so don't waste the time if possible.
  *
  * One example usage is to avoid waiting for an nbd target node reconnect
- * timeout during job-cancel.
+ * timeout during job-cancel with force=true.
  */
 void (*bdrv_cancel_in_flight)(BlockDriverState *bs);
 
diff --git a/include/qemu/job.h b/include/qemu/job.h
index efc6fa7544..41162ed494 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -254,7 +254,7 @@ struct JobDriver {
 /**
  * If the callback is not NULL, it will be invoked in job_cancel_async
  */
-void (*cancel)(Job *job);
+void (*cancel)(Job *job, bool force);
 
 
 /** Called when the job is freed */
diff --git a/block/backup.c b/block/backup.c
index 6cf2f974aa..bd3614ce70 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -331,7 +331,7 @@ static void coroutine_fn backup_set_speed(BlockJob *job, 
int64_t speed)
 }
 }
 
-static void backup_cancel(Job *job)
+static void backup_cancel(Job *job, bool force)
 {
 BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
 
diff --git a/block/mirror.c b/block/mirror.c
index 5a71bd8bbc..fcd1b56991 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1178,12 +1178,14 @@ static bool mirror_drained_poll(BlockJob *job)
 return !!s->in_flight;
 }
 
-static void mirror_cancel(Job *job)
+static void mirror_cancel(Job *job, bool force)
 {
 MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
 BlockDriverState *target = blk_bs(s->target);
 
-bdrv_cancel_in_flight(target);
+if (force || !job_is_ready(job)) {
+bdrv_cancel_in_flight(target);
+}
 }
 
 static const BlockJobDriver mirror_job_driver = {
diff --git a/job.c b/job.c
index 4aff13d95a..8775c1803b 100644
--- a/job.c
+++ b/job.c
@@ -716,7 +716,7 @@ static int job_finalize_single(Job *job)
 static void job_cancel_async(Job *job, bool force)
 {
 if (job->driver->cancel) {
-job->driver->cancel(job);
+job->driver->cancel(job, force);
 }
 if (job->user_paused) {
 /* Do not call job_enter here, the caller will handle it.  */
diff --git a/tests/qemu-iotests/264 b/tests/qemu-iotests/264
index 4f96825a22..bc431d1a19 100755
--- a/tests/qemu-iotests/264
+++ b/tests/qemu-iotests/264
@@ -95,7 +95,7 @@ class TestNbdReconnect(iotests.QMPTestCase):
 self.assert_qmp(result, 'return', {})
 
 def cancel_job(self):
-result = self.vm.qmp('block-job-cancel', device='drive0')
+result = self.vm.qmp('block-job-cancel', device='drive0', force=True)
 self.assert_qmp(result, 'return', {})
 
 start_t = time.time()
-- 
2.29.2