[PATCH 3/3] hw/block/nvme: add id ns metadata cap (mc) enum
Add Identify Namespace Metadata Capabilities (MC) enum. Signed-off-by: Gollu Appalanaidu --- hw/block/nvme-ns.c | 2 +- include/block/nvme.h | 5 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 9065a7ae99..db75302136 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -85,7 +85,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) ds = 31 - clz32(ns->blkconf.logical_block_size); ms = ns->params.ms; -id_ns->mc = 0x3; +id_ns->mc = NVME_ID_NS_MC_EXTENDED | NVME_ID_NS_MC_SEPARATE; if (ms && ns->params.mset) { id_ns->flbas |= NVME_ID_NS_FLBAS_EXTENDEND; diff --git a/include/block/nvme.h b/include/block/nvme.h index 1d61030756..a3b610ba86 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1344,6 +1344,11 @@ enum NvmeIdNsFlbas { NVME_ID_NS_FLBAS_EXTENDEND = 1 << 4, }; +enum NvmeIdNsMc { +NVME_ID_NS_MC_EXTENDED = 1 << 0, +NVME_ID_NS_MC_SEPARATE = 1 << 1, +}; + #define NVME_ID_NS_DPS_TYPE(dps) (dps & NVME_ID_NS_DPS_TYPE_MASK) typedef struct NvmeDifTuple { -- 2.17.1
[PATCH 2/3] hw/block/nvme: add id ns flbas enum
Add the Identify Namespace FLBAS related enums and remove NVME_ID_NS_FLBAS_EXTENDEND macro its being used in only one place and converted into enum. Signed-off-by: Gollu Appalanaidu --- hw/block/nvme-ns.c | 2 +- hw/block/nvme-ns.h | 2 +- include/block/nvme.h | 5 - 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index ae56142fcd..9065a7ae99 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -88,7 +88,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) id_ns->mc = 0x3; if (ms && ns->params.mset) { -id_ns->flbas |= 0x10; +id_ns->flbas |= NVME_ID_NS_FLBAS_EXTENDEND; } id_ns->dpc = 0x1f; diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index fb0a41f912..5aa36cd1d2 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -134,7 +134,7 @@ static inline size_t nvme_m2b(NvmeNamespace *ns, uint64_t lba) static inline bool nvme_ns_ext(NvmeNamespace *ns) { -return !!NVME_ID_NS_FLBAS_EXTENDED(ns->id_ns.flbas); +return ns->id_ns.flbas & NVME_ID_NS_FLBAS_EXTENDEND; } /* calculate the number of LBAs that the namespace can accomodate */ diff --git a/include/block/nvme.h b/include/block/nvme.h index 4ac926fbc6..1d61030756 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1321,7 +1321,6 @@ typedef struct QEMU_PACKED NvmeIdNsZoned { #define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1)) #define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1) -#define NVME_ID_NS_FLBAS_EXTENDED(flbas)((flbas >> 4) & 0x1) #define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf)) #define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1) #define NVME_ID_NS_MC_EXTENDED(mc) ((mc & 0x1)) @@ -1341,6 +1340,10 @@ enum NvmeIdNsDps { NVME_ID_NS_DPS_FIRST_EIGHT = 8, }; +enum NvmeIdNsFlbas { +NVME_ID_NS_FLBAS_EXTENDEND = 1 << 4, +}; + #define NVME_ID_NS_DPS_TYPE(dps) (dps & NVME_ID_NS_DPS_TYPE_MASK) typedef struct NvmeDifTuple { -- 2.17.1
[PATCH 1/3] hw/block/nvme: fix lbaf formats initialization
Currently LBAF formats are being initialized based on metadata size if and only if nvme-ns "ms" parameter is non-zero value. Since FormatNVM command being supported device parameter "ms" may not be the criteria to initialize the supported LBAFs. And make LBAF array as read-only. Reviewed-by: Klaus Jensen Signed-off-by: Gollu Appalanaidu --- hw/block/nvme-ns.c | 51 -- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 7bb618f182..ae56142fcd 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -85,39 +85,32 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) ds = 31 - clz32(ns->blkconf.logical_block_size); ms = ns->params.ms; -if (ns->params.ms) { -id_ns->mc = 0x3; +id_ns->mc = 0x3; -if (ns->params.mset) { -id_ns->flbas |= 0x10; -} - -id_ns->dpc = 0x1f; -id_ns->dps = ((ns->params.pil & 0x1) << 3) | ns->params.pi; - -NvmeLBAF lbaf[16] = { -[0] = { .ds = 9 }, -[1] = { .ds = 9, .ms = 8 }, -[2] = { .ds = 9, .ms = 16 }, -[3] = { .ds = 9, .ms = 64 }, -[4] = { .ds = 12 }, -[5] = { .ds = 12, .ms = 8 }, -[6] = { .ds = 12, .ms = 16 }, -[7] = { .ds = 12, .ms = 64 }, -}; - -memcpy(_ns->lbaf, , sizeof(lbaf)); -id_ns->nlbaf = 7; -} else { -NvmeLBAF lbaf[16] = { -[0] = { .ds = 9 }, -[1] = { .ds = 12 }, -}; +if (ms && ns->params.mset) { +id_ns->flbas |= 0x10; +} -memcpy(_ns->lbaf, , sizeof(lbaf)); -id_ns->nlbaf = 1; +id_ns->dpc = 0x1f; +id_ns->dps = ns->params.pi; +if (ns->params.pi && ns->params.pil) { +id_ns->dps |= NVME_ID_NS_DPS_FIRST_EIGHT; } +static const NvmeLBAF lbaf[16] = { +[0] = { .ds = 9 }, +[1] = { .ds = 9, .ms = 8 }, +[2] = { .ds = 9, .ms = 16 }, +[3] = { .ds = 9, .ms = 64 }, +[4] = { .ds = 12 }, +[5] = { .ds = 12, .ms = 8 }, +[6] = { .ds = 12, .ms = 16 }, +[7] = { .ds = 12, .ms = 64 }, +}; + +memcpy(_ns->lbaf, , sizeof(lbaf)); +id_ns->nlbaf = 7; + for (i = 0; i <= id_ns->nlbaf; i++) { NvmeLBAF *lbaf = _ns->lbaf[i]; if (lbaf->ds == ds) { -- 2.17.1
[PATCH 0/3] hw/block/nvme: fixes lbaf formats initialization and adds idns enums
This series fixes the LBAF Formats initialization and adds the Identify Namespace Parameters enums to make it more readable. Gollu Appalanaidu (3): hw/block/nvme: fix lbaf formats initialization hw/block/nvme: add id ns flbas enum hw/block/nvme: add id ns metadata cap (mc) enum hw/block/nvme-ns.c | 51 +++- hw/block/nvme-ns.h | 2 +- include/block/nvme.h | 10 - 3 files changed, 32 insertions(+), 31 deletions(-) -- 2.17.1
Re: [PATCH v2 7/7] hw/block/pflash_cfi02: Simplify pflash_cfi02_register() prototype
On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote: The previous commit removed the mapping code from TYPE_PFLASH_CFI02. pflash_cfi02_register() doesn't use the 'nb_mappings' argument anymore. Simply remove it to simplify. Reviewed-by: David Gibson Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson r~
Re: [PATCH v2 6/7] hw/block/pflash_cfi02: Remove pflash_setup_mappings()
On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote: All boards calling pflash_cfi02_register() use nb_mappings=1, which does not do any mapping: $ git grep -wl pflash_cfi02_register hw/ hw/arm/xilinx_zynq.c hw/block/pflash_cfi02.c hw/lm32/lm32_boards.c hw/ppc/ppc405_boards.c hw/sh4/r2d.c We can remove this now unneeded code. Reviewed-by: David Gibson Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson r~
Re: [PATCH v2 5/7] hw/arm/digic: Map flash using memory_region_add_subregion_aliased()
On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote: Instead of using a device specific feature for mapping the flash memory multiple times over a wider region, use the generic memory_region_add_subregion_aliased() helper. There is no change in the memory layout. * before: $ qemu-system-arm -M canon-a1100 -S -monitor stdio QEMU 5.2.90 monitor - type 'help' for more information (qemu) info mtree address-space: memory - (prio 0, i/o): system -03ff (prio 0, ram): ram c021-c02100ff (prio 0, i/o): digic-timer c0210100-c02101ff (prio 0, i/o): digic-timer c0210200-c02102ff (prio 0, i/o): digic-timer c080-c0800017 (prio 0, i/o): digic-uart f800- (prio 0, i/o): pflash f800-f83f (prio 0, romd): alias pflash-alias @pflash -003f f840-f87f (prio 0, romd): alias pflash-alias @pflash -003f f880-f8bf (prio 0, romd): alias pflash-alias @pflash -003f ... ff40-ff7f (prio 0, romd): alias pflash-alias @pflash -003f ff80-ffbf (prio 0, romd): alias pflash-alias @pflash -003f ffc0- (prio 0, romd): alias pflash-alias @pflash -003f * after: (qemu) info mtree address-space: memory - (prio 0, i/o): system -03ff (prio 0, ram): ram c021-c02100ff (prio 0, i/o): digic-timer c0210100-c02101ff (prio 0, i/o): digic-timer c0210200-c02102ff (prio 0, i/o): digic-timer c080-c0800017 (prio 0, i/o): digic-uart f800- (prio 0, i/o): masked pflash [span of 4 MiB] f800-f83f (prio 0, romd): alias pflash [#0/32] @pflash -003f f840-f87f (prio 0, romd): alias pflash [#1/32] @pflash -003f f880-f8bf (prio 0, romd): alias pflash [#2/32] @pflash -003f ... ff40-ff7f (prio 0, romd): alias pflash [#29/32] @pflash -003f ff80-ffbf (prio 0, romd): alias pflash [#30/32] @pflash -003f ffc0- (prio 0, romd): alias pflash [#31/32] @pflash -003f Signed-off-by: Philippe Mathieu-Daudé --- hw/arm/digic_boards.c | 8 +--- hw/arm/Kconfig| 1 + 2 files changed, 6 insertions(+), 3 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v2 3/7] hw/arm/musicpal: Map flash using memory_region_add_subregion_aliased()
On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote: Instead of using a device specific feature for mapping the flash memory multiple times over a wider region, use the generic memory_region_add_subregion_aliased() helper. There is no change in the memory layout: - before: (qemu) info mtree fe00- (prio 0, i/o): pflash fe00-fe7f (prio 0, romd): alias pflash-alias @musicpal.flash -007f fe80-feff (prio 0, romd): alias pflash-alias @musicpal.flash -007f ff00-ff7f (prio 0, romd): alias pflash-alias @musicpal.flash -007f ff80- (prio 0, romd): alias pflash-alias @musicpal.flash -007f - after: fe00- (prio 0, i/o): masked musicpal.flash [span of 8 MiB] fe00-fe7f (prio 0, romd): alias musicpal.flash [#0/4] @musicpal.flash -007f fe80-feff (prio 0, romd): alias musicpal.flash [#1/4] @musicpal.flash -007f ff00-ff7f (prio 0, romd): alias musicpal.flash [#2/4] @musicpal.flash -007f ff80- (prio 0, romd): alias musicpal.flash [#3/4] @musicpal.flash -007f Flatview is the same: (qemu) info mtree -f FlatView #0 AS "memory", root: system AS "cpu-memory-0", root: system AS "emac-dma", root: system Root memory region: system fe00-fe7f (prio 0, romd): musicpal.flash fe80-feff (prio 0, romd): musicpal.flash ff00-ff7f (prio 0, romd): musicpal.flash ff80- (prio 0, romd): musicpal.flash Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson r~
Re: [PATCH v2 4/7] hw/arm/digic: Open-code pflash_cfi02_register() call
On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote: To be able to manually map the flash region on the main memory (in the next commit), first expand the pflash_cfi02_register in place. Signed-off-by: Philippe Mathieu-Daudé --- hw/arm/digic_boards.c | 27 +-- 1 file changed, 21 insertions(+), 6 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v2 2/7] hw/arm/musicpal: Open-code pflash_cfi02_register() call
On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote: To be able to manually map the flash region on the main memory (in the next commit), first expand the pflash_cfi02_register in place. Signed-off-by: Philippe Mathieu-Daudé --- hw/arm/musicpal.c | 27 +-- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/hw/arm/musicpal.c b/hw/arm/musicpal.c index 9cebece2de0..8b58b66f263 100644 --- a/hw/arm/musicpal.c +++ b/hw/arm/musicpal.c @@ -10,6 +10,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "qapi/error.h" #include "cpu.h" #include "hw/sysbus.h" @@ -1640,6 +1641,7 @@ static void musicpal_init(MachineState *machine) /* Register flash */ dinfo = drive_get(IF_PFLASH, 0, 0); if (dinfo) { +static const size_t sector_size = 64 * KiB; Drop the static. We do not need permanent storage for this. Otherwise, Reviewed-by: Richard Henderson r~
Re: [RFC PATCH v2 1/7] hw/misc: Add device to help managing aliased memory regions
On 4/19/21 2:43 AM, Philippe Mathieu-Daudé wrote: Not really RFC, simply that I'v to add the technical description, but I'd like to know if there could be a possibility to not accept this device (because I missed something) before keeping working on it. So far it is only used in hobbyist boards. Cc: Peter Xu Cc: Paolo Bonzini --- include/hw/misc/aliased_region.h | 87 hw/misc/aliased_region.c | 132 +++ MAINTAINERS | 6 ++ hw/misc/Kconfig | 3 + hw/misc/meson.build | 1 + 5 files changed, 229 insertions(+) create mode 100644 include/hw/misc/aliased_region.h create mode 100644 hw/misc/aliased_region.c Looks reasonable to me. +subregion_bits = 64 - clz64(s->span_size - 1); +s->mem.count = s->region_size >> subregion_bits; +assert(s->mem.count > 1); +subregion_size = 1ULL << subregion_bits; So... subregion_size = pow2floor(s->span_size)? Why use a floor-ish computation here and pow2ceil later in aliased_mr_realize? Why use either floor or ceil as opposed to asserting that the size is in fact a power of 2? Why must the region be a power of 2, as opposed to e.g. a multiple of page_size? Or just the most basic requirement that region_size be a multiple of span_size? r~
Re: [PATCH v3 02/33] block/nbd: fix how state is cleared on nbd_open() failure paths
21.04.2021 17:00, Roman Kagan wrote: On Fri, Apr 16, 2021 at 11:08:40AM +0300, Vladimir Sementsov-Ogievskiy wrote: We have two "return error" paths in nbd_open() after nbd_process_options(). Actually we should call nbd_clear_bdrvstate() on these paths. Interesting that nbd_process_options() calls nbd_clear_bdrvstate() by itself. Let's fix leaks and refactor things to be more obvious: - intialize yank at top of nbd_open() - move yank cleanup to nbd_clear_bdrvstate() - refactor nbd_open() so that all failure paths except for yank-register goes through nbd_clear_bdrvstate() Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/nbd.c | 36 ++-- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index 739ae2941f..a407a3814b 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -152,8 +152,12 @@ static void nbd_co_establish_connection_cancel(BlockDriverState *bs, static int nbd_client_handshake(BlockDriverState *bs, Error **errp); static void nbd_yank(void *opaque); -static void nbd_clear_bdrvstate(BDRVNBDState *s) +static void nbd_clear_bdrvstate(BlockDriverState *bs) { +BDRVNBDState *s = (BDRVNBDState *)bs->opaque; + +yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name)); + object_unref(OBJECT(s->tlscreds)); qapi_free_SocketAddress(s->saddr); s->saddr = NULL; @@ -2279,9 +2283,6 @@ static int nbd_process_options(BlockDriverState *bs, QDict *options, ret = 0; error: -if (ret < 0) { -nbd_clear_bdrvstate(s); -} qemu_opts_del(opts); return ret; } @@ -2292,11 +2293,6 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, int ret; BDRVNBDState *s = (BDRVNBDState *)bs->opaque; -ret = nbd_process_options(bs, options, errp); -if (ret < 0) { -return ret; -} - s->bs = bs; qemu_co_mutex_init(>send_mutex); qemu_co_queue_init(>free_sema); @@ -2305,20 +2301,23 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, return -EEXIST; } +ret = nbd_process_options(bs, options, errp); +if (ret < 0) { +goto fail; +} + /* 
* establish TCP connection, return error if it fails * TODO: Configurable retry-until-timeout behaviour. */ if (nbd_establish_connection(bs, s->saddr, errp) < 0) { -yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name)); -return -ECONNREFUSED; +ret = -ECONNREFUSED; +goto fail; } ret = nbd_client_handshake(bs, errp); Not that this was introduced by this patch, but once you're at it: AFAICT nbd_client_handshake() calls yank_unregister_instance() on some error path(s); I assume this needs to go too, otherwise it's called twice (and asserts). Roman. No, nbd_client_handshake() only calls yank_unregister_function(), not instance. -- Best regards, Vladimir
Re: [PATCH] monitor: hmp_qemu_io: acquire aio contex, fix crash
21.04.2021 22:47, Philippe Mathieu-Daudé wrote: On 4/21/21 10:32 AM, Vladimir Sementsov-Ogievskiy wrote: Max reported the following bug: $ ./qemu-img create -f raw src.img 1G $ ./qemu-img create -f raw dst.img 1G $ (echo ' {"execute":"qmp_capabilities"} {"execute":"blockdev-mirror", "arguments":{"job-id":"mirror", "device":"source", "target":"target", "sync":"full", "filter-node-name":"mirror-top"}} '; sleep 3; echo' {"execute":"human-monitor-command", "arguments":{"command-line": "qemu-io mirror-top \"write 0 1G\""}}') \ | x86_64-softmmu/qemu-system-x86_64 \ -qmp stdio \ -blockdev file,node-name=source,filename=src.img \ -blockdev file,node-name=target,filename=dst.img \ -object iothread,id=iothr0 \ -device virtio-blk,drive=source,iothread=iothr0 crashes: 0 raise () at /usr/lib/libc.so.6 1 abort () at /usr/lib/libc.so.6 2 error_exit (err=, msg=msg@entry=0x55fbb1634790 <__func__.27> "qemu_mutex_unlock_impl") at ../util/qemu-thread-posix.c:37 3 qemu_mutex_unlock_impl (mutex=mutex@entry=0x55fbb25ab6e0, file=file@entry=0x55fbb1636957 "../util/async.c", line=line@entry=650) at ../util/qemu-thread-posix.c:109 4 aio_context_release (ctx=ctx@entry=0x55fbb25ab680) at ../util/async.c:650 5 bdrv_do_drained_begin (bs=bs@entry=0x55fbb3a87000, recursive=recursive@entry=false, parent=parent@entry=0x0, ignore_bds_parents=ignore_bds_parents@entry=false, poll=poll@entry=true) at ../block/io.c:441 6 bdrv_do_drained_begin (poll=true, ignore_bds_parents=false, parent=0x0, recursive=false, bs=0x55fbb3a87000) at ../block/io.c:448 7 blk_drain (blk=0x55fbb26c5a00) at ../block/block-backend.c:1718 8 blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:498 9 blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:491 10 hmp_qemu_io (mon=0x7fffaf3fc7d0, qdict=) at ../block/monitor/block-hmp-cmds.c:628 man pthread_mutex_unlock ... EPERM The mutex type is PTHREAD_MUTEX_ERRORCHECK or PTHREAD_MUTEX_RECURSIVE, or the mutex is a robust mutex, and the current thread does not own the mutex. 
So, thread doesn't own the mutex. And we have iothread here. Next, note that AIO_WAIT_WHILE() documents that ctx must be acquired exactly once by caller. But where is it acquired in the call stack? Seems nowhere. qemuio_command do acquire aio context.. But we need context acquired around blk_unref as well. Let's do it. Reported-by: Max Reitz Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/monitor/block-hmp-cmds.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index ebf1033f31..934100d0eb 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -559,6 +559,7 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) { BlockBackend *blk; BlockBackend *local_blk = NULL; +AioContext *ctx; bool qdev = qdict_get_try_bool(qdict, "qdev", false); const char *device = qdict_get_str(qdict, "device"); const char *command = qdict_get_str(qdict, "command"); @@ -615,7 +616,13 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) qemuio_command(blk, command); fail: +ctx = blk_get_aio_context(blk); +aio_context_acquire(ctx); + blk_unref(local_blk); + +aio_context_release(ctx); I dare to mention "code smell" here... Not to your fix, but to the API. Can't we simplify it somehow? Maybe we can't, I don't understand it well. But it seems bug prone, and expensive in human brain resources (either develop, debug or review). Better is move hmp_qemu_io to coroutine together with all called functions and qemu-io commands.. But it's a lot more work. -- Best regards, Vladimir
Re: [PATCH v2 4/8] block: make before-write notifiers thread-safe
22.04.2021 00:23, Vladimir Sementsov-Ogievskiy wrote: 19.04.2021 11:55, Emanuele Giuseppe Esposito wrote: Reads access the list in RCU style, so be careful to avoid use-after-free scenarios in the backup block job. Apart from this, all that's needed is protecting updates with a mutex. Note that backup doesn't use write-notifiers now. Probably best thing to do is to update other users to use filters instead of write notifiers, and drop write notifiers at all. But it may require more work, so don't care. But on the other hand.. Why not. We can go without filter and still drop write-notifiers. Look at my new patch "[PATCH] block: simplify write-threshold and drop write notifiers" -- Best regards, Vladimir
[PATCH] block: simplify write-threshold and drop write notifiers
write-notifiers are used only for write-threshold. New code for such purpose should create filters. Let's handle write-threshold simply in generic code and drop write notifiers at all. Also move part of write-threshold API that is used only for testing to the test. Signed-off-by: Vladimir Sementsov-Ogievskiy --- I agree that this could be split into 2-3 parts and not combining everything into one. But I'm tired now. I can send v2 if needed, so consider it as RFC. Or take as is if you think it's not too much-in-one. I also suggest this as a prepartion (and partly substitution) for "[PATCH v2 0/8] Block layer thread-safety, continued" include/block/block_int.h | 12 - include/block/write-threshold.h | 24 - block.c | 1 - block/io.c| 21 +--- block/write-threshold.c | 87 ++- tests/unit/test-write-threshold.c | 38 ++ 6 files changed, 54 insertions(+), 129 deletions(-) diff --git a/include/block/block_int.h b/include/block/block_int.h index 88e4111939..50af58af75 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -957,12 +957,8 @@ struct BlockDriverState { */ int64_t total_sectors; -/* Callback before write request is processed */ -NotifierWithReturnList before_write_notifiers; - /* threshold limit for writes, in bytes. "High water mark". */ uint64_t write_threshold_offset; -NotifierWithReturn write_threshold_notifier; /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex. * Reading from the list can be done with either the BQL or the @@ -1087,14 +1083,6 @@ void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix, bool bdrv_backing_overridden(BlockDriverState *bs); -/** - * bdrv_add_before_write_notifier: - * - * Register a callback that is invoked before write requests are processed but - * after any throttling or waiting for overlapping requests. 
- */ -void bdrv_add_before_write_notifier(BlockDriverState *bs, -NotifierWithReturn *notifier); /** * bdrv_add_aio_context_notifier: diff --git a/include/block/write-threshold.h b/include/block/write-threshold.h index c646f267a4..23e1bfc123 100644 --- a/include/block/write-threshold.h +++ b/include/block/write-threshold.h @@ -35,28 +35,4 @@ void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes); */ uint64_t bdrv_write_threshold_get(const BlockDriverState *bs); -/* - * bdrv_write_threshold_is_set - * - * Tell if a write threshold is set for a given BDS. - */ -bool bdrv_write_threshold_is_set(const BlockDriverState *bs); - -/* - * bdrv_write_threshold_exceeded - * - * Return the extent of a write request that exceeded the threshold, - * or zero if the request is below the threshold. - * Return zero also if the threshold was not set. - * - * NOTE: here we assume the following holds for each request this code - * deals with: - * - * assert((req->offset + req->bytes) <= UINT64_MAX) - * - * Please not there is *not* an actual C assert(). 
- */ -uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs, - const BdrvTrackedRequest *req); - #endif diff --git a/block.c b/block.c index c5b887cec1..001453105e 100644 --- a/block.c +++ b/block.c @@ -381,7 +381,6 @@ BlockDriverState *bdrv_new(void) for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) { QLIST_INIT(>op_blockers[i]); } -notifier_with_return_list_init(>before_write_notifiers); qemu_co_mutex_init(>reqs_lock); qemu_mutex_init(>dirty_bitmap_mutex); bs->refcnt = 1; diff --git a/block/io.c b/block/io.c index ca2dca3007..e0aa775952 100644 --- a/block/io.c +++ b/block/io.c @@ -36,6 +36,8 @@ #include "qemu/main-loop.h" #include "sysemu/replay.h" +#include "qapi/qapi-events-block-core.h" + /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */ #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) @@ -1974,6 +1976,8 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes, child->perm & BLK_PERM_RESIZE); switch (req->type) { +uint64_t write_threshold; + case BDRV_TRACKED_WRITE: case BDRV_TRACKED_DISCARD: if (flags & BDRV_REQ_WRITE_UNCHANGED) { @@ -1981,8 +1985,15 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes, } else { assert(child->perm & BLK_PERM_WRITE); } -return notifier_with_return_list_notify(>before_write_notifiers, -req); +write_threshold = qatomic_read(>write_threshold_offset); +if (write_threshold > 0 && offset + bytes > write_threshold) { +qapi_event_send_block_write_threshold( +bs->node_name, +offset + bytes - write_threshold, +write_threshold); +
[PATCH v4 1/2] iotests/231: Update expected deprecation message
The deprecation message in the expected output has technically been wrong since the wrong version of a patch was applied to it. Because of this, the test fails. Correct the expected output so that it passes. Signed-off-by: Connor Kuehl Reviewed-by: Max Reitz Reviewed-by: Stefano Garzarella --- v3 -> v4: * Added Stefano's s-o-b to the commit message tests/qemu-iotests/231.out | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/qemu-iotests/231.out b/tests/qemu-iotests/231.out index 579ba11c16..747dd221bb 100644 --- a/tests/qemu-iotests/231.out +++ b/tests/qemu-iotests/231.out @@ -1,9 +1,7 @@ QA output created by 231 -qemu-img: RBD options encoded in the filename as keyvalue pairs is deprecated. Future versions may cease to parse these options in the future. +qemu-img: warning: RBD options encoded in the filename as keyvalue pairs is deprecated unable to get monitor info from DNS SRV with service name: ceph-mon -no monitors specified to connect to. qemu-img: Could not open 'json:{'file.driver':'rbd','file.filename':'rbd:rbd/bogus:conf=BOGUS_CONF'}': error connecting: No such file or directory unable to get monitor info from DNS SRV with service name: ceph-mon -no monitors specified to connect to. qemu-img: Could not open 'json:{'file.driver':'rbd','file.pool':'rbd','file.image':'bogus','file.conf':'BOGUS_CONF'}': error connecting: No such file or directory *** done -- 2.30.2
[PATCH v4 0/2] Fix segfault in qemu_rbd_parse_filename
Connor Kuehl (2): iotests/231: Update expected deprecation message block/rbd: Add an escape-aware strchr helper block/rbd.c| 32 +--- tests/qemu-iotests/231 | 4 tests/qemu-iotests/231.out | 7 --- 3 files changed, 29 insertions(+), 14 deletions(-) -- 2.30.2
[PATCH v4 2/2] block/rbd: Add an escape-aware strchr helper
Sometimes the parser needs to further split a token it has collected from the token input stream. Right now, it does a cursory check to see if the relevant characters appear in the token to determine if it should break it down further. However, qemu_rbd_next_tok() will escape characters as it removes tokens from the token stream and plain strchr() won't. This can make the initial strchr() check slightly misleading since it implies qemu_rbd_next_tok() will find the token and split on it, except the reality is that qemu_rbd_next_tok() will pass over it if it is escaped. Use a custom strchr to avoid mixing escaped and unescaped string operations. Furthermore, this code is identical to how qemu_rbd_next_tok() seeks its next token, so incorporate this custom strchr into the body of that function to reduce duplication. Reported-by: Han Han Fixes: https://bugzilla.redhat.com/1873913 Signed-off-by: Connor Kuehl --- v3 -> v4: * Replace qemu_rbd_next_tok() seek loop with qemu_rbd_strchr() since they're identical block/rbd.c| 32 +--- tests/qemu-iotests/231 | 4 tests/qemu-iotests/231.out | 3 +++ 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index f098a89c7b..26f64cce7c 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -113,21 +113,31 @@ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx, const char *keypairs, const char *secretid, Error **errp); +static char *qemu_rbd_strchr(char *src, char delim) +{ +char *p; + +for (p = src; *p; ++p) { +if (*p == delim) { +return p; +} +if (*p == '\\' && p[1] != '\0') { +++p; +} +} + +return NULL; +} + + static char *qemu_rbd_next_tok(char *src, char delim, char **p) { char *end; *p = NULL; -for (end = src; *end; ++end) { -if (*end == delim) { -break; -} -if (*end == '\\' && end[1] != '\0') { -end++; -} -} -if (*end == delim) { +end = qemu_rbd_strchr(src, delim); +if (end) { *p = end + 1; *end = '\0'; } @@ -171,7 +181,7 @@ static void qemu_rbd_parse_filename(const char *filename, QDict 
*options, qemu_rbd_unescape(found_str); qdict_put_str(options, "pool", found_str); -if (strchr(p, '@')) { +if (qemu_rbd_strchr(p, '@')) { image_name = qemu_rbd_next_tok(p, '@', ); found_str = qemu_rbd_next_tok(p, ':', ); @@ -181,7 +191,7 @@ static void qemu_rbd_parse_filename(const char *filename, QDict *options, image_name = qemu_rbd_next_tok(p, ':', ); } /* Check for namespace in the image_name */ -if (strchr(image_name, '/')) { +if (qemu_rbd_strchr(image_name, '/')) { found_str = qemu_rbd_next_tok(image_name, '/', _name); qemu_rbd_unescape(found_str); qdict_put_str(options, "namespace", found_str); diff --git a/tests/qemu-iotests/231 b/tests/qemu-iotests/231 index 0f66d0ca36..8e6c6447c1 100755 --- a/tests/qemu-iotests/231 +++ b/tests/qemu-iotests/231 @@ -55,6 +55,10 @@ _filter_conf() $QEMU_IMG info "json:{'file.driver':'rbd','file.filename':'rbd:rbd/bogus:conf=${BOGUS_CONF}'}" 2>&1 | _filter_conf $QEMU_IMG info "json:{'file.driver':'rbd','file.pool':'rbd','file.image':'bogus','file.conf':'${BOGUS_CONF}'}" 2>&1 | _filter_conf +# Regression test: the qemu-img invocation is expected to fail, but it should +# not seg fault the parser. 
+$QEMU_IMG create "rbd:rbd/aa\/bb:conf=${BOGUS_CONF}" 1M 2>&1 | _filter_conf + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/231.out b/tests/qemu-iotests/231.out index 747dd221bb..a785a6e859 100644 --- a/tests/qemu-iotests/231.out +++ b/tests/qemu-iotests/231.out @@ -4,4 +4,7 @@ unable to get monitor info from DNS SRV with service name: ceph-mon qemu-img: Could not open 'json:{'file.driver':'rbd','file.filename':'rbd:rbd/bogus:conf=BOGUS_CONF'}': error connecting: No such file or directory unable to get monitor info from DNS SRV with service name: ceph-mon qemu-img: Could not open 'json:{'file.driver':'rbd','file.pool':'rbd','file.image':'bogus','file.conf':'BOGUS_CONF'}': error connecting: No such file or directory +Formatting 'rbd:rbd/aa\/bb:conf=BOGUS_CONF', fmt=raw size=1048576 +unable to get monitor info from DNS SRV with service name: ceph-mon +qemu-img: rbd:rbd/aa\/bb:conf=BOGUS_CONF: error connecting: No such file or directory *** done -- 2.30.2
Re: [PATCH v2 4/8] block: make before-write notifiers thread-safe
19.04.2021 11:55, Emanuele Giuseppe Esposito wrote: Reads access the list in RCU style, so be careful to avoid use-after-free scenarios in the backup block job. Apart from this, all that's needed is protecting updates with a mutex. Note that backup doesn't use write-notifiers now. Probably best thing to do is to update other users to use filters instead of write notifiers, and drop write notifiers at all. But it may require more work, so don't care. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Signed-off-by: Emanuele Giuseppe Esposito --- block.c | 6 +++--- block/io.c| 12 block/write-threshold.c | 2 +- include/block/block_int.h | 37 + 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/block.c b/block.c index c5b887cec1..f40fb63c75 100644 --- a/block.c +++ b/block.c @@ -6434,7 +6434,7 @@ static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban) g_free(ban); } -static void bdrv_detach_aio_context(BlockDriverState *bs) +void bdrv_detach_aio_context(BlockDriverState *bs) why it become public? It's not used in this commit, and this change is unrelated to commit message.. 
{ BdrvAioNotifier *baf, *baf_tmp; @@ -6462,8 +6462,8 @@ static void bdrv_detach_aio_context(BlockDriverState *bs) bs->aio_context = NULL; } -static void bdrv_attach_aio_context(BlockDriverState *bs, -AioContext *new_context) +void bdrv_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) { BdrvAioNotifier *ban, *ban_tmp; diff --git a/block/io.c b/block/io.c index ca2dca3007..8f6af6077b 100644 --- a/block/io.c +++ b/block/io.c @@ -3137,10 +3137,22 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) return true; } +static QemuSpin notifiers_spin_lock; + void bdrv_add_before_write_notifier(BlockDriverState *bs, NotifierWithReturn *notifier) { +qemu_spin_lock(_spin_lock); notifier_with_return_list_add(>before_write_notifiers, notifier); +qemu_spin_unlock(_spin_lock); +} + +void bdrv_remove_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier) +{ +qemu_spin_lock(_spin_lock); +notifier_with_return_remove(notifier); +qemu_spin_unlock(_spin_lock); } void bdrv_io_plug(BlockDriverState *bs) diff --git a/block/write-threshold.c b/block/write-threshold.c index 77c74bdaa7..b87b749b02 100644 --- a/block/write-threshold.c +++ b/block/write-threshold.c @@ -32,7 +32,7 @@ bool bdrv_write_threshold_is_set(const BlockDriverState *bs) static void write_threshold_disable(BlockDriverState *bs) { if (bdrv_write_threshold_is_set(bs)) { -notifier_with_return_remove(>write_threshold_notifier); +bdrv_remove_before_write_notifier(bs, >write_threshold_notifier); bs->write_threshold_offset = 0; } } diff --git a/include/block/block_int.h b/include/block/block_int.h index 88e4111939..a1aad5ad2d 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1089,6 +1089,8 @@ bool bdrv_backing_overridden(BlockDriverState *bs); /** * bdrv_add_before_write_notifier: + * @bs: The #BlockDriverState for which to register the notifier + * @notifier: The notifier to add * * Register a callback that is invoked before write requests are 
processed but * after any throttling or waiting for overlapping requests. @@ -1096,6 +1098,41 @@ bool bdrv_backing_overridden(BlockDriverState *bs); void bdrv_add_before_write_notifier(BlockDriverState *bs, NotifierWithReturn *notifier); +/** + * bdrv_remove_before_write_notifier: + * @bs: The #BlockDriverState for which to register the notifier + * @notifier: The notifier to add + * + * Unregister a callback that is invoked before write requests are processed but + * after any throttling or waiting for overlapping requests. + * + * Note that more I/O could be pending on @bs. You need to wait for + * it to finish, for example with bdrv_drain(), before freeing @notifier. + */ +void bdrv_remove_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier); + +/** + * bdrv_detach_aio_context: + * + * May be called from .bdrv_detach_aio_context() to detach children from the + * current #AioContext. This is only needed by block drivers that manage their + * own children. Both ->file and ->backing are automatically handled and + * block drivers should not call this function on them explicitly. + */ +void bdrv_detach_aio_context(BlockDriverState *bs); + +/** + * bdrv_attach_aio_context: + * + * May be called from .bdrv_attach_aio_context() to attach children to the new + * #AioContext. This is only needed by block drivers that manage their own + *
Re: [PATCH v3 2/2] block/rbd: Add an escape-aware strchr helper
On 4/21/21 6:04 AM, Stefano Garzarella wrote: >> +static char *qemu_rbd_strchr(char *src, char delim) >> +{ >> +char *p; >> + >> +for (p = src; *p; ++p) { >> +if (*p == delim) { >> +return p; >> +} >> +if (*p == '\\' && p[1] != '\0') { >> +++p; >> +} >> +} >> + >> +return NULL; >> +} >> + > > IIUC this is similar to the code used in qemu_rbd_next_tok(). > To avoid code duplication can we use this new function inside it? Hi Stefano! Good catch. I think you're right. I've incorporated your suggestion into my next revision. The only thing I changed was the if-statement from: if (end && *end == delim) { to: if (end) { Since qemu_rbd_strchr() will only ever return a non-NULL pointer if it was able to find 'delim'. Connor > > I mean something like this (not tested): > > diff --git a/block/rbd.c b/block/rbd.c > index f098a89c7b..eb6a839362 100644 > --- a/block/rbd.c > +++ b/block/rbd.c > @@ -119,15 +119,8 @@ static char *qemu_rbd_next_tok(char *src, char delim, > char **p) > > *p = NULL; > > -for (end = src; *end; ++end) { > -if (*end == delim) { > -break; > -} > -if (*end == '\\' && end[1] != '\0') { > -end++; > -} > -} > -if (*end == delim) { > +end = qemu_rbd_strchr(src, delim); > +if (end && *end == delim) { > *p = end + 1; > *end = '\0'; > } > > > The rest LGTM! > > Thanks for fixing this issue, > Stefano >
Re: [PATCH v3 2/3] vhost-user-blk: perform immediate cleanup if disconnect on initialization
On Wed, Apr 21, 2021 at 07:13:24PM +0300, Denis Plotnikov wrote: > > On 21.04.2021 18:24, Kevin Wolf wrote: > > Am 25.03.2021 um 16:12 hat Denis Plotnikov geschrieben: > > > Commit 4bcad76f4c39 ("vhost-user-blk: delay vhost_user_blk_disconnect") > > > introduced postponing vhost_dev cleanup aiming to eliminate qemu aborts > > > because of connection problems with vhost-blk daemon. > > > > > > However, it introdues a new problem. Now, any communication errors > > > during execution of vhost_dev_init() called by > > > vhost_user_blk_device_realize() > > > lead to qemu abort on assert in vhost_dev_get_config(). > > > > > > This happens because vhost_user_blk_disconnect() is postponed but > > > it should have dropped s->connected flag by the time > > > vhost_user_blk_device_realize() performs a new connection opening. > > > On the connection opening, vhost_dev initialization in > > > vhost_user_blk_connect() relies on s->connection flag and > > > if it's not dropped, it skips vhost_dev initialization and returns > > > with success. Then, vhost_user_blk_device_realize()'s execution flow > > > goes to vhost_dev_get_config() where it's aborted on the assert. > > > > > > To fix the problem this patch adds immediate cleanup on device > > > initialization(in vhost_user_blk_device_realize()) using different > > > event handlers for initialization and operation introduced in the > > > previous patch. > > > On initialization (in vhost_user_blk_device_realize()) we fully > > > control the initialization process. At that point, nobody can use the > > > device since it isn't initialized and we don't need to postpone any > > > cleanups, so we can do cleaup right away when there is a communication > > > problem with the vhost-blk daemon. > > > On operation we leave it as is, since the disconnect may happen when > > > the device is in use, so the device users may want to use vhost_dev's data > > > to do rollback before vhost_dev is re-initialized (e.g. in > > > vhost_dev_set_log()). 
> > > > > > Signed-off-by: Denis Plotnikov > > > Reviewed-by: Raphael Norwitz > > I think there is something wrong with this patch. > > > > I'm debugging an error case, specifically num-queues being larger in > > QEMU that in the vhost-user-blk export. Before this patch, it has just > > an unfriendly error message: > > > > qemu-system-x86_64: -device > > vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4: > > Unexpected end-of-file before all data were read > > qemu-system-x86_64: -device > > vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4: > > Failed to read msg header. Read 0 instead of 12. Original request 24. > > qemu-system-x86_64: -device > > vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4: > > vhost-user-blk: get block config failed > > qemu-system-x86_64: Failed to set msg fds. > > qemu-system-x86_64: vhost VQ 0 ring restore failed: -1: Resource > > temporarily unavailable (11) > > > > After the patch, it crashes: > > > > #0 0x55d0a4bd in vhost_user_read_cb (source=0x568f4690, > > condition=(G_IO_IN | G_IO_HUP), opaque=0x7fffcbf0) at > > ../hw/virtio/vhost-user.c:313 > > #1 0x55d950d3 in qio_channel_fd_source_dispatch > > (source=0x57c3f750, callback=0x55d0a478 , > > user_data=0x7fffcbf0) at ../io/channel-watch.c:84 > > #2 0x77b32a9f in g_main_context_dispatch () at > > /lib64/libglib-2.0.so.0 > > #3 0x77b84a98 in g_main_context_iterate.constprop () at > > /lib64/libglib-2.0.so.0 > > #4 0x77b32163 in g_main_loop_run () at /lib64/libglib-2.0.so.0 > > #5 0x55d0a724 in vhost_user_read (dev=0x57bc62f8, > > msg=0x7fffcc50) at ../hw/virtio/vhost-user.c:402 > > #6 0x55d0ee6b in vhost_user_get_config (dev=0x57bc62f8, > > config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost-user.c:2133 > > #7 0x55d56d46 in vhost_dev_get_config (hdev=0x57bc62f8, > > config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost.c:1566 > > #8 0x55cdd150 in 
vhost_user_blk_device_realize > > (dev=0x57bc60b0, errp=0x7fffcf90) at > > ../hw/block/vhost-user-blk.c:510 > > #9 0x55d08f6d in virtio_device_realize (dev=0x57bc60b0, > > errp=0x7fffcff0) at ../hw/virtio/virtio.c:3660 > > > > The problem is that vhost_user_read_cb() still accesses dev->opaque even > > though the device has been cleaned up meanwhile when the connection was > > closed (the vhost_user_blk_disconnect() added by this patch), so it's > > NULL now. This problem was actually mentioned in the comment that is > > removed by this patch. > > > > I tried to fix this by making vhost_user_read() cope with the fact that > > the device might have been cleaned up meanwhile, but then I'm running > > into the next set of problems. > > > > The first is that retrying is pointless, the error condition is in
Re: [PATCH] monitor: hmp_qemu_io: acquire aio context, fix crash
On 4/21/21 10:32 AM, Vladimir Sementsov-Ogievskiy wrote: > Max reported the following bug: > > $ ./qemu-img create -f raw src.img 1G > $ ./qemu-img create -f raw dst.img 1G > > $ (echo ' >{"execute":"qmp_capabilities"} >{"execute":"blockdev-mirror", > "arguments":{"job-id":"mirror", > "device":"source", > "target":"target", > "sync":"full", > "filter-node-name":"mirror-top"}} > '; sleep 3; echo ' >{"execute":"human-monitor-command", > "arguments":{"command-line": > "qemu-io mirror-top \"write 0 1G\""}}') \ > | x86_64-softmmu/qemu-system-x86_64 \ >-qmp stdio \ >-blockdev file,node-name=source,filename=src.img \ >-blockdev file,node-name=target,filename=dst.img \ >-object iothread,id=iothr0 \ >-device virtio-blk,drive=source,iothread=iothr0 > > crashes: > > 0 raise () at /usr/lib/libc.so.6 > 1 abort () at /usr/lib/libc.so.6 > 2 error_exit >(err=, >msg=msg@entry=0x55fbb1634790 <__func__.27> "qemu_mutex_unlock_impl") >at ../util/qemu-thread-posix.c:37 > 3 qemu_mutex_unlock_impl >(mutex=mutex@entry=0x55fbb25ab6e0, >file=file@entry=0x55fbb1636957 "../util/async.c", >line=line@entry=650) >at ../util/qemu-thread-posix.c:109 > 4 aio_context_release (ctx=ctx@entry=0x55fbb25ab680) at ../util/async.c:650 > 5 bdrv_do_drained_begin >(bs=bs@entry=0x55fbb3a87000, recursive=recursive@entry=false, >parent=parent@entry=0x0, >ignore_bds_parents=ignore_bds_parents@entry=false, >poll=poll@entry=true) at ../block/io.c:441 > 6 bdrv_do_drained_begin >(poll=true, ignore_bds_parents=false, parent=0x0, recursive=false, >bs=0x55fbb3a87000) at ../block/io.c:448 > 7 blk_drain (blk=0x55fbb26c5a00) at ../block/block-backend.c:1718 > 8 blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:498 > 9 blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:491 > 10 hmp_qemu_io (mon=0x7fffaf3fc7d0, qdict=) >at ../block/monitor/block-hmp-cmds.c:628 > > man pthread_mutex_unlock > ... 
> EPERM The mutex type is PTHREAD_MUTEX_ERRORCHECK or > PTHREAD_MUTEX_RECURSIVE, or the mutex is a robust mutex, and the > current thread does not own the mutex. > > So, thread doesn't own the mutex. And we have iothread here. > > Next, note that AIO_WAIT_WHILE() documents that ctx must be acquired > exactly once by caller. But where is it acquired in the call stack? > Seems nowhere. > > qemuio_command do acquire aio context.. But we need context acquired > around blk_unref as well. Let's do it. > > Reported-by: Max Reitz > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > block/monitor/block-hmp-cmds.c | 7 +++ > 1 file changed, 7 insertions(+) > > diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c > index ebf1033f31..934100d0eb 100644 > --- a/block/monitor/block-hmp-cmds.c > +++ b/block/monitor/block-hmp-cmds.c > @@ -559,6 +559,7 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) > { > BlockBackend *blk; > BlockBackend *local_blk = NULL; > +AioContext *ctx; > bool qdev = qdict_get_try_bool(qdict, "qdev", false); > const char *device = qdict_get_str(qdict, "device"); > const char *command = qdict_get_str(qdict, "command"); > @@ -615,7 +616,13 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) > qemuio_command(blk, command); > > fail: > +ctx = blk_get_aio_context(blk); > +aio_context_acquire(ctx); > + > blk_unref(local_blk); > + > +aio_context_release(ctx); I dare to mention "code smell" here... Not to your fix, but to the API. Can't we simplify it somehow? Maybe we can't, I don't understand it well. But it seems bug prone, and expensive in human brain resources (either develop, debug or review). > hmp_handle_error(mon, err); > } > >
Re: [PATCH 14/14] hw/nvme: move nvme emulation out of hw/block
On Apr 19 21:28, Klaus Jensen wrote: From: Klaus Jensen With the introduction of the nvme-subsystem device we are really cluttering up the hw/block directory. As suggested by Philippe previously, move the nvme emulation to hw/nvme. Suggested-by: Philippe Mathieu-Daudé Hi Philippe, You originally suggested this. Do you know if an Ack is required from anyone "up in the food chain" before being PR'ed to Peter? Cheers, Klaus signature.asc Description: PGP signature
Re: [PATCH v3 2/3] vhost-user-blk: perform immediate cleanup if disconnect on initialization
On 21.04.2021 18:24, Kevin Wolf wrote: Am 25.03.2021 um 16:12 hat Denis Plotnikov geschrieben: Commit 4bcad76f4c39 ("vhost-user-blk: delay vhost_user_blk_disconnect") introduced postponing vhost_dev cleanup aiming to eliminate qemu aborts because of connection problems with vhost-blk daemon. However, it introdues a new problem. Now, any communication errors during execution of vhost_dev_init() called by vhost_user_blk_device_realize() lead to qemu abort on assert in vhost_dev_get_config(). This happens because vhost_user_blk_disconnect() is postponed but it should have dropped s->connected flag by the time vhost_user_blk_device_realize() performs a new connection opening. On the connection opening, vhost_dev initialization in vhost_user_blk_connect() relies on s->connection flag and if it's not dropped, it skips vhost_dev initialization and returns with success. Then, vhost_user_blk_device_realize()'s execution flow goes to vhost_dev_get_config() where it's aborted on the assert. To fix the problem this patch adds immediate cleanup on device initialization(in vhost_user_blk_device_realize()) using different event handlers for initialization and operation introduced in the previous patch. On initialization (in vhost_user_blk_device_realize()) we fully control the initialization process. At that point, nobody can use the device since it isn't initialized and we don't need to postpone any cleanups, so we can do cleaup right away when there is a communication problem with the vhost-blk daemon. On operation we leave it as is, since the disconnect may happen when the device is in use, so the device users may want to use vhost_dev's data to do rollback before vhost_dev is re-initialized (e.g. in vhost_dev_set_log()). Signed-off-by: Denis Plotnikov Reviewed-by: Raphael Norwitz I think there is something wrong with this patch. I'm debugging an error case, specifically num-queues being larger in QEMU that in the vhost-user-blk export. 
Before this patch, it has just an unfriendly error message: qemu-system-x86_64: -device vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4: Unexpected end-of-file before all data were read qemu-system-x86_64: -device vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4: Failed to read msg header. Read 0 instead of 12. Original request 24. qemu-system-x86_64: -device vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4: vhost-user-blk: get block config failed qemu-system-x86_64: Failed to set msg fds. qemu-system-x86_64: vhost VQ 0 ring restore failed: -1: Resource temporarily unavailable (11) After the patch, it crashes: #0 0x55d0a4bd in vhost_user_read_cb (source=0x568f4690, condition=(G_IO_IN | G_IO_HUP), opaque=0x7fffcbf0) at ../hw/virtio/vhost-user.c:313 #1 0x55d950d3 in qio_channel_fd_source_dispatch (source=0x57c3f750, callback=0x55d0a478 , user_data=0x7fffcbf0) at ../io/channel-watch.c:84 #2 0x77b32a9f in g_main_context_dispatch () at /lib64/libglib-2.0.so.0 #3 0x77b84a98 in g_main_context_iterate.constprop () at /lib64/libglib-2.0.so.0 #4 0x77b32163 in g_main_loop_run () at /lib64/libglib-2.0.so.0 #5 0x55d0a724 in vhost_user_read (dev=0x57bc62f8, msg=0x7fffcc50) at ../hw/virtio/vhost-user.c:402 #6 0x55d0ee6b in vhost_user_get_config (dev=0x57bc62f8, config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost-user.c:2133 #7 0x55d56d46 in vhost_dev_get_config (hdev=0x57bc62f8, config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost.c:1566 #8 0x55cdd150 in vhost_user_blk_device_realize (dev=0x57bc60b0, errp=0x7fffcf90) at ../hw/block/vhost-user-blk.c:510 #9 0x55d08f6d in virtio_device_realize (dev=0x57bc60b0, errp=0x7fffcff0) at ../hw/virtio/virtio.c:3660 The problem is that vhost_user_read_cb() still accesses dev->opaque even though the device has been cleaned up meanwhile when the connection was closed (the vhost_user_blk_disconnect() 
added by this patch), so it's NULL now. This problem was actually mentioned in the comment that is removed by this patch. I tried to fix this by making vhost_user_read() cope with the fact that the device might have been cleaned up meanwhile, but then I'm running into the next set of problems. The first is that retrying is pointless, the error condition is in the configuration, it will never change. The other is that after many repetitions of the same error message, I got a crash where the device is cleaned up a second time in vhost_dev_init() and the virtqueues are already NULL. So it seems to me that erroring out during the initialisation phase makes a lot more sense than retrying. Kevin But without the patch there will be another problem which the patch actually addresses. It seems to me that there is a case when the retrying is
Re: [PATCH v3 2/3] vhost-user-blk: perform immediate cleanup if disconnect on initialization
Am 25.03.2021 um 16:12 hat Denis Plotnikov geschrieben: > Commit 4bcad76f4c39 ("vhost-user-blk: delay vhost_user_blk_disconnect") > introduced postponing vhost_dev cleanup aiming to eliminate qemu aborts > because of connection problems with vhost-blk daemon. > > However, it introdues a new problem. Now, any communication errors > during execution of vhost_dev_init() called by vhost_user_blk_device_realize() > lead to qemu abort on assert in vhost_dev_get_config(). > > This happens because vhost_user_blk_disconnect() is postponed but > it should have dropped s->connected flag by the time > vhost_user_blk_device_realize() performs a new connection opening. > On the connection opening, vhost_dev initialization in > vhost_user_blk_connect() relies on s->connection flag and > if it's not dropped, it skips vhost_dev initialization and returns > with success. Then, vhost_user_blk_device_realize()'s execution flow > goes to vhost_dev_get_config() where it's aborted on the assert. > > To fix the problem this patch adds immediate cleanup on device > initialization(in vhost_user_blk_device_realize()) using different > event handlers for initialization and operation introduced in the > previous patch. > On initialization (in vhost_user_blk_device_realize()) we fully > control the initialization process. At that point, nobody can use the > device since it isn't initialized and we don't need to postpone any > cleanups, so we can do cleaup right away when there is a communication > problem with the vhost-blk daemon. > On operation we leave it as is, since the disconnect may happen when > the device is in use, so the device users may want to use vhost_dev's data > to do rollback before vhost_dev is re-initialized (e.g. in > vhost_dev_set_log()). > > Signed-off-by: Denis Plotnikov > Reviewed-by: Raphael Norwitz I think there is something wrong with this patch. I'm debugging an error case, specifically num-queues being larger in QEMU that in the vhost-user-blk export. 
Before this patch, it has just an unfriendly error message: qemu-system-x86_64: -device vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4: Unexpected end-of-file before all data were read qemu-system-x86_64: -device vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4: Failed to read msg header. Read 0 instead of 12. Original request 24. qemu-system-x86_64: -device vhost-user-blk-pci,chardev=vhost1,id=blk1,iommu_platform=off,disable-legacy=on,num-queues=4: vhost-user-blk: get block config failed qemu-system-x86_64: Failed to set msg fds. qemu-system-x86_64: vhost VQ 0 ring restore failed: -1: Resource temporarily unavailable (11) After the patch, it crashes: #0 0x55d0a4bd in vhost_user_read_cb (source=0x568f4690, condition=(G_IO_IN | G_IO_HUP), opaque=0x7fffcbf0) at ../hw/virtio/vhost-user.c:313 #1 0x55d950d3 in qio_channel_fd_source_dispatch (source=0x57c3f750, callback=0x55d0a478 , user_data=0x7fffcbf0) at ../io/channel-watch.c:84 #2 0x77b32a9f in g_main_context_dispatch () at /lib64/libglib-2.0.so.0 #3 0x77b84a98 in g_main_context_iterate.constprop () at /lib64/libglib-2.0.so.0 #4 0x77b32163 in g_main_loop_run () at /lib64/libglib-2.0.so.0 #5 0x55d0a724 in vhost_user_read (dev=0x57bc62f8, msg=0x7fffcc50) at ../hw/virtio/vhost-user.c:402 #6 0x55d0ee6b in vhost_user_get_config (dev=0x57bc62f8, config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost-user.c:2133 #7 0x55d56d46 in vhost_dev_get_config (hdev=0x57bc62f8, config=0x57bc62ac "", config_len=60) at ../hw/virtio/vhost.c:1566 #8 0x55cdd150 in vhost_user_blk_device_realize (dev=0x57bc60b0, errp=0x7fffcf90) at ../hw/block/vhost-user-blk.c:510 #9 0x55d08f6d in virtio_device_realize (dev=0x57bc60b0, errp=0x7fffcff0) at ../hw/virtio/virtio.c:3660 The problem is that vhost_user_read_cb() still accesses dev->opaque even though the device has been cleaned up meanwhile when the connection was closed (the vhost_user_blk_disconnect() 
added by this patch), so it's NULL now. This problem was actually mentioned in the comment that is removed by this patch. I tried to fix this by making vhost_user_read() cope with the fact that the device might have been cleaned up meanwhile, but then I'm running into the next set of problems. The first is that retrying is pointless, the error condition is in the configuration, it will never change. The other is that after many repetitions of the same error message, I got a crash where the device is cleaned up a second time in vhost_dev_init() and the virtqueues are already NULL. So it seems to me that erroring out during the initialisation phase makes a lot more sense than retrying. Kevin > hw/block/vhost-user-blk.c | 48 +++ > 1 file changed, 24 insertions(+), 24 deletions(-) > >
Re: [PATCH v3 02/33] block/nbd: fix how state is cleared on nbd_open() failure paths
On Fri, Apr 16, 2021 at 11:08:40AM +0300, Vladimir Sementsov-Ogievskiy wrote: > We have two "return error" paths in nbd_open() after > nbd_process_options(). Actually we should call nbd_clear_bdrvstate() > on these paths. Interesting that nbd_process_options() calls > nbd_clear_bdrvstate() by itself. > > Let's fix leaks and refactor things to be more obvious: > > - intialize yank at top of nbd_open() > - move yank cleanup to nbd_clear_bdrvstate() > - refactor nbd_open() so that all failure paths except for > yank-register goes through nbd_clear_bdrvstate() > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > block/nbd.c | 36 ++-- > 1 file changed, 18 insertions(+), 18 deletions(-) > > diff --git a/block/nbd.c b/block/nbd.c > index 739ae2941f..a407a3814b 100644 > --- a/block/nbd.c > +++ b/block/nbd.c > @@ -152,8 +152,12 @@ static void > nbd_co_establish_connection_cancel(BlockDriverState *bs, > static int nbd_client_handshake(BlockDriverState *bs, Error **errp); > static void nbd_yank(void *opaque); > > -static void nbd_clear_bdrvstate(BDRVNBDState *s) > +static void nbd_clear_bdrvstate(BlockDriverState *bs) > { > +BDRVNBDState *s = (BDRVNBDState *)bs->opaque; > + > +yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name)); > + > object_unref(OBJECT(s->tlscreds)); > qapi_free_SocketAddress(s->saddr); > s->saddr = NULL; > @@ -2279,9 +2283,6 @@ static int nbd_process_options(BlockDriverState *bs, > QDict *options, > ret = 0; > > error: > -if (ret < 0) { > -nbd_clear_bdrvstate(s); > -} > qemu_opts_del(opts); > return ret; > } > @@ -2292,11 +2293,6 @@ static int nbd_open(BlockDriverState *bs, QDict > *options, int flags, > int ret; > BDRVNBDState *s = (BDRVNBDState *)bs->opaque; > > -ret = nbd_process_options(bs, options, errp); > -if (ret < 0) { > -return ret; > -} > - > s->bs = bs; > qemu_co_mutex_init(>send_mutex); > qemu_co_queue_init(>free_sema); > @@ -2305,20 +2301,23 @@ static int nbd_open(BlockDriverState *bs, QDict > *options, int flags, > return 
-EEXIST; > } > > +ret = nbd_process_options(bs, options, errp); > +if (ret < 0) { > +goto fail; > +} > + > /* > * establish TCP connection, return error if it fails > * TODO: Configurable retry-until-timeout behaviour. > */ > if (nbd_establish_connection(bs, s->saddr, errp) < 0) { > -yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name)); > -return -ECONNREFUSED; > +ret = -ECONNREFUSED; > +goto fail; > } > > ret = nbd_client_handshake(bs, errp); Not that this was introduced by this patch, but once you're at it: AFAICT nbd_client_handshake() calls yank_unregister_instance() on some error path(s); I assume this needs to go too, otherwise it's called twice (and asserts). Roman. > if (ret < 0) { > -yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name)); > -nbd_clear_bdrvstate(s); > -return ret; > +goto fail; > } > /* successfully connected */ > s->state = NBD_CLIENT_CONNECTED; > @@ -2330,6 +2329,10 @@ static int nbd_open(BlockDriverState *bs, QDict > *options, int flags, > aio_co_schedule(bdrv_get_aio_context(bs), s->connection_co); > > return 0; > + > +fail: > +nbd_clear_bdrvstate(bs); > +return ret; > } > > static int nbd_co_flush(BlockDriverState *bs) > @@ -2373,11 +2376,8 @@ static void nbd_refresh_limits(BlockDriverState *bs, > Error **errp) > > static void nbd_close(BlockDriverState *bs) > { > -BDRVNBDState *s = bs->opaque; > - > nbd_client_close(bs); > -yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name)); > -nbd_clear_bdrvstate(s); > +nbd_clear_bdrvstate(bs); > } > > /*
Re: [PATCH v2 0/8] Block layer thread-safety, continued
On 19/04/21 10:55, Emanuele Giuseppe Esposito wrote: This and the following serie of patches are based on Paolo's v1 patches sent in 2017[*]. They have been ported to the current QEMU version, but the goal remains the same: - make the block layer thread-safe (patches 1-5), and - remove aio_context_acquire/release (patches 6-8). [*] = https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg01398.html Signed-off-by: Emanuele Giuseppe Esposito This looks good to me, though the commit message of patch 8 needs to be rewritten. Paolo --- v1 (2017) -> v2 (2021): - v1 Patch "block-backup: add reqs_lock" has been dropped, because now is completely different from the old version and all functions that were affected by it have been moved or deleted. It will be replaced by another serie that aims to thread safety to block/block-copy.c - remaining v1 patches will be integrated in next serie. - Patch "block: do not acquire AioContext in check_to_replace_node" moves part of the logic of check_to_replace_node to the caller, so that the function can be included in the aio_context_acquire/release block that follows. Emanuele Giuseppe Esposito (8): block: prepare write threshold code for thread safety block: protect write threshold QMP commands from concurrent requests util: use RCU accessors for notifiers block: make before-write notifiers thread-safe block: add a few more notes on locking block: do not acquire AioContext in check_to_replace_node block/replication: do not acquire AioContext block: do not take AioContext around reopen block.c | 28 ++-- block/block-backend.c | 4 --- block/io.c| 12 + block/mirror.c| 9 --- block/replication.c | 54 +-- block/write-threshold.c | 39 ++-- blockdev.c| 26 +-- include/block/block.h | 1 + include/block/block_int.h | 42 +- util/notify.c | 13 +- 10 files changed, 113 insertions(+), 115 deletions(-)
Re: [PATCH v2 8/8] block: do not take AioContext around reopen
On 19/04/21 10:55, Emanuele Giuseppe Esposito wrote: Reopen needs to handle AioContext carefully due to calling bdrv_drain_all_begin/end. By not taking AioContext around calls to bdrv_reopen_multiple, we can drop the function's release/acquire pair and the AioContext argument too. So... I wrote this commit message and I cannot parse it anymore---much less relate it to the code in the patch. This is a problem, but it doesn't mean that the patch is wrong. bdrv_reopen_multiple does not have the AioContext argument anymore. It's not doing release/acquire either. The relevant commit is commit 1a63a90750 ("block: Keep nodes drained between reopen_queue/multiple", 2017-12-22). You're basically cleaning up after that code in the same way as patch 7: reopen functions take care of keeping the BDS quiescent, so there's nothing to synchronize on. For the future, the important step you missed was to check your diff against the one that you cherry-picked from. Then you would have noticed that 1) it's much smaller 2) one thing that is mentioned in the commit message ("drop the function's release/acquire pair and argument") is not needed anymore. 
Paolo Signed-off-by: Paolo Bonzini Signed-off-by: Emanuele Giuseppe Esposito --- block/block-backend.c | 4 block/mirror.c| 9 - blockdev.c| 19 ++- 3 files changed, 6 insertions(+), 26 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 413af51f3b..6fdc698e9e 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2291,20 +2291,16 @@ int blk_commit_all(void) BlockBackend *blk = NULL; while ((blk = blk_all_next(blk)) != NULL) { -AioContext *aio_context = blk_get_aio_context(blk); BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk)); -aio_context_acquire(aio_context); if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) { int ret; ret = bdrv_commit(unfiltered_bs); if (ret < 0) { -aio_context_release(aio_context); return ret; } } -aio_context_release(aio_context); } return 0; } diff --git a/block/mirror.c b/block/mirror.c index 5a71bd8bbc..43174bbc6b 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -631,7 +631,6 @@ static int mirror_exit_common(Job *job) MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job); BlockJob *bjob = >common; MirrorBDSOpaque *bs_opaque; -AioContext *replace_aio_context = NULL; BlockDriverState *src; BlockDriverState *target_bs; BlockDriverState *mirror_top_bs; @@ -699,11 +698,6 @@ static int mirror_exit_common(Job *job) } } -if (s->to_replace) { -replace_aio_context = bdrv_get_aio_context(s->to_replace); -aio_context_acquire(replace_aio_context); -} - if (s->should_complete && !abort) { BlockDriverState *to_replace = s->to_replace ?: src; bool ro = bdrv_is_read_only(to_replace); @@ -740,9 +734,6 @@ static int mirror_exit_common(Job *job) error_free(s->replace_blocker); bdrv_unref(s->to_replace); } -if (replace_aio_context) { -aio_context_release(replace_aio_context); -} g_free(s->replaces); bdrv_unref(target_bs); diff --git a/blockdev.c b/blockdev.c index e901107344..1672ef756e 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3469,7 +3469,6 @@ void qmp_change_backing_file(const 
char *device, Error **errp) { BlockDriverState *bs = NULL; -AioContext *aio_context; BlockDriverState *image_bs = NULL; Error *local_err = NULL; bool ro; @@ -3480,37 +3479,34 @@ void qmp_change_backing_file(const char *device, return; } -aio_context = bdrv_get_aio_context(bs); -aio_context_acquire(aio_context); - image_bs = bdrv_lookup_bs(NULL, image_node_name, _err); if (local_err) { error_propagate(errp, local_err); -goto out; +return; } if (!image_bs) { error_setg(errp, "image file not found"); -goto out; +return; } if (bdrv_find_base(image_bs) == image_bs) { error_setg(errp, "not allowing backing file change on an image " "without a backing file"); -goto out; +return; } /* even though we are not necessarily operating on bs, we need it to * determine if block ops are currently prohibited on the chain */ if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_CHANGE, errp)) { -goto out; +return; } /* final sanity check */ if (!bdrv_chain_contains(bs, image_bs)) { error_setg(errp, "'%s' and image file are not in the same chain", device); -goto out; +return;
Re: [RFC PATCH 0/3] block-copy: lock tasks and calls list
On 21/04/21 10:53, Vladimir Sementsov-Ogievskiy wrote: Good point. Emanuele, can you work on ProgressMeter and SharedResource? AioTaskPool can also be converted to just use CoQueue instead of manually waking up coroutines. That would be great. I have one more question in mind: Is it effective to use CoMutex here? We are protecting only some fast manipulations with data, not io path or something like that. Will simple QemuMutex work better? Even if CoMutex doesn't have any overhead, I don't think that if thread A wants to modify task list, but mutex is held by thread B (for similar thing), there is a reason for thread A to yield and do some other things: it can just wait several moments on mutex while B is modifying task list.. Indeed even CoQueue primitives count as simple manipulation of data, because they unlock/lock the mutex while the coroutine sleeps. So you're right that it would be okay to use QemuMutex as well. The block copy code that Emanuele has touched so far is all coroutine based. I like using CoMutex when that is the case, because it says implicitly "the monitor is not involved". But we need to see what it will be like when the patches are complete. Rate limiting ends up being called by the monitor, but it will have its own QemuMutex so it's fine. What's left is cancellation and block_copy_kick; I think that we can make qemu_co_sleep thread-safe with an API similar to Linux's prepare_to_wait, so a QemuMutex wouldn't be needed there either. Paolo
Re: [PATCH v3 2/2] block/rbd: Add an escape-aware strchr helper
On Fri, Apr 09, 2021 at 09:38:54AM -0500, Connor Kuehl wrote: Sometimes the parser needs to further split a token it has collected from the token input stream. Right now, it does a cursory check to see if the relevant characters appear in the token to determine if it should break it down further. However, qemu_rbd_next_tok() will escape characters as it removes tokens from the token stream and plain strchr() won't. This can make the initial strchr() check slightly misleading since it implies qemu_rbd_next_tok() will find the token and split on it, except the reality is that qemu_rbd_next_tok() will pass over it if it is escaped. Use a custom strchr to avoid mixing escaped and unescaped string operations. Reported-by: Han Han Fixes: https://bugzilla.redhat.com/1873913 Signed-off-by: Connor Kuehl --- v2 -> v3: * Update qemu_rbd_strchr to only skip if there's a delimiter AND the next character is not the NUL terminator block/rbd.c| 20 ++-- tests/qemu-iotests/231 | 4 tests/qemu-iotests/231.out | 3 +++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index 9071a00e3f..291e3f09e1 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -134,6 +134,22 @@ static char *qemu_rbd_next_tok(char *src, char delim, char **p) return src; } +static char *qemu_rbd_strchr(char *src, char delim) +{ +char *p; + +for (p = src; *p; ++p) { +if (*p == delim) { +return p; +} +if (*p == '\\' && p[1] != '\0') { +++p; +} +} + +return NULL; +} + IIUC this is similar to the code used in qemu_rbd_next_tok(). To avoid code duplication can we use this new function inside it? 
I mean something like this (not tested): diff --git a/block/rbd.c b/block/rbd.c index f098a89c7b..eb6a839362 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -119,15 +119,8 @@ static char *qemu_rbd_next_tok(char *src, char delim, char **p) *p = NULL; -for (end = src; *end; ++end) { -if (*end == delim) { -break; -} -if (*end == '\\' && end[1] != '\0') { -end++; -} -} -if (*end == delim) { +end = qemu_rbd_strchr(src, delim); +if (end && *end == delim) { *p = end + 1; *end = '\0'; } The rest LGTM! Thanks for fixing this issue, Stefano
Re: [PATCH v3 1/2] iotests/231: Update expected deprecation message
On Fri, Apr 09, 2021 at 09:38:53AM -0500, Connor Kuehl wrote: The deprecation message in the expected output has technically been wrong since the wrong version of a patch was applied to it. Because of this, the test fails. Correct the expected output so that it passes. Signed-off-by: Connor Kuehl Reviewed-by: Max Reitz --- tests/qemu-iotests/231.out | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) Reviewed-by: Stefano Garzarella diff --git a/tests/qemu-iotests/231.out b/tests/qemu-iotests/231.out index 579ba11c16..747dd221bb 100644 --- a/tests/qemu-iotests/231.out +++ b/tests/qemu-iotests/231.out @@ -1,9 +1,7 @@ QA output created by 231 -qemu-img: RBD options encoded in the filename as keyvalue pairs is deprecated. Future versions may cease to parse these options in the future. +qemu-img: warning: RBD options encoded in the filename as keyvalue pairs is deprecated unable to get monitor info from DNS SRV with service name: ceph-mon -no monitors specified to connect to. qemu-img: Could not open 'json:{'file.driver':'rbd','file.filename':'rbd:rbd/bogus:conf=BOGUS_CONF'}': error connecting: No such file or directory unable to get monitor info from DNS SRV with service name: ceph-mon -no monitors specified to connect to. qemu-img: Could not open 'json:{'file.driver':'rbd','file.pool':'rbd','file.image':'bogus','file.conf':'BOGUS_CONF'}': error connecting: No such file or directory *** done -- 2.30.2
Re: [PATCH v3] hw/block/nvme: fix lbaf formats initialization
On 4/16/21 1:59 PM, Gollu Appalanaidu wrote: > Currently LBAF formats are being initialized based on metadata > size if and only if nvme-ns "ms" parameter is non-zero value. > Since FormatNVM command being supported device parameter "ms" > may not be the criteria to initialize the supported LBAFs. > > Signed-off-by: Gollu Appalanaidu > --- > -v3: Remove "mset" constraint check if ms < 8, "mset" can be > set even when ms < 8 and non-zero. > > -v2: Addressing review comments (Klaus) > Change the current "pi" and "ms" constraint check such that it > will throw the error if ms < 8 and if namespace protection info, > location and metadata settings are set. > Splitting this from compare fix patch series. > > hw/block/nvme-ns.c | 58 -- > 1 file changed, 25 insertions(+), 33 deletions(-) > +NvmeLBAF lbaf[16] = { Unrelated to your change, but better to use a read-only array: static const NvmeLBAF lbaf[16] = { > +[0] = { .ds = 9 }, > +[1] = { .ds = 9, .ms = 8 }, > +[2] = { .ds = 9, .ms = 16 }, > +[3] = { .ds = 9, .ms = 64 }, > +[4] = { .ds = 12 }, > +[5] = { .ds = 12, .ms = 8 }, > +[6] = { .ds = 12, .ms = 16 }, > +[7] = { .ds = 12, .ms = 64 }, > +}; > + > +memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf)); > +id_ns->nlbaf = 7;
Re: [PATCH v3] hw/block/nvme: fix lbaf formats initialization
On Tue, Apr 20, 2021 at 09:47:00PM +0200, Klaus Jensen wrote: On Apr 16 17:29, Gollu Appalanaidu wrote: Currently LBAF formats are being intialized based on metadata size if and only if nvme-ns "ms" parameter is non-zero value. Since FormatNVM command being supported device parameter "ms" may not be the criteria to initialize the supported LBAFs. Signed-off-by: Gollu Appalanaidu --- -v3: Remove "mset" constraint check if ms < 8, "mset" can be set even when ms < 8 and non-zero. -v2: Addressing review comments (Klaus) Change the current "pi" and "ms" constraint check such that it will throw the error if ms < 8 and if namespace protection info, location and metadata settings are set. Splitting this from compare fix patch series. hw/block/nvme-ns.c | 58 -- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 7bb618f182..594b0003cf 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -85,38 +85,28 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) ds = 31 - clz32(ns->blkconf.logical_block_size); ms = ns->params.ms; -if (ns->params.ms) { -id_ns->mc = 0x3; +id_ns->mc = 0x3; -if (ns->params.mset) { -id_ns->flbas |= 0x10; -} +if (ms && ns->params.mset) { +id_ns->flbas |= 0x10; +} -id_ns->dpc = 0x1f; -id_ns->dps = ((ns->params.pil & 0x1) << 3) | ns->params.pi; - -NvmeLBAF lbaf[16] = { -[0] = { .ds = 9 }, -[1] = { .ds = 9, .ms = 8 }, -[2] = { .ds = 9, .ms = 16 }, -[3] = { .ds = 9, .ms = 64 }, -[4] = { .ds = 12 }, -[5] = { .ds = 12, .ms = 8 }, -[6] = { .ds = 12, .ms = 16 }, -[7] = { .ds = 12, .ms = 64 }, -}; - -memcpy(_ns->lbaf, , sizeof(lbaf)); -id_ns->nlbaf = 7; -} else { -NvmeLBAF lbaf[16] = { -[0] = { .ds = 9 }, -[1] = { .ds = 12 }, -}; +id_ns->dpc = 0x1f; +id_ns->dps = ((ns->params.pil & 0x1) << 3) | ns->params.pi; -memcpy(_ns->lbaf, , sizeof(lbaf)); -id_ns->nlbaf = 1; -} +NvmeLBAF lbaf[16] = { +[0] = { .ds = 9 }, +[1] = { .ds = 9, .ms = 8 }, +[2] = { .ds = 9, .ms = 16 }, +[3] = { .ds = 9, .ms = 64 }, 
+[4] = { .ds = 12 }, +[5] = { .ds = 12, .ms = 8 }, +[6] = { .ds = 12, .ms = 16 }, +[7] = { .ds = 12, .ms = 64 }, +}; + +memcpy(_ns->lbaf, , sizeof(lbaf)); +id_ns->nlbaf = 7; for (i = 0; i <= id_ns->nlbaf; i++) { NvmeLBAF *lbaf = _ns->lbaf[i]; This part LGTM. @@ -395,10 +385,12 @@ static int nvme_ns_check_constraints(NvmeCtrl *n, NvmeNamespace *ns, return -1; } -if (ns->params.pi && ns->params.ms < 8) { -error_setg(errp, "at least 8 bytes of metadata required to enable " - "protection information"); -return -1; +if (ns->params.ms < 8) { +if (ns->params.pi || ns->params.pil) { +error_setg(errp, "at least 8 bytes of metadata required to enable " +"protection information, protection information location"); +return -1; +} } If you do this additional check, then you should maybe also check that pil is only set if pi is. But if pi is not enabled, then the value of pil is irrelevant (even though it ends up in FLBAS). In other words, if you want to validate all possible parameter configurations, then we have a lot more checking to do! Currently, the approach taken by the parameter validation code is to error out on *invalid* configurations that causes invariants to not hold, and I'd prefer that we stick with that to keep the check logic as simple as possible. So, (without this unnecessary check): Sure, will remove this check and send v4 Reviewed-by: Klaus Jensen
Re: [RFC PATCH 0/3] block-copy: lock tasks and calls list
21.04.2021 11:38, Paolo Bonzini wrote: On 20/04/21 15:12, Vladimir Sementsov-Ogievskiy wrote: 20.04.2021 13:04, Emanuele Giuseppe Esposito wrote: This serie of patches continues Paolo's series on making the block layer thread safe. Add a CoMutex lock for both tasks and calls list present in block/block-copy.c I think, we need more information about what kind of thread-safety we want. Should the whole interface of block-copy be thread safe? Or only part of it? What is going to be shared between different threads? Which functions will be called concurrently from different threads? This should be documented in include/block/block-copy.h. I guess all of it. So more state fields should be identified in BlockCopyState, especially in_flight_bytes. ProgressMeter and SharedResource should be made thread-safe on their own, just like the patch I posted for RateLimit. What I see here, is that some things are protected by mutex.. Some things not. What became thread-safe? For example, in block_copy_dirty_clusters(), we modify task fields without any mutex held: block_copy_task_shrink doesn't take mutex. task->zeroes is set without mutex as well Agreed, these are bugs in the series. Still all these accesses are done when task is already added to the list. Looping in block_copy_common() is not thread safe as well. That one should be mostly safe, because only one coroutine ever writes to all fields except cancelled. cancelled should be accessed with qatomic_read/qatomic_set, but there's also the problem that coroutine sleep/wake APIs are hard to use in a thread-safe manner (which affects block_copy_kick). This is a different topic and it is something I'm working on, You also forget to protect QLIST_REMOVE() call in block_copy_task_end().. Next, block-copy uses co-shared-resource API, which is not thread-safe (as it is directly noted in include/qemu/co-shared-resource.h). Same thing is block/aio_task API, which is not thread-safe too. 
So, we should bring thread-safety first to these smaller helper APIs. Good point. Emanuele, can you work on ProgressMeter and SharedResource? AioTaskPool can also be converted to just use CoQueue instead of manually waking up coroutines. That would be great. I have one more question in mind: Is it effective to use CoMutex here? We are protecting only some fast manipulations with data, not io path or something like that. Will simple QemuMutex work better? Even if CoMutex doesn't have any overhead, I don't think that if thread A wants to modify task list, but mutex is held by thread B (for similar thing), there is a reason for thread A to yield and do some other things: it can just wait several moments on mutex while B is modifying task list.. -- Best regards, Vladimir
Re: [RFC PATCH 0/3] block-copy: lock tasks and calls list
On 20/04/21 15:12, Vladimir Sementsov-Ogievskiy wrote: 20.04.2021 13:04, Emanuele Giuseppe Esposito wrote: This serie of patches continues Paolo's series on making the block layer thread safe. Add a CoMutex lock for both tasks and calls list present in block/block-copy.c I think, we need more information about what kind of thread-safety we want. Should the whole interface of block-copy be thread safe? Or only part of it? What is going to be shared between different threads? Which functions will be called concurrently from different threads? This should be documented in include/block/block-copy.h. I guess all of it. So more state fields should be identified in BlockCopyState, especially in_flight_bytes. ProgressMeter and SharedResource should be made thread-safe on their own, just like the patch I posted for RateLimit. What I see here, is that some things are protected by mutex.. Some things not. What became thread-safe? For example, in block_copy_dirty_clusters(), we modify task fields without any mutex held: block_copy_task_shrink doesn't take mutex. task->zeroes is set without mutex as well Agreed, these are bugs in the series. Still all these accesses are done when task is already added to the list. Looping in block_copy_common() is not thread safe as well. That one should be mostly safe, because only one coroutine ever writes to all fields except cancelled. cancelled should be accessed with qatomic_read/qatomic_set, but there's also the problem that coroutine sleep/wake APIs are hard to use in a thread-safe manner (which affects block_copy_kick). This is a different topic and it is something I'm working on, You also forget to protect QLIST_REMOVE() call in block_copy_task_end().. Next, block-copy uses co-shared-resource API, which is not thread-safe (as it is directly noted in include/qemu/co-shared-resource.h). Same thing is block/aio_task API, which is not thread-safe too. So, we should bring thread-safety first to these smaller helper APIs. Good point. 
Emanuele, can you work on ProgressMeter and SharedResource? AioTaskPool can also be converted to just use CoQueue instead of manually waking up coroutines.
[PATCH] monitor: hmp_qemu_io: acquire aio context, fix crash
Max reported the following bug: $ ./qemu-img create -f raw src.img 1G $ ./qemu-img create -f raw dst.img 1G $ (echo ' {"execute":"qmp_capabilities"} {"execute":"blockdev-mirror", "arguments":{"job-id":"mirror", "device":"source", "target":"target", "sync":"full", "filter-node-name":"mirror-top"}} '; sleep 3; echo ' {"execute":"human-monitor-command", "arguments":{"command-line": "qemu-io mirror-top \"write 0 1G\""}}') \ | x86_64-softmmu/qemu-system-x86_64 \ -qmp stdio \ -blockdev file,node-name=source,filename=src.img \ -blockdev file,node-name=target,filename=dst.img \ -object iothread,id=iothr0 \ -device virtio-blk,drive=source,iothread=iothr0 crashes: 0 raise () at /usr/lib/libc.so.6 1 abort () at /usr/lib/libc.so.6 2 error_exit (err=, msg=msg@entry=0x55fbb1634790 <__func__.27> "qemu_mutex_unlock_impl") at ../util/qemu-thread-posix.c:37 3 qemu_mutex_unlock_impl (mutex=mutex@entry=0x55fbb25ab6e0, file=file@entry=0x55fbb1636957 "../util/async.c", line=line@entry=650) at ../util/qemu-thread-posix.c:109 4 aio_context_release (ctx=ctx@entry=0x55fbb25ab680) at ../util/async.c:650 5 bdrv_do_drained_begin (bs=bs@entry=0x55fbb3a87000, recursive=recursive@entry=false, parent=parent@entry=0x0, ignore_bds_parents=ignore_bds_parents@entry=false, poll=poll@entry=true) at ../block/io.c:441 6 bdrv_do_drained_begin (poll=true, ignore_bds_parents=false, parent=0x0, recursive=false, bs=0x55fbb3a87000) at ../block/io.c:448 7 blk_drain (blk=0x55fbb26c5a00) at ../block/block-backend.c:1718 8 blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:498 9 blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:491 10 hmp_qemu_io (mon=0x7fffaf3fc7d0, qdict=) at ../block/monitor/block-hmp-cmds.c:628 man pthread_mutex_unlock ... EPERM The mutex type is PTHREAD_MUTEX_ERRORCHECK or PTHREAD_MUTEX_RECURSIVE, or the mutex is a robust mutex, and the current thread does not own the mutex. So, thread doesn't own the mutex. And we have iothread here. 
Next, note that AIO_WAIT_WHILE() documents that ctx must be acquired exactly once by caller. But where is it acquired in the call stack? Seems nowhere. qemuio_command do acquire aio context.. But we need context acquired around blk_unref as well. Let's do it. Reported-by: Max Reitz Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/monitor/block-hmp-cmds.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index ebf1033f31..934100d0eb 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -559,6 +559,7 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) { BlockBackend *blk; BlockBackend *local_blk = NULL; +AioContext *ctx; bool qdev = qdict_get_try_bool(qdict, "qdev", false); const char *device = qdict_get_str(qdict, "device"); const char *command = qdict_get_str(qdict, "command"); @@ -615,7 +616,13 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) qemuio_command(blk, command); fail: +ctx = blk_get_aio_context(blk); +aio_context_acquire(ctx); + blk_unref(local_blk); + +aio_context_release(ctx); + hmp_handle_error(mon, err); } -- 2.29.2
Re: [PATCH v4 09/23] job: call job_enter from job_pause
07.04.2021 14:38, Vladimir Sementsov-Ogievskiy wrote: 07.04.2021 14:19, Max Reitz wrote: On 16.01.21 22:46, Vladimir Sementsov-Ogievskiy wrote: If main job coroutine called job_yield (while some background process is in progress), we should give it a chance to call job_pause_point(). It will be used in backup, when moved on async block-copy. Note, that job_user_pause is not enough: we want to handle child_job_drained_begin() as well, which call job_pause(). Still, if job is already in job_do_yield() in job_pause_point() we should not enter it. iotest 109 output is modified: on stop we do bdrv_drain_all() which now triggers job pause immediately (and pause after ready is standby). Signed-off-by: Vladimir Sementsov-Ogievskiy --- job.c | 3 +++ tests/qemu-iotests/109.out | 24 2 files changed, 27 insertions(+) While looking into https://lists.gnu.org/archive/html/qemu-block/2021-04/msg00035.html I noticed this: $ ./qemu-img create -f raw src.img 1G $ ./qemu-img create -f raw dst.img 1G $ (echo ' {"execute":"qmp_capabilities"} {"execute":"blockdev-mirror", "arguments":{"job-id":"mirror", "device":"source", "target":"target", "sync":"full", "filter-node-name":"mirror-top"}} '; sleep 3; echo ' {"execute":"human-monitor-command", "arguments":{"command-line": "qemu-io mirror-top \"write 0 1G\""}}') \ | x86_64-softmmu/qemu-system-x86_64 \ -qmp stdio \ -blockdev file,node-name=source,filename=src.img \ -blockdev file,node-name=target,filename=dst.img \ -object iothread,id=iothr0 \ -device virtio-blk,drive=source,iothread=iothr0 Before this commit, qemu-io returned an error that there is a permission conflict with virtio-blk. 
After this commit, there is an abort (“qemu: qemu_mutex_unlock_impl: Operation not permitted”): #0 0x7f8445a4eef5 in raise () at /usr/lib/libc.so.6 #1 0x7f8445a38862 in abort () at /usr/lib/libc.so.6 #2 0x55fbb14a36bf in error_exit (err=, msg=msg@entry=0x55fbb1634790 <__func__.27> "qemu_mutex_unlock_impl") at ../util/qemu-thread-posix.c:37 #3 0x55fbb14a3bc3 in qemu_mutex_unlock_impl (mutex=mutex@entry=0x55fbb25ab6e0, file=file@entry=0x55fbb1636957 "../util/async.c", line=line@entry=650) at ../util/qemu-thread-posix.c:109 #4 0x55fbb14b2e75 in aio_context_release (ctx=ctx@entry=0x55fbb25ab680) at ../util/async.c:650 #5 0x55fbb13d2029 in bdrv_do_drained_begin (bs=bs@entry=0x55fbb3a87000, recursive=recursive@entry=false, parent=parent@entry=0x0, ignore_bds_parents=ignore_bds_parents@entry=false, poll=poll@entry=true) at ../block/io.c:441 #6 0x55fbb13d2192 in bdrv_do_drained_begin (poll=true, ignore_bds_parents=false, parent=0x0, recursive=false, bs=0x55fbb3a87000) at ../block/io.c:448 #7 0x55fbb13c71a7 in blk_drain (blk=0x55fbb26c5a00) at ../block/block-backend.c:1718 #8 0x55fbb13c8bbd in blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:498 #9 blk_unref (blk=0x55fbb26c5a00) at ../block/block-backend.c:491 #10 0x55fbb1024863 in hmp_qemu_io (mon=0x7fffaf3fc7d0, qdict=) at ../block/monitor/block-hmp-cmds.c:628 Can you make anything out of this? Hmm.. Interesting. man pthread_mutex_unlock ... EPERM The mutex type is PTHREAD_MUTEX_ERRORCHECK or PTHREAD_MUTEX_RECURSIVE, or the mutex is a robust mutex, and the current thread does not own the mutex. So, thread doesn't own the mutex.. We have an iothread here. AIO_WAIT_WHILE() documents that ctx must be acquired exactly once by caller.. But I don't see, where is it acquired in the call stack? The other question, is why permission conflict is lost with the commit. Strange. I ss that hmp_qemu_io creates blk with perm=0 and shread=BLK_PERM_ALL.. How could it conflict even before the considered commit? 
Sorry, I've answered and forgot about this thread. Now, looking through my series I find this again. Seems that problem is really in lacking aio-context locking around blk_unref(). I'll send patch now. -- Best regards, Vladimir
[PATCH] mirror: stop cancelling in-flight requests on non-force cancel in READY
If mirror is READY then cancel operation is not discarding the whole result of the operation, but instead it's a documented way to get a point-in-time snapshot of source disk. So, we should not cancel any requests if mirror is READY and force=false. Let's fix that case. Note, that bug that we have before this commit is not critical, as the only .bdrv_cancel_in_flight implementation is nbd_cancel_in_flight() and it cancels only requests waiting for reconnection, so it should be rare case. Fixes: 521ff8b779b11c394dbdc43f02e158dd99df308a Signed-off-by: Vladimir Sementsov-Ogievskiy --- include/block/block_int.h | 2 +- include/qemu/job.h| 2 +- block/backup.c| 2 +- block/mirror.c| 6 -- job.c | 2 +- tests/qemu-iotests/264| 2 +- 6 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/block/block_int.h b/include/block/block_int.h index 88e4111939..584381fdb0 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -357,7 +357,7 @@ struct BlockDriver { * of in-flight requests, so don't waste the time if possible. * * One example usage is to avoid waiting for an nbd target node reconnect - * timeout during job-cancel. + * timeout during job-cancel with force=true.
*/ void (*bdrv_cancel_in_flight)(BlockDriverState *bs); diff --git a/include/qemu/job.h b/include/qemu/job.h index efc6fa7544..41162ed494 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -254,7 +254,7 @@ struct JobDriver { /** * If the callback is not NULL, it will be invoked in job_cancel_async */ -void (*cancel)(Job *job); +void (*cancel)(Job *job, bool force); /** Called when the job is freed */ diff --git a/block/backup.c b/block/backup.c index 6cf2f974aa..bd3614ce70 100644 --- a/block/backup.c +++ b/block/backup.c @@ -331,7 +331,7 @@ static void coroutine_fn backup_set_speed(BlockJob *job, int64_t speed) } } -static void backup_cancel(Job *job) +static void backup_cancel(Job *job, bool force) { BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); diff --git a/block/mirror.c b/block/mirror.c index 5a71bd8bbc..fcd1b56991 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1178,12 +1178,14 @@ static bool mirror_drained_poll(BlockJob *job) return !!s->in_flight; } -static void mirror_cancel(Job *job) +static void mirror_cancel(Job *job, bool force) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job); BlockDriverState *target = blk_bs(s->target); -bdrv_cancel_in_flight(target); +if (force || !job_is_ready(job)) { +bdrv_cancel_in_flight(target); +} } static const BlockJobDriver mirror_job_driver = { diff --git a/job.c b/job.c index 4aff13d95a..8775c1803b 100644 --- a/job.c +++ b/job.c @@ -716,7 +716,7 @@ static int job_finalize_single(Job *job) static void job_cancel_async(Job *job, bool force) { if (job->driver->cancel) { -job->driver->cancel(job); +job->driver->cancel(job, force); } if (job->user_paused) { /* Do not call job_enter here, the caller will handle it. 
*/ diff --git a/tests/qemu-iotests/264 b/tests/qemu-iotests/264 index 4f96825a22..bc431d1a19 100755 --- a/tests/qemu-iotests/264 +++ b/tests/qemu-iotests/264 @@ -95,7 +95,7 @@ class TestNbdReconnect(iotests.QMPTestCase): self.assert_qmp(result, 'return', {}) def cancel_job(self): -result = self.vm.qmp('block-job-cancel', device='drive0') +result = self.vm.qmp('block-job-cancel', device='drive0', force=True) self.assert_qmp(result, 'return', {}) start_t = time.time() -- 2.29.2