[PATCH v2] block: replace TABs with space
Bring the block files in line with the QEMU coding style, with spaces for indentation. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/371 Signed-off-by: Yeqi Fu --- block/bochs.c | 14 +++ block/file-posix.c | 92 ++--- block/file-win32.c | 53 +- block/parallels.c | 28 +++--- block/qcow.c| 44 +++--- include/block/nbd.h | 2 +- 6 files changed, 116 insertions(+), 117 deletions(-) diff --git a/block/bochs.c b/block/bochs.c index 2f5ae52c90..85d06f3315 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -293,15 +293,15 @@ static void bochs_close(BlockDriverState *bs) } static BlockDriver bdrv_bochs = { -.format_name = "bochs", -.instance_size = sizeof(BDRVBochsState), -.bdrv_probe= bochs_probe, -.bdrv_open = bochs_open, -.bdrv_child_perm = bdrv_default_perms, +.format_name = "bochs", +.instance_size = sizeof(BDRVBochsState), +.bdrv_probe = bochs_probe, +.bdrv_open = bochs_open, +.bdrv_child_perm = bdrv_default_perms, .bdrv_refresh_limits = bochs_refresh_limits, .bdrv_co_preadv = bochs_co_preadv, -.bdrv_close= bochs_close, -.is_format = true, +.bdrv_close = bochs_close, +.is_format = true, }; static void bdrv_bochs_init(void) diff --git a/block/file-posix.c b/block/file-posix.c index 5760cf22d1..18159b6d83 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -124,7 +124,7 @@ #define FTYPE_FILE 0 #define FTYPE_CD 1 -#define MAX_BLOCKSIZE 4096 +#define MAX_BLOCKSIZE 4096 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, * leaving a few more bytes for its future use. */ @@ -3819,42 +3819,42 @@ static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked) } static BlockDriver bdrv_host_cdrom = { -.format_name= "host_cdrom", -.protocol_name = "host_cdrom", -.instance_size = sizeof(BDRVRawState), +.format_name = "host_cdrom", +.protocol_name = "host_cdrom", +.instance_size = sizeof(BDRVRawState), .bdrv_needs_filename = true, -.bdrv_probe_device = cdrom_probe_device, +.bdrv_probe_device = cdrom_probe_device, .bdrv_parse_filename = cdrom_parse_filename, -.bdrv_file_open = cdrom_open, -.bdrv_close = raw_close, +.bdrv_file_open = cdrom_open, +.bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, -.bdrv_reopen_commit = raw_reopen_commit, -.bdrv_reopen_abort = raw_reopen_abort, +.bdrv_reopen_commit = raw_reopen_commit, +.bdrv_reopen_abort = raw_reopen_abort, .bdrv_co_create_opts = bdrv_co_create_opts_simple, -.create_opts = _create_opts_simple, -.mutable_opts= mutable_opts, +.create_opts = _create_opts_simple, +.mutable_opts = mutable_opts, .bdrv_co_invalidate_cache = raw_co_invalidate_cache, -.bdrv_co_preadv = raw_co_preadv, -.bdrv_co_pwritev= raw_co_pwritev, -.bdrv_co_flush_to_disk = raw_co_flush_to_disk, +.bdrv_co_preadv = raw_co_preadv, +.bdrv_co_pwritev = raw_co_pwritev, +.bdrv_co_flush_to_disk = raw_co_flush_to_disk, .bdrv_refresh_limits = raw_refresh_limits, -.bdrv_co_io_plug= raw_co_io_plug, -.bdrv_co_io_unplug = raw_co_io_unplug, +.bdrv_co_io_plug = raw_co_io_plug, +.bdrv_co_io_unplug = raw_co_io_unplug, .bdrv_attach_aio_context = raw_aio_attach_aio_context, -.bdrv_co_truncate = raw_co_truncate, -.bdrv_co_getlength = raw_co_getlength, -.has_variable_length= true, -.bdrv_co_get_allocated_file_size= raw_co_get_allocated_file_size, +.bdrv_co_truncate = raw_co_truncate, +.bdrv_co_getlength = raw_co_getlength, +.has_variable_length = true, +.bdrv_co_get_allocated_file_size = raw_co_get_allocated_file_size, /* removable device support */ -.bdrv_co_is_inserted= cdrom_co_is_inserted, -.bdrv_co_eject = cdrom_co_eject, -.bdrv_co_lock_medium= cdrom_co_lock_medium, +.bdrv_co_is_inserted = cdrom_co_is_inserted, +.bdrv_co_eject = cdrom_co_eject, +.bdrv_co_lock_medium = cdrom_co_lock_medium, /* generic scsi device */ -.bdrv_co_ioctl = hdev_co_ioctl, +.bdrv_co_ioctl = hdev_co_ioctl, }; #endif /* __linux__ */ @@ -3949,38 +3949,38 @@ static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked) } static BlockDriver bdrv_host_cdrom = { -.format_name= "host_cdrom", -.protocol_name = "host_cdrom", -.instance_size = sizeof(BDRVRawState), +.format_name = "host_cdrom", +.protocol_name = "host_cdrom", +.instance_size = sizeof(BDRVRawState), .bdrv_needs_filename = true, -.bdrv_probe_device = cdrom_probe_device, +
[PATCH v2 5/6] hmp: convert handle_hmp_command() to AIO_WAIT_WHILE_UNLOCKED()
The HMP monitor runs in the main loop thread. Calling AIO_WAIT_WHILE(qemu_get_aio_context(), ...) from the main loop thread is equivalent to AIO_WAIT_WHILE_UNLOCKED(NULL, ...) because neither unlocks the AioContext and the latter's assertion that we're in the main loop succeeds. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Reviewed-by: Markus Armbruster Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- monitor/hmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitor/hmp.c b/monitor/hmp.c index fee410362f..5cab56d355 100644 --- a/monitor/hmp.c +++ b/monitor/hmp.c @@ -1167,7 +1167,7 @@ void handle_hmp_command(MonitorHMP *mon, const char *cmdline) Coroutine *co = qemu_coroutine_create(handle_hmp_command_co, ); monitor_set_cur(co, >common); aio_co_enter(qemu_get_aio_context(), co); -AIO_WAIT_WHILE(qemu_get_aio_context(), !data.done); +AIO_WAIT_WHILE_UNLOCKED(NULL, !data.done); } qobject_unref(qdict); -- 2.39.2
[PATCH v2 4/6] block: convert bdrv_drain_all_begin() to AIO_WAIT_WHILE_UNLOCKED()
Since the AioContext argument was already NULL, AIO_WAIT_WHILE() was never going to unlock the AioContext. Therefore it is possible to replace AIO_WAIT_WHILE() with AIO_WAIT_WHILE_UNLOCKED(). Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- block/io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/io.c b/block/io.c index 8974d46941..db438c7657 100644 --- a/block/io.c +++ b/block/io.c @@ -520,7 +520,7 @@ void bdrv_drain_all_begin(void) bdrv_drain_all_begin_nopoll(); /* Now poll the in-flight requests */ -AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll()); +AIO_WAIT_WHILE_UNLOCKED(NULL, bdrv_drain_all_poll()); while ((bs = bdrv_next_all_states(bs))) { bdrv_drain_assert_idle(bs); -- 2.39.2
[PATCH v2 6/6] monitor: convert monitor_cleanup() to AIO_WAIT_WHILE_UNLOCKED()
monitor_cleanup() is called from the main loop thread. Calling AIO_WAIT_WHILE(qemu_get_aio_context(), ...) from the main loop thread is equivalent to AIO_WAIT_WHILE_UNLOCKED(NULL, ...) because neither unlocks the AioContext and the latter's assertion that we're in the main loop succeeds. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Reviewed-by: Markus Armbruster Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- monitor/monitor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monitor/monitor.c b/monitor/monitor.c index 8dc96f6af9..602535696c 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -666,7 +666,7 @@ void monitor_cleanup(void) * We need to poll both qemu_aio_context and iohandler_ctx to make * sure that the dispatcher coroutine keeps making progress and * eventually terminates. qemu_aio_context is automatically - * polled by calling AIO_WAIT_WHILE on it, but we must poll + * polled by calling AIO_WAIT_WHILE_UNLOCKED on it, but we must poll * iohandler_ctx manually. * * Letting the iothread continue while shutting down the dispatcher @@ -679,7 +679,7 @@ void monitor_cleanup(void) aio_co_wake(qmp_dispatcher_co); } -AIO_WAIT_WHILE(qemu_get_aio_context(), +AIO_WAIT_WHILE_UNLOCKED(NULL, (aio_poll(iohandler_get_aio_context(), false), qatomic_mb_read(_dispatcher_co_busy))); -- 2.39.2
[PATCH v2 1/6] block: don't acquire AioContext lock in bdrv_drain_all()
There is no need for the AioContext lock in bdrv_drain_all() because nothing in AIO_WAIT_WHILE() needs the lock and the condition is atomic. AIO_WAIT_WHILE_UNLOCKED() has no use for the AioContext parameter other than performing a check that is nowadays already done by the GLOBAL_STATE_CODE()/IO_CODE() macros. Set the ctx argument to NULL here to help us keep track of all converted callers. Eventually all callers will have been converted and then the argument can be dropped entirely. Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- block/block-backend.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 278b04ce69..d2b6b3652d 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1835,14 +1835,8 @@ void blk_drain_all(void) bdrv_drain_all_begin(); while ((blk = blk_all_next(blk)) != NULL) { -AioContext *ctx = blk_get_aio_context(blk); - -aio_context_acquire(ctx); - /* We may have -ENOMEDIUM completions in flight */ -AIO_WAIT_WHILE(ctx, qatomic_mb_read(>in_flight) > 0); - -aio_context_release(ctx); +AIO_WAIT_WHILE_UNLOCKED(NULL, qatomic_mb_read(>in_flight) > 0); } bdrv_drain_all_end(); -- 2.39.2
[PATCH v2 0/6] block: switch to AIO_WAIT_WHILE_UNLOCKED() where possible
v2: - Clarify NULL ctx argument in Patch 1 commit description [Kevin] AIO_WAIT_WHILE_UNLOCKED() is the future replacement for AIO_WAIT_WHILE(). Most callers haven't been converted yet because they rely on the AioContext lock. I looked through the code and found the easy cases that can be converted today. Stefan Hajnoczi (6): block: don't acquire AioContext lock in bdrv_drain_all() block: convert blk_exp_close_all_type() to AIO_WAIT_WHILE_UNLOCKED() block: convert bdrv_graph_wrlock() to AIO_WAIT_WHILE_UNLOCKED() block: convert bdrv_drain_all_begin() to AIO_WAIT_WHILE_UNLOCKED() hmp: convert handle_hmp_command() to AIO_WAIT_WHILE_UNLOCKED() monitor: convert monitor_cleanup() to AIO_WAIT_WHILE_UNLOCKED() block/block-backend.c | 8 +--- block/export/export.c | 2 +- block/graph-lock.c| 2 +- block/io.c| 2 +- monitor/hmp.c | 2 +- monitor/monitor.c | 4 ++-- 6 files changed, 7 insertions(+), 13 deletions(-) -- 2.39.2
[PATCH v2 2/6] block: convert blk_exp_close_all_type() to AIO_WAIT_WHILE_UNLOCKED()
There is no change in behavior. Switch to AIO_WAIT_WHILE_UNLOCKED() instead of AIO_WAIT_WHILE() to document that this code has already been audited and converted. The AioContext argument is already NULL so aio_context_release() is never called anyway. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- block/export/export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/export/export.c b/block/export/export.c index 28a91c9c42..e3fee60611 100644 --- a/block/export/export.c +++ b/block/export/export.c @@ -306,7 +306,7 @@ void blk_exp_close_all_type(BlockExportType type) blk_exp_request_shutdown(exp); } -AIO_WAIT_WHILE(NULL, blk_exp_has_type(type)); +AIO_WAIT_WHILE_UNLOCKED(NULL, blk_exp_has_type(type)); } void blk_exp_close_all(void) -- 2.39.2
[PATCH v2 3/6] block: convert bdrv_graph_wrlock() to AIO_WAIT_WHILE_UNLOCKED()
The following conversion is safe and does not change behavior: GLOBAL_STATE_CODE(); ... - AIO_WAIT_WHILE(qemu_get_aio_context(), ...); + AIO_WAIT_WHILE_UNLOCKED(NULL, ...); Since we're in GLOBAL_STATE_CODE(), qemu_get_aio_context() is our home thread's AioContext. Thus AIO_WAIT_WHILE() does not unlock the AioContext: if (ctx_ && in_aio_context_home_thread(ctx_)) {\ while ((cond)) { \ aio_poll(ctx_, true); \ waited_ = true;\ } \ And that means AIO_WAIT_WHILE_UNLOCKED(NULL, ...) can be substituted. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- block/graph-lock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/graph-lock.c b/block/graph-lock.c index 454c31e691..639526608f 100644 --- a/block/graph-lock.c +++ b/block/graph-lock.c @@ -127,7 +127,7 @@ void bdrv_graph_wrlock(void) * reader lock. */ qatomic_set(_writer, 0); -AIO_WAIT_WHILE(qemu_get_aio_context(), reader_count() >= 1); +AIO_WAIT_WHILE_UNLOCKED(NULL, reader_count() >= 1); qatomic_set(_writer, 1); /* -- 2.39.2
Re: [PATCH] block: replace TABs with space
On Fri, Mar 10, 2023 at 12:10:08AM +0800, Yeqi Fu wrote: > Bring the block files in line with the QEMU coding style, with spaces > for indentation. > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/371 > > Signed-off-by: Yeqi Fu > --- > block/bochs.c | 12 +-- > block/file-posix.c | 52 ++--- > block/file-win32.c | 38 - > block/parallels.c | 10 - > block/qcow.c| 10 - > include/block/nbd.h | 2 +- > 6 files changed, 62 insertions(+), 62 deletions(-) > > diff --git a/block/bochs.c b/block/bochs.c > index 2f5ae52c90..8ff38ac0d9 100644 > --- a/block/bochs.c > +++ b/block/bochs.c > @@ -293,15 +293,15 @@ static void bochs_close(BlockDriverState *bs) > } > > static BlockDriver bdrv_bochs = { > -.format_name = "bochs", > -.instance_size = sizeof(BDRVBochsState), > -.bdrv_probe = bochs_probe, > -.bdrv_open = bochs_open, > +.format_name= "bochs", > +.instance_size = sizeof(BDRVBochsState), > +.bdrv_probe = bochs_probe, > +.bdrv_open = bochs_open, > .bdrv_child_perm = bdrv_default_perms, > .bdrv_refresh_limits = bochs_refresh_limits, Hmm. When shown in the diff, it looks like you are changing the alignment of the .bdrv_probe line; but in reality, the extra prefix from diff changes which tabstop the '=' goes out to, so you are preserving the original column. Still, it looks really awkward that we have one alignment for .format_name through .bdrv_open, and another alignment for .bdrv_child_perm and .bdrv_refresh_limits. My personal preference: '=' alignment is a lost cause. It causes pointless churn if we later add a field with a longer name (all other lines have to reindent to match the new length). I'd prefer exactly one space before '=' on all lines that you touch (and maybe touch a few more lines in the process). But if we DO stick with alignment, I'd much rather that ALL '=' be in the same column, rather than the pre-existing half-and-half mix, even though your patch kept that visually undisturbed (other than when diff injects a prefix that rejiggers the tab stops). I wouldn't mind a v2 if we can reach consensus on whether to consistently align '=' or always use just one space; but I don't know if that consensus will be easy, so I can also live with: Reviewed-by: Eric Blake even if we end up using v1 as-is. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH] qed: remove spurious BDRV_POLL_WHILE()
Am 09.03.2023 um 17:31 hat Stefan Hajnoczi geschrieben: > This looks like a copy-paste or merge error. BDRV_POLL_WHILE() is > already called above. It's not needed in the qemu_in_coroutine() case. > > Fixes: 9fb4dfc570ce ("qed: make bdrv_qed_do_open a coroutine_fn") > Signed-off-by: Stefan Hajnoczi Thanks, applied to the block branch. Kevin
Re: [PATCH for-8.0] ide: Fix manual in-flight count for TRIM BH
Am 09.03.2023 um 14:59 hat Paolo Bonzini geschrieben: > On 3/9/23 13:31, Hanna Czenczek wrote: > > On 09.03.23 13:08, Paolo Bonzini wrote: > > > On Thu, Mar 9, 2023 at 1:05 PM Paolo Bonzini wrote: > > > > I think having to do this is problematic, because the blk_drain should > > > > leave no pending operation. > > > > > > > > Here it seems okay because you do it in a controlled situation, but the > > > > main thread can also issue blk_drain(), or worse bdrv_drained_begin(), > > > > and there would be pending I/O operations when it returns. > > > > Not really. We would stop in the middle of a trim that processes a list > > of discard requests. So I see it more like stopping in the middle of > > anything that processes guest requests. Once drain ends, we continue > > processing them, and that’s not exactly pending I/O. > > > > There is a pending object in s->bus->dma->aiocb on the IDE side, so > > there is a pending DMA operation, but naïvely, I don’t see that as a > > problem. > > What about the bdrv_drain_all() when a VM stops, would the guest continue to > access memory and disks after bdrv_drain() return? That one shouldn't be a problem because the devices are stopped before the backends. > Migration could also be a problem, because the partial TRIM would not be > recorded in the s->bus->error_status field of IDEState (no surprise there, > it's not an error). Also, errors happening after bdrv_drain() might not be > migrated correctly. Yes, I think it would be good to have the I/O operation fully completed on the IDE level rather than just in the block layer. > > Or the issue is generally that IDE uses dma_* functions, which might > > cause I/O functions to be run from new BHs (I guess through > > reschedule_dma()?). None of those complicated scenarios actually. The problem solved by the request queuing is simply that nothing else stops the guest from submitting new requests to drained nodes if the CPUs are still running. Drain uses aio_disable_external() to disable processing of external events, in particular the ioeventfd used by virtio-blk and virtio-scsi. But IDE submits requests through MMIO or port I/O, i.e. the vcpu thread exits to userspace and calls directly into the IDE code, so it's completely unaffected by aio_disable_external(). > Ah, you mean that you can have pending I/O operations while blk->in_flight > is zero? That would be a problem indeed. We already have BlockDevOps for > ide-cd and ide-hd, should we add a .drained_poll callback there? To be more precise, you suggested in the call that .drained_poll should return that IDE is still busy while aiocb != NULL. Without having looked at the code in detail yet, that seems to make sense to me. And maybe even the blk_inc/dec_in_flight() pair can then go completely away because IDE takes care of its drain state itself then. Kevin
[PATCH] block: replace TABs with space
Bring the block files in line with the QEMU coding style, with spaces for indentation. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/371 Signed-off-by: Yeqi Fu --- block/bochs.c | 12 +-- block/file-posix.c | 52 ++--- block/file-win32.c | 38 - block/parallels.c | 10 - block/qcow.c| 10 - include/block/nbd.h | 2 +- 6 files changed, 62 insertions(+), 62 deletions(-) diff --git a/block/bochs.c b/block/bochs.c index 2f5ae52c90..8ff38ac0d9 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -293,15 +293,15 @@ static void bochs_close(BlockDriverState *bs) } static BlockDriver bdrv_bochs = { -.format_name = "bochs", -.instance_size = sizeof(BDRVBochsState), -.bdrv_probe= bochs_probe, -.bdrv_open = bochs_open, +.format_name= "bochs", +.instance_size = sizeof(BDRVBochsState), +.bdrv_probe = bochs_probe, +.bdrv_open = bochs_open, .bdrv_child_perm = bdrv_default_perms, .bdrv_refresh_limits = bochs_refresh_limits, .bdrv_co_preadv = bochs_co_preadv, -.bdrv_close= bochs_close, -.is_format = true, +.bdrv_close = bochs_close, +.is_format = true, }; static void bdrv_bochs_init(void) diff --git a/block/file-posix.c b/block/file-posix.c index 5760cf22d1..9ba1600a77 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -124,7 +124,7 @@ #define FTYPE_FILE 0 #define FTYPE_CD 1 -#define MAX_BLOCKSIZE 4096 +#define MAX_BLOCKSIZE 4096 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, * leaving a few more bytes for its future use. */ @@ -3819,14 +3819,14 @@ static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked) } static BlockDriver bdrv_host_cdrom = { -.format_name= "host_cdrom", -.protocol_name = "host_cdrom", -.instance_size = sizeof(BDRVRawState), +.format_name = "host_cdrom", +.protocol_name = "host_cdrom", +.instance_size = sizeof(BDRVRawState), .bdrv_needs_filename = true, -.bdrv_probe_device = cdrom_probe_device, +.bdrv_probe_device = cdrom_probe_device, .bdrv_parse_filename = cdrom_parse_filename, -.bdrv_file_open = cdrom_open, -.bdrv_close = raw_close, +.bdrv_file_open = cdrom_open, +.bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, @@ -3835,12 +3835,12 @@ static BlockDriver bdrv_host_cdrom = { .mutable_opts= mutable_opts, .bdrv_co_invalidate_cache = raw_co_invalidate_cache, -.bdrv_co_preadv = raw_co_preadv, -.bdrv_co_pwritev= raw_co_pwritev, -.bdrv_co_flush_to_disk = raw_co_flush_to_disk, -.bdrv_refresh_limits = raw_refresh_limits, -.bdrv_co_io_plug= raw_co_io_plug, -.bdrv_co_io_unplug = raw_co_io_unplug, +.bdrv_co_preadv = raw_co_preadv, +.bdrv_co_pwritev = raw_co_pwritev, +.bdrv_co_flush_to_disk = raw_co_flush_to_disk, +.bdrv_refresh_limits = raw_refresh_limits, +.bdrv_co_io_plug = raw_co_io_plug, +.bdrv_co_io_unplug = raw_co_io_unplug, .bdrv_attach_aio_context = raw_aio_attach_aio_context, .bdrv_co_truncate = raw_co_truncate, @@ -3949,27 +3949,27 @@ static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked) } static BlockDriver bdrv_host_cdrom = { -.format_name= "host_cdrom", -.protocol_name = "host_cdrom", -.instance_size = sizeof(BDRVRawState), +.format_name = "host_cdrom", +.protocol_name = "host_cdrom", +.instance_size = sizeof(BDRVRawState), .bdrv_needs_filename = true, -.bdrv_probe_device = cdrom_probe_device, +.bdrv_probe_device = cdrom_probe_device, .bdrv_parse_filename = cdrom_parse_filename, -.bdrv_file_open = cdrom_open, -.bdrv_close = raw_close, +.bdrv_file_open = cdrom_open, +.bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, .bdrv_co_create_opts = bdrv_co_create_opts_simple, .create_opts = _create_opts_simple, -.mutable_opts = mutable_opts, +.mutable_opts= mutable_opts, -.bdrv_co_preadv = raw_co_preadv, -.bdrv_co_pwritev= raw_co_pwritev, -.bdrv_co_flush_to_disk = raw_co_flush_to_disk, -.bdrv_refresh_limits = raw_refresh_limits, -.bdrv_co_io_plug= raw_co_io_plug, -.bdrv_co_io_unplug = raw_co_io_unplug, +.bdrv_co_preadv = raw_co_preadv, +.bdrv_co_pwritev =
[PATCH] qed: remove spurious BDRV_POLL_WHILE()
This looks like a copy-paste or merge error. BDRV_POLL_WHILE() is already called above. It's not needed in the qemu_in_coroutine() case. Fixes: 9fb4dfc570ce ("qed: make bdrv_qed_do_open a coroutine_fn") Signed-off-by: Stefan Hajnoczi --- block/qed.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/qed.c b/block/qed.c index ed94bb61ca..0705a7b4e2 100644 --- a/block/qed.c +++ b/block/qed.c @@ -594,7 +594,6 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, )); BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS); } -BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS); return qoc.ret; } -- 2.39.2
Re: [PATCH 4/9] nbd: mark more coroutine_fns, do not use co_wrappers
On Thu, Mar 09, 2023 at 09:44:51AM +0100, Paolo Bonzini wrote: > Signed-off-by: Paolo Bonzini > --- > nbd/server.c | 48 > 1 file changed, 24 insertions(+), 24 deletions(-) > > diff --git a/nbd/server.c b/nbd/server.c > index a4750e41880a..6f5fcade2a54 100644 > --- a/nbd/server.c > +++ b/nbd/server.c > @@ -1409,8 +1409,8 @@ nbd_read_eof(NBDClient *client, void *buffer, size_t > size, Error **errp) > return 1; > } > > -static int nbd_receive_request(NBDClient *client, NBDRequest *request, > - Error **errp) > +static int coroutine_fn nbd_receive_request(NBDClient *client, NBDRequest > *request, > +Error **errp) > { Should we rename this nbd_co_receive_request() while at it? ... > @@ -2198,9 +2198,9 @@ static int coroutine_fn > blockalloc_to_extents(BlockBackend *blk, > * @ea is converted to BE by the function > * @last controls whether NBD_REPLY_FLAG_DONE is sent. > */ > -static int nbd_co_send_extents(NBDClient *client, uint64_t handle, > - NBDExtentArray *ea, > - bool last, uint32_t context_id, Error **errp) > +static int coroutine_fn nbd_co_send_extents(NBDClient *client, uint64_t > handle, > +NBDExtentArray *ea, > + bool last, uint32_t context_id, > Error **errp) Whitespace damage. ... > @@ -2297,8 +2297,8 @@ static int nbd_co_send_bitmap(NBDClient *client, > uint64_t handle, > * to the client (although the caller may still need to disconnect after > * reporting the error). > */ > -static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request, > - Error **errp) > +static int coroutine_fn nbd_co_receive_request(NBDRequestData *req, > NBDRequest *request, > + Error **errp) > { > NBDClient *client = req->client; > int valid_flags; > @@ -2446,7 +2446,7 @@ static coroutine_fn int nbd_do_cmd_read(NBDClient > *client, NBDRequest *request, Most uses of coroutine_fn in this patch occur after the return type, but in this and later hunks, the function has it the other way around. Should we touch that up in this patch? Likewise, should we add _co_ in the name of these pre-existing coroutine_fn functions nbd_do_cmd_read and nbd_handle_request? But I'm liking the efforts to use our annotations more consistently, particularly if it is a result of you making progress on having the compiler point out inconsistencies. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH for-8.0] ide: Fix manual in-flight count for TRIM BH
On 3/9/23 13:31, Hanna Czenczek wrote: On 09.03.23 13:08, Paolo Bonzini wrote: On Thu, Mar 9, 2023 at 1:05 PM Paolo Bonzini wrote: I think having to do this is problematic, because the blk_drain should leave no pending operation. Here it seems okay because you do it in a controlled situation, but the main thread can also issue blk_drain(), or worse bdrv_drained_begin(), and there would be pending I/O operations when it returns. Not really. We would stop in the middle of a trim that processes a list of discard requests. So I see it more like stopping in the middle of anything that processes guest requests. Once drain ends, we continue processing them, and that’s not exactly pending I/O. There is a pending object in s->bus->dma->aiocb on the IDE side, so there is a pending DMA operation, but naïvely, I don’t see that as a problem. What about the bdrv_drain_all() when a VM stops, would the guest continue to access memory and disks after bdrv_drain() return? Migration could also be a problem, because the partial TRIM would not be recorded in the s->bus->error_status field of IDEState (no surprise there, it's not an error). Also, errors happening after bdrv_drain() might not be migrated correctly. Or the issue is generally that IDE uses dma_* functions, which might cause I/O functions to be run from new BHs (I guess through reschedule_dma()?). Ah, you mean that you can have pending I/O operations while blk->in_flight is zero? That would be a problem indeed. We already have BlockDevOps for ide-cd and ide-hd, should we add a .drained_poll callback there? Hmm, what about making blk_aio_prwv non-static and calling bdrv_co_pdiscard directly from IDE? You mean transforming ide_issue_trim_cb() into an iterative coroutine (instead of being recursive and using AIO) and invoking it via blk_aio_prwv()? It doesn’t feel right to call a bdrv_* function directly from a user external to the core block layer, so in this case I’d rather fall back to Fiona’s idea of invoking all discards concurrently. Yeah, honestly it doesn't feel very much right to me either. Paolo
Re: [PATCH v2 2/3] block: make BlockBackend->disable_request_queuing atomic
On 3/9/23 13:31, Stefan Hajnoczi wrote: On Thu, Mar 09, 2023 at 10:07:40AM +0100, Paolo Bonzini wrote: On 3/7/23 22:04, Stefan Hajnoczi wrote: This field is accessed by multiple threads without a lock. Use explicit qatomic_read()/qatomic_set() calls. There is no need for acquire/release because blk_set_disable_request_queuing() doesn't provide any guarantees (it helps that it's used at BlockBackend creation time and not when there is I/O in flight). This in turn means itdoes not need to be atomic - atomics are only needed if there are concurrent writes. No big deal; I am now resurrecting the series from the time I had noticed the queued_requests thread-safety problem, so this will be simplified in 8.1. For now your version is okay, thanks for fixing it! I was under the impression that variables accessed by multiple threads outside a lock or similar primitive need memory_order_relaxed both as documentation and to tell the compiler that they should indeed be atomic (but without ordering guarantees). Atomic accesses are needed to avoid data races. Data races are concurrent accesses, of which at least one is a non-atomic write. (A is concurrent with B is you can't be sure that A happens before B or vice versa; this intuitively is the "lock or similar primitive" that you mentioned. Happens-before changes from one execution to the other, but it is enough to somehow prove there _is_ an ordering; for example, given two accesses that are done while a mutex is taken, one will always happen before the other). In this case all writes to disable_request_queuing happen not just outside I/O, but even *before* the first I/O. No writes that are concurrent with reads => no need to use atomic for reads. For example the stdin global variable is accessed from multiple threads and you would never use atomics to read the pointer. Just don't write to it and there won't be data races. I think memory_order_relaxed also tells the compiler to do a bit more, like to generate just a single store to the variable for each occurrence in the code ("speculative" and "out-of-thin air" stores). The correspondence is not necessarily 1:1, some optimizations are possible; for example this: qatomic_set(, 0); a = qatomic_read(); qatomic_set(, a + 1); can be changed to a = 0; qatomic_set(, 1); (because it is safe to assume that no other thread sees the state where x==0). Or the first read here: a = qatomic_read(); a = qatomic_read(); can be optimized out, unlike Linux's READ_ONCE(). I have no idea if compilers actually perform these optimizations, but if they do they are neither frequent (maybe in languages that inline a lot more, but not in QEMU) nor problematic. Even though there's some freedom in removing/consolidating code, it's true as you said that speculation is a no-no! It's the documentation part that's most interesting in this case. Do we not want to identify variables that are accessed outside a lock and therefore require some thought? I think it depends, see the stdin example before. As the API is now, I agree that using qatomic_read() is the right thing to do. In principle the flag could bounce back and forth many times. :/ With a better API, the balance may tilt on the side of not using atomics. We'll see when I post the patch. :) Paolo
Re: [PATCH 1/6] block: don't acquire AioContext lock in bdrv_drain_all()
On Wed, Mar 08, 2023 at 06:25:43PM +0100, Kevin Wolf wrote: > Am 08.03.2023 um 15:26 hat Stefan Hajnoczi geschrieben: > > On Wed, Mar 08, 2023 at 09:48:17AM +0100, Kevin Wolf wrote: > > > Am 07.03.2023 um 20:20 hat Stefan Hajnoczi geschrieben: > > > > On Tue, Mar 07, 2023 at 06:17:22PM +0100, Kevin Wolf wrote: > > > > > Am 01.03.2023 um 21:57 hat Stefan Hajnoczi geschrieben: > > > > > > There is no need for the AioContext lock in bdrv_drain_all() because > > > > > > nothing in AIO_WAIT_WHILE() needs the lock and the condition is > > > > > > atomic. > > > > > > > > > > > > Note that the NULL AioContext argument to AIO_WAIT_WHILE() is odd. > > > > > > In > > > > > > the future it can be removed. > > > > > > > > > > It can be removed for all callers that run in the main loop context. > > > > > For > > > > > code running in an iothread, it's still important to pass a non-NULL > > > > > context. This makes me doubt that the ctx parameter can really be > > > > > removed without changing more. > > > > > > > > > > Is your plan to remove the if from AIO_WAIT_WHILE_INTERNAL(), too, and > > > > > to poll qemu_get_current_aio_context() instead of ctx_ or the main > > > > > context? > > > > > > > > This is what I'd like once everything has been converted to > > > > AIO_WAIT_WHILE_UNLOCKED() - and at this point we might as well call it > > > > AIO_WAIT_WHILE() again: > > > > > > > > #define AIO_WAIT_WHILE(cond) ({\ > > > > bool waited_ = false; \ > > > > AioWait *wait_ = _aio_wait; \ > > > > /* Increment wait_->num_waiters before evaluating cond. */ \ > > > > qatomic_inc(_->num_waiters); \ > > > > /* Paired with smp_mb in aio_wait_kick(). */ \ > > > > smp_mb(); \ > > > > while ((cond)) { \ > > > > aio_poll(qemu_get_current_aio_context(), true);\ > > > > waited_ = true;\ > > > > } \ > > > > qatomic_dec(_->num_waiters); \ > > > > waited_; }) > > > > > > Ok, yes, this is what I tried to describe above. > > > > > > > However, I just realized this only works in the main loop thread because > > > > that's where aio_wait_kick() notifications are received. An IOThread > > > > running AIO_WAIT_WHILE() won't be woken when another thread (including > > > > the main loop thread) calls aio_wait_kick(). > > > > > > Which is of course a limitation we already have today. You can wait for > > > things in your own iothread, or for all threads from the main loop. > > > > > > However, in the future multiqueue world, the first case probably becomes > > > pretty much useless because even for the same node, you could get > > > activity in any thread. > > > > > > So essentially AIO_WAIT_WHILE() becomes GLOBAL_STATE_CODE(). Which is > > > probably a good idea anyway, but I'm not entirely sure how many places > > > we currently have where it's called from an iothread. I know the drain > > > in mirror_run(), but Emanuele already had a patch in his queue where > > > bdrv_co_yield_to_drain() schedules drain in the main context, so if that > > > works, mirror_run() would be solved. > > > > > > https://gitlab.com/eesposit/qemu/-/commit/63562351aca4fb05d5711eb410feb96e64b5d4ad > > > > > > > I would propose introducing a QemuCond for each condition that we wait > > > > on, but QemuCond lacks event loop integration. The current thread would > > > > be unable to run aio_poll() while also waiting on a QemuCond. > > > > > > > > Life outside coroutines is hard, man! I need to think about this more. > > > > Luckily this problem doesn't block this patch series. > > > > > > I hope that we don't really need all of this if we can limit running > > > synchronous code to the main loop. > > > > Great idea, I think you're right. > > > > I'll audit the code to find the IOThread AIO_WAIT_WHILE() callers and > > maybe a future patch series can work on that. > > > > > > > > There is an assertion in > > > > > > AIO_WAIT_WHILE() that checks that we're in the main loop AioContext > > > > > > and > > > > > > we would lose that check by dropping the argument. However, that > > > > > > was a > > > > > > precursor to the GLOBAL_STATE_CODE()/IO_CODE() macros and is now a > > > > > > duplicate check. So I think we won't lose much by dropping it, but > > > > > > let's > > > > > > do a few more AIO_WAIT_WHILE_UNLOCKED() coversions of this sort to > > > > > > confirm this is the case. > > > > > > > > > > > > Signed-off-by: Stefan Hajnoczi > > > > > > > > > > Yes, it seems that we don't lose much, except maybe some consistency > > > > > in > > > > > the intermediate state. The
Re: [PATCH for-8.0] ide: Fix manual in-flight count for TRIM BH
On 09.03.23 13:08, Paolo Bonzini wrote: On Thu, Mar 9, 2023 at 1:05 PM Paolo Bonzini wrote: I think having to do this is problematic, because the blk_drain should leave no pending operation. Here it seems okay because you do it in a controlled situation, but the main thread can also issue blk_drain(), or worse bdrv_drained_begin(), and there would be pending I/O operations when it returns. Not really. We would stop in the middle of a trim that processes a list of discard requests. So I see it more like stopping in the middle of anything that processes guest requests. Once drain ends, we continue processing them, and that’s not exactly pending I/O. There is a pending object in s->bus->dma->aiocb on the IDE side, so there is a pending DMA operation, but naïvely, I don’t see that as a problem. Unfortunately I don't have a solution (I'm not considering the idea of using disable_request_queuing and having even more atomics magic in block/block-backend.c), but I'll read the thread. I wouldn’t disable request queuing, because its introducing commit message (cf3129323f900ef5ddbccbe86e4fa801e88c566e) specifically says it fixes IDE. I suppose the reason might actually be this problem here, in that before request queuing, it was possible that IDE would continue issuing discard requests even while drained, because processing the list didn’t stop. Maybe. Or the issue is generally that IDE uses dma_* functions, which might cause I/O functions to be run from new BHs (I guess through reschedule_dma()?). Hmm, what about making blk_aio_prwv non-static and calling bdrv_co_pdiscard directly from IDE? You mean transforming ide_issue_trim_cb() into an iterative coroutine (instead of being recursive and using AIO) and invoking it via blk_aio_prwv()? It doesn’t feel right to call a bdrv_* function directly from a user external to the core block layer, so in this case I’d rather fall back to Fiona’s idea of invoking all discards concurrently. Hanna
Re: [PATCH v2 2/3] block: make BlockBackend->disable_request_queuing atomic
On Thu, Mar 09, 2023 at 10:07:40AM +0100, Paolo Bonzini wrote: > On 3/7/23 22:04, Stefan Hajnoczi wrote: > > This field is accessed by multiple threads without a lock. Use explicit > > qatomic_read()/qatomic_set() calls. There is no need for acquire/release > > because blk_set_disable_request_queuing() doesn't provide any > > guarantees (it helps that it's used at BlockBackend creation time and > > not when there is I/O in flight). > > This in turn means itdoes not need to be atomic - atomics are only needed if > there are concurrent writes. No big deal; I am now resurrecting the series > from the time I had noticed the queued_requests thread-safety problem, so > this will be simplified in 8.1. For now your version is okay, thanks for > fixing it! I was under the impression that variables accessed by multiple threads outside a lock or similar primitive need memory_order_relaxed both as documentation and to tell the compiler that they should indeed be atomic (but without ordering guarantees). I think memory_order_relaxed also tells the compiler to do a bit more, like to generate just a single store to the variable for each occurrence in the code ("speculative" and "out-of-thin air" stores). It's the documentation part that's most interesting in this case. Do we not want to identify variables that are accessed outside a lock and therefore require some thought? Stefan signature.asc Description: PGP signature
Re: [PATCH v2 2/3] block: make BlockBackend->disable_request_queuing atomic
On Thu, Mar 09, 2023 at 12:18:00PM +0100, Paolo Bonzini wrote: > On 3/7/23 22:04, Stefan Hajnoczi wrote: > > static int coroutine_fn GRAPH_RDLOCK > > @@ -1271,7 +1271,8 @@ static void coroutine_fn > > blk_wait_while_drained(BlockBackend *blk) > > { > > assert(blk->in_flight > 0); > > -if (qatomic_read(>quiesce_counter) && > > !blk->disable_request_queuing) { > > +if (qatomic_read(>quiesce_counter) && > > +!qatomic_read(>disable_request_queuing)) { > > The qatomic_inc in blk_inc_in_flight() made me a bit nervous that > smp_mb__after_rmw() was needed there, but it's okay. Yes. I wrote it under the assumption that sequentially consistent operations like qatomic_inc() are implicit barriers. > First, anyway blk_wait_while_drained() has to _eventually_ pause the device, > not immediately. Even if it misses that blk->quiesce_counter == 1, the I/O > will proceed and it'll just take a little more polling before > bdrv_drained_begin() exits. > > Second, I checked with CPPMEM the barriers in AIO_WAIT_WHILE() and > aio_wait_kick() save the day, even if loading blk->quiesce_counter is > reordered before the incremented value (1) is stored to blk->in_flight. > > The CPPMEM model here uses mo_relaxed to force all possible kinds of havoc: > > int main() { > atomic_int quiesce_counter = 0; > atomic_int waiters = 0; > atomic_int in_flight = 0; > > {{{ { quiesce_counter.store(1, mo_relaxed); > waiters.store(1, mo_relaxed);// AIO_WAIT_WHILE starts here > atomic_thread_fence(mo_seq_cst); > in_flight.load(mo_relaxed).readsvalue(1); } // if 1, sleep > > ||| { in_flight.store(1, mo_relaxed); // bdrv_inc_in_flight > quiesce_counter.load(mo_relaxed).readsvalue(1); // go down "if" > in_flight.store(0, mo_release); // bdrv_dec_in_flight > atomic_thread_fence(mo_seq_cst); // aio_wait_kick starts here > waiters.load(mo_relaxed).readsvalue(0); } // if 0, do not wake > }}}; > > return 0; > } > > > Because CPPMEM shows no execution consistent with the buggy .readsvalue(), > either AIO_WAIT_WHILE will not go to sleep or it will be woken up with > in_flight == 0. The polling loop ends and drained_end restarts the > coroutine from blk->queued_requests. Okay. Stefan signature.asc Description: PGP signature
Re: [PATCH for-8.0] ide: Fix manual in-flight count for TRIM BH
On Thu, Mar 9, 2023 at 1:05 PM Paolo Bonzini wrote: > I think having to do this is problematic, because the blk_drain should > leave no pending operation. > > Here it seems okay because you do it in a controlled situation, but the > main thread can also issue blk_drain(), or worse bdrv_drained_begin(), > and there would be pending I/O operations when it returns. > > Unfortunately I don't have a solution (I'm not considering the idea of > using disable_request_queuing and having even more atomics magic in > block/block-backend.c), but I'll read the thread. Hmm, what about making blk_aio_prwv non-static and calling bdrv_co_pdiscard directly from IDE? Paolo
Re: [PATCH for-8.0] ide: Fix manual in-flight count for TRIM BH
On 3/9/23 12:44, Hanna Czenczek wrote: + * + * Note that TRIM operations call blk_aio_pdiscard() multiple + * times (and finally increment s->blk's in-flight counter while + * ide_trim_bh_cb() is scheduled), so we have to loop blk_drain() + * until the whole operation is done. */ if (s->bus->dma->aiocb) { trace_ide_cancel_dma_sync_remaining(); -blk_drain(s->blk); -assert(s->bus->dma->aiocb == NULL); +while (s->bus->dma->aiocb) { +blk_drain(s->blk); +} I think having to do this is problematic, because the blk_drain should leave no pending operation. Here it seems okay because you do it in a controlled situation, but the main thread can also issue blk_drain(), or worse bdrv_drained_begin(), and there would be pending I/O operations when it returns. Unfortunately I don't have a solution (I'm not considering the idea of using disable_request_queuing and having even more atomics magic in block/block-backend.c), but I'll read the thread. Paolo
[PATCH for-8.0] ide: Fix manual in-flight count for TRIM BH
Commit 7e5cdb345f77d76cb4877fe6230c4e17a7d0d0ca ("ide: Increment BB in-flight counter for TRIM BH") fixed a problem when cancelling trims: ide_cancel_dma_sync() drains the BB to stop a potentially ongoing request, and then asserts that no request is active anymore. When trying to cancel a trim, this would only drain a single blk_aio_pdiscard() operation, but not necessarily the trim as a whole. Said commit fixed that by counting the whole trim as an operation, incrementing the in-flight counter for it, so the blk_drain() in ide_cancel_dma_sync() would wait for it. Fiona found a problem with this, specifically that draining a BB while an IDE trim operation is going on can produce a deadlock: An ongoing blk_aio_pdiscard() will be settled, but any further discard in the same trim will go into the BB's queued_request list and wait there until the drain stops. Meanwhile, the whole trim keeps the BB's in_flight_counter incremented, so the BB will never be considered drained, and qemu hangs. Therefore, we cannot keep the in-flight counter incremented throughout the trim operation. We should still increment it before the completion BH (ide_trim_bh_cb()) is scheduled and decrement it there, so that the blk_drain() in ide_cancel_dma_sync() can await completion of this scheduled BH. With that change, however, it can take multiple iterations of blk_drain() to really settle the whole trim operation, and so ide_cancel_dma_sync() must run it in a loop. Reported-by: Fiona Ebner Fixes: 7e5cdb345f77d76cb4877fe6230c4e17a7d0d0ca ("ide: Increment BB in-flight counter for TRIM BH") Signed-off-by: Hanna Czenczek --- Tested with a small test boot sector that issues a trim operation with any number of discards from 0 to 64. Before this patch, doing so hangs starting from 2 discards; and before 7e5cdb345f, doing so crashes qemu with any number of discards. With this patch, it always works. --- hw/ide/core.c | 23 +-- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/hw/ide/core.c b/hw/ide/core.c index 2d034731cf..54c51084d2 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -444,7 +444,7 @@ static void ide_trim_bh_cb(void *opaque) iocb->bh = NULL; qemu_aio_unref(iocb); -/* Paired with an increment in ide_issue_trim() */ +/* Paired with an increment in ide_issue_trim_cb() */ blk_dec_in_flight(blk); } @@ -504,6 +504,14 @@ static void ide_issue_trim_cb(void *opaque, int ret) done: iocb->aiocb = NULL; if (iocb->bh) { +/* + * Paired with a decrement in ide_trim_bh_cb(): Ensure we have + * an in-flight count while this TrimAIOCB object lives. + * There is no ongoing blk_aio_pdiscard() operation anymore, + * so here we increment the counter manually before returning. + */ +blk_inc_in_flight(s->blk); + replay_bh_schedule_event(iocb->bh); } } @@ -515,9 +523,6 @@ BlockAIOCB *ide_issue_trim( IDEState *s = opaque; TrimAIOCB *iocb; -/* Paired with a decrement in ide_trim_bh_cb() */ -blk_inc_in_flight(s->blk); - iocb = blk_aio_get(_aiocb_info, s->blk, cb, cb_opaque); iocb->s = s; iocb->bh = qemu_bh_new(ide_trim_bh_cb, iocb); @@ -737,11 +742,17 @@ void ide_cancel_dma_sync(IDEState *s) * In the future we'll be able to safely cancel the I/O if the * whole DMA operation will be submitted to disk with a single * aio operation with preadv/pwritev. + * + * Note that TRIM operations call blk_aio_pdiscard() multiple + * times (and finally increment s->blk's in-flight counter while + * ide_trim_bh_cb() is scheduled), so we have to loop blk_drain() + * until the whole operation is done. */ if (s->bus->dma->aiocb) { trace_ide_cancel_dma_sync_remaining(); -blk_drain(s->blk); -assert(s->bus->dma->aiocb == NULL); +while (s->bus->dma->aiocb) { +blk_drain(s->blk); +} } } -- 2.39.1
[PATCH nbd 1/4] nbd: Add multi-conn option
Add multi-conn option to the NBD client. This commit just adds the option, it is not functional. Setting this to a value > 1 permits multiple connections to the NBD server; a typical value might be 4. The default is 1, meaning only a single connection is made. If the NBD server does not advertise that it is safe for multi-conn then this setting is forced to 1. Signed-off-by: Richard W.M. Jones --- block/nbd.c | 24 1 file changed, 24 insertions(+) diff --git a/block/nbd.c b/block/nbd.c index bf2894ad5c..5ffae0b798 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -49,6 +49,7 @@ #define EN_OPTSTR ":exportname=" #define MAX_NBD_REQUESTS16 +#define MAX_MULTI_CONN 16 #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ (uint64_t)(intptr_t)(bs)) #define INDEX_TO_HANDLE(bs, index) ((index) ^ (uint64_t)(intptr_t)(bs)) @@ -98,6 +99,7 @@ typedef struct BDRVNBDState { /* Connection parameters */ uint32_t reconnect_delay; uint32_t open_timeout; +uint32_t multi_conn; SocketAddress *saddr; char *export; char *tlscredsid; @@ -1803,6 +1805,15 @@ static QemuOptsList nbd_runtime_opts = { "attempts until successful or until @open-timeout seconds " "have elapsed. Default 0", }, +{ +.name = "multi-conn", +.type = QEMU_OPT_NUMBER, +.help = "If > 1 permit up to this number of connections to the " +"server. The server must also advertise multi-conn " +"support. If <= 1, only a single connection is made " +"to the server even if the server advertises multi-conn. " +"Default 1", +}, { /* end of list */ } }, }; @@ -1858,6 +1869,10 @@ static int nbd_process_options(BlockDriverState *bs, QDict *options, s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 0); s->open_timeout = qemu_opt_get_number(opts, "open-timeout", 0); +s->multi_conn = qemu_opt_get_number(opts, "multi-conn", 1); +if (s->multi_conn > MAX_MULTI_CONN) { +s->multi_conn = MAX_MULTI_CONN; +} ret = 0; @@ -1912,6 +1927,15 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, nbd_client_connection_enable_retry(s->conn); +/* + * We set s->multi_conn in nbd_process_options above, but now that + * we have connected if the server doesn't advertise that it is + * safe for multi-conn, force it to 1. + */ +if (!(s->info.flags & NBD_FLAG_CAN_MULTI_CONN)) { +s->multi_conn = 1; +} + return 0; fail: -- 2.39.2
[PATCH nbd 2/4] nbd: Split out block device state from underlying NBD connections
To implement multi-conn, we will put multiple underlying NBD connections (ie. NBDClientConnection) inside the NBD block device handle (BDRVNBDState). This requires first breaking the one-to-one relationship between NBDClientConnection and BDRVNBDState. To do this a new structure (NBDConnState) is implemented. NBDConnState takes all the per-connection state out of the block driver struct. BDRVNBDState now contains a conns[] array of pointers to NBDConnState, one for each underlying multi-conn connection. After this change there is still a one-to-one relationship because we only ever use the zeroth slot in the conns[] array. Thus this does not implement multi-conn yet. Signed-off-by: Richard W.M. Jones --- block/coroutines.h | 5 +- block/nbd.c| 674 - 2 files changed, 358 insertions(+), 321 deletions(-) diff --git a/block/coroutines.h b/block/coroutines.h index dd9f3d449b..14b01d8591 100644 --- a/block/coroutines.h +++ b/block/coroutines.h @@ -62,7 +62,7 @@ int coroutine_fn GRAPH_RDLOCK bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); int coroutine_fn -nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking, +nbd_co_do_establish_connection(BlockDriverState *bs, void *cs, bool blocking, Error **errp); @@ -86,6 +86,7 @@ bdrv_common_block_status_above(BlockDriverState *bs, BlockDriverState **file, int *depth); int co_wrapper_mixed -nbd_do_establish_connection(BlockDriverState *bs, bool blocking, Error **errp); +nbd_do_establish_connection(BlockDriverState *bs, void *cs, bool blocking, +Error **errp); #endif /* BLOCK_COROUTINES_H */ diff --git a/block/nbd.c b/block/nbd.c index 5ffae0b798..84e8a1add0 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -51,8 +51,8 @@ #define MAX_NBD_REQUESTS16 #define MAX_MULTI_CONN 16 -#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ (uint64_t)(intptr_t)(bs)) -#define INDEX_TO_HANDLE(bs, index) ((index) ^ (uint64_t)(intptr_t)(bs)) +#define HANDLE_TO_INDEX(cs, handle) ((handle) ^ (uint64_t)(intptr_t)(cs)) +#define INDEX_TO_HANDLE(cs, index) ((index) ^ (uint64_t)(intptr_t)(cs)) typedef struct { Coroutine *coroutine; @@ -67,7 +67,10 @@ typedef enum NBDClientState { NBD_CLIENT_QUIT } NBDClientState; -typedef struct BDRVNBDState { +/* A single client connection. */ +typedef struct NBDConnState { +struct BDRVNBDState *s; /* Points to enclosing BDRVNBDState */ + QIOChannel *ioc; /* The current I/O channel */ NBDExportInfo info; @@ -94,7 +97,12 @@ typedef struct BDRVNBDState { QEMUTimer *open_timer; -BlockDriverState *bs; +NBDClientConnection *conn; +} NBDConnState; + +typedef struct BDRVNBDState { +/* The underlying NBD connections */ +NBDConnState *conns[MAX_MULTI_CONN]; /* Connection parameters */ uint32_t reconnect_delay; @@ -108,7 +116,7 @@ typedef struct BDRVNBDState { char *x_dirty_bitmap; bool alloc_depth; -NBDClientConnection *conn; +BlockDriverState *bs; } BDRVNBDState; static void nbd_yank(void *opaque); @@ -117,14 +125,16 @@ static void nbd_clear_bdrvstate(BlockDriverState *bs) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; -nbd_client_connection_release(s->conn); -s->conn = NULL; +nbd_client_connection_release(s->conns[0]->conn); +s->conns[0]->conn = NULL; yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name)); /* Must not leave timers behind that would access freed data */ -assert(!s->reconnect_delay_timer); -assert(!s->open_timer); +assert(!s->conns[0]->reconnect_delay_timer); +assert(!s->conns[0]->open_timer); + +g_free(s->conns[0]); object_unref(OBJECT(s->tlscreds)); qapi_free_SocketAddress(s->saddr); @@ -151,139 +161,143 @@ static bool coroutine_fn nbd_recv_coroutine_wake_one(NBDClientRequest *req) return false; } -static void coroutine_fn nbd_recv_coroutines_wake(BDRVNBDState *s) +static void coroutine_fn nbd_recv_coroutines_wake(NBDConnState *cs) { int i; -QEMU_LOCK_GUARD(>receive_mutex); +QEMU_LOCK_GUARD(>receive_mutex); for (i = 0; i < MAX_NBD_REQUESTS; i++) { -if (nbd_recv_coroutine_wake_one(>requests[i])) { +if (nbd_recv_coroutine_wake_one(>requests[i])) { return; } } } /* Called with s->requests_lock held. */ -static void coroutine_fn nbd_channel_error_locked(BDRVNBDState *s, int ret) +static void coroutine_fn nbd_channel_error_locked(NBDConnState *cs, int ret) { -if (s->state == NBD_CLIENT_CONNECTED) { -qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); +if (cs->state == NBD_CLIENT_CONNECTED) { +qio_channel_shutdown(cs->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } if (ret == -EIO) { -if (s->state ==
[PATCH nbd 3/4] nbd: Open multiple NBD connections if multi-conn is set
If the user opts in to multi-conn and the server advertises it, open multiple NBD connections. They are opened with the same parameters as the initial connection. Similarly on the close path the connections are closed. However only the zeroth connection is used, so this commit does not actually enable multi-conn capabilities. (XXX) The strategy here is very naive. Firstly if you were going to open them, you'd probably want to open them in parallel. Secondly it would make sense to delay opening until multiple parallel requests are seen (perhaps above some threshold), so that simple or shortlived NBD operations do not require multiple connections to be made. Signed-off-by: Richard W.M. Jones --- block/nbd.c | 128 1 file changed, 90 insertions(+), 38 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index 84e8a1add0..4c99c3f865 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -124,18 +124,23 @@ static void nbd_yank(void *opaque); static void nbd_clear_bdrvstate(BlockDriverState *bs) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; +size_t i; -nbd_client_connection_release(s->conns[0]->conn); -s->conns[0]->conn = NULL; +for (i = 0; i < MAX_MULTI_CONN; ++i) { +if (s->conns[i]) { +nbd_client_connection_release(s->conns[i]->conn); +s->conns[i]->conn = NULL; + +/* Must not leave timers behind that would access freed data */ +assert(!s->conns[i]->reconnect_delay_timer); +assert(!s->conns[i]->open_timer); + +g_free(s->conns[i]); +} +} yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name)); -/* Must not leave timers behind that would access freed data */ -assert(!s->conns[0]->reconnect_delay_timer); -assert(!s->conns[0]->open_timer); - -g_free(s->conns[0]); - object_unref(OBJECT(s->tlscreds)); qapi_free_SocketAddress(s->saddr); s->saddr = NULL; @@ -1905,43 +1910,39 @@ static int nbd_process_options(BlockDriverState *bs, QDict *options, return ret; } -static int nbd_open(BlockDriverState *bs, QDict *options, int flags, -Error **errp) +static NBDConnState *init_conn_state(BDRVNBDState *s) { +NBDConnState *cs; + +cs = g_new0(NBDConnState, 1); +cs->s = s; +qemu_mutex_init(>requests_lock); +qemu_co_queue_init(>free_sema); +qemu_co_mutex_init(>send_mutex); +qemu_co_mutex_init(>receive_mutex); + +return cs; +} + +static int conn_state_connect(BlockDriverState *bs, NBDConnState *cs, + Error **errp) +{ +BDRVNBDState *s = cs->s; int ret; -BDRVNBDState *s = (BDRVNBDState *)bs->opaque; -s->bs = bs; - -s->conns[0] = g_new0(NBDConnState, 1); -s->conns[0]->s = s; -qemu_mutex_init(>conns[0]->requests_lock); -qemu_co_queue_init(>conns[0]->free_sema); -qemu_co_mutex_init(>conns[0]->send_mutex); -qemu_co_mutex_init(>conns[0]->receive_mutex); - -if (!yank_register_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name), errp)) { -return -EEXIST; -} - -ret = nbd_process_options(bs, options, errp); -if (ret < 0) { -goto fail; -} - -s->conns[0]->conn = +cs->conn = nbd_client_connection_new(s->saddr, true, s->export, s->x_dirty_bitmap, s->tlscreds, s->tlshostname); if (s->open_timeout) { -nbd_client_connection_enable_retry(s->conns[0]->conn); -open_timer_init(s->conns[0], qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + +nbd_client_connection_enable_retry(cs->conn); +open_timer_init(cs, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->open_timeout * NANOSECONDS_PER_SECOND); } -s->conns[0]->state = NBD_CLIENT_CONNECTING_WAIT; -ret = nbd_do_establish_connection(bs, s->conns[0], true, errp); +cs->state = NBD_CLIENT_CONNECTING_WAIT; +ret = nbd_do_establish_connection(bs, cs, true, errp); if (ret < 0) { goto fail; } @@ -1951,9 +1952,44 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, * Delete it, because we do not want it to be around when this node * is drained or closed. */ -open_timer_del(s->conns[0]); +open_timer_del(cs); -nbd_client_connection_enable_retry(s->conns[0]->conn); +nbd_client_connection_enable_retry(cs->conn); + +return 0; + +fail: +open_timer_del(cs); +return ret; +} + +static int nbd_open(BlockDriverState *bs, QDict *options, int flags, +Error **errp) +{ +int ret; +BDRVNBDState *s = (BDRVNBDState *)bs->opaque; +size_t i; + +s->bs = bs; + +if (!yank_register_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name), errp)) { +return -EEXIST; +} + +ret = nbd_process_options(bs, options, errp); +if (ret < 0) { +goto fail; +} + +/* +
[PATCH nbd 4/4] nbd: Enable multi-conn using round-robin
Enable NBD multi-conn by spreading operations across multiple connections. (XXX) This uses a naive round-robin approach which could be improved. For example we could look at how many requests are in flight and assign operations to the connections with fewest. Or we could try to estimate (based on size of requests outstanding) the load on each connection. But this implementation doesn't do any of that. Signed-off-by: Richard W.M. Jones --- block/nbd.c | 67 +++-- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index 4c99c3f865..df32ba67ed 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -1232,6 +1232,26 @@ static int coroutine_fn nbd_co_request(NBDConnState *cs, NBDRequest *request, return ret ? ret : request_ret; } +/* + * If multi-conn, choose a connection for this operation. + */ +static NBDConnState *choose_connection(BDRVNBDState *s) +{ +static size_t next; +size_t i; + +if (s->multi_conn <= 1) { +return s->conns[0]; +} + +/* XXX Stupid simple round robin. */ +i = qatomic_fetch_inc(); +i %= s->multi_conn; + +assert(s->conns[i] != NULL); +return s->conns[i]; +} + static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) @@ -1244,7 +1264,7 @@ static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offse .from = offset, .len = bytes, }; -NBDConnState * const cs = s->conns[0]; +NBDConnState * const cs = choose_connection(s); assert(bytes <= NBD_MAX_BUFFER_SIZE); @@ -1301,7 +1321,7 @@ static int coroutine_fn nbd_client_co_pwritev(BlockDriverState *bs, int64_t offs .from = offset, .len = bytes, }; -NBDConnState * const cs = s->conns[0]; +NBDConnState * const cs = choose_connection(s); assert(!(cs->info.flags & NBD_FLAG_READ_ONLY)); if (flags & BDRV_REQ_FUA) { @@ -1326,7 +1346,7 @@ static int coroutine_fn nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_ .from = offset, .len = bytes, /* .len is uint32_t actually */ }; -NBDConnState * const cs = s->conns[0]; +NBDConnState * const cs = choose_connection(s); assert(bytes <= UINT32_MAX); /* rely on max_pwrite_zeroes */ @@ -1357,7 +1377,13 @@ static int coroutine_fn nbd_client_co_flush(BlockDriverState *bs) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; NBDRequest request = { .type = NBD_CMD_FLUSH }; -NBDConnState * const cs = s->conns[0]; + +/* + * Multi-conn (if used) guarantees that flushing on any connection + * flushes caches on all connections, so we can perform this + * operation on any. + */ +NBDConnState * const cs = choose_connection(s); if (!(cs->info.flags & NBD_FLAG_SEND_FLUSH)) { return 0; @@ -1378,7 +1404,7 @@ static int coroutine_fn nbd_client_co_pdiscard(BlockDriverState *bs, int64_t off .from = offset, .len = bytes, /* len is uint32_t */ }; -NBDConnState * const cs = s->conns[0]; +NBDConnState * const cs = choose_connection(s); assert(bytes <= UINT32_MAX); /* rely on max_pdiscard */ @@ -1398,7 +1424,7 @@ static int coroutine_fn nbd_client_co_block_status( NBDExtent extent = { 0 }; BDRVNBDState *s = (BDRVNBDState *)bs->opaque; Error *local_err = NULL; -NBDConnState * const cs = s->conns[0]; +NBDConnState * const cs = choose_connection(s); NBDRequest request = { .type = NBD_CMD_BLOCK_STATUS, @@ -2027,7 +2053,7 @@ static int coroutine_fn nbd_co_flush(BlockDriverState *bs) static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; -NBDConnState * const cs = s->conns[0]; +NBDConnState * const cs = choose_connection(s); uint32_t min = cs->info.min_block; uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, cs->info.max_block); @@ -2085,7 +2111,7 @@ static int coroutine_fn nbd_co_truncate(BlockDriverState *bs, int64_t offset, BdrvRequestFlags flags, Error **errp) { BDRVNBDState *s = bs->opaque; -NBDConnState * const cs = s->conns[0]; +NBDConnState * const cs = choose_connection(s); if (offset != cs->info.size && exact) { error_setg(errp, "Cannot resize NBD nodes"); @@ -2168,24 +2194,29 @@ static const char *const nbd_strong_runtime_opts[] = { static void nbd_cancel_in_flight(BlockDriverState *bs) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; -NBDConnState * const cs = s->conns[0]; +size_t i; +NBDConnState *cs; -reconnect_delay_timer_del(cs); +for (i = 0; i < MAX_MULTI_CONN; ++i) { +cs = s->conns[i]; -qemu_mutex_lock(>requests_lock); -if (cs->state ==
[PATCH nbd 0/4] Enable multi-conn NBD [for discussion only]
[ Patch series also available here, along with this cover letter and the script used to generate test results: https://gitlab.com/rwmjones/qemu/-/commits/2023-nbd-multi-conn-v1 ] This patch series adds multi-conn support to the NBD block driver in qemu. It is only meant for discussion and testing because it has a number of obvious shortcomings (see "XXX" in commit messages and code). If we decided this was a good idea, we can work on a better patch. The Network Block Driver (NBD) protocol allows servers to advertise that they are capable of multi-conn. This means they obey certain requirements around how data is cached, allowing multiple connections to be safely opened to the NBD server. For example, a flush or FUA operation on one connection is guaranteed to flush the cache on all connections. Clients that use multi-conn can achieve better performance. This seems to be down to at least two factors: - Avoids "head of line blocking" of large requests. - With NBD over Unix domain sockets, more cores can be used. qemu-nbd, nbdkit and libnbd have all supported multi-conn for ages, but qemu's built in NBD client does not, which is what this patch fixes. Below I've produced a few benchmarks. Note these are mostly concoted to try to test NBD performance and may not make sense in their own terms (eg. qemu's disk image layer has a curl client so you wouldn't need to run one separately). In the real world we use long pipelines of NBD operations where different tools are mixed together to achieve efficient downloads, conversions, disk modifications and sparsification, and they would exercise different aspects of this. I've also included nbdcopy as a client for comparison in some tests. All tests were run 4 times, the first result discarded, and the last 3 averaged. If any of the last 3 were > 10% away from the average then the test was stopped. My summary: - It works effectively for qemu client & nbdkit server, especially in cases where the server does large, heavyweight requests. This is important for us because virt-v2v uses an nbdkit Python plugin and various other heavyweight plugins (eg. plugins that access remote servers for each request). - It seems to make little or no difference with qemu + qemu-nbd server. I speculate that's because qemu-nbd doesn't support system threads, so networking is bottlenecked through a single core. Even though there are coroutines handling different sockets, they must all wait in turn to issue send(3) or recv(3) calls on the same core. - qemu-img unfortunately uses a single thread for all coroutines so it suffers from a similar problem to qemu-nbd. This change would be much more effective if we could distribute coroutines across threads. - For tests which are highly bottlenecked on disk I/O (eg. the large local file test and null test) multi-conn doesn't make much difference. - Multi-conn even with only 2 connections can make up for the overhead of range requests, exceeding the performance of wget. - In the curlremote test, qemu-nbd is especially slow, for unknown reasons. Integrity test (./multi-conn.pl integrity) == nbdkit-sparse-random-plugin | ^ | nbd+unix| nbd+unix v | qemu-img convert Reading from and writing the same data back to nbdkit sparse-random plugin checks that the data written is the same as the data read. This uses two Unix domain sockets, with or without multi-conn. This test is mainly here to check we don't crash or corrupt data with this patch. server clientmulti-conn --- nbdkitqemu-img [u/s] 9.07s nbdkitqemu-img 1 9.05s nbdkitqemu-img 2 9.02s nbdkitqemu-img 4 8.98s [u/s] = upstream qemu 7.2.0 Curl local server test (./multi-conn.pl curlhttp) = Localhost Apache serving a file over http | | http v nbdkit-curl-plugin or qemu-nbd | | nbd+unix v qemu-img convert or nbdcopy We download an image from a local web server through nbdkit-curl-plugin or qemu-nbd using the curl block driver, over NBD. The image is copied to /dev/null. server clientmulti-conn --- qemu-nbd nbdcopy 1 8.88s qemu-nbd nbdcopy 2 8.64s qemu-nbd nbdcopy 4 8.37s qemu-nbdqemu-img [u/s] 6.47s qemu-nbdqemu-img 1 6.56s qemu-nbdqemu-img 2 6.63s qemu-nbdqemu-img 4 6.50s nbdkit nbdcopy 1 12.15s nbdkit
Re: [PATCH v2 2/3] block: make BlockBackend->disable_request_queuing atomic
On 3/7/23 22:04, Stefan Hajnoczi wrote: static int coroutine_fn GRAPH_RDLOCK @@ -1271,7 +1271,8 @@ static void coroutine_fn blk_wait_while_drained(BlockBackend *blk) { assert(blk->in_flight > 0); -if (qatomic_read(>quiesce_counter) && !blk->disable_request_queuing) { +if (qatomic_read(>quiesce_counter) && +!qatomic_read(>disable_request_queuing)) { The qatomic_inc in blk_inc_in_flight() made me a bit nervous that smp_mb__after_rmw() was needed there, but it's okay. First, anyway blk_wait_while_drained() has to _eventually_ pause the device, not immediately. Even if it misses that blk->quiesce_counter == 1, the I/O will proceed and it'll just take a little more polling before bdrv_drained_begin() exits. Second, I checked with CPPMEM the barriers in AIO_WAIT_WHILE() and aio_wait_kick() save the day, even if loading blk->quiesce_counter is reordered before the incremented value (1) is stored to blk->in_flight. The CPPMEM model here uses mo_relaxed to force all possible kinds of havoc: int main() { atomic_int quiesce_counter = 0; atomic_int waiters = 0; atomic_int in_flight = 0; {{{ { quiesce_counter.store(1, mo_relaxed); waiters.store(1, mo_relaxed);// AIO_WAIT_WHILE starts here atomic_thread_fence(mo_seq_cst); in_flight.load(mo_relaxed).readsvalue(1); } // if 1, sleep ||| { in_flight.store(1, mo_relaxed); // bdrv_inc_in_flight quiesce_counter.load(mo_relaxed).readsvalue(1); // go down "if" in_flight.store(0, mo_release); // bdrv_dec_in_flight atomic_thread_fence(mo_seq_cst); // aio_wait_kick starts here waiters.load(mo_relaxed).readsvalue(0); } // if 0, do not wake }}}; return 0; } Because CPPMEM shows no execution consistent with the buggy .readsvalue(), either AIO_WAIT_WHILE will not go to sleep or it will be woken up with in_flight == 0. The polling loop ends and drained_end restarts the coroutine from blk->queued_requests. Paolo blk_dec_in_flight(blk); qemu_co_queue_wait(>queued_requests, NULL); blk_inc_in_flight(blk); -- 2.39.2
Re: [PATCH 0/3] block: remove separate bdrv_file_open callback
On 9/3/23 09:50, Paolo Bonzini wrote: The value of the bdrv_file_open is sometimes checked to distinguish protocol and format drivers, but apart from that there is no difference between bdrv_file_open and bdrv_open. However, they can all be distinguished by the non-NULL .protocol_name member. Change the checks to use .protocol_name instead of .bdrv_file_open, and unify the two callbacks. Nice cleanup. Series: Reviewed-by: Philippe Mathieu-Daudé Paolo Bonzini (3): block: make assertion more generic block: do not check bdrv_file_open block: remove separate bdrv_file_open callback
Re: [PATCH 5/9] 9pfs: mark more coroutine_fns
On Thursday, March 9, 2023 9:44:52 AM CET Paolo Bonzini wrote: > Signed-off-by: Paolo Bonzini > --- > hw/9pfs/9p.h| 4 ++-- > hw/9pfs/codir.c | 6 +++--- > 2 files changed, 5 insertions(+), 5 deletions(-) > > diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h > index 2fce4140d1e9..1b0d805b9c12 100644 > --- a/hw/9pfs/9p.h > +++ b/hw/9pfs/9p.h > @@ -203,7 +203,7 @@ typedef struct V9fsDir { > QemuMutex readdir_mutex_L; > } V9fsDir; > > -static inline void v9fs_readdir_lock(V9fsDir *dir) > +static inline void coroutine_fn v9fs_readdir_lock(V9fsDir *dir) > { > if (dir->proto_version == V9FS_PROTO_2000U) { > qemu_co_mutex_lock(>readdir_mutex_u); > @@ -212,7 +212,7 @@ static inline void v9fs_readdir_lock(V9fsDir *dir) > } > } > > -static inline void v9fs_readdir_unlock(V9fsDir *dir) > +static inline void coroutine_fn v9fs_readdir_unlock(V9fsDir *dir) > { > if (dir->proto_version == V9FS_PROTO_2000U) { > qemu_co_mutex_unlock(>readdir_mutex_u); > diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c > index 7ba63be489e7..0d0ffa1d2ba8 100644 > --- a/hw/9pfs/codir.c > +++ b/hw/9pfs/codir.c > @@ -68,9 +68,9 @@ int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState > *fidp, > * > * See v9fs_co_readdir_many() (as its only user) below for details. > */ > -static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp, > - struct V9fsDirEnt **entries, off_t offset, > - int32_t maxsize, bool dostat) > +static int coroutine_fn do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp, > +struct V9fsDirEnt **entries, off_t > offset, > +int32_t maxsize, bool dostat) You should probably fix wrapping here, as the line exceeds 80 characters. Except of that: Reviewed-by: Christian Schoenebeck > { > V9fsState *s = pdu->s; > V9fsString name; >
Re: [PATCH v2 2/3] block: make BlockBackend->disable_request_queuing atomic
On 3/7/23 22:04, Stefan Hajnoczi wrote: This field is accessed by multiple threads without a lock. Use explicit qatomic_read()/qatomic_set() calls. There is no need for acquire/release because blk_set_disable_request_queuing() doesn't provide any guarantees (it helps that it's used at BlockBackend creation time and not when there is I/O in flight). This in turn means itdoes not need to be atomic - atomics are only needed if there are concurrent writes. No big deal; I am now resurrecting the series from the time I had noticed the queued_requests thread-safety problem, so this will be simplified in 8.1. For now your version is okay, thanks for fixing it! Paolo Signed-off-by: Stefan Hajnoczi Reviewed-by: Hanna Czenczek --- block/block-backend.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 68807be32b..0cba4add20 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -82,7 +82,7 @@ struct BlockBackend { int quiesce_counter; /* atomic: written under BQL, read by other threads */ CoQueue queued_requests; -bool disable_request_queuing; +bool disable_request_queuing; /* atomic */ VMChangeStateEntry *vmsh; bool force_allow_inactivate; @@ -1232,7 +1232,7 @@ void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow) void blk_set_disable_request_queuing(BlockBackend *blk, bool disable) { IO_CODE(); -blk->disable_request_queuing = disable; +qatomic_set(>disable_request_queuing, disable); } static int coroutine_fn GRAPH_RDLOCK @@ -1271,7 +1271,8 @@ static void coroutine_fn blk_wait_while_drained(BlockBackend *blk) { assert(blk->in_flight > 0); -if (qatomic_read(>quiesce_counter) && !blk->disable_request_queuing) { +if (qatomic_read(>quiesce_counter) && +!qatomic_read(>disable_request_queuing)) { blk_dec_in_flight(blk); qemu_co_queue_wait(>queued_requests, NULL); blk_inc_in_flight(blk);
[PATCH 3/3] block: remove separate bdrv_file_open callback
bdrv_file_open and bdrv_open are completely equivalent, they are never checked except to see which one to invoke. So merge them into a single one. Signed-off-by: Paolo Bonzini --- block.c | 2 -- block/blkdebug.c | 2 +- block/blkio.c| 2 +- block/blkverify.c| 2 +- block/curl.c | 8 block/file-posix.c | 8 block/file-win32.c | 4 ++-- block/gluster.c | 8 block/iscsi.c| 4 ++-- block/nbd.c | 6 +++--- block/nfs.c | 2 +- block/null.c | 4 ++-- block/nvme.c | 2 +- block/rbd.c | 3 ++- block/ssh.c | 2 +- block/vvfat.c| 2 +- include/block/block_int-common.h | 3 --- 17 files changed, 30 insertions(+), 34 deletions(-) diff --git a/block.c b/block.c index 71f0ea24870e..e6d20f89ba82 100644 --- a/block.c +++ b/block.c @@ -1628,8 +1628,6 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, assert(!drv->bdrv_needs_filename || bs->filename[0]); if (drv->bdrv_open) { -ret = drv->bdrv_file_open(bs, options, open_flags, _err); -} else if (drv->bdrv_open) { ret = drv->bdrv_open(bs, options, open_flags, _err); } else { ret = 0; diff --git a/block/blkdebug.c b/block/blkdebug.c index addad914b3f7..c9ae3cb6ae3d 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -1072,7 +1072,7 @@ static BlockDriver bdrv_blkdebug = { .is_filter = true, .bdrv_parse_filename= blkdebug_parse_filename, -.bdrv_file_open = blkdebug_open, +.bdrv_open = blkdebug_open, .bdrv_close = blkdebug_close, .bdrv_reopen_prepare= blkdebug_reopen_prepare, .bdrv_child_perm= blkdebug_child_perm, diff --git a/block/blkio.c b/block/blkio.c index 0cdc99a72960..c2efe1a8feec 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -997,7 +997,7 @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) .format_name = name, \ .protocol_name = name, \ .instance_size = sizeof(BDRVBlkioState), \ -.bdrv_file_open = blkio_file_open, \ +.bdrv_open = blkio_open, \ .bdrv_close = blkio_close, \ .bdrv_co_getlength = blkio_co_getlength, \ .bdrv_co_truncate= blkio_truncate, \ diff --git a/block/blkverify.c b/block/blkverify.c index 1c16f86b2e70..9b59b34c02ba 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -312,7 +312,7 @@ static BlockDriver bdrv_blkverify = { .instance_size= sizeof(BDRVBlkverifyState), .bdrv_parse_filename = blkverify_parse_filename, -.bdrv_file_open = blkverify_open, +.bdrv_open= blkverify_open, .bdrv_close = blkverify_close, .bdrv_child_perm = bdrv_default_perms, .bdrv_co_getlength= blkverify_co_getlength, diff --git a/block/curl.c b/block/curl.c index 8bb39a134e4b..809858692652 100644 --- a/block/curl.c +++ b/block/curl.c @@ -1032,7 +1032,7 @@ static BlockDriver bdrv_http = { .instance_size = sizeof(BDRVCURLState), .bdrv_parse_filename= curl_parse_filename, -.bdrv_file_open = curl_open, +.bdrv_open = curl_open, .bdrv_close = curl_close, .bdrv_co_getlength = curl_co_getlength, @@ -1051,7 +1051,7 @@ static BlockDriver bdrv_https = { .instance_size = sizeof(BDRVCURLState), .bdrv_parse_filename= curl_parse_filename, -.bdrv_file_open = curl_open, +.bdrv_open = curl_open, .bdrv_close = curl_close, .bdrv_co_getlength = curl_co_getlength, @@ -1070,7 +1070,7 @@ static BlockDriver bdrv_ftp = { .instance_size = sizeof(BDRVCURLState), .bdrv_parse_filename= curl_parse_filename, -.bdrv_file_open = curl_open, +.bdrv_open = curl_open, .bdrv_close = curl_close, .bdrv_co_getlength = curl_co_getlength, @@ -1089,7 +1089,7 @@ static BlockDriver bdrv_ftps = { .instance_size = sizeof(BDRVCURLState), .bdrv_parse_filename= curl_parse_filename, -.bdrv_file_open = curl_open, +.bdrv_open = curl_open, .bdrv_close = curl_close, .bdrv_co_getlength = curl_co_getlength, diff --git a/block/file-posix.c b/block/file-posix.c index 5760cf22d17d..578b512ed515 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -3323,7 +3323,7 @@ BlockDriver
[PATCH 0/3] block: remove separate bdrv_file_open callback
The value of the bdrv_file_open is sometimes checked to distinguish protocol and format drivers, but apart from that there is no difference between bdrv_file_open and bdrv_open. However, they can all be distinguished by the non-NULL .protocol_name member. Change the checks to use .protocol_name instead of .bdrv_file_open, and unify the two callbacks. Paolo Paolo Bonzini (3): block: make assertion more generic block: do not check bdrv_file_open block: remove separate bdrv_file_open callback block.c | 17 +++-- block/blkdebug.c | 2 +- block/blkio.c| 2 +- block/blkverify.c| 2 +- block/curl.c | 8 block/file-posix.c | 8 block/file-win32.c | 4 ++-- block/gluster.c | 8 block/iscsi.c| 4 ++-- block/nbd.c | 6 +++--- block/nfs.c | 2 +- block/null.c | 4 ++-- block/nvme.c | 2 +- block/rbd.c | 3 ++- block/ssh.c | 2 +- block/vvfat.c| 2 +- include/block/block_int-common.h | 3 --- 17 files changed, 37 insertions(+), 42 deletions(-) -- 2.39.2
[PATCH 2/3] block: do not check bdrv_file_open
The set of BlockDrivers that have .bdrv_file_open coincides with those that have .protocol_name and guess what---checking drv->bdrv_file_open is done to see if the driver is a protocol. So check drv->protocol_name instead. Signed-off-by: Paolo Bonzini --- block.c | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block.c b/block.c index 075da6517b7f..71f0ea24870e 100644 --- a/block.c +++ b/block.c @@ -914,7 +914,6 @@ BlockDriver *bdrv_find_protocol(const char *filename, int i; GLOBAL_STATE_CODE(); -/* TODO Drivers without bdrv_file_open must be specified explicitly */ /* * XXX(hch): we really should not let host device detection @@ -1628,7 +1627,7 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, bs->opaque = g_malloc0(drv->instance_size); assert(!drv->bdrv_needs_filename || bs->filename[0]); -if (drv->bdrv_file_open) { +if (drv->bdrv_open) { ret = drv->bdrv_file_open(bs, options, open_flags, _err); } else if (drv->bdrv_open) { ret = drv->bdrv_open(bs, options, open_flags, _err); @@ -1940,7 +1939,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file, open_flags = bdrv_open_flags(bs, bs->open_flags); node_name = qemu_opt_get(opts, "node-name"); -assert(!drv->bdrv_file_open || file == NULL); +assert(!drv->protocol_name || file == NULL); ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp); if (ret < 0) { goto fail_opts; @@ -2040,7 +2039,7 @@ static int bdrv_fill_options(QDict **options, const char *filename, } /* If the user has explicitly specified the driver, this choice should * override the BDRV_O_PROTOCOL flag */ -protocol = drv->bdrv_file_open; +protocol = drv->protocol_name; } if (protocol) { @@ -4000,7 +3999,7 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, } /* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */ -assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->bdrv_file_open); +assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->protocol_name); /* file must be NULL if a protocol BDS is about to be created * (the inverse results in an error message from bdrv_open_common()) */ assert(!(flags & BDRV_O_PROTOCOL) || !file); @@ -5785,7 +5784,7 @@ int64_t coroutine_fn bdrv_co_get_allocated_file_size(BlockDriverState *bs) return drv->bdrv_co_get_allocated_file_size(bs); } -if (drv->bdrv_file_open) { +if (drv->protocol_name) { /* * Protocol drivers default to -ENOTSUP (most of their data is * not stored in any of their children (if they even have any), @@ -7888,7 +7887,7 @@ void bdrv_refresh_filename(BlockDriverState *bs) * Both of these conditions are represented by generate_json_filename. */ if (primary_child_bs->exact_filename[0] && -primary_child_bs->drv->bdrv_file_open && +primary_child_bs->drv->protocol_name && !drv->is_filter && !generate_json_filename) { strcpy(bs->exact_filename, primary_child_bs->exact_filename); -- 2.39.2
[PATCH 1/3] block: make assertion more generic
.bdrv_needs_filename is only set for drivers that also set bdrv_file_open, i.e. protocol drivers. So we can make the assertion always, it will always pass for those drivers that use bdrv_open. Signed-off-by: Paolo Bonzini --- block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block.c b/block.c index 0dd604d0f6a8..075da6517b7f 100644 --- a/block.c +++ b/block.c @@ -1627,8 +1627,8 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, bs->drv = drv; bs->opaque = g_malloc0(drv->instance_size); +assert(!drv->bdrv_needs_filename || bs->filename[0]); if (drv->bdrv_file_open) { -assert(!drv->bdrv_needs_filename || bs->filename[0]); ret = drv->bdrv_file_open(bs, options, open_flags, _err); } else if (drv->bdrv_open) { ret = drv->bdrv_open(bs, options, open_flags, _err); -- 2.39.2
[PATCH 6/9] qemu-pr-helper: mark more coroutine_fns
do_sgio can suspend via the coroutine function thread_pool_submit_co, so it has to be coroutine_fn as well---and the same is true of all its direct and indirect callers. Signed-off-by: Paolo Bonzini --- scsi/qemu-pr-helper.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c index 199227a556e6..9df82d93a7d2 100644 --- a/scsi/qemu-pr-helper.c +++ b/scsi/qemu-pr-helper.c @@ -177,8 +177,8 @@ static int do_sgio_worker(void *opaque) return status; } -static int do_sgio(int fd, const uint8_t *cdb, uint8_t *sense, -uint8_t *buf, int *sz, int dir) +static int coroutine_fn do_sgio(int fd, const uint8_t *cdb, uint8_t *sense, +uint8_t *buf, int *sz, int dir) { ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); int r; @@ -320,7 +320,7 @@ static SCSISense mpath_generic_sense(int r) } } -static int mpath_reconstruct_sense(int fd, int r, uint8_t *sense) +static int coroutine_fn mpath_reconstruct_sense(int fd, int r, uint8_t *sense) { switch (r) { case MPATH_PR_SUCCESS: @@ -372,8 +372,8 @@ static int mpath_reconstruct_sense(int fd, int r, uint8_t *sense) } } -static int multipath_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, - uint8_t *data, int sz) +static int coroutine_fn multipath_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, +uint8_t *data, int sz) { int rq_servact = cdb[1]; struct prin_resp resp; @@ -427,8 +427,8 @@ static int multipath_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, return mpath_reconstruct_sense(fd, r, sense); } -static int multipath_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, -const uint8_t *param, int sz) +static int coroutine_fn multipath_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, + const uint8_t *param, int sz) { int rq_servact = cdb[1]; int rq_scope = cdb[2] >> 4; @@ -545,8 +545,8 @@ static int multipath_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, } #endif -static int do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, -uint8_t *data, int *resp_sz) +static int coroutine_fn do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, + uint8_t *data, int *resp_sz) { #ifdef CONFIG_MPATH if (is_mpath(fd)) { @@ -563,8 +563,8 @@ static int do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, SG_DXFER_FROM_DEV); } -static int do_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, - const uint8_t *param, int sz) +static int coroutine_fn do_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, + const uint8_t *param, int sz) { int resp_sz; -- 2.39.2
[PATCH 7/9] tests: mark more coroutine_fns
Signed-off-by: Paolo Bonzini --- tests/unit/test-thread-pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test-thread-pool.c b/tests/unit/test-thread-pool.c index 6020e65d6986..493b966b94f9 100644 --- a/tests/unit/test-thread-pool.c +++ b/tests/unit/test-thread-pool.c @@ -71,7 +71,7 @@ static void test_submit_aio(void) g_assert_cmpint(data.ret, ==, 0); } -static void co_test_cb(void *opaque) +static void coroutine_fn co_test_cb(void *opaque) { WorkerTestData *data = opaque; -- 2.39.2
[PATCH 8/9] qcow2: mark various functions as coroutine_fn and GRAPH_RDLOCK
Functions that can do I/O (including calling bdrv_is_allocated and bdrv_block_status functions) are prime candidates for being coroutine_fns. Make the change for those that are themselves called only from coroutine_fns. Also annotate that they are called with the graph rdlock taken, thus allowing them to call bdrv_co_*() functions for I/O. Signed-off-by: Paolo Bonzini --- block/qcow2-bitmap.c | 2 +- block/qcow2-cluster.c | 20 block/qcow2-refcount.c | 8 block/qcow2-snapshot.c | 25 + block/qcow2.c | 26 +- block/qcow2.h | 15 --- 6 files changed, 51 insertions(+), 45 deletions(-) diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c index 5f456a2785c6..a952fd58d85e 100644 --- a/block/qcow2-bitmap.c +++ b/block/qcow2-bitmap.c @@ -1221,7 +1221,7 @@ out: } /* Checks to see if it's safe to resize bitmaps */ -int qcow2_truncate_bitmaps_check(BlockDriverState *bs, Error **errp) +int coroutine_fn qcow2_truncate_bitmaps_check(BlockDriverState *bs, Error **errp) { BDRVQcow2State *s = bs->opaque; Qcow2BitmapList *bm_list; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index a9e6622fe300..92526dea504c 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1126,7 +1126,7 @@ err: * Frees the allocated clusters because the request failed and they won't * actually be linked. */ -void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) +void coroutine_fn qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) { BDRVQcow2State *s = bs->opaque; if (!has_data_file(bs) && !m->keep_old_clusters) { @@ -1156,9 +1156,11 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) * * Returns 0 on success, -errno on failure. */ -static int calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset, - uint64_t guest_offset, unsigned bytes, - uint64_t *l2_slice, QCowL2Meta **m, bool keep_old) +static int coroutine_fn calculate_l2_meta(BlockDriverState *bs, + uint64_t host_cluster_offset, + uint64_t guest_offset, unsigned bytes, + uint64_t *l2_slice, QCowL2Meta **m, + bool keep_old) { BDRVQcow2State *s = bs->opaque; int sc_index, l2_index = offset_to_l2_slice_index(s, guest_offset); @@ -1599,8 +1601,10 @@ out: * function has been waiting for another request and the allocation must be * restarted, but the whole request should not be failed. */ -static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, uint64_t *nb_clusters) +static int coroutine_fn do_alloc_cluster_offset(BlockDriverState *bs, + uint64_t guest_offset, +uint64_t *host_offset, + uint64_t *nb_clusters) { BDRVQcow2State *s = bs->opaque; @@ -2065,8 +2069,8 @@ static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, return nb_clusters; } -static int zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, - unsigned nb_subclusters) +static int coroutine_fn zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, +unsigned nb_subclusters) { BDRVQcow2State *s = bs->opaque; uint64_t *l2_slice; diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index b092f89da98b..b2a81ff707ab 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1030,8 +1030,8 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) return offset; } -int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, -int64_t nb_clusters) +int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters) { BDRVQcow2State *s = bs->opaque; uint64_t cluster_index, refcount; @@ -1069,7 +1069,7 @@ int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, /* only used to allocate compressed sectors. We try to allocate contiguous sectors. size must be <= cluster_size */ -int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) +int64_t coroutine_fn qcow2_alloc_bytes(BlockDriverState *bs, int size) { BDRVQcow2State *s = bs->opaque; int64_t offset; @@ -3685,7 +3685,7 @@ out: return ret; } -int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size) +int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size) { BDRVQcow2State *s = bs->opaque; int64_t i; diff --git
[PATCH 4/9] nbd: mark more coroutine_fns, do not use co_wrappers
Signed-off-by: Paolo Bonzini --- nbd/server.c | 48 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index a4750e41880a..6f5fcade2a54 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1409,8 +1409,8 @@ nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp) return 1; } -static int nbd_receive_request(NBDClient *client, NBDRequest *request, - Error **errp) +static int coroutine_fn nbd_receive_request(NBDClient *client, NBDRequest *request, +Error **errp) { uint8_t buf[NBD_REQUEST_SIZE]; uint32_t magic; @@ -1895,12 +1895,12 @@ static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error, stq_be_p(>handle, handle); } -static int nbd_co_send_simple_reply(NBDClient *client, -uint64_t handle, -uint32_t error, -void *data, -size_t len, -Error **errp) +static int coroutine_fn nbd_co_send_simple_reply(NBDClient *client, + uint64_t handle, + uint32_t error, + void *data, + size_t len, + Error **errp) { NBDSimpleReply reply; int nbd_err = system_errno_to_nbd_errno(error); @@ -2038,8 +2038,8 @@ static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client, stl_be_p(, pnum); ret = nbd_co_send_iov(client, iov, 1, errp); } else { -ret = blk_pread(exp->common.blk, offset + progress, pnum, -data + progress, 0); +ret = blk_co_pread(exp->common.blk, offset + progress, pnum, + data + progress, 0); if (ret < 0) { error_setg_errno(errp, -ret, "reading from file failed"); break; @@ -2198,9 +2198,9 @@ static int coroutine_fn blockalloc_to_extents(BlockBackend *blk, * @ea is converted to BE by the function * @last controls whether NBD_REPLY_FLAG_DONE is sent. */ -static int nbd_co_send_extents(NBDClient *client, uint64_t handle, - NBDExtentArray *ea, - bool last, uint32_t context_id, Error **errp) +static int coroutine_fn nbd_co_send_extents(NBDClient *client, uint64_t handle, +NBDExtentArray *ea, + bool last, uint32_t context_id, Error **errp) { NBDStructuredMeta chunk; struct iovec iov[] = { @@ -2277,10 +2277,10 @@ static void bitmap_to_extents(BdrvDirtyBitmap *bitmap, bdrv_dirty_bitmap_unlock(bitmap); } -static int nbd_co_send_bitmap(NBDClient *client, uint64_t handle, - BdrvDirtyBitmap *bitmap, uint64_t offset, - uint32_t length, bool dont_fragment, bool last, - uint32_t context_id, Error **errp) +static int coroutine_fn nbd_co_send_bitmap(NBDClient *client, uint64_t handle, + BdrvDirtyBitmap *bitmap, uint64_t offset, + uint32_t length, bool dont_fragment, bool last, + uint32_t context_id, Error **errp) { unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS; g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(nb_extents); @@ -2297,8 +2297,8 @@ static int nbd_co_send_bitmap(NBDClient *client, uint64_t handle, * to the client (although the caller may still need to disconnect after * reporting the error). */ -static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request, - Error **errp) +static int coroutine_fn nbd_co_receive_request(NBDRequestData *req, NBDRequest *request, + Error **errp) { NBDClient *client = req->client; int valid_flags; @@ -2446,7 +2446,7 @@ static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request, data, request->len, errp); } -ret = blk_pread(exp->common.blk, request->from, request->len, data, 0); +ret = blk_co_pread(exp->common.blk, request->from, request->len, data, 0); if (ret < 0) { return nbd_send_generic_reply(client, request->handle, ret, "reading from file failed", errp); @@ -2513,8 +2513,8 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, if (request->flags & NBD_CMD_FLAG_FUA) { flags |=
[PATCH 9/9] vmdk: make vmdk_is_cid_valid a coroutine_fn
Functions that can do I/O are prime candidates for being coroutine_fns. Make the change for the one that is itself called only from coroutine_fns. Unfortunately vmdk does not use a coroutine_fn for the bulk of the open (like qcow2 does) so vmdk_read_cid cannot have the same treatment. Signed-off-by: Paolo Bonzini --- block/vmdk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/vmdk.c b/block/vmdk.c index f5f49018fe4a..3f8c731e32e8 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -376,7 +376,7 @@ out: return ret; } -static int vmdk_is_cid_valid(BlockDriverState *bs) +static int coroutine_fn vmdk_is_cid_valid(BlockDriverState *bs) { BDRVVmdkState *s = bs->opaque; uint32_t cur_pcid; -- 2.39.2
[PATCH 0/9] (mostly) block: add more coroutine_fn annotations, use bdrv/blk_co_*
"Mostly" because there is a 9pfs patch in here too. The series was developed with the help of vrc and the clang TSA annotations. Paolo Paolo Bonzini (9): vvfat: mark various functions as coroutine_fn blkdebug: add missing coroutine_fn annotation mirror: make mirror_flush a coroutine_fn nbd: mark more coroutine_fns 9pfs: mark more coroutine_fns qemu-pr-helper: mark more coroutine_fns tests: mark more coroutine_fns qcow2: mark various functions as coroutine_fn and GRAPH_RDLOCK vmdk: make vmdk_is_cid_valid a coroutine_fn block/blkdebug.c | 4 +-- block/mirror.c| 4 +-- block/qcow2-bitmap.c | 2 +- block/qcow2-cluster.c | 20 +++- block/qcow2-refcount.c| 8 ++--- block/qcow2-snapshot.c| 25 +++ block/qcow2.c | 26 block/qcow2.h | 15 - block/vmdk.c | 2 +- block/vvfat.c | 58 ++- hw/9pfs/9p.h | 4 +-- hw/9pfs/codir.c | 6 ++-- nbd/server.c | 48 ++--- scsi/qemu-pr-helper.c | 22 ++--- tests/unit/test-thread-pool.c | 2 +- 15 files changed, 127 insertions(+), 119 deletions(-) -- 2.39.2
[PATCH 1/9] vvfat: mark various functions as coroutine_fn
Functions that can do I/O are prime candidates for being coroutine_fns. Make the change for those that are themselves called only from coroutine_fns. In addition, coroutine_fns should do I/O using bdrv_co_*() functions, for which it is required to hold the BlockDriverState graph lock. So also nnotate functions on the I/O path with TSA attributes, making it possible to switch them to use bdrv_co_*() functions. Signed-off-by: Paolo Bonzini --- block/vvfat.c | 58 ++- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/block/vvfat.c b/block/vvfat.c index fd45e86416b2..0ddc91fc096a 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1053,7 +1053,7 @@ static BDRVVVFATState *vvv = NULL; #endif static int enable_write_target(BlockDriverState *bs, Error **errp); -static int is_consistent(BDRVVVFATState *s); +static int coroutine_fn is_consistent(BDRVVVFATState *s); static QemuOptsList runtime_opts = { .name = "vvfat", @@ -1469,8 +1469,8 @@ static void print_mapping(const mapping_t* mapping) } #endif -static int vvfat_read(BlockDriverState *bs, int64_t sector_num, -uint8_t *buf, int nb_sectors) +static int coroutine_fn GRAPH_RDLOCK +vvfat_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { BDRVVVFATState *s = bs->opaque; int i; @@ -1490,8 +1490,8 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num, DLOG(fprintf(stderr, "sectors %" PRId64 "+%" PRId64 " allocated\n", sector_num, n >> BDRV_SECTOR_BITS)); -if (bdrv_pread(s->qcow, sector_num * BDRV_SECTOR_SIZE, n, - buf + i * 0x200, 0) < 0) { +if (bdrv_co_pread(s->qcow, sector_num * BDRV_SECTOR_SIZE, n, + buf + i * 0x200, 0) < 0) { return -1; } i += (n >> BDRV_SECTOR_BITS) - 1; @@ -1532,7 +1532,7 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num, return 0; } -static int coroutine_fn +static int coroutine_fn GRAPH_RDLOCK vvfat_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1796,8 +1796,8 @@ static inline uint32_t modified_fat_get(BDRVVVFATState* s, } } -static inline bool cluster_was_modified(BDRVVVFATState *s, -uint32_t cluster_num) +static inline bool coroutine_fn GRAPH_RDLOCK +cluster_was_modified(BDRVVVFATState *s, uint32_t cluster_num) { int was_modified = 0; int i; @@ -1852,8 +1852,8 @@ typedef enum { * Further, the files/directories handled by this function are * assumed to be *not* deleted (and *only* those). */ -static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, -direntry_t* direntry, const char* path) +static uint32_t coroutine_fn GRAPH_RDLOCK +get_cluster_count_for_direntry(BDRVVVFATState* s, direntry_t* direntry, const char* path) { /* * This is a little bit tricky: @@ -1979,9 +1979,9 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, if (res) { return -1; } -res = bdrv_pwrite(s->qcow, offset * BDRV_SECTOR_SIZE, - BDRV_SECTOR_SIZE, s->cluster_buffer, - 0); +res = bdrv_co_pwrite(s->qcow, offset * BDRV_SECTOR_SIZE, + BDRV_SECTOR_SIZE, s->cluster_buffer, + 0); if (res < 0) { return -2; } @@ -2011,8 +2011,8 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, * It returns 0 upon inconsistency or error, and the number of clusters * used by the directory, its subdirectories and their files. */ -static int check_directory_consistency(BDRVVVFATState *s, -int cluster_num, const char* path) +static int coroutine_fn GRAPH_RDLOCK +check_directory_consistency(BDRVVVFATState *s, int cluster_num, const char* path) { int ret = 0; unsigned char* cluster = g_malloc(s->cluster_size); @@ -2138,7 +2138,8 @@ DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i)) } /* returns 1 on success */ -static int is_consistent(BDRVVVFATState* s) +static int coroutine_fn GRAPH_RDLOCK +is_consistent(BDRVVVFATState* s) { int i, check; int used_clusters_count = 0; @@ -2414,8 +2415,8 @@ static int commit_mappings(BDRVVVFATState* s, return 0; } -static int commit_direntries(BDRVVVFATState* s, -int dir_index, int parent_mapping_index) +static int coroutine_fn GRAPH_RDLOCK +commit_direntries(BDRVVVFATState* s, int dir_index,
[PATCH 3/9] mirror: make mirror_flush a coroutine_fn, do not use co_wrappers
mirror_flush calls a mixed function blk_flush but it is only called from mirror_run; so call the coroutine version and make mirror_flush a coroutine_fn too. Signed-off-by: Paolo Bonzini --- block/mirror.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 663e2b700241..af9bbd23d4cf 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -886,9 +886,9 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) /* Called when going out of the streaming phase to flush the bulk of the * data to the medium, or just before completing. */ -static int mirror_flush(MirrorBlockJob *s) +static int coroutine_fn mirror_flush(MirrorBlockJob *s) { -int ret = blk_flush(s->target); +int ret = blk_co_flush(s->target); if (ret < 0) { if (mirror_error_action(s, false, -ret) == BLOCK_ERROR_ACTION_REPORT) { s->ret = ret; -- 2.39.2
[PATCH 2/9] blkdebug: add missing coroutine_fn annotation
Signed-off-by: Paolo Bonzini --- block/blkdebug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index 978c8cff9e33..addad914b3f7 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -583,8 +583,8 @@ out: return ret; } -static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - BlkdebugIOType iotype) +static int coroutine_fn rule_check(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, BlkdebugIOType iotype) { BDRVBlkdebugState *s = bs->opaque; BlkdebugRule *rule = NULL; -- 2.39.2
[PATCH 5/9] 9pfs: mark more coroutine_fns
Signed-off-by: Paolo Bonzini --- hw/9pfs/9p.h| 4 ++-- hw/9pfs/codir.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h index 2fce4140d1e9..1b0d805b9c12 100644 --- a/hw/9pfs/9p.h +++ b/hw/9pfs/9p.h @@ -203,7 +203,7 @@ typedef struct V9fsDir { QemuMutex readdir_mutex_L; } V9fsDir; -static inline void v9fs_readdir_lock(V9fsDir *dir) +static inline void coroutine_fn v9fs_readdir_lock(V9fsDir *dir) { if (dir->proto_version == V9FS_PROTO_2000U) { qemu_co_mutex_lock(>readdir_mutex_u); @@ -212,7 +212,7 @@ static inline void v9fs_readdir_lock(V9fsDir *dir) } } -static inline void v9fs_readdir_unlock(V9fsDir *dir) +static inline void coroutine_fn v9fs_readdir_unlock(V9fsDir *dir) { if (dir->proto_version == V9FS_PROTO_2000U) { qemu_co_mutex_unlock(>readdir_mutex_u); diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c index 7ba63be489e7..0d0ffa1d2ba8 100644 --- a/hw/9pfs/codir.c +++ b/hw/9pfs/codir.c @@ -68,9 +68,9 @@ int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp, * * See v9fs_co_readdir_many() (as its only user) below for details. */ -static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp, - struct V9fsDirEnt **entries, off_t offset, - int32_t maxsize, bool dostat) +static int coroutine_fn do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp, +struct V9fsDirEnt **entries, off_t offset, +int32_t maxsize, bool dostat) { V9fsState *s = pdu->s; V9fsString name; -- 2.39.2