The reconnection logic doesn't need to stop while in a drained section. Moreover, it has to be active during the drained section, as the requests that were caught in-flight with the connection to the server broken can only usefully get drained if the connection is restored. Otherwise, such requests can only either stall, resulting in a deadlock (before 8c517de24a), or be aborted, defeating the purpose of the reconnection machinery (after 8c517de24a).
Since the pieces of the reconnection logic are now properly migrated from one aio_context to another, it appears safe to just stop messing with the drained section in the reconnection code. Fixes: 5ad81b4946 ("nbd: Restrict connection_co reentrance") Fixes: 8c517de24a ("block/nbd: fix drain dead-lock because of nbd reconnect-delay") Signed-off-by: Roman Kagan <rvka...@yandex-team.ru> --- block/nbd.c | 79 +++-------------------------------------------------- 1 file changed, 4 insertions(+), 75 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index a3eb9b9079..a5a9e4aca5 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -117,8 +117,6 @@ typedef struct BDRVNBDState { Coroutine *connection_co; Coroutine *teardown_co; QemuCoSleepState *connection_co_sleep_ns_state; - bool drained; - bool wait_drained_end; int in_flight; NBDClientState state; int connect_status; @@ -311,12 +309,6 @@ static void nbd_client_attach_aio_context_bh(void *opaque) qemu_mutex_unlock(&thr->mutex); if (s->connection_co) { - /* - * The node is still drained, so we know the coroutine has yielded in - * nbd_read_eof(), the only place where bs->in_flight can reach 0, or - * it is entered for the first time. Both places are safe for entering - * the coroutine. 
- */ qemu_aio_coroutine_enter(bs->aio_context, s->connection_co); } bdrv_dec_in_flight(bs); @@ -344,37 +336,6 @@ static void nbd_client_attach_aio_context(BlockDriverState *bs, aio_wait_bh_oneshot(new_context, nbd_client_attach_aio_context_bh, bs); } -static void coroutine_fn nbd_client_co_drain_begin(BlockDriverState *bs) -{ - BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - - s->drained = true; - if (s->connection_co_sleep_ns_state) { - qemu_co_sleep_wake(s->connection_co_sleep_ns_state); - } - - nbd_co_establish_connection_cancel(bs, false); - - reconnect_delay_timer_del(s); - - if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT) { - s->state = NBD_CLIENT_CONNECTING_NOWAIT; - qemu_co_queue_restart_all(&s->free_sema); - } -} - -static void coroutine_fn nbd_client_co_drain_end(BlockDriverState *bs) -{ - BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - - s->drained = false; - if (s->wait_drained_end) { - s->wait_drained_end = false; - aio_co_wake(s->connection_co); - } -} - - static void nbd_teardown_connection(BlockDriverState *bs) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; @@ -686,16 +647,6 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s) ret = nbd_client_handshake(s->bs, &local_err); - if (s->drained) { - s->wait_drained_end = true; - while (s->drained) { - /* - * We may be entered once from nbd_client_attach_aio_context_bh - * and then from nbd_client_co_drain_end. So here is a loop. - */ - qemu_coroutine_yield(); - } - } bdrv_inc_in_flight(s->bs); out: @@ -724,26 +675,10 @@ static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState *s) nbd_reconnect_attempt(s); while (nbd_client_connecting(s)) { - if (s->drained) { - bdrv_dec_in_flight(s->bs); - s->wait_drained_end = true; - while (s->drained) { - /* - * We may be entered once from nbd_client_attach_aio_context_bh - * and then from nbd_client_co_drain_end. So here is a loop. 
- */ - qemu_coroutine_yield(); - } - bdrv_inc_in_flight(s->bs); - } else { - qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, timeout, - &s->connection_co_sleep_ns_state); - if (s->drained) { - continue; - } - if (timeout < max_timeout) { - timeout *= 2; - } + qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, timeout, + &s->connection_co_sleep_ns_state); + if (timeout < max_timeout) { + timeout *= 2; } nbd_reconnect_attempt(s); @@ -2548,8 +2483,6 @@ static BlockDriver bdrv_nbd = { .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_client_detach_aio_context, .bdrv_attach_aio_context = nbd_client_attach_aio_context, - .bdrv_co_drain_begin = nbd_client_co_drain_begin, - .bdrv_co_drain_end = nbd_client_co_drain_end, .bdrv_refresh_filename = nbd_refresh_filename, .bdrv_co_block_status = nbd_client_co_block_status, .bdrv_dirname = nbd_dirname, @@ -2577,8 +2510,6 @@ static BlockDriver bdrv_nbd_tcp = { .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_client_detach_aio_context, .bdrv_attach_aio_context = nbd_client_attach_aio_context, - .bdrv_co_drain_begin = nbd_client_co_drain_begin, - .bdrv_co_drain_end = nbd_client_co_drain_end, .bdrv_refresh_filename = nbd_refresh_filename, .bdrv_co_block_status = nbd_client_co_block_status, .bdrv_dirname = nbd_dirname, @@ -2606,8 +2537,6 @@ static BlockDriver bdrv_nbd_unix = { .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_client_detach_aio_context, .bdrv_attach_aio_context = nbd_client_attach_aio_context, - .bdrv_co_drain_begin = nbd_client_co_drain_begin, - .bdrv_co_drain_end = nbd_client_co_drain_end, .bdrv_refresh_filename = nbd_refresh_filename, .bdrv_co_block_status = nbd_client_co_block_status, .bdrv_dirname = nbd_dirname, -- 2.30.2