Re: [Qemu-block] [PATCH v3 10/11] block/nbd-client: nbd reconnect
09.06.2018 18:32, Vladimir Sementsov-Ogievskiy wrote: Implement reconnect. To achieve this: 1. Move from quit bool variable to state. 4 states are introduced: connecting-wait: means, that reconnecting is in progress, and there were small number of reconnect attempts, so all requests are waiting for the connection. connecting-nowait: reconnecting is in progress, there were a lot of attempts of reconnect, all requests will return errors. connected: normal state quit: exiting after fatal error or on close Possible transitions are: * -> quit connecting-* -> connected connecting-wait -> connecting-nowait connected -> connecting-wait 2. Implement reconnect in connection_co. So, in connecting-* mode, connection_co, tries to reconnect every NBD_RECONNECT_NS. Configuring of this parameter (as well as NBD_RECONNECT_ATTEMPTS, which specifies bound of transition from connecting-wait to connecting-nowait) may be done as a follow-up patch. 3. Retry nbd queries on channel error, if we are in connecting-wait state. 4. In init, wait until for connection until transition to connecting-nowait. So, NBD_RECONNECT_ATTEMPTS is a bound of fail for initial connection too. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/nbd-client.h | 2 + block/nbd-client.c | 170 ++--- 2 files changed, 123 insertions(+), 49 deletions(-) diff --git a/block/nbd-client.h b/block/nbd-client.h index 2561e1ea42..1249f2eb52 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -44,6 +44,8 @@ typedef struct NBDClientSession { bool receiving; int connect_status; Error *connect_err; +int connect_attempts; +bool wait_in_flight; NBDClientRequest requests[MAX_NBD_REQUESTS]; NBDReply reply; diff --git a/block/nbd-client.c b/block/nbd-client.c index f22ed7f404..49b1f67047 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -41,10 +41,16 @@ static int nbd_client_connect(BlockDriverState *bs, const char *hostname, Error **errp); -/* @ret would be used for reconnect in future */ static void nbd_channel_error(NBDClientSession *s, int ret) { -s->state = NBD_CLIENT_QUIT; +if (ret == -EIO) { +if (s->state == NBD_CLIENT_CONNECTED) { +s->state = NBD_CLIENT_CONNECTING_WAIT; +s->connect_attempts = 0; +} +} else { +s->state = NBD_CLIENT_QUIT; +} } static void nbd_recv_coroutines_wake_all(NBDClientSession *s) @@ -90,6 +96,19 @@ typedef struct NBDConnection { uint64_t reconnect_timeout; } NBDConnection; +static bool nbd_client_connecting(NBDClientSession *client) +{ +return client->state == NBD_CLIENT_CONNECTING_WAIT || + client->state == NBD_CLIENT_CONNECTING_NOWAIT || + client->state == NBD_CLIENT_CONNECTING_INIT; +} + +static bool nbd_client_connecting_wait(NBDClientSession *client) +{ +return client->state == NBD_CLIENT_CONNECTING_WAIT || + client->state == NBD_CLIENT_CONNECTING_INIT; +} + static coroutine_fn void nbd_connection_entry(void *opaque) { NBDConnection *con = opaque; @@ -98,26 +117,55 @@ static coroutine_fn void nbd_connection_entry(void *opaque) int ret = 0; Error *local_err = NULL; -if (con->reconnect_attempts != 0) { -error_setg(>connect_err, "Reconnect is not supported yet"); -s->connect_status = -EINVAL; -nbd_channel_error(s, s->connect_status); -return; -} +while (s->state != NBD_CLIENT_QUIT) { +assert(s->reply.handle == 0); -s->connect_status = nbd_client_connect(con->bs, con->saddr, - con->export, con->tlscreds, - con->hostname, >connect_err); -if (s->connect_status < 0) { -nbd_channel_error(s, s->connect_status); -return; -} +if (nbd_client_connecting(s)) { +if (s->connect_attempts == con->reconnect_attempts) { +s->state = NBD_CLIENT_CONNECTING_NOWAIT; +qemu_co_queue_restart_all(>free_sema); +} -/* successfully connected */ -s->state = NBD_CLIENT_CONNECTED; +qemu_co_mutex_lock(>send_mutex); + +while (s->in_flight > 0) { +qemu_co_mutex_unlock(>send_mutex); +nbd_recv_coroutines_wake_all(s); +s->wait_in_flight = true; +qemu_coroutine_yield(); +s->wait_in_flight = false; +qemu_co_mutex_lock(>send_mutex); +} + +qemu_co_mutex_unlock(>send_mutex); + +/* Now we are sure, that nobody accessing the channel now and nobody + * will try to access the channel, until we set state to CONNECTED + */ + +
Re: [Qemu-block] [PATCH v3 10/11] block/nbd-client: nbd reconnect
09.06.2018 18:32, Vladimir Sementsov-Ogievskiy wrote: Implement reconnect. To achieve this: 1. Move from quit bool variable to state. 4 states are introduced: connecting-wait: means, that reconnecting is in progress, and there were small number of reconnect attempts, so all requests are waiting for the connection. connecting-nowait: reconnecting is in progress, there were a lot of attempts of reconnect, all requests will return errors. connected: normal state quit: exiting after fatal error or on close Possible transitions are: * -> quit connecting-* -> connected connecting-wait -> connecting-nowait connected -> connecting-wait 2. Implement reconnect in connection_co. So, in connecting-* mode, connection_co, tries to reconnect every NBD_RECONNECT_NS. Configuring of this parameter (as well as NBD_RECONNECT_ATTEMPTS, which specifies bound of transition from connecting-wait to connecting-nowait) may be done as a follow-up patch. 3. Retry nbd queries on channel error, if we are in connecting-wait state. 4. In init, wait until for connection until transition to connecting-nowait. So, NBD_RECONNECT_ATTEMPTS is a bound of fail for initial connection too. Signed-off-by: Vladimir Sementsov-Ogievskiy squash: @@ -616,7 +617,10 @@ static coroutine_fn int nbd_co_receive_one_chunk( s->reply.handle = 0; } - if (s->connection_co) { + if (s->connection_co && !s->wait_in_flight) { + /* We must check s->wait_in_flight, because we may entered by + * nbd_recv_coroutines_wake_all(), int this case we should not + * wake connection_co here, it will woken by last request. */ aio_co_wake(s->connection_co); } -- Best regards, Vladimir
[Qemu-block] [PATCH v3 10/11] block/nbd-client: nbd reconnect
Implement reconnect. To achieve this: 1. Move from quit bool variable to state. 4 states are introduced: connecting-wait: means, that reconnecting is in progress, and there were small number of reconnect attempts, so all requests are waiting for the connection. connecting-nowait: reconnecting is in progress, there were a lot of attempts of reconnect, all requests will return errors. connected: normal state quit: exiting after fatal error or on close Possible transitions are: * -> quit connecting-* -> connected connecting-wait -> connecting-nowait connected -> connecting-wait 2. Implement reconnect in connection_co. So, in connecting-* mode, connection_co, tries to reconnect every NBD_RECONNECT_NS. Configuring of this parameter (as well as NBD_RECONNECT_ATTEMPTS, which specifies bound of transition from connecting-wait to connecting-nowait) may be done as a follow-up patch. 3. Retry nbd queries on channel error, if we are in connecting-wait state. 4. In init, wait until for connection until transition to connecting-nowait. So, NBD_RECONNECT_ATTEMPTS is a bound of fail for initial connection too. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/nbd-client.h | 2 + block/nbd-client.c | 170 ++--- 2 files changed, 123 insertions(+), 49 deletions(-) diff --git a/block/nbd-client.h b/block/nbd-client.h index 2561e1ea42..1249f2eb52 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -44,6 +44,8 @@ typedef struct NBDClientSession { bool receiving; int connect_status; Error *connect_err; +int connect_attempts; +bool wait_in_flight; NBDClientRequest requests[MAX_NBD_REQUESTS]; NBDReply reply; diff --git a/block/nbd-client.c b/block/nbd-client.c index f22ed7f404..49b1f67047 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -41,10 +41,16 @@ static int nbd_client_connect(BlockDriverState *bs, const char *hostname, Error **errp); -/* @ret would be used for reconnect in future */ static void nbd_channel_error(NBDClientSession *s, int ret) { -s->state = NBD_CLIENT_QUIT; +if (ret == -EIO) { +if (s->state == NBD_CLIENT_CONNECTED) { +s->state = NBD_CLIENT_CONNECTING_WAIT; +s->connect_attempts = 0; +} +} else { +s->state = NBD_CLIENT_QUIT; +} } static void nbd_recv_coroutines_wake_all(NBDClientSession *s) @@ -90,6 +96,19 @@ typedef struct NBDConnection { uint64_t reconnect_timeout; } NBDConnection; +static bool nbd_client_connecting(NBDClientSession *client) +{ +return client->state == NBD_CLIENT_CONNECTING_WAIT || + client->state == NBD_CLIENT_CONNECTING_NOWAIT || + client->state == NBD_CLIENT_CONNECTING_INIT; +} + +static bool nbd_client_connecting_wait(NBDClientSession *client) +{ +return client->state == NBD_CLIENT_CONNECTING_WAIT || + client->state == NBD_CLIENT_CONNECTING_INIT; +} + static coroutine_fn void nbd_connection_entry(void *opaque) { NBDConnection *con = opaque; @@ -98,26 +117,55 @@ static coroutine_fn void nbd_connection_entry(void *opaque) int ret = 0; Error *local_err = NULL; -if (con->reconnect_attempts != 0) { -error_setg(>connect_err, "Reconnect is not supported yet"); -s->connect_status = -EINVAL; -nbd_channel_error(s, s->connect_status); -return; -} +while (s->state != NBD_CLIENT_QUIT) { +assert(s->reply.handle == 0); -s->connect_status = nbd_client_connect(con->bs, con->saddr, - con->export, con->tlscreds, - con->hostname, >connect_err); -if (s->connect_status < 0) { -nbd_channel_error(s, s->connect_status); -return; -} +if (nbd_client_connecting(s)) { +if (s->connect_attempts == con->reconnect_attempts) { +s->state = NBD_CLIENT_CONNECTING_NOWAIT; +qemu_co_queue_restart_all(>free_sema); +} -/* successfully connected */ -s->state = NBD_CLIENT_CONNECTED; +qemu_co_mutex_lock(>send_mutex); + +while (s->in_flight > 0) { +qemu_co_mutex_unlock(>send_mutex); +nbd_recv_coroutines_wake_all(s); +s->wait_in_flight = true; +qemu_coroutine_yield(); +s->wait_in_flight = false; +qemu_co_mutex_lock(>send_mutex); +} + +qemu_co_mutex_unlock(>send_mutex); + +/* Now we are sure, that nobody accessing the channel now and nobody + * will try to access the channel, until we set state to CONNECTED + */ + +s->connect_status = nbd_client_connect(con->bs, con->saddr, +