Minimal implementation: always send the DF (don't fragment) flag, so that the client does not have to deal with fragmented replies.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com> --- block/nbd-client.c | 47 +++++++++++---- block/nbd-client.h | 2 + include/block/nbd.h | 15 +++-- nbd/client.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++------ qemu-nbd.c | 2 +- 5 files changed, 203 insertions(+), 33 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 3779c6c999..ff96bd1635 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -180,13 +180,20 @@ static void nbd_co_receive_reply(NBDClientSession *s, *reply = s->reply; if (reply->handle != request->handle || !s->ioc) { + reply->simple = true; reply->error = EIO; } else { - if (qiov && reply->error == 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, - true); - if (ret != request->len) { - reply->error = EIO; + if (qiov) { + if ((reply->simple ? reply->error == 0 : + reply->type == NBD_REPLY_TYPE_OFFSET_DATA)) { + ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, + true); + if (ret != request->len) { + reply->error = EIO; + } + } else if (!reply->simple && + reply->type == NBD_REPLY_TYPE_OFFSET_HOLE) { + qemu_iovec_memset(qiov, 0, 0, request->len); } } @@ -227,6 +234,7 @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, .type = NBD_CMD_READ, .from = offset, .len = bytes, + .flags = client->structured_reply ? 
NBD_CMD_FLAG_DF : 0, }; NBDReply reply; ssize_t ret; @@ -237,12 +245,30 @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(client, &request, &reply, qiov); + goto out; } + + nbd_co_receive_reply(client, &request, &reply, qiov); + if (reply.error != 0) { + ret = -reply.error; + } + + if (!reply.simple) { + while (!(reply.flags & NBD_REPLY_FLAG_DONE)) { + nbd_co_receive_reply(client, &request, &reply, qiov); + if (reply.error != 0) { + ret = -reply.error; + } + if (reply.simple) { + ret = -EIO; + goto out; + } + } + } + +out: nbd_coroutine_end(client, &request); - return -reply.error; + return ret; } int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, @@ -408,7 +434,8 @@ int nbd_client_init(BlockDriverState *bs, &client->nbdflags, tlscreds, hostname, &client->ioc, - &client->size, errp); + &client->size, + &client->structured_reply, errp); if (ret < 0) { logout("Failed to negotiate with the NBD server\n"); return ret; diff --git a/block/nbd-client.h b/block/nbd-client.h index f8d6006849..cba1f965bf 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -32,6 +32,8 @@ typedef struct NBDClientSession { NBDReply reply; bool is_unix; + + bool structured_reply; } NBDClientSession; NBDClientSession *nbd_get_client_session(BlockDriverState *bs); diff --git a/include/block/nbd.h b/include/block/nbd.h index 58b864f145..dae2e4bd03 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -57,11 +57,16 @@ struct NBDRequest { }; typedef struct NBDRequest NBDRequest; -struct NBDReply { +typedef struct NBDReply { + bool simple; uint64_t handle; uint32_t error; -}; -typedef struct NBDReply NBDReply; + + uint16_t flags; + uint16_t type; + uint32_t length; + uint64_t offset; +} NBDReply; struct NBDSimpleReply { /* uint32_t NBD_SIMPLE_REPLY_MAGIC */ @@ -169,10 +174,10 @@ ssize_t 
nbd_wr_syncv(QIOChannel *ioc, int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, - off_t *size, Error **errp); + off_t *size, bool *structured_reply, Error **errp); int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size); ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request); -ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply); +int nbd_receive_reply(QIOChannel *ioc, NBDReply *reply); int nbd_client(int fd); int nbd_disconnect(int fd); diff --git a/nbd/client.c b/nbd/client.c index 1c274f3012..9225f7e30d 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -472,11 +472,10 @@ static QIOChannel *nbd_receive_starttls(QIOChannel *ioc, return QIO_CHANNEL(tioc); } - int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, - off_t *size, Error **errp) + off_t *size, bool *structured_reply, Error **errp) { char buf[256]; uint64_t magic, s; @@ -584,6 +583,12 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, if (nbd_receive_query_exports(ioc, name, errp) < 0) { goto fail; } + + if (structured_reply != NULL) { + *structured_reply = + nbd_receive_simple_option(ioc, NBD_OPT_STRUCTURED_REPLY, + false, NULL) == 0; + } } /* write the export name request */ if (nbd_send_option_request(ioc, NBD_OPT_EXPORT_NAME, -1, name, @@ -603,6 +608,14 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } be16_to_cpus(flags); + + if (!!structured_reply && *structured_reply && + !(*flags & NBD_CMD_FLAG_DF)) + { + error_setg(errp, "Structured reply is negotiated, " + "but DF flag is not."); + goto fail; + } } else if (magic == NBD_CLIENT_MAGIC) { uint32_t oldflags; @@ -790,20 +803,33 @@ ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) return 0; } -ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply 
*reply) +static inline int read_sync_check(QIOChannel *ioc, void *buffer, size_t size) { - uint8_t buf[NBD_REPLY_SIZE]; - uint32_t magic; ssize_t ret; - ret = read_sync(ioc, buf, sizeof(buf)); + ret = read_sync(ioc, buffer, size); if (ret < 0) { return ret; } - - if (ret != sizeof(buf)) { + if (ret != size) { LOG("read failed"); - return -EINVAL; + return -EIO; + } + + return 0; +} + +/* nbd_receive_simple_reply + * Read simple reply except magic field (which should be already read) + */ +static int nbd_receive_simple_reply(QIOChannel *ioc, NBDReply *reply) +{ + uint8_t buf[NBD_REPLY_SIZE - 4]; + ssize_t ret; + + ret = read_sync_check(ioc, buf, sizeof(buf)); + if (ret < 0) { + return ret; } /* Reply @@ -812,9 +838,124 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) [ 7 .. 15] handle */ - magic = ldl_be_p(buf); - reply->error = ldl_be_p(buf + 4); - reply->handle = ldq_be_p(buf + 8); + reply->error = ldl_be_p(buf); + reply->handle = ldq_be_p(buf + 4); + + return 0; +} + +/* nbd_receive_structured_reply_chunk + * Read structured reply chunk except magic field (which should be already read) + * Data for NBD_REPLY_TYPE_OFFSET_DATA is not read too. + * Length field of reply out parameter corresponds to unread part of reply. 
+ */ +static int nbd_receive_structured_reply_chunk(QIOChannel *ioc, NBDReply *reply) +{ + NBDStructuredReplyChunk chunk; + ssize_t ret; + uint16_t message_size; + + ret = read_sync_check(ioc, (uint8_t *)&chunk + sizeof(chunk.magic), + sizeof(chunk) - sizeof(chunk.magic)); + if (ret < 0) { + return ret; + } + + reply->flags = be16_to_cpu(chunk.flags); + reply->type = be16_to_cpu(chunk.type); + reply->handle = be64_to_cpu(chunk.handle); + reply->length = be32_to_cpu(chunk.length); + + switch (reply->type) { + case NBD_REPLY_TYPE_NONE: + break; + case NBD_REPLY_TYPE_OFFSET_DATA: + case NBD_REPLY_TYPE_OFFSET_HOLE: + ret = read_sync_check(ioc, &reply->offset, sizeof(reply->offset)); + if (ret < 0) { + return ret; + } + be64_to_cpus(&reply->offset); + reply->length -= sizeof(reply->offset); + break; + case NBD_REPLY_TYPE_ERROR: + case NBD_REPLY_TYPE_ERROR_OFFSET: + ret = read_sync_check(ioc, &reply->error, sizeof(reply->error)); + if (ret < 0) { + return ret; + } + be32_to_cpus(&reply->error); + + ret = read_sync_check(ioc, &message_size, sizeof(message_size)); + if (ret < 0) { + return ret; + } + be16_to_cpus(&message_size); + + if (message_size > 0) { + /* TODO: provide error message to user */ + ret = drop_sync(ioc, message_size); + if (ret < 0) { + return ret; + } + } + + if (reply->type == NBD_REPLY_TYPE_ERROR_OFFSET) { + /* drop 64bit offset */ + ret = drop_sync(ioc, 8); + if (ret < 0) { + return ret; + } + } + break; + default: + if (reply->type & (1 << 15)) { + /* unknown error */ + ret = drop_sync(ioc, reply->length); + if (ret < 0) { + return ret; + } + + reply->error = NBD_EINVAL; + reply->length = 0; + } else { + /* unknown non-error reply type */ + return -EINVAL; + } + } + + return 0; +} + +int nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) +{ + uint32_t magic; + int ret; + + ret = read_sync_check(ioc, &magic, sizeof(magic)); + if (ret < 0) { + return ret; + } + + be32_to_cpus(&magic); + + switch (magic) { + case NBD_SIMPLE_REPLY_MAGIC: + 
reply->simple = true; + ret = nbd_receive_simple_reply(ioc, reply); + break; + case NBD_STRUCTURED_REPLY_MAGIC: + reply->simple = false; + ret = nbd_receive_structured_reply_chunk(ioc, reply); + break; + default: + LOG("invalid magic (got 0x%" PRIx32 ")", magic); + return -EINVAL; + } + + if (ret < 0) { + return ret; + } reply->error = nbd_errno_to_system_errno(reply->error); @@ -827,10 +968,5 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) ", handle = %" PRIu64" }", magic, reply->error, reply->handle); - if (magic != NBD_SIMPLE_REPLY_MAGIC) { - LOG("invalid magic (got 0x%" PRIx32 ")", magic); - return -EINVAL; - } return 0; } - diff --git a/qemu-nbd.c b/qemu-nbd.c index c734f627b4..de0099e333 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -272,7 +272,7 @@ static void *nbd_client_thread(void *arg) ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), NULL, &nbdflags, NULL, NULL, NULL, - &size, &local_error); + &size, NULL, &local_error); if (ret < 0) { if (local_error) { error_report_err(local_error); -- 2.11.0