Re: [Qemu-block] [Qemu-devel] [PATCH v3 02/44] nbd: Quit server after any write error

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> We should never ignore failure from nbd_negotiate_send_rep(); if
> we are unable to write to the client, then it is not worth trying
> to continue the negotiation.  Fortunately, the problem is not
> too severe - chances are that the errors being ignored here (mainly
> inability to write the reply to the client) are indications of
> a closed connection or something similar, which will also affect
> the next attempt to interact with the client and eventually reach
> a point where the errors are detected to end the loop.
> 
> Signed-off-by: Eric Blake 

Reviewed-by: Alex Bligh 

> ---
> nbd/server.c | 32 +++-
> 1 file changed, 23 insertions(+), 9 deletions(-)
> 
> diff --git a/nbd/server.c b/nbd/server.c
> index fc36f4d..0a003e4 100644
> --- a/nbd/server.c
> +++ b/nbd/server.c
> @@ -334,7 +334,10 @@ static QIOChannel 
> *nbd_negotiate_handle_starttls(NBDClient *client,
> return NULL;
> }
> 
> -nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, NBD_OPT_STARTTLS);
> +if (nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK,
> +   NBD_OPT_STARTTLS) < 0) {
> +return NULL;
> +}
> 
> tioc = qio_channel_tls_new_server(ioc,
>   client->tlscreds,
> @@ -460,8 +463,11 @@ static int nbd_negotiate_options(NBDClient *client)
> if (nbd_negotiate_drop_sync(client->ioc, length) != length) {
> return -EIO;
> }
> -nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_TLS_REQD,
> -   clientflags);
> +ret = nbd_negotiate_send_rep(client->ioc, 
> NBD_REP_ERR_TLS_REQD,
> + clientflags);
> +if (ret < 0) {
> +return ret;
> +}
> break;
> }
> } else if (fixedNewstyle) {
> @@ -485,12 +491,17 @@ static int nbd_negotiate_options(NBDClient *client)
> }
> if (client->tlscreds) {
> TRACE("TLS already enabled");
> -nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_INVALID,
> -   clientflags);
> +ret = nbd_negotiate_send_rep(client->ioc,
> + NBD_REP_ERR_INVALID,
> + clientflags);
> } else {
> TRACE("TLS not configured");
> -nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_POLICY,
> -   clientflags);
> +ret = nbd_negotiate_send_rep(client->ioc,
> + NBD_REP_ERR_POLICY,
> + clientflags);
> +}
> +if (ret < 0) {
> +return ret;
> }
> break;
> default:
> @@ -498,8 +509,11 @@ static int nbd_negotiate_options(NBDClient *client)
> if (nbd_negotiate_drop_sync(client->ioc, length) != length) {
> return -EIO;
> }
> -nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_UNSUP,
> -   clientflags);
> +ret = nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_UNSUP,
> + clientflags);
> +if (ret < 0) {
> +return ret;
> +}
> break;
> }
> } else {
> -- 
> 2.5.5
> 
> 

-- 
Alex Bligh







Re: [Qemu-block] [Qemu-devel] [PATCH v3 07/44] nbd: Limit nbdflags to 16 bits

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> Rather than asserting that nbdflags is within range, just give
> it the correct type to begin with :)  nbdflags corresponds to
> the per-export portion of NBD Protocol "transmission flags", which
> is 16 bits in response to NBD_OPT_EXPORT_NAME and NBD_OPT_GO.
> 
> Furthermore, upstream NBD has never passed the global flags to
> the kernel via ioctl(NBD_SET_FLAGS) (the ioctl was first
> introduced in NBD 2.9.22; then a latent bug in NBD 3.1 actually
> tried to OR the global flags with the transmission flags, with
> the disaster that the addition of NBD_FLAG_NO_ZEROES in 3.9
> caused all earlier NBD 3.x clients to treat every export as
> read-only; NBD 3.10 and later intentionally clip things to 16
> bits to pass only transmission flags).  Qemu should follow suit,
> since the current two global flags (NBD_FLAG_FIXED_NEWSTYLE
> and NBD_FLAG_NO_ZEROES) have no impact on the kernel's behavior
> during transmission.
> 
> Signed-off-by: Eric Blake 

Looks sensible, but NBD has at least three types of flags. Perhaps
rather than calling them nbdflags you could call them
nbdtransmissionflags, nbdclientflags or whatever which might
help avoid this confusion in future.

Alex


> 
> ---
> v3: expand scope of patch
> ---
> block/nbd-client.h  |  2 +-
> include/block/nbd.h |  6 +++---
> nbd/client.c| 28 +++-
> nbd/server.c| 10 --
> qemu-nbd.c  |  4 ++--
> 5 files changed, 25 insertions(+), 25 deletions(-)
> 
> diff --git a/block/nbd-client.h b/block/nbd-client.h
> index bc7aec0..1243612 100644
> --- a/block/nbd-client.h
> +++ b/block/nbd-client.h
> @@ -20,7 +20,7 @@
> typedef struct NbdClientSession {
> QIOChannelSocket *sioc; /* The master data channel */
> QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
> -uint32_t nbdflags;
> +uint16_t nbdflags;
> off_t size;
> 
> CoMutex send_mutex;
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index b86a976..134f117 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -83,11 +83,11 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc,
>  size_t offset,
>  size_t length,
>  bool do_read);
> -int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint32_t *flags,
> +int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags,
>   QCryptoTLSCreds *tlscreds, const char *hostname,
>   QIOChannel **outioc,
>   off_t *size, Error **errp);
> -int nbd_init(int fd, QIOChannelSocket *sioc, uint32_t flags, off_t size);
> +int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size);
> ssize_t nbd_send_request(QIOChannel *ioc, struct nbd_request *request);
> ssize_t nbd_receive_reply(QIOChannel *ioc, struct nbd_reply *reply);
> int nbd_client(int fd);
> @@ -97,7 +97,7 @@ typedef struct NBDExport NBDExport;
> typedef struct NBDClient NBDClient;
> 
> NBDExport *nbd_export_new(BlockBackend *blk, off_t dev_offset, off_t size,
> -  uint32_t nbdflags, void (*close)(NBDExport *),
> +  uint16_t nbdflags, void (*close)(NBDExport *),
>   Error **errp);
> void nbd_export_close(NBDExport *exp);
> void nbd_export_get(NBDExport *exp);
> diff --git a/nbd/client.c b/nbd/client.c
> index f1afa49..937344c 100644
> --- a/nbd/client.c
> +++ b/nbd/client.c
> @@ -406,7 +406,7 @@ static QIOChannel *nbd_receive_starttls(QIOChannel *ioc,
> }
> 
> 
> -int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint32_t *flags,
> +int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags,
>   QCryptoTLSCreds *tlscreds, const char *hostname,
>   QIOChannel **outioc,
>   off_t *size, Error **errp)
> @@ -466,7 +466,6 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char 
> *name, uint32_t *flags,
> uint32_t opt;
> uint32_t namesize;
> uint16_t globalflags;
> -uint16_t exportflags;
> bool fixedNewStyle = false;
> 
> if (read_sync(ioc, &globalflags, sizeof(globalflags)) !=
> @@ -475,7 +474,6 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char 
> *name, uint32_t *flags,
> goto fail;
> }
> globalflags = be16_to_cpu(globalflags);
> -*flags = globalflags << 16;
> TRACE("Global flags are %" PRIx32, globalflags);
> if (globalflags & NBD_FLAG_FIXED_NEWSTYLE) {
> fixedNewStyle = true;
> @@ -543,17 +541,15 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char 
> *name, uint32_t *flags,
> goto fail;
> }
> *size = be64_to_cpu(s);
> -TRACE("Size is %" PRIu64, *size);
> 
> -if (read_sync(ioc, &exportflags, sizeof(exportflags)) !=
> -sizeof(exportflags)) {
> +if (rea

Re: [Qemu-block] [Qemu-devel] [PATCH v3 08/44] nbd: Add qemu-nbd -D for human-readable description

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> The NBD protocol allows servers to advertise a human-readable
> description alongside an export name during NBD_OPT_LIST.  Add
> an option to pass through the user's string to the NBD client.
> 
> Doing this also makes it easier to test commit 200650d4, which
> is the client counterpart of receiving the description.
> 
> Signed-off-by: Eric Blake 

Reviewed-by: Alex Bligh 

> ---
> include/block/nbd.h |  1 +
> nbd/nbd-internal.h  |  5 +++--
> nbd/server.c| 34 ++
> qemu-nbd.c  | 12 +++-
> qemu-nbd.texi   |  5 -
> 5 files changed, 45 insertions(+), 12 deletions(-)
> 
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index 134f117..3e2d76b 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -107,6 +107,7 @@ BlockBackend *nbd_export_get_blockdev(NBDExport *exp);
> 
> NBDExport *nbd_export_find(const char *name);
> void nbd_export_set_name(NBDExport *exp, const char *name);
> +void nbd_export_set_description(NBDExport *exp, const char *description);
> void nbd_export_close_all(void);
> 
> void nbd_client_new(NBDExport *exp,
> diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
> index 3791535..035ead4 100644
> --- a/nbd/nbd-internal.h
> +++ b/nbd/nbd-internal.h
> @@ -103,9 +103,10 @@ static inline ssize_t read_sync(QIOChannel *ioc, void 
> *buffer, size_t size)
> return nbd_wr_syncv(ioc, &iov, 1, 0, size, true);
> }
> 
> -static inline ssize_t write_sync(QIOChannel *ioc, void *buffer, size_t size)
> +static inline ssize_t write_sync(QIOChannel *ioc, const void *buffer,
> + size_t size)
> {
> -struct iovec iov = { .iov_base = buffer, .iov_len = size };
> +struct iovec iov = { .iov_base = (void *) buffer, .iov_len = size };
> 
> return nbd_wr_syncv(ioc, &iov, 1, 0, size, false);
> }
> diff --git a/nbd/server.c b/nbd/server.c
> index 31fc9cf..aa252a4 100644
> --- a/nbd/server.c
> +++ b/nbd/server.c
> @@ -61,6 +61,7 @@ struct NBDExport {
> 
> BlockBackend *blk;
> char *name;
> +char *description;
> off_t dev_offset;
> off_t size;
> uint16_t nbdflags;
> @@ -128,7 +129,8 @@ static ssize_t nbd_negotiate_read(QIOChannel *ioc, void 
> *buffer, size_t size)
> 
> }
> 
> -static ssize_t nbd_negotiate_write(QIOChannel *ioc, void *buffer, size_t 
> size)
> +static ssize_t nbd_negotiate_write(QIOChannel *ioc, const void *buffer,
> +   size_t size)
> {
> ssize_t ret;
> guint watch;
> @@ -224,11 +226,15 @@ static int nbd_negotiate_send_rep(QIOChannel *ioc, 
> uint32_t type, uint32_t opt)
> 
> static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp)
> {
> -uint64_t magic, name_len;
> +uint64_t magic;
> +size_t name_len, desc_len;
> uint32_t opt, type, len;
> +const char *name = exp->name ? exp->name : "";
> +const char *desc = exp->description ? exp->description : "";
> 
> -TRACE("Advertising export name '%s'", exp->name ? exp->name : "");
> -name_len = strlen(exp->name);
> +TRACE("Advertising export name '%s' description '%s'", name, desc);
> +name_len = strlen(name);
> +desc_len = strlen(desc);
> magic = cpu_to_be64(NBD_REP_MAGIC);
> if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) {
> LOG("write failed (magic)");
> @@ -244,18 +250,22 @@ static int nbd_negotiate_send_rep_list(QIOChannel *ioc, 
> NBDExport *exp)
> LOG("write failed (reply type)");
> return -EINVAL;
> }
> -len = cpu_to_be32(name_len + sizeof(len));
> +len = cpu_to_be32(name_len + desc_len + sizeof(len));
> if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) {
> LOG("write failed (length)");
> return -EINVAL;
> }
> len = cpu_to_be32(name_len);
> if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) {
> -LOG("write failed (length)");
> +LOG("write failed (name length)");
> return -EINVAL;
> }
> -if (nbd_negotiate_write(ioc, exp->name, name_len) != name_len) {
> -LOG("write failed (buffer)");
> +if (nbd_negotiate_write(ioc, name, name_len) != name_len) {
> +LOG("write failed (name buffer)");
> +return -EINVAL;
> +}
> +if (nbd_negotiate_write(ioc, desc, desc_len) != desc_len) {
> +LOG("write failed (description buffer)");
> return -EINVAL;
> }
> return 0;
> @@ -877,6 +887,12 @@ void nbd_export_set_name(NBDExport *exp, const char 
> *name)
> nbd_export_put(exp);
> }
> 
> +void nbd_export_set_description(NBDExport *exp, const char *description)
> +{
> +g_free(exp->description);
> +exp->description = g_strdup(description);
> +}
> +
> void nbd_export_close(NBDExport *exp)
> {
> NBDClient *client, *next;
> @@ -886,6 +902,7 @@ void nbd_export_close(NBDExport *exp)
> client_close(client);
> }
> nbd_export_set_name(exp, NULL);
> + 

Re: [Qemu-block] [Qemu-devel] [PATCH v3 03/44] nbd: Improve server handling of bogus commands

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> We have a few bugs in how we handle invalid client commands:
> 
> - A client can send an NBD_CMD_DISC where from + len overflows,
> convincing us to reply with an error and stay connected, even
> though the protocol requires us to silently disconnect. Fix by
> hoisting the special case sooner.
> 
> - A client can send an NBD_CMD_WRITE with bogus from and len,
> where we reply to the client with EINVAL without consuming the
> payload; this will normally cause us to fail if the next thing
> read is not the right magic, but in rare cases, could cause us
> to interpret the data payload as valid commands and do things
> not requested by the client. Fix by adding a complete flag to
> track whether we are in sync or must disconnect.
> 
> - If we report an error to NBD_CMD_READ, we are not writing out
> any data payload; but the protocol says that a client can expect
> to read the payload no matter what (and must instead ignore it),
> which means the client will start reading our next replies as
> its data payload. Fix by disconnecting.
> 
> Furthermore, we have split the checks for bogus from/len across
> two functions, when it is easier to do it all at once.
> 
> Signed-off-by: Eric Blake 
> ---
> nbd/server.c | 67 +---
> 1 file changed, 46 insertions(+), 21 deletions(-)
> 
> diff --git a/nbd/server.c b/nbd/server.c
> index 0a003e4..6a6b5a2 100644
> --- a/nbd/server.c
> +++ b/nbd/server.c
> @@ -52,6 +52,7 @@ struct NBDRequest {
> QSIMPLEQ_ENTRY(NBDRequest) entry;
> NBDClient *client;
> uint8_t *data;
> +bool complete;
> };
> 
> struct NBDExport {
> @@ -985,7 +986,13 @@ static ssize_t nbd_co_send_reply(NBDRequest *req, struct 
> nbd_reply *reply,
> return rc;
> }
> 
> -static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request 
> *request)
> +/* Collect a client request.  Return 0 if request looks valid, -EAGAIN
> + * to keep trying the collection, -EIO to drop connection right away,
> + * and any other negative value to report an error to the client
> + * (although the caller may still need to disconnect after reporting
> + * the error).  */
> +static ssize_t nbd_co_receive_request(NBDRequest *req,
> +  struct nbd_request *request)
> {
> NBDClient *client = req->client;
> uint32_t command;
> @@ -1003,16 +1010,26 @@ static ssize_t nbd_co_receive_request(NBDRequest 
> *req, struct nbd_request *reque
> goto out;
> }
> 
> -if ((request->from + request->len) < request->from) {
> -LOG("integer overflow detected! "
> -"you're probably being attacked");
> -rc = -EINVAL;
> -goto out;
> -}
> -
> TRACE("Decoding type");
> 
> command = request->type & NBD_CMD_MASK_COMMAND;
> +if (command == NBD_CMD_DISC) {
> +/* Special case: we're going to disconnect without a reply,
> + * whether or not flags, from, or len are bogus */
> +TRACE("Request type is DISCONNECT");
> +rc = -EIO;
> +goto out;
> +}
> +
> +/* Check for sanity in the parameters, part 1.  Defer as many
> + * checks as possible until after reading any NBD_CMD_WRITE
> + * payload, so we can try and keep the connection alive.  */
> +if ((request->from + request->len) < request->from) {
> +LOG("integer overflow detected, you're probably being attacked");

I realise this is unchanged since the previous code, but why
'you're probably being attacked'? More likely you're probably
using a buggy client.

> +rc = -EINVAL;
> +goto out;
> +}
> +
> if (command == NBD_CMD_READ || command == NBD_CMD_WRITE) {
> if (request->len > NBD_MAX_BUFFER_SIZE) {
> LOG("len (%" PRIu32" ) is larger than max len (%u)",
> @@ -1035,7 +1052,18 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, 
> struct nbd_request *reque
> rc = -EIO;
> goto out;
> }
> +req->complete = true;
> }
> +
> +/* Sanity checks, part 2. */
> +if (request->from + request->len > client->exp->size) {
> +LOG("operation past EOF; From: %" PRIu64 ", Len: %" PRIu32
> +", Size: %" PRIu64, request->from, request->len,
> +(uint64_t)client->exp->size);
> +rc = -EINVAL;
> +goto out;
> +}
> +
> rc = 0;
> 
> out:
> @@ -1077,14 +1105,6 @@ static void nbd_trip(void *opaque)
> goto error_reply;
> }
> command = request.type & NBD_CMD_MASK_COMMAND;
> -if (command != NBD_CMD_DISC && (request.from + request.len) > exp->size) 
> {
> -LOG("From: %" PRIu64 ", Len: %" PRIu32", Size: %" PRIu64
> -", Offset: %" PRIu64 "\n",
> -request.from, request.len,
> -(uint64_t)exp->size, (uint64_t)exp->dev_offset);
> -LOG("requested operation past EOF--bad client?");
> -goto invalid_request;
> -   

Re: [Qemu-block] [Qemu-devel] [PATCH v3 29/44] nbd: Avoid magic number for NBD max name size

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> Declare a constant and use that when determining if an export
> name fits within the constraints we are willing to support.
> 
> Note that upstream NBD recently documented that clients MUST
> support export names of 256 bytes (not including trailing NUL),
> and SHOULD support names up to 4096 bytes.  4096 is a bit big
> (we would lose benefits of stack-allocation of a name array),
> and we already have other limits in place (for example, qcow2
> snapshot names are clamped around 1024).  So for now, just
> stick to the required minimum.
> 
> Signed-off-by: Eric Blake 
Reviewed-by: Alex Bligh 
> 
> ---
> v3: enlarge the limit, and document choice of new value
> ---
> include/block/nbd.h | 6 ++
> nbd/client.c| 2 +-
> nbd/server.c| 4 ++--
> 3 files changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index 3e2d76b..2c753cc 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -76,6 +76,12 @@ enum {
> 
> /* Maximum size of a single READ/WRITE data buffer */
> #define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024)
> +/* Maximum size of an export name. The NBD spec requires 256 and
> + * suggests that servers support up to 4096, but we stick to only the
> + * required size so that we can stack-allocate the names, and because
> + * going larger would require an audit of more code to make sure we
> + * aren't overflowing some other buffer. */
> +#define NBD_MAX_NAME_SIZE 256
> 
> ssize_t nbd_wr_syncv(QIOChannel *ioc,
>  struct iovec *iov,
> diff --git a/nbd/client.c b/nbd/client.c
> index 4659df3..b700100 100644
> --- a/nbd/client.c
> +++ b/nbd/client.c
> @@ -210,7 +210,7 @@ static int nbd_receive_list(QIOChannel *ioc, char **name, 
> Error **errp)
> error_setg(errp, "incorrect option name length");
> return -1;
> }
> -if (namelen > 255) {
> +if (namelen > NBD_MAX_NAME_SIZE) {
> error_setg(errp, "export name length too long %" PRIu32, namelen);
> return -1;
> }
> diff --git a/nbd/server.c b/nbd/server.c
> index fa05a73..a20bf8a 100644
> --- a/nbd/server.c
> +++ b/nbd/server.c
> @@ -296,13 +296,13 @@ static int nbd_negotiate_handle_list(NBDClient *client, 
> uint32_t length)
> static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t 
> length)
> {
> int rc = -EINVAL;
> -char name[256];
> +char name[NBD_MAX_NAME_SIZE + 1];
> 
> /* Client sends:
> [20 ..  xx]   export name (length bytes)
>  */
> TRACE("Checking length");
> -if (length > 255) {
> +if (length >= sizeof(name)) {
> LOG("Bad length received");
> goto fail;
> }
> -- 
> 2.5.5
> 
> 

-- 
Alex Bligh







Re: [Qemu-block] [Qemu-devel] [PATCH v3 30/44] nbd: Treat flags vs. command type as separate fields

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> Current upstream NBD documents that requests have a 16-bit flags,
> followed by a 16-bit type integer; although older versions mentioned
> only a 32-bit field with masking to find flags.  Since the protocol
> is in network order (big-endian over the wire), the ABI is unchanged;
> but dealing with the flags as a separate field rather than masking
> will make it easier to add support for upcoming NBD extensions that
> increase the number of both flags and commands.
> 
> Improve some comments in nbd.h based on the current upstream
> NBD protocol (https://github.com/yoe/nbd/blob/master/doc/proto.md),
> and touch some nearby code to keep checkpatch.pl happy.
> 
> Signed-off-by: Eric Blake 
Reviewed-by: Alex Bligh 
> 
> ---
> v3: rebase to other changes earlier in series
> ---
> include/block/nbd.h | 18 --
> nbd/nbd-internal.h  |  4 ++--
> block/nbd-client.c  |  9 +++--
> nbd/client.c| 17 ++---
> nbd/server.c| 51 ++-
> 5 files changed, 53 insertions(+), 46 deletions(-)
> 
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index 2c753cc..f4ae86c 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -1,4 +1,5 @@
> /*
> + *  Copyright (C) 2016 Red Hat, Inc.
>  *  Copyright (C) 2005  Anthony Liguori 
>  *
>  *  Network Block Device
> @@ -27,7 +28,8 @@
> 
> struct nbd_request {
> uint32_t magic;
> -uint32_t type;
> +uint16_t flags;
> +uint16_t type;
> uint64_t handle;
> uint64_t from;
> uint32_t len;
> @@ -39,6 +41,8 @@ struct nbd_reply {
> uint64_t handle;
> } QEMU_PACKED;
> 
> +/* Transmission (export) flags: sent from server to client during handshake,
> +   but describe what will happen during transmission */
> #define NBD_FLAG_HAS_FLAGS  (1 << 0)/* Flags are there */
> #define NBD_FLAG_READ_ONLY  (1 << 1)/* Device is read-only */
> #define NBD_FLAG_SEND_FLUSH (1 << 2)/* Send FLUSH */
> @@ -46,10 +50,12 @@ struct nbd_reply {
> #define NBD_FLAG_ROTATIONAL (1 << 4)/* Use elevator algorithm - 
> rotational media */
> #define NBD_FLAG_SEND_TRIM  (1 << 5)/* Send TRIM (discard) */
> 
> -/* New-style global flags. */
> +/* New-style handshake (global) flags, sent from server to client, and
> +   control what will happen during handshake phase. */
> #define NBD_FLAG_FIXED_NEWSTYLE (1 << 0)/* Fixed newstyle protocol. */
> 
> -/* New-style client flags. */
> +/* New-style client flags, sent from client to server to control what happens
> +   during handshake phase. */
> #define NBD_FLAG_C_FIXED_NEWSTYLE   (1 << 0)/* Fixed newstyle protocol. */
> 
> /* Reply types. */
> @@ -60,10 +66,10 @@ struct nbd_reply {
> #define NBD_REP_ERR_INVALID ((UINT32_C(1) << 31) | 3) /* Invalid length. 
> */
> #define NBD_REP_ERR_TLS_REQD((UINT32_C(1) << 31) | 5) /* TLS required */
> 
> +/* Request flags, sent from client to server during transmission phase */
> +#define NBD_CMD_FLAG_FUA(1 << 0)
> 
> -#define NBD_CMD_MASK_COMMAND 0x
> -#define NBD_CMD_FLAG_FUA (1 << 16)
> -
> +/* Supported request types */
> enum {
> NBD_CMD_READ = 0,
> NBD_CMD_WRITE = 1,
> diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
> index 035ead4..d0108a1 100644
> --- a/nbd/nbd-internal.h
> +++ b/nbd/nbd-internal.h
> @@ -52,10 +52,10 @@
> /* This is all part of the "official" NBD API.
>  *
>  * The most up-to-date documentation is available at:
> - * https://github.com/yoe/nbd/blob/master/doc/proto.txt
> + * https://github.com/yoe/nbd/blob/master/doc/proto.md
>  */
> 
> -#define NBD_REQUEST_SIZE(4 + 4 + 8 + 8 + 4)
> +#define NBD_REQUEST_SIZE(4 + 2 + 2 + 8 + 8 + 4)
> #define NBD_REPLY_SIZE  (4 + 4 + 8)
> #define NBD_REQUEST_MAGIC   0x25609513
> #define NBD_REPLY_MAGIC 0x67446698
> diff --git a/block/nbd-client.c b/block/nbd-client.c
> index 878e879..285025d 100644
> --- a/block/nbd-client.c
> +++ b/block/nbd-client.c
> @@ -1,6 +1,7 @@
> /*
>  * QEMU Block driver for  NBD
>  *
> + * Copyright (C) 2016 Red Hat, Inc.
>  * Copyright (C) 2008 Bull S.A.S.
>  * Author: Laurent Vivier 
>  *
> @@ -252,7 +253,7 @@ static int nbd_co_writev_1(BlockDriverState *bs, int64_t 
> sector_num,
> 
> if ((*flags & BDRV_REQ_FUA) && (client->nbdflags & NBD_FLAG_SEND_FUA)) {
> *flags &= ~BDRV_REQ_FUA;
> -request.type |= NBD_CMD_FLAG_FUA;
> +request.flags |= NBD_CMD_FLAG_FUA;
> }
> 
> request.from = sector_num * 512;
> @@ -376,11 +377,7 @@ void nbd_client_attach_aio_context(BlockDriverState *bs,
> void nbd_client_close(BlockDriverState *bs)
> {
> NbdClientSession *client = nbd_get_client_session(bs);
> -struct nbd_request request = {
> -.type = NBD_CMD_DISC,
> -.from = 0,
> -.len = 0
> -};
> +struct nbd_request request = { .type = NBD_CMD_DISC };
> 
> if (client->ioc == NU

Re: [Qemu-block] [Qemu-devel] [PATCH v3 31/44] nbd: Share common reply-sending code in server

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> Rather than open-coding NBD_REP_SERVER, reuse the code we
> already have by adding a length parameter.  Additionally,
> the refactoring will make adding NBD_OPT_GO in a later patch
> easier.
> 
> Signed-off-by: Eric Blake 
Reviewed-by: Alex Bligh 
> 
> ---
> v3: rebase to changes earlier in series
> ---
> nbd/server.c | 48 +++-
> 1 file changed, 23 insertions(+), 25 deletions(-)
> 
> diff --git a/nbd/server.c b/nbd/server.c
> index 1d30b6d..4435d37 100644
> --- a/nbd/server.c
> +++ b/nbd/server.c
> @@ -195,12 +195,15 @@ static ssize_t nbd_negotiate_drop_sync(QIOChannel *ioc, 
> size_t size)
> 
> */
> 
> -static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t 
> opt)
> +/* Send a reply header, including length, but no payload.
> + * Return -errno to kill connection, 0 to continue negotiation. */
> +static int nbd_negotiate_send_rep_len(QIOChannel *ioc, uint32_t type,
> +  uint32_t opt, uint32_t len)
> {
> uint64_t magic;
> -uint32_t len;
> 
> -TRACE("Reply opt=%" PRIx32 " type=%" PRIx32, type, opt);
> +TRACE("Reply opt=%" PRIx32 " type=%" PRIx32 " len=%" PRIu32,
> +  type, opt, len);
> 
> magic = cpu_to_be64(NBD_REP_MAGIC);
> if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) {
> @@ -217,7 +220,7 @@ static int nbd_negotiate_send_rep(QIOChannel *ioc, 
> uint32_t type, uint32_t opt)
> LOG("write failed (rep type)");
> return -EINVAL;
> }
> -len = cpu_to_be32(0);
> +len = cpu_to_be32(len);
> if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) {
> LOG("write failed (rep data length)");
> return -EINVAL;
> @@ -225,37 +228,32 @@ static int nbd_negotiate_send_rep(QIOChannel *ioc, 
> uint32_t type, uint32_t opt)
> return 0;
> }
> 
> +/* Send a reply header with default 0 length.
> + * Return -errno to kill connection, 0 to continue negotiation. */
> +static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t 
> opt)
> +{
> +return nbd_negotiate_send_rep_len(ioc, type, opt, 0);
> +}
> +
> +/* Send an NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
> + * Return -errno to kill connection, 0 to continue negotiation. */
> static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp)
> {
> -uint64_t magic;
> size_t name_len, desc_len;
> -uint32_t opt, type, len;
> +uint32_t len;
> const char *name = exp->name ? exp->name : "";
> const char *desc = exp->description ? exp->description : "";
> +int rc;
> 
> TRACE("Advertising export name '%s' description '%s'", name, desc);
> name_len = strlen(name);
> desc_len = strlen(desc);
> -magic = cpu_to_be64(NBD_REP_MAGIC);
> -if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) {
> -LOG("write failed (magic)");
> -return -EINVAL;
> - }
> -opt = cpu_to_be32(NBD_OPT_LIST);
> -if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) != sizeof(opt)) {
> -LOG("write failed (opt)");
> -return -EINVAL;
> -}
> -type = cpu_to_be32(NBD_REP_SERVER);
> -if (nbd_negotiate_write(ioc, &type, sizeof(type)) != sizeof(type)) {
> -LOG("write failed (reply type)");
> -return -EINVAL;
> -}
> -len = cpu_to_be32(name_len + desc_len + sizeof(len));
> -if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) {
> -LOG("write failed (length)");
> -return -EINVAL;
> +len = name_len + desc_len + sizeof(len);
> +rc = nbd_negotiate_send_rep_len(ioc, NBD_REP_SERVER, NBD_OPT_LIST, len);
> +if (rc < 0) {
> +return rc;
> }
> +
> len = cpu_to_be32(name_len);
> if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) {
> LOG("write failed (name length)");
> -- 
> 2.5.5
> 
> 

-- 
Alex Bligh







Re: [Qemu-block] [Qemu-devel] [PATCH v3 32/44] nbd: Share common option-sending code in client

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> Rather than open-coding each option request, it's easier to
> have common helper functions do the work.  That in turn requires
> having convenient packed types for handling option requests
> and replies.
> 
> Signed-off-by: Eric Blake 
> Reviewed-by: Alex Bligh 
> 
> ---
> v3: rebase, tweak a debug message
> ---
> include/block/nbd.h |  29 +-
> nbd/nbd-internal.h  |   2 +-
> nbd/client.c| 250 ++--
> 3 files changed, 129 insertions(+), 152 deletions(-)
> 
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index f4ae86c..5227ec6 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -26,20 +26,41 @@
> #include "io/channel-socket.h"
> #include "crypto/tlscreds.h"
> 
> +/* Handshake phase structs */
> +
> +struct nbd_option {
> +uint64_t magic; /* NBD_OPTS_MAGIC */
> +uint32_t option; /* NBD_OPT_* */
> +uint32_t length;
> +} QEMU_PACKED;
> +typedef struct nbd_option nbd_option;
> +
> +struct nbd_opt_reply {
> +uint64_t magic; /* NBD_REP_MAGIC */
> +uint32_t option; /* NBD_OPT_* */
> +uint32_t type; /* NBD_REP_* */
> +uint32_t length;
> +} QEMU_PACKED;
> +typedef struct nbd_opt_reply nbd_opt_reply;
> +
> +/* Transmission phase structs */
> +
> struct nbd_request {
> -uint32_t magic;
> -uint16_t flags;
> -uint16_t type;
> +uint32_t magic; /* NBD_REQUEST_MAGIC */
> +uint16_t flags; /* NBD_CMD_FLAG_* */
> +uint16_t type; /* NBD_CMD_* */
> uint64_t handle;
> uint64_t from;
> uint32_t len;
> } QEMU_PACKED;
> +typedef struct nbd_request nbd_request;
> 
> struct nbd_reply {
> -uint32_t magic;
> +uint32_t magic; /* NBD_REPLY_MAGIC */
> uint32_t error;
> uint64_t handle;
> } QEMU_PACKED;
> +typedef struct nbd_reply nbd_reply;
> 
> /* Transmission (export) flags: sent from server to client during handshake,
>but describe what will happen during transmission */
> diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
> index d0108a1..95069db 100644
> --- a/nbd/nbd-internal.h
> +++ b/nbd/nbd-internal.h
> @@ -61,7 +61,7 @@
> #define NBD_REPLY_MAGIC 0x67446698
> #define NBD_OPTS_MAGIC  0x49484156454F5054LL
> #define NBD_CLIENT_MAGIC0x420281861253LL
> -#define NBD_REP_MAGIC   0x3e889045565a9LL
> +#define NBD_REP_MAGIC   0x0003e889045565a9LL
> 
> #define NBD_SET_SOCK_IO(0xab, 0)
> #define NBD_SET_BLKSIZE _IO(0xab, 1)
> diff --git a/nbd/client.c b/nbd/client.c
> index a9e173a..6626fa8 100644
> --- a/nbd/client.c
> +++ b/nbd/client.c
> @@ -75,64 +75,123 @@ static QTAILQ_HEAD(, NBDExport) exports = 
> QTAILQ_HEAD_INITIALIZER(exports);
> 
> */
> 
> +/* Send an option request. Return 0 if successful, -1 with errp set if
> + * it is impossible to continue. */
> +static int nbd_send_option_request(QIOChannel *ioc, uint32_t opt,
> +   uint32_t len, const char *data,
> +   Error **errp)
> +{
> +nbd_option req;
> +QEMU_BUILD_BUG_ON(sizeof(req) != 16);
> 
> -/* If type represents success, return 1 without further action.
> - * If type represents an error reply, consume the rest of the packet on ioc.
> - * Then return 0 for unsupported (so the client can fall back to
> - * other approaches), or -1 with errp set for other errors.
> +if (len == -1) {
> +req.length = len = strlen(data);
> +}
> +TRACE("Sending option request %"PRIu32", len %"PRIu32, opt, len);
> +
> +stq_be_p(&req.magic, NBD_OPTS_MAGIC);
> +stl_be_p(&req.option, opt);
> +stl_be_p(&req.length, len);
> +
> +if (write_sync(ioc, &req, sizeof(req)) != sizeof(req)) {
> +error_setg(errp, "Failed to send option request header");
> +return -1;
> +}
> +
> +if (len && write_sync(ioc, (char *) data, len) != len) {
> +error_setg(errp, "Failed to send option request data");
> +return -1;
> +}
> +
> +return 0;
> +}
> +
> +/* Receive the header of an option reply, which should match the given
> + * opt.  Read through the length field, but NOT the length bytes of
> + * payload. Return 0 if successful, -1 with errp set if it is
> + * impossible to continue. */
> +static int nbd_receive_option_reply(QIOChannel *ioc, uint32_t opt,
> +nbd_opt_reply *reply, Error **errp)
> +{
> +QEMU_BUILD_BUG_ON(sizeof(*reply) != 20);
> +if (read_sync(ioc, reply, sizeof(*reply)) != sizeof(*reply)) {
> +error_setg(errp, "failed to read option reply");
> +return -1;
> +}
> +be64_to_cpus(&reply->magic);
> +be32_to_cpus(&reply->option);
> +be32_to_cpus(&reply->type);
> +be32_to_cpus(&reply->length);
> +
> +TRACE("Received option reply %"PRIx32", type %"PRIx32", len %"PRIu32,
> +  reply->option, reply->type, reply->length);
> +
> +if (reply->magic != NBD_REP_MAGIC) {
> +error_setg(errp, "Unex

Re: [Qemu-block] [PATCH v18 0/8] Block replication for continuous checkpoints

2016-04-25 Thread Changlong Xie
The main migration code for COLO looks OK now,and they depend on my 
block part. So i'd like to ping here.


Thanks
-Xie

On 04/15/2016 04:10 PM, Changlong Xie wrote:

Block replication is a very important feature which is used for
continuous checkpoints(for example: COLO).

You can get the detailed information about block replication from here:
http://wiki.qemu.org/Features/BlockReplication

Usage:
Please refer to docs/block-replication.txt

This patch series is based on the following patch series:
http://lists.nongnu.org/archive/html/qemu-devel/2016-04/msg02093.html

You can get the patch here:
https://github.com/Pating/qemu/tree/changlox/block-replication-v18

You can get the patch with framework here:
https://github.com/Pating/qemu/tree/changlox/colo_framework_v17

TODO:
1. Continuous block replication. It will be started after basic functions
are accepted.

Changs Log:
V18:
p6: add local_err in all replication callbacks to prevent "errp == NULL"
p7: add missing qemu_iovec_destroy(xxx)
V17:
1. Rebase to the lastest codes
p2: refactor backup_do_checkpoint addressed comments from Jeff Cody
p4: fix bugs in "drive_add buddy xxx" hmp commands
p6: add "since: 2.7"
p7: fix bug in replication_close(), add missing "qapi/error.h", add 
test-replication
p8: add "since: 2.7"
V16:
1. Rebase to the newest codes
2. Address comments from Stefan & hailiang
p3: we don't need this patch now
p4: add "top-id" parameters for secondary
p6: fix NULL pointer in replication callbacks, remove unnecessary typedefs,
add doc comments that explain the semantics of Replication
p7: Refactor AioContext for thread-safe, remove unnecessary get_top_bs()
*Note*: I'm working on replication testcase now, will send out in V17
V15:
1. Rebase to the newest codes
2. Fix typos and coding style addresed Eric's comments
3. Address Stefan's comments
1) Make backup_do_checkpoint public, drop the changes on BlockJobDriver
2) Update the message and description for [PATCH 4/9]
3) Make replication_(start/stop/do_checkpoint)_all as global interfaces
4) Introduce AioContext lock to protect start/stop/do_checkpoint callbacks
5) Use BdrvChild instead of holding on to BlockDriverState * pointers
4. Clear BDRV_O_INACTIVE for hidden disk's open_flags since commit 09e0c771
5. Introduce replication_get_error_all to check replication status
6. Remove useless discard interface
V14:
1. Implement auto complete active commit
2. Implement active commit block job for replication.c
3. Address the comments from Stefan, add replication-specific API and data
structure, also remove old block layer APIs
V13:
1. Rebase to the newest codes
2. Remove redundant marcos and semicolon in replication.c
3. Fix typos in block-replication.txt
V12:
1. Rebase to the newest codes
2. Use backing reference to replcace 'allow-write-backing-file'
V11:
1. Reopen the backing file when starting blcok replication if it is not
opened in R/W mode
2. Unblock BLOCK_OP_TYPE_BACKUP_SOURCE and BLOCK_OP_TYPE_BACKUP_TARGET
when opening backing file
3. Block the top BDS so there is only one block job for the top BDS and
its backing chain.
V10:
1. Use blockdev-remove-medium and blockdev-insert-medium to replace backing
reference.
2. Address the comments from Eric Blake
V9:
1. Update the error messages
2. Rebase to the newest qemu
3. Split child add/delete support. These patches are sent in another patchset.
V8:
1. Address Alberto Garcia's comments
V7:
1. Implement adding/removing quorum child. Remove the option non-connect.
2. Simplify the backing refrence option according to Stefan Hajnoczi's 
suggestion
V6:
1. Rebase to the newest qemu.
V5:
1. Address the comments from Gong Lei
2. Speed the failover up. The secondary vm can take over very quickly even
if there are too many I/O requests.
V4:
1. Introduce a new driver replication to avoid touch nbd and qcow2.
V3:
1: use error_setg() instead of error_set()
2. Add a new block job API
3. Active disk, hidden disk and nbd target uses the same AioContext
4. Add a testcase to test new hbitmap API
V2:
1. Redesign the secondary qemu(use image-fleecing)
2. Use Error objects to return error message
3. Address the comments from Max Reitz and Eric Blake

Changlong Xie (1):
   Introduce new APIs to do replication operation

Wen Congyang (7):
   unblock backup operations in backing file
   Backup: clear all bitmap when doing block checkpoint
   Link backup into block core
   docs: block replication's description
   auto complete active commit
   Implement new driver for block replication
   support replication driver in blockdev-add

  Makefile.objs  |   1 +
  block.c|  17 ++
  block/Makefile.objs|   3 +-
  block/backup.c |  17 ++
  block/mirror.c |  13 +-
  block/replication.c| 625 +
  blockdev.c |   2 +-
  docs/block-replication.txt | 239 +
  include/block/block_int.h  |

Re: [Qemu-block] [Qemu-devel] [PATCH v3 36/44] nbd: Improve handling of shutdown requests

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> NBD commit 6d34500b clarified how clients and servers are supposed
> to behave before closing a connection. It added NBD_REP_ERR_SHUTDOWN
> (for the server to announce it is about to go away during option
> haggling, so the client should quit sending NBD_OPT_* other than
> NBD_OPT_ABORT) and ESHUTDOWN (for the server to announce it is about
> to go away during transmission, so the client should quit sending
> NBD_CMD_* other than NBD_CMD_DISC).  It also clarified that
> NBD_OPT_ABORT gets a reply, while NBD_CMD_DISC does not.
> 
> This patch merely adds the missing reply to NBD_OPT_ABORT and teaches
> the client to recognize server errors.  Actually teaching the server
> to send NBD_REP_ERR_SHUTDOWN or ESHUTDOWN would require knowing that
> the server has been requested to shut down soon (maybe we could do
> that by installing a SIGINT handler in qemu-nbd, which transitions
> from RUNNING to a new state that waits for the client to react,
> rather than just out-right quitting).
> 
> Signed-off-by: Eric Blake 
> ---
> include/block/nbd.h | 13 +
> nbd/nbd-internal.h  |  1 +
> nbd/client.c| 16 
> nbd/server.c| 16 +++-
> 4 files changed, 41 insertions(+), 5 deletions(-)
> 
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index d707761..2fd1a67 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -82,12 +82,17 @@ typedef struct nbd_reply nbd_reply;
> #define NBD_FLAG_C_NO_ZEROES  (1 << 1) /* End handshake without zeroes. */
> 
> /* Reply types. */
> +#define NBD_REP_ERR(value) ((UINT32_C(1) << 31) | (value))
> +
> #define NBD_REP_ACK (1) /* Data sending finished. */
> #define NBD_REP_SERVER  (2) /* Export description. */
> -#define NBD_REP_ERR_UNSUP   ((UINT32_C(1) << 31) | 1) /* Unknown option. 
> */
> -#define NBD_REP_ERR_POLICY  ((UINT32_C(1) << 31) | 2) /* Server denied */
> -#define NBD_REP_ERR_INVALID ((UINT32_C(1) << 31) | 3) /* Invalid length. 
> */
> -#define NBD_REP_ERR_TLS_REQD((UINT32_C(1) << 31) | 5) /* TLS required */
> +
> +#define NBD_REP_ERR_UNSUP   NBD_REP_ERR(1)  /* Unknown option. */
> +#define NBD_REP_ERR_POLICY  NBD_REP_ERR(2)  /* Server denied */
> +#define NBD_REP_ERR_INVALID NBD_REP_ERR(3)  /* Invalid length */
> +#define NBD_REP_ERR_PLATFORMNBD_REP_ERR(4)  /* Not compiled in */
> +#define NBD_REP_ERR_TLS_REQDNBD_REP_ERR(5)  /* TLS required */
> +#define NBD_REP_ERR_SHUTDOWNNBD_REP_ERR(7)  /* Server shutting down */
> 
> /* Request flags, sent from client to server during transmission phase */
> #define NBD_CMD_FLAG_FUA(1 << 0)
> diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
> index 95069db..0d40b1f 100644
> --- a/nbd/nbd-internal.h
> +++ b/nbd/nbd-internal.h
> @@ -91,6 +91,7 @@
> #define NBD_ENOMEM 12
> #define NBD_EINVAL 22
> #define NBD_ENOSPC 28
> +#define NBD_ESHUTDOWN  108
> 
> static inline ssize_t read_sync(QIOChannel *ioc, void *buffer, size_t size)
> {
> diff --git a/nbd/client.c b/nbd/client.c
> index 68e9473..4140d13 100644
> --- a/nbd/client.c
> +++ b/nbd/client.c
> @@ -34,6 +34,8 @@ static int nbd_errno_to_system_errno(int err)
> return ENOMEM;
> case NBD_ENOSPC:
> return ENOSPC;
> +case NBD_ESHUTDOWN:
> +return ESHUTDOWN;
> default:
> TRACE("Squashing unexpected error %d to EINVAL", err);
> /* fallthrough */
> @@ -210,11 +212,21 @@ static int nbd_handle_reply_err(QIOChannel *ioc, 
> nbd_opt_reply *reply,
>reply->option);
> break;
> 
> +case NBD_REP_ERR_PLATFORM:
> +error_setg(errp, "Server lacks support for option %" PRIx32,
> +   reply->option);
> +break;
> +
> case NBD_REP_ERR_TLS_REQD:
> error_setg(errp, "TLS negotiation required before option %" PRIx32,
>reply->option);
> break;
> 
> +case NBD_REP_ERR_SHUTDOWN:
> +error_setg(errp, "Server shutting down before option %" PRIx32,
> +   reply->option);
> +break;
> +
> default:
> error_setg(errp, "Unknown error code when asking for option %" PRIx32,
>reply->option);
> @@ -754,6 +766,10 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, struct 
> nbd_reply *reply)
> LOG("invalid magic (got 0x%" PRIx32 ")", magic);
> return -EINVAL;
> }
> +if (reply->error == ESHUTDOWN) {
> +LOG("server shutting down");
> +return -EINVAL;
> +}
> return 0;
> }
> 
> diff --git a/nbd/server.c b/nbd/server.c
> index dadc928..fa6a994 100644
> --- a/nbd/server.c
> +++ b/nbd/server.c
> @@ -39,6 +39,8 @@ static int system_errno_to_nbd_errno(int err)
> case EFBIG:
> case ENOSPC:
> return NBD_ENOSPC;
> +case ESHUTDOWN:
> +return NBD_ESHUTDOWN;
> case EINVAL:
> default:
> return NBD_EINVAL;
> @@ -484,6 +486

Re: [Qemu-block] [Qemu-devel] [PATCH v3 40/44] nbd: Implement NBD_OPT_GO on client

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> NBD_OPT_EXPORT_NAME is lousy: it doesn't have any sane error
> reporting.  Upstream NBD recently added NBD_OPT_GO as the
> improved version of the option that does what we want: it
> reports sane errors on failures (including when a server
> requires TLS but does not have NBD_OPT_GO!), and on success
> it provides at least as much info as NBD_OPT_EXPORT_NAME sends.
> 
> Signed-off-by: Eric Blake 
> 

Reviewed-by: Alex Bligh 

> ---
> v3: revamp to match latest version of NBD protocol
> ---
> nbd/nbd-internal.h |   3 ++
> nbd/client.c   | 120 -
> 2 files changed, 121 insertions(+), 2 deletions(-)
> 
> diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
> index c597bb8..1935102 100644
> --- a/nbd/nbd-internal.h
> +++ b/nbd/nbd-internal.h
> @@ -55,8 +55,11 @@
>  * https://github.com/yoe/nbd/blob/master/doc/proto.md
>  */
> 
> +/* Size of all NBD_OPT_*, without payload */
> #define NBD_REQUEST_SIZE(4 + 2 + 2 + 8 + 8 + 4)
> +/* Size of all NBD_REP_* sent in answer to most NBD_OPT_*, without payload */
> #define NBD_REPLY_SIZE  (4 + 4 + 8)
> +
> #define NBD_REQUEST_MAGIC   0x25609513
> #define NBD_REPLY_MAGIC 0x67446698
> #define NBD_OPTS_MAGIC  0x49484156454F5054LL
> diff --git a/nbd/client.c b/nbd/client.c
> index 89fa2c3..dac4f29 100644
> --- a/nbd/client.c
> +++ b/nbd/client.c
> @@ -222,6 +222,11 @@ static int nbd_handle_reply_err(QIOChannel *ioc, 
> nbd_opt_reply *reply,
>reply->option);
> break;
> 
> +case NBD_REP_ERR_UNKNOWN:
> +error_setg(errp, "Requested export not available for option %" 
> PRIx32,
> +   reply->option);
> +break;
> +
> case NBD_REP_ERR_SHUTDOWN:
> error_setg(errp, "Server shutting down before option %" PRIx32,
>reply->option);
> @@ -311,6 +316,103 @@ static int nbd_receive_list(QIOChannel *ioc, const char 
> *want, Error **errp)
> }
> 
> 
> +/* Returns -1 if NBD_OPT_GO proves the export @wantname cannot be
> + * used, 0 if NBD_OPT_GO is unsupported (fall back to NBD_OPT_LIST and
> + * NBD_OPT_EXPORT_NAME in that case), and > 0 if the export is good to
> + * go (with @size and @flags populated). */
> +static int nbd_opt_go(QIOChannel *ioc, const char *wantname,
> +  NbdExportInfo *info, Error **errp)
> +{
> +nbd_opt_reply reply;
> +uint32_t len;
> +uint16_t type;
> +int error;
> +
> +/* The protocol requires that the server send NBD_INFO_EXPORT with
> + * a non-zero flags (at least NBD_FLAG_HAS_FLAGS must be set); so
> + * flags still 0 is a witness of a broken server. */
> +info->flags = 0;
> +
> +TRACE("Attempting NBD_OPT_GO for export '%s'", wantname);
> +if (nbd_send_option_request(ioc, NBD_OPT_GO, -1, wantname, errp) < 0) {
> +return -1;
> +}
> +
> +TRACE("Reading export info");
> +while (1) {
> +if (nbd_receive_option_reply(ioc, NBD_OPT_GO, &reply, errp) < 0) {
> +return -1;
> +}
> +error = nbd_handle_reply_err(ioc, &reply, errp);
> +if (error <= 0) {
> +return error;
> +}
> +len = reply.length;
> +
> +if (reply.type == NBD_REP_ACK) {
> +/* Server is done sending info and moved into transmission
> +   phase, but make sure it sent flags */
> +if (len) {
> +error_setg(errp, "server sent invalid NBD_REP_ACK");
> +return -1;
> +}
> +if (!info->flags) {
> +error_setg(errp, "broken server omitted NBD_INFO_EXPORT");
> +return -1;
> +}
> +TRACE("export is good to go");
> +return 1;
> +}
> +if (reply.type != NBD_REP_INFO) {
> +error_setg(errp, "unexpected reply type %" PRIx32 ", expected 
> %x",
> +   reply.type, NBD_REP_INFO);
> +return -1;
> +}
> +if (len < sizeof(type)) {
> +error_setg(errp, "NBD_REP_INFO length %" PRIu32 " is too short",
> +   len);
> +return -1;
> +}
> +if (read_sync(ioc, &type, sizeof(type)) != sizeof(type)) {
> +error_setg(errp, "failed to read info type");
> +return -1;
> +}
> +len -= sizeof(type);
> +be16_to_cpus(&type);
> +switch (type) {
> +case NBD_INFO_EXPORT:
> +if (len != sizeof(info->size) + sizeof(info->flags)) {
> +error_setg(errp, "remaining export info len %" PRIu32
> +   " is unexpected size", len);
> +return -1;
> +}
> +if (read_sync(ioc, &info->size, sizeof(info->size)) !=
> +sizeof(info->size)) {
> +error_setg(errp, "failed to read info size");
> +return -1;
> +   

Re: [Qemu-block] [Qemu-devel] [PATCH v3 41/44] nbd: Implement NBD_CMD_WRITE_ZEROES on server

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> Upstream NBD protocol recently added the ability to efficiently
> write zeroes without having to send the zeroes over the wire,
> along with a flag to control whether the client wants a hole.
> 
> Signed-off-by: Eric Blake 

Reviewed-by: Alex Bligh 

> 
> ---
> v3: abandon NBD_CMD_CLOSE extension, rebase to use blk_pwrite_zeroes
> ---
> include/block/nbd.h |  7 +--
> nbd/server.c| 42 --
> 2 files changed, 45 insertions(+), 4 deletions(-)
> 
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index 05c0e48..1072d9e 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -70,6 +70,7 @@ typedef struct nbd_reply nbd_reply;
> #define NBD_FLAG_SEND_FUA   (1 << 3)/* Send FUA (Force Unit 
> Access) */
> #define NBD_FLAG_ROTATIONAL (1 << 4)/* Use elevator algorithm - 
> rotational media */
> #define NBD_FLAG_SEND_TRIM  (1 << 5)/* Send TRIM (discard) */
> +#define NBD_FLAG_SEND_WRITE_ZEROES (1 << 6) /* Send WRITE_ZEROES */
> 
> /* New-style handshake (global) flags, sent from server to client, and
>control what will happen during handshake phase. */
> @@ -102,7 +103,8 @@ typedef struct nbd_reply nbd_reply;
> #define NBD_INFO_DESCRIPTION2
> 
> /* Request flags, sent from client to server during transmission phase */
> -#define NBD_CMD_FLAG_FUA(1 << 0)
> +#define NBD_CMD_FLAG_FUA(1 << 0) /* 'force unit access' during write 
> */
> +#define NBD_CMD_FLAG_NO_HOLE(1 << 1) /* don't punch hole on zero run */
> 
> /* Supported request types */
> enum {
> @@ -110,7 +112,8 @@ enum {
> NBD_CMD_WRITE = 1,
> NBD_CMD_DISC = 2,
> NBD_CMD_FLUSH = 3,
> -NBD_CMD_TRIM = 4
> +NBD_CMD_TRIM = 4,
> +NBD_CMD_WRITE_ZEROES = 5,
> };
> 
> #define NBD_DEFAULT_PORT  10809
> diff --git a/nbd/server.c b/nbd/server.c
> index 1edb5f3..563afb2 100644
> --- a/nbd/server.c
> +++ b/nbd/server.c
> @@ -689,7 +689,8 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData 
> *data)
> char buf[8 + 8 + 8 + 128];
> int rc;
> const uint16_t myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
> -  NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
> +  NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA |
> +  NBD_FLAG_SEND_WRITE_ZEROES);
> bool oldStyle;
> size_t len;
> 
> @@ -1199,11 +1200,17 @@ static ssize_t nbd_co_receive_request(NBDRequest *req,
> rc = -EINVAL;
> goto out;
> }
> -if (request->flags & ~NBD_CMD_FLAG_FUA) {
> +if (request->flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) {
> LOG("unsupported flags (got 0x%x)", request->flags);
> rc = -EINVAL;
> goto out;
> }
> +if (request->type != NBD_CMD_WRITE_ZEROES &&
> +(request->flags & NBD_CMD_FLAG_NO_HOLE)) {
> +LOG("unexpected flags (got 0x%x)", request->flags);
> +rc = -EINVAL;
> +goto out;
> +}
> 
> rc = 0;
> 
> @@ -1308,6 +1315,37 @@ static void nbd_trip(void *opaque)
> }
> break;
> 
> +case NBD_CMD_WRITE_ZEROES:
> +TRACE("Request type is WRITE_ZEROES");
> +
> +if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
> +TRACE("Server is read-only, return error");
> +reply.error = EROFS;
> +goto error_reply;
> +}
> +
> +TRACE("Writing to device");
> +
> +flags = 0;
> +if (request.flags & NBD_CMD_FLAG_FUA) {
> +flags |= BDRV_REQ_FUA;
> +}
> +if (!(request.flags & NBD_CMD_FLAG_NO_HOLE)) {
> +flags |= BDRV_REQ_MAY_UNMAP;
> +}
> +ret = blk_pwrite_zeroes(exp->blk, request.from + exp->dev_offset,
> +request.len, flags);
> +if (ret < 0) {
> +LOG("writing to file failed");
> +reply.error = -ret;
> +goto error_reply;
> +}
> +
> +if (nbd_co_send_reply(req, &reply, 0) < 0) {
> +goto out;
> +}
> +break;
> +
> case NBD_CMD_DISC:
> /* unreachable, thanks to special case in nbd_co_receive_request() */
> abort();
> -- 
> 2.5.5
> 
> 

-- 
Alex Bligh







Re: [Qemu-block] [Qemu-devel] [PATCH v3 42/44] nbd: Implement NBD_CMD_WRITE_ZEROES on client

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> Upstream NBD protocol recently added the ability to efficiently
> write zeroes without having to send the zeroes over the wire,
> along with a flag to control whether the client wants a hole.
> 
> The generic block code takes care of falling back to the obvious
> write of lots of zeroes if we return -ENOTSUP because the server
> does not have WRITE_ZEROES.
> 
> Signed-off-by: Eric Blake 
Reviewed-by: Alex Bligh 
> 
> ---
> v3: rebase, tell block layer about our support
> ---
> block/nbd-client.h |  2 ++
> block/nbd-client.c | 34 ++
> block/nbd.c| 24 
> 3 files changed, 60 insertions(+)
> 
> diff --git a/block/nbd-client.h b/block/nbd-client.h
> index 0867147..07630ab 100644
> --- a/block/nbd-client.h
> +++ b/block/nbd-client.h
> @@ -46,6 +46,8 @@ void nbd_client_close(BlockDriverState *bs);
> int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
>   int nb_sectors);
> int nbd_client_co_flush(BlockDriverState *bs);
> +int nbd_client_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
> +   int nb_sectors, int *flags);
> int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
>  int nb_sectors, QEMUIOVector *qiov, int *flags);
> int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
> diff --git a/block/nbd-client.c b/block/nbd-client.c
> index f20219b..2b6ac27 100644
> --- a/block/nbd-client.c
> +++ b/block/nbd-client.c
> @@ -291,6 +291,40 @@ int nbd_client_co_readv(BlockDriverState *bs, int64_t 
> sector_num,
> return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
> }
> 
> +int nbd_client_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
> +   int nb_sectors, int *flags)
> +{
> +ssize_t ret;
> +NbdClientSession *client = nbd_get_client_session(bs);
> +struct nbd_request request = { .type = NBD_CMD_WRITE_ZEROES };
> +struct nbd_reply reply;
> +
> +if (!(client->info.flags & NBD_FLAG_SEND_WRITE_ZEROES)) {
> +return -ENOTSUP;
> +}
> +
> +if ((*flags & BDRV_REQ_FUA) && (client->info.flags & NBD_FLAG_SEND_FUA)) 
> {
> +*flags &= ~BDRV_REQ_FUA;
> +request.flags |= NBD_CMD_FLAG_FUA;
> +}
> +if (!(*flags & BDRV_REQ_MAY_UNMAP)) {
> +request.flags |= NBD_CMD_FLAG_NO_HOLE;
> +}
> +
> +request.from = sector_num * 512;
> +request.len = nb_sectors * 512;
> +
> +nbd_coroutine_start(client, &request);
> +ret = nbd_co_send_request(bs, &request, NULL, 0);
> +if (ret < 0) {
> +reply.error = -ret;
> +} else {
> +nbd_co_receive_reply(client, &request, &reply, NULL, 0);
> +}
> +nbd_coroutine_end(client, &request);
> +return -reply.error;
> +}
> +
> int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
>  int nb_sectors, QEMUIOVector *qiov, int *flags)
> {
> diff --git a/block/nbd.c b/block/nbd.c
> index 34db83e..5172039 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -355,6 +355,26 @@ static int nbd_co_readv(BlockDriverState *bs, int64_t 
> sector_num,
> return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov);
> }
> 
> +static int nbd_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
> +   int nb_sectors, BdrvRequestFlags orig_flags)
> +{
> +int flags = orig_flags;
> +int ret;
> +
> +ret = nbd_client_co_write_zeroes(bs, sector_num, nb_sectors, &flags);
> +if (ret < 0) {
> +return ret;
> +}
> +
> +/* The flag wasn't sent to the server, so we need to emulate it with an
> + * explicit flush */
> +if (flags & BDRV_REQ_FUA) {
> +ret = nbd_client_co_flush(bs);
> +}
> +
> +return ret;
> +}
> +
> static int nbd_co_writev_flags(BlockDriverState *bs, int64_t sector_num,
>int nb_sectors, QEMUIOVector *qiov, int flags)
> {
> @@ -388,6 +408,7 @@ static int nbd_co_flush(BlockDriverState *bs)
> static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
> {
> bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS;
> +bs->bl.max_write_zeroes = UINT32_MAX >> BDRV_SECTOR_BITS;
> bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS;
> }
> 
> @@ -476,6 +497,7 @@ static BlockDriver bdrv_nbd = {
> .bdrv_parse_filename= nbd_parse_filename,
> .bdrv_file_open = nbd_open,
> .bdrv_co_readv  = nbd_co_readv,
> +.bdrv_co_write_zeroes   = nbd_co_write_zeroes,
> .bdrv_co_writev = nbd_co_writev,
> .bdrv_co_writev_flags   = nbd_co_writev_flags,
> .supported_write_flags  = BDRV_REQ_FUA,
> @@ -496,6 +518,7 @@ static BlockDriver bdrv_nbd_tcp = {
> .bdrv_parse_filename= nbd_parse_filename,
> .bdrv_file_open = nbd_open,
> .bdrv_co_readv  = nbd_co_re

Re: [Qemu-block] [Qemu-devel] [PATCH v3 43/44] nbd: Implement NBD_OPT_BLOCK_SIZE on server

2016-04-25 Thread Alex Bligh
Eric,

See my message on nbd-general today re the necessity (or not)
of getting NBD_OPT_BLOCK_SIZE first; it may be just that you
can assume 512 is OK.

Otherwise

Reviewed-by: Alex Bligh 

Alex

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> The upstream NBD Protocol has defined a new extension to allow
> the server to advertise block sizes to the client, as well as
> a way for the client to inform the server that it intends to
> obey block sizes.
> 
> Thanks to a recent fix, our minimum transfer size is always
> 1 (the block layer takes care of read-modify-write on our
> behalf), although if we wanted down the road, we could
> advertise a minimum of 512 based on our usage patterns to a
> client that is willing to honor block sizes.  Meanwhile,
> advertising our absolute maximum transfer size of 32M will
> help newer clients avoid EINVAL failures.
> 
> We do not reject clients for using the older NBD_OPT_EXPORT_NAME;
> we are no worse off for those clients than we used to be. But
> for clients new enough to use NBD_OPT_GO, we require the client
> to first send us NBD_OPT_BLOCK_SIZE to prove they know about
> sizing restraints, by failing with NBD_REP_ERR_BLOCK_SIZE_REQD.
> All existing released qemu clients (whether old-style or new, at
> least by the end of this series) already honor our limits, and
> will still connect; so at most, this would reject a new non-qemu
> client that doesn't intend to obey limits (and that client could
> still use NBD_OPT_EXPORT_NAME to bypass our rejection).
> 
> Signed-off-by: Eric Blake 
> ---
> include/block/nbd.h |  2 ++
> nbd/nbd-internal.h  |  1 +
> nbd/server.c| 62 +
> 3 files changed, 65 insertions(+)
> 
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index 1072d9e..a5c68df 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -96,11 +96,13 @@ typedef struct nbd_reply nbd_reply;
> #define NBD_REP_ERR_TLS_REQDNBD_REP_ERR(5)  /* TLS required */
> #define NBD_REP_ERR_UNKNOWN NBD_REP_ERR(6)  /* Export unknown */
> #define NBD_REP_ERR_SHUTDOWNNBD_REP_ERR(7)  /* Server shutting down */
> +#define NBD_REP_ERR_BLOCK_SIZE_REQD NBD_REP_ERR(8) /* Missing OPT_BLOCK_SIZE 
> */
> 
> /* Info types, used during NBD_REP_INFO */
> #define NBD_INFO_EXPORT 0
> #define NBD_INFO_NAME   1
> #define NBD_INFO_DESCRIPTION2
> +#define NBD_INFO_BLOCK_SIZE 3
> 
> /* Request flags, sent from client to server during transmission phase */
> #define NBD_CMD_FLAG_FUA(1 << 0) /* 'force unit access' during write 
> */
> diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
> index 1935102..1354182 100644
> --- a/nbd/nbd-internal.h
> +++ b/nbd/nbd-internal.h
> @@ -85,6 +85,7 @@
> #define NBD_OPT_STARTTLS(5)
> #define NBD_OPT_INFO(6)
> #define NBD_OPT_GO  (7)
> +#define NBD_OPT_BLOCK_SIZE  (9)
> 
> /* NBD errors are based on errno numbers, so there is a 1:1 mapping,
>  * but only a limited set of errno values is specified in the protocol.
> diff --git a/nbd/server.c b/nbd/server.c
> index 563afb2..86d1e2d 100644
> --- a/nbd/server.c
> +++ b/nbd/server.c
> @@ -83,6 +83,7 @@ struct NBDClient {
> void (*close)(NBDClient *client);
> 
> bool no_zeroes;
> +bool block_size;
> NBDExport *exp;
> QCryptoTLSCreds *tlscreds;
> char *tlsaclname;
> @@ -346,6 +347,7 @@ static int nbd_negotiate_handle_info(NBDClient *client, 
> uint32_t length,
> uint16_t type;
> uint64_t size;
> uint16_t flags;
> +uint32_t block;
> 
> /* Client sends:
> [20 ..  xx]   export name (length bytes)
> @@ -391,6 +393,57 @@ static int nbd_negotiate_handle_info(NBDClient *client, 
> uint32_t length,
> }
> 
> rc = nbd_negotiate_send_rep_len(client->ioc, NBD_REP_INFO, opt,
> +sizeof(type) + 3 * sizeof(block));
> +if (rc < 0) {
> +return rc;
> +}
> +
> +type = cpu_to_be16(NBD_INFO_BLOCK_SIZE);
> +if (nbd_negotiate_write(client->ioc, &type, sizeof(type)) !=
> +sizeof(type)) {
> +LOG("write failed");
> +return -EIO;
> +}
> +/* minimum - Always 1, because we use blk_pread().
> + * TODO: Advertise 512 if guest used NBD_OPT_BLOCK_SIZE? */
> +block = cpu_to_be32(1);
> +if (nbd_negotiate_write(client->ioc, &block, sizeof(block)) !=
> +sizeof(block)) {
> +LOG("write failed");
> +return -EIO;
> +}
> +/* preferred - At least 4096, but larger as appropriate. */
> +block = blk_get_opt_transfer_length(exp->blk) * BDRV_SECTOR_SIZE;
> +block = cpu_to_be32(MAX(4096, block));
> +if (nbd_negotiate_write(client->ioc, &block, sizeof(block)) !=
> +sizeof(block)) {
> +LOG("write failed");
> +return -EIO;
> +}
> +/* maximum - At most 32M, but smaller as appropriate. */
> +block = blk_get_max_transfer_length(exp->blk);
> +if (block && block < NBD_MAX_BUFFER_SIZE / B

Re: [Qemu-block] [Qemu-devel] [PATCH v3 44/44] nbd: Implement NBD_OPT_BLOCK_SIZE on client

2016-04-25 Thread Alex Bligh

On 23 Apr 2016, at 00:40, Eric Blake  wrote:

> The upstream NBD Protocol has defined a new extension to allow
> the server to advertise block sizes to the client, as well as
> a way for the client to inform the server that it intends to
> obey block sizes.
> 
> Pass any received sizes on to the block layer.
> 
> Use the minimum block size as the sector size we pass to the
> kernel - which also has the nice effect of cooperating with
> (non-qemu) servers that don't do read-modify-write when exposing
> a block device with 4k sectors; it can also allow us to visit a
> file larger than 2T on a 32-bit kernel.
> 
> Signed-off-by: Eric Blake 
> ---
> include/block/nbd.h |  3 +++
> block/nbd-client.c  |  3 +++
> block/nbd.c | 17 +---
> nbd/client.c| 74 -
> 4 files changed, 87 insertions(+), 10 deletions(-)
> 
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index a5c68df..27a6854 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -133,6 +133,9 @@ enum {
> struct NbdExportInfo {
> uint64_t size;
> uint16_t flags;
> +uint32_t min_block;
> +uint32_t opt_block;
> +uint32_t max_block;
> };
> typedef struct NbdExportInfo NbdExportInfo;
> 
> diff --git a/block/nbd-client.c b/block/nbd-client.c
> index 2b6ac27..602a8ab 100644
> --- a/block/nbd-client.c
> +++ b/block/nbd-client.c
> @@ -443,6 +443,9 @@ int nbd_client_init(BlockDriverState *bs,
> logout("Failed to negotiate with the NBD server\n");
> return ret;
> }
> +if (client->info.min_block > bs->request_alignment) {
> +bs->request_alignment = client->info.min_block;
> +}
> 
> qemu_co_mutex_init(&client->send_mutex);
> qemu_co_mutex_init(&client->free_sema);
> diff --git a/block/nbd.c b/block/nbd.c
> index 5172039..bb7df55 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -407,9 +407,20 @@ static int nbd_co_flush(BlockDriverState *bs)
> 
> static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
> {
> -bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS;
> -bs->bl.max_write_zeroes = UINT32_MAX >> BDRV_SECTOR_BITS;
> -bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS;
> +NbdClientSession *s = nbd_get_client_session(bs);
> +int max = UINT32_MAX >> BDRV_SECTOR_BITS;
> +
> +if (s->info.max_block) {
> +max = s->info.max_block >> BDRV_SECTOR_BITS;
> +}
> +bs->bl.max_discard = max;
> +bs->bl.max_write_zeroes = max;
> +bs->bl.max_transfer_length = max;
> +
> +if (s->info.opt_block &&
> +s->info.opt_block >> BDRV_SECTOR_BITS > bs->bl.opt_transfer_length) {
> +bs->bl.opt_transfer_length = s->info.opt_block >> BDRV_SECTOR_BITS;
> +}
> }
> 
> static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
> diff --git a/nbd/client.c b/nbd/client.c
> index dac4f29..24f6b0b 100644
> --- a/nbd/client.c
> +++ b/nbd/client.c
> @@ -232,6 +232,11 @@ static int nbd_handle_reply_err(QIOChannel *ioc, 
> nbd_opt_reply *reply,
>reply->option);
> break;
> 
> +case NBD_REP_ERR_BLOCK_SIZE_REQD:
> +error_setg(errp, "Server wants OPT_BLOCK_SIZE before option %" 
> PRIx32,
> +   reply->option);
> +break;
> +
> default:
> error_setg(errp, "Unknown error code when asking for option %" PRIx32,
>reply->option);
> @@ -333,6 +338,21 @@ static int nbd_opt_go(QIOChannel *ioc, const char 
> *wantname,
>  * flags still 0 is a witness of a broken server. */
> info->flags = 0;
> 
> +/* Some servers use NBD_OPT_GO to advertise non-default block
> + * sizes, and require that we first use NBD_OPT_BLOCK_SIZE to
> + * agree to that. */
> +TRACE("Attempting NBD_OPT_BLOCK_SIZE");
> +if (nbd_send_option_request(ioc, NBD_OPT_BLOCK_SIZE, 0, NULL, errp) < 0) 
> {
> +return -1;
> +}
> +if (nbd_receive_option_reply(ioc, NBD_OPT_BLOCK_SIZE, &reply, errp) < 0) 
> {
> +return -1;
> +}
> +error = nbd_handle_reply_err(ioc, &reply, errp);
> +if (error < 0) {
> +return error;
> +}
> +
> TRACE("Attempting NBD_OPT_GO for export '%s'", wantname);
> if (nbd_send_option_request(ioc, NBD_OPT_GO, -1, wantname, errp) < 0) {
> return -1;
> @@ -402,6 +422,45 @@ static int nbd_opt_go(QIOChannel *ioc, const char 
> *wantname,
>   info->size, info->flags);
> break;
> 
> +case NBD_INFO_BLOCK_SIZE:
> +if (len != sizeof(info->min_block) * 3) {
> +error_setg(errp, "remaining export info len %" PRIu32
> +   " is unexpected size", len);
> +return -1;
> +}
> +if (read_sync(ioc, &info->min_block, sizeof(info->min_block)) !=
> +sizeof(info->min_block)) {
> +error_setg(errp, "failed to read info minimum block size");
> +return -1;
> +

Re: [Qemu-block] [Qemu-devel] [PATCH v3 44/44] nbd: Implement NBD_OPT_BLOCK_SIZE on client

2016-04-25 Thread Eric Blake
On 04/25/2016 06:19 AM, Alex Bligh wrote:
> 
> On 23 Apr 2016, at 00:40, Eric Blake  wrote:
> 
>> The upstream NBD Protocol has defined a new extension to allow
>> the server to advertise block sizes to the client, as well as
>> a way for the client to inform the server that it intends to
>> obey block sizes.
>>
>> Pass any received sizes on to the block layer.
>>
>> Use the minimum block size as the sector size we pass to the
>> kernel - which also has the nice effect of cooperating with
>> (non-qemu) servers that don't do read-modify-write when exposing
>> a block device with 4k sectors; it can also allow us to visit a
>> file larger than 2T on a 32-bit kernel.
>>

>> +be32_to_cpus(&info->max_block);
>> +TRACE("Block sizes are 0x%" PRIx32 ", 0x%" PRIx32 ", 0x%" 
>> PRIx32,
>> +  info->min_block, info->opt_block, info->max_block);
>> +break;
>> +
> 
> You should probably check min_block <= opt_block <= max_block here

opt_block > max_block is possible if max_block is clamped to export size
(in the degenerate case where you have a small export that is too small
for the granularity of a hole or efficient I/O).  But yes, some sanity
checks that the server isn't horribly broken might be worthwhile.

> 
> Also should there be a check that BDRV_SECTOR_SIZE >= min_block?

No, because we take the server's min_block and feed it into
bs->request_align (which forces the block layer to comply with a minimum
alignment larger than 512, using code already tested on physical block
drives with 4k sectors), see the hunk in nbd-client.c.

>> int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info)
>> {
>> -unsigned long sectors = info->size / BDRV_SECTOR_SIZE;
>> -if (info->size / BDRV_SECTOR_SIZE != sectors) {
>> +unsigned long sector_size = MAX(BDRV_SECTOR_SIZE, info->min_block);
>> +unsigned long sectors = info->size / sector_size;
>> +if (info->size / sector_size != sectors) {
>> LOG("Export size %" PRId64 " too large for 32-bit kernel", 
>> info->size);
>> return -E2BIG;
>> }
>> @@ -724,18 +784,18 @@ int nbd_init(int fd, QIOChannelSocket *sioc, 
>> NbdExportInfo *info)
>> return -serrno;
>> }
>>
>> -TRACE("Setting block size to %lu", (unsigned long)BDRV_SECTOR_SIZE);
>> +TRACE("Setting block size to %lu", sector_size);
>>
>> -if (ioctl(fd, NBD_SET_BLKSIZE, (unsigned long)BDRV_SECTOR_SIZE) < 0) {
>> +if (ioctl(fd, NBD_SET_BLKSIZE, sector_size) < 0) {

We also feed the maximum of 512 or the advertised minimum block size
into the kernel when using ioctl() for the kernel to take over
transmission phase; although I'm not certain whether the kernel obeys
NBD_SET_BLKSIZE as a hint rather than a hard rule - but if that needs
patching, it needs patching in the kernel implementation, not in qemu.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH v3 36/44] nbd: Improve handling of shutdown requests

2016-04-25 Thread Eric Blake
On 04/25/2016 03:47 AM, Alex Bligh wrote:
> 
> On 23 Apr 2016, at 00:40, Eric Blake  wrote:
> 
>> NBD commit 6d34500b clarified how clients and servers are supposed
>> to behave before closing a connection. It added NBD_REP_ERR_SHUTDOWN
>> (for the server to announce it is about to go away during option
>> haggling, so the client should quit sending NBD_OPT_* other than
>> NBD_OPT_ABORT) and ESHUTDOWN (for the server to announce it is about
>> to go away during transmission, so the client should quit sending
>> NBD_CMD_* other than NBD_CMD_DISC).  It also clarified that
>> NBD_OPT_ABORT gets a reply, while NBD_CMD_DISC does not.
>>
>> This patch merely adds the missing reply to NBD_OPT_ABORT and teaches
>> the client to recognize server errors.  Actually teaching the server
>> to send NBD_REP_ERR_SHUTDOWN or ESHUTDOWN would require knowing that
>> the server has been requested to shut down soon (maybe we could do
>> that by installing a SIGINT handler in qemu-nbd, which transitions
>> from RUNNING to a new state that waits for the client to react,
>> rather than just out-right quitting).
>>
>> Signed-off-by: Eric Blake 
>> ---

>> @@ -484,6 +486,10 @@ static int nbd_negotiate_options(NBDClient *client)
>> if (ret < 0) {
>> return ret;
>> }
>> +/* Let the client keep trying, unless they asked to quit */
>> +if (clientflags == NBD_OPT_ABORT) {
> 
> OK that's totally confusing. clientflags is not the client flags. clientflags
> is the NBD option ID, which happens to be the two bytes after the NBD OPT 
> magic,
> which is the client flags if we were doing oldstyle negotiation, not newstyle
> negotiation.

Yes, 'clientflags' is a poor name; I should rename it in a separate
patch.   It is the option-negotiation command type.

> 
> Except:
> 
>> +return -EINVAL;
>> +}
>> break;
>> }
>> } else if (fixedNewstyle) {
> 
> So the above is for NewStyle (not fixedNewStyle)?

The above is for fixedNewStyle when TLS is not negotiated yet; the 'else
if' is for fixedNewStyle after TLS has been negotiated.  Prior to TLS,
we have to special-case NBD_OPT_ABORT and actually quit.

> 
> In which case more than one option isn't even supported, so what's the
> stuff purporting to handle TLS doing there?
> 
> Confused ...

Sounds like a cleanup patch as a prerequisite on my next respin would
help, then.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH v3 36/44] nbd: Improve handling of shutdown requests

2016-04-25 Thread Alex Bligh

On 25 Apr 2016, at 20:20, Eric Blake  wrote:

>>>} else if (fixedNewstyle) {
>> 
>> So the above is for NewStyle (not fixedNewStyle)?
> 
> The above is for fixedNewStyle when TLS is not negotiated yet; the 'else
> if' is for fixedNewStyle after TLS has been negotiated.  Prior to TLS,
> we have to special-case NBD_OPT_ABORT and actually quit.

OK. fixedNewStyle is defined as a prerequisite for TLS. I'm hoping
nothing in Qemuland ever did non-fixed NewStyle, and assuming that's
the case I would not even support it (certainly it won't play
nicely with all the other stuff you've been doing).

--
Alex Bligh






signature.asc
Description: Message signed with OpenPGP using GPGMail


Re: [Qemu-block] [Qemu-devel] [PATCH v3 36/44] nbd: Improve handling of shutdown requests

2016-04-25 Thread Eric Blake
On 04/25/2016 01:40 PM, Alex Bligh wrote:
> 
> On 25 Apr 2016, at 20:20, Eric Blake  wrote:
> 
} else if (fixedNewstyle) {
>>>
>>> So the above is for NewStyle (not fixedNewStyle)?
>>
>> The above is for fixedNewStyle when TLS is not negotiated yet; the 'else
>> if' is for fixedNewStyle after TLS has been negotiated.  Prior to TLS,
>> we have to special-case NBD_OPT_ABORT and actually quit.
> 
> OK. fixedNewStyle is defined as a prerequisite for TLS. I'm hoping
> nothing in Qemuland ever did non-fixed NewStyle, and assuming that's
> the case I would not even support it (certainly it won't play
> nicely with all the other stuff you've been doing).

Well, there were some last-minute patches that went into the 2.6
candidates that fixed qemu to actually be a fixedNewStyle server
(without commit 156f6a10, for example, qemu had the very bug of
disconnecting on unknown client options that fixedNewStyle was supposed
to prevent). Fortunately, qemu 2.5 is oldstyle only, and qemu 2.6 is the
first newstyle server, and I think I got the worst of the
interoperability bugs nailed in 2.6, whereas this series is focusing on
the feature enhancements for inclusion in 2.7.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH v3 16/44] atapi: Switch to byte-based block access

2016-04-25 Thread John Snow


On 04/22/2016 07:40 PM, Eric Blake wrote:
> Sector-based blk_read() should die; switch to byte-based
> blk_pread() instead.
> 
> Signed-off-by: Eric Blake 
> ---
>  hw/ide/atapi.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
> index 2bb606c..81000d8 100644
> --- a/hw/ide/atapi.c
> +++ b/hw/ide/atapi.c
> @@ -119,12 +119,12 @@ cd_read_sector_sync(IDEState *s)
> 
>  switch (s->cd_sector_size) {
>  case 2048:
> -ret = blk_read(s->blk, (int64_t)s->lba << 2,
> -   s->io_buffer, 4);
> +ret = blk_pread(s->blk, (int64_t)s->lba << (2 + BDRV_SECTOR_BITS),
> +s->io_buffer, 4 << BDRV_SECTOR_BITS);
>  break;
>  case 2352:
> -ret = blk_read(s->blk, (int64_t)s->lba << 2,
> -   s->io_buffer + 16, 4);
> +ret = blk_pread(s->blk, (int64_t)s->lba << (2 + BDRV_SECTOR_BITS),

Uh, hm. So what we have is a cdrom-sector-based LBA, that we need to
transform into IDE-based sectors, then to bytes.

We could just define an ATAPI_SECTOR_BITS to be (2 + BDRV_SECTOR_BITS).

Then, the lba conversion would be just:

s->lba << ATAPI_SECTOR_BITS

and the size would be just:

1 << ATAPI_SECTOR_BITS

And that's probably better on the eyes.

> +s->io_buffer + 16, 4 << BDRV_SECTOR_BITS);
>  if (ret >= 0) {
>  cd_data_to_raw(s->io_buffer, s->lba);
>  }
> 

The code already uses lots of different stuff like
4 * BDRV_SECTOR_SIZE
or
"2048"

so we probably need some staple definition here.


...Otherwise, none of this is a problem you've created, just one this
patch highlights. Fix at your own peril.

Acked-by: John Snow 



Re: [Qemu-block] [Qemu-devel] [PATCH for-2.7 v2 04/17] block: Introduce image file locking

2016-04-25 Thread Laszlo Ersek
On 04/15/16 05:27, Fam Zheng wrote:
> Block drivers can implement this new operation .bdrv_lockf to actually lock 
> the
> image in the protocol specific way.
> 
> Signed-off-by: Fam Zheng 
> ---
>  block.c   | 42 ++
>  include/block/block_int.h | 12 
>  2 files changed, 54 insertions(+)
> 
> diff --git a/block.c b/block.c
> index 1c575e4..7971a25 100644
> --- a/block.c
> +++ b/block.c
> @@ -846,6 +846,34 @@ out:
>  g_free(gen_node_name);
>  }
>  
> +static int bdrv_lock_unlock_image_do(BlockDriverState *bs, bool lock_image)
> +{
> +int cmd = BDRV_LOCKF_UNLOCK;
> +
> +if (bs->image_locked == lock_image) {
> +return 0;
> +} else if (!bs->drv) {
> +return -ENOMEDIUM;
> +} else if (!bs->drv->bdrv_lockf) {
> +return 0;
> +}
> +if (lock_image) {
> +cmd = bs->open_flags & BDRV_O_RDWR ? BDRV_LOCKF_RWLOCK :
> + BDRV_LOCKF_ROLOCK;
> +}
> +return bs->drv->bdrv_lockf(bs, cmd);
> +}
> +
> +static int bdrv_lock_image(BlockDriverState *bs)
> +{
> +return bdrv_lock_unlock_image_do(bs, true);
> +}
> +
> +static int bdrv_unlock_image(BlockDriverState *bs)
> +{
> +return bdrv_lock_unlock_image_do(bs, false);
> +}
> +
>  static QemuOptsList bdrv_runtime_opts = {
>  .name = "bdrv_common",
>  .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
> @@ -995,6 +1023,14 @@ static int bdrv_open_common(BlockDriverState *bs, 
> BdrvChild *file,
>  goto free_and_fail;
>  }
>  
> +if (!(open_flags & (BDRV_O_NO_LOCK | BDRV_O_INACTIVE))) {
> +ret = bdrv_lock_image(bs);
> +if (ret) {
> +error_setg(errp, "Failed to lock image");
> +goto free_and_fail;
> +}
> +}
> +
>  ret = refresh_total_sectors(bs, bs->total_sectors);
>  if (ret < 0) {
>  error_setg_errno(errp, -ret, "Could not refresh total sector count");
> @@ -2144,6 +2180,7 @@ static void bdrv_close(BlockDriverState *bs)
>  if (bs->drv) {
>  BdrvChild *child, *next;
>  
> +bdrv_unlock_image(bs);
>  bs->drv->bdrv_close(bs);
>  bs->drv = NULL;
>  
> @@ -3230,6 +3267,9 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error 
> **errp)
>  error_setg_errno(errp, -ret, "Could not refresh total sector count");
>  return;
>  }
> +if (!(bs->open_flags & BDRV_O_NO_LOCK)) {
> +bdrv_lock_image(bs);
> +}
>  }
>  
>  void bdrv_invalidate_cache_all(Error **errp)
> @@ -3262,6 +3302,7 @@ static int bdrv_inactivate(BlockDriverState *bs)
>  }
>  
>  bs->open_flags |= BDRV_O_INACTIVE;
> +ret = bdrv_unlock_image(bs);
>  return 0;
>  }
>  
> @@ -3981,3 +4022,4 @@ void bdrv_refresh_filename(BlockDriverState *bs)
>  QDECREF(json);
>  }
>  }
> +
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 10d8759..ffa30b0 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -85,6 +85,12 @@ typedef struct BdrvTrackedRequest {
>  struct BdrvTrackedRequest *waiting_for;
>  } BdrvTrackedRequest;
>  
> +typedef enum {
> +BDRV_LOCKF_RWLOCK,
> +BDRV_LOCKF_ROLOCK,
> +BDRV_LOCKF_UNLOCK,
> +} BdrvLockfCmd;
> +
>  struct BlockDriver {
>  const char *format_name;
>  int instance_size;
> @@ -317,6 +323,11 @@ struct BlockDriver {
>   */
>  void (*bdrv_drain)(BlockDriverState *bs);
>  
> +/**
> + * Lock/unlock the image.
> + */
> +int (*bdrv_lockf)(BlockDriverState *bs, BdrvLockfCmd cmd);
> +
>  QLIST_ENTRY(BlockDriver) list;
>  };
>  
> @@ -485,6 +496,7 @@ struct BlockDriverState {
>  NotifierWithReturn write_threshold_notifier;
>  
>  int quiesce_counter;
> +bool image_locked;
>  };
>  
>  struct BlockBackendRootState {
> 

I'd like to raise one point which I think may not have been, yet (after
briefly skimming the v1 / v2 comments). Sorry if this has been discussed
already.

IIUC, the idea is that "protocols" (in the block layer sense) implement
the lockf method, and then bdrv_open_common() automatically locks image
files, if the lockf method is available, and if various settings
(cmdline options etc) don't request otherwise.

I tried to see if this series modifies -- for example --
raw_reopen_commit() and raw_reopen_abort(), in "block/raw-posix.c". Or,
if it modifies bdrv_reopen_multiple(), in "block.c". It doesn't seem to.

Those functions are relevant for the following reason. Given the
following chain of references:

  file descriptor --> file description --> file

an fcntl() lock is associated with the file. However, the fcntl() lock
held by the process on the file is dropped if the process closes *any*
file descriptor that points (through the same or another file
description) to the file. From
:

All locks associated with a file for a given process shall be
  

Re: [Qemu-block] [Qemu-devel] [PATCH for-2.7 v2 04/17] block: Introduce image file locking

2016-04-25 Thread Fam Zheng
On Tue, 04/26 01:55, Laszlo Ersek wrote:
> On 04/15/16 05:27, Fam Zheng wrote:
> > Block drivers can implement this new operation .bdrv_lockf to actually lock 
> > the
> > image in the protocol specific way.
> > 
> > Signed-off-by: Fam Zheng 
> > ---
> >  block.c   | 42 ++
> >  include/block/block_int.h | 12 
> >  2 files changed, 54 insertions(+)
> > 
> > diff --git a/block.c b/block.c
> > index 1c575e4..7971a25 100644
> > --- a/block.c
> > +++ b/block.c
> > @@ -846,6 +846,34 @@ out:
> >  g_free(gen_node_name);
> >  }
> >  
> > +static int bdrv_lock_unlock_image_do(BlockDriverState *bs, bool lock_image)
> > +{
> > +int cmd = BDRV_LOCKF_UNLOCK;
> > +
> > +if (bs->image_locked == lock_image) {
> > +return 0;
> > +} else if (!bs->drv) {
> > +return -ENOMEDIUM;
> > +} else if (!bs->drv->bdrv_lockf) {
> > +return 0;
> > +}
> > +if (lock_image) {
> > +cmd = bs->open_flags & BDRV_O_RDWR ? BDRV_LOCKF_RWLOCK :
> > + BDRV_LOCKF_ROLOCK;
> > +}
> > +return bs->drv->bdrv_lockf(bs, cmd);
> > +}
> > +
> > +static int bdrv_lock_image(BlockDriverState *bs)
> > +{
> > +return bdrv_lock_unlock_image_do(bs, true);
> > +}
> > +
> > +static int bdrv_unlock_image(BlockDriverState *bs)
> > +{
> > +return bdrv_lock_unlock_image_do(bs, false);
> > +}
> > +
> >  static QemuOptsList bdrv_runtime_opts = {
> >  .name = "bdrv_common",
> >  .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
> > @@ -995,6 +1023,14 @@ static int bdrv_open_common(BlockDriverState *bs, 
> > BdrvChild *file,
> >  goto free_and_fail;
> >  }
> >  
> > +if (!(open_flags & (BDRV_O_NO_LOCK | BDRV_O_INACTIVE))) {
> > +ret = bdrv_lock_image(bs);
> > +if (ret) {
> > +error_setg(errp, "Failed to lock image");
> > +goto free_and_fail;
> > +}
> > +}
> > +
> >  ret = refresh_total_sectors(bs, bs->total_sectors);
> >  if (ret < 0) {
> >  error_setg_errno(errp, -ret, "Could not refresh total sector 
> > count");
> > @@ -2144,6 +2180,7 @@ static void bdrv_close(BlockDriverState *bs)
> >  if (bs->drv) {
> >  BdrvChild *child, *next;
> >  
> > +bdrv_unlock_image(bs);
> >  bs->drv->bdrv_close(bs);
> >  bs->drv = NULL;
> >  
> > @@ -3230,6 +3267,9 @@ void bdrv_invalidate_cache(BlockDriverState *bs, 
> > Error **errp)
> >  error_setg_errno(errp, -ret, "Could not refresh total sector 
> > count");
> >  return;
> >  }
> > +if (!(bs->open_flags & BDRV_O_NO_LOCK)) {
> > +bdrv_lock_image(bs);
> > +}
> >  }
> >  
> >  void bdrv_invalidate_cache_all(Error **errp)
> > @@ -3262,6 +3302,7 @@ static int bdrv_inactivate(BlockDriverState *bs)
> >  }
> >  
> >  bs->open_flags |= BDRV_O_INACTIVE;
> > +ret = bdrv_unlock_image(bs);
> >  return 0;
> >  }
> >  
> > @@ -3981,3 +4022,4 @@ void bdrv_refresh_filename(BlockDriverState *bs)
> >  QDECREF(json);
> >  }
> >  }
> > +
> > diff --git a/include/block/block_int.h b/include/block/block_int.h
> > index 10d8759..ffa30b0 100644
> > --- a/include/block/block_int.h
> > +++ b/include/block/block_int.h
> > @@ -85,6 +85,12 @@ typedef struct BdrvTrackedRequest {
> >  struct BdrvTrackedRequest *waiting_for;
> >  } BdrvTrackedRequest;
> >  
> > +typedef enum {
> > +BDRV_LOCKF_RWLOCK,
> > +BDRV_LOCKF_ROLOCK,
> > +BDRV_LOCKF_UNLOCK,
> > +} BdrvLockfCmd;
> > +
> >  struct BlockDriver {
> >  const char *format_name;
> >  int instance_size;
> > @@ -317,6 +323,11 @@ struct BlockDriver {
> >   */
> >  void (*bdrv_drain)(BlockDriverState *bs);
> >  
> > +/**
> > + * Lock/unlock the image.
> > + */
> > +int (*bdrv_lockf)(BlockDriverState *bs, BdrvLockfCmd cmd);
> > +
> >  QLIST_ENTRY(BlockDriver) list;
> >  };
> >  
> > @@ -485,6 +496,7 @@ struct BlockDriverState {
> >  NotifierWithReturn write_threshold_notifier;
> >  
> >  int quiesce_counter;
> > +bool image_locked;
> >  };
> >  
> >  struct BlockBackendRootState {
> > 
> 
> I'd like to raise one point which I think may not have been, yet (after
> briefly skimming the v1 / v2 comments). Sorry if this has been discussed
> already.
> 
> IIUC, the idea is that "protocols" (in the block layer sense) implement
> the lockf method, and then bdrv_open_common() automatically locks image
> files, if the lockf method is available, and if various settings
> (cmdline options etc) don't request otherwise.
> 
> I tried to see if this series modifies -- for example --
> raw_reopen_commit() and raw_reopen_abort(), in "block/raw-posix.c". Or,
> if it modifies bdrv_reopen_multiple(), in "block.c". It doesn't seem to.
> 
> Those functions are relevant for the following reason. Given the
> following chain of references:
> 
>   file descriptor --> file description --> file
> 
> an fcntl() lo