On 23 Apr 2016, at 00:40, Eric Blake wrote:
> The upstream NBD Protocol has defined a new extension to allow
> the server to advertise block sizes to the client, as well as
> a way for the client to inform the server that it intends to
> obey block sizes.
>
> Pass any received sizes on to the block layer.
>
> Use the minimum block size as the sector size we pass to the
> kernel - which also has the nice effect of cooperating with
> (non-qemu) servers that don't do read-modify-write when exposing
> a block device with 4k sectors; it can also allow us to visit a
> file larger than 2T on a 32-bit kernel.
>
> Signed-off-by: Eric Blake
> ---
> include/block/nbd.h | 3 +++
> block/nbd-client.c | 3 +++
> block/nbd.c | 17 +---
> nbd/client.c| 74 -
> 4 files changed, 87 insertions(+), 10 deletions(-)
>
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index a5c68df..27a6854 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -133,6 +133,9 @@ enum {
> struct NbdExportInfo {
> uint64_t size;
> uint16_t flags;
> +uint32_t min_block;
> +uint32_t opt_block;
> +uint32_t max_block;
> };
> typedef struct NbdExportInfo NbdExportInfo;
>
> diff --git a/block/nbd-client.c b/block/nbd-client.c
> index 2b6ac27..602a8ab 100644
> --- a/block/nbd-client.c
> +++ b/block/nbd-client.c
> @@ -443,6 +443,9 @@ int nbd_client_init(BlockDriverState *bs,
> logout("Failed to negotiate with the NBD server\n");
> return ret;
> }
> +if (client->info.min_block > bs->request_alignment) {
> +bs->request_alignment = client->info.min_block;
> +}
>
> qemu_co_mutex_init(>send_mutex);
> qemu_co_mutex_init(>free_sema);
> diff --git a/block/nbd.c b/block/nbd.c
> index 5172039..bb7df55 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -407,9 +407,20 @@ static int nbd_co_flush(BlockDriverState *bs)
>
> static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
> {
> -bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS;
> -bs->bl.max_write_zeroes = UINT32_MAX >> BDRV_SECTOR_BITS;
> -bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS;
> +NbdClientSession *s = nbd_get_client_session(bs);
> +int max = UINT32_MAX >> BDRV_SECTOR_BITS;
> +
> +if (s->info.max_block) {
> +max = s->info.max_block >> BDRV_SECTOR_BITS;
> +}
> +bs->bl.max_discard = max;
> +bs->bl.max_write_zeroes = max;
> +bs->bl.max_transfer_length = max;
> +
> +if (s->info.opt_block &&
> +s->info.opt_block >> BDRV_SECTOR_BITS > bs->bl.opt_transfer_length) {
> +bs->bl.opt_transfer_length = s->info.opt_block >> BDRV_SECTOR_BITS;
> +}
> }
>
> static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
> diff --git a/nbd/client.c b/nbd/client.c
> index dac4f29..24f6b0b 100644
> --- a/nbd/client.c
> +++ b/nbd/client.c
> @@ -232,6 +232,11 @@ static int nbd_handle_reply_err(QIOChannel *ioc,
> nbd_opt_reply *reply,
>reply->option);
> break;
>
> +case NBD_REP_ERR_BLOCK_SIZE_REQD:
> +error_setg(errp, "Server wants OPT_BLOCK_SIZE before option %"
> PRIx32,
> + reply->option);
> +break;
> +
> default:
> error_setg(errp, "Unknown error code when asking for option %" PRIx32,
>reply->option);
> @@ -333,6 +338,21 @@ static int nbd_opt_go(QIOChannel *ioc, const char
> *wantname,
> * flags still 0 is a witness of a broken server. */
> info->flags = 0;
>
> +/* Some servers use NBD_OPT_GO to advertise non-default block
> + * sizes, and require that we first use NBD_OPT_BLOCK_SIZE to
> + * agree to that. */
> +TRACE("Attempting NBD_OPT_BLOCK_SIZE");
> +if (nbd_send_option_request(ioc, NBD_OPT_BLOCK_SIZE, 0, NULL, errp) < 0)
> {
> +return -1;
> +}
> +if (nbd_receive_option_reply(ioc, NBD_OPT_BLOCK_SIZE, , errp) < 0)
> {
> +return -1;
> +}
> +error = nbd_handle_reply_err(ioc, , errp);
> +if (error < 0) {
> +return error;
> +}
> +
> TRACE("Attempting NBD_OPT_GO for export '%s'", wantname);
> if (nbd_send_option_request(ioc, NBD_OPT_GO, -1, wantname, errp) < 0) {
> return -1;
> @@ -402,6 +422,45 @@ static int nbd_opt_go(QIOChannel *ioc, const char
> *wantname,
> info->size, info->flags);
> break;
>
> +case NBD_INFO_BLOCK_SIZE:
> +if (len != sizeof(info->min_block) * 3) {
> +error_setg(errp, "remaining export info len %" PRIu32
> + " is unexpected size", len);
> +return -1;
> +}
> +if (read_sync(ioc, >min_block, sizeof(info->min_block)) !=
> +sizeof(info->min_block)) {
> +error_setg(errp, "failed to read info minimum block size");
> +return