On 23 Apr 2016, at 00:40, Eric Blake <ebl...@redhat.com> wrote:

> The upstream NBD Protocol has defined a new extension to allow
> the server to advertise block sizes to the client, as well as
> a way for the client to inform the server that it intends to
> obey block sizes.
> 
> Pass any received sizes on to the block layer.
> 
> Use the minimum block size as the sector size we pass to the
> kernel - which also has the nice effect of cooperating with
> (non-qemu) servers that don't do read-modify-write when exposing
> a block device with 4k sectors; it can also allow us to visit a
> file larger than 2T on a 32-bit kernel.
> 
> Signed-off-by: Eric Blake <ebl...@redhat.com>
> ---
> include/block/nbd.h |  3 +++
> block/nbd-client.c  |  3 +++
> block/nbd.c         | 17 +++++++++---
> nbd/client.c        | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-----
> 4 files changed, 87 insertions(+), 10 deletions(-)
> 
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index a5c68df..27a6854 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -133,6 +133,9 @@ enum {
> struct NbdExportInfo {
>     uint64_t size;
>     uint16_t flags;
> +    uint32_t min_block;
> +    uint32_t opt_block;
> +    uint32_t max_block;
> };
> typedef struct NbdExportInfo NbdExportInfo;
> 
> diff --git a/block/nbd-client.c b/block/nbd-client.c
> index 2b6ac27..602a8ab 100644
> --- a/block/nbd-client.c
> +++ b/block/nbd-client.c
> @@ -443,6 +443,9 @@ int nbd_client_init(BlockDriverState *bs,
>         logout("Failed to negotiate with the NBD server\n");
>         return ret;
>     }
> +    if (client->info.min_block > bs->request_alignment) {
> +        bs->request_alignment = client->info.min_block;
> +    }
> 
>     qemu_co_mutex_init(&client->send_mutex);
>     qemu_co_mutex_init(&client->free_sema);
> diff --git a/block/nbd.c b/block/nbd.c
> index 5172039..bb7df55 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -407,9 +407,20 @@ static int nbd_co_flush(BlockDriverState *bs)
> 
> static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
> {
> -    bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS;
> -    bs->bl.max_write_zeroes = UINT32_MAX >> BDRV_SECTOR_BITS;
> -    bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS;
> +    NbdClientSession *s = nbd_get_client_session(bs);
> +    int max = UINT32_MAX >> BDRV_SECTOR_BITS;
> +
> +    if (s->info.max_block) {
> +        max = s->info.max_block >> BDRV_SECTOR_BITS;
> +    }
> +    bs->bl.max_discard = max;
> +    bs->bl.max_write_zeroes = max;
> +    bs->bl.max_transfer_length = max;
> +
> +    if (s->info.opt_block &&
> +        s->info.opt_block >> BDRV_SECTOR_BITS > bs->bl.opt_transfer_length) {
> +        bs->bl.opt_transfer_length = s->info.opt_block >> BDRV_SECTOR_BITS;
> +    }
> }
> 
> static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
> diff --git a/nbd/client.c b/nbd/client.c
> index dac4f29..24f6b0b 100644
> --- a/nbd/client.c
> +++ b/nbd/client.c
> @@ -232,6 +232,11 @@ static int nbd_handle_reply_err(QIOChannel *ioc, 
> nbd_opt_reply *reply,
>                    reply->option);
>         break;
> 
> +    case NBD_REP_ERR_BLOCK_SIZE_REQD:
> +        error_setg(errp, "Server wants OPT_BLOCK_SIZE before option %" 
> PRIx32,
> +                   reply->option);
> +        break;
> +
>     default:
>         error_setg(errp, "Unknown error code when asking for option %" PRIx32,
>                    reply->option);
> @@ -333,6 +338,21 @@ static int nbd_opt_go(QIOChannel *ioc, const char 
> *wantname,
>      * flags still 0 is a witness of a broken server. */
>     info->flags = 0;
> 
> +    /* Some servers use NBD_OPT_GO to advertise non-default block
> +     * sizes, and require that we first use NBD_OPT_BLOCK_SIZE to
> +     * agree to that. */
> +    TRACE("Attempting NBD_OPT_BLOCK_SIZE");
> +    if (nbd_send_option_request(ioc, NBD_OPT_BLOCK_SIZE, 0, NULL, errp) < 0) 
> {
> +        return -1;
> +    }
> +    if (nbd_receive_option_reply(ioc, NBD_OPT_BLOCK_SIZE, &reply, errp) < 0) 
> {
> +        return -1;
> +    }
> +    error = nbd_handle_reply_err(ioc, &reply, errp);
> +    if (error < 0) {
> +        return error;
> +    }
> +
>     TRACE("Attempting NBD_OPT_GO for export '%s'", wantname);
>     if (nbd_send_option_request(ioc, NBD_OPT_GO, -1, wantname, errp) < 0) {
>         return -1;
> @@ -402,6 +422,45 @@ static int nbd_opt_go(QIOChannel *ioc, const char 
> *wantname,
>                   info->size, info->flags);
>             break;
> 
> +        case NBD_INFO_BLOCK_SIZE:
> +            if (len != sizeof(info->min_block) * 3) {
> +                error_setg(errp, "remaining export info len %" PRIu32
> +                           " is unexpected size", len);
> +                return -1;
> +            }
> +            if (read_sync(ioc, &info->min_block, sizeof(info->min_block)) !=
> +                sizeof(info->min_block)) {
> +                error_setg(errp, "failed to read info minimum block size");
> +                return -1;
> +            }
> +            be32_to_cpus(&info->min_block);
> +            if (!is_power_of_2(info->min_block)) {
> +                error_setg(errp, "server minimum block size %" PRId32
> +                           "is not a power of two", info->min_block);
> +                return -1;
> +            }
> +            if (read_sync(ioc, &info->opt_block, sizeof(info->opt_block)) !=
> +                sizeof(info->opt_block)) {
> +                error_setg(errp, "failed to read info preferred block size");
> +                return -1;
> +            }
> +            be32_to_cpus(&info->opt_block);
> +            if (!is_power_of_2(info->opt_block) ||
> +                info->opt_block < info->min_block) {
> +                error_setg(errp, "server preferred block size %" PRId32
> +                           "is not valid", info->opt_block);
> +                return -1;
> +            }
> +            if (read_sync(ioc, &info->max_block, sizeof(info->max_block)) !=
> +                sizeof(info->max_block)) {
> +                error_setg(errp, "failed to read info maximum block size");
> +                return -1;
> +            }
> +            be32_to_cpus(&info->max_block);
> +            TRACE("Block sizes are 0x%" PRIx32 ", 0x%" PRIx32 ", 0x%" PRIx32,
> +                  info->min_block, info->opt_block, info->max_block);
> +            break;
> +

You should probably check min_block <= opt_block <= max_block here

Also should there be a check that BDRV_SECTOR_SIZE >= min_block?


>         default:
>             TRACE("ignoring unknown export info %" PRIu16, type);
>             if (drop_sync(ioc, len) != len) {
> @@ -710,8 +769,9 @@ fail:
> #ifdef __linux__
> int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info)
> {
> -    unsigned long sectors = info->size / BDRV_SECTOR_SIZE;
> -    if (info->size / BDRV_SECTOR_SIZE != sectors) {
> +    unsigned long sector_size = MAX(BDRV_SECTOR_SIZE, info->min_block);
> +    unsigned long sectors = info->size / sector_size;
> +    if (info->size / sector_size != sectors) {
>         LOG("Export size %" PRId64 " too large for 32-bit kernel", 
> info->size);
>         return -E2BIG;
>     }
> @@ -724,18 +784,18 @@ int nbd_init(int fd, QIOChannelSocket *sioc, 
> NbdExportInfo *info)
>         return -serrno;
>     }
> 
> -    TRACE("Setting block size to %lu", (unsigned long)BDRV_SECTOR_SIZE);
> +    TRACE("Setting block size to %lu", sector_size);
> 
> -    if (ioctl(fd, NBD_SET_BLKSIZE, (unsigned long)BDRV_SECTOR_SIZE) < 0) {
> +    if (ioctl(fd, NBD_SET_BLKSIZE, sector_size) < 0) {
>         int serrno = errno;
>         LOG("Failed setting NBD block size");
>         return -serrno;
>     }
> 
>     TRACE("Setting size to %lu block(s)", sectors);
> -    if (size % BDRV_SECTOR_SIZE) {
> -        TRACE("Ignoring trailing %d bytes of export",
> -              (int) (size % BDRV_SECTOR_SIZE));
> +    if (info->size % sector_size) {
> +        TRACE("Ignoring trailing %" PRId64 " bytes of export",
> +              info->size % sector_size);
>     }
> 
>     if (ioctl(fd, NBD_SET_SIZE_BLOCKS, sectors) < 0) {
> -- 
> 2.5.5
> 
> 

-- 
Alex Bligh





Reply via email to