On 23 Apr 2016, at 00:40, Eric Blake <ebl...@redhat.com> wrote: > The upstream NBD Protocol has defined a new extension to allow > the server to advertise block sizes to the client, as well as > a way for the client to inform the server that it intends to > obey block sizes. > > Pass any received sizes on to the block layer. > > Use the minimum block size as the sector size we pass to the > kernel - which also has the nice effect of cooperating with > (non-qemu) servers that don't do read-modify-write when exposing > a block device with 4k sectors; it can also allow us to visit a > file larger than 2T on a 32-bit kernel. > > Signed-off-by: Eric Blake <ebl...@redhat.com> > --- > include/block/nbd.h | 3 +++ > block/nbd-client.c | 3 +++ > block/nbd.c | 17 +++++++++--- > nbd/client.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++----- > 4 files changed, 87 insertions(+), 10 deletions(-) > > diff --git a/include/block/nbd.h b/include/block/nbd.h > index a5c68df..27a6854 100644 > --- a/include/block/nbd.h > +++ b/include/block/nbd.h > @@ -133,6 +133,9 @@ enum { > struct NbdExportInfo { > uint64_t size; > uint16_t flags; > + uint32_t min_block; > + uint32_t opt_block; > + uint32_t max_block; > }; > typedef struct NbdExportInfo NbdExportInfo; > > diff --git a/block/nbd-client.c b/block/nbd-client.c > index 2b6ac27..602a8ab 100644 > --- a/block/nbd-client.c > +++ b/block/nbd-client.c > @@ -443,6 +443,9 @@ int nbd_client_init(BlockDriverState *bs, > logout("Failed to negotiate with the NBD server\n"); > return ret; > } > + if (client->info.min_block > bs->request_alignment) { > + bs->request_alignment = client->info.min_block; > + } > > qemu_co_mutex_init(&client->send_mutex); > qemu_co_mutex_init(&client->free_sema); > diff --git a/block/nbd.c b/block/nbd.c > index 5172039..bb7df55 100644 > --- a/block/nbd.c > +++ b/block/nbd.c > @@ -407,9 +407,20 @@ static int nbd_co_flush(BlockDriverState *bs) > > static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) > { > - bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS; > - bs->bl.max_write_zeroes = UINT32_MAX >> BDRV_SECTOR_BITS; > - bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS; > + NbdClientSession *s = nbd_get_client_session(bs); > + int max = UINT32_MAX >> BDRV_SECTOR_BITS; > + > + if (s->info.max_block) { > + max = s->info.max_block >> BDRV_SECTOR_BITS; > + } > + bs->bl.max_discard = max; > + bs->bl.max_write_zeroes = max; > + bs->bl.max_transfer_length = max; > + > + if (s->info.opt_block && > + s->info.opt_block >> BDRV_SECTOR_BITS > bs->bl.opt_transfer_length) { > + bs->bl.opt_transfer_length = s->info.opt_block >> BDRV_SECTOR_BITS; > + } > } > > static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, > diff --git a/nbd/client.c b/nbd/client.c > index dac4f29..24f6b0b 100644 > --- a/nbd/client.c > +++ b/nbd/client.c > @@ -232,6 +232,11 @@ static int nbd_handle_reply_err(QIOChannel *ioc, > nbd_opt_reply *reply, > reply->option); > break; > > + case NBD_REP_ERR_BLOCK_SIZE_REQD: > + error_setg(errp, "Server wants OPT_BLOCK_SIZE before option %" > PRIx32, > + reply->option); > + break; > + > default: > error_setg(errp, "Unknown error code when asking for option %" PRIx32, > reply->option); > @@ -333,6 +338,21 @@ static int nbd_opt_go(QIOChannel *ioc, const char > *wantname, > * flags still 0 is a witness of a broken server. */ > info->flags = 0; > > + /* Some servers use NBD_OPT_GO to advertise non-default block > + * sizes, and require that we first use NBD_OPT_BLOCK_SIZE to > + * agree to that. */ > + TRACE("Attempting NBD_OPT_BLOCK_SIZE"); > + if (nbd_send_option_request(ioc, NBD_OPT_BLOCK_SIZE, 0, NULL, errp) < 0) > { > + return -1; > + } > + if (nbd_receive_option_reply(ioc, NBD_OPT_BLOCK_SIZE, &reply, errp) < 0) > { > + return -1; > + } > + error = nbd_handle_reply_err(ioc, &reply, errp); > + if (error < 0) { > + return error; > + } > + > TRACE("Attempting NBD_OPT_GO for export '%s'", wantname); > if (nbd_send_option_request(ioc, NBD_OPT_GO, -1, wantname, errp) < 0) { > return -1; > @@ -402,6 +422,45 @@ static int nbd_opt_go(QIOChannel *ioc, const char > *wantname, > info->size, info->flags); > break; > > + case NBD_INFO_BLOCK_SIZE: > + if (len != sizeof(info->min_block) * 3) { > + error_setg(errp, "remaining export info len %" PRIu32 > + " is unexpected size", len); > + return -1; > + } > + if (read_sync(ioc, &info->min_block, sizeof(info->min_block)) != > + sizeof(info->min_block)) { > + error_setg(errp, "failed to read info minimum block size"); > + return -1; > + } > + be32_to_cpus(&info->min_block); > + if (!is_power_of_2(info->min_block)) { > + error_setg(errp, "server minimum block size %" PRId32 > + "is not a power of two", info->min_block); > + return -1; > + } > + if (read_sync(ioc, &info->opt_block, sizeof(info->opt_block)) != > + sizeof(info->opt_block)) { > + error_setg(errp, "failed to read info preferred block size"); > + return -1; > + } > + be32_to_cpus(&info->opt_block); > + if (!is_power_of_2(info->opt_block) || > + info->opt_block < info->min_block) { > + error_setg(errp, "server preferred block size %" PRId32 > + "is not valid", info->opt_block); > + return -1; > + } > + if (read_sync(ioc, &info->max_block, sizeof(info->max_block)) != > + sizeof(info->max_block)) { > + error_setg(errp, "failed to read info maximum block size"); > + return -1; > + } > + be32_to_cpus(&info->max_block); > + TRACE("Block sizes are 0x%" PRIx32 ", 0x%" PRIx32 ", 0x%" PRIx32, > + info->min_block, info->opt_block, info->max_block); > + break; > +
You should probably check min_block <= opt_block <= max_block here Also should there be a check that BDRV_SECTOR_SIZE >= min_block? > default: > TRACE("ignoring unknown export info %" PRIu16, type); > if (drop_sync(ioc, len) != len) { > @@ -710,8 +769,9 @@ fail: > #ifdef __linux__ > int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info) > { > - unsigned long sectors = info->size / BDRV_SECTOR_SIZE; > - if (info->size / BDRV_SECTOR_SIZE != sectors) { > + unsigned long sector_size = MAX(BDRV_SECTOR_SIZE, info->min_block); > + unsigned long sectors = info->size / sector_size; > + if (info->size / sector_size != sectors) { > LOG("Export size %" PRId64 " too large for 32-bit kernel", > info->size); > return -E2BIG; > } > @@ -724,18 +784,18 @@ int nbd_init(int fd, QIOChannelSocket *sioc, > NbdExportInfo *info) > return -serrno; > } > > - TRACE("Setting block size to %lu", (unsigned long)BDRV_SECTOR_SIZE); > + TRACE("Setting block size to %lu", sector_size); > > - if (ioctl(fd, NBD_SET_BLKSIZE, (unsigned long)BDRV_SECTOR_SIZE) < 0) { > + if (ioctl(fd, NBD_SET_BLKSIZE, sector_size) < 0) { > int serrno = errno; > LOG("Failed setting NBD block size"); > return -serrno; > } > > TRACE("Setting size to %lu block(s)", sectors); > - if (size % BDRV_SECTOR_SIZE) { > - TRACE("Ignoring trailing %d bytes of export", > - (int) (size % BDRV_SECTOR_SIZE)); > + if (info->size % sector_size) { > + TRACE("Ignoring trailing %" PRId64 " bytes of export", > + info->size % sector_size); > } > > if (ioctl(fd, NBD_SET_SIZE_BLOCKS, sectors) < 0) { > -- > 2.5.5 > > -- Alex Bligh