Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices
Damien Le Moal 于2022年10月17日周一 13:22写道: > > On 10/16/22 23:56, Sam Li wrote: > > A zone append command is a write operation that specifies the first > > logical block of a zone as the write position. When writing to a zoned > > block device using zone append, the byte offset of writes is pointing > > to the write pointer of that zone. Upon completion the device will > > respond with the position the data has been written in the zone. > > > > Signed-off-by: Sam Li > > --- > > block/block-backend.c | 65 ++ > > block/file-posix.c| 89 +-- > > block/io.c| 21 > > block/raw-format.c| 8 +++ > > include/block/block-io.h | 3 ++ > > include/block/block_int-common.h | 5 ++ > > include/block/raw-aio.h | 4 +- > > include/sysemu/block-backend-io.h | 9 > > 8 files changed, 198 insertions(+), 6 deletions(-) > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > index 1c618e9c68..06931ddd24 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > > struct { > > unsigned long op; > > } zone_mgmt; > > +struct { > > +int64_t *append_sector; > > As mentioned previosuly, call this sector. "append" is already in the > zone_append struct member name Right, missed it here. I'll use "offset" then since it's in block layer. > > > +} zone_append; > > }; > > } BlkRwCo; > > > > @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, > > BlockZoneOp op, > > return &acb->common; > > } > > > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) > > +{ > > +BlkAioEmAIOCB *acb = opaque; > > +BlkRwCo *rwco = &acb->rwco; > > + > > +rwco->ret = blk_co_zone_append(rwco->blk, > > rwco->zone_append.append_sector, > > ...so you avoid awkward repetitions of "append" like here. You'll have: > rwco->zone_append.sector, which is shorter and more natural. > > > + rwco->iobuf, rwco->flags); > > +blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > > +QEMUIOVector *qiov, BdrvRequestFlags flags, > > +BlockCompletionFunc *cb, void *opaque) { > > +BlkAioEmAIOCB *acb; > > +Coroutine *co; > > +IO_CODE(); > > + > > +blk_inc_in_flight(blk); > > +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > +acb->rwco = (BlkRwCo) { > > +.blk= blk, > > +.ret= NOT_DONE, > > +.flags = flags, > > +.iobuf = qiov, > > +.zone_append = { > > +.append_sector = offset, > > +}, > > +}; > > +acb->has_returned = false; > > + > > +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > > +bdrv_coroutine_enter(blk_bs(blk), co); > > +acb->has_returned = true; > > +if (acb->rwco.ret != NOT_DONE) { > > +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > +} > > + > > +return &acb->common; > > +} > > + > > /* > > * Send a zone_report command. > > * offset is a byte offset from the start of the device. No alignment > > @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > > BlockZoneOp op, > > return ret; > > } > > > > +/* > > + * Send a zone_append command. > > + */ > > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > > +QEMUIOVector *qiov, BdrvRequestFlags flags) > > +{ > > +int ret; > > +IO_CODE(); > > + > > +blk_inc_in_flight(blk); > > +blk_wait_while_drained(blk); > > +if (!blk_is_available(blk)) { > > +blk_dec_in_flight(blk); > > +return -ENOMEDIUM; > > +} > > + > > +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > > +blk_dec_in_flight(blk); > > +return ret; > > +} > > + > > void blk_drain(BlockBackend *blk) > > { > > BlockDriverState *bs = blk_bs(blk); > > diff --git a/block/file-posix.c b/block/file-posix.c > > index 5ff5500301..3d0cc33d02 100644 > > --- a/block/file-posix.c > > +++ b/block/file-posix.c > > @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData { > > struct { > > struct iovec *iov; > > int niov; > > +int64_t *offset; > > } io; > > struct { > > uint64_t cmd; > > @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, > > Error **errp) > > bs->bl.max_active_zones = ret; > > } > > > > +ret = get_sysfs_long_val(&st, "physical_block_size"); > > +if (ret >= 0) { > > +bs->bl.write_granularity = ret; > > +
Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices
On 10/16/22 23:56, Sam Li wrote: > A zone append command is a write operation that specifies the first > logical block of a zone as the write position. When writing to a zoned > block device using zone append, the byte offset of writes is pointing > to the write pointer of that zone. Upon completion the device will > respond with the position the data has been written in the zone. > > Signed-off-by: Sam Li > --- > block/block-backend.c | 65 ++ > block/file-posix.c| 89 +-- > block/io.c| 21 > block/raw-format.c| 8 +++ > include/block/block-io.h | 3 ++ > include/block/block_int-common.h | 5 ++ > include/block/raw-aio.h | 4 +- > include/sysemu/block-backend-io.h | 9 > 8 files changed, 198 insertions(+), 6 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index 1c618e9c68..06931ddd24 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > struct { > unsigned long op; > } zone_mgmt; > +struct { > +int64_t *append_sector; As mentioned previosuly, call this sector. "append" is already in the zone_append struct member name > +} zone_append; > }; > } BlkRwCo; > > @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return &acb->common; > } > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) > +{ > +BlkAioEmAIOCB *acb = opaque; > +BlkRwCo *rwco = &acb->rwco; > + > +rwco->ret = blk_co_zone_append(rwco->blk, > rwco->zone_append.append_sector, ...so you avoid awkward repetitions of "append" like here. You'll have: rwco->zone_append.sector, which is shorter and more natural. > + rwco->iobuf, rwco->flags); > +blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > +QEMUIOVector *qiov, BdrvRequestFlags flags, > +BlockCompletionFunc *cb, void *opaque) { > +BlkAioEmAIOCB *acb; > +Coroutine *co; > +IO_CODE(); > + > +blk_inc_in_flight(blk); > +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > +acb->rwco = (BlkRwCo) { > +.blk= blk, > +.ret= NOT_DONE, > +.flags = flags, > +.iobuf = qiov, > +.zone_append = { > +.append_sector = offset, > +}, > +}; > +acb->has_returned = false; > + > +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > +bdrv_coroutine_enter(blk_bs(blk), co); > +acb->has_returned = true; > +if (acb->rwco.ret != NOT_DONE) { > +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > +} > + > +return &acb->common; > +} > + > /* > * Send a zone_report command. > * offset is a byte offset from the start of the device. No alignment > @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return ret; > } > > +/* > + * Send a zone_append command. > + */ > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > +QEMUIOVector *qiov, BdrvRequestFlags flags) > +{ > +int ret; > +IO_CODE(); > + > +blk_inc_in_flight(blk); > +blk_wait_while_drained(blk); > +if (!blk_is_available(blk)) { > +blk_dec_in_flight(blk); > +return -ENOMEDIUM; > +} > + > +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > +blk_dec_in_flight(blk); > +return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 5ff5500301..3d0cc33d02 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData { > struct { > struct iovec *iov; > int niov; > +int64_t *offset; > } io; > struct { > uint64_t cmd; > @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, > Error **errp) > bs->bl.max_active_zones = ret; > } > > +ret = get_sysfs_long_val(&st, "physical_block_size"); > +if (ret >= 0) { > +bs->bl.write_granularity = ret; > +} > + > bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); > if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) { > error_report("report wps failed"); > @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int > nr_iov, off_t offset) > static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > { > ssize_t
Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices
On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote: > A zone append command is a write operation that specifies the first > logical block of a zone as the write position. When writing to a zoned > block device using zone append, the byte offset of writes is pointing > to the write pointer of that zone. Upon completion the device will > respond with the position the data has been written in the zone. > > Signed-off-by: Sam Li > --- > block/block-backend.c | 65 ++ > block/file-posix.c | 89 +-- > block/io.c | 21 > block/raw-format.c | 8 +++ > include/block/block-io.h | 3 ++ > include/block/block_int-common.h | 5 ++ > include/block/raw-aio.h | 4 +- > include/sysemu/block-backend-io.h | 9 > 8 files changed, 198 insertions(+), 6 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index 1c618e9c68..06931ddd24 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > struct { > unsigned long op; > } zone_mgmt; > + struct { > + int64_t *append_sector; > + } zone_append; > }; > } BlkRwCo; > > @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return &acb->common; > } > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_append(rwco->blk, > rwco->zone_append.append_sector, > + rwco->iobuf, rwco->flags); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags, > + BlockCompletionFunc *cb, void *opaque) { > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .ret = NOT_DONE, > + .flags = flags, > + .iobuf = qiov, > + .zone_append = { > + .append_sector = offset, > + }, > + }; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > /* > * Send a zone_report command. > * offset is a byte offset from the start of the device. No alignment > @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return ret; > } > > +/* > + * Send a zone_append command. > + */ > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags) > +{ > + int ret; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + blk_wait_while_drained(blk); > + if (!blk_is_available(blk)) { > + blk_dec_in_flight(blk); > + return -ENOMEDIUM; > + } > + > + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > + blk_dec_in_flight(blk); > + return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 5ff5500301..3d0cc33d02 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData { > struct { > struct iovec *iov; > int niov; > + int64_t *offset; > } io; > struct { > uint64_t cmd; > @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, > Error **errp) > bs->bl.max_active_zones = ret; > } > > + ret = get_sysfs_long_val(&st, "physical_block_size"); > + if (ret >= 0) { > + bs->bl.write_granularity = ret; > + } > + > bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); > if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) { > error_report("report wps failed"); > @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int > nr_iov, off_t offset) > static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > { > ssize_t len; > + BlockZoneWps *wps = aiocb->bs->bl.wps; > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; Can this code ever be called for a non-zoned device with 0 zone size? If yes, you need to avoid division by zero h
[PATCH v4 2/3] block: introduce zone append write for zoned devices
A zone append command is a write operation that specifies the first logical block of a zone as the write position. When writing to a zoned block device using zone append, the byte offset of writes is pointing to the write pointer of that zone. Upon completion the device will respond with the position the data has been written in the zone. Signed-off-by: Sam Li --- block/block-backend.c | 65 ++ block/file-posix.c| 89 +-- block/io.c| 21 block/raw-format.c| 8 +++ include/block/block-io.h | 3 ++ include/block/block_int-common.h | 5 ++ include/block/raw-aio.h | 4 +- include/sysemu/block-backend-io.h | 9 8 files changed, 198 insertions(+), 6 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 1c618e9c68..06931ddd24 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { struct { unsigned long op; } zone_mgmt; +struct { +int64_t *append_sector; +} zone_append; }; } BlkRwCo; @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, return &acb->common; } +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) +{ +BlkAioEmAIOCB *acb = opaque; +BlkRwCo *rwco = &acb->rwco; + +rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector, + rwco->iobuf, rwco->flags); +blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, +QEMUIOVector *qiov, BdrvRequestFlags flags, +BlockCompletionFunc *cb, void *opaque) { +BlkAioEmAIOCB *acb; +Coroutine *co; +IO_CODE(); + +blk_inc_in_flight(blk); +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); +acb->rwco = (BlkRwCo) { +.blk= blk, +.ret= NOT_DONE, +.flags = flags, +.iobuf = qiov, +.zone_append = { +.append_sector = offset, +}, +}; +acb->has_returned = false; + +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); +bdrv_coroutine_enter(blk_bs(blk), co); +acb->has_returned = true; +if (acb->rwco.ret != NOT_DONE) { +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), + blk_aio_complete_bh, acb); +} + +return &acb->common; +} + /* * Send a zone_report command. * offset is a byte offset from the start of the device. No alignment @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, return ret; } +/* + * Send a zone_append command. + */ +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, +QEMUIOVector *qiov, BdrvRequestFlags flags) +{ +int ret; +IO_CODE(); + +blk_inc_in_flight(blk); +blk_wait_while_drained(blk); +if (!blk_is_available(blk)) { +blk_dec_in_flight(blk); +return -ENOMEDIUM; +} + +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); +blk_dec_in_flight(blk); +return ret; +} + void blk_drain(BlockBackend *blk) { BlockDriverState *bs = blk_bs(blk); diff --git a/block/file-posix.c b/block/file-posix.c index 5ff5500301..3d0cc33d02 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData { struct { struct iovec *iov; int niov; +int64_t *offset; } io; struct { uint64_t cmd; @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.max_active_zones = ret; } +ret = get_sysfs_long_val(&st, "physical_block_size"); +if (ret >= 0) { +bs->bl.write_granularity = ret; +} + bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) { error_report("report wps failed"); @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) { ssize_t len; +BlockZoneWps *wps = aiocb->bs->bl.wps; +int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; + +if (wps) { +qemu_mutex_lock(&wps->lock); +if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { +aiocb->aio_offset = wps->wp[index]; +} +} do { -if (aiocb->aio_type & QEMU_AIO_WRITE) +if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) len = qemu_pwritev(aiocb->aio_fildes, aiocb->io.iov, aiocb->i