Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices
On 2022/10/13 16:27, Sam Li wrote: > Damien Le Moal 于2022年10月13日周四 13:55写道: >> >> On 10/10/22 11:33, Sam Li wrote: >>> A zone append command is a write operation that specifies the first >>> logical block of a zone as the write position. When writing to a zoned >>> block device using zone append, the byte offset of writes is pointing >>> to the write pointer of that zone. Upon completion the device will >>> respond with the position the data has been written in the zone. >>> >>> Signed-off-by: Sam Li >>> --- >>> block/block-backend.c | 64 +++ >>> block/file-posix.c| 64 --- >>> block/io.c| 21 ++ >>> block/raw-format.c| 7 >>> include/block/block-io.h | 3 ++ >>> include/block/block_int-common.h | 3 ++ >>> include/block/raw-aio.h | 4 +- >>> include/sysemu/block-backend-io.h | 9 + >>> 8 files changed, 168 insertions(+), 7 deletions(-) >>> >>> diff --git a/block/block-backend.c b/block/block-backend.c >>> index ddc569e3ac..bfdb719bc8 100644 >>> --- a/block/block-backend.c >>> +++ b/block/block-backend.c >>> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { >>> struct { >>> BlockZoneOp op; >>> } zone_mgmt; >>> +struct { >>> +int64_t *append_sector; >> >> I would call this "sector", since it will always be referenced as >> "->zone_append.sector", you get the "append" for free :) >> >> That said, shouldn't this be a byte value, so called "offset" ? Not >> entirely sure... > > Yes, it can be changed to "offset"(byte) following QEMU's convention. > Just need to add conversions to virtio_blk_zone_append/*_complete, > which is easily done. > >> >>> +} zone_append; >>> }; >>> } BlkRwCo; >>> >>> @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, >>> BlockZoneOp op, >>> return &acb->common; >>> } >>> >>> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) { >>> +BlkAioEmAIOCB *acb = opaque; >>> +BlkRwCo *rwco = &acb->rwco; >>> + >>> +rwco->ret = blk_co_zone_append(rwco->blk, >>> rwco->zone_append.append_sector, >>> + rwco->iobuf, rwco->flags); >>> +blk_aio_complete(acb); >>> +} >>> + >>> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, >>> +QEMUIOVector *qiov, BdrvRequestFlags flags, >>> +BlockCompletionFunc *cb, void *opaque) { >>> +BlkAioEmAIOCB *acb; >>> +Coroutine *co; >>> +IO_CODE(); >>> + >>> +blk_inc_in_flight(blk); >>> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); >>> +acb->rwco = (BlkRwCo) { >>> +.blk= blk, >>> +.ret= NOT_DONE, >>> +.flags = flags, >>> +.iobuf = qiov, >>> +.zone_append = { >>> +.append_sector = offset, >> >> See above comment. So since this is a byte value, this needs to be >> called "offset", no ? > > Yes, same answers above. > >> >>> +}, >>> +}; >>> +acb->has_returned = false; >>> + >>> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); >>> +bdrv_coroutine_enter(blk_bs(blk), co); >>> +acb->has_returned = true; >>> +if (acb->rwco.ret != NOT_DONE) { >>> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), >>> + blk_aio_complete_bh, acb); >>> +} >>> + >>> +return &acb->common; >>> +} >>> + >>> /* >>> * Send a zone_report command. >>> * offset is a byte offset from the start of the device. No alignment >>> @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, >>> BlockZoneOp op, >>> return ret; >>> } >>> >>> +/* >>> + * Send a zone_append command. >>> + */ >>> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, >>> +QEMUIOVector *qiov, BdrvRequestFlags flags) >>> +{ >>> +int ret; >>> +IO_CODE(); >>> + >>> +blk_inc_in_flight(blk); >>> +blk_wait_while_drained(blk); >>> +if (!blk_is_available(blk)) { >>> +blk_dec_in_flight(blk); >>> +return -ENOMEDIUM; >>> +} >>> + >>> +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); >>> +blk_dec_in_flight(blk); >>> +return ret; >>> +} >>> + >>> void blk_drain(BlockBackend *blk) >>> { >>> BlockDriverState *bs = blk_bs(blk); >>> diff --git a/block/file-posix.c b/block/file-posix.c >>> index 17c0b58158..08ab164df4 100755 >>> --- a/block/file-posix.c >>> +++ b/block/file-posix.c >>> @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData >>> *aiocb) >>> ssize_t len; >>> >>> do { >>> -if (aiocb->aio_type & QEMU_AIO_WRITE) >>> +if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) >>> len = qemu_pwritev(aiocb->aio_fildes, >>>
Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices
Damien Le Moal 于2022年10月13日周四 13:55写道: > > On 10/10/22 11:33, Sam Li wrote: > > A zone append command is a write operation that specifies the first > > logical block of a zone as the write position. When writing to a zoned > > block device using zone append, the byte offset of writes is pointing > > to the write pointer of that zone. Upon completion the device will > > respond with the position the data has been written in the zone. > > > > Signed-off-by: Sam Li > > --- > > block/block-backend.c | 64 +++ > > block/file-posix.c| 64 --- > > block/io.c| 21 ++ > > block/raw-format.c| 7 > > include/block/block-io.h | 3 ++ > > include/block/block_int-common.h | 3 ++ > > include/block/raw-aio.h | 4 +- > > include/sysemu/block-backend-io.h | 9 + > > 8 files changed, 168 insertions(+), 7 deletions(-) > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > index ddc569e3ac..bfdb719bc8 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > > struct { > > BlockZoneOp op; > > } zone_mgmt; > > +struct { > > +int64_t *append_sector; > > I would call this "sector", since it will always be referenced as > "->zone_append.sector", you get the "append" for free :) > > That said, shouldn't this be a byte value, so called "offset" ? Not > entirely sure... Yes, it can be changed to "offset"(byte) following QEMU's convention. Just need to add conversions to virtio_blk_zone_append/*_complete, which is easily done. > > > +} zone_append; > > }; > > } BlkRwCo; > > > > @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, > > BlockZoneOp op, > > return &acb->common; > > } > > > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) { > > +BlkAioEmAIOCB *acb = opaque; > > +BlkRwCo *rwco = &acb->rwco; > > + > > +rwco->ret = blk_co_zone_append(rwco->blk, > > rwco->zone_append.append_sector, > > + rwco->iobuf, rwco->flags); > > +blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > > +QEMUIOVector *qiov, BdrvRequestFlags flags, > > +BlockCompletionFunc *cb, void *opaque) { > > +BlkAioEmAIOCB *acb; > > +Coroutine *co; > > +IO_CODE(); > > + > > +blk_inc_in_flight(blk); > > +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > +acb->rwco = (BlkRwCo) { > > +.blk= blk, > > +.ret= NOT_DONE, > > +.flags = flags, > > +.iobuf = qiov, > > +.zone_append = { > > +.append_sector = offset, > > See above comment. So since this is a byte value, this needs to be > called "offset", no ? Yes, same answers above. > > > +}, > > +}; > > +acb->has_returned = false; > > + > > +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > > +bdrv_coroutine_enter(blk_bs(blk), co); > > +acb->has_returned = true; > > +if (acb->rwco.ret != NOT_DONE) { > > +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > +} > > + > > +return &acb->common; > > +} > > + > > /* > > * Send a zone_report command. > > * offset is a byte offset from the start of the device. No alignment > > @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > > BlockZoneOp op, > > return ret; > > } > > > > +/* > > + * Send a zone_append command. > > + */ > > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > > +QEMUIOVector *qiov, BdrvRequestFlags flags) > > +{ > > +int ret; > > +IO_CODE(); > > + > > +blk_inc_in_flight(blk); > > +blk_wait_while_drained(blk); > > +if (!blk_is_available(blk)) { > > +blk_dec_in_flight(blk); > > +return -ENOMEDIUM; > > +} > > + > > +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > > +blk_dec_in_flight(blk); > > +return ret; > > +} > > + > > void blk_drain(BlockBackend *blk) > > { > > BlockDriverState *bs = blk_bs(blk); > > diff --git a/block/file-posix.c b/block/file-posix.c > > index 17c0b58158..08ab164df4 100755 > > --- a/block/file-posix.c > > +++ b/block/file-posix.c > > @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData > > *aiocb) > > ssize_t len; > > > > do { > > -if (aiocb->aio_type & QEMU_AIO_WRITE) > > +if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) > > len = qemu_pwritev(aiocb->aio_fildes, > > aiocb->io.iov, > >
Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices
On 10/10/22 11:33, Sam Li wrote: > A zone append command is a write operation that specifies the first > logical block of a zone as the write position. When writing to a zoned > block device using zone append, the byte offset of writes is pointing > to the write pointer of that zone. Upon completion the device will > respond with the position the data has been written in the zone. > > Signed-off-by: Sam Li > --- > block/block-backend.c | 64 +++ > block/file-posix.c| 64 --- > block/io.c| 21 ++ > block/raw-format.c| 7 > include/block/block-io.h | 3 ++ > include/block/block_int-common.h | 3 ++ > include/block/raw-aio.h | 4 +- > include/sysemu/block-backend-io.h | 9 + > 8 files changed, 168 insertions(+), 7 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index ddc569e3ac..bfdb719bc8 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > struct { > BlockZoneOp op; > } zone_mgmt; > +struct { > +int64_t *append_sector; I would call this "sector", since it will always be referenced as "->zone_append.sector", you get the "append" for free :) That said, shouldn't this be a byte value, so called "offset" ? Not entirely sure... > +} zone_append; > }; > } BlkRwCo; > > @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return &acb->common; > } > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) { > +BlkAioEmAIOCB *acb = opaque; > +BlkRwCo *rwco = &acb->rwco; > + > +rwco->ret = blk_co_zone_append(rwco->blk, > rwco->zone_append.append_sector, > + rwco->iobuf, rwco->flags); > +blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > +QEMUIOVector *qiov, BdrvRequestFlags flags, > +BlockCompletionFunc *cb, void *opaque) { > +BlkAioEmAIOCB *acb; > +Coroutine *co; > +IO_CODE(); > + > +blk_inc_in_flight(blk); > +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > +acb->rwco = (BlkRwCo) { > +.blk= blk, > +.ret= NOT_DONE, > +.flags = flags, > +.iobuf = qiov, > +.zone_append = { > +.append_sector = offset, See above comment. So since this is a byte value, this needs to be called "offset", no ? > +}, > +}; > +acb->has_returned = false; > + > +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > +bdrv_coroutine_enter(blk_bs(blk), co); > +acb->has_returned = true; > +if (acb->rwco.ret != NOT_DONE) { > +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > +} > + > +return &acb->common; > +} > + > /* > * Send a zone_report command. > * offset is a byte offset from the start of the device. No alignment > @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return ret; > } > > +/* > + * Send a zone_append command. > + */ > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > +QEMUIOVector *qiov, BdrvRequestFlags flags) > +{ > +int ret; > +IO_CODE(); > + > +blk_inc_in_flight(blk); > +blk_wait_while_drained(blk); > +if (!blk_is_available(blk)) { > +blk_dec_in_flight(blk); > +return -ENOMEDIUM; > +} > + > +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > +blk_dec_in_flight(blk); > +return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 17c0b58158..08ab164df4 100755 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData > *aiocb) > ssize_t len; > > do { > -if (aiocb->aio_type & QEMU_AIO_WRITE) > +if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) > len = qemu_pwritev(aiocb->aio_fildes, > aiocb->io.iov, > aiocb->io.niov, Hu... You are issuing the io for a zone append without first changing the aiocb offset to be equal to the zone write pointer ? And you are calling this without the wps->lock held... Changing the aio offset to be equal to the wp && issuing the io must be atomic. > @@ -1687,7 +1687,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData > *aiocb, char *buf) > ssize_t len; > > while (offset < aiocb->aio_nbytes) { > -if (aiocb->a