Damien Le Moal <damien.lem...@opensource.wdc.com> 于2022年10月13日周四 13:55写道: > > On 10/10/22 11:33, Sam Li wrote: > > A zone append command is a write operation that specifies the first > > logical block of a zone as the write position. When writing to a zoned > > block device using zone append, the byte offset of writes is pointing > > to the write pointer of that zone. Upon completion the device will > > respond with the position the data has been written in the zone. > > > > Signed-off-by: Sam Li <faithilike...@gmail.com> > > --- > > block/block-backend.c | 64 +++++++++++++++++++++++++++++++ > > block/file-posix.c | 64 ++++++++++++++++++++++++++++--- > > block/io.c | 21 ++++++++++ > > block/raw-format.c | 7 ++++ > > include/block/block-io.h | 3 ++ > > include/block/block_int-common.h | 3 ++ > > include/block/raw-aio.h | 4 +- > > include/sysemu/block-backend-io.h | 9 +++++ > > 8 files changed, 168 insertions(+), 7 deletions(-) > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > index ddc569e3ac..bfdb719bc8 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > > struct { > > BlockZoneOp op; > > } zone_mgmt; > > + struct { > > + int64_t *append_sector; > > I would call this "sector", since it will always be referenced as > "->zone_append.sector", you get the "append" for free :) > > That said, shouldn't this be a byte value, so called "offset" ? Not > entirely sure...
Yes, it can be changed to "offset"(byte) following QEMU's convention. Just need to add conversions to virtio_blk_zone_append/*_complete, which is easily done. > > > + } zone_append; > > }; > > } BlkRwCo; > > > > @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, > > BlockZoneOp op, > > return &acb->common; > > } > > > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) { > > + BlkAioEmAIOCB *acb = opaque; > > + BlkRwCo *rwco = &acb->rwco; > > + > > + rwco->ret = blk_co_zone_append(rwco->blk, > > rwco->zone_append.append_sector, > > + rwco->iobuf, rwco->flags); > > + blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > > + QEMUIOVector *qiov, BdrvRequestFlags flags, > > + BlockCompletionFunc *cb, void *opaque) { > > + BlkAioEmAIOCB *acb; > > + Coroutine *co; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > + acb->rwco = (BlkRwCo) { > > + .blk = blk, > > + .ret = NOT_DONE, > > + .flags = flags, > > + .iobuf = qiov, > > + .zone_append = { > > + .append_sector = offset, > > See above comment. So since this is a byte value, this needs to be > called "offset", no ? Yes, same answers above. > > > + }, > > + }; > > + acb->has_returned = false; > > + > > + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > > + bdrv_coroutine_enter(blk_bs(blk), co); > > + acb->has_returned = true; > > + if (acb->rwco.ret != NOT_DONE) { > > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > + } > > + > > + return &acb->common; > > +} > > + > > /* > > * Send a zone_report command. > > * offset is a byte offset from the start of the device. No alignment > > @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > > BlockZoneOp op, > > return ret; > > } > > > > +/* > > + * Send a zone_append command. > > + */ > > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > > + QEMUIOVector *qiov, BdrvRequestFlags flags) > > +{ > > + int ret; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + blk_wait_while_drained(blk); > > + if (!blk_is_available(blk)) { > > + blk_dec_in_flight(blk); > > + return -ENOMEDIUM; > > + } > > + > > + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > void blk_drain(BlockBackend *blk) > > { > > BlockDriverState *bs = blk_bs(blk); > > diff --git a/block/file-posix.c b/block/file-posix.c > > index 17c0b58158..08ab164df4 100755 > > --- a/block/file-posix.c > > +++ b/block/file-posix.c > > @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData > > *aiocb) > > ssize_t len; > > > > do { > > - if (aiocb->aio_type & QEMU_AIO_WRITE) > > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) > > len = qemu_pwritev(aiocb->aio_fildes, > > aiocb->io.iov, > > aiocb->io.niov, > > Hu... You are issuing the io for a zone append without first changing > the aiocb offset to be equal to the zone write pointer ? And you are It changed in the last patch. But it should be in this patch and make it specific to zone_append case, like: if ( type == & QEMU_AIO_ZONE_APPEND) { /* change offset here */ } > calling this without the wps->lock held... Changing the aio offset to be > equal to the wp && issuing the io must be atomic. Does this mean puttling locks around pwritev()? Like: lock(wp); len = pwritev(); unlock(wp); Because it is accessing wps[] by offset, which is a wp location. And when pwritev() executes, the offset should not be changed by other ios in flight. > > > @@ -1687,7 +1687,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData > > *aiocb, char *buf) > > ssize_t len; > > > > while (offset < aiocb->aio_nbytes) { > > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > > len = pwrite(aiocb->aio_fildes, > > (const char *)buf + offset, > > aiocb->aio_nbytes - offset, > > Same comment here. > > > @@ -1731,7 +1731,7 @@ static int handle_aiocb_rw(void *opaque) > > * The offset of regular writes, append writes is the wp location > > * of that zone. > > */ > > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > > if (aiocb->bs->bl.zone_size > 0) { > > BlockZoneWps *wps = aiocb->bs->bl.wps; > > qemu_mutex_lock(&wps->lock); > > @@ -1794,7 +1794,7 @@ static int handle_aiocb_rw(void *opaque) > > } > > > > nbytes = handle_aiocb_rw_linear(aiocb, buf); > > - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { > > + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { > > char *p = buf; > > size_t count = aiocb->aio_nbytes, copy; > > int i; > > @@ -1816,7 +1816,7 @@ static int handle_aiocb_rw(void *opaque) > > out: > > if (nbytes == aiocb->aio_nbytes) { > > #if defined(CONFIG_BLKZONED) > > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > > BlockZoneWps *wps = aiocb->bs->bl.wps; > > int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; > > if (wps) { > > @@ -1828,6 +1828,11 @@ out: > > if (wend_offset > wps->wp[index]){ > > wps->wp[index] = wend_offset; > > } > > + > > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > > + *aiocb->io.append_sector = > > + wps->wp[index] >> BDRV_SECTOR_BITS; > > + } > > Same comment as last time. You must do this BEFORE the previous hunk > that updates the wp. Otherwise, you are NOT returning the position of > the written data, but the position that follows the written data... > > If you do a zone append to an empty zone, the append sector you return > must be the zone start sector. You can see here that this will never be > the case unless you reverse the 2 hunks above. You are right. I mistook the append sector should be the end sector location. +Upon a successful completion of a VIRTIO_BLK_T_ZONE_APPEND request, the driver +MAY read the starting sector location of the written data from the request +field \field{append_sector}. > > > } > > qemu_mutex_unlock(&wps->lock); > > } > > @@ -1845,7 +1850,7 @@ out: > > } else { > > assert(nbytes < 0); > > #if defined(CONFIG_BLKZONED) > > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > > update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps, > > aiocb->bs->bl.nr_zones); > > } > > @@ -3493,6 +3498,52 @@ static int coroutine_fn > > raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > } > > #endif > > > > +#if defined(CONFIG_BLKZONED) > > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > > + int64_t *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags) { > > + BDRVRawState *s = bs->opaque; > > + int64_t zone_size_mask = bs->bl.zone_size - 1; > > + int64_t iov_len = 0; > > + int64_t len = 0; > > + RawPosixAIOData acb; > > + > > + if (*offset & zone_size_mask) { > > + error_report("sector offset %" PRId64 " is not aligned to zone > > size " > > + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512); > > + return -EINVAL; > > + } > > + > > + int64_t wg = bs->bl.write_granularity; > > + int64_t wg_mask = wg - 1; > > + for (int i = 0; i < qiov->niov; i++) { > > + iov_len = qiov->iov[i].iov_len; > > + if (iov_len & wg_mask) { > > + error_report("len of IOVector[%d] %" PRId64 " is not aligned to > > block " > > + "size %" PRId64 "", i, iov_len, wg); > > + return -EINVAL; > > + } > > + len += iov_len; > > + } > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_ZONE_APPEND, > > + .aio_offset = *offset, > > + .aio_nbytes = len, > > + .io = { > > + .iov = qiov->iov, > > + .niov = qiov->niov, > > + .append_sector = offset, > > + }, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); > > +} > > +#endif > > + > > static coroutine_fn int > > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > > bool blkdev) > > @@ -4268,6 +4319,7 @@ static BlockDriver bdrv_zoned_host_device = { > > /* zone management operations */ > > .bdrv_co_zone_report = raw_co_zone_report, > > .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > > + .bdrv_co_zone_append = raw_co_zone_append, > > }; > > #endif > > > > diff --git a/block/io.c b/block/io.c > > index e5aaa64e17..935abf2ed4 100644 > > --- a/block/io.c > > +++ b/block/io.c > > @@ -3230,6 +3230,27 @@ out: > > return co.ret; > > } > > > > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags) > > +{ > > + BlockDriver *drv = bs->drv; > > + CoroutineIOCompletion co = { > > + .coroutine = qemu_coroutine_self(), > > + }; > > + IO_CODE(); > > + > > + bdrv_inc_in_flight(bs); > > + if (!drv || !drv->bdrv_co_zone_append) { > > + co.ret = -ENOTSUP; > > + goto out; > > + } > > + co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags); > > +out: > > + bdrv_dec_in_flight(bs); > > + return co.ret; > > +} > > + > > void *qemu_blockalign(BlockDriverState *bs, size_t size) > > { > > IO_CODE(); > > diff --git a/block/raw-format.c b/block/raw-format.c > > index b885688434..f132880c85 100644 > > --- a/block/raw-format.c > > +++ b/block/raw-format.c > > @@ -325,6 +325,12 @@ static int coroutine_fn > > raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len); > > } > > > > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, int64_t > > *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags) { > > + return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags); > > +} > > + > > static int64_t raw_getlength(BlockDriverState *bs) > > { > > int64_t len; > > @@ -628,6 +634,7 @@ BlockDriver bdrv_raw = { > > .bdrv_co_pdiscard = &raw_co_pdiscard, > > .bdrv_co_zone_report = &raw_co_zone_report, > > .bdrv_co_zone_mgmt = &raw_co_zone_mgmt, > > + .bdrv_co_zone_append = &raw_co_zone_append, > > .bdrv_co_block_status = &raw_co_block_status, > > .bdrv_co_copy_range_from = &raw_co_copy_range_from, > > .bdrv_co_copy_range_to = &raw_co_copy_range_to, > > diff --git a/include/block/block-io.h b/include/block/block-io.h > > index f0cdf67d33..6a54453578 100644 > > --- a/include/block/block-io.h > > +++ b/include/block/block-io.h > > @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState > > *bs, int64_t offset, > > BlockZoneDescriptor *zones); > > int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > int64_t offset, int64_t len); > > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags); > > > > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > > diff --git a/include/block/block_int-common.h > > b/include/block/block_int-common.h > > index 59c2d1316d..a7e7db5646 100644 > > --- a/include/block/block_int-common.h > > +++ b/include/block/block_int-common.h > > @@ -701,6 +701,9 @@ struct BlockDriver { > > BlockZoneDescriptor *zones); > > int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, > > BlockZoneOp op, > > int64_t offset, int64_t len); > > + int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs, > > + int64_t *offset, QEMUIOVector *qiov, > > + BdrvRequestFlags flags); > > > > /* removable device specific */ > > bool (*bdrv_is_inserted)(BlockDriverState *bs); > > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > > index 3d26929cdd..f13cc1887b 100644 > > --- a/include/block/raw-aio.h > > +++ b/include/block/raw-aio.h > > @@ -31,6 +31,7 @@ > > #define QEMU_AIO_TRUNCATE 0x0080 > > #define QEMU_AIO_ZONE_REPORT 0x0100 > > #define QEMU_AIO_ZONE_MGMT 0x0200 > > +#define QEMU_AIO_ZONE_APPEND 0x0400 > > #define QEMU_AIO_TYPE_MASK \ > > (QEMU_AIO_READ | \ > > QEMU_AIO_WRITE | \ > > @@ -41,7 +42,8 @@ > > QEMU_AIO_COPY_RANGE | \ > > QEMU_AIO_TRUNCATE | \ > > QEMU_AIO_ZONE_REPORT | \ > > - QEMU_AIO_ZONE_MGMT) > > + QEMU_AIO_ZONE_MGMT | \ > > + QEMU_AIO_ZONE_APPEND) > > > > /* AIO flags */ > > #define QEMU_AIO_MISALIGNED 0x1000 > > diff --git a/include/sysemu/block-backend-io.h > > b/include/sysemu/block-backend-io.h > > index 6835525582..33e35ae5d7 100644 > > --- a/include/sysemu/block-backend-io.h > > +++ b/include/sysemu/block-backend-io.h > > @@ -51,6 +51,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, > > int64_t offset, > > BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > int64_t offset, int64_t len, > > BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > > + QEMUIOVector *qiov, BdrvRequestFlags flags, > > + BlockCompletionFunc *cb, void *opaque); > > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t > > bytes, > > BlockCompletionFunc *cb, void *opaque); > > void blk_aio_cancel_async(BlockAIOCB *acb); > > @@ -172,6 +175,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > > BlockZoneOp op, > > int64_t offset, int64_t len); > > int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > int64_t offset, int64_t len); > > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags); > > +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t > > *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags); > > > > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > > int64_t bytes); > > -- > Damien Le Moal > Western Digital Research >