Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices

Sam Li Thu, 13 Oct 2022 00:52:28 -0700

Damien Le Moal <damien.lem...@opensource.wdc.com> 于2022年10月13日周四 13:55写道：
>
> On 10/10/22 11:33, Sam Li wrote:
> > A zone append command is a write operation that specifies the first
> > logical block of a zone as the write position. When writing to a zoned
> > block device using zone append, the byte offset of writes is pointing
> > to the write pointer of that zone. Upon completion the device will
> > respond with the position the data has been written in the zone.
> >
> > Signed-off-by: Sam Li <faithilike...@gmail.com>
> > ---
> >  block/block-backend.c             | 64 +++++++++++++++++++++++++++++++
> >  block/file-posix.c                | 64 ++++++++++++++++++++++++++++---
> >  block/io.c                        | 21 ++++++++++
> >  block/raw-format.c                |  7 ++++
> >  include/block/block-io.h          |  3 ++
> >  include/block/block_int-common.h  |  3 ++
> >  include/block/raw-aio.h           |  4 +-
> >  include/sysemu/block-backend-io.h |  9 +++++
> >  8 files changed, 168 insertions(+), 7 deletions(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index ddc569e3ac..bfdb719bc8 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
> >          struct {
> >              BlockZoneOp op;
> >          } zone_mgmt;
> > +        struct {
> > +            int64_t *append_sector;
>
> I would call this "sector", since it will always be referenced as
> "->zone_append.sector", you get the "append" for free :)
>
> That said, shouldn't this be a byte value, so called "offset" ? Not
> entirely sure...


Yes, it can be changed to "offset"(byte) following QEMU's convention.
Just need to add conversions to virtio_blk_zone_append/*_complete,
which is easily done.

>
> > +        } zone_append;
> >      };
> >  } BlkRwCo;
> >
> > @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> > BlockZoneOp op,
> >      return &acb->common;
> >  }
> >
> > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
> > +    BlkAioEmAIOCB *acb = opaque;
> > +    BlkRwCo *rwco = &acb->rwco;
> > +
> > +    rwco->ret = blk_co_zone_append(rwco->blk, 
> > rwco->zone_append.append_sector,
> > +                                   rwco->iobuf, rwco->flags);
> > +    blk_aio_complete(acb);
> > +}
> > +
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> > +                                BlockCompletionFunc *cb, void *opaque) {
> > +    BlkAioEmAIOCB *acb;
> > +    Coroutine *co;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> > +    acb->rwco = (BlkRwCo) {
> > +            .blk    = blk,
> > +            .ret    = NOT_DONE,
> > +            .flags  = flags,
> > +            .iobuf  = qiov,
> > +            .zone_append = {
> > +                    .append_sector = offset,
>
> See above comment. So since this is a byte value, this needs to be
> called "offset", no ?

Yes, same answers above.

>
> > +            },
> > +    };
> > +    acb->has_returned = false;
> > +
> > +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> > +    bdrv_coroutine_enter(blk_bs(blk), co);
> > +    acb->has_returned = true;
> > +    if (acb->rwco.ret != NOT_DONE) {
> > +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> > +                                         blk_aio_complete_bh, acb);
> > +    }
> > +
> > +    return &acb->common;
> > +}
> > +
> >  /*
> >   * Send a zone_report command.
> >   * offset is a byte offset from the start of the device. No alignment
> > @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> > BlockZoneOp op,
> >      return ret;
> >  }
> >
> > +/*
> > + * Send a zone_append command.
> > + */
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > +        QEMUIOVector *qiov, BdrvRequestFlags flags)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    blk_wait_while_drained(blk);
> > +    if (!blk_is_available(blk)) {
> > +        blk_dec_in_flight(blk);
> > +        return -ENOMEDIUM;
> > +    }
> > +
> > +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >      BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 17c0b58158..08ab164df4 100755
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
> > *aiocb)
> >      ssize_t len;
> >
> >      do {
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE)
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
> >              len = qemu_pwritev(aiocb->aio_fildes,
> >                                 aiocb->io.iov,
> >                                 aiocb->io.niov,
>
> Hu... You are issuing the io for a zone append without first changing
> the aiocb offset to be equal to the zone write pointer ? And you are

It changed in the last patch. But it should be in this patch and make
it specific to zone_append case, like:
if ( type == & QEMU_AIO_ZONE_APPEND) {
    /* change offset here */
}

> calling this without the wps->lock held... Changing the aio offset to be
> equal to the wp && issuing the io must be atomic.

Does this mean puttling locks around pwritev()? Like:
lock(wp);
len = pwritev();
unlock(wp);

Because it is accessing wps[] by offset, which is a wp location. And
when pwritev()  executes, the offset should not be changed by other
ios in flight.

>
> > @@ -1687,7 +1687,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData 
> > *aiocb, char *buf)
> >      ssize_t len;
> >
> >      while (offset < aiocb->aio_nbytes) {
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >              len = pwrite(aiocb->aio_fildes,
> >                           (const char *)buf + offset,
> >                           aiocb->aio_nbytes - offset,
>
> Same comment here.
>
> > @@ -1731,7 +1731,7 @@ static int handle_aiocb_rw(void *opaque)
> >       * The offset of regular writes, append writes is the wp location
> >       * of that zone.
> >       */
> > -    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +    if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >          if (aiocb->bs->bl.zone_size > 0) {
> >              BlockZoneWps *wps = aiocb->bs->bl.wps;
> >              qemu_mutex_lock(&wps->lock);
> > @@ -1794,7 +1794,7 @@ static int handle_aiocb_rw(void *opaque)
> >      }
> >
> >      nbytes = handle_aiocb_rw_linear(aiocb, buf);
> > -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> > +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
> >          char *p = buf;
> >          size_t count = aiocb->aio_nbytes, copy;
> >          int i;
> > @@ -1816,7 +1816,7 @@ static int handle_aiocb_rw(void *opaque)
> >  out:
> >      if (nbytes == aiocb->aio_nbytes) {
> >  #if defined(CONFIG_BLKZONED)
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >              BlockZoneWps *wps = aiocb->bs->bl.wps;
> >              int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> >              if (wps) {
> > @@ -1828,6 +1828,11 @@ out:
> >                      if (wend_offset > wps->wp[index]){
> >                          wps->wp[index] = wend_offset;
> >                      }
> > +
> > +                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> > +                        *aiocb->io.append_sector =
> > +                                wps->wp[index] >> BDRV_SECTOR_BITS;
> > +                    }
>
> Same comment as last time. You must do this BEFORE the previous hunk
> that updates the wp. Otherwise, you are NOT returning the position of
> the written data, but the position that follows the written data...
>
> If you do a zone append to an empty zone, the append sector you return
> must be the zone start sector. You can see here that this will never be
> the case unless you reverse the 2 hunks above.

You are right. I mistook the append sector should be the end sector location.

+Upon a successful completion of a VIRTIO_BLK_T_ZONE_APPEND request, the driver
+MAY read the starting sector location of the written data from the request
+field \field{append_sector}.

>
> >                  }
> >                  qemu_mutex_unlock(&wps->lock);
> >              }
> > @@ -1845,7 +1850,7 @@ out:
> >      } else {
> >          assert(nbytes < 0);
> >  #if defined(CONFIG_BLKZONED)
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >              update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> >                              aiocb->bs->bl.nr_zones);
> >          }
> > @@ -3493,6 +3498,52 @@ static int coroutine_fn 
> > raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >  }
> >  #endif
> >
> > +#if defined(CONFIG_BLKZONED)
> > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> > +                                           int64_t *offset,
> > +                                           QEMUIOVector *qiov,
> > +                                           BdrvRequestFlags flags) {
> > +    BDRVRawState *s = bs->opaque;
> > +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> > +    int64_t iov_len = 0;
> > +    int64_t len = 0;
> > +    RawPosixAIOData acb;
> > +
> > +    if (*offset & zone_size_mask) {
> > +        error_report("sector offset %" PRId64 " is not aligned to zone 
> > size "
> > +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> > +        return -EINVAL;
> > +    }
> > +
> > +    int64_t wg = bs->bl.write_granularity;
> > +    int64_t wg_mask = wg - 1;
> > +    for (int i = 0; i < qiov->niov; i++) {
> > +       iov_len = qiov->iov[i].iov_len;
> > +       if (iov_len & wg_mask) {
> > +           error_report("len of IOVector[%d] %" PRId64 " is not aligned to 
> > block "
> > +                        "size %" PRId64 "", i, iov_len, wg);
> > +           return -EINVAL;
> > +       }
> > +       len += iov_len;
> > +    }
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs = bs,
> > +        .aio_fildes = s->fd,
> > +        .aio_type = QEMU_AIO_ZONE_APPEND,
> > +        .aio_offset = *offset,
> > +        .aio_nbytes = len,
> > +        .io = {
> > +                .iov = qiov->iov,
> > +                .niov = qiov->niov,
> > +                .append_sector = offset,
> > +        },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> > +}
> > +#endif
> > +
> >  static coroutine_fn int
> >  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >                  bool blkdev)
> > @@ -4268,6 +4319,7 @@ static BlockDriver bdrv_zoned_host_device = {
> >      /* zone management operations */
> >      .bdrv_co_zone_report = raw_co_zone_report,
> >      .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +    .bdrv_co_zone_append = raw_co_zone_append,
> >  };
> >  #endif
> >
> > diff --git a/block/io.c b/block/io.c
> > index e5aaa64e17..935abf2ed4 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -3230,6 +3230,27 @@ out:
> >      return co.ret;
> >  }
> >
> > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > +                        QEMUIOVector *qiov,
> > +                        BdrvRequestFlags flags)
> > +{
> > +    BlockDriver *drv = bs->drv;
> > +    CoroutineIOCompletion co = {
> > +            .coroutine = qemu_coroutine_self(),
> > +    };
> > +    IO_CODE();
> > +
> > +    bdrv_inc_in_flight(bs);
> > +    if (!drv || !drv->bdrv_co_zone_append) {
> > +        co.ret = -ENOTSUP;
> > +        goto out;
> > +    }
> > +    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> > +out:
> > +    bdrv_dec_in_flight(bs);
> > +    return co.ret;
> > +}
> > +
> >  void *qemu_blockalign(BlockDriverState *bs, size_t size)
> >  {
> >      IO_CODE();
> > diff --git a/block/raw-format.c b/block/raw-format.c
> > index b885688434..f132880c85 100644
> > --- a/block/raw-format.c
> > +++ b/block/raw-format.c
> > @@ -325,6 +325,12 @@ static int coroutine_fn 
> > raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >      return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
> >  }
> >
> > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, int64_t 
> > *offset,
> > +                                           QEMUIOVector *qiov,
> > +                                           BdrvRequestFlags flags) {
> > +    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> > +}
> > +
> >  static int64_t raw_getlength(BlockDriverState *bs)
> >  {
> >      int64_t len;
> > @@ -628,6 +634,7 @@ BlockDriver bdrv_raw = {
> >      .bdrv_co_pdiscard     = &raw_co_pdiscard,
> >      .bdrv_co_zone_report  = &raw_co_zone_report,
> >      .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
> > +    .bdrv_co_zone_append = &raw_co_zone_append,
> >      .bdrv_co_block_status = &raw_co_block_status,
> >      .bdrv_co_copy_range_from = &raw_co_copy_range_from,
> >      .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
> > diff --git a/include/block/block-io.h b/include/block/block-io.h
> > index f0cdf67d33..6a54453578 100644
> > --- a/include/block/block-io.h
> > +++ b/include/block/block-io.h
> > @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState 
> > *bs, int64_t offset,
> >                                       BlockZoneDescriptor *zones);
> >  int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >                                     int64_t offset, int64_t len);
> > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > +                                     QEMUIOVector *qiov,
> > +                                     BdrvRequestFlags flags);
> >
> >  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
> >  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> > diff --git a/include/block/block_int-common.h 
> > b/include/block/block_int-common.h
> > index 59c2d1316d..a7e7db5646 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -701,6 +701,9 @@ struct BlockDriver {
> >              BlockZoneDescriptor *zones);
> >      int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, 
> > BlockZoneOp op,
> >              int64_t offset, int64_t len);
> > +    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> > +            int64_t *offset, QEMUIOVector *qiov,
> > +            BdrvRequestFlags flags);
> >
> >      /* removable device specific */
> >      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> > index 3d26929cdd..f13cc1887b 100644
> > --- a/include/block/raw-aio.h
> > +++ b/include/block/raw-aio.h
> > @@ -31,6 +31,7 @@
> >  #define QEMU_AIO_TRUNCATE     0x0080
> >  #define QEMU_AIO_ZONE_REPORT  0x0100
> >  #define QEMU_AIO_ZONE_MGMT    0x0200
> > +#define QEMU_AIO_ZONE_APPEND  0x0400
> >  #define QEMU_AIO_TYPE_MASK \
> >          (QEMU_AIO_READ | \
> >           QEMU_AIO_WRITE | \
> > @@ -41,7 +42,8 @@
> >           QEMU_AIO_COPY_RANGE | \
> >           QEMU_AIO_TRUNCATE  | \
> >           QEMU_AIO_ZONE_REPORT | \
> > -         QEMU_AIO_ZONE_MGMT)
> > +         QEMU_AIO_ZONE_MGMT | \
> > +         QEMU_AIO_ZONE_APPEND)
> >
> >  /* AIO flags */
> >  #define QEMU_AIO_MISALIGNED   0x1000
> > diff --git a/include/sysemu/block-backend-io.h 
> > b/include/sysemu/block-backend-io.h
> > index 6835525582..33e35ae5d7 100644
> > --- a/include/sysemu/block-backend-io.h
> > +++ b/include/sysemu/block-backend-io.h
> > @@ -51,6 +51,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, 
> > int64_t offset,
> >  BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >                                int64_t offset, int64_t len,
> >                                BlockCompletionFunc *cb, void *opaque);
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> > +                                BlockCompletionFunc *cb, void *opaque);
> >  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t 
> > bytes,
> >                               BlockCompletionFunc *cb, void *opaque);
> >  void blk_aio_cancel_async(BlockAIOCB *acb);
> > @@ -172,6 +175,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> > BlockZoneOp op,
> >                                    int64_t offset, int64_t len);
> >  int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >                                         int64_t offset, int64_t len);
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                    QEMUIOVector *qiov,
> > +                                    BdrvRequestFlags flags);
> > +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t 
> > *offset,
> > +                                         QEMUIOVector *qiov,
> > +                                         BdrvRequestFlags flags);
> >
> >  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
> >                                        int64_t bytes);
>
> --
> Damien Le Moal
> Western Digital Research
>

Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices

Reply via email to