On 9/29/22 18:31, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of the write is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li <faithilike...@gmail.com>
> ---
>  block/block-backend.c              | 65 ++++++++++++++++++++++++++++++
>  block/file-posix.c                 | 51 +++++++++++++++++++++++
>  block/io.c                         | 21 ++++++++++
>  block/raw-format.c                 |  7 ++++
>  include/block/block-io.h           |  3 ++
>  include/block/block_int-common.h   |  3 ++
>  include/sysemu/block-backend-io.h  |  9 +++++
>  qemu-io-cmds.c                     | 62 ++++++++++++++++++++++++++++
>  tests/qemu-iotests/tests/zoned.out |  7 ++++
>  tests/qemu-iotests/tests/zoned.sh  |  9 +++++
>  10 files changed, 237 insertions(+)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index f7f7acd6f4..07a8632af1 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>          struct {
>              BlockZoneOp op;
>          } zone_mgmt;
> +        struct {
> +            int64_t *append_sector;
> +        } zone_append;
>      };
>  } BlkRwCo;
>  
> @@ -1869,6 +1872,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>      return &acb->common;
>  }
>  
> +static void blk_aio_zone_append_entry(void *opaque) {
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, 
> rwco->zone_append.append_sector,
> +                                   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +        .blk    = blk,
> +        .ret    = NOT_DONE,
> +        .flags  = flags,
> +        .iobuf  = qiov,
> +        .zone_append = {
> +                .append_sector = offset,
> +        },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> +                                         blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1921,6 +1965,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>      return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +        QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +        blk_dec_in_flight(blk);
> +        return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 33e81ac112..24b70f1afe 100755
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -3454,6 +3454,56 @@ static int coroutine_fn 
> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>  #endif
>  }
>  
> +

whiteline change.

> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> +                                           int64_t *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +#if defined(CONFIG_BLKZONED)
> +    BDRVRawState *s = bs->opaque;
> +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> +    int64_t iov_len = 0;
> +    int64_t len = 0;
> +    RawPosixAIOData acb;
> +
> +    if (*offset & zone_size_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    int64_t wg = bs->bl.write_granularity;
> +    int64_t wg_mask = wg - 1;
> +    for (int i = 0; i < qiov->niov; i++) {
> +       iov_len = qiov->iov[i].iov_len;
> +       if (iov_len & wg_mask) {
> +           error_report("len of IOVector[%d] %" PRId64 " is not aligned to 
> block "
> +                        "size %" PRId64 "", i, iov_len, wg);
> +           return -EINVAL;
> +       }
> +       len += iov_len;
> +    }
> +
> +    acb = (RawPosixAIOData) {
> +        .bs = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type = QEMU_AIO_ZONE_APPEND,
> +        .aio_offset = bs->bl.wps->wp[*offset / bs->bl.zone_size],

You cannot read the wps array without holding wps->lock, and the lock MUST
be held until the aio is issued. Otherwise, if another aio is issued after
the above line, the 2 AIOs may end up being issued in reverse order,
causing an unaligned write error.

Given this, I would change this to issue this aio without changing the
offset and let the lowest level function in file-posix doing the
conversion from zone append to regular write, incrementing the wp and
issue the write, all with wps lock held.

> +        .aio_nbytes = len,
> +        .io = {
> +                .iov = qiov->iov,
> +                .niov = qiov->niov,
> +                .wps = bs->bl.wps,
> +                .append_sector = offset,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
>  static coroutine_fn int
>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                  bool blkdev)
> @@ -4229,6 +4279,7 @@ static BlockDriver bdrv_zoned_host_device = {
>      /* zone management operations */
>      .bdrv_co_zone_report = raw_co_zone_report,
>      .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = raw_co_zone_append,
>  };
>  #endif
>  
> diff --git a/block/io.c b/block/io.c
> index 5ab2d169c8..b9dfdf0709 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3239,6 +3239,27 @@ out:
>      return co.ret;
>  }
>  
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                        QEMUIOVector *qiov,
> +                        BdrvRequestFlags flags)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +            .coroutine = qemu_coroutine_self(),
> +    };
> +    IO_CODE();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_zone_append) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>  {
>      IO_CODE();
> diff --git a/block/raw-format.c b/block/raw-format.c
> index 9441536819..df8cc33467 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -325,6 +325,12 @@ static int coroutine_fn 
> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>      return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
>  }
>  
> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, int64_t 
> *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> +}
> +
>  static int64_t raw_getlength(BlockDriverState *bs)
>  {
>      int64_t len;
> @@ -628,6 +634,7 @@ BlockDriver bdrv_raw = {
>      .bdrv_co_pdiscard     = &raw_co_pdiscard,
>      .bdrv_co_zone_report  = &raw_co_zone_report,
>      .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = &raw_co_zone_append,
>      .bdrv_co_block_status = &raw_co_block_status,
>      .bdrv_co_copy_range_from = &raw_co_copy_range_from,
>      .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index 65463b88d9..a792164018 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, 
> int64_t offset,
>                                       BlockZoneDescriptor *zones);
>  int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>                                     int64_t offset, int64_t len);
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                                     QEMUIOVector *qiov,
> +                                     BdrvRequestFlags flags);
>  
>  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> diff --git a/include/block/block_int-common.h 
> b/include/block/block_int-common.h
> index 59c2d1316d..a7e7db5646 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -701,6 +701,9 @@ struct BlockDriver {
>              BlockZoneDescriptor *zones);
>      int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp 
> op,
>              int64_t offset, int64_t len);
> +    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> +            int64_t *offset, QEMUIOVector *qiov,
> +            BdrvRequestFlags flags);
>  
>      /* removable device specific */
>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> diff --git a/include/sysemu/block-backend-io.h 
> b/include/sysemu/block-backend-io.h
> index 6835525582..33e35ae5d7 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -51,6 +51,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t 
> offset,
>  BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                int64_t offset, int64_t len,
>                                BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque);
>  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t 
> bytes,
>                               BlockCompletionFunc *cb, void *opaque);
>  void blk_aio_cancel_async(BlockAIOCB *acb);
> @@ -172,6 +175,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>                                    int64_t offset, int64_t len);
>  int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                         int64_t offset, int64_t len);
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +                                    QEMUIOVector *qiov,
> +                                    BdrvRequestFlags flags);
> +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
> +                                         QEMUIOVector *qiov,
> +                                         BdrvRequestFlags flags);
>  
>  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
>                                        int64_t bytes);
> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c

Hmmm... may be move the test stuff to a 3rd patch to facilitate review
(and make each patch smaller) ?

> index e56c8d1c30..6cb86de35b 100644
> --- a/qemu-io-cmds.c
> +++ b/qemu-io-cmds.c
> @@ -1855,6 +1855,67 @@ static const cmdinfo_t zone_reset_cmd = {
>      .oneline = "reset a zone write pointer in zone block device",
>  };
>  
> +static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
> +                              int64_t *offset, int flags, int *total)
> +{
> +    int async_ret = NOT_DONE;
> +
> +    blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &async_ret);
> +    while (async_ret == NOT_DONE) {
> +        main_loop_wait(false);
> +    }
> +
> +    *total = qiov->size;
> +    return async_ret < 0 ? async_ret : 1;
> +}
> +
> +static int zone_append_f(BlockBackend *blk, int argc, char **argv) {
> +    int ret;
> +    int flags = 0;
> +    int total = 0;
> +    int64_t offset;
> +    char *buf;
> +    int nr_iov;
> +    int pattern = 0xcd;
> +    QEMUIOVector qiov;
> +
> +    if (optind > argc - 2) {
> +        return -EINVAL;
> +    }
> +    optind++;
> +    offset = cvtnum(argv[optind]);
> +    if (offset < 0) {
> +        print_cvtnum_err(offset, argv[optind]);
> +        return offset;
> +    }
> +    optind++;
> +    nr_iov = argc - optind;
> +    buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern);
> +    if (buf == NULL) {
> +        return -EINVAL;
> +    }
> +    ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total);
> +    if (ret < 0) {
> +        printf("zone append failed: %s\n", strerror(-ret));
> +        goto out;
> +    }
> +
> +    out:
> +    qemu_iovec_destroy(&qiov);
> +    qemu_io_free(buf);
> +    return ret;
> +}
> +
> +static const cmdinfo_t zone_append_cmd = {
> +    .name = "zone_append",
> +    .altname = "zap",
> +    .cfunc = zone_append_f,
> +    .argmin = 3,
> +    .argmax = 3,
> +    .args = "offset len [len..]",
> +    .oneline = "append write a number of bytes at a specified offset",
> +};
> +
>  static int truncate_f(BlockBackend *blk, int argc, char **argv);
>  static const cmdinfo_t truncate_cmd = {
>      .name       = "truncate",
> @@ -2652,6 +2713,7 @@ static void __attribute((constructor)) 
> init_qemuio_commands(void)
>      qemuio_add_command(&zone_close_cmd);
>      qemuio_add_command(&zone_finish_cmd);
>      qemuio_add_command(&zone_reset_cmd);
> +    qemuio_add_command(&zone_append_cmd);
>      qemuio_add_command(&truncate_cmd);
>      qemuio_add_command(&length_cmd);
>      qemuio_add_command(&info_cmd);
> diff --git a/tests/qemu-iotests/tests/zoned.out 
> b/tests/qemu-iotests/tests/zoned.out
> index 0c8f96deb9..b3b139b4ec 100644
> --- a/tests/qemu-iotests/tests/zoned.out
> +++ b/tests/qemu-iotests/tests/zoned.out
> @@ -50,4 +50,11 @@ start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, 
> zcond:14, [type: 2]
>  (5) resetting the second zone
>  After resetting a zone:
>  start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
> +
> +
> +(6) append write
> +After appending the first zone:
> +start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2]
> +After appending the second zone:
> +start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2]
>  *** done
> diff --git a/tests/qemu-iotests/tests/zoned.sh 
> b/tests/qemu-iotests/tests/zoned.sh
> index fced0194c5..888711eef2 100755
> --- a/tests/qemu-iotests/tests/zoned.sh
> +++ b/tests/qemu-iotests/tests/zoned.sh
> @@ -79,6 +79,15 @@ echo "(5) resetting the second zone"
>  sudo $QEMU_IO $IMG -c "zrs 268435456 268435456"
>  echo "After resetting a zone:"
>  sudo $QEMU_IO $IMG -c "zrp 268435456 1"
> +echo
> +echo
> +echo "(6) append write" # physical block size of the device is 4096
> +sudo $QEMU_IO $IMG -c "zap 0 0x1000 0x2000"
> +echo "After appending the first zone:"
> +sudo $QEMU_IO $IMG -c "zrp 0 1"
> +sudo $QEMU_IO $IMG -c "zap 268435456 0x1000 0x2000"
> +echo "After appending the second zone:"
> +sudo $QEMU_IO $IMG -c "zrp 268435456 1"
>  
>  # success, all done
>  echo "*** done"

-- 
Damien Le Moal
Western Digital Research


Reply via email to