Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices

2022-10-18 Thread Sam Li
Damien Le Moal  于2022年10月17日周一 13:22写道:
>
> On 10/16/22 23:56, Sam Li wrote:
> > A zone append command is a write operation that specifies the first
> > logical block of a zone as the write position. When writing to a zoned
> > block device using zone append, the byte offset of writes is pointing
> > to the write pointer of that zone. Upon completion the device will
> > respond with the position the data has been written in the zone.
> >
> > Signed-off-by: Sam Li 
> > ---
> >  block/block-backend.c | 65 ++
> >  block/file-posix.c| 89 +--
> >  block/io.c| 21 
> >  block/raw-format.c|  8 +++
> >  include/block/block-io.h  |  3 ++
> >  include/block/block_int-common.h  |  5 ++
> >  include/block/raw-aio.h   |  4 +-
> >  include/sysemu/block-backend-io.h |  9 
> >  8 files changed, 198 insertions(+), 6 deletions(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index 1c618e9c68..06931ddd24 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
> >  struct {
> >  unsigned long op;
> >  } zone_mgmt;
> > +struct {
> > +int64_t *append_sector;
>
> As mentioned previosuly, call this sector. "append" is already in the
> zone_append struct member name

Right, missed it here. I'll use "offset" then since it's in block layer.

>
> > +} zone_append;
> >  };
> >  } BlkRwCo;
> >
> > @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> > BlockZoneOp op,
> >  return &acb->common;
> >  }
> >
> > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> > +{
> > +BlkAioEmAIOCB *acb = opaque;
> > +BlkRwCo *rwco = &acb->rwco;
> > +
> > +rwco->ret = blk_co_zone_append(rwco->blk, 
> > rwco->zone_append.append_sector,
>
> ...so you avoid awkward repetitions of "append" like here. You'll have:
> rwco->zone_append.sector, which is shorter and more natural.
>
> > +   rwco->iobuf, rwco->flags);
> > +blk_aio_complete(acb);
> > +}
> > +
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > +QEMUIOVector *qiov, BdrvRequestFlags flags,
> > +BlockCompletionFunc *cb, void *opaque) {
> > +BlkAioEmAIOCB *acb;
> > +Coroutine *co;
> > +IO_CODE();
> > +
> > +blk_inc_in_flight(blk);
> > +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> > +acb->rwco = (BlkRwCo) {
> > +.blk= blk,
> > +.ret= NOT_DONE,
> > +.flags  = flags,
> > +.iobuf  = qiov,
> > +.zone_append = {
> > +.append_sector = offset,
> > +},
> > +};
> > +acb->has_returned = false;
> > +
> > +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> > +bdrv_coroutine_enter(blk_bs(blk), co);
> > +acb->has_returned = true;
> > +if (acb->rwco.ret != NOT_DONE) {
> > +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> > + blk_aio_complete_bh, acb);
> > +}
> > +
> > +return &acb->common;
> > +}
> > +
> >  /*
> >   * Send a zone_report command.
> >   * offset is a byte offset from the start of the device. No alignment
> > @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> > BlockZoneOp op,
> >  return ret;
> >  }
> >
> > +/*
> > + * Send a zone_append command.
> > + */
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > +QEMUIOVector *qiov, BdrvRequestFlags flags)
> > +{
> > +int ret;
> > +IO_CODE();
> > +
> > +blk_inc_in_flight(blk);
> > +blk_wait_while_drained(blk);
> > +if (!blk_is_available(blk)) {
> > +blk_dec_in_flight(blk);
> > +return -ENOMEDIUM;
> > +}
> > +
> > +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> > +blk_dec_in_flight(blk);
> > +return ret;
> > +}
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >  BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 5ff5500301..3d0cc33d02 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
> >  struct {
> >  struct iovec *iov;
> >  int niov;
> > +int64_t *offset;
> >  } io;
> >  struct {
> >  uint64_t cmd;
> > @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> > Error **errp)
> >  bs->bl.max_active_zones = ret;
> >  }
> >
> > +ret = get_sysfs_long_val(&st, "physical_block_size");
> > +if (ret >= 0) {
> > +bs->bl.write_granularity = ret;
> > +  

Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices

2022-10-16 Thread Damien Le Moal
On 10/16/22 23:56, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c | 65 ++
>  block/file-posix.c| 89 +--
>  block/io.c| 21 
>  block/raw-format.c|  8 +++
>  include/block/block-io.h  |  3 ++
>  include/block/block_int-common.h  |  5 ++
>  include/block/raw-aio.h   |  4 +-
>  include/sysemu/block-backend-io.h |  9 
>  8 files changed, 198 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 1c618e9c68..06931ddd24 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>  struct {
>  unsigned long op;
>  } zone_mgmt;
> +struct {
> +int64_t *append_sector;

As mentioned previosuly, call this sector. "append" is already in the
zone_append struct member name

> +} zone_append;
>  };
>  } BlkRwCo;
>  
> @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_append(rwco->blk, 
> rwco->zone_append.append_sector,

...so you avoid awkward repetitions of "append" like here. You'll have:
rwco->zone_append.sector, which is shorter and more natural.

> +   rwco->iobuf, rwco->flags);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags,
> +BlockCompletionFunc *cb, void *opaque) {
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.ret= NOT_DONE,
> +.flags  = flags,
> +.iobuf  = qiov,
> +.zone_append = {
> +.append_sector = offset,
> +},
> +};
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +bdrv_coroutine_enter(blk_bs(blk), co);
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +int ret;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +blk_wait_while_drained(blk);
> +if (!blk_is_available(blk)) {
> +blk_dec_in_flight(blk);
> +return -ENOMEDIUM;
> +}
> +
> +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 5ff5500301..3d0cc33d02 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
>  struct {
>  struct iovec *iov;
>  int niov;
> +int64_t *offset;
>  } io;
>  struct {
>  uint64_t cmd;
> @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  bs->bl.max_active_zones = ret;
>  }
>  
> +ret = get_sysfs_long_val(&st, "physical_block_size");
> +if (ret >= 0) {
> +bs->bl.write_granularity = ret;
> +}
> +
>  bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>  if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
>  error_report("report wps failed");
> @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int 
> nr_iov, off_t offset)
>  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>  {
>  ssize_t 

Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices

2022-10-16 Thread Dmitry Fomichev
On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c | 65 ++
>  block/file-posix.c    | 89 +--
>  block/io.c    | 21 
>  block/raw-format.c    |  8 +++
>  include/block/block-io.h  |  3 ++
>  include/block/block_int-common.h  |  5 ++
>  include/block/raw-aio.h   |  4 +-
>  include/sysemu/block-backend-io.h |  9 
>  8 files changed, 198 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 1c618e9c68..06931ddd24 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>  struct {
>  unsigned long op;
>  } zone_mgmt;
> +    struct {
> +    int64_t *append_sector;
> +    } zone_append;
>  };
>  } BlkRwCo;
>  
> @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, 
> rwco->zone_append.append_sector,
> +   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +    QEMUIOVector *qiov, BdrvRequestFlags flags,
> +    BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +    .blk    = blk,
> +    .ret    = NOT_DONE,
> +    .flags  = flags,
> +    .iobuf  = qiov,
> +    .zone_append = {
> +    .append_sector = offset,
> +    },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +    QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +    blk_dec_in_flight(blk);
> +    return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 5ff5500301..3d0cc33d02 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
>  struct {
>  struct iovec *iov;
>  int niov;
> +    int64_t *offset;
>  } io;
>  struct {
>  uint64_t cmd;
> @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>  bs->bl.max_active_zones = ret;
>  }
>  
> +    ret = get_sysfs_long_val(&st, "physical_block_size");
> +    if (ret >= 0) {
> +    bs->bl.write_granularity = ret;
> +    }
> +
>  bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>  if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
>  error_report("report wps failed");
> @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int
> nr_iov, off_t offset)
>  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>  {
>  ssize_t len;
> +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;

Can this code ever be called for a non-zoned device with 0 zone size?
If yes, you need to avoid division by zero h

[PATCH v4 2/3] block: introduce zone append write for zoned devices

2022-10-16 Thread Sam Li
A zone append command is a write operation that specifies the first
logical block of a zone as the write position. When writing to a zoned
block device using zone append, the byte offset of writes is pointing
to the write pointer of that zone. Upon completion the device will
respond with the position the data has been written in the zone.

Signed-off-by: Sam Li 
---
 block/block-backend.c | 65 ++
 block/file-posix.c| 89 +--
 block/io.c| 21 
 block/raw-format.c|  8 +++
 include/block/block-io.h  |  3 ++
 include/block/block_int-common.h  |  5 ++
 include/block/raw-aio.h   |  4 +-
 include/sysemu/block-backend-io.h |  9 
 8 files changed, 198 insertions(+), 6 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 1c618e9c68..06931ddd24 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
 struct {
 unsigned long op;
 } zone_mgmt;
+struct {
+int64_t *append_sector;
+} zone_append;
 };
 } BlkRwCo;
 
@@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
BlockZoneOp op,
 return &acb->common;
 }
 
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
+{
+BlkAioEmAIOCB *acb = opaque;
+BlkRwCo *rwco = &acb->rwco;
+
+rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
+   rwco->iobuf, rwco->flags);
+blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+QEMUIOVector *qiov, BdrvRequestFlags flags,
+BlockCompletionFunc *cb, void *opaque) {
+BlkAioEmAIOCB *acb;
+Coroutine *co;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+acb->rwco = (BlkRwCo) {
+.blk= blk,
+.ret= NOT_DONE,
+.flags  = flags,
+.iobuf  = qiov,
+.zone_append = {
+.append_sector = offset,
+},
+};
+acb->has_returned = false;
+
+co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+bdrv_coroutine_enter(blk_bs(blk), co);
+acb->has_returned = true;
+if (acb->rwco.ret != NOT_DONE) {
+replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+}
+
+return &acb->common;
+}
+
 /*
  * Send a zone_report command.
  * offset is a byte offset from the start of the device. No alignment
@@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
BlockZoneOp op,
 return ret;
 }
 
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+int ret;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+blk_wait_while_drained(blk);
+if (!blk_is_available(blk)) {
+blk_dec_in_flight(blk);
+return -ENOMEDIUM;
+}
+
+ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+blk_dec_in_flight(blk);
+return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
 BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 5ff5500301..3d0cc33d02 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
 struct {
 struct iovec *iov;
 int niov;
+int64_t *offset;
 } io;
 struct {
 uint64_t cmd;
@@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 bs->bl.max_active_zones = ret;
 }
 
+ret = get_sysfs_long_val(&st, "physical_block_size");
+if (ret >= 0) {
+bs->bl.write_granularity = ret;
+}
+
 bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
 if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
 error_report("report wps failed");
@@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int 
nr_iov, off_t offset)
 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
 {
 ssize_t len;
+BlockZoneWps *wps = aiocb->bs->bl.wps;
+int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+
+if (wps) {
+qemu_mutex_lock(&wps->lock);
+if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
+aiocb->aio_offset = wps->wp[index];
+}
+}
 
 do {
-if (aiocb->aio_type & QEMU_AIO_WRITE)
+if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
 len = qemu_pwritev(aiocb->aio_fildes,
aiocb->io.iov,
aiocb->i