Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices

2022-10-13 Thread Damien Le Moal
On 2022/10/13 16:27, Sam Li wrote:
> Damien Le Moal  于2022年10月13日周四 13:55写道:
>>
>> On 10/10/22 11:33, Sam Li wrote:
>>> A zone append command is a write operation that specifies the first
>>> logical block of a zone as the write position. When writing to a zoned
>>> block device using zone append, the byte offset of writes is pointing
>>> to the write pointer of that zone. Upon completion the device will
>>> respond with the position the data has been written in the zone.
>>>
>>> Signed-off-by: Sam Li 
>>> ---
>>>  block/block-backend.c | 64 +++
>>>  block/file-posix.c| 64 ---
>>>  block/io.c| 21 ++
>>>  block/raw-format.c|  7 
>>>  include/block/block-io.h  |  3 ++
>>>  include/block/block_int-common.h  |  3 ++
>>>  include/block/raw-aio.h   |  4 +-
>>>  include/sysemu/block-backend-io.h |  9 +
>>>  8 files changed, 168 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>> index ddc569e3ac..bfdb719bc8 100644
>>> --- a/block/block-backend.c
>>> +++ b/block/block-backend.c
>>> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>>>  struct {
>>>  BlockZoneOp op;
>>>  } zone_mgmt;
>>> +struct {
>>> +int64_t *append_sector;
>>
>> I would call this "sector", since it will always be referenced as
>> "->zone_append.sector", you get the "append" for free :)
>>
>> That said, shouldn't this be a byte value, so called "offset" ? Not
>> entirely sure...
> 
> Yes, it can be changed to "offset"(byte) following QEMU's convention.
> Just need to add conversions to virtio_blk_zone_append/*_complete,
> which is easily done.
> 
>>
>>> +} zone_append;
>>>  };
>>>  } BlkRwCo;
>>>
>>> @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
>>> BlockZoneOp op,
>>>  return &acb->common;
>>>  }
>>>
>>> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
>>> +BlkAioEmAIOCB *acb = opaque;
>>> +BlkRwCo *rwco = &acb->rwco;
>>> +
>>> +rwco->ret = blk_co_zone_append(rwco->blk, 
>>> rwco->zone_append.append_sector,
>>> +   rwco->iobuf, rwco->flags);
>>> +blk_aio_complete(acb);
>>> +}
>>> +
>>> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
>>> +QEMUIOVector *qiov, BdrvRequestFlags flags,
>>> +BlockCompletionFunc *cb, void *opaque) {
>>> +BlkAioEmAIOCB *acb;
>>> +Coroutine *co;
>>> +IO_CODE();
>>> +
>>> +blk_inc_in_flight(blk);
>>> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
>>> +acb->rwco = (BlkRwCo) {
>>> +.blk= blk,
>>> +.ret= NOT_DONE,
>>> +.flags  = flags,
>>> +.iobuf  = qiov,
>>> +.zone_append = {
>>> +.append_sector = offset,
>>
>> See above comment. So since this is a byte value, this needs to be
>> called "offset", no ?
> 
> Yes, same answers above.
> 
>>
>>> +},
>>> +};
>>> +acb->has_returned = false;
>>> +
>>> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
>>> +bdrv_coroutine_enter(blk_bs(blk), co);
>>> +acb->has_returned = true;
>>> +if (acb->rwco.ret != NOT_DONE) {
>>> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
>>> + blk_aio_complete_bh, acb);
>>> +}
>>> +
>>> +return &acb->common;
>>> +}
>>> +
>>>  /*
>>>   * Send a zone_report command.
>>>   * offset is a byte offset from the start of the device. No alignment
>>> @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
>>> BlockZoneOp op,
>>>  return ret;
>>>  }
>>>
>>> +/*
>>> + * Send a zone_append command.
>>> + */
>>> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
>>> +QEMUIOVector *qiov, BdrvRequestFlags flags)
>>> +{
>>> +int ret;
>>> +IO_CODE();
>>> +
>>> +blk_inc_in_flight(blk);
>>> +blk_wait_while_drained(blk);
>>> +if (!blk_is_available(blk)) {
>>> +blk_dec_in_flight(blk);
>>> +return -ENOMEDIUM;
>>> +}
>>> +
>>> +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
>>> +blk_dec_in_flight(blk);
>>> +return ret;
>>> +}
>>> +
>>>  void blk_drain(BlockBackend *blk)
>>>  {
>>>  BlockDriverState *bs = blk_bs(blk);
>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>> index 17c0b58158..08ab164df4 100755
>>> --- a/block/file-posix.c
>>> +++ b/block/file-posix.c
>>> @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
>>> *aiocb)
>>>  ssize_t len;
>>>
>>>  do {
>>> -if (aiocb->aio_type & QEMU_AIO_WRITE)
>>> +if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>>>  len = qemu_pwritev(aiocb->aio_fildes,
>>>  

Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices

2022-10-13 Thread Sam Li
Damien Le Moal  于2022年10月13日周四 13:55写道:
>
> On 10/10/22 11:33, Sam Li wrote:
> > A zone append command is a write operation that specifies the first
> > logical block of a zone as the write position. When writing to a zoned
> > block device using zone append, the byte offset of writes is pointing
> > to the write pointer of that zone. Upon completion the device will
> > respond with the position the data has been written in the zone.
> >
> > Signed-off-by: Sam Li 
> > ---
> >  block/block-backend.c | 64 +++
> >  block/file-posix.c| 64 ---
> >  block/io.c| 21 ++
> >  block/raw-format.c|  7 
> >  include/block/block-io.h  |  3 ++
> >  include/block/block_int-common.h  |  3 ++
> >  include/block/raw-aio.h   |  4 +-
> >  include/sysemu/block-backend-io.h |  9 +
> >  8 files changed, 168 insertions(+), 7 deletions(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index ddc569e3ac..bfdb719bc8 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
> >  struct {
> >  BlockZoneOp op;
> >  } zone_mgmt;
> > +struct {
> > +int64_t *append_sector;
>
> I would call this "sector", since it will always be referenced as
> "->zone_append.sector", you get the "append" for free :)
>
> That said, shouldn't this be a byte value, so called "offset" ? Not
> entirely sure...

Yes, it can be changed to "offset"(byte) following QEMU's convention.
Just need to add conversions to virtio_blk_zone_append/*_complete,
which is easily done.

>
> > +} zone_append;
> >  };
> >  } BlkRwCo;
> >
> > @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> > BlockZoneOp op,
> >  return &acb->common;
> >  }
> >
> > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
> > +BlkAioEmAIOCB *acb = opaque;
> > +BlkRwCo *rwco = &acb->rwco;
> > +
> > +rwco->ret = blk_co_zone_append(rwco->blk, 
> > rwco->zone_append.append_sector,
> > +   rwco->iobuf, rwco->flags);
> > +blk_aio_complete(acb);
> > +}
> > +
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > +QEMUIOVector *qiov, BdrvRequestFlags flags,
> > +BlockCompletionFunc *cb, void *opaque) {
> > +BlkAioEmAIOCB *acb;
> > +Coroutine *co;
> > +IO_CODE();
> > +
> > +blk_inc_in_flight(blk);
> > +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> > +acb->rwco = (BlkRwCo) {
> > +.blk= blk,
> > +.ret= NOT_DONE,
> > +.flags  = flags,
> > +.iobuf  = qiov,
> > +.zone_append = {
> > +.append_sector = offset,
>
> See above comment. So since this is a byte value, this needs to be
> called "offset", no ?

Yes, same answers above.

>
> > +},
> > +};
> > +acb->has_returned = false;
> > +
> > +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> > +bdrv_coroutine_enter(blk_bs(blk), co);
> > +acb->has_returned = true;
> > +if (acb->rwco.ret != NOT_DONE) {
> > +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> > + blk_aio_complete_bh, acb);
> > +}
> > +
> > +return &acb->common;
> > +}
> > +
> >  /*
> >   * Send a zone_report command.
> >   * offset is a byte offset from the start of the device. No alignment
> > @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> > BlockZoneOp op,
> >  return ret;
> >  }
> >
> > +/*
> > + * Send a zone_append command.
> > + */
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > +QEMUIOVector *qiov, BdrvRequestFlags flags)
> > +{
> > +int ret;
> > +IO_CODE();
> > +
> > +blk_inc_in_flight(blk);
> > +blk_wait_while_drained(blk);
> > +if (!blk_is_available(blk)) {
> > +blk_dec_in_flight(blk);
> > +return -ENOMEDIUM;
> > +}
> > +
> > +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> > +blk_dec_in_flight(blk);
> > +return ret;
> > +}
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >  BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 17c0b58158..08ab164df4 100755
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
> > *aiocb)
> >  ssize_t len;
> >
> >  do {
> > -if (aiocb->aio_type & QEMU_AIO_WRITE)
> > +if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
> >  len = qemu_pwritev(aiocb->aio_fildes,
> > aiocb->io.iov,
> > 

Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices

2022-10-12 Thread Damien Le Moal
On 10/10/22 11:33, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c | 64 +++
>  block/file-posix.c| 64 ---
>  block/io.c| 21 ++
>  block/raw-format.c|  7 
>  include/block/block-io.h  |  3 ++
>  include/block/block_int-common.h  |  3 ++
>  include/block/raw-aio.h   |  4 +-
>  include/sysemu/block-backend-io.h |  9 +
>  8 files changed, 168 insertions(+), 7 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index ddc569e3ac..bfdb719bc8 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>  struct {
>  BlockZoneOp op;
>  } zone_mgmt;
> +struct {
> +int64_t *append_sector;

I would call this "sector", since it will always be referenced as
"->zone_append.sector", you get the "append" for free :)

That said, shouldn't this be a byte value, so called "offset" ? Not
entirely sure...

> +} zone_append;
>  };
>  } BlkRwCo;
>  
> @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_append(rwco->blk, 
> rwco->zone_append.append_sector,
> +   rwco->iobuf, rwco->flags);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags,
> +BlockCompletionFunc *cb, void *opaque) {
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.ret= NOT_DONE,
> +.flags  = flags,
> +.iobuf  = qiov,
> +.zone_append = {
> +.append_sector = offset,

See above comment. So since this is a byte value, this needs to be
called "offset", no ?

> +},
> +};
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +bdrv_coroutine_enter(blk_bs(blk), co);
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +int ret;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +blk_wait_while_drained(blk);
> +if (!blk_is_available(blk)) {
> +blk_dec_in_flight(blk);
> +return -ENOMEDIUM;
> +}
> +
> +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 17c0b58158..08ab164df4 100755
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
> *aiocb)
>  ssize_t len;
>  
>  do {
> -if (aiocb->aio_type & QEMU_AIO_WRITE)
> +if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>  len = qemu_pwritev(aiocb->aio_fildes,
> aiocb->io.iov,
> aiocb->io.niov,

Hu... You are issuing the io for a zone append without first changing
the aiocb offset to be equal to the zone write pointer ? And you are
calling this without the wps->lock held... Changing the aio offset to be
equal to the wp && issuing the io must be atomic.

> @@ -1687,7 +1687,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData 
> *aiocb, char *buf)
>  ssize_t len;
>  
>  while (offset < aiocb->aio_nbytes) {
> -if (aiocb->a

[PATCH v3 2/3] block: introduce zone append write for zoned devices

2022-10-09 Thread Sam Li
A zone append command is a write operation that specifies the first
logical block of a zone as the write position. When writing to a zoned
block device using zone append, the byte offset of writes is pointing
to the write pointer of that zone. Upon completion the device will
respond with the position the data has been written in the zone.

Signed-off-by: Sam Li 
---
 block/block-backend.c | 64 +++
 block/file-posix.c| 64 ---
 block/io.c| 21 ++
 block/raw-format.c|  7 
 include/block/block-io.h  |  3 ++
 include/block/block_int-common.h  |  3 ++
 include/block/raw-aio.h   |  4 +-
 include/sysemu/block-backend-io.h |  9 +
 8 files changed, 168 insertions(+), 7 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index ddc569e3ac..bfdb719bc8 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
 struct {
 BlockZoneOp op;
 } zone_mgmt;
+struct {
+int64_t *append_sector;
+} zone_append;
 };
 } BlkRwCo;
 
@@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
BlockZoneOp op,
 return &acb->common;
 }
 
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
+BlkAioEmAIOCB *acb = opaque;
+BlkRwCo *rwco = &acb->rwco;
+
+rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
+   rwco->iobuf, rwco->flags);
+blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+QEMUIOVector *qiov, BdrvRequestFlags flags,
+BlockCompletionFunc *cb, void *opaque) {
+BlkAioEmAIOCB *acb;
+Coroutine *co;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+acb->rwco = (BlkRwCo) {
+.blk= blk,
+.ret= NOT_DONE,
+.flags  = flags,
+.iobuf  = qiov,
+.zone_append = {
+.append_sector = offset,
+},
+};
+acb->has_returned = false;
+
+co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+bdrv_coroutine_enter(blk_bs(blk), co);
+acb->has_returned = true;
+if (acb->rwco.ret != NOT_DONE) {
+replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+}
+
+return &acb->common;
+}
+
 /*
  * Send a zone_report command.
  * offset is a byte offset from the start of the device. No alignment
@@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
BlockZoneOp op,
 return ret;
 }
 
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+int ret;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+blk_wait_while_drained(blk);
+if (!blk_is_available(blk)) {
+blk_dec_in_flight(blk);
+return -ENOMEDIUM;
+}
+
+ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+blk_dec_in_flight(blk);
+return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
 BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 17c0b58158..08ab164df4 100755
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
*aiocb)
 ssize_t len;
 
 do {
-if (aiocb->aio_type & QEMU_AIO_WRITE)
+if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
 len = qemu_pwritev(aiocb->aio_fildes,
aiocb->io.iov,
aiocb->io.niov,
@@ -1687,7 +1687,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData 
*aiocb, char *buf)
 ssize_t len;
 
 while (offset < aiocb->aio_nbytes) {
-if (aiocb->aio_type & QEMU_AIO_WRITE) {
+if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
 len = pwrite(aiocb->aio_fildes,
  (const char *)buf + offset,
  aiocb->aio_nbytes - offset,
@@ -1731,7 +1731,7 @@ static int handle_aiocb_rw(void *opaque)
  * The offset of regular writes, append writes is the wp location
  * of that zone.
  */
-if (aiocb->aio_type & QEMU_AIO_WRITE) {
+if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
 if (aiocb->bs->bl.zone_size > 0) {
 BlockZoneWps *wps = aiocb->bs->bl.wps;
 qemu_mutex_lock(&wps->lock);
@@ -1794,7 +1794,7 @@ static int handle_aiocb_rw(void *opaque)
 }
 
 nbytes = handle_aiocb_rw_linear(aiocb, buf);
-if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+if (!(aiocb-