Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices

2022-10-13 Thread Damien Le Moal
On 2022/10/13 16:27, Sam Li wrote:
> Damien Le Moal  于2022年10月13日周四 13:55写道:
>>
>> On 10/10/22 11:33, Sam Li wrote:
>>> A zone append command is a write operation that specifies the first
>>> logical block of a zone as the write position. When writing to a zoned
>>> block device using zone append, the byte offset of writes is pointing
>>> to the write pointer of that zone. Upon completion the device will
>>> respond with the position the data has been written in the zone.
>>>
>>> Signed-off-by: Sam Li 
>>> ---
>>>  block/block-backend.c | 64 +++
>>>  block/file-posix.c| 64 ---
>>>  block/io.c| 21 ++
>>>  block/raw-format.c|  7 
>>>  include/block/block-io.h  |  3 ++
>>>  include/block/block_int-common.h  |  3 ++
>>>  include/block/raw-aio.h   |  4 +-
>>>  include/sysemu/block-backend-io.h |  9 +
>>>  8 files changed, 168 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>> index ddc569e3ac..bfdb719bc8 100644
>>> --- a/block/block-backend.c
>>> +++ b/block/block-backend.c
>>> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>>>  struct {
>>>  BlockZoneOp op;
>>>  } zone_mgmt;
>>> +struct {
>>> +int64_t *append_sector;
>>
>> I would call this "sector", since it will always be referenced as
>> "->zone_append.sector", you get the "append" for free :)
>>
>> That said, shouldn't this be a byte value, so called "offset" ? Not
>> entirely sure...
> 
> Yes, it can be changed to "offset"(byte) following QEMU's convention.
> Just need to add conversions to virtio_blk_zone_append/*_complete,
> which is easily done.
> 
>>
>>> +} zone_append;
>>>  };
>>>  } BlkRwCo;
>>>
>>> @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
>>> BlockZoneOp op,
>>>  return &acb->common;
>>>  }
>>>
>>> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
>>> +BlkAioEmAIOCB *acb = opaque;
>>> +BlkRwCo *rwco = &acb->rwco;
>>> +
>>> +rwco->ret = blk_co_zone_append(rwco->blk, 
>>> rwco->zone_append.append_sector,
>>> +   rwco->iobuf, rwco->flags);
>>> +blk_aio_complete(acb);
>>> +}
>>> +
>>> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
>>> +QEMUIOVector *qiov, BdrvRequestFlags flags,
>>> +BlockCompletionFunc *cb, void *opaque) {
>>> +BlkAioEmAIOCB *acb;
>>> +Coroutine *co;
>>> +IO_CODE();
>>> +
>>> +blk_inc_in_flight(blk);
>>> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
>>> +acb->rwco = (BlkRwCo) {
>>> +.blk= blk,
>>> +.ret= NOT_DONE,
>>> +.flags  = flags,
>>> +.iobuf  = qiov,
>>> +.zone_append = {
>>> +.append_sector = offset,
>>
>> See above comment. So since this is a byte value, this needs to be
>> called "offset", no ?
> 
> Yes, same answers above.
> 
>>
>>> +},
>>> +};
>>> +acb->has_returned = false;
>>> +
>>> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
>>> +bdrv_coroutine_enter(blk_bs(blk), co);
>>> +acb->has_returned = true;
>>> +if (acb->rwco.ret != NOT_DONE) {
>>> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
>>> + blk_aio_complete_bh, acb);
>>> +}
>>> +
>>> +return &acb->common;
>>> +}
>>> +
>>>  /*
>>>   * Send a zone_report command.
>>>   * offset is a byte offset from the start of the device. No alignment
>>> @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
>>> BlockZoneOp op,
>>>  return ret;
>>>  }
>>>
>>> +/*
>>> + * Send a zone_append command.
>>> + */
>>> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
>>> +QEMUIOVector *qiov, BdrvRequestFlags flags)
>>> +{
>>> +int ret;
>>> +IO_CODE();
>>> +
>>> +blk_inc_in_flight(blk);
>>> +blk_wait_while_drained(blk);
>>> +if (!blk_is_available(blk)) {
>>> +blk_dec_in_flight(blk);
>>> +return -ENOMEDIUM;
>>> +}
>>> +
>>> +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
>>> +blk_dec_in_flight(blk);
>>> +return ret;
>>> +}
>>> +
>>>  void blk_drain(BlockBackend *blk)
>>>  {
>>>  BlockDriverState *bs = blk_bs(blk);
>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>> index 17c0b58158..08ab164df4 100755
>>> --- a/block/file-posix.c
>>> +++ b/block/file-posix.c
>>> @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
>>> *aiocb)
>>>  ssize_t len;
>>>
>>>  do {
>>> -if (aiocb->aio_type & QEMU_AIO_WRITE)
>>> +if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>>>  len = qemu_pwritev(aiocb->aio_fildes,
>>>  

Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices

2022-10-13 Thread Sam Li
Damien Le Moal  于2022年10月13日周四 13:55写道:
>
> On 10/10/22 11:33, Sam Li wrote:
> > A zone append command is a write operation that specifies the first
> > logical block of a zone as the write position. When writing to a zoned
> > block device using zone append, the byte offset of writes is pointing
> > to the write pointer of that zone. Upon completion the device will
> > respond with the position the data has been written in the zone.
> >
> > Signed-off-by: Sam Li 
> > ---
> >  block/block-backend.c | 64 +++
> >  block/file-posix.c| 64 ---
> >  block/io.c| 21 ++
> >  block/raw-format.c|  7 
> >  include/block/block-io.h  |  3 ++
> >  include/block/block_int-common.h  |  3 ++
> >  include/block/raw-aio.h   |  4 +-
> >  include/sysemu/block-backend-io.h |  9 +
> >  8 files changed, 168 insertions(+), 7 deletions(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index ddc569e3ac..bfdb719bc8 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
> >  struct {
> >  BlockZoneOp op;
> >  } zone_mgmt;
> > +struct {
> > +int64_t *append_sector;
>
> I would call this "sector", since it will always be referenced as
> "->zone_append.sector", you get the "append" for free :)
>
> That said, shouldn't this be a byte value, so called "offset" ? Not
> entirely sure...

Yes, it can be changed to "offset"(byte) following QEMU's convention.
Just need to add conversions to virtio_blk_zone_append/*_complete,
which is easily done.

>
> > +} zone_append;
> >  };
> >  } BlkRwCo;
> >
> > @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> > BlockZoneOp op,
> >  return &acb->common;
> >  }
> >
> > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
> > +BlkAioEmAIOCB *acb = opaque;
> > +BlkRwCo *rwco = &acb->rwco;
> > +
> > +rwco->ret = blk_co_zone_append(rwco->blk, 
> > rwco->zone_append.append_sector,
> > +   rwco->iobuf, rwco->flags);
> > +blk_aio_complete(acb);
> > +}
> > +
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > +QEMUIOVector *qiov, BdrvRequestFlags flags,
> > +BlockCompletionFunc *cb, void *opaque) {
> > +BlkAioEmAIOCB *acb;
> > +Coroutine *co;
> > +IO_CODE();
> > +
> > +blk_inc_in_flight(blk);
> > +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> > +acb->rwco = (BlkRwCo) {
> > +.blk= blk,
> > +.ret= NOT_DONE,
> > +.flags  = flags,
> > +.iobuf  = qiov,
> > +.zone_append = {
> > +.append_sector = offset,
>
> See above comment. So since this is a byte value, this needs to be
> called "offset", no ?

Yes, same answers above.

>
> > +},
> > +};
> > +acb->has_returned = false;
> > +
> > +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> > +bdrv_coroutine_enter(blk_bs(blk), co);
> > +acb->has_returned = true;
> > +if (acb->rwco.ret != NOT_DONE) {
> > +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> > + blk_aio_complete_bh, acb);
> > +}
> > +
> > +return &acb->common;
> > +}
> > +
> >  /*
> >   * Send a zone_report command.
> >   * offset is a byte offset from the start of the device. No alignment
> > @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> > BlockZoneOp op,
> >  return ret;
> >  }
> >
> > +/*
> > + * Send a zone_append command.
> > + */
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > +QEMUIOVector *qiov, BdrvRequestFlags flags)
> > +{
> > +int ret;
> > +IO_CODE();
> > +
> > +blk_inc_in_flight(blk);
> > +blk_wait_while_drained(blk);
> > +if (!blk_is_available(blk)) {
> > +blk_dec_in_flight(blk);
> > +return -ENOMEDIUM;
> > +}
> > +
> > +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> > +blk_dec_in_flight(blk);
> > +return ret;
> > +}
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >  BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 17c0b58158..08ab164df4 100755
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
> > *aiocb)
> >  ssize_t len;
> >
> >  do {
> > -if (aiocb->aio_type & QEMU_AIO_WRITE)
> > +if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
> >  len = qemu_pwritev(aiocb->aio_fildes,
> > aiocb->io.iov,
> > 

Re: [PATCH v3 2/3] block: introduce zone append write for zoned devices

2022-10-12 Thread Damien Le Moal
On 10/10/22 11:33, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c | 64 +++
>  block/file-posix.c| 64 ---
>  block/io.c| 21 ++
>  block/raw-format.c|  7 
>  include/block/block-io.h  |  3 ++
>  include/block/block_int-common.h  |  3 ++
>  include/block/raw-aio.h   |  4 +-
>  include/sysemu/block-backend-io.h |  9 +
>  8 files changed, 168 insertions(+), 7 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index ddc569e3ac..bfdb719bc8 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>  struct {
>  BlockZoneOp op;
>  } zone_mgmt;
> +struct {
> +int64_t *append_sector;

I would call this "sector", since it will always be referenced as
"->zone_append.sector", you get the "append" for free :)

That said, shouldn't this be a byte value, so called "offset" ? Not
entirely sure...

> +} zone_append;
>  };
>  } BlkRwCo;
>  
> @@ -1869,6 +1872,46 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) {
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_append(rwco->blk, 
> rwco->zone_append.append_sector,
> +   rwco->iobuf, rwco->flags);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags,
> +BlockCompletionFunc *cb, void *opaque) {
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.ret= NOT_DONE,
> +.flags  = flags,
> +.iobuf  = qiov,
> +.zone_append = {
> +.append_sector = offset,

See above comment. So since this is a byte value, this needs to be
called "offset", no ?

> +},
> +};
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +bdrv_coroutine_enter(blk_bs(blk), co);
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1921,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +int ret;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +blk_wait_while_drained(blk);
> +if (!blk_is_available(blk)) {
> +blk_dec_in_flight(blk);
> +return -ENOMEDIUM;
> +}
> +
> +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 17c0b58158..08ab164df4 100755
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1657,7 +1657,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData 
> *aiocb)
>  ssize_t len;
>  
>  do {
> -if (aiocb->aio_type & QEMU_AIO_WRITE)
> +if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>  len = qemu_pwritev(aiocb->aio_fildes,
> aiocb->io.iov,
> aiocb->io.niov,

Hu... You are issuing the io for a zone append without first changing
the aiocb offset to be equal to the zone write pointer ? And you are
calling this without the wps->lock held... Changing the aio offset to be
equal to the wp && issuing the io must be atomic.

> @@ -1687,7 +1687,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData 
> *aiocb, char *buf)
>  ssize_t len;
>  
>  while (offset < aiocb->aio_nbytes) {
> -if (aiocb->a