On 9/30/24 01:03, Sam Li wrote: > When the file-posix driver emulates append write, it holds the lock > whenever accessing wp, which limits the IO queue depth to one. > > The write IO flow can be optimized to allow concurrent writes. The lock > is held in two cases: > 1. Assumed that the write IO succeeds, update the wp before issuing the > write. > 2. If the write IO fails, report that zone and use the reported value > as the current wp. > > Signed-off-by: Sam Li <faithilike...@gmail.com> > --- > block/file-posix.c | 33 ++++++++++++++++----------------- > 1 file changed, 16 insertions(+), 17 deletions(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index ff928b5e85..64a57fadb1 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -2489,11 +2489,19 @@ static int coroutine_fn raw_co_prw(BlockDriverState > *bs, int64_t *offset_ptr, > #if defined(CONFIG_BLKZONED) > if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && > bs->bl.zoned != BLK_Z_NONE) { > - qemu_co_mutex_lock(&bs->wps->colock); > + BlockZoneWps *wps = bs->wps; > + int index = offset / bs->bl.zone_size;
A blank line after this declaration (to separate declarations from code) would make this code more readable... > + qemu_co_mutex_lock(&wps->colock); > + uint64_t *wp = &wps->wp[index]; > if (type & QEMU_AIO_ZONE_APPEND) { > - int index = offset / bs->bl.zone_size; > - offset = bs->wps->wp[index]; > + offset = *wp; > + *offset_ptr = offset; > + } > + /* Advance the wp if needed */ > + if (offset + bytes > *wp) { Why the if? offset MUST be equal to wp for writes, and for zone append we do not need to check the offset at all. So advancing the wp should be unconditional. BUT! Where are the checks for "zone is full" and "offset == wp" for write operations? These must be checked while holding the zone lock. > + *wp = offset + bytes; > } > + qemu_co_mutex_unlock(&bs->wps->colock); > } > #endif > > @@ -2540,28 +2548,19 @@ out: > #if defined(CONFIG_BLKZONED) > if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && > bs->bl.zoned != BLK_Z_NONE) { > - BlockZoneWps *wps = bs->wps; > if (ret == 0) { > - uint64_t *wp = &wps->wp[offset / bs->bl.zone_size]; > - if (!BDRV_ZT_IS_CONV(*wp)) { > - if (type & QEMU_AIO_ZONE_APPEND) { > - *offset_ptr = *wp; > - trace_zbd_zone_append_complete(bs, *offset_ptr > - >> BDRV_SECTOR_BITS); > - } > - /* Advance the wp if needed */ > - if (offset + bytes > *wp) { > - *wp = offset + bytes; > - } > + if (type & QEMU_AIO_ZONE_APPEND) { > + trace_zbd_zone_append_complete(bs, *offset_ptr > + >> BDRV_SECTOR_BITS); > } > } else { > + qemu_co_mutex_lock(&bs->wps->colock); > /* > * write and append write are not allowed to cross zone > boundaries > */ > update_zones_wp(bs, s->fd, offset, 1); > + qemu_co_mutex_unlock(&bs->wps->colock); > } > - > - qemu_co_mutex_unlock(&wps->colock); > } > #endif > return ret; -- Damien Le Moal Western Digital Research