On Sat, Apr 05, 2025 at 04:52:29PM -0700, Pinku Deb Nath wrote: > Force Unit Access (FUA) is an optimization where a disk write with the > flag set will be persisted to disk immediately instead of potentially > remaining in the disk's write cache. > > This commit addresses the todo task > for using pwritev2() with RWF_DSYNC in the thread pool section of > raw_co_prw(), if pwritev2() with RWF_DSYNC is available in the host, > which is always the case for Linux kernel >= 4.7. > > The intent for FUA is indicated with the BDRV_REQ_FUA flag. > The old code paths are preserved in case BDRV_REQ_FUA is off > or pwritev2() with RWF_DSYNC is not available. > > Support for disk writes with FUA is handled in qemu_pwritev_fua(), > which uses pwritev2() with RWF_DSYNC if available, otherwise falls > back to pwritev2() with no flags followed by flush using > handle_aiocb_flush(). > > If pwritev2() is not implemented, then disk write in the linear FUA > will fall back to pwrite() + handle_aiocb_flush(). > > Signed-off-by: Pinku Deb Nath <[email protected]> > > --- > > v4: > - Add fallback when qemu_pwritev_fua() returns ENOSYS > - Similar fallback was not added for handle_aiocb_rw_vector() > since there is a preadv_present check in handle_aiocb_rw() > > v3: > - Changed signature to add fd, iov, nr_iov > - Return -ENOSYS for non-Linux hosts > > v2: > - Moved handle_aiocb_flush() into qemu_pwritev_fua() > - In handle_aiocb_rw_linear(), iovec with iovcnt=1 is created > based on the assumption that there will be only one buffer > --- > block/file-posix.c | 68 ++++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 56 insertions(+), 12 deletions(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index 56d1972d15..59bed7866a 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -229,6 +229,7 @@ typedef struct RawPosixAIOData { > unsigned long op; > } zone_mgmt; > }; > + BdrvRequestFlags flags; > } RawPosixAIOData; > > #if defined(__FreeBSD__) || 
defined(__FreeBSD_kernel__) > @@ -1674,6 +1675,20 @@ qemu_pwritev(int fd, const struct iovec *iov, int > nr_iov, off_t offset) > return pwritev(fd, iov, nr_iov, offset); > } > > +static ssize_t > +qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const > RawPosixAIOData *aiocb) > +{ > +#ifdef RWF_DSYNC > + return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC); > +#else > + ssize_t len = pwritev2(fd, iov, nr_iov, offset, 0);
This will fail to compile on non-Linux OSes that provide preadv(2)
(CONFIG_PREADV) because they do not have pwritev2(2). Since this fallback
path passes no flags, it can be fixed by calling pwritev() instead:
    ssize_t len = pwritev(fd, iov, nr_iov, offset);
> + if (len == 0) {
> + len = handle_aiocb_flush(aiocb);
> + }
> + return len;
> +#endif
> +}
> +
> #else
>
> static bool preadv_present = false;
> @@ -1690,6 +1705,11 @@ qemu_pwritev(int fd, const struct iovec *iov, int
> nr_iov, off_t offset)
> return -ENOSYS;
> }
>
> +static ssize_t
> +qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const
> RawPosixAIOData *aiocb)
> +{
> + return -ENOSYS;
> +}
> #endif
>
> static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> @@ -1698,10 +1718,16 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData
> *aiocb)
>
> len = RETRY_ON_EINTR(
> (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
> - qemu_pwritev(aiocb->aio_fildes,
> - aiocb->io.iov,
> - aiocb->io.niov,
> - aiocb->aio_offset) :
> + (aiocb->flags & BDRV_REQ_FUA) ?
> + qemu_pwritev_fua(aiocb->aio_fildes,
> + aiocb->io.iov,
> + aiocb->io.niov,
> + aiocb->aio_offset,
> + aiocb) :
> + qemu_pwritev(aiocb->aio_fildes,
> + aiocb->io.iov,
> + aiocb->io.niov,
> + aiocb->aio_offset) :
> qemu_preadv(aiocb->aio_fildes,
> aiocb->io.iov,
> aiocb->io.niov,
> @@ -1727,10 +1753,31 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData
> *aiocb, char *buf)
>
> while (offset < aiocb->aio_nbytes) {
> if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> - len = pwrite(aiocb->aio_fildes,
> - (const char *)buf + offset,
> - aiocb->aio_nbytes - offset,
> - aiocb->aio_offset + offset);
> + if (aiocb->flags & BDRV_REQ_FUA) {
> + struct iovec iov = {
> + .iov_base = buf + offset,
> + .iov_len = aiocb->aio_nbytes - offset,
> + };
> + len = qemu_pwritev_fua(aiocb->aio_fildes,
> + &iov,
> + 1,
> + aiocb->aio_offset + offset,
> + aiocb);
> + if (len == -ENOSYS) {
> + len = pwrite(aiocb->aio_fildes,
> + (const char *)buf + offset,
> + aiocb->aio_nbytes - offset,
> + aiocb->aio_offset + offset);
> + if (len == 0) {
> + len = handle_aiocb_flush(aiocb);
> + }
> + }
> + } else {
> + len = pwrite(aiocb->aio_fildes,
> + (const char *)buf + offset,
> + aiocb->aio_nbytes - offset,
> + aiocb->aio_offset + offset);
> + }
> } else {
> len = pread(aiocb->aio_fildes,
> buf + offset,
> @@ -2539,14 +2586,11 @@ static int coroutine_fn raw_co_prw(BlockDriverState
> *bs, int64_t *offset_ptr,
> .iov = qiov->iov,
> .niov = qiov->niov,
> },
> + .flags = flags,
> };
>
> assert(qiov->size == bytes);
> ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
> - if (ret == 0 && (flags & BDRV_REQ_FUA)) {
> - /* TODO Use pwritev2() instead if it's available */
> - ret = raw_co_flush_to_disk(bs);
> - }
> goto out; /* Avoid the compiler err of unused label */
>
> out:
> --
> 2.43.0
>
signature.asc
Description: PGP signature
