Partial-block writes to a physical block device (as opposed to writes
into a hole of a regular file) can cause unnecessary read-modify-write
cycles.  In particular, the redundant reads can add noticeable overhead
on cloud disks.

Write full blocks with pwritev(2) instead.

Signed-off-by: Gao Xiang <[email protected]>
---
 configure.ac       |  2 ++
 include/erofs/io.h |  5 +++++
 lib/inode.c        | 33 +++++++++++++++------------------
 lib/io.c           | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/configure.ac b/configure.ac
index 88f1cbe..a73a9ba 100644
--- a/configure.ac
+++ b/configure.ac
@@ -216,6 +216,7 @@ AC_CHECK_HEADERS(m4_flatten([
        sys/statfs.h
        sys/sysmacros.h
        sys/time.h
+       sys/uio.h
        unistd.h
 ]))
 
@@ -274,6 +275,7 @@ AC_CHECK_FUNCS(m4_flatten([
        ftello64
        pread64
        pwrite64
+       pwritev
        posix_fadvise
        fstatfs
        sendfile
diff --git a/include/erofs/io.h b/include/erofs/io.h
index 3179ea1..101a5ba 100644
--- a/include/erofs/io.h
+++ b/include/erofs/io.h
@@ -16,6 +16,7 @@ extern "C"
 #define _GNU_SOURCE
 #endif
 #include <unistd.h>
+#include <sys/uio.h>
 #include "defs.h"
 
 #ifndef O_BINARY
@@ -27,6 +28,8 @@ struct erofs_vfile;
 struct erofs_vfops {
        ssize_t (*pread)(struct erofs_vfile *vf, void *buf, u64 offset, size_t 
len);
        ssize_t (*pwrite)(struct erofs_vfile *vf, const void *buf, u64 offset, 
size_t len);
+       ssize_t (*pwritev)(struct erofs_vfile *vf, const struct iovec *iov,
+                          int iovcnt, u64 pos);
        int (*fsync)(struct erofs_vfile *vf);
        int (*fallocate)(struct erofs_vfile *vf, u64 offset, size_t len, bool 
pad);
        int (*ftruncate)(struct erofs_vfile *vf, u64 length);
@@ -53,6 +56,8 @@ ssize_t __erofs_io_write(int fd, const void *buf, size_t len);
 
 int erofs_io_fstat(struct erofs_vfile *vf, struct stat *buf);
 ssize_t erofs_io_pwrite(struct erofs_vfile *vf, const void *buf, u64 pos, 
size_t len);
+ssize_t erofs_io_pwritev(struct erofs_vfile *vf, const struct iovec *iov,
+                        int iovcnt, u64 pos);
 int erofs_io_fsync(struct erofs_vfile *vf);
 ssize_t erofs_io_fallocate(struct erofs_vfile *vf, u64 offset, size_t len, 
bool pad);
 int erofs_io_ftruncate(struct erofs_vfile *vf, u64 length);
diff --git a/lib/inode.c b/lib/inode.c
index a36ade2..09f519b 100644
--- a/lib/inode.c
+++ b/lib/inode.c
@@ -827,6 +827,7 @@ static struct erofs_bhops erofs_write_inline_bhops = {
 
 static int erofs_write_tail_end(struct erofs_inode *inode)
 {
+       static const u8 zeroed[EROFS_MAX_BLOCK_SIZE];
        struct erofs_sb_info *sbi = inode->sbi;
        struct erofs_buffer_head *bh, *ibh;
 
@@ -843,8 +844,10 @@ static int erofs_write_tail_end(struct erofs_inode *inode)
                ibh->fsprivate = erofs_igrab(inode);
                ibh->op = &erofs_write_inline_bhops;
        } else {
+               struct iovec iov[2];
+               erofs_off_t pos;
                int ret;
-               erofs_off_t pos, zero_pos;
+               bool h0;
 
                if (!bh) {
                        bh = erofs_balloc(sbi->bmgr,
@@ -874,25 +877,19 @@ static int erofs_write_tail_end(struct erofs_inode *inode)
                pos = erofs_btell(bh, true) - erofs_blksiz(sbi);
 
                /* 0'ed data should be padded at head for 0padding conversion */
-               if (erofs_sb_has_lz4_0padding(sbi) && inode->compressed_idata) {
-                       zero_pos = pos;
-                       pos += erofs_blksiz(sbi) - inode->idata_size;
-               } else {
-                       /* pad 0'ed data for the other cases */
-                       zero_pos = pos + inode->idata_size;
-               }
-               ret = erofs_dev_write(sbi, inode->idata, pos, 
inode->idata_size);
-               if (ret)
+               h0 = erofs_sb_has_lz4_0padding(sbi) && inode->compressed_idata;
+               DBG_BUGON(inode->idata_size > erofs_blksiz(sbi));
+
+               iov[h0] = (struct iovec) { .iov_base = inode->idata,
+                                          .iov_len = inode->idata_size };
+               iov[!h0] = (struct iovec) { .iov_base = (u8 *)zeroed,
+                               erofs_blksiz(sbi) - inode->idata_size };
+               ret = erofs_io_pwritev(&sbi->bdev, iov, 2, pos);
+               if (ret < 0)
                        return ret;
+               else if (ret < erofs_blksiz(sbi))
+                       return -EIO;
 
-               DBG_BUGON(inode->idata_size > erofs_blksiz(sbi));
-               if (inode->idata_size < erofs_blksiz(sbi)) {
-                       ret = erofs_dev_fillzero(sbi, zero_pos,
-                                          erofs_blksiz(sbi) - 
inode->idata_size,
-                                          false);
-                       if (ret)
-                               return ret;
-               }
                inode->idata_size = 0;
                free(inode->idata);
                inode->idata = NULL;
diff --git a/lib/io.c b/lib/io.c
index 5c3d263..aa043ca 100644
--- a/lib/io.c
+++ b/lib/io.c
@@ -96,6 +96,39 @@ ssize_t erofs_io_pwrite(struct erofs_vfile *vf, const void *buf,
        return written;
 }
 
+/*
+ * Write the iovcnt buffers described by iov to vf, starting at pos.
+ * Returns the bytes written (0 in dry-run mode) or a negative error code.
+ */
+ssize_t erofs_io_pwritev(struct erofs_vfile *vf, const struct iovec *iov,
+                        int iovcnt, u64 pos)
+{
+       ssize_t ret, written = 0;
+       int i;
+
+       if (__erofs_unlikely(cfg.c_dry_run))
+               return 0;
+#ifdef HAVE_PWRITEV
+       if (!vf->ops) {
+               ret = pwritev(vf->fd, iov, iovcnt, pos + vf->offset);
+               return ret < 0 ? -errno : ret;
+       }
+#endif
+       if (vf->ops && vf->ops->pwritev)
+               return vf->ops->pwritev(vf, iov, iovcnt, pos);
+       for (i = 0; i < iovcnt; ++i) {
+               ret = erofs_io_pwrite(vf, iov[i].iov_base, pos, iov[i].iov_len);
+               /* check the sign first: ret converts to size_t below */
+               if (ret < 0)
+                       return ret;
+               if ((size_t)ret < iov[i].iov_len)
+                       return written + ret;
+               written += iov[i].iov_len;
+               pos += iov[i].iov_len;
+       }
+       return written;
+}
+
 int erofs_io_fsync(struct erofs_vfile *vf)
 {
        int ret;
-- 
2.43.5


Reply via email to