do_blockdev_direct_IO() increments and decrements the inode
->i_dio_count for each IO operation. It does this to protect against
truncate of a file. Block devices don't need this sort of protection.

For a capable multiqueue setup, this atomic int is the only shared
state between applications accessing the device for O_DIRECT, and it
presents a scaling wall for that. In my testing, as much as 30% of
system time is spent incrementing and decrementing this value. A mixed
read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with
better latencies too. Before:

clat percentiles (usec):
 |  1.00th=[   33],  5.00th=[   34], 10.00th=[   34], 20.00th=[   34],
 | 30.00th=[   34], 40.00th=[   34], 50.00th=[   35], 60.00th=[   35],
 | 70.00th=[   35], 80.00th=[   35], 90.00th=[   37], 95.00th=[   80],
 | 99.00th=[   98], 99.50th=[  151], 99.90th=[  155], 99.95th=[  155],
 | 99.99th=[  165]

After:

clat percentiles (usec):
 |  1.00th=[   95],  5.00th=[  108], 10.00th=[  129], 20.00th=[  149],
 | 30.00th=[  155], 40.00th=[  161], 50.00th=[  167], 60.00th=[  171],
 | 70.00th=[  177], 80.00th=[  185], 90.00th=[  201], 95.00th=[  270],
 | 99.00th=[  390], 99.50th=[  398], 99.90th=[  418], 99.95th=[  422],
 | 99.99th=[  438]

In other setups, Robert Elliott reported seeing good performance
improvements:

https://lkml.org/lkml/2015/4/3/557

The more applications accessing the device, the worse it gets.

Add a new direct-io flags, DIO_SKIP_DIO_COUNT, which tells
do_blockdev_direct_IO() that it need not worry about incrementing
or decrementing the inode i_dio_count for this caller.

Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Theodore Ts'o <ty...@mit.edu>
Cc: Elliott, Robert (Server Storage) <elli...@hp.com>
Cc: Al Viro <v...@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <ax...@fb.com>
---
 fs/block_dev.c     |  2 +-
 fs/btrfs/inode.c   |  6 +++---
 fs/dax.c           |  4 ++--
 fs/direct-io.c     |  7 +++++--
 fs/ext4/indirect.c |  6 +++---
 fs/ext4/inode.c    |  4 ++--
 fs/inode.c         | 19 ++++++++++++++++---
 fs/nfs/direct.c    | 10 +++++-----
 include/linux/fs.h |  6 +++++-
 9 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 975266be67d3..be1335dbd6be 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -155,7 +155,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, struct 
iov_iter *iter,
 
        return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
                                    offset, blkdev_get_block,
-                                   NULL, NULL, 0);
+                                   NULL, NULL, DIO_SKIP_DIO_COUNT);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d2e732d7af52..6fe341a66ed8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8129,7 +8129,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
        if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
                return 0;
 
-       atomic_inc(&inode->i_dio_count);
+       inode_dio_inc(inode);
        smp_mb__after_atomic();
 
        /*
@@ -8169,7 +8169,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                current->journal_info = &outstanding_extents;
        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
                                     &BTRFS_I(inode)->runtime_flags)) {
-               inode_dio_done(inode);
+               inode_dio_dec(inode);
                flags = DIO_LOCKING | DIO_SKIP_HOLES;
                wakeup = false;
        }
@@ -8188,7 +8188,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
        }
 out:
        if (wakeup)
-               inode_dio_done(inode);
+               inode_dio_dec(inode);
        if (relock)
                mutex_lock(&inode->i_mutex);
 
diff --git a/fs/dax.c b/fs/dax.c
index ed1619ec6537..1d4a9a54a938 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -210,7 +210,7 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode 
*inode,
        }
 
        /* Protects against truncate */
-       atomic_inc(&inode->i_dio_count);
+       inode_dio_inc(inode);
 
        retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
 
@@ -220,7 +220,7 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode 
*inode,
        if ((retval > 0) && end_io)
                end_io(iocb, pos, retval, bh.b_private);
 
-       inode_dio_done(inode);
+       inode_dio_dec(inode);
  out:
        return retval;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b2e297..cba348e3135f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -254,7 +254,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, 
ssize_t ret,
        if (dio->end_io && dio->result)
                dio->end_io(dio->iocb, offset, transferred, dio->private);
 
-       inode_dio_done(dio->inode);
+       if (!(dio->flags & DIO_SKIP_DIO_COUNT))
+               inode_dio_dec(dio->inode);
+
        if (is_async) {
                if (dio->rw & WRITE) {
                        int err;
@@ -1199,7 +1201,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct 
inode *inode,
        /*
         * Will be decremented at I/O completion time.
         */
-       atomic_inc(&inode->i_dio_count);
+       if (!(dio->flags & DIO_SKIP_DIO_COUNT))
+               inode_dio_inc(inode);
 
        retval = 0;
        sdio.blkbits = blkbits;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 45fe924f82bc..bf77e5b73ae2 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -682,11 +682,11 @@ retry:
                 * via ext4_inode_block_unlocked_dio(). Check inode's state
                 * while holding extra i_dio_count ref.
                 */
-               atomic_inc(&inode->i_dio_count);
+               inode_dio_inc(inode);
                smp_mb();
                if (unlikely(ext4_test_inode_state(inode,
                                                    EXT4_STATE_DIOREAD_LOCK))) {
-                       inode_dio_done(inode);
+                       inode_dio_dec(inode);
                        goto locked;
                }
                if (IS_DAX(inode))
@@ -696,7 +696,7 @@ retry:
                        ret = __blockdev_direct_IO(rw, iocb, inode,
                                        inode->i_sb->s_bdev, iter, offset,
                                        ext4_get_block, NULL, NULL, 0);
-               inode_dio_done(inode);
+               inode_dio_dec(inode);
        } else {
 locked:
                if (IS_DAX(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cb9a212b86f..62d4615aa9b0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2978,7 +2978,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb 
*iocb,
         * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
         */
        if (rw == WRITE)
-               atomic_inc(&inode->i_dio_count);
+               inode_dio_inc(inode);
 
        /* If we do a overwrite dio, i_mutex locking can be released */
        overwrite = *((int *)iocb->private);
@@ -3080,7 +3080,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb 
*iocb,
 
 retake_lock:
        if (rw == WRITE)
-               inode_dio_done(inode);
+               inode_dio_dec(inode);
        /* take i_mutex locking again if we do a ovewrite dio */
        if (overwrite) {
                up_read(&EXT4_I(inode)->i_data_sem);
diff --git a/fs/inode.c b/fs/inode.c
index f00b16f45507..c4901c40ad65 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1946,18 +1946,31 @@ void inode_dio_wait(struct inode *inode)
 EXPORT_SYMBOL(inode_dio_wait);
 
 /*
- * inode_dio_done - signal finish of a direct I/O requests
+ * inode_dio_begin - signal start of a direct I/O requests
  * @inode: inode the direct I/O happens on
  *
  * This is called once we've finished processing a direct I/O request,
  * and is used to wake up callers waiting for direct I/O to be quiesced.
  */
-void inode_dio_done(struct inode *inode)
+void inode_dio_inc(struct inode *inode)
+{
+       atomic_inc(&inode->i_dio_count);
+}
+EXPORT_SYMBOL(inode_dio_inc);
+
+/*
+ * inode_dio_dec - signal finish of a direct I/O requests
+ * @inode: inode the direct I/O happens on
+ *
+ * This is called once we've finished processing a direct I/O request,
+ * and is used to wake up callers waiting for direct I/O to be quiesced.
+ */
+void inode_dio_dec(struct inode *inode)
 {
        if (atomic_dec_and_test(&inode->i_dio_count))
                wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
 }
-EXPORT_SYMBOL(inode_dio_done);
+EXPORT_SYMBOL(inode_dio_dec);
 
 /*
  * inode_set_flags - atomically set some inode flags
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e907c8cf732e..2890317173eb 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -387,7 +387,7 @@ static void nfs_direct_complete(struct nfs_direct_req 
*dreq, bool write)
        if (write)
                nfs_zap_mapping(inode, inode->i_mapping);
 
-       inode_dio_done(inode);
+       inode_dio_dec(inode);
 
        if (dreq->iocb) {
                long res = (long) dreq->error;
@@ -487,7 +487,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct 
nfs_direct_req *dreq,
                             &nfs_direct_read_completion_ops);
        get_dreq(dreq);
        desc.pg_dreq = dreq;
-       atomic_inc(&inode->i_dio_count);
+       inode_dio_inc(inode);
 
        while (iov_iter_count(iter)) {
                struct page **pagevec;
@@ -539,7 +539,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct 
nfs_direct_req *dreq,
         * generic layer handle the completion.
         */
        if (requested_bytes == 0) {
-               inode_dio_done(inode);
+               inode_dio_dec(inode);
                nfs_direct_req_release(dreq);
                return result < 0 ? result : -EIO;
        }
@@ -873,7 +873,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct 
nfs_direct_req *dreq,
                              &nfs_direct_write_completion_ops);
        desc.pg_dreq = dreq;
        get_dreq(dreq);
-       atomic_inc(&inode->i_dio_count);
+       inode_dio_inc(inode);
 
        NFS_I(inode)->write_io += iov_iter_count(iter);
        while (iov_iter_count(iter)) {
@@ -929,7 +929,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct 
nfs_direct_req *dreq,
         * generic layer handle the completion.
         */
        if (requested_bytes == 0) {
-               inode_dio_done(inode);
+               inode_dio_dec(inode);
                nfs_direct_req_release(dreq);
                return result < 0 ? result : -EIO;
        }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 52cc4492cb3a..1292ce1d9be5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2613,6 +2613,9 @@ enum {
 
        /* filesystem can handle aio writes beyond i_size */
        DIO_ASYNC_EXTEND = 0x04,
+
+       /* inode/fs/bdev does not need truncate protection */
+       DIO_SKIP_DIO_COUNT = 0x08,
 };
 
 void dio_end_io(struct bio *bio, int error);
@@ -2633,7 +2636,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct 
kiocb *iocb,
 #endif
 
 void inode_dio_wait(struct inode *inode);
-void inode_dio_done(struct inode *inode);
+void inode_dio_inc(struct inode *inode);
+void inode_dio_dec(struct inode *inode);
 
 extern void inode_set_flags(struct inode *inode, unsigned int flags,
                            unsigned int mask);
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to