There are about 3 advantages to using direct I/O and AIO to
read/write loop's backing file:

1) double caching can be avoided, so memory usage is
decreased a lot

2) unlike user space direct I/O, there is no cost of
pinning pages

3) avoid context switches while still obtaining good throughput
- with buffered file reads, good random I/O throughput is often
obtained only if the requests are submitted concurrently from lots of
tasks; but for sequential I/O, most of the time the requests hit the
page cache, so concurrent submissions often introduce unnecessary
context switches and can't improve throughput much. There was a
discussion[1] about using non-blocking I/O to mitigate this problem
for applications.
- with direct I/O and AIO, concurrent submissions can be
avoided while random read throughput is not affected

My fio test results follow:

1. 16 jobs fio test inside ext4 file system over loop block
1) How to run
        - linux kernel: 4.1.0-rc2-next-20150506 with the patchset
        - the loop block is over one image on HDD.
        - linux psync, 16 jobs, size 400M, ext4 over loop block
        - test result: IOPS from fio output

2) Throughput result:
        -------------------------------------------------------------
        test cases          |randread   |read   |randwrite  |write  |
        -------------------------------------------------------------
        base                |240        |8705   |3763       |20914
        -------------------------------------------------------------
        base+loop aio       |242        |9258   |4577       |21451
        -------------------------------------------------------------

3) context switch
        - context switch decreased by ~16% with loop aio for randread,
        and decreased by ~33% for read

4) memory usage
        - After these four tests with loop aio: ~10% of memory is in use
        - After these four tests without loop aio: more than 55% of memory
        is in use

2. single job fio test inside ext4 file system over loop block (for Maxim
Patlasov)
1) How to run
        - linux kernel: 4.1.0-rc2-next-20150506 with the patchset
        - the loop block is over one image on HDD.
        - linux psync, 1 job, size 4000M, ext4 over loop block
        - test result: IOPS from fio output

2) Throughput result:
        -------------------------------------------------------------
        test cases          |randread   |read   |randwrite  |write  |
        -------------------------------------------------------------
        base                |109        |21180  |4192       |22782
        -------------------------------------------------------------
        base+loop aio       |114        |21018  |5404       |22670
        -------------------------------------------------------------

3) context switch
        - context switch decreased by ~10% with loop aio for randread,
        and decreased by ~50% for read

4) memory usage
        - After these four tests with loop aio: ~10% of memory is in use
        - After these four tests without loop aio: more than 55% of memory
        is in use

Both the 'context switch' and 'memory usage' data were obtained from sar
(see the sar graphs [2]-[5]).

[1] https://lwn.net/Articles/612483/
[2] sar graph when running fio over loop without the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-nonaio.pdf

[3] sar graph when running fio over loop with the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-aio.pdf

[4] sar graph when running fio over loop without the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-nonaio-1job.pdf

[5] sar graph when running fio over loop with the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-aio-1job.pdf

Signed-off-by: Ming Lei <ming....@canonical.com>
---
 drivers/block/loop.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/block/loop.h |  4 +++
 2 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 94c9eec..3652581 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -388,6 +388,65 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
 
        return ret;
 }
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+{
+       struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+       struct request *rq = cmd->rq;
+
+       if (ret > 0)
+               ret = 0;
+       else if (ret < 0)
+               ret = -EIO;
+
+       rq->errors = ret;
+       blk_mq_complete_request(rq);
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+                    loff_t pos, bool rw)
+{
+       struct iov_iter iter;
+       struct bio_vec *bvec;
+       struct bio *bio = cmd->rq->bio;
+       struct file *file = lo->lo_backing_file;
+       int ret;
+
+       /* nomerge for loop request queue */
+       WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+
+       bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+       iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
+                     bio_segments(bio), blk_rq_bytes(cmd->rq));
+
+       cmd->iocb.ki_pos = pos;
+       cmd->iocb.ki_filp = file;
+       cmd->iocb.ki_complete = lo_rw_aio_complete;
+       cmd->iocb.ki_flags = IOCB_DONT_DIRTY_PAGE | IOCB_DIRECT;
+
+       if (rw == WRITE)
+               ret = file->f_op->write_iter(&cmd->iocb, &iter);
+       else
+               ret = file->f_op->read_iter(&cmd->iocb, &iter);
+
+       if (ret != -EIOCBQUEUED)
+               cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+       return 0;
+}
+
+
+static inline int lo_rw_simple(struct loop_device *lo,
+               struct request *rq, loff_t pos, bool rw)
+{
+       struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+       if (cmd->use_aio)
+               return lo_rw_aio(lo, cmd, pos, rw);
+
+       if (rw == WRITE)
+               return lo_write_simple(lo, rq, pos);
+       else
+               return lo_read_simple(lo, rq, pos);
+}
 
 static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 {
@@ -404,13 +463,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
                else if (lo->transfer)
                        ret = lo_write_transfer(lo, rq, pos);
                else
-                       ret = lo_write_simple(lo, rq, pos);
+                       ret = lo_rw_simple(lo, rq, pos, WRITE);
 
        } else {
                if (lo->transfer)
                        ret = lo_read_transfer(lo, rq, pos);
                else
-                       ret = lo_read_simple(lo, rq, pos);
+                       ret = lo_rw_simple(lo, rq, pos, READ);
        }
 
        return ret;
@@ -441,6 +500,12 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
                mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
        lo->old_gfp_mask = mapping_gfp_mask(mapping);
        mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+       lo->support_dio = mapping->a_ops && mapping->a_ops->direct_IO;
+       if (lo->support_dio)
+               lo->use_aio = true;
+       else
+               lo->use_aio = false;
 }
 
 /*
@@ -761,6 +826,13 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
        if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
                blk_queue_flush(lo->lo_queue, REQ_FLUSH);
 
+       /* use aio if it is possible */
+       lo->support_dio = mapping->a_ops && mapping->a_ops->direct_IO;
+       if (lo->support_dio)
+               lo->use_aio = true;
+       else
+               lo->use_aio = false;
+
        set_capacity(lo->lo_disk, size);
        bd_set_size(bdev, size << 9);
        loop_sysfs_init(lo);
@@ -1451,6 +1523,12 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
        if (lo->lo_state != Lo_bound)
                return -EIO;
 
+       if (lo->use_aio && !lo->transfer &&
+                       !(cmd->rq->cmd_flags & (REQ_FLUSH | REQ_DISCARD)))
+               cmd->use_aio = true;
+       else
+               cmd->use_aio = false;
+
        queue_kthread_work(&lo->worker, &cmd->work);
 
        return BLK_MQ_RQ_QUEUE_OK;
@@ -1470,7 +1548,8 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
  failed:
        if (ret)
                cmd->rq->errors = -EIO;
-       blk_mq_complete_request(cmd->rq);
+       if (!cmd->use_aio || ret)
+               blk_mq_complete_request(cmd->rq);
 }
 
 static void loop_queue_work(struct kthread_work *work)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 54c6aa5..0af40a0 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -58,6 +58,8 @@ struct loop_device {
        struct mutex            lo_ctl_mutex;
        struct kthread_worker   worker;
        struct task_struct      *worker_task;
+       bool                    support_dio;
+       bool                    use_aio;
 
        struct request_queue    *lo_queue;
        struct blk_mq_tag_set   tag_set;
@@ -68,6 +70,8 @@ struct loop_cmd {
        struct kthread_work work;
        struct request *rq;
        struct list_head list;
+       bool use_aio;           /* use AIO interface to handle I/O */
+       struct kiocb iocb;
 };
 
 /* Support for loadable transfer modules */
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to