There are about 3 advantages to using direct I/O and AIO when reading/writing loop's backing file:
1) double caching can be avoided, so memory usage decreases a lot 2) unlike user space direct I/O, there is no cost of pinning pages 3) context switches are avoided while still obtaining good throughput - in buffered file reads, top random I/O throughput is often obtained only if the requests are submitted concurrently from lots of tasks; but for sequential I/O, most of the time the requests hit the page cache, so concurrent submissions often introduce unnecessary context switches and can't improve throughput much. There was a discussion[1] about using non-blocking I/O to mitigate this problem for applications. - with direct I/O and AIO, concurrent submissions can be avoided while random read throughput is not affected My fio test results follow: 1. 16 jobs fio test inside ext4 file system over loop block 1) How to run - linux kernel: 4.1.0-rc2-next-20150506 with the patchset - the loop block is over one image on HDD. - linux psync, 16 jobs, size 400M, ext4 over loop block - test result: IOPS from fio output 2) Throughput result: ------------------------------------------------------------- test cases |randread |read |randwrite |write | ------------------------------------------------------------- base |240 |8705 |3763 |20914 ------------------------------------------------------------- base+loop aio |242 |9258 |4577 |21451 ------------------------------------------------------------- 3) context switch - context switches decreased by ~16% with loop aio for randread, and decreased by ~33% for read 4) memory usage - After these four tests with loop aio: ~10% of memory is in use - After these four tests without loop aio: more than 55% of memory is in use 2. single job fio test inside ext4 file system over loop block (for Maxim Patlasov) 1) How to run - linux kernel: 4.1.0-rc2-next-20150506 with the patchset - the loop block is over one image on HDD.
- linux psync, 1 job, size 4000M, ext4 over loop block - test result: IOPS from fio output 2) Throughput result: ------------------------------------------------------------- test cases |randread |read |randwrite |write | ------------------------------------------------------------- base |109 |21180 |4192 |22782 ------------------------------------------------------------- base+loop aio |114 |21018 |5404 |22670 ------------------------------------------------------------- 3) context switch - context switches decreased by ~10% with loop aio for randread, and decreased by ~50% for read 4) memory usage - After these four tests with loop aio: ~10% of memory is in use - After these four tests without loop aio: more than 55% of memory is in use Both the 'context switch' and 'memory usage' data were obtained from sar. [1] https://lwn.net/Articles/612483/ [2] sar graph when running fio over loop without the patchset http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-nonaio.pdf [3] sar graph when running fio over loop with the patchset http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-aio.pdf [4] sar graph when running single-job fio over loop without the patchset http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-nonaio-1job.pdf [5] sar graph when running single-job fio over loop with the patchset http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-aio-1job.pdf Signed-off-by: Ming Lei <ming....@canonical.com> --- drivers/block/loop.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++-- drivers/block/loop.h | 5 ++ 2 files changed, 149 insertions(+), 3 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 06a737b..edaa18a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -200,6 +200,8 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) blk_mq_freeze_queue(lo->lo_queue); lo->use_dio = use_dio; lo->dio_align = dio_align; + lo->last_use_dio = use_dio; + atomic_set(&lo->pending_dio, 0); if (use_dio) lo->lo_flags |= LO_FLAGS_DIRECT_IO; else @@ -436,6
+438,128 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq) return ret; } +static inline void handle_partial_read(struct loop_cmd *cmd, long bytes) +{ + if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE)) + return; + + if (unlikely(bytes < blk_rq_bytes(cmd->rq))) { + struct bio *bio = cmd->rq->bio; + + bio_advance(bio, bytes); + zero_fill_bio(bio); + } +} + +static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) +{ + struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); + struct request *rq = cmd->rq; + struct loop_device *lo = cmd->rq->q->queuedata; + + handle_partial_read(cmd, ret); + + if (ret > 0) + ret = 0; + else if (ret < 0) + ret = -EIO; + + rq->errors = ret; + blk_mq_complete_request(rq); + + if (atomic_dec_and_test(&lo->pending_dio)) { + if (lo->waiter) + wake_up_process(lo->waiter); + } +} + +static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, + loff_t pos, bool rw) +{ + struct iov_iter iter; + struct bio_vec *bvec; + struct bio *bio = cmd->rq->bio; + struct file *file = lo->lo_backing_file; + int ret; + + /* nomerge for loop request queue */ + WARN_ON(cmd->rq->bio != cmd->rq->biotail); + + bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, + bio_segments(bio), blk_rq_bytes(cmd->rq)); + + cmd->iocb.ki_pos = pos; + cmd->iocb.ki_filp = file; + cmd->iocb.ki_complete = lo_rw_aio_complete; + cmd->iocb.ki_flags = IOCB_DIRECT; + + atomic_inc(&lo->pending_dio); + + if (rw == WRITE) + ret = file->f_op->write_iter(&cmd->iocb, &iter); + else + ret = file->f_op->read_iter(&cmd->iocb, &iter); + + if (ret != -EIOCBQUEUED) + cmd->iocb.ki_complete(&cmd->iocb, ret, 0); + return 0; +} + +static inline void lo_drain_pending_dio(struct loop_device *lo) +{ + lo->waiter = current; + + /* order between writing waiter and reading pending_dio */ + smp_mb__before_atomic(); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!atomic_read(&lo->pending_dio)) + 
break; + schedule(); + } + set_current_state(TASK_RUNNING); + lo->waiter = NULL; +} + +static inline int lo_rw_simple(struct loop_device *lo, + struct request *rq, loff_t pos, bool rw) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); + + /* + * When working at direct I/O, under very unusual cases, + * such as unaligned direct I/O from application and + * access to loop block device with 'unaligned' offset & size, + * wait for completion of the pending dio/aio before swiching + * to buffered I/O for avoiding probable race between dio and + * buffered I/O. + * + * Page cache flush is handled inside VFS direct I/O path if + * it is needed. + */ + if (unlikely(lo->use_dio && lo->last_use_dio && !cmd->use_aio)) + lo_drain_pending_dio(lo); + + if (cmd->use_aio) + return lo_rw_aio(lo, cmd, pos, rw); + + /* + * lo_write_simple and lo_read_simple should have been covered + * by io submit style function like lo_rw_aio(), one blocker + * is that lo_read_simple() need to call flush_dcache_page after + * the page is written from kernel, and it isn't easy to handle + * this in io submit style function which submits all segments + * of the req at one time. And direct read IO doesn't need to + * run flush_dcache_page(). 
+ */ + if (rw == WRITE) + return lo_write_simple(lo, rq, pos); + else + return lo_read_simple(lo, rq, pos); +} + static int do_req_filebacked(struct loop_device *lo, struct request *rq) { loff_t pos; @@ -451,13 +575,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) else if (lo->transfer) ret = lo_write_transfer(lo, rq, pos); else - ret = lo_write_simple(lo, rq, pos); + ret = lo_rw_simple(lo, rq, pos, WRITE); } else { if (lo->transfer) ret = lo_read_transfer(lo, rq, pos); else - ret = lo_read_simple(lo, rq, pos); + ret = lo_rw_simple(lo, rq, pos, READ); } return ret; @@ -1545,6 +1669,13 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); +static inline bool req_dio_aligned(struct loop_device *lo, + const struct request *rq) +{ + return !((blk_rq_pos(rq) << 9) & lo->dio_align) && + !(blk_rq_bytes(rq) & lo->dio_align); +} + static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1556,6 +1687,13 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (lo->lo_state != Lo_bound) return -EIO; + if (lo->use_dio && !lo->transfer && + req_dio_aligned(lo, bd->rq) && + !(cmd->rq->cmd_flags & (REQ_FLUSH | REQ_DISCARD))) + cmd->use_aio = true; + else + cmd->use_aio = false; + queue_kthread_work(&lo->worker, &cmd->work); return BLK_MQ_RQ_QUEUE_OK; @@ -1571,11 +1709,14 @@ static void loop_handle_cmd(struct loop_cmd *cmd) goto failed; ret = do_req_filebacked(lo, cmd->rq); + lo->last_use_dio = cmd->use_aio; failed: if (ret) cmd->rq->errors = -EIO; - blk_mq_complete_request(cmd->rq); + /* complete non-aio request */ + if (!cmd->use_aio || ret) + blk_mq_complete_request(cmd->rq); } static void loop_queue_work(struct kthread_work *work) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 63f8e14..439054b 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -60,6 +60,9 @@ struct loop_device { struct task_struct *worker_task; 
unsigned dio_align; bool use_dio; + bool last_use_dio; + atomic_t pending_dio; + struct task_struct *waiter; struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; @@ -70,6 +73,8 @@ struct loop_cmd { struct kthread_work work; struct request *rq; struct list_head list; + bool use_aio; /* use AIO interface to handle I/O */ + struct kiocb iocb; }; /* Support for loadable transfer modules */ -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/