On 4/1/19 9:10 PM, Jianchao Wang wrote:
> For the IORING_SETUP_IOPOLL & direct_io case, all submission and
> completion handling is done under ctx->uring_lock or in the SQ poll
> thread context, so io_get_req and io_put_req are already serialized.
> 
> Based on this, we introduce a preallocated reqs ring per ctx and need
> not provide any lock to serialize updates of the head and tail.
> Performance benefits from this. The result of the following fio
> command
> 
> fio --name=io_uring_test --ioengine=io_uring --hipri --fixedbufs
> --iodepth=16 --direct=1 --numjobs=1 --filename=/dev/nvme0n1 --bs=4k
> --group_reporting --runtime=10
> 
> shows IOPS improving from 197K to 206K.

I like this idea, but I'm not a fan of the execution. See below.

> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 6aaa3058..40837e4 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -104,11 +104,17 @@ struct async_list {
>       size_t                  io_pages;
>  };
>  
> +#define INLINE_REQS_TOTAL 128
> +
>  struct io_ring_ctx {
>       struct {
>               struct percpu_ref       refs;
>       } ____cacheline_aligned_in_smp;
>  
> +     struct io_kiocb *inline_reqs[INLINE_REQS_TOTAL];
> +     struct io_kiocb *inline_req_array;
> +     unsigned long inline_reqs_h, inline_reqs_t;

Why not just use a list? The req has a list member anyway. Then you don't
need a huge array, just a count.
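
Totally untested, but something along these lines is what I'm thinking
(field names made up, and it assumes req->list is otherwise unused
between alloc and free):

	struct list_head	inline_req_list;
	unsigned int		inline_req_count;

	...

	if (direct_io && !list_empty(&ctx->inline_req_list)) {
		req = list_first_entry(&ctx->inline_req_list,
					struct io_kiocb, list);
		list_del(&req->list);
		ctx->inline_req_count--;
	} else if (!state) {
		...

and on free, just list_add() the req back on the list and bump the
count again.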

> +
>       struct {
>               unsigned int            flags;
>               bool                    compat;
> @@ -183,7 +189,9 @@ struct io_ring_ctx {
>  
>  struct sqe_submit {
>       const struct io_uring_sqe       *sqe;
> +     struct file                     *file;
>       unsigned short                  index;
> +     bool                            is_fixed;
>       bool                            has_user;
>       bool                            needs_lock;
>       bool                            needs_fixed_file;

Not sure why you're moving these to the sqe_submit?

> @@ -228,7 +236,7 @@ struct io_kiocb {
>  #define REQ_F_PREPPED                16      /* prep already done */
>       u64                     user_data;
>       u64                     error;
> -
> +     bool                    ctx_inline;
>       struct work_struct      work;
>  };

ctx_inline should just be a req flag.
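E.g. something like this (name and bit value made up, just the next
free bit after REQ_F_PREPPED):

	#define REQ_F_CTX_INLINE	32	/* req came from ctx inline cache */

and then test req->flags & REQ_F_CTX_INLINE on free, instead of
carrying a separate bool in io_kiocb.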

>  
> @@ -397,7 +405,8 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx 
> *ctx, unsigned refs)
>  }
>  
>  static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
> -                                struct io_submit_state *state)
> +                                struct io_submit_state *state,
> +                                bool direct_io)
>  {
>       gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
>       struct io_kiocb *req;
> @@ -405,10 +414,19 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx 
> *ctx,
>       if (!percpu_ref_tryget(&ctx->refs))
>               return NULL;
>  
> -     if (!state) {
> +     /*
> +      * Avoid race with workqueue context that handle buffered IO.
> +      */
> +     if (direct_io &&
> +         ctx->inline_reqs_h - ctx->inline_reqs_t < INLINE_REQS_TOTAL) {
> +         req = ctx->inline_reqs[ctx->inline_reqs_h % INLINE_REQS_TOTAL];
> +         ctx->inline_reqs_h++;
> +         req->ctx_inline = true;
> +     } else if (!state) {

What happens for O_DIRECT that ends up being punted to async context?
We need a clearer indication of whether we're under the lock or not,
and then we can get rid of the direct_io "limitation" for this.
Arguably, cached buffered IO needs this even more than O_DIRECT does,
since that path is much faster.
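
Rough sketch of what I mean (untested, naming just for illustration):
have the callers that know they're serialized say so, rather than
keying off direct_io:

	static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
					   struct io_submit_state *state,
					   bool locked)
	{
		...
		if (locked && ctx->inline_req_count) {
			/* pop from the cache, as above */
		} else if (!state) {
		...

with the inline submission path and the SQ poll thread passing true,
and io_put_req() getting a similar hint so frees from async context
never touch the cache.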

-- 
Jens Axboe
