Add a reference count (ki_refcnt) to struct aio_kiocb. This is needed to prevent races caused by the way the ->poll API works.

To avoid introducing overhead for other users of the iocbs, the refcount is initialized to zero, and the completion path only performs refcount operations when it is non-zero.
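For illustration only (this hunk is not part of the patch): a ->poll-style submission path whose iocb must survive a concurrent wakeup would be expected to elevate the refcount before arming its waitqueue, and drop its own reference through iocb_put(). The function name aio_poll_sketch and its control flow are hypothetical:

static ssize_t aio_poll_sketch(struct aio_kiocb *aiocb)
{
	/*
	 * Two references: one for the waitqueue wakeup path, which
	 * ends in aio_complete() -> iocb_put(), and one for this
	 * function.  Ordinary iocbs leave ki_refcnt at zero and take
	 * the free-immediately fast path in iocb_put().
	 */
	refcount_set(&aiocb->ki_refcnt, 2);

	/* ... arm the waitqueue here; a wakeup may now fire at any time ... */

	iocb_put(aiocb);	/* drop the submission-side reference */
	return 0;		/* request queued */
}

The refcount_read() == 0 fast path in iocb_put() keeps the common single-completion case free of atomic read-modify-write operations.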
Signed-off-by: Christoph Hellwig <h...@lst.de>
Tested-by: Avi Kivity <a...@scylladb.com>
---
 fs/aio.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 27454594e37a..fe2018ada32c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -18,6 +18,7 @@
 #include <linux/export.h>
 #include <linux/syscalls.h>
 #include <linux/backing-dev.h>
+#include <linux/refcount.h>
 #include <linux/uio.h>
 
 #include <linux/sched/signal.h>
@@ -178,6 +179,7 @@ struct aio_kiocb {
 
 	struct list_head	ki_list;	/* the aio core uses this
 						 * for cancellation */
+	refcount_t		ki_refcnt;
 
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
@@ -1015,6 +1017,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 
 	percpu_ref_get(&ctx->reqs);
 	INIT_LIST_HEAD(&req->ki_list);
+	refcount_set(&req->ki_refcnt, 0);
 	req->ki_ctx = ctx;
 	return req;
 out_put:
@@ -1049,6 +1052,15 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	return ret;
 }
 
+static inline void iocb_put(struct aio_kiocb *iocb)
+{
+	if (refcount_read(&iocb->ki_refcnt) == 0 ||
+	    refcount_dec_and_test(&iocb->ki_refcnt)) {
+		percpu_ref_put(&iocb->ki_ctx->reqs);
+		kmem_cache_free(kiocb_cachep, iocb);
+	}
+}
+
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  */
@@ -1118,8 +1130,6 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 		eventfd_ctx_put(iocb->ki_eventfd);
 	}
 
-	kmem_cache_free(kiocb_cachep, iocb);
-
 	/*
 	 * We have to order our ring_info tail store above and test
 	 * of the wait list below outside the wait lock.  This is
@@ -1130,8 +1140,7 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
-
-	percpu_ref_put(&ctx->reqs);
+	iocb_put(iocb);
 }
 
 /* aio_read_events_ring
-- 
2.18.0