Add an async_ioctl handler that implements asynchronous handling of ioctl operations. If the requested ioctl opcode does not involve submitting a command to the device (e.g. NVME_IOCTL_ID), it returns instantly. Otherwise, ioctl completion is decoupled from submission, and -EIOCBQUEUED is returned after submission. When the completion arrives from the device, nvme invokes the ioctl-completion handler supplied by the upper layer.

There is one exception to this: an ioctl completion may also require updating certain ioctl-specific user buffers/fields (e.g. the result field of the passthrough command, or the metadata buffer) which can be accessed only in the context of the original submitter task. For such an ioctl, the nvme completion schedules a task-work which first updates the ioctl-specific buffers/fields and then invokes the ioctl-completion handler.
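For reference, the handlers below rely on the pt_ioctl_ctx context introduced by the earlier block-layer patch in this series. Going by how this patch dereferences it, the context has roughly the following shape (a sketch inferred from usage; the authoritative definition lives in the block-layer patch, and anything beyond the four members used here is an assumption):

struct pt_ioctl_ctx {
	void *ioc_data;			/* driver-private data; async_pt_desc here */
	struct task_struct *task;	/* submitter task, for task-work updates */
	struct callback_head pt_work;	/* task-work hook */
	/* upper layer's completion callback, invoked with the ioctl status */
	void (*pt_complete)(struct pt_ioctl_ctx *ptioc, int status);
};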
Signed-off-by: Kanchan Joshi <josh...@samsung.com>
Signed-off-by: Anuj Gupta <anuj2...@samsung.com>
---
 drivers/nvme/host/core.c | 347 +++++++++++++++++++++++++++++++--------
 1 file changed, 280 insertions(+), 67 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 200bdd672c28..57f3040bae34 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -21,6 +21,7 @@
 #include <linux/nvme_ioctl.h>
 #include <linux/pm_qos.h>
 #include <asm/unaligned.h>
+#include <linux/task_work.h>
 
 #include "nvme.h"
 #include "fabrics.h"
@@ -1092,7 +1093,107 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
 	}
 }
 
-void nvme_execute_passthru_rq(struct request *rq)
+struct async_pt_desc {
+	struct bio *bio;
+	int status;		/* command status */
+	u64 result;		/* nvme cmd result */
+	void __user *res_ptr;	/* may be NULL, a 32-bit or a 64-bit address */
+	void __user *meta_ptr;
+	void *meta;		/* kernel-space resident metadata buffer */
+	unsigned metalen;	/* length of meta */
+	bool is_res64 : 1;	/* res_ptr refers to 64 bits of space */
+	bool is_write : 1;
+	bool is_taskwork : 1;
+};
+
+static int nvme_add_task_work(struct task_struct *tsk,
+		struct callback_head *twork, task_work_func_t work_func)
+{
+	int ret;
+
+	get_task_struct(tsk);
+	init_task_work(twork, work_func);
+	ret = task_work_add(tsk, twork, TWA_SIGNAL);
+	if (!ret)
+		wake_up_process(tsk);
+	return ret;
+}
+
+static void async_pt_update_work(struct callback_head *cbh)
+{
+	struct pt_ioctl_ctx *ptioc;
+	struct async_pt_desc *ptd;
+	struct task_struct *tsk;
+	int ret;
+
+	ptioc = container_of(cbh, struct pt_ioctl_ctx, pt_work);
+	ptd = ptioc->ioc_data;
+	tsk = ptioc->task;
+
+	/* handle meta update */
+	if (ptd->meta) {
+		if (!ptd->status && !ptd->is_write)
+			if (copy_to_user(ptd->meta_ptr, ptd->meta, ptd->metalen))
+				ptd->status = -EFAULT;
+		kfree(ptd->meta);
+	}
+	/* handle result update */
+	if (ptd->res_ptr) {
+		if (!ptd->is_res64)
+			ret = put_user(ptd->result, (u32 __user *)ptd->res_ptr);
+		else
+			ret = put_user(ptd->result, (u64 __user *)ptd->res_ptr);
+		if (ret)
+			ptd->status = -EFAULT;
+	}
+
+	ptioc->pt_complete(ptioc, ptd->status);
+	put_task_struct(tsk);
+	kfree(ptd);
+}
+
+static void nvme_end_async_pt(struct request *req, blk_status_t err)
+{
+	struct pt_ioctl_ctx *ptioc;
+	struct async_pt_desc *ptd;
+	struct bio *bio;
+
+	ptioc = req->end_io_data;
+	ptd = ptioc->ioc_data;
+
+	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+		ptd->status = -EINTR;
+	else
+		ptd->status = nvme_req(req)->status;
+
+	ptd->result = le64_to_cpu(nvme_req(req)->result.u64);
+	bio = ptd->bio;
+	/* set up task-work if needed */
+	if (ptd->is_taskwork) {
+		int ret = nvme_add_task_work(ptioc->task, &ptioc->pt_work,
+				async_pt_update_work);
+		/* report failure if task-work could not be set up */
+		if (ret < 0) {
+			put_task_struct(ptioc->task);
+			ptioc->pt_complete(ptioc, ret);
+			kfree(ptd->meta);
+			kfree(ptd);
+		}
+	} else {
+		/* return status via callback, nothing else to update */
+		ptioc->pt_complete(ptioc, ptd->status);
+		kfree(ptd);
+	}
+
+	/* unmap pages, free bio, nvme command and request */
+	blk_rq_unmap_user(bio);
+	kfree(nvme_req(req)->cmd);
+	blk_mq_free_request(req);
+}
+
+void nvme_execute_passthru_rq_common(struct request *rq, int async)
 {
 	struct nvme_command *cmd = nvme_req(rq)->cmd;
 	struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
@@ -1101,15 +1202,52 @@ void nvme_execute_passthru_rq(struct request *rq)
 	u32 effects;
 
 	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
-	blk_execute_rq(rq->q, disk, rq, 0);
+	if (!async)
+		blk_execute_rq(rq->q, disk, rq, 0);
+	else
+		blk_execute_rq_nowait(rq->q, disk, rq, 0, nvme_end_async_pt);
 	nvme_passthru_end(ctrl, effects);
 }
+
+void nvme_execute_passthru_rq(struct request *rq)
+{
+	nvme_execute_passthru_rq_common(rq, 0);
+}
 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
 
+static int setup_async_pt_desc(struct request *rq, struct pt_ioctl_ctx *ptioc,
+		void __user *resptr, void __user *meta_buffer, void *meta,
+		unsigned meta_len, bool write, bool is_res64)
+{
+	struct async_pt_desc *ptd;
+
+	ptd = kzalloc(sizeof(struct async_pt_desc), GFP_KERNEL);
+	if (!ptd)
+		return -ENOMEM;
+
+	/* save the bio to free it on completion; req->bio is NULL by then */
+	ptd->bio = rq->bio;
+	ptd->res_ptr = resptr;
+	ptd->is_write = write;
+	ptd->is_res64 = is_res64;
+	if (meta) {
+		ptd->meta_ptr = meta_buffer;
+		ptd->meta = meta;
+		ptd->metalen = meta_len;
+	}
+	if (resptr)
+		ptd->is_taskwork = 1;
+
+	ptioc->ioc_data = ptd;
+	rq->end_io_data = ptioc;
+	return 0;
+}
+
 static int nvme_submit_user_cmd(struct request_queue *q,
 		struct nvme_command *cmd, void __user *ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		u32 meta_seed, u64 *result, unsigned timeout)
+		u32 meta_seed, u64 *result, unsigned timeout,
+		struct pt_ioctl_ctx *ptioc, bool is_res64)
 {
 	bool write = nvme_is_write(cmd);
 	struct nvme_ns *ns = q->queuedata;
@@ -1145,6 +1283,18 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 		}
 	}
 
+	if (ptioc) { /* async handling */
+		ret = setup_async_pt_desc(req, ptioc, result, meta_buffer,
+				meta, meta_len, write, is_res64);
+		if (ret) {
+			kfree(meta);
+			goto out_unmap;
+		}
+		/* send request for async processing */
+		nvme_execute_passthru_rq_common(req, 1);
+		return ret;
+	}
+	/* sync handling */
 	nvme_execute_passthru_rq(req);
 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
 		ret = -EINTR;
@@ -1521,10 +1671,11 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
 	return (void __user *)ptrval;
 }
 
-static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio,
+		struct pt_ioctl_ctx *ptioc)
 {
 	struct nvme_user_io io;
-	struct nvme_command c;
+	struct nvme_command c, *cptr;
 	unsigned length, meta_len;
+	int status;
 	void __user *metadata;
 
@@ -1554,31 +1705,42 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 		return -EINVAL;
 	}
 
-	memset(&c, 0, sizeof(c));
-	c.rw.opcode = io.opcode;
-	c.rw.flags = io.flags;
-	c.rw.nsid = cpu_to_le32(ns->head->ns_id);
-	c.rw.slba = cpu_to_le64(io.slba);
-	c.rw.length = cpu_to_le16(io.nblocks);
-	c.rw.control = cpu_to_le16(io.control);
-	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
-	c.rw.reftag = cpu_to_le32(io.reftag);
-	c.rw.apptag = cpu_to_le16(io.apptag);
-	c.rw.appmask = cpu_to_le16(io.appmask);
-
-	return nvme_submit_user_cmd(ns->queue, &c,
+	if (!ptioc) {
+		cptr = &c;
+	} else {
+		/* for async - allocate the command dynamically */
+		cptr = kmalloc(sizeof(struct nvme_command), GFP_KERNEL);
+		if (!cptr)
+			return -ENOMEM;
+	}
+
+	memset(cptr, 0, sizeof(c));
+	cptr->rw.opcode = io.opcode;
+	cptr->rw.flags = io.flags;
+	cptr->rw.nsid = cpu_to_le32(ns->head->ns_id);
+	cptr->rw.slba = cpu_to_le64(io.slba);
+	cptr->rw.length = cpu_to_le16(io.nblocks);
+	cptr->rw.control = cpu_to_le16(io.control);
+	cptr->rw.dsmgmt = cpu_to_le32(io.dsmgmt);
+	cptr->rw.reftag = cpu_to_le32(io.reftag);
+	cptr->rw.apptag = cpu_to_le16(io.apptag);
+	cptr->rw.appmask = cpu_to_le16(io.appmask);
+
+	status = nvme_submit_user_cmd(ns->queue, cptr,
 			nvme_to_user_ptr(io.addr), length,
-			metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
+			metadata, meta_len, lower_32_bits(io.slba), NULL, 0,
+			ptioc, 0);
+	/* async case, free cmd in case of error */
+	if (ptioc && status < 0)
+		kfree(cptr);
+	return status;
 }
 
 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-			struct nvme_passthru_cmd __user *ucmd)
+			struct nvme_passthru_cmd __user *ucmd,
+			struct pt_ioctl_ctx *ptioc)
 {
 	struct nvme_passthru_cmd cmd;
-	struct nvme_command c;
+	struct nvme_command c, *cptr;
 	unsigned timeout = 0;
 	u64 result;
 	int status;
+	void *resptr;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -1586,43 +1748,61 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		return -EFAULT;
 	if (cmd.flags)
 		return -EINVAL;
+	if (!ptioc) {
+		cptr = &c;
+		resptr = &result;
+	} else {
+		/*
+		 * for async - (a) allocate the command dynamically and
+		 * (b) point the result at the user-space address
+		 */
+		cptr = kmalloc(sizeof(struct nvme_command), GFP_KERNEL);
+		if (!cptr)
+			return -ENOMEM;
+		resptr = &ucmd->result;
+	}
 
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = cmd.opcode;
-	c.common.flags = cmd.flags;
-	c.common.nsid = cpu_to_le32(cmd.nsid);
-	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
-	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
-	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
-	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
-	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
-	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
-	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
-	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
+	memset(cptr, 0, sizeof(c));
+	cptr->common.opcode = cmd.opcode;
+	cptr->common.flags = cmd.flags;
+	cptr->common.nsid = cpu_to_le32(cmd.nsid);
+	cptr->common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+	cptr->common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+	cptr->common.cdw10 = cpu_to_le32(cmd.cdw10);
+	cptr->common.cdw11 = cpu_to_le32(cmd.cdw11);
+	cptr->common.cdw12 = cpu_to_le32(cmd.cdw12);
+	cptr->common.cdw13 = cpu_to_le32(cmd.cdw13);
+	cptr->common.cdw14 = cpu_to_le32(cmd.cdw14);
+	cptr->common.cdw15 = cpu_to_le32(cmd.cdw15);
 
 	if (cmd.timeout_ms)
 		timeout = msecs_to_jiffies(cmd.timeout_ms);
 
-	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, cptr,
 			nvme_to_user_ptr(cmd.addr), cmd.data_len,
 			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
-			0, &result, timeout);
+			0, resptr, timeout, ptioc, 0);
 
-	if (status >= 0) {
+	if (!ptioc && status >= 0) {
 		if (put_user(result, &ucmd->result))
 			return -EFAULT;
 	}
+	/* async case, free cmd in case of error */
+	if (ptioc && status < 0)
+		kfree(cptr);
 
 	return status;
 }
 
 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-			struct nvme_passthru_cmd64 __user *ucmd)
+			struct nvme_passthru_cmd64 __user *ucmd,
+			struct pt_ioctl_ctx *ptioc)
 {
 	struct nvme_passthru_cmd64 cmd;
-	struct nvme_command c;
+	struct nvme_command c, *cptr;
 	unsigned timeout = 0;
 	int status;
+	void *resptr;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -1631,31 +1811,43 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (cmd.flags)
 		return -EINVAL;
 
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = cmd.opcode;
-	c.common.flags = cmd.flags;
-	c.common.nsid = cpu_to_le32(cmd.nsid);
-	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
-	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
-	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
-	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
-	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
-	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
-	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
-	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
+	if (!ptioc) {
+		cptr = &c;
+		resptr = &cmd.result;
+	} else {
+		cptr = kmalloc(sizeof(struct nvme_command), GFP_KERNEL);
+		if (!cptr)
+			return -ENOMEM;
+		resptr = &ucmd->result;
+	}
+
+	memset(cptr, 0, sizeof(struct nvme_command));
+	cptr->common.opcode = cmd.opcode;
+	cptr->common.flags = cmd.flags;
+	cptr->common.nsid = cpu_to_le32(cmd.nsid);
+	cptr->common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+	cptr->common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+	cptr->common.cdw10 = cpu_to_le32(cmd.cdw10);
+	cptr->common.cdw11 = cpu_to_le32(cmd.cdw11);
+	cptr->common.cdw12 = cpu_to_le32(cmd.cdw12);
+	cptr->common.cdw13 = cpu_to_le32(cmd.cdw13);
+	cptr->common.cdw14 = cpu_to_le32(cmd.cdw14);
+	cptr->common.cdw15 = cpu_to_le32(cmd.cdw15);
 
 	if (cmd.timeout_ms)
 		timeout = msecs_to_jiffies(cmd.timeout_ms);
 
-	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, cptr,
 			nvme_to_user_ptr(cmd.addr), cmd.data_len,
 			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
-			0, &cmd.result, timeout);
+			0, resptr, timeout, ptioc, 1);
 
-	if (status >= 0) {
+	if (!ptioc && status >= 0) {
 		if (put_user(cmd.result, &ucmd->result))
 			return -EFAULT;
 	}
+	/* async case, free cmd in case of error */
+	if (ptioc && status < 0)
+		kfree(cptr);
 
 	return status;
 }
@@ -1702,7 +1894,8 @@ static bool is_ctrl_ioctl(unsigned int cmd)
 
 static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
 		void __user *argp, struct nvme_ns_head *head,
-		int srcu_idx)
+		int srcu_idx,
+		struct pt_ioctl_ctx *ptioc)
 {
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	int ret;
@@ -1712,21 +1905,24 @@ static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
 
 	switch (cmd) {
 	case NVME_IOCTL_ADMIN_CMD:
-		ret = nvme_user_cmd(ctrl, NULL, argp);
+		ret = nvme_user_cmd(ctrl, NULL, argp, ptioc);
 		break;
 	case NVME_IOCTL_ADMIN64_CMD:
-		ret = nvme_user_cmd64(ctrl, NULL, argp);
+		ret = nvme_user_cmd64(ctrl, NULL, argp, ptioc);
 		break;
 	default:
-		ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
+		if (!ptioc)
+			ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
+		else
+			ret = -EOPNOTSUPP; /* RFC: no support for now */
 		break;
 	}
 	nvme_put_ctrl(ctrl);
 	return ret;
 }
 
-static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
-		unsigned int cmd, unsigned long arg)
+static int nvme_async_ioctl(struct block_device *bdev, fmode_t mode,
+		unsigned int cmd, unsigned long arg, struct pt_ioctl_ctx *ptioc)
 {
 	struct nvme_ns_head *head = NULL;
 	void __user *argp = (void __user *)arg;
@@ -1743,33 +1939,49 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
 	 * deadlock when deleting namespaces using the passthrough interface.
 	 */
 	if (is_ctrl_ioctl(cmd))
-		return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
+		return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
+				ptioc);
 
 	switch (cmd) {
 	case NVME_IOCTL_ID:
 		force_successful_syscall_return();
 		ret = ns->head->ns_id;
+		if (ptioc)
+			goto put_ns; /* always return in sync fashion */
 		break;
 	case NVME_IOCTL_IO_CMD:
-		ret = nvme_user_cmd(ns->ctrl, ns, argp);
+		ret = nvme_user_cmd(ns->ctrl, ns, argp, ptioc);
 		break;
 	case NVME_IOCTL_SUBMIT_IO:
-		ret = nvme_submit_io(ns, argp);
+		ret = nvme_submit_io(ns, argp, ptioc);
 		break;
 	case NVME_IOCTL_IO64_CMD:
-		ret = nvme_user_cmd64(ns->ctrl, ns, argp);
+		ret = nvme_user_cmd64(ns->ctrl, ns, argp, ptioc);
 		break;
 	default:
+		if (ptioc) {
+			/* RFC: don't support this for now */
+			ret = -EOPNOTSUPP;
+			break;
+		}
 		if (ns->ndev)
 			ret = nvme_nvm_ioctl(ns, cmd, arg);
 		else
 			ret = -ENOTTY;
 	}
-
+	/* if there is no error, return queued for async ioctl */
+	if (ptioc && ret >= 0)
+		ret = -EIOCBQUEUED;
+put_ns:
 	nvme_put_ns_from_disk(head, srcu_idx);
 	return ret;
 }
 
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
+		unsigned int cmd, unsigned long arg)
+{
+	return nvme_async_ioctl(bdev, mode, cmd, arg, NULL);
+}
+
 #ifdef CONFIG_COMPAT
 struct nvme_user_io32 {
 	__u8	opcode;
@@ -2324,6 +2536,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit);
 static const struct block_device_operations nvme_bdev_ops = {
 	.owner		= THIS_MODULE,
 	.ioctl		= nvme_ioctl,
+	.async_ioctl	= nvme_async_ioctl,
 	.compat_ioctl	= nvme_compat_ioctl,
 	.open		= nvme_open,
 	.release	= nvme_release,
@@ -3261,7 +3474,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
 	kref_get(&ns->kref);
 	up_read(&ctrl->namespaces_rwsem);
 
-	ret = nvme_user_cmd(ctrl, ns, argp);
+	ret = nvme_user_cmd(ctrl, ns, argp, NULL);
 	nvme_put_ns(ns);
 
 	return ret;
@@ -3278,9 +3491,9 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 
 	switch (cmd) {
 	case NVME_IOCTL_ADMIN_CMD:
-		return nvme_user_cmd(ctrl, NULL, argp);
+		return nvme_user_cmd(ctrl, NULL, argp, NULL);
 	case NVME_IOCTL_ADMIN64_CMD:
-		return nvme_user_cmd64(ctrl, NULL, argp);
+		return nvme_user_cmd64(ctrl, NULL, argp, NULL);
 	case NVME_IOCTL_IO_CMD:
 		return nvme_dev_user_cmd(ctrl, argp);
 	case NVME_IOCTL_RESET:
-- 
2.25.1
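As a usage illustration (not part of the patch): an upper layer is expected to fill the context and treat -EIOCBQUEUED as "completion will arrive later". The sketch below shows one plausible caller; submit_async_ioctl and my_pt_complete are hypothetical names, and only async_ioctl, pt_ioctl_ctx and the -EIOCBQUEUED convention come from this series.

/* hypothetical caller sketch, assuming the pt_ioctl_ctx shape noted above */
static void my_pt_complete(struct pt_ioctl_ctx *ptioc, int status)
{
	/*
	 * Runs once the ioctl finishes, possibly via task-work in the
	 * submitter's context; report status to the original submitter here.
	 */
}

static int submit_async_ioctl(struct block_device *bdev, unsigned int cmd,
		unsigned long arg, struct pt_ioctl_ctx *ptioc)
{
	int ret;

	ptioc->task = current;		/* task-work must run in this task */
	ptioc->pt_complete = my_pt_complete;

	ret = bdev->bd_disk->fops->async_ioctl(bdev, 0, cmd, arg, ptioc);
	if (ret != -EIOCBQUEUED)
		return ret;	/* completed synchronously or failed early */
	return 0;		/* my_pt_complete() will be invoked later */
}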