Use the block layer (submit_bio) to pass the IO through to the NVMe
driver instead of the direct IO submission hooks.

Currently this code supports only read, write and flush (no discard or
write zeroes yet), and it still assumes that the underlying device is
driven by the nvme driver.
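
A condensed sketch of the polled-bio pattern this relies on: each command
becomes a bio marked REQ_HIPRI, submit_bio() hands back a cookie, and the
IO thread later kicks blk_poll() until the completion callback has fired.
Everything named demo_* below is illustrative and not part of this patch;
the sketch assumes a kernel of this series' vintage (submit_bio() still
returns a blk_qc_t) and a target queue with polling enabled:

	#include <linux/bio.h>
	#include <linux/blkdev.h>

	static void demo_bio_done(struct bio *bio)
	{
		/* may run from IRQ context even on a polled queue */
		WRITE_ONCE(*(bool *)bio->bi_private, true);
	}

	/* synchronously read one page from 'part', polling for completion */
	static int demo_polled_read(struct block_device *part,
				    struct page *page, sector_t sector)
	{
		struct request_queue *q = bdev_get_queue(part);
		bool done = false;
		blk_qc_t cookie;
		struct bio *bio;

		bio = bio_alloc(GFP_KERNEL, 1);
		if (!bio)
			return -ENOMEM;

		bio_set_dev(bio, part);
		bio->bi_iter.bi_sector = sector;
		bio->bi_end_io = demo_bio_done;
		bio->bi_private = &done;
		bio_set_op_attrs(bio, REQ_OP_READ, REQ_HIPRI);

		if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
			bio_put(bio);
			return -ENOMEM;
		}

		cookie = submit_bio(bio);
		while (!READ_ONCE(done))
			blk_poll(q, cookie, false);

		bio_put(bio);
		return 0;
	}

The code below never blocks like this; it submits, keeps the cookie and
the in-flight bio on a per-queue list, and polls opportunistically from
the IO thread, batching completions.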


Signed-off-by: Maxim Levitsky <mlevi...@redhat.com>
---
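Note for reviewers: the per-command state is carried in a wrapper struct
around the bio itself, using the bioset front-pad idiom, so the completion
callback can recover it with container_of(). A minimal sketch of the idiom
(demo_* names are illustrative, not the ones added by this patch):

	#include <linux/bio.h>

	struct demo_io {
		u32 tag;		/* private per-command state */
		struct bio bio;		/* must be the last member */
	};

	static struct bio_set demo_bioset;

	static int demo_bioset_setup(void)
	{
		/* front_pad: bytes reserved in front of every bio */
		return bioset_init(&demo_bioset, 128,
				   offsetof(struct demo_io, bio),
				   BIOSET_NEED_BVECS);
	}

	static void demo_end_io(struct bio *bio)
	{
		struct demo_io *io = container_of(bio, struct demo_io, bio);

		pr_info("completed command, tag %u\n", io->tag);
	}

Allocation then goes through bio_alloc_bioset(GFP_KERNEL, nr_pages,
&demo_bioset), and container_of() on the returned bio yields the wrapper.
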
 drivers/nvme/mdev/Kconfig |   8 ++
 drivers/nvme/mdev/host.c  | 239 +++++++++++++++++++++++++++++++++++++-
 drivers/nvme/mdev/io.c    |   7 ++
 drivers/nvme/mdev/priv.h  |  61 ++++++++++
 4 files changed, 313 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/mdev/Kconfig b/drivers/nvme/mdev/Kconfig
index 7ebc66cdeac0..1ace298a364d 100644
--- a/drivers/nvme/mdev/Kconfig
+++ b/drivers/nvme/mdev/Kconfig
@@ -14,3 +14,11 @@ config NVME_MDEV_VFIO
          guest, also as a NVME namespace, attached to a virtual NVME
          controller
          If unsure, say N.
+
+config NVME_MDEV_VFIO_GENERIC_IO
+       bool "Use generic block layer IO"
+       depends on NVME_MDEV_VFIO
+       help
+         Send the IO through the block layer using polled IO queues,
+         instead of dedicated mdev queues.
+         If unsure, say N.
diff --git a/drivers/nvme/mdev/host.c b/drivers/nvme/mdev/host.c
index 6590946b86c2..a2ba69dcf4f2 100644
--- a/drivers/nvme/mdev/host.c
+++ b/drivers/nvme/mdev/host.c
@@ -53,6 +53,7 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl)
        if (!hctrl)
                return NULL;
 
+#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
        nr_host_queues = ctrl->ops->ext_queues_available(ctrl);
        max_lba_transfer = ctrl->max_hw_sectors >> (PAGE_SHIFT - 9);
 
@@ -63,6 +64,15 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl)
                return NULL;
        }
 
+       hctrl->oncs = ctrl->oncs &
+               (NVME_CTRL_ONCS_DSM | NVME_CTRL_ONCS_WRITE_ZEROES);
+#else
+       /* for now don't deal with bio chaining */
+       max_lba_transfer = BIO_MAX_PAGES;
+       nr_host_queues = MDEV_NVME_NUM_BIO_QUEUES;
+       /* for now, no support for write zeroes and discard */
+       hctrl->oncs = 0;
+#endif
 
        kref_init(&hctrl->ref);
        mutex_init(&hctrl->lock);
@@ -70,8 +80,6 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl)
        hctrl->nvme_ctrl = ctrl;
        nvme_get_ctrl(ctrl);
 
-       hctrl->oncs = ctrl->oncs &
-               (NVME_CTRL_ONCS_DSM | NVME_CTRL_ONCS_WRITE_ZEROES);
 
        hctrl->id = ctrl->instance;
        hctrl->node = dev_to_node(ctrl->dev);
@@ -200,6 +208,8 @@ bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode)
        }
 }
 
+#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
+
 /* Allocate a host IO queue */
 int nvme_mdev_hctrl_hq_alloc(struct nvme_mdev_hctrl *hctrl)
 {
@@ -228,6 +238,7 @@ bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid)
 
 /* Submit a IO passthrough command */
 int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl,
+                             struct nvme_mdev_vns *vns,
                              u16 qid, u32 tag,
                              struct nvme_command *cmd,
                              struct nvme_ext_data_iter *datait)
@@ -248,6 +259,226 @@ int nvme_mdev_hctrl_hq_poll(struct nvme_mdev_hctrl *hctrl,
        return ctrl->ops->ext_queue_poll(ctrl, qid, results, max_len);
 }
 
+#else
+
+/* Allocate a 'host' queue - here the queues are virtual */
+int nvme_mdev_hctrl_hq_alloc(struct nvme_mdev_hctrl *hctrl)
+{
+       int qid, ret;
+       struct hw_mbio_queue *hwq;
+
+       for (qid = 0; qid < MDEV_NVME_NUM_BIO_QUEUES; qid++)
+               if (!hctrl->hw_queues[qid])
+                       break;
+
+       if (qid == MDEV_NVME_NUM_BIO_QUEUES)
+               return -ENOSPC;
+
+       hwq = kzalloc_node(sizeof(*hwq), GFP_KERNEL, hctrl->node);
+       if (!hwq)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&hwq->bios_in_flight);
+
+       ret = bioset_init(&hwq->bioset, MDEV_NVME_BIO_QUEUE_SIZE,
+                         offsetof(struct mbio, bio), BIOSET_NEED_BVECS);
+
+       if (ret < 0) {
+               kfree(hwq);
+               return ret;
+       }
+
+       hctrl->hw_queues[qid] = hwq;
+       return qid + 1;
+}
+
+/* Free a 'host' queue - here the queues are virtual */
+void nvme_mdev_hctrl_hq_free(struct nvme_mdev_hctrl *hctrl, u16 qid)
+{
+       struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1];
+
+       if (WARN_ON(!hwq))
+               return;
+
+       WARN_ON(!list_empty(&hwq->bios_in_flight));
+       WARN_ON(hwq->inflight);
+
+       hctrl->hw_queues[qid - 1] = NULL;
+       bioset_exit(&hwq->bioset);
+       kfree(hwq);
+}
+
+/*
+ * Check if the host queue has space for submission. This is our own
+ * limit, separate from anything the block layer imposes.
+ */
+bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid)
+{
+       struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1];
+
+       if (WARN_ON(!hwq))
+               return false;
+       return hwq->inflight < MDEV_NVME_BIO_QUEUE_SIZE;
+}
+
+/*
+ * Completion callback we get from the block layer.
+ * Note that despite polling, this can run from IRQ context.
+ */
+static void nvme_mdev_hctrl_bio_done(struct bio *bio)
+{
+       struct mbio *mbio = container_of(bio, struct mbio, bio);
+
+       /* this will mark this bio as done, and allow the polling thread
+        * to return it to the user
+        */
+       mbio->status = nvme_mdev_translate_error_block(bio->bi_status);
+}
+
+/* Submit an IO passthrough command */
+int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl,
+                             struct nvme_mdev_vns *vns,
+                             u16 qid, u32 tag,
+                             struct nvme_command *cmd,
+                             struct nvme_ext_data_iter *datait)
+{
+       struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1];
+       struct bio *bio = NULL;
+       struct mbio *mbio;
+       struct page *page;
+       u8 opcode = cmd->common.opcode;
+       int retval, op, op_flags = 0;
+       int offset;
+
+       if (WARN_ON(!hwq))
+               return -EINVAL;
+       if (WARN_ON(hwq->inflight >= MDEV_NVME_BIO_QUEUE_SIZE))
+               return -EBUSY;
+
+       /* read/write buffer processing */
+       if (opcode == nvme_cmd_read || opcode == nvme_cmd_write) {
+               unsigned long datalength =
+                       (le16_to_cpu(cmd->rw.length) + 1) << vns->blksize_shift;
+
+               if (opcode == nvme_cmd_read) {
+                       op = REQ_OP_READ;
+               } else {
+                       op = REQ_OP_WRITE;
+                       op_flags = REQ_SYNC | REQ_IDLE;
+                       if (cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
+                               op_flags |= REQ_FUA;
+               }
+
+               if (WARN_ON(datait->count > BIO_MAX_PAGES))
+                       return -EINVAL;
+
+               bio = bio_alloc_bioset(GFP_KERNEL, datait->count, &hwq->bioset);
+               if (WARN_ON(!bio))
+                       return -ENOMEM;
+
+               mbio = container_of(bio, struct mbio, bio);
+
+               /* starting sector */
+               bio->bi_iter.bi_sector = le64_to_cpu(cmd->rw.slba) <<
+                               (vns->blksize_shift - 9);
+
+               /* Data. The last page might be of partial size */
+               while (datait->count) {
+                       int chunk = min(PAGE_SIZE, datalength);
+
+                       if (WARN_ON(datalength == 0))
+                               break;
+
+                       page = pfn_to_page(PHYS_PFN(datait->physical));
+                       offset = OFFSET_IN_PAGE(datait->physical);
+
+                       if (bio_add_page(&mbio->bio, page,
+                                        chunk, offset) != chunk) {
+                               WARN_ON(1);
+                               retval = -ENOMEM;
+                               goto error;
+                       }
+
+                       retval = datait->next(datait);
+                       if (WARN_ON(retval))
+                               goto error;
+                       datalength -= chunk;
+               }
+
+       /* flush request */
+       } else if (opcode == nvme_cmd_flush) {
+               op = REQ_OP_WRITE;
+               op_flags = REQ_PREFLUSH;
+               bio = bio_alloc_bioset(GFP_KERNEL, 0, &hwq->bioset);
+               if (WARN_ON(!bio))
+                       return -ENOMEM;
+               mbio = container_of(bio, struct mbio, bio);
+       } else {
+               retval = -EINVAL;
+               goto error;
+       }
+
+       /* set polling */
+       op_flags |= REQ_HIPRI | REQ_NOWAIT;
+
+       /* setup the bio */
+       bio_set_dev(bio, vns->host_part);
+       bio->bi_end_io = nvme_mdev_hctrl_bio_done;
+       bio_set_op_attrs(bio, op, op_flags);
+
+       /* set up our private portion of the bio */
+       mbio = container_of(bio, struct mbio, bio);
+       mbio->tag = tag;
+       mbio->status = NVME_STATUS_PENDING;
+       mbio->blk_queue = bdev_get_queue(vns->host_part);
+
+       /* submit the bio */
+       mbio->cookie = submit_bio(bio);
+
+       list_add_tail(&mbio->link, &hwq->bios_in_flight);
+       hwq->inflight++;
+       return 0;
+error:
+       if (bio)
+               bio_put(bio);
+       return retval;
+}
+
+/* Poll for completion of IO passthrough commands */
+int nvme_mdev_hctrl_hq_poll(struct nvme_mdev_hctrl *hctrl,
+                           u32 qid,
+                           struct nvme_ext_cmd_result *results,
+                           unsigned int max_len)
+{
+       struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1];
+       struct mbio *mbio, *tmp;
+       int i = 0;
+
+
+       if (!hwq->inflight)
+               return -1;
+
+       list_for_each_entry_safe(mbio, tmp, &hwq->bios_in_flight, link) {
+               if (mbio->status == NVME_STATUS_PENDING)
+                       blk_poll(mbio->blk_queue, mbio->cookie, false);
+
+               if (mbio->status == NVME_STATUS_PENDING)
+                       continue;
+
+               results[i].tag = mbio->tag;
+               results[i].status = mbio->status;
+
+               hwq->inflight--;
+               list_del(&mbio->link);
+               bio_put(&mbio->bio);
+
+               if (++i == max_len)
+                       break;
+       }
+       return i;
+}
+#endif
+
 /* Destroy all host controllers */
 void nvme_mdev_hctrl_destroy_all(void)
 {
@@ -486,6 +717,10 @@ static int __init nvme_mdev_init(void)
        }
 
        pr_info("nvme_mdev " NVME_MDEV_FIRMWARE_VERSION " loaded\n");
+
+#ifdef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
+       pr_info("nvme_mdev: using block layer polled IO\n");
+#endif
        return 0;
 }
 
diff --git a/drivers/nvme/mdev/io.c b/drivers/nvme/mdev/io.c
index 39550d0e3649..d3c46de33b01 100644
--- a/drivers/nvme/mdev/io.c
+++ b/drivers/nvme/mdev/io.c
@@ -70,7 +70,11 @@ static int nvme_mdev_io_translate_rw(struct io_ctx *ctx)
        if (!check_range(slba, length, ctx->ns->ns_size))
                return DNR(NVME_SC_LBA_RANGE);
 
+#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
        ctx->out.rw.slba = cpu_to_le64(slba + ctx->ns->host_lba_offset);
+#else
+       ctx->out.rw.slba = in->slba;
+#endif
        ctx->out.rw.length = in->length;
 
        ret = nvme_mdev_udata_iter_set_dptr(&ctx->udatait, &in->dptr,
@@ -195,7 +199,9 @@ static int nvme_mdev_io_translate_dsm(struct io_ctx *ctx)
                _DBG(ctx->vctrl, "IOQ: DSM_MANAGEMENT: RANGE 0x%llx-0x%x\n",
                     slba, nlb);
 
+#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
                data_ptr[i].slba = cpu_to_le64(slba + ctx->ns->host_lba_offset);
+#endif
        }
 
        ctx->out.dsm.attributes = in->attributes;
@@ -280,6 +286,7 @@ static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid)
 
        /*passthrough*/
        ret = nvme_mdev_hctrl_hq_submit(ctx->hctrl,
+                                       ctx->ns,
                                        vsq->hsq,
                                        (((u32)vsq->qid) << 16) | ((u32)ucid),
                                        &ctx->out,
diff --git a/drivers/nvme/mdev/priv.h b/drivers/nvme/mdev/priv.h
index a11a1842957d..1dd5fce0bfa6 100644
--- a/drivers/nvme/mdev/priv.h
+++ b/drivers/nvme/mdev/priv.h
@@ -34,7 +34,12 @@
 #define MAX_VIRTUAL_NAMESPACES 16 /* NSID = 1..16*/
 #define MAX_VIRTUAL_IRQS 16
 
+#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
 #define MAX_HOST_QUEUES 4
+#else
+#define MAX_HOST_QUEUES 1
+#endif
+
 #define MAX_AER_COMMANDS 16
 #define MAX_LOG_PAGES 16
 
@@ -323,6 +328,39 @@ struct nvme_mdev_inst_type {
        struct attribute_group *attrgroup;
 };
 
+#ifdef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
+
+#define MDEV_NVME_BIO_QUEUE_SIZE 128
+#define NVME_STATUS_PENDING 0xFFFF
+#define MDEV_NVME_NUM_BIO_QUEUES 16
+
+struct mbio {
+       /* link in the list of pending bios */
+       struct list_head link;
+
+       struct request_queue *blk_queue;
+
+       /* polling cookie returned by submit_bio() */
+       unsigned int cookie;
+
+       /* tag from the translation (user cid + user qid) */
+       u32 tag;
+
+       /* result NVME status */
+       u16 status;
+
+       /* must be last for bioset front-pad allocation */
+       struct bio bio;
+};
+
+struct hw_mbio_queue {
+       int inflight;
+       struct list_head bios_in_flight;
+       struct bio_set bioset;
+};
+
+#endif
+
 /*Abstraction of the host controller that we are connected to */
 struct nvme_mdev_hctrl {
        struct mutex lock;
@@ -344,6 +382,10 @@ struct nvme_mdev_hctrl {
 
        /* book-keeping for number of host queues we can allocate*/
        unsigned int nr_host_queues;
+
+#ifdef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
+       struct hw_mbio_queue *hw_queues[MDEV_NVME_NUM_BIO_QUEUES];
+#endif
 };
 
 /* vctrl.c*/
@@ -415,6 +457,7 @@ bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid);
 bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode);
 
 int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl,
+                             struct nvme_mdev_vns *vns,
                              u16 qid, u32 tag,
                              struct nvme_command *cmd,
                              struct nvme_ext_data_iter *datait);
@@ -701,6 +744,24 @@ static inline int nvme_mdev_translate_error(int error)
        }
 }
 
+static inline int nvme_mdev_translate_error_block(blk_status_t blk_sts)
+{
+       switch (blk_sts) {
+       case BLK_STS_OK:
+               return NVME_SC_SUCCESS;
+       case BLK_STS_NOSPC:
+               return DNR(NVME_SC_CAP_EXCEEDED);
+       case BLK_STS_TARGET:
+               return DNR(NVME_SC_LBA_RANGE);
+       case BLK_STS_NOTSUPP:
+               return DNR(NVME_SC_INVALID_OPCODE);
+       case BLK_STS_MEDIUM:
+               return DNR(NVME_SC_ACCESS_DENIED);
+       default:
+               return DNR(NVME_SC_INTERNAL);
+       }
+}
+
 static inline bool timeout(ktime_t event, ktime_t now, unsigned long timeout_ms)
 {
        return ktime_ms_delta(now, event) > (long)timeout_ms;
-- 
2.17.2
