[PATCH 02/13] nvme-pci: use atomic bitops to mark a queue enabled

2018-12-02 Thread Christoph Hellwig
This gets rid of all the messing with cq_vector and the ->polled field
by using an atomic bitop to mark the queue enabled or not.
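
For reference, the resulting pattern condenses to the sketch below (not part
of the patch itself; identifiers as used in the diff):

        /* submission fast path: lock-free check */
        if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
                return BLK_STS_IOERR;

        /* teardown: atomically claim the enabled -> disabled transition */
        if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
                return 1;       /* queue was never enabled or already suspended */
        mb();                   /* make the cleared bit visible to nvme_queue_rq() */

        /* setup: mark the queue live only once CQ/SQ creation succeeded */
        set_bit(NVMEQ_ENABLED, &nvmeq->flags);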

Signed-off-by: Christoph Hellwig 
Reviewed-by: Keith Busch 
---
 drivers/nvme/host/pci.c | 43 ++---
 1 file changed, 15 insertions(+), 28 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a1bb4bb92e7f..022395a319f4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -201,7 +201,8 @@ struct nvme_queue {
u16 last_cq_head;
u16 qid;
u8 cq_phase;
-   u8 polled;
+   unsigned long flags;
+#define NVMEQ_ENABLED  0
u32 *dbbuf_sq_db;
u32 *dbbuf_cq_db;
u32 *dbbuf_sq_ei;
@@ -927,7 +928,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 * We should not need to do this, but we're still using this to
 * ensure we can drain requests on a dying queue.
 */
-   if (unlikely(nvmeq->cq_vector < 0 && !nvmeq->polled))
+   if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
return BLK_STS_IOERR;
 
ret = nvme_setup_cmd(ns, req, &cmnd);
@@ -1395,31 +1396,19 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
  */
 static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 {
-   int vector;
-
-   spin_lock_irq(&nvmeq->cq_lock);
-   if (nvmeq->cq_vector == -1 && !nvmeq->polled) {
-   spin_unlock_irq(&nvmeq->cq_lock);
+   if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
return 1;
-   }
-   vector = nvmeq->cq_vector;
-   nvmeq->dev->online_queues--;
-   nvmeq->cq_vector = -1;
-   nvmeq->polled = false;
-   spin_unlock_irq(&nvmeq->cq_lock);
 
-   /*
-* Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without
-* having to grab the lock.
-*/
+   /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
mb();
 
+   nvmeq->dev->online_queues--;
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
-
-   if (vector != -1)
-   pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq);
-
+   if (nvmeq->cq_vector == -1)
+   return 0;
+   pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
+   nvmeq->cq_vector = -1;
return 0;
 }
 
@@ -1578,13 +1567,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
else if (result)
goto release_cq;
 
-   /*
-* Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will
-* invoke free_irq for it and cause a 'Trying to free already-free IRQ
-* xxx' warning if the create CQ/SQ command times out.
-*/
nvmeq->cq_vector = vector;
-   nvmeq->polled = polled;
nvme_init_queue(nvmeq, qid);
 
if (vector != -1) {
@@ -1593,11 +1576,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
goto release_sq;
}
 
+   set_bit(NVMEQ_ENABLED, &nvmeq->flags);
return result;
 
 release_sq:
nvmeq->cq_vector = -1;
-   nvmeq->polled = false;
dev->online_queues--;
adapter_delete_sq(dev, qid);
 release_cq:
@@ -1751,6 +1734,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
return result;
}
 
+   set_bit(NVMEQ_ENABLED, &nvmeq->flags);
return result;
 }
 
@@ -2173,6 +2157,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
if (nr_io_queues == 0)
return 0;
+   
+   clear_bit(NVMEQ_ENABLED, &adminq->flags);
 
if (dev->cmb_use_sqes) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
@@ -2227,6 +2213,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
adminq->cq_vector = -1;
return result;
}
+   set_bit(NVMEQ_ENABLED, &adminq->flags);
return nvme_create_io_queues(dev);
 }
 
-- 
2.19.1



[PATCH 01/13] block: move queues types to the block layer

2018-12-02 Thread Christoph Hellwig
Having another indirect call in the fast path doesn't really help
in our post-Spectre world.  Also, having too many queue types is just
going to create confusion, so I'd rather manage them centrally.

Note that the queue type naming and ordering changes a bit - the
first index now is the default queue for everything not explicitly
marked, the optional ones are read and poll queues.
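
Condensed, the new mapping in blk_mq_map_queue() that replaces the
->rq_flags_to_type() indirection looks like this (a sketch only, matching
the blk-mq.h hunk below):

        enum hctx_type type = HCTX_TYPE_DEFAULT;

        if (q->tag_set->nr_maps > HCTX_TYPE_POLL &&
            (flags & REQ_HIPRI) && test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                type = HCTX_TYPE_POLL;
        else if (q->tag_set->nr_maps > HCTX_TYPE_READ &&
                 (flags & REQ_OP_MASK) == REQ_OP_READ)
                type = HCTX_TYPE_READ;

        return blk_mq_map_queue_type(q, type, cpu);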

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-sysfs.c|  9 +-
 block/blk-mq.h  | 21 +++--
 drivers/nvme/host/pci.c | 68 +++--
 include/linux/blk-mq.h  | 15 -
 4 files changed, 51 insertions(+), 62 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 6efef1f679f0..9c2df137256a 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -173,9 +173,16 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
return ret;
 }
 
+static const char *const hctx_types[] = {
+   [HCTX_TYPE_DEFAULT] = "default",
+   [HCTX_TYPE_READ]= "read",
+   [HCTX_TYPE_POLL]= "poll",
+};
+
 static ssize_t blk_mq_hw_sysfs_type_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
-   return sprintf(page, "%u\n", hctx->type);
+   BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES);
+   return sprintf(page, "%s\n", hctx_types[hctx->type]);
 }
 
 static struct attribute *default_ctx_attrs[] = {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 7291e5379358..a664ea44ffd4 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -81,16 +81,14 @@ extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);
 /*
  * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue
  * @q: request queue
- * @hctx_type: the hctx type index
+ * @type: the hctx type index
  * @cpu: CPU
  */
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q,
- unsigned int hctx_type,
+ enum hctx_type type,
  unsigned int cpu)
 {
-   struct blk_mq_tag_set *set = q->tag_set;
-
-   return q->queue_hw_ctx[set->map[hctx_type].mq_map[cpu]];
+   return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
 }
 
 /*
@@ -103,12 +101,17 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 unsigned int flags,
 unsigned int cpu)
 {
-   int hctx_type = 0;
+   enum hctx_type type = HCTX_TYPE_DEFAULT;
+
+   if (q->tag_set->nr_maps > HCTX_TYPE_POLL &&
+   ((flags & REQ_HIPRI) && test_bit(QUEUE_FLAG_POLL, &q->queue_flags)))
+   type = HCTX_TYPE_POLL;
 
-   if (q->mq_ops->rq_flags_to_type)
-   hctx_type = q->mq_ops->rq_flags_to_type(q, flags);
+   else if (q->tag_set->nr_maps > HCTX_TYPE_READ &&
+((flags & REQ_OP_MASK) == REQ_OP_READ))
+   type = HCTX_TYPE_READ;
 
-   return blk_mq_map_queue_type(q, hctx_type, cpu);
+   return blk_mq_map_queue_type(q, type, cpu);
 }
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 527907aa6903..a1bb4bb92e7f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -95,13 +95,6 @@ struct nvme_queue;
 
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 
-enum {
-   NVMEQ_TYPE_READ,
-   NVMEQ_TYPE_WRITE,
-   NVMEQ_TYPE_POLL,
-   NVMEQ_TYPE_NR,
-};
-
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
  */
@@ -115,7 +108,7 @@ struct nvme_dev {
struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
-   unsigned io_queues[NVMEQ_TYPE_NR];
+   unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
int q_depth;
u32 db_stride;
@@ -499,10 +492,10 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 
map->nr_queues = dev->io_queues[i];
if (!map->nr_queues) {
-   BUG_ON(i == NVMEQ_TYPE_READ);
+   BUG_ON(i == HCTX_TYPE_DEFAULT);
 
/* shared set, resuse read set parameters */
-   map->nr_queues = dev->io_queues[NVMEQ_TYPE_READ];
+   map->nr_queues = dev->io_queues[HCTX_TYPE_DEFAULT];
qoff = 0;
offset = queue_irq_offset(dev);
}
@@ -512,7 +505,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 * affinity), so use the regular blk-mq cpu mapping
 */
map->queue_offset = qoff;
-   if (i != NVMEQ_TYPE_POLL)
+   if (i != HCTX_TYPE_POLL)
blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), 
offse

[PATCH 05/13] nvme-pci: consolidate code for polling non-dedicated queues

2018-12-02 Thread Christoph Hellwig
We have three places that can poll for I/O completions on a normal
interrupt-enabled queue.  All of them are in slow path code, so
consolidate them to a single helper that uses spin_lock_irqsave and
removes the fast path cqe_pending check.
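
The three call sites all collapse onto the new helper (a sketch of the
callers as they end up after this patch):

        nvme_poll_irqdisable(nvmeq, req->tag);  /* nvme_timeout(): did we miss an IRQ? */
        nvme_poll_irqdisable(nvmeq, -1);        /* nvme_disable_admin_queue() */
        nvme_poll_irqdisable(nvmeq, -1);        /* nvme_del_cq_end(): reap CQEs on delete */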

Signed-off-by: Christoph Hellwig 
---
 drivers/nvme/host/pci.c | 35 ---
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d42bb76e5e78..10c26a2e355a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1072,17 +1072,19 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
return IRQ_NONE;
 }
 
-static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
+/*
+ * Poll for completions any queue, including those not dedicated to polling.
+ * Can be called from any context.
+ */
+static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
 {
+   unsigned long flags;
u16 start, end;
int found;
 
-   if (!nvme_cqe_pending(nvmeq))
-   return 0;
-
-   spin_lock_irq(&nvmeq->cq_lock);
+   spin_lock_irqsave(&nvmeq->cq_lock, flags);
found = nvme_process_cq(nvmeq, &start, &end, tag);
-   spin_unlock_irq(&nvmeq->cq_lock);
+   spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
 
nvme_complete_cqes(nvmeq, start, end);
return found;
@@ -1279,7 +1281,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
/*
 * Did we miss an interrupt?
 */
-   if (__nvme_poll(nvmeq, req->tag)) {
+   if (nvme_poll_irqdisable(nvmeq, req->tag)) {
dev_warn(dev->ctrl.device,
 "I/O %d QID %d timeout, completion polled\n",
 req->tag, nvmeq->qid);
@@ -1406,18 +1408,13 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 {
struct nvme_queue *nvmeq = &dev->queues[0];
-   u16 start, end;
 
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
else
nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
 
-   spin_lock_irq(&nvmeq->cq_lock);
-   nvme_process_cq(nvmeq, &start, &end, -1);
-   spin_unlock_irq(&nvmeq->cq_lock);
-
-   nvme_complete_cqes(nvmeq, start, end);
+   nvme_poll_irqdisable(nvmeq, -1);
 }
 
 static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
@@ -2217,17 +2214,9 @@ static void nvme_del_queue_end(struct request *req, 
blk_status_t error)
 static void nvme_del_cq_end(struct request *req, blk_status_t error)
 {
struct nvme_queue *nvmeq = req->end_io_data;
-   u16 start, end;
 
-   if (!error) {
-   unsigned long flags;
-
-   spin_lock_irqsave(&nvmeq->cq_lock, flags);
-   nvme_process_cq(nvmeq, &start, &end, -1);
-   spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
-
-   nvme_complete_cqes(nvmeq, start, end);
-   }
+   if (!error)
+   nvme_poll_irqdisable(nvmeq, -1);
 
nvme_del_queue_end(req, error);
 }
-- 
2.19.1



[PATCH 04/13] nvme-pci: only allow polling with separate poll queues

2018-12-02 Thread Christoph Hellwig
This will allow us to simplify both the regular NVMe interrupt handler
and the upcoming aio poll code.  In addition to that the separate
queues are generally a good idea for performance reasons.
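
With this patch only tagsets that actually have dedicated poll queues get a
->poll callback; a sketch of the selection logic, matching the
nvme_dev_add() hunk below:

        if (dev->io_queues[HCTX_TYPE_POLL])
                dev->tagset.ops = &nvme_mq_poll_ops;    /* shared ops + .poll */
        else
                dev->tagset.ops = &nvme_mq_ops;         /* interrupt driven only */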

Signed-off-by: Christoph Hellwig 
---
 drivers/nvme/host/pci.c | 18 +-
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b820dd0351cb..d42bb76e5e78 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1089,13 +1089,6 @@ static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 }
 
 static int nvme_poll(struct blk_mq_hw_ctx *hctx)
-{
-   struct nvme_queue *nvmeq = hctx->driver_data;
-
-   return __nvme_poll(nvmeq, -1);
-}
-
-static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx)
 {
struct nvme_queue *nvmeq = hctx->driver_data;
u16 start, end;
@@ -1605,12 +1598,11 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 
 static const struct blk_mq_ops nvme_mq_ops = {
NVME_SHARED_MQ_OPS,
-   .poll   = nvme_poll,
 };
 
-static const struct blk_mq_ops nvme_mq_poll_noirq_ops = {
+static const struct blk_mq_ops nvme_mq_poll_ops = {
NVME_SHARED_MQ_OPS,
-   .poll   = nvme_poll_noirq,
+   .poll   = nvme_poll,
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
@@ -2298,10 +2290,10 @@ static int nvme_dev_add(struct nvme_dev *dev)
int ret;
 
if (!dev->ctrl.tagset) {
-   if (!dev->io_queues[HCTX_TYPE_POLL])
-   dev->tagset.ops = &nvme_mq_ops;
+   if (dev->io_queues[HCTX_TYPE_POLL])
+   dev->tagset.ops = &nvme_mq_poll_ops;
else
-   dev->tagset.ops = &nvme_mq_poll_noirq_ops;
+   dev->tagset.ops = &nvme_mq_ops;
 
dev->tagset.nr_hw_queues = dev->online_queues - 1;
dev->tagset.nr_maps = HCTX_MAX_TYPES;
-- 
2.19.1



block and nvme polling improvements V3

2018-12-02 Thread Christoph Hellwig
Hi all,

this series optimizes a few bits in the block layer and nvme code
related to polling.

It starts by moving the recently introduced queue types entirely into
the block layer instead of requiring an indirect call for them.

It then switches nvme and the block layer to only allow polling
with separate poll queues, which allows us to realize the following
benefits:

 - poll queues can safely avoid disabling irqs on any locks
   (we already do that in NVMe, but it isn't 100% kosher as-is)
 - regular interrupt driven queues can drop the CQ lock entirely,
   as we won't race for completing CQs

Then we drop the NVMe RDMA polling code, as it doesn't follow the new model,
and remove the nvme multipath polling code including the block hooks
for it, which didn't make much sense to start with given that we
started bypassing the multipath code for single controller subsystems
early on.  Last but not least we enable polling in the block layer
by default if the underlying driver has poll queues, as that already
requires explicit user action.

Note that it would be really nice to have polling back for RDMA with
dedicated poll queues, but that might take a while.  Also based on
Jens' polling aio patches we could now implement a model in nvmet
where we have a thread polling both the backend nvme device and
the RDMA CQs, which might give us some pretty nice performance
(I know Sagi looked into something similar a while ago).

A git tree is also available at:

git://git.infradead.org/users/hch/block.git nvme-polling

Gitweb:


http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/nvme-polling

Changes since v2:
 - fix a changelog typo
 - report a string instead of an index from the type sysfs attribute
 - move to per-queue completions for queue deletion
 - clear NVMEQ_DELETE_ERROR when initializing a queue

Changes since v1:
 - rebased to the latest block for-4.21 tree


[PATCH 03/13] nvme-pci: cleanup SQ allocation a bit

2018-12-02 Thread Christoph Hellwig
Use a bit flag to mark if the SQ was allocated from the CMB, and clean
up the surrounding code a bit.
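
Condensed, the allocation and free paths now pair up via the new bit (a
sketch only; the CMB eligibility check around the p2p attempt is elided):

        /* allocation: try the CMB first, fall back to host memory */
        nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
        nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, nvmeq->sq_cmds);
        if (nvmeq->sq_dma_addr) {
                set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
                return 0;
        }
        nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
                        &nvmeq->sq_dma_addr, GFP_KERNEL);

        /* free: the bit tells us which allocator owns the memory */
        if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags))
                pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
                                nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
        else
                dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
                                nvmeq->sq_cmds, nvmeq->sq_dma_addr);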

Signed-off-by: Christoph Hellwig 
Reviewed-by: Keith Busch 
---
 drivers/nvme/host/pci.c | 33 +++--
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 022395a319f4..b820dd0351cb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -186,7 +186,6 @@ struct nvme_queue {
struct nvme_dev *dev;
spinlock_t sq_lock;
struct nvme_command *sq_cmds;
-   bool sq_cmds_is_io;
spinlock_t cq_lock cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
struct blk_mq_tags **tags;
@@ -203,6 +202,7 @@ struct nvme_queue {
u8 cq_phase;
unsigned long flags;
 #define NVMEQ_ENABLED  0
+#define NVMEQ_SQ_CMB   1
u32 *dbbuf_sq_db;
u32 *dbbuf_cq_db;
u32 *dbbuf_sq_ei;
@@ -1366,17 +1366,15 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+   if (!nvmeq->sq_cmds)
+   return;
 
-   if (nvmeq->sq_cmds) {
-   if (nvmeq->sq_cmds_is_io)
-   pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
-   nvmeq->sq_cmds,
-   SQ_SIZE(nvmeq->q_depth));
-   else
-   dma_free_coherent(nvmeq->q_dmadev,
- SQ_SIZE(nvmeq->q_depth),
- nvmeq->sq_cmds,
- nvmeq->sq_dma_addr);
+   if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
+   pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
+   nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
+   } else {
+   dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+   nvmeq->sq_cmds, nvmeq->sq_dma_addr);
}
 }
 
@@ -1462,15 +1460,14 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
nvmeq->sq_cmds);
-   nvmeq->sq_cmds_is_io = true;
-   }
-
-   if (!nvmeq->sq_cmds) {
-   nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
-   &nvmeq->sq_dma_addr, GFP_KERNEL);
-   nvmeq->sq_cmds_is_io = false;
+   if (nvmeq->sq_dma_addr) {
+   set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
+   return 0; 
+   }
}
 
+   nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+   &nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
return -ENOMEM;
return 0;
-- 
2.19.1



[PATCH 06/13] nvme-pci: refactor nvme_disable_io_queues

2018-12-02 Thread Christoph Hellwig
Pass the opcode for the delete SQ/CQ command as an argument instead of
the somewhat confusing pass loop.
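
The two passes are now driven by the caller, which also lets it stop early
if deleting the SQs already timed out (a sketch, matching the
nvme_dev_disable() hunk below):

        if (nvme_disable_io_queues(dev, nvme_admin_delete_sq))
                nvme_disable_io_queues(dev, nvme_admin_delete_cq);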

Signed-off-by: Christoph Hellwig 
Reviewed-by: Keith Busch 
---
 drivers/nvme/host/pci.c | 41 -
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 10c26a2e355a..9ceba9900ca3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2244,31 +2244,29 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
return 0;
 }
 
-static void nvme_disable_io_queues(struct nvme_dev *dev)
+static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
 {
-   int pass, queues = dev->online_queues - 1;
+   int nr_queues = dev->online_queues - 1, sent = 0;
unsigned long timeout;
-   u8 opcode = nvme_admin_delete_sq;
 
-   for (pass = 0; pass < 2; pass++) {
-   int sent = 0, i = queues;
-
-   reinit_completion(&dev->ioq_wait);
+   reinit_completion(&dev->ioq_wait);
  retry:
-   timeout = ADMIN_TIMEOUT;
-   for (; i > 0; i--, sent++)
-   if (nvme_delete_queue(&dev->queues[i], opcode))
-   break;
-
-   while (sent--) {
-   timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
-   if (timeout == 0)
-   return;
-   if (i)
-   goto retry;
-   }
-   opcode = nvme_admin_delete_cq;
+   timeout = ADMIN_TIMEOUT;
+   while (nr_queues > 0) {
+   if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
+   break;
+   nr_queues--;
+   sent++;
}
+   while (sent--) {
+   timeout = wait_for_completion_io_timeout(&dev->ioq_wait,
+   timeout);
+   if (timeout == 0)
+   return false;
+   if (nr_queues)
+   goto retry;
+   }
+   return true;
 }
 
 /*
@@ -2428,7 +2426,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_stop_queues(&dev->ctrl);
 
if (!dead && dev->ctrl.queue_count > 0) {
-   nvme_disable_io_queues(dev);
+   if (nvme_disable_io_queues(dev, nvme_admin_delete_sq))
+   nvme_disable_io_queues(dev, nvme_admin_delete_cq);
nvme_disable_admin_queue(dev, shutdown);
}
for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
-- 
2.19.1



[PATCH 11/13] block: remove ->poll_fn

2018-12-02 Thread Christoph Hellwig
This was intended to support users like nvme multipath, but is just
getting in the way and adding another indirect call.
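
After this patch blk_poll() is the blk-mq implementation itself; the early
returns take over the job the ->poll_fn check used to do (a sketch of the
entry checks, matching the blk-mq.c hunk below):

        if (!blk_qc_t_valid(cookie) ||
            !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                return 0;

        if (current->plug)
                blk_flush_plug_list(current->plug, false);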

Signed-off-by: Christoph Hellwig 
---
 block/blk-core.c   | 23 ---
 block/blk-mq.c | 24 +++-
 include/linux/blkdev.h |  2 --
 3 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3f6f5e6c2fe4..942276399085 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1249,29 +1249,6 @@ blk_qc_t submit_bio(struct bio *bio)
 }
 EXPORT_SYMBOL(submit_bio);
 
-/**
- * blk_poll - poll for IO completions
- * @q:  the queue
- * @cookie: cookie passed back at IO submission time
- * @spin: whether to spin for completions
- *
- * Description:
- *Poll for completions on the passed in queue. Returns number of
- *completed entries found. If @spin is true, then blk_poll will continue
- *looping until at least one completion is found, unless the task is
- *otherwise marked running (or we need to reschedule).
- */
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
-{
-   if (!q->poll_fn || !blk_qc_t_valid(cookie))
-   return 0;
-
-   if (current->plug)
-   blk_flush_plug_list(current->plug, false);
-   return q->poll_fn(q, cookie, spin);
-}
-EXPORT_SYMBOL_GPL(blk_poll);
-
 /**
  * blk_cloned_rq_check_limits - Helper function to check a cloned request
  *  for new the queue limits
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7dcef565dc0f..9c90c5038d07 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -38,7 +38,6 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -2823,8 +2822,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
spin_lock_init(&q->requeue_lock);
 
blk_queue_make_request(q, blk_mq_make_request);
-   if (q->mq_ops->poll)
-   q->poll_fn = blk_mq_poll;
 
/*
 * Do this after blk_queue_make_request() overrides it...
@@ -3385,14 +3382,30 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
return blk_mq_poll_hybrid_sleep(q, hctx, rq);
 }
 
-static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
+/**
+ * blk_poll - poll for IO completions
+ * @q:  the queue
+ * @cookie: cookie passed back at IO submission time
+ * @spin: whether to spin for completions
+ *
+ * Description:
+ *Poll for completions on the passed in queue. Returns number of
+ *completed entries found. If @spin is true, then blk_poll will continue
+ *looping until at least one completion is found, unless the task is
+ *otherwise marked running (or we need to reschedule).
+ */
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 {
struct blk_mq_hw_ctx *hctx;
long state;
 
-   if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+   if (!blk_qc_t_valid(cookie) ||
+   !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
 
+   if (current->plug)
+   blk_flush_plug_list(current->plug, false);
+
hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
 
/*
@@ -3433,6 +3446,7 @@ static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
__set_current_state(TASK_RUNNING);
return 0;
 }
+EXPORT_SYMBOL_GPL(blk_poll);
 
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 08d940f85fa0..0b3874bdbc6a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,7 +283,6 @@ static inline unsigned short req_get_ioprio(struct request *req)
 struct blk_queue_ctx;
 
 typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
-typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t, bool spin);
 
 struct bio_vec;
 typedef int (dma_drain_needed_fn)(struct request *);
@@ -401,7 +400,6 @@ struct request_queue {
struct rq_qos   *rq_qos;
 
make_request_fn *make_request_fn;
-   poll_q_fn   *poll_fn;
dma_drain_needed_fn *dma_drain_needed;
 
const struct blk_mq_ops *mq_ops;
-- 
2.19.1



[PATCH 13/13] block: enable polling by default if a poll map is initialized

2018-12-02 Thread Christoph Hellwig
If the user did set up polling in the driver we should not require
another knob in the block layer to enable it.
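
"Set up polling in the driver" here means registering a poll queue map on
the tagset; a minimal sketch of what a driver does (illustrative field
usage, not a complete setup):

        set->nr_maps = HCTX_MAX_TYPES;
        set->map[HCTX_TYPE_POLL].nr_queues = poll_queues;   /* > 0 enables polling */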

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9c90c5038d07..a550a00ac00c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2811,6 +2811,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->tag_set = set;
 
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
+   if (set->nr_maps > HCTX_TYPE_POLL)
+   blk_queue_flag_set(QUEUE_FLAG_POLL, q);
 
if (!(set->flags & BLK_MQ_F_SG_MERGE))
blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-- 
2.19.1



[PATCH 08/13] nvme-pci: remove the CQ lock for interrupt driven queues

2018-12-02 Thread Christoph Hellwig
Now that we can't poll regular, interrupt driven I/O queues there
is almost nothing that can race with an interrupt.  The only
possible other contexts polling a CQ are the error handler and
queue shutdown, and both are so far off in the slow path that
we can simply use the big hammer of disabling interrupts.

With that we can stop taking the cq_lock for normal queues.
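
The resulting locking model, condensed (a sketch only; matches the hunks
below):

        /* nvme_irq(): no lock; rmb()/wmb() order cq_head/cq_phase updates
         * between consecutive runs of the handler on different CPUs. */
        rmb();
        nvme_process_cq(nvmeq, &start, &end, -1);
        wmb();

        /* nvme_poll_irqdisable(): poll queues take cq_poll_lock, interrupt
         * driven queues mask their own vector instead of taking a lock. */
        if (nvmeq->cq_vector == -1)
                spin_lock(&nvmeq->cq_poll_lock);
        else
                disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));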

Signed-off-by: Christoph Hellwig 
Reviewed-by: Keith Busch 
---
 drivers/nvme/host/pci.c | 37 ++---
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2d5a468c35b1..4ccb4ea22ac6 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -185,7 +185,8 @@ struct nvme_queue {
struct nvme_dev *dev;
spinlock_t sq_lock;
struct nvme_command *sq_cmds;
-   spinlock_t cq_lock cacheline_aligned_in_smp;
+/* only used for poll queues: */
+   spinlock_t cq_poll_lock cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
struct blk_mq_tags **tags;
dma_addr_t sq_dma_addr;
@@ -1050,12 +1051,16 @@ static irqreturn_t nvme_irq(int irq, void *data)
irqreturn_t ret = IRQ_NONE;
u16 start, end;
 
-   spin_lock(&nvmeq->cq_lock);
+   /*
+* The rmb/wmb pair ensures we see all updates from a previous run of
+* the irq handler, even if that was on another CPU.
+*/
+   rmb();
if (nvmeq->cq_head != nvmeq->last_cq_head)
ret = IRQ_HANDLED;
nvme_process_cq(nvmeq, &start, &end, -1);
nvmeq->last_cq_head = nvmeq->cq_head;
-   spin_unlock(&nvmeq->cq_lock);
+   wmb();
 
if (start != end) {
nvme_complete_cqes(nvmeq, start, end);
@@ -1079,13 +1084,24 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
  */
 static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
 {
-   unsigned long flags;
+   struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
u16 start, end;
int found;
 
-   spin_lock_irqsave(&nvmeq->cq_lock, flags);
+   /*
+* For a poll queue we need to protect against the polling thread
+* using the CQ lock.  For normal interrupt driven threads we have
+* to disable the interrupt to avoid racing with it.
+*/
+   if (nvmeq->cq_vector == -1)
+   spin_lock(&nvmeq->cq_poll_lock);
+   else
+   disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
found = nvme_process_cq(nvmeq, &start, &end, tag);
-   spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
+   if (nvmeq->cq_vector == -1)
+   spin_unlock(&nvmeq->cq_poll_lock);
+   else
+   enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 
nvme_complete_cqes(nvmeq, start, end);
return found;
@@ -1100,9 +1116,9 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx)
if (!nvme_cqe_pending(nvmeq))
return 0;
 
-   spin_lock(&nvmeq->cq_lock);
+   spin_lock(&nvmeq->cq_poll_lock);
found = nvme_process_cq(nvmeq, &start, &end, -1);
-   spin_unlock(&nvmeq->cq_lock);
+   spin_unlock(&nvmeq->cq_poll_lock);
 
nvme_complete_cqes(nvmeq, start, end);
return found;
@@ -1482,7 +1498,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->q_dmadev = dev->dev;
nvmeq->dev = dev;
spin_lock_init(&nvmeq->sq_lock);
-   spin_lock_init(&nvmeq->cq_lock);
+   spin_lock_init(&nvmeq->cq_poll_lock);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
@@ -1518,7 +1534,6 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 {
struct nvme_dev *dev = nvmeq->dev;
 
-   spin_lock_irq(&nvmeq->cq_lock);
nvmeq->sq_tail = 0;
nvmeq->last_sq_tail = 0;
nvmeq->cq_head = 0;
@@ -1527,7 +1542,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
-   spin_unlock_irq(&nvmeq->cq_lock);
+   wmb(); /* ensure the first interrupt sees the initialization */
 }
 
 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
-- 
2.19.1



[PATCH 07/13] nvme-pci: don't poll from irq context when deleting queues

2018-12-02 Thread Christoph Hellwig
This is the last place outside of nvme_irq that handles CQEs from
interrupt context, and thus is in the way of removing the cq_lock for
normal queues, and avoiding lockdep warnings on the poll queues, for
which we already take it without IRQ disabling.
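
Condensed, queue deletion now waits per queue and reaps any remaining CQEs
from process context (a sketch of the new flow in nvme_disable_io_queues(),
matching the hunks below):

        init_completion(&nvmeq->delete_done);           /* in nvme_delete_queue() */

        timeout = wait_for_completion_io_timeout(&nvmeq->delete_done, timeout);
        if (opcode == nvme_admin_delete_cq &&
            !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
                nvme_poll_irqdisable(nvmeq, -1);        /* handle leftover CQEs */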

Signed-off-by: Christoph Hellwig 
---
 drivers/nvme/host/pci.c | 27 +++
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9ceba9900ca3..2d5a468c35b1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -122,7 +122,6 @@ struct nvme_dev {
u32 cmbsz;
u32 cmbloc;
struct nvme_ctrl ctrl;
-   struct completion ioq_wait;
 
mempool_t *iod_mempool;
 
@@ -203,10 +202,12 @@ struct nvme_queue {
unsigned long flags;
 #define NVMEQ_ENABLED  0
 #define NVMEQ_SQ_CMB   1
+#define NVMEQ_DELETE_ERROR 2
u32 *dbbuf_sq_db;
u32 *dbbuf_cq_db;
u32 *dbbuf_sq_ei;
u32 *dbbuf_cq_ei;
+   struct completion delete_done;
 };
 
 /*
@@ -1535,6 +1536,8 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
int result;
s16 vector;
 
+   clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
+
/*
 * A queue's vector matches the queue identifier unless the controller
 * has only one vector available.
@@ -2208,15 +2211,15 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error)
struct nvme_queue *nvmeq = req->end_io_data;
 
blk_mq_free_request(req);
-   complete(&nvmeq->dev->ioq_wait);
+   complete(&nvmeq->delete_done);
 }
 
 static void nvme_del_cq_end(struct request *req, blk_status_t error)
 {
struct nvme_queue *nvmeq = req->end_io_data;
 
-   if (!error)
-   nvme_poll_irqdisable(nvmeq, -1);
+   if (error)
+   set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
 
nvme_del_queue_end(req, error);
 }
@@ -2238,6 +2241,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
req->timeout = ADMIN_TIMEOUT;
req->end_io_data = nvmeq;
 
+   init_completion(&nvmeq->delete_done);
blk_execute_rq_nowait(q, NULL, req, false,
opcode == nvme_admin_delete_cq ?
nvme_del_cq_end : nvme_del_queue_end);
@@ -2249,7 +2253,6 @@ static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
int nr_queues = dev->online_queues - 1, sent = 0;
unsigned long timeout;
 
-   reinit_completion(&dev->ioq_wait);
  retry:
timeout = ADMIN_TIMEOUT;
while (nr_queues > 0) {
@@ -2258,11 +2261,20 @@ static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
nr_queues--;
sent++;
}
-   while (sent--) {
-   timeout = wait_for_completion_io_timeout(&dev->ioq_wait,
+   while (sent) {
+   struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];
+
+   timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
timeout);
if (timeout == 0)
return false;
+
+   /* handle any remaining CQEs */
+   if (opcode == nvme_admin_delete_cq &&
+   !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
+   nvme_poll_irqdisable(nvmeq, -1);
+
+   sent--;
if (nr_queues)
goto retry;
}
@@ -2746,7 +2758,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
mutex_init(&dev->shutdown_lock);
-   init_completion(&dev->ioq_wait);
 
result = nvme_setup_prp_pools(dev);
if (result)
-- 
2.19.1



[PATCH 10/13] nvme-mpath: remove I/O polling support

2018-12-02 Thread Christoph Hellwig
The ->poll_fn has been stale for a while, as a lot of places check for mq
ops.  But there is no real point in it anyway, as we don't even use
the multipath code for subsystems without multiple ports, which is usually
what we do high performance I/O to.  If it really becomes an issue we
should rework the nvme code to also skip the multipath code for any
private namespace, even if that could mean some trouble when rescanning.

Signed-off-by: Christoph Hellwig 
---
 drivers/nvme/host/multipath.c | 16 
 1 file changed, 16 deletions(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index ffebdd0ae34b..ec310b1b9267 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -220,21 +220,6 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
return ret;
 }
 
-static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc, bool spin)
-{
-   struct nvme_ns_head *head = q->queuedata;
-   struct nvme_ns *ns;
-   int found = 0;
-   int srcu_idx;
-
-   srcu_idx = srcu_read_lock(&head->srcu);
-   ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu);
-   if (likely(ns && nvme_path_is_optimized(ns)))
-   found = ns->queue->poll_fn(q, qc, spin);
-   srcu_read_unlock(&head->srcu, srcu_idx);
-   return found;
-}
-
 static void nvme_requeue_work(struct work_struct *work)
 {
struct nvme_ns_head *head =
@@ -281,7 +266,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
goto out;
q->queuedata = head;
blk_queue_make_request(q, nvme_ns_head_make_request);
-   q->poll_fn = nvme_ns_head_poll;
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
/* set to a default value for 512 until disk is validated */
blk_queue_logical_block_size(q, 512);
-- 
2.19.1



[PATCH 09/13] nvme-rdma: remove I/O polling support

2018-12-02 Thread Christoph Hellwig
The code was always a bit of a hack that digs far too much into
RDMA core internals.  Let's kick it out and reimplement proper
dedicated poll queues as needed.

Signed-off-by: Christoph Hellwig 
---
 drivers/nvme/host/rdma.c | 24 
 1 file changed, 24 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index ccfde6c7c0a5..a62e9f177c06 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1736,29 +1736,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_IOERR;
 }
 
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
-{
-   struct nvme_rdma_queue *queue = hctx->driver_data;
-   struct ib_cq *cq = queue->ib_cq;
-   struct ib_wc wc;
-   int found = 0;
-
-   while (ib_poll_cq(cq, 1, &wc) > 0) {
-   struct ib_cqe *cqe = wc.wr_cqe;
-
-   if (cqe) {
-   if (cqe->done == nvme_rdma_recv_done) {
-   nvme_rdma_recv_done(cq, &wc);
-   found++;
-   } else {
-   cqe->done(cq, &wc);
-   }
-   }
-   }
-
-   return found;
-}
-
 static void nvme_rdma_complete_rq(struct request *rq)
 {
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
@@ -1780,7 +1757,6 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = {
.init_request   = nvme_rdma_init_request,
.exit_request   = nvme_rdma_exit_request,
.init_hctx  = nvme_rdma_init_hctx,
-   .poll   = nvme_rdma_poll,
.timeout= nvme_rdma_timeout,
.map_queues = nvme_rdma_map_queues,
 };
-- 
2.19.1



[PATCH 12/13] block: only allow polling if a poll queue_map exists

2018-12-02 Thread Christoph Hellwig
This avoids having to have different mq_ops for different setups
with or without poll queues.
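
With polling gated on the queue map, a single ops table that always has
.poll set is safe, and the sysfs knob checks the map instead of the ops
(a sketch, matching the blk-sysfs.c hunk below):

        if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL)
                return -EINVAL;         /* no dedicated poll queues registered */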

Signed-off-by: Christoph Hellwig 
---
 block/blk-sysfs.c   |  2 +-
 drivers/nvme/host/pci.c | 29 +
 2 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9f0cb370b39b..bb7c642ffefa 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -402,7 +402,7 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
unsigned long poll_on;
ssize_t ret;
 
-   if (!q->mq_ops || !q->mq_ops->poll)
+   if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL)
return -EINVAL;
 
ret = queue_var_store(&poll_on, page, count);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4ccb4ea22ac6..7732c4979a4e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1602,22 +1602,15 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
.timeout= nvme_timeout,
 };
 
-#define NVME_SHARED_MQ_OPS \
-   .queue_rq   = nvme_queue_rq,\
-   .commit_rqs = nvme_commit_rqs,  \
-   .complete   = nvme_pci_complete_rq, \
-   .init_hctx  = nvme_init_hctx,   \
-   .init_request   = nvme_init_request,\
-   .map_queues = nvme_pci_map_queues,  \
-   .timeout= nvme_timeout  \
-
 static const struct blk_mq_ops nvme_mq_ops = {
-   NVME_SHARED_MQ_OPS,
-};
-
-static const struct blk_mq_ops nvme_mq_poll_ops = {
-   NVME_SHARED_MQ_OPS,
-   .poll   = nvme_poll,
+   .queue_rq   = nvme_queue_rq,
+   .complete   = nvme_pci_complete_rq,
+   .commit_rqs = nvme_commit_rqs,
+   .init_hctx  = nvme_init_hctx,
+   .init_request   = nvme_init_request,
+   .map_queues = nvme_pci_map_queues,
+   .timeout= nvme_timeout,
+   .poll   = nvme_poll,
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
@@ -2304,11 +2297,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
int ret;
 
if (!dev->ctrl.tagset) {
-   if (dev->io_queues[HCTX_TYPE_POLL])
-   dev->tagset.ops = &nvme_mq_poll_ops;
-   else
-   dev->tagset.ops = &nvme_mq_ops;
-
+   dev->tagset.ops = &nvme_mq_ops;
dev->tagset.nr_hw_queues = dev->online_queues - 1;
dev->tagset.nr_maps = HCTX_MAX_TYPES;
dev->tagset.timeout = NVME_IO_TIMEOUT;
-- 
2.19.1



Re: [PATCH 1/2] blk-mq: Export iterating all tagged requests

2018-12-02 Thread Ming Lei
On Sun, Dec 2, 2018 at 12:48 AM Christoph Hellwig  wrote:
>
> On Fri, Nov 30, 2018 at 01:36:09PM -0700, Jens Axboe wrote:
> > On 11/30/18 1:26 PM, Keith Busch wrote:
> > > A driver may wish to iterate every tagged request, not just ones that
> > > satisfy blk_mq_request_started(). The intended use is so a driver may
> > > terminate entered requests on quiesced queues.
> >
> > How about we just move the started check into the handler passed in for
> > those that care about it? Much saner to make the interface iterate
> > everything, and leave whatever state check to the callback.
>
> So we used to do that, and I changed it back in May to test for
> MQ_RQ_IN_FLIGHT, and then Ming changed it to check
> blk_mq_request_started.  So this is clearly a minefield of sorts..

The change to blk_mq_request_started() was just for fixing SCSI's boot
hang issue.

>
> Note that at least mtip32xx, nbd, skd and the various nvme transports
> want to use the function to terminate all requests in the error
> path, and it would be great to have one single understood, documented
> and debugged helper for that in the core, so this is a vote for moving
> more of the logic in your second helper into the core code.  skd
> will need actually use ->complete to release resources for that, though
> and mtip plays some odd abort bits.  If it weren't for the interesting
> abort behavior in nvme-fc that means we could even unexport the
> low-level interface.

Agreed, and especially SCSI's use should be understood, given that
SCSI is so widely deployed in production systems.

thanks,
Ming Lei
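
The pattern Jens suggests above, doing the started check in the driver's own
callback so the core iterator can visit every tagged request, amounts to
roughly the following. This is a sketch only; the callback name is made up
and the iterator callback signature varies across kernel versions, so treat
the details as assumptions.

        static void cancel_request(struct request *req, void *data, bool reserved)
        {
                if (!blk_mq_request_started(req))
                        return;         /* callback filters, not the iterator */
                nvme_req(req)->status = NVME_SC_ABORT_REQ;
                blk_mq_complete_request(req);
        }

        /* iterate every tagged request on the set */
        blk_mq_tagset_busy_iter(ctrl->tagset, cancel_request, ctrl);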


Re: [Xen-devel] [PATCH] xen-blkfront: use old rinfo after enomem during migration

2018-12-02 Thread Manjunath Patil

On 11/30/2018 2:33 PM, Boris Ostrovsky wrote:


On 11/30/18 4:49 PM, Manjunath Patil wrote:

Thank you Boris for your comments. I removed faulty email of mine.

replies inline.
On 11/30/2018 12:42 PM, Boris Ostrovsky wrote:

On 11/29/18 12:17 AM, Manjunath Patil wrote:

Hi,
Feel free to suggest/comment on this.

I am trying to do the following at dst during the migration now.
1. Don't clear the old rinfo in blkif_free(). Instead just clean it.
2. Store the old rinfo and nr_rings into temp variables in
negotiate_mq()
3. let nr_rings get re-calculated based on backend data
4. try allocating new memory based on new nr_rings

Since I suspect the number of rings will likely be the same, why not reuse
the rings in the common case?

I thought attaching devices would happen more often than migration, hence I
did not want to add an extra check for
   - whether I am inside the migration code path, and
   - whether the new nr_rings is equal to the old nr_rings

Sure, adding such a check would avoid the memory allocation
altogether in the migration path,
but it would add a little overhead for normal device addition.

Do you think it's worth adding that change?


IMO a couple of extra checks are not going to make much difference.

I will add this change


I wonder, though: have you actually seen the case where the allocation
failed and the changes provided in this patch made things work? I am
asking because right after negotiate_mq() we will call setup_blkring()
and it will want to allocate a bunch of memory. A failure there is fatal
(to ring setup). So it seems to me that you will survive negotiate_mq()
but then will likely fail soon after.
I have noticed the ENOMEM inside negotiate_mq() on a ct machine. When I 
included my patch, I manually triggered the ENOMEM using a debug flag.

The patch works for ENOMEM inside negotiate_mq().

As you mentioned, if we really hit the ENOMEM in negotiate_mq(), we 
might hit it in setup_blkring() as well.

We should add a similar change for the blkif_sring struct as well.

I will make this change as well and send the new patch-set for review.





5.
    a. If memory allocation is a success
   - free the old rinfo and proceed to use the new rinfo
    b. If memory allocation is a failure
   - use the old the rinfo
   - adjust the nr_rings to the lowest of new nr_rings and old
nr_rings
@@ -1918,10 +1936,24 @@ static int negotiate_mq(struct blkfront_info *info)
     sizeof(struct blkfront_ring_info),
     GFP_KERNEL);
   if (!info->rinfo) {
-    xenbus_dev_fatal(info->xbdev, -ENOMEM, "allocating
ring_info structure");
-    info->nr_rings = 0;
-    return -ENOMEM;
-    }
+    if (unlikely(nr_rings_old)) {
+    /* We might waste some memory if
+ * info->nr_rings < nr_rings_old
+ */
+    info->rinfo = rinfo_old;
+    if (info->nr_rings > nr_rings_old)
+    info->nr_rings = nr_rings_old;
+    xenbus_dev_fatal(info->xbdev, -ENOMEM,

Why xenbus_dev_fatal()?

I wanted to make sure that this msg is seen on the console by default, so
that we know an ENOMEM event happened and we recovered from it.
What do you suggest instead? xenbus_dev_error?

Neither. xenbus_dev_fatal() is going to change connection state so it is
certainly not what we want. And even xenbus_dev_error() doesn't look
like the right thing to do since as far as block device setup is
concerned there are no errors.

Maybe pr_warn().

I will include this.

Thank you for your comments.


-boris



-boris



+    "reusing old ring_info structure(new ring size=%d)",
+    info->nr_rings);
+    } else {
+    xenbus_dev_fatal(info->xbdev, -ENOMEM,
+    "allocating ring_info structure");
+    info->nr_rings = 0;
+    return -ENOMEM;
+    }
+    } else if (unlikely(nr_rings_old))
+    kfree(rinfo_old);
     for (i = 0; i < info->nr_rings; i++) {
   struct blkfront_ring_info *rinfo;
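
Putting the pieces of the quoted patch together with Boris' pr_warn()
suggestion, the fallback being discussed amounts to roughly the following.
This is a sketch, not the final patch; the allocation call and message
wording are assumptions.

        rinfo_old = info->rinfo;                /* saved before reallocation */
        nr_rings_old = info->nr_rings;

        info->rinfo = kcalloc(info->nr_rings,
                              sizeof(struct blkfront_ring_info), GFP_KERNEL);
        if (!info->rinfo && nr_rings_old) {
                /* ENOMEM during migration: fall back to the old allocation,
                 * capped at the old number of rings. */
                info->rinfo = rinfo_old;
                if (info->nr_rings > nr_rings_old)
                        info->nr_rings = nr_rings_old;
                pr_warn("blkfront: reusing old ring_info (new ring size=%u)\n",
                        info->nr_rings);
        } else if (!info->rinfo) {
                xenbus_dev_fatal(info->xbdev, -ENOMEM,
                                 "allocating ring_info structure");
                info->nr_rings = 0;
                return -ENOMEM;
        } else if (nr_rings_old) {
                kfree(rinfo_old);
        }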

