Re: [PATCH v13] NVMe: Convert to blk-mq

2014-09-30 Thread Matias Bjørling
On Tue, Sep 30, 2014 at 7:42 PM, Keith Busch  wrote:
> On Tue, 30 Sep 2014, Matias Bjørling wrote:
>>
>> @@ -1967,27 +1801,30 @@ static struct nvme_ns *nvme_alloc_ns(struct
>> nvme_dev *dev, unsigned nsid,
>> {
>
> ...
>>
>> -   ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
>> +   queue_flag_set_unlocked(QUEUE_FLAG_DEFAULT, ns->queue);
>
>
> Instead of the above, you want
>
> +   ns->queue->queue_flags |= QUEUE_FLAG_DEFAULT;
>
> I only caught it because of the fun dm-mpath attempt with nvme and
> blk-mq. Or maybe we don't want to include "QUEUE_FLAG_STACKABLE" right
> now (part of the default flags) because it will kernel crash if you're
> using dm-multipath with a blk-mq driver today.
>

Great find. We should probably add a bunch of patches that add the
necessary support after the patch has gone in. I have a feeling that
there are a lot of interesting corner cases for getting stackable to
work.

> Otherwise, looks great!
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v13] NVMe: Convert to blk-mq

2014-09-30 Thread Keith Busch

On Tue, 30 Sep 2014, Matias Bjørling wrote:

@@ -1967,27 +1801,30 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev 
*dev, unsigned nsid,
{

...

-   ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
+   queue_flag_set_unlocked(QUEUE_FLAG_DEFAULT, ns->queue);


Instead of the above, you want

+   ns->queue->queue_flags |= QUEUE_FLAG_DEFAULT;

I only caught it because of the fun dm-mpath attempt with nvme and
blk-mq. Or maybe we don't want to include "QUEUE_FLAG_STACKABLE" right
now (part of the default flags) because it will kernel crash if you're
using dm-multipath with a blk-mq driver today.

Otherwise, looks great!

[PATCH v13] NVMe: Convert to blk-mq

2014-09-30 Thread Matias Bjørling
This converts the NVMe driver to a blk-mq request-based driver.

The NVMe driver is currently bio-based and implements queue logic within itself.
By using blk-mq, a lot of these responsibilities can be moved and simplified.

The patch is divided into the following blocks:

 * Per-command data and cmdid have been moved into the struct request field.
   The cmdid_data can be retrieved using blk_mq_rq_to_pdu() and id maintenance
   is now handled by blk-mq through the rq->tag field.

 * The logic for splitting bio's has been moved into the blk-mq layer. The
   driver instead notifies the block layer about limited gap support in SG
   lists.

 * blk-mq handles timeouts, which are reimplemented within nvme_timeout(). This
   includes both abort handling and command cancelation.

 * Assignment of nvme queues to CPUs is replaced with the blk-mq version. The
   current blk-mq strategy is to assign the number of mapped queues and CPUs to
   provide synergy, while the nvme driver assigns as many nvme hw queues as
   possible. This can be implemented in blk-mq if needed.

 * NVMe queues are merged with the tags structure of blk-mq.

 * blk-mq takes care of setup/teardown of nvme queues and guards invalid
   accesses. Therefore, RCU-usage for nvme queues can be removed.

 * IO tracing and accounting are handled by blk-mq and therefore removed.

 * Queue suspension logic is replaced with the logic from the block layer.

Contributions in this patch from:

  Sam Bradshaw 
  Jens Axboe 
  Keith Busch 
  Robert Nelson 

Acked-by: Keith Busch 
Acked-by: Jens Axboe 
Signed-off-by: Matias Bjørling 
---
 drivers/block/nvme-core.c | 1347 ++---
 drivers/block/nvme-scsi.c |8 +-
 include/linux/nvme.h  |   15 +-
 3 files changed, 552 insertions(+), 818 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 28aec2d..6fec7b8 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -13,9 +13,9 @@
  */
 
 #include 
-#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -33,7 +33,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -42,9 +41,8 @@
 #include 
 #include 
 
-#include 
-
 #define NVME_Q_DEPTH   1024
+#define NVME_AQ_DEPTH  64
 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
 #define ADMIN_TIMEOUT  (admin_timeout * HZ)
@@ -76,10 +74,12 @@ static wait_queue_head_t nvme_kthread_wait;
 static struct notifier_block nvme_nb;
 
 static void nvme_reset_failed_dev(struct work_struct *ws);
+static int nvme_process_cq(struct nvme_queue *nvmeq);
 
 struct async_cmd_info {
struct kthread_work work;
struct kthread_worker *worker;
+   struct request *req;
u32 result;
int status;
void *ctx;
@@ -90,7 +90,6 @@ struct async_cmd_info {
  * commands and one for I/O commands).
  */
 struct nvme_queue {
-   struct rcu_head r_head;
struct device *q_dmadev;
struct nvme_dev *dev;
char irqname[24];   /* nvme4294967295-65535\0 */
@@ -99,10 +98,6 @@ struct nvme_queue {
volatile struct nvme_completion *cqes;
dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
-   wait_queue_head_t sq_full;
-   wait_queue_t sq_cong_wait;
-   struct bio_list sq_cong;
-   struct list_head iod_bio;
u32 __iomem *q_db;
u16 q_depth;
u16 cq_vector;
@@ -112,10 +107,8 @@ struct nvme_queue {
u16 qid;
u8 cq_phase;
u8 cqe_seen;
-   u8 q_suspended;
-   cpumask_var_t cpu_mask;
struct async_cmd_info cmdinfo;
-   unsigned long cmdid_data[];
+   struct blk_mq_hw_ctx *hctx;
 };
 
 /*
@@ -143,62 +136,72 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, 
void *,
 struct nvme_cmd_info {
nvme_completion_fn fn;
void *ctx;
-   unsigned long timeout;
int aborted;
+   struct nvme_queue *nvmeq;
 };
 
-static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+   unsigned int hctx_idx)
 {
-   return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
+   struct nvme_dev *dev = data;
+   struct nvme_queue *nvmeq = dev->queues[0];
+
+   WARN_ON(nvmeq->hctx);
+   nvmeq->hctx = hctx;
+   hctx->driver_data = nvmeq;
+   return 0;
 }
 
-static unsigned nvme_queue_extra(int depth)
+static int nvme_admin_init_request(void *data, struct request *req,
+   unsigned int hctx_idx, unsigned int rq_idx,
+   unsigned int numa_node)
 {
-   return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
+   struct nvme_dev *dev = data;
+   struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+   struct nvme_queue