Re: [PATCH v11] NVMe: Convert to blk-mq

2014-08-15 Thread Matias Bjorling

On 08/15/2014 01:09 AM, Keith Busch wrote:


The allocation and freeing of blk-mq parts seems a bit asymmetrical
to me. The 'tags' belong to the tagset, but any request_queue using
that tagset may free the tags. I looked to separate the tag allocation
concerns, but that's more time than I have, so this is my quick-fix
driver patch, forcing tag access through the hw_ctx.



I moved nvmeq->hctx->tags into nvmeq->tags in the last version. I missed
the frees in blk_mq_map_swqueue. Good catch.


The previous method might have another problem. If two namespaces share a
tag set, the hctx init fn could be called with a different hctx for the
same nvmeq, leading to false tag sharing between nvme queues.
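
A minimal sketch of the kind of guard that could catch this, assuming the
nvmeq->hctx pointer from Keith's quick-fix patch below; the helper name and
the simple hctx_idx-to-queue mapping are illustrative, not code from either
version:

static int nvme_init_hctx_checked(struct blk_mq_hw_ctx *hctx, void *data,
				  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	/*
	 * A second namespace sharing the tag set must hand us the same
	 * hctx (and therefore the same tags) for this nvmeq; anything
	 * else is the false tag sharing described above.
	 */
	WARN_ON(nvmeq->hctx && nvmeq->hctx != hctx);

	nvmeq->hctx = hctx;
	hctx->driver_data = nvmeq;
	return 0;
}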



Re: [PATCH v11] NVMe: Convert to blk-mq

2014-08-14 Thread Keith Busch

On Thu, 14 Aug 2014, Jens Axboe wrote:

nr_tags must be uninitialized or screwed up somehow, otherwise I don't
see how that kmalloc() could warn on being too large. Keith, are you
running with slab debugging? Matias, might be worth trying.


The allocation and freeing of blk-mq parts seems a bit asymmetrical
to me. The 'tags' belong to the tagset, but any request_queue using
that tagset may free the tags. I looked to separate the tag allocation
concerns, but that's more time than I have, so this is my quick-fix
driver patch, forcing tag access through the hw_ctx.

---
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 384dc91..91432d2 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -109,7 +109,7 @@ struct nvme_queue {
u8 cqe_seen;
u8 q_suspended;
struct async_cmd_info cmdinfo;
-   struct blk_mq_tags *tags;
+   struct blk_mq_hw_ctx *hctx;
 };

 /*
@@ -148,6 +148,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
struct nvme_queue *nvmeq = dev->queues[0];

hctx->driver_data = nvmeq;
+   nvmeq->hctx = hctx;
return 0;
 }

@@ -174,6 +175,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
hctx->cpumask);
hctx->driver_data = nvmeq;
+   nvmeq->hctx = hctx;
return 0;
 }

@@ -280,8 +282,7 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx,
 static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
  unsigned int tag)
 {
-   struct request *req = blk_mq_tag_to_rq(nvmeq->tags, tag);
-
+   struct request *req = blk_mq_tag_to_rq(nvmeq->hctx->tags, tag);
return blk_mq_rq_to_pdu(req);
 }

@@ -654,8 +655,6 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
nvme_submit_flush(nvmeq, ns, req->tag);
else
nvme_submit_iod(nvmeq, iod, ns);
-
- queued:
nvme_process_cq(nvmeq);
spin_unlock_irq(&nvmeq->q_lock);
return BLK_MQ_RQ_QUEUE_OK;
@@ -1051,9 +1050,8 @@ static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
if (tag >= qdepth)
break;

-   req = blk_mq_tag_to_rq(nvmeq->tags, tag++);
+   req = blk_mq_tag_to_rq(nvmeq->hctx->tags, tag++);
cmd = blk_mq_rq_to_pdu(req);
if (cmd->ctx == CMD_CTX_CANCELLED)
continue;

@@ -1132,8 +1130,8 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq)
 {
spin_lock_irq(&nvmeq->q_lock);
nvme_process_cq(nvmeq);
-   if (nvmeq->tags)
-   blk_mq_tag_busy_iter(nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
+   if (nvmeq->hctx->tags)
+   blk_mq_tag_busy_iter(nvmeq->hctx->tags, nvme_cancel_queue_ios, nvmeq);
spin_unlock_irq(&nvmeq->q_lock);
 }

@@ -1353,8 +1351,6 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
if (blk_mq_alloc_tag_set(&dev->admin_tagset))
return -ENOMEM;

-   dev->queues[0]->tags = dev->admin_tagset.tags[0];
-
dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
if (!dev->admin_q) {
blk_mq_free_tag_set(&dev->admin_tagset);
@@ -2055,9 +2051,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
if (blk_mq_alloc_tag_set(&dev->tagset))
goto out;

-   for (i = 1; i < dev->online_queues; i++)
-   dev->queues[i]->tags = dev->tagset.tags[i - 1];
-
id_ns = mem;
for (i = 1; i <= nn; i++) {
res = nvme_identify(dev, i, 0, dma_addr);


Re: [PATCH v11] NVMe: Convert to blk-mq

2014-08-14 Thread Keith Busch

On Thu, 14 Aug 2014, Matias Bjorling wrote:

nr_tags must be uninitialized or screwed up somehow, otherwise I don't
see how that kmalloc() could warn on being too large. Keith, are you
running with slab debugging? Matias, might be worth trying.


The queue's tags were freed in 'blk_mq_map_swqueue' because some queues
weren't mapped to a s/w queue, but the driver has a pointer to that
freed memory, so it's a use-after-free error.

This part in the driver looks different than it did in v8 when I last
tested. The nvme_queue used to have a pointer to the 'hctx', but now it
points directly to the 'tags', which doesn't appear to be very safe.
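
A minimal illustration of the difference, reusing the field names from the
two versions discussed here (the helper functions themselves are
hypothetical): resolving tags through the hw context at use time means the
driver never holds its own copy of a pointer that blk-mq may free.

/* Cached-pointer style (v11): nvmeq->tags can dangle once
 * blk_mq_map_swqueue() frees the tags of an unmapped hw queue. */
static struct request *tag_to_rq_cached(struct nvme_queue *nvmeq, unsigned int tag)
{
	return blk_mq_tag_to_rq(nvmeq->tags, tag);
}

/* Indirect style (quick fix): read whatever pointer blk-mq currently owns. */
static struct request *tag_to_rq_via_hctx(struct nvme_queue *nvmeq, unsigned int tag)
{
	return blk_mq_tag_to_rq(nvmeq->hctx->tags, tag);
}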


Re: [PATCH v11] NVMe: Convert to blk-mq

2014-08-14 Thread Matias Bjorling



I haven't event tried debugging this next one: doing an insmod+rmmod
caused this warning followed by a panic:



I'll look into it. Thanks


nr_tags must be uninitialized or screwed up somehow, otherwise I don't
see how that kmalloc() could warn on being too large. Keith, are you
running with slab debugging? Matias, might be worth trying.


Thanks for the hint.

Keith, I'll first have a chance to fix it tomorrow. Let me know if you
find it in the meantime.


I've put up a nvmemq_v12 on the github repository with the q_suspended 
removed.



Re: [PATCH v11] NVMe: Convert to blk-mq

2014-08-14 Thread Keith Busch

On Thu, 14 Aug 2014, Jens Axboe wrote:

On 08/14/2014 02:25 AM, Matias Bjørling wrote:

The result is set to BLK_MQ_RQ_QUEUE_ERROR, or am I mistaken?


Looks OK to me, looking at the code, 'result' is initialized to
BLK_MQ_RQ_QUEUE_BUSY though. Which looks correct, we don't want to error
on a suspended queue.


My mistake missing how the result was initialized.


nr_tags must be uninitialized or screwed up somehow, otherwise I don't
see how that kmalloc() could warn on being too large. Keith, are you
running with slab debugging? Matias, might be worth trying.


I'm not running with slab debugging. If it's any clue at all, blk-mq is
using 16 of the 31 allocated h/w queues (which is okay as we discussed
earlier), and the oops happens when clearing the first unused queue.

I'll have time to mess with this more today, so I can either help find
the problem or apply a patch if one becomes available.

Re: [PATCH v11] NVMe: Convert to blk-mq

2014-08-14 Thread Jens Axboe
On 08/14/2014 02:25 AM, Matias Bjørling wrote:
> On 08/14/2014 12:27 AM, Keith Busch wrote:
>> On Sun, 10 Aug 2014, Matias Bjørling wrote:
>>> On Sat, Jul 26, 2014 at 11:07 AM, Matias Bjørling  wrote:
 This converts the NVMe driver to a blk-mq request-based driver.

>>>
>>> Willy, do you need me to make any changes to the conversion? Can you
>>> pick it up for 3.17?
>>
>> Hi Matias,
>>
> 
> Hi Keith, Thanks for taking the time to take another look.
> 
>> I'm starting to get a little more spare time to look at this again. I
>> think there are still some bugs here, or perhaps something better we
>> can do. I'll just start with one snippet of the code:
>>
>> @@ -765,33 +619,49 @@ static int nvme_submit_bio_queue(struct nvme_queue
>> *nvmeq, struct nvme_ns *ns,
>>   submit_iod:
>>  spin_lock_irq(&nvmeq->q_lock);
>>  if (nvmeq->q_suspended) {
>>  spin_unlock_irq(&nvmeq->q_lock);
>>  goto finish_cmd;
>>  }
>>
>>   <snip>
>>
>>   finish_cmd:
>>  nvme_finish_cmd(nvmeq, req->tag, NULL);
>>  nvme_free_iod(nvmeq->dev, iod);
>>  return result;
>> }
>>
>>
>> If the nvme queue is marked "suspended", this code just goto's the finish
>> without setting "result", so I don't think that's right.
> 
> The result is set to BLK_MQ_RQ_QUEUE_ERROR, or am I mistaken?

Looks OK to me, looking at the code, 'result' is initialized to
BLK_MQ_RQ_QUEUE_BUSY though. Which looks correct, we don't want to error
on a suspended queue.

>> But do we even need the "q_suspended" flag anymore? It was there because
>> we couldn't prevent incoming requests as a bio based driver and we needed
>> some way to mark that the h/w's IO queue was temporarily inactive, but
>> blk-mq has ways to start/stop a queue at a higher level, right? If so,
>> I think that's probably a better way than using this driver specific way.
> 
> Not really, it's managed by the block layer. It's on purpose that I haven't
> removed it. The patch is already too big, and I want to keep the patch
> free from extra noise that can be removed by later patches.
> 
> Should I remove it anyway?

No point in keeping it, if it's not needed...

>> I haven't even tried debugging this next one: doing an insmod+rmmod
>> caused this warning followed by a panic:
>>
> 
> I'll look into it. Thanks

nr_tags must be uninitialized or screwed up somehow, otherwise I don't
see how that kmalloc() could warn on being too large. Keith, are you
running with slab debugging? Matias, might be worth trying.

FWIW, in general, we've run a bunch of testing internally at FB, all on a
backported blk-mq stack and nvme-mq. No issues observed, and performance
is good and overhead low. For other reasons that I can't go into here,
this is the stack on which we'll run nvme hardware. Other features are
much more easily implemented on top of a blk-mq based driver than on a
bio based one, similarly to the suspended part above.

-- 
Jens Axboe



Re: [PATCH v11] NVMe: Convert to blk-mq

2014-08-14 Thread Matias Bjørling

On 08/14/2014 12:27 AM, Keith Busch wrote:

On Sun, 10 Aug 2014, Matias Bjørling wrote:

On Sat, Jul 26, 2014 at 11:07 AM, Matias Bjørling  wrote:

This converts the NVMe driver to a blk-mq request-based driver.



Willy, do you need me to make any changes to the conversion? Can you
pick it up for 3.17?


Hi Matias,



Hi Keith, Thanks for taking the time to take another look.


I'm starting to get a little more spare time to look at this again. I
think there are still some bugs here, or perhaps something better we
can do. I'll just start with one snippet of the code:

@@ -765,33 +619,49 @@ static int nvme_submit_bio_queue(struct nvme_queue
*nvmeq, struct nvme_ns *ns,
  submit_iod:
 spin_lock_irq(&nvmeq->q_lock);
 if (nvmeq->q_suspended) {
 spin_unlock_irq(&nvmeq->q_lock);
 goto finish_cmd;
 }

  <snip>

  finish_cmd:
 nvme_finish_cmd(nvmeq, req->tag, NULL);
 nvme_free_iod(nvmeq->dev, iod);
 return result;
}


If the nvme queue is marked "suspended", this code just goto's the finish
without setting "result", so I don't think that's right.


The result is set to BLK_MQ_RQ_QUEUE_ERROR, or am I mistaken?



But do we even need the "q_suspended" flag anymore? It was there because
we couldn't prevent incoming requests as a bio based driver and we needed
some way to mark that the h/w's IO queue was temporarily inactive, but
blk-mq has ways to start/stop a queue at a higher level, right? If so,
I think that's probably a better way than using this driver specific way.


Not really, it's managed by the block layer. It's on purpose that I haven't
removed it. The patch is already too big, and I want to keep the patch
free from extra noise that can be removed by later patches.


Should I remove it anyway?



I haven't even tried debugging this next one: doing an insmod+rmmod
caused this warning followed by a panic:



I'll look into it. Thanks


Aug 13 15:41:41 kbgrz1 kernel: [   89.207525] [ cut here
]
Aug 13 15:41:41 kbgrz1 kernel: [   89.207538] WARNING: CPU: 8 PID: 5768
at mm/slab_common.c:491 kmalloc_slab+0x33/0x8b()
Aug 13 15:41:41 kbgrz1 kernel: [   89.207541] Modules linked in: nvme(-)
parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss
oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp
llc jfs joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal
coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support microcode pcspkr
ehci_pci ehci_hcd usbcore acpi_cpufreq lpc_ich usb_common ioatdma
mfd_core i2c_i801 evdev wmi tpm_tis ipmi_si tpm ipmi_msghandler
processor thermal_sys button ext4 crc16 jbd2 mbcache sg sr_mod cdrom
sd_mod crct10dif_generic crc_t10dif crct10dif_common nbd dm_mod
crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw
gf128mul ablk_helper cryptd isci libsas igb ahci libahci
scsi_transport_sas ptp pps_core libata i2c_algo_bit i2c_core scsi_mod dca
Aug 13 15:41:41 kbgrz1 kernel: [   89.207653] CPU: 8 PID: 5768 Comm:
nvme1 Not tainted 3.16.0-rc6+ #24
Aug 13 15:41:41 kbgrz1 kernel: [   89.207656] Hardware name: Intel
Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210
12/23/2013
Aug 13 15:41:41 kbgrz1 kernel: [   89.207659]  
0009 8139f9ba 
Aug 13 15:41:41 kbgrz1 kernel: [   89.207664]  8103db86
e8601d80 810f0d59 0246
Aug 13 15:41:41 kbgrz1 kernel: [   89.207669]  
880827bf28c0 8020 88082b8d9d00
Aug 13 15:41:41 kbgrz1 kernel: [   89.207674] Call Trace:
Aug 13 15:41:41 kbgrz1 kernel: [   89.207685]  [] ?
dump_stack+0x41/0x51
Aug 13 15:41:41 kbgrz1 kernel: [   89.207694]  [] ?
warn_slowpath_common+0x7d/0x95
Aug 13 15:41:41 kbgrz1 kernel: [   89.207699]  [] ?
kmalloc_slab+0x33/0x8b
Aug 13 15:41:41 kbgrz1 kernel: [   89.207704]  [] ?
kmalloc_slab+0x33/0x8b
Aug 13 15:41:41 kbgrz1 kernel: [   89.207710]  [] ?
__kmalloc+0x28/0xf1
Aug 13 15:41:41 kbgrz1 kernel: [   89.207719]  [] ?
blk_mq_tag_busy_iter+0x30/0x7c
Aug 13 15:41:41 kbgrz1 kernel: [   89.207728]  [] ?
nvme_init_hctx+0x49/0x49 [nvme]
Aug 13 15:41:41 kbgrz1 kernel: [   89.207733]  [] ?
blk_mq_tag_busy_iter+0x30/0x7c
Aug 13 15:41:41 kbgrz1 kernel: [   89.207738]  [] ?
nvme_clear_queue+0x72/0x7d [nvme]
Aug 13 15:41:41 kbgrz1 kernel: [   89.207744]  [] ?
nvme_del_queue_end+0x12/0x26 [nvme]
Aug 13 15:41:41 kbgrz1 kernel: [   89.207750]  [] ?
kthread_worker_fn+0xb1/0x111
Aug 13 15:41:41 kbgrz1 kernel: [   89.207754]  [] ?
kthread_create_on_node+0x171/0x171
Aug 13 15:41:41 kbgrz1 kernel: [   89.207758]  [] ?
kthread_create_on_node+0x171/0x171
Aug 13 15:41:41 kbgrz1 kernel: [   89.207762]  [] ?
kthread+0x9e/0xa6
Aug 13 15:41:41 kbgrz1 kernel: [   89.207766]  [] ?
__kthread_parkme+0x5c/0x5c
Aug 13 15:41:41 kbgrz1 kernel: [   89.207773]  [] ?
ret_from_fork+0x7c/0xb0
Aug 13 15:41:41 kbgrz1 kernel: [   89.20]  [] ?
__kthread_parkme+0x5c/0x5c
Aug 13 15:41:41 kbgrz1 kernel: [   89.207780] ---[ end trace


Re: [PATCH v11] NVMe: Convert to blk-mq

2014-08-13 Thread Keith Busch

On Sun, 10 Aug 2014, Matias Bjørling wrote:

On Sat, Jul 26, 2014 at 11:07 AM, Matias Bjørling  wrote:

This converts the NVMe driver to a blk-mq request-based driver.



Willy, do you need me to make any changes to the conversion? Can you
pick it up for 3.17?


Hi Matias,

I'm starting to get a little more spare time to look at this again. I
think there are still some bugs here, or perhaps something better we
can do. I'll just start with one snippet of the code:

@@ -765,33 +619,49 @@ static int nvme_submit_bio_queue(struct nvme_queue 
*nvmeq, struct nvme_ns *ns,
 submit_iod:
spin_lock_irq(&nvmeq->q_lock);
if (nvmeq->q_suspended) {
spin_unlock_irq(&nvmeq->q_lock);
goto finish_cmd;
}

 <snip>

 finish_cmd:
nvme_finish_cmd(nvmeq, req->tag, NULL);
nvme_free_iod(nvmeq->dev, iod);
return result;
}


If the nvme queue is marked "suspended", this code just goto's the finish
without setting "result", so I don't think that's right.

But do we even need the "q_suspended" flag anymore? It was there because
we couldn't prevent incoming requests as a bio based driver and we needed
some way to mark that the h/w's IO queue was temporarily inactive, but
blk-mq has ways to start/stop a queue at a higher level, right? If so,
I think that's probably a better way than using this driver specific way.
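
For reference, a minimal sketch of what that higher-level alternative could
look like with the blk-mq helpers available at the time; the wrapper names
are hypothetical and this is not code from the patch:

/* Instead of checking a driver-private q_suspended flag in queue_rq(),
 * stop the hw queues so blk-mq simply stops calling ->queue_rq(). */
static void nvme_suspend_io_queues(struct nvme_ns *ns)
{
	blk_mq_stop_hw_queues(ns->queue);
}

/* Restart dispatching once the hw queues are usable again. */
static void nvme_resume_io_queues(struct nvme_ns *ns)
{
	blk_mq_start_stopped_hw_queues(ns->queue, true);
}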

I haven't even tried debugging this next one: doing an insmod+rmmod
caused this warning followed by a panic:

Aug 13 15:41:41 kbgrz1 kernel: [   89.207525] [ cut here 
]
Aug 13 15:41:41 kbgrz1 kernel: [   89.207538] WARNING: CPU: 8 PID: 5768 at 
mm/slab_common.c:491 kmalloc_slab+0x33/0x8b()
Aug 13 15:41:41 kbgrz1 kernel: [   89.207541] Modules linked in: nvme(-) 
parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss 
oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc jfs 
joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp 
kvm_intel kvm iTCO_wdt iTCO_vendor_support microcode pcspkr ehci_pci ehci_hcd 
usbcore acpi_cpufreq lpc_ich usb_common ioatdma mfd_core i2c_i801 evdev wmi 
tpm_tis ipmi_si tpm ipmi_msghandler processor thermal_sys button ext4 crc16 
jbd2 mbcache sg sr_mod cdrom sd_mod crct10dif_generic crc_t10dif 
crct10dif_common nbd dm_mod crc32c_intel ghash_clmulni_intel aesni_intel 
aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd isci libsas igb ahci 
libahci scsi_transport_sas ptp pps_core libata i2c_algo_bit i2c_core scsi_mod 
dca
Aug 13 15:41:41 kbgrz1 kernel: [   89.207653] CPU: 8 PID: 5768 Comm: nvme1 Not 
tainted 3.16.0-rc6+ #24
Aug 13 15:41:41 kbgrz1 kernel: [   89.207656] Hardware name: Intel Corporation 
S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
Aug 13 15:41:41 kbgrz1 kernel: [   89.207659]   
0009 8139f9ba 
Aug 13 15:41:41 kbgrz1 kernel: [   89.207664]  8103db86 
e8601d80 810f0d59 0246
Aug 13 15:41:41 kbgrz1 kernel: [   89.207669]   
880827bf28c0 8020 88082b8d9d00
Aug 13 15:41:41 kbgrz1 kernel: [   89.207674] Call Trace:
Aug 13 15:41:41 kbgrz1 kernel: [   89.207685]  [] ? 
dump_stack+0x41/0x51
Aug 13 15:41:41 kbgrz1 kernel: [   89.207694]  [] ? 
warn_slowpath_common+0x7d/0x95
Aug 13 15:41:41 kbgrz1 kernel: [   89.207699]  [] ? 
kmalloc_slab+0x33/0x8b
Aug 13 15:41:41 kbgrz1 kernel: [   89.207704]  [] ? 
kmalloc_slab+0x33/0x8b
Aug 13 15:41:41 kbgrz1 kernel: [   89.207710]  [] ? 
__kmalloc+0x28/0xf1
Aug 13 15:41:41 kbgrz1 kernel: [   89.207719]  [] ? 
blk_mq_tag_busy_iter+0x30/0x7c
Aug 13 15:41:41 kbgrz1 kernel: [   89.207728]  [] ? 
nvme_init_hctx+0x49/0x49 [nvme]
Aug 13 15:41:41 kbgrz1 kernel: [   89.207733]  [] ? 
blk_mq_tag_busy_iter+0x30/0x7c
Aug 13 15:41:41 kbgrz1 kernel: [   89.207738]  [] ? 
nvme_clear_queue+0x72/0x7d [nvme]
Aug 13 15:41:41 kbgrz1 kernel: [   89.207744]  [] ? 
nvme_del_queue_end+0x12/0x26 [nvme]
Aug 13 15:41:41 kbgrz1 kernel: [   89.207750]  [] ? 
kthread_worker_fn+0xb1/0x111
Aug 13 15:41:41 kbgrz1 kernel: [   89.207754]  [] ? 
kthread_create_on_node+0x171/0x171
Aug 13 15:41:41 kbgrz1 kernel: [   89.207758]  [] ? 
kthread_create_on_node+0x171/0x171
Aug 13 15:41:41 kbgrz1 kernel: [   89.207762]  [] ? 
kthread+0x9e/0xa6
Aug 13 15:41:41 kbgrz1 kernel: [   89.207766]  [] ? 
__kthread_parkme+0x5c/0x5c
Aug 13 15:41:41 kbgrz1 kernel: [   89.207773]  [] ? 
ret_from_fork+0x7c/0xb0
Aug 13 15:41:41 kbgrz1 kernel: [   89.20]  [] ? 
__kthread_parkme+0x5c/0x5c
Aug 13 15:41:41 kbgrz1 kernel: [   89.207780] ---[ end trace 8dc4a4c97c467d4c 
]---
Aug 13 15:41:41 kbgrz1 kernel: [   89.223627] PGD 0 
Aug 13 15:41:41 kbgrz1 kernel: [   89.226038] Oops:  [#1] SMP 
Aug 13 15:41:41 kbgrz1 kernel: [   89.229917] Modules linked in: nvme(-) parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc jfs joydev 

Re: [PATCH v11] NVMe: Convert to blk-mq

2014-08-10 Thread Matias Bjørling
On Sat, Jul 26, 2014 at 11:07 AM, Matias Bjørling  wrote:
> This converts the NVMe driver to a blk-mq request-based driver.
>

Willy, do you need me to make any changes to the conversion? Can you
pick it up for 3.17?

Thanks,
Matias


[PATCH v11] NVMe: Convert to blk-mq

2014-07-26 Thread Matias Bjørling
This converts the NVMe driver to a blk-mq request-based driver.

The NVMe driver is currently bio-based and implements queue logic within itself.
By using blk-mq, a lot of these responsibilities can be moved and simplified.

The patch is divided into the following blocks:

 * Per-command data and cmdid have been moved into the struct request field.
   The cmdid_data can be retrieved using blk_mq_rq_to_pdu(), and id maintenance
   is now handled by blk-mq through the rq->tag field (see the sketch after
   this list).

 * The logic for splitting bio's has been moved into the blk-mq layer. The
   driver instead notifies the block layer about limited gap support in SG
   lists.

 * Timeout handling is driven by blk-mq and reimplemented within nvme_timeout().
   This includes both abort handling and command cancellation.

 * Assignment of nvme queues to CPUs is replaced with the blk-mq version. The
   current blk-mq strategy is to assign the number of mapped queues and CPUs to
   provide synergy, while the nvme driver assigns as many nvme hw queues as
   possible. This can be implemented in blk-mq if needed.

 * NVMe queues are merged with the tags structure of blk-mq.

 * blk-mq takes care of setup/teardown of nvme queues and guards invalid
   accesses. Therefore, RCU-usage for nvme queues can be removed.

 * IO tracing and accounting are handled by blk-mq and therefore removed.

 * Setup and teardown of nvme queues are now handled by nvme_[init/exit]_hctx.
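
A minimal sketch of the per-command data mechanism referred to in the first
bullet; the pdu struct and helper names here are illustrative, not the
driver's actual ones. The tag set's cmd_size reserves driver data behind
each request, and blk_mq_rq_to_pdu() returns it:

#include <linux/blk-mq.h>
#include <linux/string.h>

/* Illustrative per-command context, allocated by blk-mq behind each request. */
struct nvme_cmd_pdu {
	void *ctx;
	int aborted;
};

static int nvme_setup_tagset_sketch(struct blk_mq_tag_set *set,
				    struct blk_mq_ops *ops, void *driver_data)
{
	memset(set, 0, sizeof(*set));
	set->ops = ops;
	set->nr_hw_queues = 1;
	set->queue_depth = 64;
	set->cmd_size = sizeof(struct nvme_cmd_pdu);	/* per-request driver data */
	set->driver_data = driver_data;
	return blk_mq_alloc_tag_set(set);
}

static struct nvme_cmd_pdu *nvme_rq_pdu(struct request *rq)
{
	return blk_mq_rq_to_pdu(rq);	/* rq->tag doubles as the command id */
}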

Contributions in this patch from:

  Sam Bradshaw <sbrads...@micron.com>
  Jens Axboe <ax...@fb.com>
  Keith Busch <keith.bu...@intel.com>
  Robert Nelson <rlnel...@google.com>

Acked-by: Keith Busch <keith.bu...@intel.com>
Acked-by: Jens Axboe <ax...@fb.com>
Signed-off-by: Matias Bjørling <m...@bjorling.me>
---
 drivers/block/nvme-core.c | 1303 ++---
 drivers/block/nvme-scsi.c |8 +-
 include/linux/nvme.h  |   15 +-
 3 files changed, 534 insertions(+), 792 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 28aec2d..384dc91 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -13,9 +13,9 @@
  */
 
 #include <linux/nvme.h>
-#include <linux/bio.h>
 #include <linux/bitops.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
@@ -33,7 +33,6 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
-#include <linux/percpu.h>
 #include <linux/poison.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
@@ -42,9 +41,8 @@
 #include <scsi/sg.h>
 #include <asm-generic/io-64-nonatomic-lo-hi.h>
 
-#include <trace/events/block.h>
-
 #define NVME_Q_DEPTH   1024
+#define NVME_AQ_DEPTH  64
 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
 #define ADMIN_TIMEOUT  (admin_timeout * HZ)
@@ -76,10 +74,12 @@ static wait_queue_head_t nvme_kthread_wait;
 static struct notifier_block nvme_nb;
 
 static void nvme_reset_failed_dev(struct work_struct *ws);
+static int nvme_process_cq(struct nvme_queue *nvmeq);
 
 struct async_cmd_info {
struct kthread_work work;
struct kthread_worker *worker;
+   struct request *req;
u32 result;
int status;
void *ctx;
@@ -90,7 +90,6 @@ struct async_cmd_info {
  * commands and one for I/O commands).
  */
 struct nvme_queue {
-   struct rcu_head r_head;
struct device *q_dmadev;
struct nvme_dev *dev;
char irqname[24];   /* nvme4294967295-65535\0 */
@@ -99,10 +98,6 @@ struct nvme_queue {
volatile struct nvme_completion *cqes;
dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
-   wait_queue_head_t sq_full;
-   wait_queue_t sq_cong_wait;
-   struct bio_list sq_cong;
-   struct list_head iod_bio;
u32 __iomem *q_db;
u16 q_depth;
u16 cq_vector;
@@ -113,9 +108,8 @@ struct nvme_queue {
u8 cq_phase;
u8 cqe_seen;
u8 q_suspended;
-   cpumask_var_t cpu_mask;
struct async_cmd_info cmdinfo;
-   unsigned long cmdid_data[];
+   struct blk_mq_tags *tags;
 };
 
 /*
@@ -143,62 +137,65 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
 struct nvme_cmd_info {
nvme_completion_fn fn;
void *ctx;
-   unsigned long timeout;
int aborted;
+   struct nvme_queue *nvmeq;
 };
 
-static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+   unsigned int hctx_idx)
 {
-   return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
+   struct nvme_dev *dev = data;
+   struct nvme_queue *nvmeq = dev->queues[0];
+
+   hctx->driver_data = nvmeq;
+   return 0;
 }
 
-static unsigned nvme_queue_extra(int depth)
+static int nvme_admin_init_request(void *data, struct request *req,
+   unsigned int hctx_idx, unsigned int rq_idx,
+   unsigned int numa_node)
 {
-   return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
+   struct nvme_dev *dev = data;
+   struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+   struct nvme_queue *nvmeq = dev->queues[0];
+
+   BUG_ON(!nvmeq);
+   cmd->nvmeq = nvmeq;
