Re: [PATCH v11] NVMe: Convert to blk-mq
On 08/15/2014 01:09 AM, Keith Busch wrote:
> The allocation and freeing of blk-mq parts seems a bit asymmetrical
> to me. The 'tags' belong to the tagset, but any request_queue using
> that tagset may free the tags. I looked to separate the tag allocation
> concerns, but that's more time than I have, so this is my quick-fix
> driver patch, forcing tag access through the hw_ctx.

I moved nvmeq->hctx->tags into nvmeq->tags in the last version and
missed the frees in blk_mq_map_swqueue. Good catch.

The previous method might have another problem: if there are two
namespaces sharing a tag set, the hctx_init fn could be called with a
different hctx for an nvmeq, leading to false tag sharing between nvme
queues.
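As an illustration of that false-sharing concern, here is an editorial sketch (not part of any posted patch): the WARN_ON() guard, the latch-on-first-use logic, and the queues[] indexing are assumptions layered on the field names used elsewhere in this thread.

/*
 * Hypothetical guard: bind an nvme_queue to its hctx only on first use,
 * so a second namespace sharing the same tag set cannot re-point the
 * nvmeq at a different hctx.
 */
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	/* every request_queue sharing the tag set maps hctx_idx to the same tags */
	WARN_ON(nvmeq->hctx && nvmeq->hctx->tags != hctx->tags);

	if (!nvmeq->hctx)
		nvmeq->hctx = hctx;
	hctx->driver_data = nvmeq;
	return 0;
}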
Re: [PATCH v11] NVMe: Convert to blk-mq
On Thu, 14 Aug 2014, Jens Axboe wrote:
> nr_tags must be uninitialized or screwed up somehow, otherwise I don't
> see how that kmalloc() could warn on being too large. Keith, are you
> running with slab debugging? Matias, might be worth trying.

The allocation and freeing of blk-mq parts seems a bit asymmetrical to
me. The 'tags' belong to the tagset, but any request_queue using that
tagset may free the tags. I looked to separate the tag allocation
concerns, but that's more time than I have, so this is my quick-fix
driver patch, forcing tag access through the hw_ctx.

---
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 384dc91..91432d2 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -109,7 +109,7 @@ struct nvme_queue {
 	u8 cqe_seen;
 	u8 q_suspended;
 	struct async_cmd_info cmdinfo;
-	struct blk_mq_tags *tags;
+	struct blk_mq_hw_ctx *hctx;
 };
 
 /*
@@ -148,6 +148,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 	struct nvme_queue *nvmeq = dev->queues[0];
 
 	hctx->driver_data = nvmeq;
+	nvmeq->hctx = hctx;
 	return 0;
 }
 
@@ -174,6 +175,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 	irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
 			hctx->cpumask);
 	hctx->driver_data = nvmeq;
+	nvmeq->hctx = hctx;
 	return 0;
 }
 
@@ -280,8 +282,7 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx,
 static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
 				unsigned int tag)
 {
-	struct request *req = blk_mq_tag_to_rq(nvmeq->tags, tag);
-
+	struct request *req = blk_mq_tag_to_rq(nvmeq->hctx->tags, tag);
 	return blk_mq_rq_to_pdu(req);
 }
 
@@ -654,8 +655,6 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 		nvme_submit_flush(nvmeq, ns, req->tag);
 	else
 		nvme_submit_iod(nvmeq, iod, ns);
-
- queued:
 	nvme_process_cq(nvmeq);
 	spin_unlock_irq(&nvmeq->q_lock);
 	return BLK_MQ_RQ_QUEUE_OK;
@@ -1051,9 +1050,8 @@ static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
 		if (tag >= qdepth)
 			break;
 
-		req = blk_mq_tag_to_rq(nvmeq->tags, tag++);
+		req = blk_mq_tag_to_rq(nvmeq->hctx->tags, tag++);
 		cmd = blk_mq_rq_to_pdu(req);
 		if (cmd->ctx == CMD_CTX_CANCELLED)
 			continue;
@@ -1132,8 +1130,8 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq)
 {
 	spin_lock_irq(&nvmeq->q_lock);
 	nvme_process_cq(nvmeq);
-	if (nvmeq->tags)
-		blk_mq_tag_busy_iter(nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
+	if (nvmeq->hctx->tags)
+		blk_mq_tag_busy_iter(nvmeq->hctx->tags, nvme_cancel_queue_ios, nvmeq);
 	spin_unlock_irq(&nvmeq->q_lock);
 }
 
@@ -1353,8 +1351,6 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 	if (blk_mq_alloc_tag_set(&dev->admin_tagset))
 		return -ENOMEM;
 
-	dev->queues[0]->tags = dev->admin_tagset.tags[0];
-
 	dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
 	if (!dev->admin_q) {
 		blk_mq_free_tag_set(&dev->admin_tagset);
@@ -2055,9 +2051,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	if (blk_mq_alloc_tag_set(&dev->tagset))
 		goto out;
 
-	for (i = 1; i < dev->online_queues; i++)
-		dev->queues[i]->tags = dev->tagset.tags[i - 1];
-
 	id_ns = mem;
 	for (i = 1; i <= nn; i++) {
 		res = nvme_identify(dev, i, 0, dma_addr);
Re: [PATCH v11] NVMe: Convert to blk-mq
On Thu, 14 Aug 2014, Matias Bjorling wrote:
> nr_tags must be uninitialized or screwed up somehow, otherwise I don't
> see how that kmalloc() could warn on being too large. Keith, are you
> running with slab debugging? Matias, might be worth trying.

The queue's tags were freed in 'blk_mq_map_swqueue' because some queues
weren't mapped to a s/w queue, but the driver still has a pointer to
that freed memory, so it's a use-after-free error.

This part of the driver looks different than it did in v8 when I last
tested. The nvme_queue used to have a pointer to the 'hctx'; now it
points directly to the 'tags', which doesn't appear to be safe.
Re: [PATCH v11] NVMe: Convert to blk-mq
>>> I haven't even tried debugging this next one: doing an insmod+rmmod
>>> caused this warning followed by a panic:
>>
>> I'll look into it. Thanks
>
> nr_tags must be uninitialized or screwed up somehow, otherwise I don't
> see how that kmalloc() could warn on being too large. Keith, are you
> running with slab debugging? Matias, might be worth trying.

Thanks for the hint.

Keith, I'll first have a chance to fix it tomorrow. Let me know if you
find it in the meantime. I've put up a nvmemq_v12 on the github
repository with the q_suspended removed.
Re: [PATCH v11] NVMe: Convert to blk-mq
On Thu, 14 Aug 2014, Jens Axboe wrote:
> On 08/14/2014 02:25 AM, Matias Bjørling wrote:
>> The result is set to BLK_MQ_RQ_QUEUE_ERROR, or am I mistaken?
>
> Looks OK to me, looking at the code, 'result' is initialized to
> BLK_MQ_RQ_QUEUE_BUSY though. Which looks correct, we don't want to
> error on a suspended queue.

My mistake missing how the result was initialized.

> nr_tags must be uninitialized or screwed up somehow, otherwise I don't
> see how that kmalloc() could warn on being too large. Keith, are you
> running with slab debugging? Matias, might be worth trying.

I'm not running with slab debugging.

If it's any clue at all, blk-mq is using 16 of the 31 allocated h/w
queues (which is okay, as we discussed earlier), and the oops happens
when clearing the first unused queue.

I'll have time to mess with this more today, so I can either help find
the problem or apply a patch if one becomes available.
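One defensive variant that fits the symptom described here is sketched below. It builds on the quick-fix patch earlier in the thread and is not a posted fix; the NULL check on nvmeq->hctx is the illustrative addition.

/*
 * Sketch: h/w queues the driver allocated but blk-mq never mapped never
 * see an init_hctx() call, so nvmeq->hctx stays NULL and must not be
 * dereferenced when the queue is cleared on teardown.
 */
static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	if (nvmeq->hctx && nvmeq->hctx->tags)
		blk_mq_tag_busy_iter(nvmeq->hctx->tags,
				     nvme_cancel_queue_ios, nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}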
Re: [PATCH v11] NVMe: Convert to blk-mq
On 08/14/2014 02:25 AM, Matias Bjørling wrote:
> On 08/14/2014 12:27 AM, Keith Busch wrote:
>> On Sun, 10 Aug 2014, Matias Bjørling wrote:
>>> On Sat, Jul 26, 2014 at 11:07 AM, Matias Bjørling wrote:
>>>> This converts the NVMe driver to a blk-mq request-based driver.
>>>
>>> Willy, do you need me to make any changes to the conversion? Can you
>>> pick it up for 3.17?
>>
>> Hi Matias,
>
> Hi Keith, Thanks for taking the time to take another look.
>
>> I'm starting to get a little more spare time to look at this again. I
>> think there are still some bugs here, or perhaps something better we
>> can do. I'll just start with one snippet of the code:
>>
>> @@ -765,33 +619,49 @@ static int nvme_submit_bio_queue(struct nvme_queue
>> *nvmeq, struct nvme_ns *ns,
>>  submit_iod:
>>  	spin_lock_irq(&nvmeq->q_lock);
>>  	if (nvmeq->q_suspended) {
>>  		spin_unlock_irq(&nvmeq->q_lock);
>>  		goto finish_cmd;
>>  	}
>>
>>  finish_cmd:
>>  	nvme_finish_cmd(nvmeq, req->tag, NULL);
>>  	nvme_free_iod(nvmeq->dev, iod);
>>  	return result;
>>  }
>>
>> If the nvme queue is marked "suspended", this code just goto's the
>> finish without setting "result", so I don't think that's right.
>
> The result is set to BLK_MQ_RQ_QUEUE_ERROR, or am I mistaken?

Looks OK to me, looking at the code, 'result' is initialized to
BLK_MQ_RQ_QUEUE_BUSY though. Which looks correct, we don't want to
error on a suspended queue.

>> But do we even need the "q_suspended" flag anymore? It was there
>> because we couldn't prevent incoming requests as a bio based driver
>> and we needed some way to mark that the h/w's IO queue was temporarily
>> inactive, but blk-mq has ways to start/stop a queue at a higher level,
>> right? If so, I think that's probably a better way than using this
>> driver specific way.
>
> Not really, its managed by the block layer. Its on purpose I haven't
> removed it. The patch is already too big, and I want to keep the patch
> free from extra noise that can be removed by later patches.
>
> Should I remove it anyway?

No point in keeping it, if it's not needed...

>> I haven't even tried debugging this next one: doing an insmod+rmmod
>> caused this warning followed by a panic:
>
> I'll look into it. Thanks

nr_tags must be uninitialized or screwed up somehow, otherwise I don't
see how that kmalloc() could warn on being too large. Keith, are you
running with slab debugging? Matias, might be worth trying.

FWIW, in general, we've run a bunch of testing internally at FB, all on
a backported blk-mq stack and nvme-mq. No issues observed, and
performance is good and overhead low. For other reasons that I can't go
into here, this is the stack on which we'll run nvme hardware. Other
features are much more easily implemented on top of a blk-mq based
driver as opposed to a bio based one, similarly to the suspended part
above.

-- 
Jens Axboe
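For reference, the "higher level" start/stop mechanism Keith alludes to looks roughly like the sketch below, written against the 3.16-era blk-mq API; the nvme_suspend_io()/nvme_resume_io() wrapper names are hypothetical, not driver functions.

/*
 * Instead of a per-nvmeq q_suspended flag, ask blk-mq to stop dispatching
 * on the namespace's request_queue and restart it later.
 */
static void nvme_suspend_io(struct nvme_ns *ns)
{
	blk_mq_stop_hw_queues(ns->queue);	/* no further ->queue_rq() calls */
}

static void nvme_resume_io(struct nvme_ns *ns)
{
	blk_mq_start_stopped_hw_queues(ns->queue, true);	/* async restart */
}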
Re: [PATCH v11] NVMe: Convert to blk-mq
On 08/14/2014 12:27 AM, Keith Busch wrote:
> On Sun, 10 Aug 2014, Matias Bjørling wrote:
>> On Sat, Jul 26, 2014 at 11:07 AM, Matias Bjørling wrote:
>>> This converts the NVMe driver to a blk-mq request-based driver.
>>
>> Willy, do you need me to make any changes to the conversion? Can you
>> pick it up for 3.17?
>
> Hi Matias,

Hi Keith, Thanks for taking the time to take another look.

> I'm starting to get a little more spare time to look at this again. I
> think there are still some bugs here, or perhaps something better we
> can do. I'll just start with one snippet of the code:
>
> @@ -765,33 +619,49 @@ static int nvme_submit_bio_queue(struct nvme_queue
> *nvmeq, struct nvme_ns *ns,
>  submit_iod:
>  	spin_lock_irq(&nvmeq->q_lock);
>  	if (nvmeq->q_suspended) {
>  		spin_unlock_irq(&nvmeq->q_lock);
>  		goto finish_cmd;
>  	}
>
>  finish_cmd:
>  	nvme_finish_cmd(nvmeq, req->tag, NULL);
>  	nvme_free_iod(nvmeq->dev, iod);
>  	return result;
>  }
>
> If the nvme queue is marked "suspended", this code just goto's the
> finish without setting "result", so I don't think that's right.

The result is set to BLK_MQ_RQ_QUEUE_ERROR, or am I mistaken?

> But do we even need the "q_suspended" flag anymore? It was there
> because we couldn't prevent incoming requests as a bio based driver and
> we needed some way to mark that the h/w's IO queue was temporarily
> inactive, but blk-mq has ways to start/stop a queue at a higher level,
> right? If so, I think that's probably a better way than using this
> driver specific way.

Not really, its managed by the block layer. Its on purpose I haven't
removed it. The patch is already too big, and I want to keep the patch
free from extra noise that can be removed by later patches.

Should I remove it anyway?

> I haven't even tried debugging this next one: doing an insmod+rmmod
> caused this warning followed by a panic:

I'll look into it. Thanks

> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207525] [ cut here ]
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207538] WARNING: CPU: 8 PID: 5768 at mm/slab_common.c:491 kmalloc_slab+0x33/0x8b()
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207541] Modules linked in: nvme(-) parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc jfs joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support microcode pcspkr ehci_pci ehci_hcd usbcore acpi_cpufreq lpc_ich usb_common ioatdma mfd_core i2c_i801 evdev wmi tpm_tis ipmi_si tpm ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2 mbcache sg sr_mod cdrom sd_mod crct10dif_generic crc_t10dif crct10dif_common nbd dm_mod crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd isci libsas igb ahci libahci scsi_transport_sas ptp pps_core libata i2c_algo_bit i2c_core scsi_mod dca
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207653] CPU: 8 PID: 5768 Comm: nvme1 Not tainted 3.16.0-rc6+ #24
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207656] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207659] 0009 8139f9ba
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207664] 8103db86 e8601d80 810f0d59 0246
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207669] 880827bf28c0 8020 88082b8d9d00
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207674] Call Trace:
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207685] [] ? dump_stack+0x41/0x51
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207694] [] ? warn_slowpath_common+0x7d/0x95
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207699] [] ? kmalloc_slab+0x33/0x8b
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207704] [] ? kmalloc_slab+0x33/0x8b
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207710] [] ? __kmalloc+0x28/0xf1
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207719] [] ? blk_mq_tag_busy_iter+0x30/0x7c
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207728] [] ? nvme_init_hctx+0x49/0x49 [nvme]
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207733] [] ? blk_mq_tag_busy_iter+0x30/0x7c
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207738] [] ? nvme_clear_queue+0x72/0x7d [nvme]
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207744] [] ? nvme_del_queue_end+0x12/0x26 [nvme]
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207750] [] ? kthread_worker_fn+0xb1/0x111
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207754] [] ? kthread_create_on_node+0x171/0x171
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207758] [] ? kthread_create_on_node+0x171/0x171
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207762] [] ? kthread+0x9e/0xa6
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207766] [] ? __kthread_parkme+0x5c/0x5c
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207773] [] ? ret_from_fork+0x7c/0xb0
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.20] [] ? __kthread_parkme+0x5c/0x5c
> Aug 13 15:41:41 kbgrz1 kernel: [ 89.207780] ---[ end trace
Re: [PATCH v11] NVMe: Convert to blk-mq
On Sun, 10 Aug 2014, Matias Bjørling wrote:
> On Sat, Jul 26, 2014 at 11:07 AM, Matias Bjørling wrote:
>> This converts the NVMe driver to a blk-mq request-based driver.
>
> Willy, do you need me to make any changes to the conversion? Can you
> pick it up for 3.17?

Hi Matias,

I'm starting to get a little more spare time to look at this again. I
think there are still some bugs here, or perhaps something better we
can do. I'll just start with one snippet of the code:

@@ -765,33 +619,49 @@ static int nvme_submit_bio_queue(struct nvme_queue
*nvmeq, struct nvme_ns *ns,
 submit_iod:
 	spin_lock_irq(&nvmeq->q_lock);
 	if (nvmeq->q_suspended) {
 		spin_unlock_irq(&nvmeq->q_lock);
 		goto finish_cmd;
 	}

 finish_cmd:
 	nvme_finish_cmd(nvmeq, req->tag, NULL);
 	nvme_free_iod(nvmeq->dev, iod);
 	return result;
 }

If the nvme queue is marked "suspended", this code just goto's the
finish without setting "result", so I don't think that's right.

But do we even need the "q_suspended" flag anymore? It was there because
we couldn't prevent incoming requests as a bio based driver and we needed
some way to mark that the h/w's IO queue was temporarily inactive, but
blk-mq has ways to start/stop a queue at a higher level, right? If so,
I think that's probably a better way than using this driver specific way.

I haven't even tried debugging this next one: doing an insmod+rmmod
caused this warning followed by a panic:

Aug 13 15:41:41 kbgrz1 kernel: [ 89.207525] [ cut here ]
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207538] WARNING: CPU: 8 PID: 5768 at mm/slab_common.c:491 kmalloc_slab+0x33/0x8b()
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207541] Modules linked in: nvme(-) parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc jfs joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support microcode pcspkr ehci_pci ehci_hcd usbcore acpi_cpufreq lpc_ich usb_common ioatdma mfd_core i2c_i801 evdev wmi tpm_tis ipmi_si tpm ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2 mbcache sg sr_mod cdrom sd_mod crct10dif_generic crc_t10dif crct10dif_common nbd dm_mod crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd isci libsas igb ahci libahci scsi_transport_sas ptp pps_core libata i2c_algo_bit i2c_core scsi_mod dca
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207653] CPU: 8 PID: 5768 Comm: nvme1 Not tainted 3.16.0-rc6+ #24
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207656] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207659] 0009 8139f9ba
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207664] 8103db86 e8601d80 810f0d59 0246
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207669] 880827bf28c0 8020 88082b8d9d00
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207674] Call Trace:
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207685] [] ? dump_stack+0x41/0x51
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207694] [] ? warn_slowpath_common+0x7d/0x95
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207699] [] ? kmalloc_slab+0x33/0x8b
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207704] [] ? kmalloc_slab+0x33/0x8b
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207710] [] ? __kmalloc+0x28/0xf1
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207719] [] ? blk_mq_tag_busy_iter+0x30/0x7c
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207728] [] ? nvme_init_hctx+0x49/0x49 [nvme]
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207733] [] ? blk_mq_tag_busy_iter+0x30/0x7c
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207738] [] ? nvme_clear_queue+0x72/0x7d [nvme]
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207744] [] ? nvme_del_queue_end+0x12/0x26 [nvme]
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207750] [] ? kthread_worker_fn+0xb1/0x111
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207754] [] ? kthread_create_on_node+0x171/0x171
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207758] [] ? kthread_create_on_node+0x171/0x171
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207762] [] ? kthread+0x9e/0xa6
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207766] [] ? __kthread_parkme+0x5c/0x5c
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207773] [] ? ret_from_fork+0x7c/0xb0
Aug 13 15:41:41 kbgrz1 kernel: [ 89.20] [] ? __kthread_parkme+0x5c/0x5c
Aug 13 15:41:41 kbgrz1 kernel: [ 89.207780] ---[ end trace 8dc4a4c97c467d4c ]---
Aug 13 15:41:41 kbgrz1 kernel: [ 89.223627] PGD 0
Aug 13 15:41:41 kbgrz1 kernel: [ 89.226038] Oops: [#1] SMP
Aug 13 15:41:41 kbgrz1 kernel: [ 89.229917] Modules linked in: nvme(-) parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc jfs joyd
Re: [PATCH v11] NVMe: Convert to blk-mq
On Sat, Jul 26, 2014 at 11:07 AM, Matias Bjørling wrote:
> This converts the NVMe driver to a blk-mq request-based driver.

Willy, do you need me to make any changes to the conversion? Can you
pick it up for 3.17?

Thanks,
Matias
[PATCH v11] NVMe: Convert to blk-mq
This converts the NVMe driver to a blk-mq request-based driver. The NVMe
driver is currently bio-based and implements queue logic within itself.
By using blk-mq, a lot of these responsibilities can be moved and
simplified.

The patch is divided into the following blocks:

 * Per-command data and cmdid have been moved into the struct request
   field. The cmdid_data can be retrieved using blk_mq_rq_to_pdu() and id
   maintenance are now handled by blk-mq through the rq->tag field.

 * The logic for splitting bio's has been moved into the blk-mq layer.
   The driver instead notifies the block layer about limited gap support
   in SG lists.

 * blk-mq handles timeouts and is reimplemented within nvme_timeout().
   This both includes abort handling and command cancelation.

 * Assignment of nvme queues to CPUs are replaced with the blk-mq
   version. The current blk-mq strategy is to assign the number of
   mapped queues and CPUs to provide synergy, while the nvme driver
   assign as many nvme hw queues as possible. This can be implemented
   in blk-mq if needed.

 * NVMe queues are merged with the tags structure of blk-mq.

 * blk-mq takes care of setup/teardown of nvme queues and guards invalid
   accesses. Therefore, RCU-usage for nvme queues can be removed.

 * IO tracing and accounting are handled by blk-mq and therefore removed.

 * Setup and teardown of nvme queues are now handled by
   nvme_[init/exit]_hctx.

Contributions in this patch from:

  Sam Bradshaw
  Jens Axboe
  Keith Busch
  Robert Nelson

Acked-by: Keith Busch
Acked-by: Jens Axboe
Signed-off-by: Matias Bjørling
---
 drivers/block/nvme-core.c | 1303 ++---
 drivers/block/nvme-scsi.c |    8 +-
 include/linux/nvme.h      |   15 +-
 3 files changed, 534 insertions(+), 792 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 28aec2d..384dc91 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -13,9 +13,9 @@
  */
 
 #include
-#include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -33,7 +33,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -42,9 +41,8 @@
 #include
 #include
-#include
-
 #define NVME_Q_DEPTH		1024
+#define NVME_AQ_DEPTH		64
 #define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
 #define ADMIN_TIMEOUT		(admin_timeout * HZ)
@@ -76,10 +74,12 @@ static wait_queue_head_t nvme_kthread_wait;
 static struct notifier_block nvme_nb;
 
 static void nvme_reset_failed_dev(struct work_struct *ws);
+static int nvme_process_cq(struct nvme_queue *nvmeq);
 
 struct async_cmd_info {
 	struct kthread_work work;
 	struct kthread_worker *worker;
+	struct request *req;
 	u32 result;
 	int status;
 	void *ctx;
@@ -90,7 +90,6 @@ struct async_cmd_info {
  * commands and one for I/O commands).
  */
 struct nvme_queue {
-	struct rcu_head r_head;
 	struct device *q_dmadev;
 	struct nvme_dev *dev;
 	char irqname[24];	/* nvme4294967295-65535\0 */
@@ -99,10 +98,6 @@ struct nvme_queue {
 	volatile struct nvme_completion *cqes;
 	dma_addr_t sq_dma_addr;
 	dma_addr_t cq_dma_addr;
-	wait_queue_head_t sq_full;
-	wait_queue_t sq_cong_wait;
-	struct bio_list sq_cong;
-	struct list_head iod_bio;
 	u32 __iomem *q_db;
 	u16 q_depth;
 	u16 cq_vector;
@@ -113,9 +108,8 @@ struct nvme_queue {
 	u8 cq_phase;
 	u8 cqe_seen;
 	u8 q_suspended;
-	cpumask_var_t cpu_mask;
 	struct async_cmd_info cmdinfo;
-	unsigned long cmdid_data[];
+	struct blk_mq_tags *tags;
 };
 
 /*
@@ -143,62 +137,65 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
 struct nvme_cmd_info {
 	nvme_completion_fn fn;
 	void *ctx;
-	unsigned long timeout;
 	int aborted;
+	struct nvme_queue *nvmeq;
 };
 
-static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+				unsigned int hctx_idx)
 {
-	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
+	struct nvme_dev *dev = data;
+	struct nvme_queue *nvmeq = dev->queues[0];
+
+	hctx->driver_data = nvmeq;
+	return 0;
 }
 
-static unsigned nvme_queue_extra(int depth)
+static int nvme_admin_init_request(void *data, struct request *req,
+				unsigned int hctx_idx, unsigned int rq_idx,
+				unsigned int numa_node)
 {
-	return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
+	struct nvme_dev *dev = data;
+	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = dev->queues[0];
+
+	BUG_ON(!nvmeq);
+	cmd->nvmeq = n
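The posted diff is cut off above. As a side note on the per-command-data mechanism the changelog describes, here is a minimal sketch of how blk_mq_rq_to_pdu() pairs with cmd_size; the tag-set initializer is illustrative only, and nvme_mq_ops and the exact field values are assumptions rather than text taken from the patch.

/*
 * blk-mq allocates cmd_size extra bytes behind every struct request, so
 * per-command state needs no separate cmdid bitmap in the driver.
 */
static struct blk_mq_tag_set nvme_tagset_example = {
	.ops		= &nvme_mq_ops,		/* driver's queue_rq etc. */
	.nr_hw_queues	= 1,
	.queue_depth	= NVME_Q_DEPTH,
	.cmd_size	= sizeof(struct nvme_cmd_info),
	.numa_node	= NUMA_NO_NODE,
};

static struct nvme_cmd_info *cmd_from_req(struct request *req)
{
	return blk_mq_rq_to_pdu(req);		/* pdu lives right after *req */
}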