Hi, On Wed, Oct 31, 2018 at 08:36:31AM -0600, Jens Axboe wrote: > NVMe does round-robin between queues by default, which means that > sharing a queue map for both reads and writes can be problematic > in terms of read servicing. It's much easier to flood the queue > with writes and reduce the read servicing. > > Implement two queue maps, one for reads and one for writes. The > write queue count is configurable through the 'write_queues' > parameter. > > By default, we retain the previous behavior of having a single > queue set, shared between reads and writes. Setting 'write_queues' > to a non-zero value will create two queue sets, one for reads and > one for writes, the latter using the configurable number of > queues (hardware queue counts permitting). > > Reviewed-by: Hannes Reinecke <h...@suse.com> > Reviewed-by: Keith Busch <keith.bu...@intel.com> > Signed-off-by: Jens Axboe <ax...@kernel.dk>
This patch causes hangs when running recent versions of -next with several architectures; see the -next column at kerneltests.org/builders for details. Bisect log below; this was run with qemu on alpha. Reverting this patch as well as "nvme: add separate poll queue map" fixes the problem. Guenter --- # bad: [220dcf1c6fc97f8873b6d9fe121b80decd4b71a8] Add linux-next specific files for 20181113 # good: [ccda4af0f4b92f7b4c308d3acc262f4a7e3affad] Linux 4.20-rc2 git bisect start 'HEAD' 'v4.20-rc2' # good: [b5ae1d7e1bd7cf5dfdef977774da5cb69c60c911] Merge remote-tracking branch 'crypto/master' git bisect good b5ae1d7e1bd7cf5dfdef977774da5cb69c60c911 # bad: [be877b847ffe96fb18e4925f05d5c2eb12b6b9f1] Merge remote-tracking branch 'block/for-next' git bisect bad be877b847ffe96fb18e4925f05d5c2eb12b6b9f1 # good: [088122c5028636643d566c89cfdc621beed79974] Merge remote-tracking branch 'drm-intel/for-linux-next' git bisect good 088122c5028636643d566c89cfdc621beed79974 # good: [3577f74d5ae898c34252c5cdc096a681910a919f] Merge remote-tracking branch 'drm-misc/for-linux-next' git bisect good 3577f74d5ae898c34252c5cdc096a681910a919f # good: [72008e28c7bf500a03f09929176bd025de94861b] Merge remote-tracking branch 'sound-asoc/for-next' git bisect good 72008e28c7bf500a03f09929176bd025de94861b # bad: [d1e36282b0bbd5de6a9c4d5275e94ef3b3438f48] block: add REQ_HIPRI and inherit it from IOCB_HIPRI git bisect bad d1e36282b0bbd5de6a9c4d5275e94ef3b3438f48 # good: [4316b79e4321d4140164e42f228778e5bc66c84f] block: kill legacy parts of timeout handling git bisect good 4316b79e4321d4140164e42f228778e5bc66c84f # good: [a0fedc857dff457e123aeaa2039d28ac90e543df] Merge branch 'irq/for-block' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-4.21/block git bisect good a0fedc857dff457e123aeaa2039d28ac90e543df # good: [b3c661b15d5ab11d982e58bee23e05c1780528a1] blk-mq: support multiple hctx maps git bisect good b3c661b15d5ab11d982e58bee23e05c1780528a1 # good: [67cae4c948a5311121905a2a8740c50daf7f6478] blk-mq: cleanup and improve list insertion git bisect good 67cae4c948a5311121905a2a8740c50daf7f6478 # good: [843477d4cc5c4bb4e346c561ecd3b9d0bd67e8c8] blk-mq: initial support for multiple queue maps git bisect good 843477d4cc5c4bb4e346c561ecd3b9d0bd67e8c8 # bad: [3b6592f70ad7b4c24dd3eb2ac9bbe3353d02c992] nvme: utilize two queue maps, one for reads and one for writes git bisect bad 3b6592f70ad7b4c24dd3eb2ac9bbe3353d02c992 # first bad commit: [3b6592f70ad7b4c24dd3eb2ac9bbe3353d02c992] nvme: utilize two queue maps, one for reads and one for writes > --- > drivers/nvme/host/pci.c | 200 > +++++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 181 insertions(+), 19 deletions(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index 49ad854d1b91..1987df13b73e 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -74,11 +74,29 @@ static int io_queue_depth = 1024; > module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); > MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); > > +static int queue_count_set(const char *val, const struct kernel_param *kp); > +static const struct kernel_param_ops queue_count_ops = { > + .set = queue_count_set, > + .get = param_get_int, > +}; > + > +static int write_queues; > +module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644); > +MODULE_PARM_DESC(write_queues, > + "Number of queues to use for writes. If not set, reads and writes " > + "will share a queue set."); > + > struct nvme_dev; > struct nvme_queue; > > static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); > > +enum { > + NVMEQ_TYPE_READ, > + NVMEQ_TYPE_WRITE, > + NVMEQ_TYPE_NR, > +}; > + > /* > * Represents an NVM Express device. Each nvme_dev is a PCI function. > */ > @@ -92,6 +110,7 @@ struct nvme_dev { > struct dma_pool *prp_small_pool; > unsigned online_queues; > unsigned max_qid; > + unsigned io_queues[NVMEQ_TYPE_NR]; > unsigned int num_vecs; > int q_depth; > u32 db_stride; > @@ -134,6 +153,17 @@ static int io_queue_depth_set(const char *val, const > struct kernel_param *kp) > return param_set_int(val, kp); > } > > +static int queue_count_set(const char *val, const struct kernel_param *kp) > +{ > + int n = 0, ret; > + > + ret = kstrtoint(val, 10, &n); > + if (n > num_possible_cpus()) > + n = num_possible_cpus(); > + > + return param_set_int(val, kp); > +} > + > static inline unsigned int sq_idx(unsigned int qid, u32 stride) > { > return qid * 2 * stride; > @@ -218,9 +248,20 @@ static inline void _nvme_check_size(void) > BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); > } > > +static unsigned int max_io_queues(void) > +{ > + return num_possible_cpus() + write_queues; > +} > + > +static unsigned int max_queue_count(void) > +{ > + /* IO queues + admin queue */ > + return 1 + max_io_queues(); > +} > + > static inline unsigned int nvme_dbbuf_size(u32 stride) > { > - return ((num_possible_cpus() + 1) * 8 * stride); > + return (max_queue_count() * 8 * stride); > } > > static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) > @@ -431,12 +472,41 @@ static int nvme_init_request(struct blk_mq_tag_set > *set, struct request *req, > return 0; > } > > +static int queue_irq_offset(struct nvme_dev *dev) > +{ > + /* if we have more than 1 vec, admin queue offsets us by 1 */ > + if (dev->num_vecs > 1) > + return 1; > + > + return 0; > +} > + > static int nvme_pci_map_queues(struct blk_mq_tag_set *set) > { > struct nvme_dev *dev = set->driver_data; > + int i, qoff, offset; > + > + offset = queue_irq_offset(dev); > + for (i = 0, qoff = 0; i < set->nr_maps; i++) { > + struct blk_mq_queue_map *map = &set->map[i]; > + > + map->nr_queues = dev->io_queues[i]; > + if (!map->nr_queues) { > + BUG_ON(i == NVMEQ_TYPE_READ); > > - return blk_mq_pci_map_queues(&set->map[0], to_pci_dev(dev->dev), > - dev->num_vecs > 1 ? 1 /* admin queue */ : 0); > + /* shared set, resuse read set parameters */ > + map->nr_queues = dev->io_queues[NVMEQ_TYPE_READ]; > + qoff = 0; > + offset = queue_irq_offset(dev); > + } > + > + map->queue_offset = qoff; > + blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); > + qoff += map->nr_queues; > + offset += map->nr_queues; > + } > + > + return 0; > } > > /** > @@ -849,6 +919,14 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx > *hctx, > return ret; > } > > +static int nvme_rq_flags_to_type(struct request_queue *q, unsigned int flags) > +{ > + if ((flags & REQ_OP_MASK) == REQ_OP_READ) > + return NVMEQ_TYPE_READ; > + > + return NVMEQ_TYPE_WRITE; > +} > + > static void nvme_pci_complete_rq(struct request *req) > { > struct nvme_iod *iod = blk_mq_rq_to_pdu(req); > @@ -1475,13 +1553,14 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { > }; > > static const struct blk_mq_ops nvme_mq_ops = { > - .queue_rq = nvme_queue_rq, > - .complete = nvme_pci_complete_rq, > - .init_hctx = nvme_init_hctx, > - .init_request = nvme_init_request, > - .map_queues = nvme_pci_map_queues, > - .timeout = nvme_timeout, > - .poll = nvme_poll, > + .queue_rq = nvme_queue_rq, > + .rq_flags_to_type = nvme_rq_flags_to_type, > + .complete = nvme_pci_complete_rq, > + .init_hctx = nvme_init_hctx, > + .init_request = nvme_init_request, > + .map_queues = nvme_pci_map_queues, > + .timeout = nvme_timeout, > + .poll = nvme_poll, > }; > > static void nvme_dev_remove_admin(struct nvme_dev *dev) > @@ -1891,6 +1970,87 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) > return ret; > } > > +static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int > nr_io_queues) > +{ > + unsigned int this_w_queues = write_queues; > + > + /* > + * Setup read/write queue split > + */ > + if (nr_io_queues == 1) { > + dev->io_queues[NVMEQ_TYPE_READ] = 1; > + dev->io_queues[NVMEQ_TYPE_WRITE] = 0; > + return; > + } > + > + /* > + * If 'write_queues' is set, ensure it leaves room for at least > + * one read queue > + */ > + if (this_w_queues >= nr_io_queues) > + this_w_queues = nr_io_queues - 1; > + > + /* > + * If 'write_queues' is set to zero, reads and writes will share > + * a queue set. > + */ > + if (!this_w_queues) { > + dev->io_queues[NVMEQ_TYPE_WRITE] = 0; > + dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues; > + } else { > + dev->io_queues[NVMEQ_TYPE_WRITE] = this_w_queues; > + dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues - this_w_queues; > + } > +} > + > +static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues) > +{ > + struct pci_dev *pdev = to_pci_dev(dev->dev); > + int irq_sets[2]; > + struct irq_affinity affd = { > + .pre_vectors = 1, > + .nr_sets = ARRAY_SIZE(irq_sets), > + .sets = irq_sets, > + }; > + int result; > + > + /* > + * For irq sets, we have to ask for minvec == maxvec. This passes > + * any reduction back to us, so we can adjust our queue counts and > + * IRQ vector needs. > + */ > + do { > + nvme_calc_io_queues(dev, nr_io_queues); > + irq_sets[0] = dev->io_queues[NVMEQ_TYPE_READ]; > + irq_sets[1] = dev->io_queues[NVMEQ_TYPE_WRITE]; > + if (!irq_sets[1]) > + affd.nr_sets = 1; > + > + /* > + * Need IRQs for read+write queues, and one for the admin queue > + */ > + nr_io_queues = irq_sets[0] + irq_sets[1] + 1; > + > + result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues, > + nr_io_queues, > + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); > + > + /* > + * Need to reduce our vec counts > + */ > + if (result == -ENOSPC) { > + nr_io_queues--; > + if (!nr_io_queues) > + return result; > + continue; > + } else if (result <= 0) > + return -EIO; > + break; > + } while (1); > + > + return result; > +} > + > static int nvme_setup_io_queues(struct nvme_dev *dev) > { > struct nvme_queue *adminq = &dev->queues[0]; > @@ -1898,11 +2058,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) > int result, nr_io_queues; > unsigned long size; > > - struct irq_affinity affd = { > - .pre_vectors = 1 > - }; > - > - nr_io_queues = num_possible_cpus(); > + nr_io_queues = max_io_queues(); > result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); > if (result < 0) > return result; > @@ -1937,13 +2093,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) > * setting up the full range we need. > */ > pci_free_irq_vectors(pdev); > - result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1, > - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); > + > + result = nvme_setup_irqs(dev, nr_io_queues); > if (result <= 0) > return -EIO; > + > dev->num_vecs = result; > dev->max_qid = max(result - 1, 1); > > + dev_info(dev->ctrl.device, "%d/%d read/write queues\n", > + dev->io_queues[NVMEQ_TYPE_READ], > + dev->io_queues[NVMEQ_TYPE_WRITE]); > + > /* > * Should investigate if there's a performance win from allocating > * more queues than interrupt vectors; it might allow the submission > @@ -2045,6 +2206,7 @@ static int nvme_dev_add(struct nvme_dev *dev) > if (!dev->ctrl.tagset) { > dev->tagset.ops = &nvme_mq_ops; > dev->tagset.nr_hw_queues = dev->online_queues - 1; > + dev->tagset.nr_maps = NVMEQ_TYPE_NR; > dev->tagset.timeout = NVME_IO_TIMEOUT; > dev->tagset.numa_node = dev_to_node(dev->dev); > dev->tagset.queue_depth = > @@ -2491,8 +2653,8 @@ static int nvme_probe(struct pci_dev *pdev, const > struct pci_device_id *id) > if (!dev) > return -ENOMEM; > > - dev->queues = kcalloc_node(num_possible_cpus() + 1, > - sizeof(struct nvme_queue), GFP_KERNEL, node); > + dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue), > + GFP_KERNEL, node); > if (!dev->queues) > goto free; > > -- > 2.7.4