Re: [patch v6 5/7] genirq/affinity: Remove the leftovers of the original set support

2019-02-17 Thread Ming Lei
On Sat, Feb 16, 2019 at 06:13:11PM +0100, Thomas Gleixner wrote:
> Now that the NVME driver is converted over to the calc_sets() callback, the
> workarounds of the original set support can be removed.
> 
> Signed-off-by: Thomas Gleixner 
> ---
>  kernel/irq/affinity.c |   20 ++++----------------
>  1 file changed, 4 insertions(+), 16 deletions(-)
> 
> Index: b/kernel/irq/affinity.c
> ===================================================================
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -264,20 +264,13 @@ irq_create_affinity_masks(unsigned int n
>  
>   /*
>* Simple invocations do not provide a calc_sets() callback. Install
> -  * the generic one. The check for affd->nr_sets is a temporary
> -  * workaround and will be removed after the NVME driver is converted
> -  * over.
> +  * the generic one.
>*/
> - if (!affd->nr_sets && !affd->calc_sets)
> + if (!affd->calc_sets)
>   affd->calc_sets = default_calc_sets;
>  
> - /*
> -  * If the device driver provided a calc_sets() callback let it
> -  * recalculate the number of sets and their size. The check will go
> -  * away once the NVME driver is converted over.
> -  */
> - if (affd->calc_sets)
> - affd->calc_sets(affd, affvecs);
> + /* Recalculate the sets */
> + affd->calc_sets(affd, affvecs);
>  
>   if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
>   return NULL;
> @@ -344,11 +337,6 @@ unsigned int irq_calc_affinity_vectors(u
>  
>   if (affd->calc_sets) {
>   set_vecs = maxvec - resv;
> - } else if (affd->nr_sets) {
> - unsigned int i;
> -
> - for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
> -         set_vecs += affd->set_size[i];
>   } else {
>   get_online_cpus();
>   set_vecs = cpumask_weight(cpu_possible_mask);
> 
> 

Reviewed-by: Ming Lei 

Thanks,
Ming
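
For readers following the series: after this conversion a driver describes its
interrupt sets through a calc_sets() callback instead of pre-computed
nr_sets/set_size[] values. Below is a minimal sketch of such a callback,
assuming only the struct irq_affinity fields visible in the diffs above
(pre_vectors, nr_sets, set_size[], calc_sets); everything prefixed with my_
is invented for illustration and is not the NVME implementation.

#include <linux/interrupt.h>

/* Hypothetical driver knob, not from the patch. */
static unsigned int my_nr_write_queues = 4;

/*
 * Sketch of a calc_sets() callback: split the spreadable vectors into a
 * "default/read" set and a "write" set.  The core calls it as
 * affd->calc_sets(affd, affvecs), where affvecs excludes pre/post vectors.
 */
static void my_calc_sets(struct irq_affinity *affd, unsigned int nvecs)
{
	unsigned int nr_write = my_nr_write_queues;

	if (nvecs <= 1 || nr_write >= nvecs) {
		/* too few vectors to split: one set holding everything */
		affd->nr_sets	  = 1;
		affd->set_size[0] = nvecs;
		return;
	}

	affd->nr_sets	  = 2;
	affd->set_size[0] = nvecs - nr_write;	/* default/read queues */
	affd->set_size[1] = nr_write;		/* write queues */
}

static struct irq_affinity my_affd = {
	.pre_vectors = 1,		/* e.g. one admin/config vector */
	.calc_sets   = my_calc_sets,
};

Such a descriptor would then be handed to the MSI allocation path (for
example pci_alloc_irq_vectors_affinity()), which ends up in the code being
cleaned up here.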


Re: [patch v6 1/7] genirq/affinity: Code consolidation

2019-02-17 Thread Ming Lei
> @@ -163,8 +163,6 @@ static int __irq_build_affinity_masks(co
>   curvec = firstvec;
>   --nodes;
>   }
> -
> -out:
>   return done;
>  }
>  
> @@ -174,13 +172,14 @@ static int __irq_build_affinity_masks(co
>   *   2) spread other possible CPUs on these vectors
>   */
>  static int irq_build_affinity_masks(const struct irq_affinity *affd,
> - int startvec, int numvecs, int firstvec,
> + unsigned int startvec, unsigned int numvecs,
> + unsigned int firstvec,
>   struct irq_affinity_desc *masks)
>  {
> - int curvec = startvec, nr_present, nr_others;
> - int ret = -ENOMEM;
> - cpumask_var_t nmsk, npresmsk;
> + unsigned int curvec = startvec, nr_present, nr_others;
>   cpumask_var_t *node_to_cpumask;
> + cpumask_var_t nmsk, npresmsk;
> + int ret = -ENOMEM;
>  
>   if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
>   return ret;
> @@ -239,12 +238,10 @@ static int irq_build_affinity_masks(cons
>   * Returns the irq_affinity_desc pointer or NULL if allocation failed.
>   */
>  struct irq_affinity_desc *
> -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
> +irq_create_affinity_masks(unsigned int nvecs, const struct irq_affinity *affd)
>  {
> - int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
> - int curvec, usedvecs;
> + unsigned int affvecs, curvec, usedvecs, nr_sets, i;
>   struct irq_affinity_desc *masks = NULL;
> - int i, nr_sets;
>  
>   /*
>* If there aren't any vectors left after applying the pre/post
> @@ -264,16 +261,17 @@ irq_create_affinity_masks(int nvecs, con
>* Spread on present CPUs starting from affd->pre_vectors. If we
>* have multiple sets, build each sets affinity mask separately.
>*/
> + affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
>   nr_sets = affd->nr_sets;
>   if (!nr_sets)
>   nr_sets = 1;
>  
>   for (i = 0, usedvecs = 0; i < nr_sets; i++) {
> - int this_vecs = affd->sets ? affd->sets[i] : affvecs;
> + unsigned int this_vecs = affd->sets ? affd->sets[i] : affvecs;
>   int ret;
>  
>   ret = irq_build_affinity_masks(affd, curvec, this_vecs,
> - curvec, masks);
> +curvec, masks);
>   if (ret) {
>   kfree(masks);
>   return NULL;
> @@ -303,17 +301,17 @@ irq_create_affinity_masks(int nvecs, con
>   * @maxvec:  The maximum number of vectors available
>   * @affd:Description of the affinity requirements
>   */
> -int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
> +unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
> +				       const struct irq_affinity *affd)
>  {
> - int resv = affd->pre_vectors + affd->post_vectors;
> - int vecs = maxvec - resv;
> - int set_vecs;
> + unsigned int resv = affd->pre_vectors + affd->post_vectors;
> + unsigned int set_vecs;
>  
>   if (resv > minvec)
>   return 0;
>  
>   if (affd->nr_sets) {
> - int i;
> + unsigned int i;
>  
>   for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
>   set_vecs += affd->sets[i];
> @@ -323,5 +321,5 @@ int irq_calc_affinity_vectors(int minvec
>   put_online_cpus();
>   }
>  
> - return resv + min(set_vecs, vecs);
> + return resv + min(set_vecs, maxvec - resv);
>  }

Reviewed-by: Ming Lei 


Thanks,
Ming


Re: [dm-devel] [PATCH V15 00/18] block: support multi-page bvec

2019-02-17 Thread Ming Lei
On Fri, Feb 15, 2019 at 10:59:47AM -0700, Jens Axboe wrote:
> On 2/15/19 10:14 AM, Bart Van Assche wrote:
> > On Fri, 2019-02-15 at 08:49 -0700, Jens Axboe wrote:
> >> On 2/15/19 4:13 AM, Ming Lei wrote:
> >>> This patchset brings multi-page bvec into block layer:
> >>
> >> Applied, thanks Ming. Let's hope it sticks!
> > 
> > Hi Jens and Ming,
> > 
> > Test nvmeof-mp/002 fails with Jens' for-next branch from this morning.
> > I have not yet tried to figure out which patch introduced the failure.
> > Anyway, this is what I see in the kernel log for test nvmeof-mp/002:
> > 
> > [  475.611363] BUG: unable to handle kernel NULL pointer dereference at 0020
> > [  475.621188] #PF error: [normal kernel read fault]
> > [  475.623148] PGD 0 P4D 0  
> > [  475.624737] Oops:  [#1] PREEMPT SMP KASAN
> > [  475.626628] CPU: 1 PID: 277 Comm: kworker/1:1H Tainted: GB 5.0.0-rc6-dbg+ #1
> > [  475.630232] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
> > [  475.633855] Workqueue: kblockd blk_mq_requeue_work
> > [  475.635777] RIP: 0010:__blk_recalc_rq_segments+0xbe/0x590
> > [  475.670948] Call Trace:
> > [  475.693515]  blk_recalc_rq_segments+0x2f/0x50
> > [  475.695081]  blk_insert_cloned_request+0xbb/0x1c0
> > [  475.701142]  dm_mq_queue_rq+0x3d1/0x770
> > [  475.707225]  blk_mq_dispatch_rq_list+0x5fc/0xb10
> > [  475.717137]  blk_mq_sched_dispatch_requests+0x256/0x300
> > [  475.721767]  __blk_mq_run_hw_queue+0xd6/0x180
> > [  475.725920]  __blk_mq_delay_run_hw_queue+0x25c/0x290
> > [  475.727480]  blk_mq_run_hw_queue+0x119/0x1b0
> > [  475.732019]  blk_mq_run_hw_queues+0x7b/0xa0
> > [  475.733468]  blk_mq_requeue_work+0x2cb/0x300
> > [  475.736473]  process_one_work+0x4f1/0xa40
> > [  475.739424]  worker_thread+0x67/0x5b0
> > [  475.741751]  kthread+0x1cf/0x1f0
> > [  475.746034]  ret_from_fork+0x24/0x30
> > 
> > (gdb) list *(__blk_recalc_rq_segments+0xbe)
> > 0x816a152e is in __blk_recalc_rq_segments (block/blk-merge.c:366).
> > 361  struct bio *bio)
> > 362 {
> > 363 struct bio_vec bv, bvprv = { NULL };
> > 364 int prev = 0;
> > 365 unsigned int seg_size, nr_phys_segs;
> > 366 unsigned front_seg_size = bio->bi_seg_front_size;
> > 367 struct bio *fbio, *bbio;
> > 368 struct bvec_iter iter;
> > 369
> > 370 if (!bio)
> 
> Just ran a few tests, and it also seems to cause about a 5% regression
> in per-core IOPS throughput. Prior to this work, I could get 1620K 4k
> rand read IOPS out of core, now I'm at ~1535K. The cycle stealer seems
> to be blk_queue_split() and blk_rq_map_sg().

Could you share your test setup with us?

I will run null_blk first and see if it can be reproduced.

Thanks,
Ming


Re: [dm-devel] [PATCH V15 00/18] block: support multi-page bvec

2019-02-17 Thread Ming Lei
On Fri, Feb 15, 2019 at 09:14:15AM -0800, Bart Van Assche wrote:
> On Fri, 2019-02-15 at 08:49 -0700, Jens Axboe wrote:
> > On 2/15/19 4:13 AM, Ming Lei wrote:
> > > This patchset brings multi-page bvec into block layer:
> > 
> > Applied, thanks Ming. Let's hope it sticks!
> 
> Hi Jens and Ming,
> 
> Test nvmeof-mp/002 fails with Jens' for-next branch from this morning.
> I have not yet tried to figure out which patch introduced the failure.
> Anyway, this is what I see in the kernel log for test nvmeof-mp/002:
> 
> [  475.611363] BUG: unable to handle kernel NULL pointer dereference at 0020
> [  475.621188] #PF error: [normal kernel read fault]
> [  475.623148] PGD 0 P4D 0  
> [  475.624737] Oops:  [#1] PREEMPT SMP KASAN
> [  475.626628] CPU: 1 PID: 277 Comm: kworker/1:1H Tainted: GB 5.0.0-rc6-dbg+ #1
> [  475.630232] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
> [  475.633855] Workqueue: kblockd blk_mq_requeue_work
> [  475.635777] RIP: 0010:__blk_recalc_rq_segments+0xbe/0x590
> [  475.670948] Call Trace:
> [  475.693515]  blk_recalc_rq_segments+0x2f/0x50
> [  475.695081]  blk_insert_cloned_request+0xbb/0x1c0
> [  475.701142]  dm_mq_queue_rq+0x3d1/0x770
> [  475.707225]  blk_mq_dispatch_rq_list+0x5fc/0xb10
> [  475.717137]  blk_mq_sched_dispatch_requests+0x256/0x300
> [  475.721767]  __blk_mq_run_hw_queue+0xd6/0x180
> [  475.725920]  __blk_mq_delay_run_hw_queue+0x25c/0x290
> [  475.727480]  blk_mq_run_hw_queue+0x119/0x1b0
> [  475.732019]  blk_mq_run_hw_queues+0x7b/0xa0
> [  475.733468]  blk_mq_requeue_work+0x2cb/0x300
> [  475.736473]  process_one_work+0x4f1/0xa40
> [  475.739424]  worker_thread+0x67/0x5b0
> [  475.741751]  kthread+0x1cf/0x1f0
> [  475.746034]  ret_from_fork+0x24/0x30
> 
> (gdb) list *(__blk_recalc_rq_segments+0xbe)
> 0x816a152e is in __blk_recalc_rq_segments (block/blk-merge.c:366).
> 361  struct bio *bio)
> 362 {
> 363 struct bio_vec bv, bvprv = { NULL };
> 364 int prev = 0;
> 365 unsigned int seg_size, nr_phys_segs;
> 366 unsigned front_seg_size = bio->bi_seg_front_size;
> 367 struct bio *fbio, *bbio;
> 368 struct bvec_iter iter;
> 369
> 370 if (!bio)
> 
> Bart.

Thanks for your test!

The following patch should fix this issue:


diff --git a/block/blk-merge.c b/block/blk-merge.c
index bed065904677..066b66430523 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -363,13 +363,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
struct bio_vec bv, bvprv = { NULL };
int prev = 0;
unsigned int seg_size, nr_phys_segs;
-   unsigned front_seg_size = bio->bi_seg_front_size;
+   unsigned front_seg_size;
struct bio *fbio, *bbio;
struct bvec_iter iter;
 
if (!bio)
return 0;
 
+   front_seg_size = bio->bi_seg_front_size;
+
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:

Thanks,
Ming


Re: [PATCH V15 00/18] block: support multi-page bvec

2019-02-17 Thread Ming Lei
On Fri, Feb 15, 2019 at 03:51:26PM +0100, Christoph Hellwig wrote:
> I still don't understand why mp_bvec_last_segment isn't simply
> called bvec_last_segment as there is no conflict.  But I don't
> want to hold this series up on that as there only are two users
> left and we can always just fix it up later.

mp_bvec_last_segment() is a bvec helper, so it is better to keep its
name consistent with the other bvec helpers.

Thanks,
Ming


[PATCH V15 16/18] block: document usage of bio iterator helpers

2019-02-15 Thread Ming Lei
Now that multi-page bvec is supported, some helpers may return data page by
page, while others may return it segment by segment. This patch documents
the usage.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 Documentation/block/biovecs.txt | 25 +
 1 file changed, 25 insertions(+)

diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt
index 25689584e6e0..ce6eccaf5df7 100644
--- a/Documentation/block/biovecs.txt
+++ b/Documentation/block/biovecs.txt
@@ -117,3 +117,28 @@ Other implications:
size limitations and the limitations of the underlying devices. Thus
there's no need to define ->merge_bvec_fn() callbacks for individual block
drivers.
+
+Usage of helpers:
+=
+
+* The following helpers whose names have the suffix of "_all" can only be used
+on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
+shouldn't use them because the bio may have been split before it reached the
+driver.
+
+   bio_for_each_segment_all()
+   bio_first_bvec_all()
+   bio_first_page_all()
+   bio_last_bvec_all()
+
+* The following helpers iterate over single-page segment. The passed 'struct
+bio_vec' will contain a single-page IO vector during the iteration
+
+   bio_for_each_segment()
+   bio_for_each_segment_all()
+
+* The following helpers iterate over multi-page bvec. The passed 'struct
+bio_vec' will contain a multi-page IO vector during the iteration
+
+   bio_for_each_bvec()
+   rq_for_each_bvec()
-- 
2.9.5
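
As a quick illustration of the distinction documented above (a sketch, not
part of the patch): iterating the same bio with both families of helpers.
bio_for_each_segment() yields single-page bio_vecs built in flight, while
bio_for_each_bvec() yields the stored multi-page bvecs, so the first count is
always greater than or equal to the second. my_count_both() is an invented
name.

#include <linux/bio.h>

static void my_count_both(struct bio *bio)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned int pages = 0, bvecs = 0;

	/* single-page view: one iteration per page-sized piece */
	bio_for_each_segment(bv, bio, iter)
		pages++;

	/* multi-page view: one iteration per physically contiguous bvec */
	bio_for_each_bvec(bv, bio, iter)
		bvecs++;

	pr_info("%u single-page segments in %u multi-page bvecs\n",
		pages, bvecs);
}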



[PATCH V15 18/18] block: kill BLK_MQ_F_SG_MERGE

2019-02-15 Thread Ming Lei
QUEUE_FLAG_NO_SG_MERGE has been killed, so kill BLK_MQ_F_SG_MERGE too.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-mq-debugfs.c   | 1 -
 drivers/block/loop.c | 2 +-
 drivers/block/nbd.c  | 2 +-
 drivers/block/rbd.c  | 2 +-
 drivers/block/skd_main.c | 1 -
 drivers/block/xen-blkfront.c | 2 +-
 drivers/md/dm-rq.c   | 2 +-
 drivers/mmc/core/queue.c | 3 +--
 drivers/scsi/scsi_lib.c  | 2 +-
 include/linux/blk-mq.h   | 1 -
 10 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 697d6213c82b..c39247c5ddb6 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -249,7 +249,6 @@ static const char *const alloc_policy_name[] = {
 static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_SHARED),
-   HCTX_FLAG_NAME(SG_MERGE),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
 };
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 8ef583197414..3d63ad036398 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1937,7 +1937,7 @@ static int loop_add(struct loop_device **l, int i)
lo->tag_set.queue_depth = 128;
lo->tag_set.numa_node = NUMA_NO_NODE;
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
lo->tag_set.driver_data = lo;
 
err = blk_mq_alloc_tag_set(&lo->tag_set);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 7c9a949e876b..32a7ba1674b7 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1571,7 +1571,7 @@ static int nbd_dev_add(int index)
nbd->tag_set.numa_node = NUMA_NO_NODE;
nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
-   BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
+   BLK_MQ_F_BLOCKING;
nbd->tag_set.driver_data = nbd;
 
err = blk_mq_alloc_tag_set(&nbd->tag_set);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 1e92b61d0bd5..abe9e1c89227 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3988,7 +3988,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->tag_set.ops = &rbd_mq_ops;
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
-   rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
rbd_dev->tag_set.nr_hw_queues = 1;
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
 
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index ab893a7571a2..7d3ad6c22ee5 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -2843,7 +2843,6 @@ static int skd_cons_disk(struct skd_device *skdev)
skdev->sgs_per_request * sizeof(struct scatterlist);
skdev->tag_set.numa_node = NUMA_NO_NODE;
skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
-   BLK_MQ_F_SG_MERGE |
BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO);
skdev->tag_set.driver_data = skdev;
rc = blk_mq_alloc_tag_set(&skdev->tag_set);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0ed4b200fa58..d43a5677ccbc 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -977,7 +977,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 
sector_size,
} else
info->tag_set.queue_depth = BLK_RING_SIZE(info);
info->tag_set.numa_node = NUMA_NO_NODE;
-   info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
info->tag_set.cmd_size = sizeof(struct blkif_req);
info->tag_set.driver_data = info;
 
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 4eb5f8c56535..b2f8eb2365ee 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -527,7 +527,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, 
struct dm_table *t)
md->tag_set->ops = &dm_mq_ops;
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
md->tag_set->numa_node = md->numa_node_id;
-   md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
md->tag_set->driver_data = md;
 
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 35cc138b096d..cc19e71c71d4 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -410,8 +410,7 @@ int m

[PATCH V15 17/18] block: kill QUEUE_FLAG_NO_SG_MERGE

2019-02-15 Thread Ming Lei
Since bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting"),
physical segment number is mainly figured out in blk_queue_split() for
fast path, and the flag of BIO_SEG_VALID is set there too.

Now only blk_recount_segments() and blk_recalc_rq_segments() use this
flag.

Basically blk_recount_segments() is bypassed in fast path given BIO_SEG_VALID
is set in blk_queue_split().

For another user of blk_recalc_rq_segments():

- run in partial completion branch of blk_update_request, which is an
  unusual case

- run in blk_cloned_rq_check_limits(), still not a big problem if the flag
  is killed since dm-rq is the only user.

Multi-page bvec is enabled now; not doing S/G merging is rather pointless
with the current setup of the I/O path, as it isn't going to save you a
significant amount of cycles.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-merge.c  | 31 ++-
 block/blk-mq-debugfs.c |  1 -
 block/blk-mq.c |  3 ---
 drivers/md/dm-table.c  | 13 -
 include/linux/blkdev.h |  1 -
 5 files changed, 6 insertions(+), 43 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1912499b08b7..bed065904677 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -358,8 +358,7 @@ void blk_queue_split(struct request_queue *q, struct bio 
**bio)
 EXPORT_SYMBOL(blk_queue_split);
 
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
-struct bio *bio,
-bool no_sg_merge)
+struct bio *bio)
 {
struct bio_vec bv, bvprv = { NULL };
int prev = 0;
@@ -385,13 +384,6 @@ static unsigned int __blk_recalc_rq_segments(struct 
request_queue *q,
nr_phys_segs = 0;
for_each_bio(bio) {
bio_for_each_bvec(bv, bio, iter) {
-   /*
-* If SG merging is disabled, each bio vector is
-* a segment
-*/
-   if (no_sg_merge)
-   goto new_segment;
-
if (prev) {
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
@@ -421,27 +413,16 @@ static unsigned int __blk_recalc_rq_segments(struct 
request_queue *q,
 
 void blk_recalc_rq_segments(struct request *rq)
 {
-   bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
-   &rq->q->queue_flags);
-
-   rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
-   no_sg_merge);
+   rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-   unsigned short seg_cnt = bio_segments(bio);
-
-   if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
-   (seg_cnt < queue_max_segments(q)))
-   bio->bi_phys_segments = seg_cnt;
-   else {
-   struct bio *nxt = bio->bi_next;
+   struct bio *nxt = bio->bi_next;
 
-   bio->bi_next = NULL;
-   bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
-   bio->bi_next = nxt;
-   }
+   bio->bi_next = NULL;
+   bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+   bio->bi_next = nxt;
 
bio_set_flag(bio, BIO_SEG_VALID);
 }
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index c782e81db627..697d6213c82b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -128,7 +128,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
-   QUEUE_FLAG_NAME(NO_SG_MERGE),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 44d471ff8754..fa508ee31742 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2837,9 +2837,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct 
blk_mq_tag_set *set,
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
 
-   if (!(set->flags & BLK_MQ_F_SG_MERGE))
-   blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-
q->sg_reserved_size = INT_MAX;
 
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4b1be754cc41..ba9481f1bf3c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1698,14 +1698,6 @@ static int device_is_not_random(struct dm_target *ti, 
struct dm_dev *dev,
return q && !blk_queue_add_random(q);
 }
 
-static 

[PATCH V15 15/18] block: always define BIO_MAX_PAGES as 256

2019-02-15 Thread Ming Lei
Now multi-page bvec can cover CONFIG_THP_SWAP, so we don't need to
increase BIO_MAX_PAGES for it.

CONFIG_THP_SWAP needs to split one THP into normal pages and add
them all to one bio. With multi-page bvec, it just takes one bvec to
hold them all.

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 include/linux/bio.h | 8 
 1 file changed, 8 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9f77adcfde82..bdd11d4c2f05 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -34,15 +34,7 @@
 #define BIO_BUG_ON
 #endif
 
-#ifdef CONFIG_THP_SWAP
-#if HPAGE_PMD_NR > 256
-#define BIO_MAX_PAGES  HPAGE_PMD_NR
-#else
 #define BIO_MAX_PAGES  256
-#endif
-#else
-#define BIO_MAX_PAGES  256
-#endif
 
 #define bio_prio(bio)  (bio)->bi_ioprio
 #define bio_set_prio(bio, prio)((bio)->bi_ioprio = prio)
-- 
2.9.5



[PATCH V15 09/18] fs/buffer.c: use bvec iterator to truncate the bio

2019-02-15 Thread Ming Lei
Once multi-page bvec is enabled, the last bvec may include more than one
page, so this patch uses mp_bvec_last_segment() to truncate the bio.

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 fs/buffer.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 52d024bfdbc1..817871274c77 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3032,7 +3032,10 @@ void guard_bio_eod(int op, struct bio *bio)
 
/* ..and clear the end of the buffer for reads */
if (op == REQ_OP_READ) {
-   zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+   struct bio_vec bv;
+
+   mp_bvec_last_segment(bvec, &bv);
+   zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
truncated_bytes);
}
 }
-- 
2.9.5



[PATCH V15 13/18] block: allow bio_for_each_segment_all() to iterate over multi-page bvec

2019-02-15 Thread Ming Lei
This patch introduces one extra iterator variable to bio_for_each_segment_all(),
so that bio_for_each_segment_all() can iterate over multi-page bvecs.

Given it is just a mechanical and simple change to all bio_for_each_segment_all()
users, this patch makes the tree-wide change in one single patch, so that we can
avoid using a temporary helper for this conversion.

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 block/bio.c   | 27 ++-
 block/bounce.c|  6 --
 drivers/md/bcache/btree.c |  3 ++-
 drivers/md/dm-crypt.c |  3 ++-
 drivers/md/raid1.c|  3 ++-
 drivers/staging/erofs/data.c  |  3 ++-
 drivers/staging/erofs/unzip_vle.c |  3 ++-
 fs/block_dev.c|  6 --
 fs/btrfs/compression.c|  3 ++-
 fs/btrfs/disk-io.c|  3 ++-
 fs/btrfs/extent_io.c  |  9 ++---
 fs/btrfs/inode.c  |  6 --
 fs/btrfs/raid56.c |  3 ++-
 fs/crypto/bio.c   |  3 ++-
 fs/direct-io.c|  4 +++-
 fs/exofs/ore.c|  3 ++-
 fs/exofs/ore_raid.c   |  3 ++-
 fs/ext4/page-io.c |  3 ++-
 fs/ext4/readpage.c|  3 ++-
 fs/f2fs/data.c|  9 ++---
 fs/gfs2/lops.c|  9 ++---
 fs/gfs2/meta_io.c |  3 ++-
 fs/iomap.c|  6 --
 fs/mpage.c|  3 ++-
 fs/xfs/xfs_aops.c |  5 +++--
 include/linux/bio.h   | 11 +--
 include/linux/bvec.h  | 30 ++
 27 files changed, 127 insertions(+), 46 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 4db1008309ed..968b12fea564 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1072,8 +1072,9 @@ static int bio_copy_from_iter(struct bio *bio, struct 
iov_iter *iter)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
 
ret = copy_page_from_iter(bvec->bv_page,
@@ -1103,8 +1104,9 @@ static int bio_copy_to_iter(struct bio *bio, struct 
iov_iter iter)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
 
ret = copy_page_to_iter(bvec->bv_page,
@@ -1126,8 +1128,9 @@ void bio_free_pages(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i)
+   bio_for_each_segment_all(bvec, bio, i, iter_all)
__free_page(bvec->bv_page);
 }
 EXPORT_SYMBOL(bio_free_pages);
@@ -1295,6 +1298,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
struct bio *bio;
int ret;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
if (!iov_iter_count(iter))
return ERR_PTR(-EINVAL);
@@ -1368,7 +1372,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
return bio;
 
  out_unmap:
-   bio_for_each_segment_all(bvec, bio, j) {
+   bio_for_each_segment_all(bvec, bio, j, iter_all) {
put_page(bvec->bv_page);
}
bio_put(bio);
@@ -1379,11 +1383,12 @@ static void __bio_unmap_user(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
/*
 * make sure we dirty pages we wrote to
 */
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (bio_data_dir(bio) == READ)
set_page_dirty_lock(bvec->bv_page);
 
@@ -1475,8 +1480,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
char *p = bio->bi_private;
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
p += bvec->bv_len;
}
@@ -1585,8 +1591,9 @@ void bio_set_pages_dirty(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
}
@@ -1596,8 +1603,9 @@ static void bio_release_pages(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 

[PATCH V15 12/18] bcache: avoid to use bio_for_each_segment_all() in bch_bio_alloc_pages()

2019-02-15 Thread Ming Lei
bch_bio_alloc_pages() is always called on a freshly allocated bio, so it is
safe to access the bvec table directly. Given this is the only such case,
open code the bvec table access, since bio_for_each_segment_all() will be
changed to support iterating over multi-page bvecs.

Acked-by: Coly Li 
Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 drivers/md/bcache/util.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 20eddeac1531..62fb917f7a4f 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -270,7 +270,11 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
int i;
struct bio_vec *bv;
 
-   bio_for_each_segment_all(bv, bio, i) {
+   /*
+* This is called on freshly new bio, so it is safe to access the
+* bvec table directly.
+*/
+   for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)
-- 
2.9.5



[PATCH V15 14/18] block: enable multipage bvecs

2019-02-15 Thread Ming Lei
This patch pulls the trigger for multi-page bvecs.

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/bio.c | 22 +++---
 fs/iomap.c  |  4 ++--
 fs/xfs/xfs_aops.c   |  4 ++--
 include/linux/bio.h |  2 +-
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 968b12fea564..83a2dfa417ca 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * @page: page to add
  * @len: length of the data to add
  * @off: offset of the data in @page
+ * @same_page: if %true only merge if the new data is in the same physical
+ * page as the last segment of the bio.
  *
  * Try to add the data at @page + @off to the last bvec of @bio.  This is a
  * a useful optimisation for file systems with a block size smaller than the
@@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * Return %true on success or %false on failure.
  */
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-   unsigned int len, unsigned int off)
+   unsigned int len, unsigned int off, bool same_page)
 {
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
 
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+   phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
+   bv->bv_offset + bv->bv_len - 1;
+   phys_addr_t page_addr = page_to_phys(page);
 
-   if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
-   bv->bv_len += len;
-   bio->bi_iter.bi_size += len;
-   return true;
-   }
+   if (vec_end_addr + 1 != page_addr + off)
+   return false;
+   if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
+   return false;
+
+   bv->bv_len += len;
+   bio->bi_iter.bi_size += len;
+   return true;
}
return false;
 }
@@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
 int bio_add_page(struct bio *bio, struct page *page,
 unsigned int len, unsigned int offset)
 {
-   if (!__bio_try_merge_page(bio, page, len, offset)) {
+   if (!__bio_try_merge_page(bio, page, len, offset, false)) {
if (bio_full(bio))
return 0;
__bio_add_page(bio, page, len, offset);
diff --git a/fs/iomap.c b/fs/iomap.c
index af736acd9006..0c350e658b7f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -318,7 +318,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
loff_t length, void *data,
 */
sector = iomap_sector(iomap, pos);
if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
-   if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+   if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
goto done;
is_contig = true;
}
@@ -349,7 +349,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
loff_t length, void *data,
ctx->bio->bi_end_io = iomap_read_end_io;
}
 
-   __bio_add_page(ctx->bio, page, plen, poff);
+   bio_add_page(ctx->bio, page, plen, poff);
 done:
/*
 * Move the caller beyond our range so that it keeps making progress.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1f1829e506e8..b9fd44168f61 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -616,12 +616,12 @@ xfs_add_to_ioend(
bdev, sector);
}
 
-   if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+   if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
if (iop)
atomic_inc(&iop->write_count);
if (bio_full(wpc->ioend->io_bio))
xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
-   __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+   bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
 
wpc->ioend->io_size += len;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 089370eb84d9..9f77adcfde82 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -441,7 +441,7 @@ extern int bio_add_page(struct bio *, struct page *, 
unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
   unsigned int, unsigned int);
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-   unsigned int len, unsigned int off);
+   unsigned int len, unsigned int off, bool same_page);
 void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-- 
2.9.5
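
A standalone model of the merge check above (user-space C, illustrative only,
with invented physical addresses): data is appended to the last bvec only if
it starts at the byte immediately following the bvec's current end, and, when
same_page is requested, only if that end byte lies in the same page as the
new data.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Mirrors the test in __bio_try_merge_page(). */
static bool can_merge(unsigned long vec_end_addr, unsigned long page_addr,
		      unsigned long off, bool same_page)
{
	if (vec_end_addr + 1 != page_addr + off)
		return false;
	if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
		return false;
	return true;
}

int main(void)
{
	/* last bvec currently ends at physical byte 0x10fff (end of a page) */
	unsigned long vec_end = 0x10fff;

	printf("%d\n", can_merge(vec_end, 0x11000, 0, false)); /* 1: contiguous */
	printf("%d\n", can_merge(vec_end, 0x11000, 0, true));  /* 0: crosses page */
	printf("%d\n", can_merge(vec_end, 0x20000, 0, false)); /* 0: gap */
	return 0;
}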



[PATCH V15 11/18] block: loop: pass multi-page bvec to iov_iter

2019-02-15 Thread Ming Lei
iov_iter is implemented on top of the bvec iterator helpers, so it is safe to
pass multi-page bvecs to it, and this way is much more efficient than passing
one page per bvec.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 drivers/block/loop.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cf5538942834..8ef583197414 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -511,21 +511,22 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
 loff_t pos, bool rw)
 {
struct iov_iter iter;
+   struct req_iterator rq_iter;
struct bio_vec *bvec;
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct bio *bio = rq->bio;
struct file *file = lo->lo_backing_file;
+   struct bio_vec tmp;
unsigned int offset;
-   int segments = 0;
+   int nr_bvec = 0;
int ret;
 
+   rq_for_each_bvec(tmp, rq, rq_iter)
+   nr_bvec++;
+
if (rq->bio != rq->biotail) {
-   struct req_iterator iter;
-   struct bio_vec tmp;
 
-   __rq_for_each_bio(bio, rq)
-   segments += bio_segments(bio);
-   bvec = kmalloc_array(segments, sizeof(struct bio_vec),
+   bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
 GFP_NOIO);
if (!bvec)
return -EIO;
@@ -534,10 +535,10 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
/*
 * The bios of the request may be started from the middle of
 * the 'bvec' because of bio splitting, so we can't directly
-* copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
+* copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
 * API will take care of all details for us.
 */
-   rq_for_each_segment(tmp, rq, iter) {
+   rq_for_each_bvec(tmp, rq, rq_iter) {
*bvec = tmp;
bvec++;
}
@@ -551,11 +552,10 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
 */
offset = bio->bi_iter.bi_bvec_done;
bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
-   segments = bio_segments(bio);
}
atomic_set(&cmd->ref, 2);
 
-   iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq));
+   iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
iter.iov_offset = offset;
 
cmd->iocb.ki_pos = pos;
-- 
2.9.5



[PATCH V15 10/18] btrfs: use mp_bvec_last_segment to get bio's last page

2019-02-15 Thread Ming Lei
Preparing for supporting multi-page bvec.

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 fs/btrfs/extent_io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dc8ba3ee515d..986ef49b0269 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2697,11 +2697,12 @@ static int __must_check submit_one_bio(struct bio *bio, 
int mirror_num,
 {
blk_status_t ret = 0;
struct bio_vec *bvec = bio_last_bvec_all(bio);
-   struct page *page = bvec->bv_page;
+   struct bio_vec bv;
struct extent_io_tree *tree = bio->bi_private;
u64 start;
 
-   start = page_offset(page) + bvec->bv_offset;
+   mp_bvec_last_segment(bvec, &bv);
+   start = page_offset(bv.bv_page) + bv.bv_offset;
 
bio->bi_private = NULL;
 
-- 
2.9.5



[PATCH V15 08/18] block: introduce mp_bvec_last_segment()

2019-02-15 Thread Ming Lei
BTRFS and guard_bio_eod() need to get the last single-page segment
from one multi-page bvec, so introduce this helper to make them happy.

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 0ae729b1c9fe..21f76bad7be2 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -131,4 +131,26 @@ static inline bool bvec_iter_advance(const struct bio_vec 
*bv,
.bi_bvec_done   = 0,\
 }
 
+/*
+ * Get the last single-page segment from the multi-page bvec and store it
+ * in @seg
+ */
+static inline void mp_bvec_last_segment(const struct bio_vec *bvec,
+   struct bio_vec *seg)
+{
+   unsigned total = bvec->bv_offset + bvec->bv_len;
+   unsigned last_page = (total - 1) / PAGE_SIZE;
+
+   seg->bv_page = nth_page(bvec->bv_page, last_page);
+
+   /* the whole segment is inside the last page */
+   if (bvec->bv_offset >= last_page * PAGE_SIZE) {
+   seg->bv_offset = bvec->bv_offset % PAGE_SIZE;
+   seg->bv_len = bvec->bv_len;
+   } else {
+   seg->bv_offset = 0;
+   seg->bv_len = total - last_page * PAGE_SIZE;
+   }
+}
+
 #endif /* __LINUX_BVEC_ITER_H */
-- 
2.9.5
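
A standalone model of the arithmetic in mp_bvec_last_segment() (user-space C,
illustrative only): for a bvec with bv_offset = 512 and bv_len = 8192 on 4 KiB
pages, total = 8704, the last page index is (8704 - 1) / 4096 = 2, and the
last single-page segment is 512 bytes at offset 0 of that third page.

#include <stdio.h>

#define PAGE_SIZE 4096u

/* Mirrors mp_bvec_last_segment(); pages are represented by their index. */
static void last_segment(unsigned bv_offset, unsigned bv_len,
			 unsigned *pg_idx, unsigned *off, unsigned *len)
{
	unsigned total = bv_offset + bv_len;
	unsigned last_page = (total - 1) / PAGE_SIZE;

	*pg_idx = last_page;
	if (bv_offset >= last_page * PAGE_SIZE) {
		/* the whole bvec sits inside the last page */
		*off = bv_offset % PAGE_SIZE;
		*len = bv_len;
	} else {
		*off = 0;
		*len = total - last_page * PAGE_SIZE;
	}
}

int main(void)
{
	unsigned pg, off, len;

	last_segment(512, 8192, &pg, &off, &len);
	printf("page +%u, offset %u, len %u\n", pg, off, len);
	/* prints: page +2, offset 0, len 512 */
	return 0;
}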



[PATCH V15 07/18] block: use bio_for_each_bvec() to map sg

2019-02-15 Thread Ming Lei
It is more efficient to use bio_for_each_bvec() to map sg; meanwhile we have
to consider splitting the multi-page bvec as done in blk_bio_segment_split().

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 70 +++
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4ef56b2d2aa5..1912499b08b7 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -464,6 +464,54 @@ static int blk_phys_contig_segment(struct request_queue 
*q, struct bio *bio,
return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
 }
 
+static struct scatterlist *blk_next_sg(struct scatterlist **sg,
+   struct scatterlist *sglist)
+{
+   if (!*sg)
+   return sglist;
+
+   /*
+* If the driver previously mapped a shorter list, we could see a
+* termination bit prematurely unless it fully inits the sg table
+* on each mapping. We KNOW that there must be more entries here
+* or the driver would be buggy, so force clear the termination bit
+* to avoid doing a full sg_init_table() in drivers for each command.
+*/
+   sg_unmark_end(*sg);
+   return sg_next(*sg);
+}
+
+static unsigned blk_bvec_map_sg(struct request_queue *q,
+   struct bio_vec *bvec, struct scatterlist *sglist,
+   struct scatterlist **sg)
+{
+   unsigned nbytes = bvec->bv_len;
+   unsigned nsegs = 0, total = 0, offset = 0;
+
+   while (nbytes > 0) {
+   unsigned seg_size;
+   struct page *pg;
+   unsigned idx;
+
+   *sg = blk_next_sg(sg, sglist);
+
+   seg_size = get_max_segment_size(q, bvec->bv_offset + total);
+   seg_size = min(nbytes, seg_size);
+
+   offset = (total + bvec->bv_offset) % PAGE_SIZE;
+   idx = (total + bvec->bv_offset) / PAGE_SIZE;
+   pg = nth_page(bvec->bv_page, idx);
+
+   sg_set_page(*sg, pg, seg_size, offset);
+
+   total += seg_size;
+   nbytes -= seg_size;
+   nsegs++;
+   }
+
+   return nsegs;
+}
+
 static inline void
 __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 struct scatterlist *sglist, struct bio_vec *bvprv,
@@ -481,25 +529,7 @@ __blk_segment_map_sg(struct request_queue *q, struct 
bio_vec *bvec,
(*sg)->length += nbytes;
} else {
 new_segment:
-   if (!*sg)
-   *sg = sglist;
-   else {
-   /*
-* If the driver previously mapped a shorter
-* list, we could see a termination bit
-* prematurely unless it fully inits the sg
-* table on each mapping. We KNOW that there
-* must be more entries here or the driver
-* would be buggy, so force clear the
-* termination bit to avoid doing a full
-* sg_init_table() in drivers for each command.
-*/
-   sg_unmark_end(*sg);
-   *sg = sg_next(*sg);
-   }
-
-   sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
-   (*nsegs)++;
+   (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
}
*bvprv = *bvec;
 }
@@ -521,7 +551,7 @@ static int __blk_bios_map_sg(struct request_queue *q, 
struct bio *bio,
int nsegs = 0;
 
for_each_bio(bio)
-   bio_for_each_segment(bvec, bio, iter)
+   bio_for_each_bvec(bvec, bio, iter)
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
 &nsegs);
 
-- 
2.9.5
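
A standalone model of the inner loop of blk_bvec_map_sg() (user-space C,
illustrative only; it ignores the segment-boundary mask and just applies a
hypothetical 64 KiB queue_max_segment_size): a single 256 KiB multi-page bvec
is emitted as four scatterlist entries.

#include <stdio.h>

#define MAX_SEG_SIZE (64u * 1024)	/* stands in for queue_max_segment_size(q) */

int main(void)
{
	unsigned nbytes = 256u * 1024;	/* bv_len of one multi-page bvec */
	unsigned nsegs = 0;

	while (nbytes > 0) {
		unsigned seg = nbytes < MAX_SEG_SIZE ? nbytes : MAX_SEG_SIZE;

		printf("sg[%u]: %u bytes\n", nsegs, seg);
		nbytes -= seg;
		nsegs++;
	}
	printf("total: %u sg entries\n", nsegs);	/* 4 */
	return 0;
}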



[PATCH V15 06/18] block: use bio_for_each_bvec() to compute multi-page bvec count

2019-02-15 Thread Ming Lei
First, it is more efficient to use bio_for_each_bvec() in both
blk_bio_segment_split() and __blk_recalc_rq_segments() to compute how
many multi-page bvecs there are in the bio.

Secondly, once bio_for_each_bvec() is used, the bvec may need to be
split because its length can be much longer than the max segment size,
so we have to split the big bvec into several segments.

Thirdly, when splitting the multi-page bvec into segments, the max segment
limit may be reached, so the bio split needs to be considered under
this situation too.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 103 +++---
 1 file changed, 83 insertions(+), 20 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index f85d878f313d..4ef56b2d2aa5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -161,6 +161,73 @@ static inline unsigned get_max_io_size(struct 
request_queue *q,
return sectors;
 }
 
+static unsigned get_max_segment_size(struct request_queue *q,
+unsigned offset)
+{
+   unsigned long mask = queue_segment_boundary(q);
+
+   /* default segment boundary mask means no boundary limit */
+   if (mask == BLK_SEG_BOUNDARY_MASK)
+   return queue_max_segment_size(q);
+
+   return min_t(unsigned long, mask - (mask & offset) + 1,
+queue_max_segment_size(q));
+}
+
+/*
+ * Split the bvec @bv into segments, and update all kinds of
+ * variables.
+ */
+static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
+   unsigned *nsegs, unsigned *last_seg_size,
+   unsigned *front_seg_size, unsigned *sectors)
+{
+   unsigned len = bv->bv_len;
+   unsigned total_len = 0;
+   unsigned new_nsegs = 0, seg_size = 0;
+
+   /*
+* Multi-page bvec may be too big to hold in one segment, so the
+* current bvec has to be splitted as multiple segments.
+*/
+   while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
+   seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
+   seg_size = min(seg_size, len);
+
+   new_nsegs++;
+   total_len += seg_size;
+   len -= seg_size;
+
+   if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+   break;
+   }
+
+   if (!new_nsegs)
+   return !!len;
+
+   /* update front segment size */
+   if (!*nsegs) {
+   unsigned first_seg_size;
+
+   if (new_nsegs == 1)
+   first_seg_size = get_max_segment_size(q, bv->bv_offset);
+   else
+   first_seg_size = queue_max_segment_size(q);
+
+   if (*front_seg_size < first_seg_size)
+   *front_seg_size = first_seg_size;
+   }
+
+   /* update other varibles */
+   *last_seg_size = seg_size;
+   *nsegs += new_nsegs;
+   if (sectors)
+   *sectors += total_len >> 9;
+
+   /* split in the middle of the bvec if len != 0 */
+   return !!len;
+}
+
 static struct bio *blk_bio_segment_split(struct request_queue *q,
 struct bio *bio,
 struct bio_set *bs,
@@ -174,7 +241,7 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio);
 
-   bio_for_each_segment(bv, bio, iter) {
+   bio_for_each_bvec(bv, bio, iter) {
/*
 * If the queue doesn't support SG gaps and adding this
 * offset would create a gap, disallow it.
@@ -189,8 +256,12 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
 */
if (nsegs < queue_max_segments(q) &&
sectors < max_sectors) {
-   nsegs++;
-   sectors = max_sectors;
+   /* split in the middle of bvec */
+   bv.bv_len = (max_sectors - sectors) << 9;
+   bvec_split_segs(q, &bv, &nsegs,
+   &seg_size,
+   &front_seg_size,
+   &sectors);
}
goto split;
}
@@ -212,14 +283,12 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
if (nsegs == queue_max_segments(q))
goto split;
 
-   if (nsegs == 1 && seg_size > front_seg_size)
-   front_seg_size = seg_size;
-
-   nsegs++;
 

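For the curious, a standalone model of get_max_segment_size() from this patch
(user-space C, illustrative only, with hypothetical queue limits; it omits the
no-boundary special case): with a 64 KiB segment-boundary mask and a 128 KiB
max segment size, a bvec starting 12 KiB into a boundary window can contribute
at most 52 KiB to its first segment before bvec_split_segs() has to start a
new one.

#include <stdio.h>

#define SEG_BOUNDARY_MASK 0xffffUL		/* hypothetical 64 KiB boundary */
#define MAX_SEG_SIZE	  (128UL * 1024)	/* hypothetical limit */

/* Mirrors get_max_segment_size(q, offset). */
static unsigned long max_seg(unsigned long offset)
{
	unsigned long room = SEG_BOUNDARY_MASK - (SEG_BOUNDARY_MASK & offset) + 1;

	return room < MAX_SEG_SIZE ? room : MAX_SEG_SIZE;
}

int main(void)
{
	printf("%lu\n", max_seg(0x3000));	/* 53248: 52 KiB left in the window */
	printf("%lu\n", max_seg(0x0));		/* 65536: a full 64 KiB window */
	return 0;
}
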
[PATCH V15 05/18] block: introduce bio_for_each_bvec() and rq_for_each_bvec()

2019-02-15 Thread Ming Lei
bio_for_each_bvec() is used for iterating over multi-page bvecs in the bio
split & merge code.

rq_for_each_bvec() can be used by drivers which handle the multi-page bvec
directly; so far the loop driver is one perfect use case.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/bio.h| 10 ++
 include/linux/blkdev.h |  4 
 2 files changed, 14 insertions(+)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 72b4f7be2106..7ef8a7505c0a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -156,6 +156,16 @@ static inline void bio_advance_iter(struct bio *bio, 
struct bvec_iter *iter,
 #define bio_for_each_segment(bvl, bio, iter)   \
__bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter)
 
+#define __bio_for_each_bvec(bvl, bio, iter, start) \
+   for (iter = (start);\
+(iter).bi_size &&  \
+   ((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \
+bio_advance_iter((bio), &(iter), (bvl).bv_len))
+
+/* iterate over multi-page bvec */
+#define bio_for_each_bvec(bvl, bio, iter)  \
+   __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter)
+
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
 static inline unsigned bio_segments(struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3603270cb82d..b6292d469ea4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -792,6 +792,10 @@ struct req_iterator {
__rq_for_each_bio(_iter.bio, _rq)   \
bio_for_each_segment(bvl, _iter.bio, _iter.iter)
 
+#define rq_for_each_bvec(bvl, _rq, _iter)  \
+   __rq_for_each_bio(_iter.bio, _rq)   \
+   bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
+
 #define rq_iter_last(bvec, _iter)  \
(_iter.bio->bi_next == NULL &&  \
 bio_iter_last(bvec, _iter.iter))
-- 
2.9.5
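
A minimal driver-style sketch of the new request-level iterator (not from the
series; my_inspect_rq() is an invented name). Each iteration yields a full
multi-page bvec, so a driver that can handle large physically contiguous
chunks sees far fewer iterations than with rq_for_each_segment().

#include <linux/blkdev.h>

static void my_inspect_rq(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec bvec;
	unsigned int bytes = 0, nr_bvecs = 0;

	rq_for_each_bvec(bvec, rq, iter) {
		bytes += bvec.bv_len;	/* this bvec may span several pages */
		nr_bvecs++;
	}

	pr_info("rq: %u bytes in %u multi-page bvecs\n", bytes, nr_bvecs);
}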



[PATCH V15 03/18] block: remove bvec_iter_rewind()

2019-02-15 Thread Ming Lei
Commit 7759eb23fd980 ("block: remove bio_rewind_iter()") removes
bio_rewind_iter(), then no one uses bvec_iter_rewind() any more,
so remove it.

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 24 
 1 file changed, 24 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 02c73c6aa805..ba0ae40e77c9 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -92,30 +92,6 @@ static inline bool bvec_iter_advance(const struct bio_vec 
*bv,
return true;
 }
 
-static inline bool bvec_iter_rewind(const struct bio_vec *bv,
-struct bvec_iter *iter,
-unsigned int bytes)
-{
-   while (bytes) {
-   unsigned len = min(bytes, iter->bi_bvec_done);
-
-   if (iter->bi_bvec_done == 0) {
-   if (WARN_ONCE(iter->bi_idx == 0,
- "Attempted to rewind iter beyond "
- "bvec's boundaries\n")) {
-   return false;
-   }
-   iter->bi_idx--;
-   iter->bi_bvec_done = __bvec_iter_bvec(bv, *iter)->bv_len;
-   continue;
-   }
-   bytes -= len;
-   iter->bi_size += len;
-   iter->bi_bvec_done -= len;
-   }
-   return true;
-}
-
 #define for_each_bvec(bvl, bio_vec, iter, start)   \
for (iter = (start);\
 (iter).bi_size &&  \
-- 
2.9.5



[PATCH V15 04/18] block: introduce multi-page bvec helpers

2019-02-15 Thread Ming Lei
This patch introduces helpers of 'mp_bvec_iter_*' for multi-page bvec
support.

The introduced helpers treat one bvec as a real multi-page segment,
which may include more than one page.

The existing bvec_iter_* helpers are interfaces for supporting the current
bvec iterator, which is treated as single-page by drivers, fs, dm and so
on. These helpers keep building single-page bvecs in flight, so this way
won't break current bio/bvec users, which need no change.

Follows some multi-page bvec background:

- bvecs stored in bio->bi_io_vec is always multi-page style

- bvec(struct bio_vec) represents one physically contiguous I/O
  buffer, now the buffer may include more than one page after
  multi-page bvec is supported, and all these pages represented
  by one bvec is physically contiguous. Before multi-page bvec
  support, at most one page is included in one bvec, we call it
  single-page bvec.

- .bv_page of the bvec points to the 1st page in the multi-page bvec

- .bv_offset of the bvec is the offset of the buffer in the bvec

The effect on the current drivers/filesystem/dm/bcache/...:

- almost everyone supposes that one bvec only includes one single
  page, so we keep the sp interface not changed, for example,
  bio_for_each_segment() still returns single-page bvec

- bio_for_each_segment_all() will return single-page bvec too

- during iterating, iterator variable(struct bvec_iter) is always
  updated in multi-page bvec style, and bvec_iter_advance() is kept
  not changed

- returned(copied) single-page bvec is built in flight by bvec
  helpers from the stored multi-page bvec

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 30 +++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index ba0ae40e77c9..0ae729b1c9fe 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * was unsigned short, but we might as well be ready for > 64kB I/O pages
@@ -50,16 +51,39 @@ struct bvec_iter {
  */
 #define __bvec_iter_bvec(bvec, iter)   (&(bvec)[(iter).bi_idx])
 
-#define bvec_iter_page(bvec, iter) \
+/* multi-page (mp_bvec) helpers */
+#define mp_bvec_iter_page(bvec, iter)  \
(__bvec_iter_bvec((bvec), (iter))->bv_page)
 
-#define bvec_iter_len(bvec, iter)  \
+#define mp_bvec_iter_len(bvec, iter)   \
min((iter).bi_size, \
__bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
 
-#define bvec_iter_offset(bvec, iter)   \
+#define mp_bvec_iter_offset(bvec, iter)\
(__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
 
+#define mp_bvec_iter_page_idx(bvec, iter)  \
+   (mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)
+
+#define mp_bvec_iter_bvec(bvec, iter)  \
+((struct bio_vec) {\
+   .bv_page= mp_bvec_iter_page((bvec), (iter)),\
+   .bv_len = mp_bvec_iter_len((bvec), (iter)), \
+   .bv_offset  = mp_bvec_iter_offset((bvec), (iter)),  \
+})
+
+/* For building single-page bvec in flight */
+ #define bvec_iter_offset(bvec, iter)  \
+   (mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)
+
+#define bvec_iter_len(bvec, iter)  \
+   min_t(unsigned, mp_bvec_iter_len((bvec), (iter)),   \
+ PAGE_SIZE - bvec_iter_offset((bvec), (iter)))
+
+#define bvec_iter_page(bvec, iter) \
+   nth_page(mp_bvec_iter_page((bvec), (iter)), \
+mp_bvec_iter_page_idx((bvec), (iter)))
+
 #define bvec_iter_bvec(bvec, iter) \
 ((struct bio_vec) {\
.bv_page= bvec_iter_page((bvec), (iter)),   \
-- 
2.9.5
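
To make the split between the two helper families concrete, consider a single
stored bvec of { bv_offset = 1024, bv_len = 12288 } on 4 KiB pages, with the
iterator at bi_bvec_done = 0 and bi_size large enough to cover it (numbers
invented for illustration):

    mp_bvec_iter_len()    = 12288                     (the whole multi-page chunk)
    mp_bvec_iter_offset() = 1024
    bvec_iter_len()       = min(12288, 4096 - 1024) = 3072   (rest of page 0)
    bvec_iter_page()      = nth_page(bv_page, 1024 / 4096) = bv_page

After advancing the iterator by those 3072 bytes (bi_bvec_done = 3072):

    mp_bvec_iter_offset() = 1024 + 3072 = 4096
    bvec_iter_offset()    = 4096 % 4096 = 0
    bvec_iter_page()      = nth_page(bv_page, 1)              (second page)
    bvec_iter_len()       = min(9216, 4096 - 0) = 4096

So the single-page helpers keep handing out page-sized pieces of the same
stored multi-page bvec, which is exactly what keeps existing users unchanged.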



[PATCH V15 02/18] block: don't use bio->bi_vcnt to figure out segment number

2019-02-15 Thread Ming Lei
It is wrong to use bio->bi_vcnt to figure out how many segments
there are in the bio even though the CLONED flag isn't set on this bio,
because this bio may have been split or advanced.

So always use bio_segments() in blk_recount_segments(). It shouldn't
cause any performance loss now, because the physical segment number is figured
out in blk_queue_split() and BIO_SEG_VALID is set there as well, since
bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting").

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Fixes: 76d8137a3113 ("blk-merge: recaculate segment if it isn't less than max segments")
Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 71e9ac03f621..f85d878f313d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -367,13 +367,7 @@ void blk_recalc_rq_segments(struct request *rq)
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-   unsigned short seg_cnt;
-
-   /* estimate segment number by bi_vcnt for non-cloned bio */
-   if (bio_flagged(bio, BIO_CLONED))
-   seg_cnt = bio_segments(bio);
-   else
-   seg_cnt = bio->bi_vcnt;
+   unsigned short seg_cnt = bio_segments(bio);
 
if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
(seg_cnt < queue_max_segments(q)))
-- 
2.9.5
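
A short sketch of the difference (illustrative only, not from the patch, and
assuming the bio covers more than one page): bi_vcnt counts entries in the
bvec table and never shrinks, while bio_segments() walks bi_iter, which is
what splitting and advancing actually update.

#include <linux/bio.h>

static void my_show_counts(struct bio *bio)
{
	unsigned int vcnt = bio->bi_vcnt;

	pr_info("before: bi_vcnt=%u bio_segments=%u\n", vcnt, bio_segments(bio));

	/* consume the first page, as a split/partial completion would */
	bio_advance(bio, PAGE_SIZE);

	pr_info("after:  bi_vcnt=%u bio_segments=%u\n",
		(unsigned int)bio->bi_vcnt, bio_segments(bio));
	/* bi_vcnt is unchanged; bio_segments() is now smaller */
}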



[PATCH V15 01/18] btrfs: look at bi_size for repair decisions

2019-02-15 Thread Ming Lei
From: Christoph Hellwig 

bio_readpage_error currently uses bi_vcnt to decide if it is worth
retrying an I/O.  But the vector count is mostly an implementation
artifact - it really should figure out if there is more than a
single sector worth retrying.  Use bi_size for that and shift by
PAGE_SHIFT.  This really should be blocks/sectors, but given that
btrfs doesn't support a sector size different from the PAGE_SIZE
using the page size keeps the changes to a minimum.

Reviewed-by: Omar Sandoval 
Reviewed-by: David Sterba 
Signed-off-by: Christoph Hellwig 
---
 fs/btrfs/extent_io.c | 2 +-
 include/linux/bio.h  | 6 --
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 52abe4082680..dc8ba3ee515d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2350,7 +2350,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 
phy_offset,
int read_mode = 0;
blk_status_t status;
int ret;
-   unsigned failed_bio_pages = bio_pages_all(failed_bio);
+   unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
 
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7380b094dcca..72b4f7be2106 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -263,12 +263,6 @@ static inline void bio_get_last_bvec(struct bio *bio, 
struct bio_vec *bv)
bv->bv_len = iter.bi_bvec_done;
 }
 
-static inline unsigned bio_pages_all(struct bio *bio)
-{
-   WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-   return bio->bi_vcnt;
-}
-
 static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
 {
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-- 
2.9.5
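
As a quick sanity check of the new calculation (invented numbers): a failed
read bio covering 16 KiB has bi_iter.bi_size = 16384, so with 4 KiB pages
failed_bio_pages = 16384 >> PAGE_SHIFT = 4, regardless of whether that data
is stored as four single-page bvecs or as one multi-page bvec; this is
exactly why bi_vcnt stops being a reliable proxy once multi-page bvecs exist.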



[PATCH V15 00/18] block: support multi-page bvec

2019-02-15 Thread Ming Lei
t change

V10:
- no any code change, just add more guys and list into patch's CC list,
as suggested by Christoph and Dave Chinner
V9:
- fix regression on iomap's sub-pagesize io vec, covered by patch 13
V8:
- remove prepare patches which all are merged to linus tree
- rebase on for-4.21/block
- address comments on V7
- add patches of killing NO_SG_MERGE

V7:
- include Christoph and Mike's bio_clone_bioset() patches, which is
  actually prepare patches for multipage bvec
- address Christoph's comments

V6:
- avoid to introduce lots of renaming, follow Jen's suggestion of
using the name of chunk for multipage io vector
- include Christoph's three prepare patches
- decrease stack usage for using bio_for_each_chunk_segment_all()
- address Kent's comment

V5:
- remove some of prepare patches, which have been merged already
- add bio_clone_seg_bioset() to fix DM's bio clone, which
is introduced by 18a25da84354c6b (dm: ensure bio submission follows
a depth-first tree walk)
- rebase on the latest block for-v4.18

V4:
- rename bio_for_each_segment*() as bio_for_each_page*(), rename
bio_segments() as bio_pages(), rename rq_for_each_segment() as
rq_for_each_pages(), because these helpers never return real
segment, and they always return single page bvec

- introducing segment_for_each_page_all()

- introduce new bio_for_each_segment*()/rq_for_each_segment()/bio_segments()
for returning real multipage segment

- rewrite segment_last_page()

- rename bvec iterator helper as suggested by Christoph

- replace comment with applying bio helpers as suggested by Christoph

- document usage of bio iterator helpers

- redefine BIO_MAX_PAGES as 256 to make the biggest bvec table
accommodated in 4K page

- move bio_alloc_pages() into bcache as suggested by Christoph

V3:
- rebase on v4.13-rc3 with for-next of block tree
- run more xfstests: xfs/ext4 over NVMe, Sata, DM(linear),
MD(raid1), and not see regressions triggered
- add Reviewed-by on some btrfs patches
- remove two MD patches because both are merged to linus tree
  already

V2:
- bvec table direct access in raid has been cleaned, so NO_MP
flag is dropped
- rebase on recent Neil Brown's change on bio and bounce code
- reorganize the patchset

V1:
- against v4.10-rc1 and some cleanup in V0 are in -linus already
- handle queue_virt_boundary() in mp bvec change and make NVMe happy
- further BTRFS cleanup
- remove QUEUE_FLAG_SPLIT_MP
- rename for two new helpers of bio_for_each_segment_all()
- fix bounce conversion
- address comments in V0

[1], http://marc.info/?l=linux-kernel&m=141680246629547&w=2
[2], https://patchwork.kernel.org/patch/9451523/
[3], http://marc.info/?t=14773544711&r=1&w=2
[4], http://marc.info/?l=linux-mm&m=147745525801433&w=2
[5], http://marc.info/?t=14956948457&r=1&w=2
[6], http://marc.info/?t=14982021534&r=1&w=2



Christoph Hellwig (1):
  btrfs: look at bi_size for repair decisions

Ming Lei (17):
  block: don't use bio->bi_vcnt to figure out segment number
  block: remove bvec_iter_rewind()
  block: introduce multi-page bvec helpers
  block: introduce bio_for_each_bvec() and rq_for_each_bvec()
  block: use bio_for_each_bvec() to compute multi-page bvec count
  block: use bio_for_each_bvec() to map sg
  block: introduce mp_bvec_last_segment()
  fs/buffer.c: use bvec iterator to truncate the bio
  btrfs: use mp_bvec_last_segment to get bio's last page
  block: loop: pass multi-page bvec to iov_iter
  bcache: avoid to use bio_for_each_segment_all() in
bch_bio_alloc_pages()
  block: allow bio_for_each_segment_all() to iterate over multi-page
bvec
  block: enable multipage bvecs
  block: always define BIO_MAX_PAGES as 256
  block: document usage of bio iterator helpers
  block: kill QUEUE_FLAG_NO_SG_MERGE
  block: kill BLK_MQ_F_SG_MERGE
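
For readers not familiar with the series, here is a rough illustration of
the two iteration levels named in the patch titles above. The helper names
come from the patches; the tiny count_bvecs() function itself is made up
for the example and is not part of the series:

static void count_bvecs(struct bio *bio)
{
        struct bio_vec bv;
        struct bvec_iter iter;
        unsigned int pages = 0, segs = 0;

        /* single-page iteration: each bv covers at most one page */
        bio_for_each_segment(bv, bio, iter)
                pages++;

        /*
         * multi-page iteration: each bv may cover a whole physically
         * contiguous segment spanning several pages
         */
        bio_for_each_bvec(bv, bio, iter)
                segs++;

        pr_info("bio: %u single-page bvecs, %u multi-page bvecs\n",
                pages, segs);
}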

 Documentation/block/biovecs.txt   |  25 +
 block/bio.c   |  49 ++---
 block/blk-merge.c | 210 +-
 block/blk-mq-debugfs.c|   2 -
 block/blk-mq.c|   3 -
 block/bounce.c|   6 +-
 drivers/block/loop.c  |  22 ++--
 drivers/block/nbd.c   |   2 +-
 drivers/block/rbd.c   |   2 +-
 drivers/block/skd_main.c  |   1 -
 drivers/block/xen-blkfront.c  |   2 +-
 drivers/md/bcache/btree.c |   3 +-
 drivers/md/bcache/util.c  |   6 +-
 drivers/md/dm-crypt.c |   3 +-
 drivers/md/dm-rq.c|   2 +-
 drivers

Re: [PATCH] blk-flush: fix inverted scheduler check

2019-02-14 Thread Ming Lei
On Mon, Feb 11, 2019 at 04:11:40PM -0700, Jens Axboe wrote:
> 
> A previous commit inadvertently inverted a check for whether or not we
> have an IO scheduler attached. This is reported to cause a hang with
> particular LVM setups.
> 
> Fixup the condition.
> 
> Reported-by: Dragan Milenkovic 
> Fixes: 344e9ffcbd18 ("block: add queue_is_mq() helper")
> Signed-off-by: Jens Axboe 
> 
> ---
> 
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index 6e0f2d97fc6d..b56f4e5e6359 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -273,7 +273,7 @@ static void blk_kick_flush(struct request_queue *q, 
> struct blk_flush_queue *fq,
>* assigned to empty flushes, and we deadlock if we are expecting
>* other requests to make progress. Don't defer for that case.
>*/
> - if (!list_empty(&fq->flush_data_in_flight) && q->elevator &&
> + if (!list_empty(&fq->flush_data_in_flight) && !q->elevator &&
>       time_before(jiffies,
>   fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
>   return;

Reviewed-by: Ming Lei 

Thanks,
Ming


Re: [PATCH V2] blk-mq: insert rq with DONTPREP to hctx dispatch list when requeue

2019-02-14 Thread Ming Lei
On Fri, Feb 15, 2019 at 10:34:39AM +0800, jianchao.wang wrote:
> Hi Ming
> 
> Thanks for your kindly response.
> 
> On 2/15/19 10:00 AM, Ming Lei wrote:
> > On Tue, Feb 12, 2019 at 09:56:25AM +0800, Jianchao Wang wrote:
> >> When requeue, if RQF_DONTPREP, rq has contained some driver
> >> specific data, so insert it to hctx dispatch list to avoid any
> >> merge. Take scsi as example, here is the trace event log (no
> >> io scheduler, because RQF_STARTED would prevent merging),
> >>
> >>kworker/0:1H-339   [000] ...1  2037.209289: block_rq_insert: 8,0 R 4096 
> >> () 32768 + 8 [kworker/0:1H]
> >> scsi_inert_test-1987  [000]   2037.220465: block_bio_queue: 8,0 R 
> >> 32776 + 8 [scsi_inert_test]
> >> scsi_inert_test-1987  [000] ...2  2037.220466: block_bio_backmerge: 8,0 R 
> >> 32776 + 8 [scsi_inert_test]
> >>kworker/0:1H-339   [000]   2047.220913: block_rq_issue: 8,0 R 8192 
> >> () 32768 + 16 [kworker/0:1H]
> >> scsi_inert_test-1996  [000] ..s1  2047.221007: block_rq_complete: 8,0 R () 
> >> 32768 + 8 [0]
> >> scsi_inert_test-1996  [000] .Ns1  2047.221045: block_rq_requeue: 8,0 R () 
> >> 32776 + 8 [0]
> >>kworker/0:1H-339   [000] ...1  2047.221054: block_rq_insert: 8,0 R 4096 
> >> () 32776 + 8 [kworker/0:1H]
> >>kworker/0:1H-339   [000] ...1  2047.221056: block_rq_issue: 8,0 R 4096 
> >> () 32776 + 8 [kworker/0:1H]
> >> scsi_inert_test-1986  [000] ..s1  2047.221119: block_rq_complete: 8,0 R () 
> >> 32776 + 8 [0]
> >>
> >> (32768 + 8) was requeued by scsi_queue_insert and had RQF_DONTPREP.
> > 
> > scsi_mq_requeue_cmd() does uninit the request before requeuing, but
> > __scsi_queue_insert doesn't do that.
> 
> Yes.
> scsi layer use both of them.
> 
> > 
> > 
> >> Then it was merged with (32776 + 8) and issued. Due to RQF_DONTPREP,
> >> the sdb only contained the part of (32768 + 8), then only that part
> >> was completed. The lucky thing was that scsi_io_completion detected
> >> it and requeued the remaining part. So we didn't get corrupted data.
> >> However, the requeue of (32776 + 8) is not expected.
> >>
> >> Suggested-by: Jens Axboe 
> >> Signed-off-by: Jianchao Wang 
> >> ---
> >> V2:
> >>  - refactor the code based on Jens' suggestion
> >>
> >>  block/blk-mq.c | 12 ++--
> >>  1 file changed, 10 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/block/blk-mq.c b/block/blk-mq.c
> >> index 8f5b533..9437a5e 100644
> >> --- a/block/blk-mq.c
> >> +++ b/block/blk-mq.c
> >> @@ -737,12 +737,20 @@ static void blk_mq_requeue_work(struct work_struct 
> >> *work)
> >>spin_unlock_irq(&q->requeue_lock);
> >>  
> >>list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
> >> -  if (!(rq->rq_flags & RQF_SOFTBARRIER))
> >> +  if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
> >>continue;
> >>  
> >>rq->rq_flags &= ~RQF_SOFTBARRIER;
> >>list_del_init(&rq->queuelist);
> >> -  blk_mq_sched_insert_request(rq, true, false, false);
> >> +  /*
> >> +   * If RQF_DONTPREP, rq has contained some driver specific
> >> +   * data, so insert it to hctx dispatch list to avoid any
> >> +   * merge.
> >> +   */
> >> +  if (rq->rq_flags & RQF_DONTPREP)
> >> +  blk_mq_request_bypass_insert(rq, false);
> >> +  else
> >> +  blk_mq_sched_insert_request(rq, true, false, false);
> >>}
> > 
> > Suppose it is one WRITE request to zone device, this way might break
> > the order.
> 
> I'm not sure about this.
> Since the request is dispatched, it should hold and zone write lock.
> And also mq-deadline doesn't have a .requeue_request, zone write lock
> wouldn't be released during requeue.

You are right, it looks like I misunderstood the zone write lock, sorry
for the noise.

> 
> IMO, this requeue action is similar with what blk_mq_dispatch_rq_list does.
> The latter one also issues the request to underlying driver and requeue rqs
> on dispatch_list if get BLK_STS_SOURCE or BLK_STS_DEV_SOURCE.
> 
> And in addition, RQF_STARTED is set by io scheduler .dispatch_request and
> it could be stop merging as RQF_NOMERGE_FLAGS contains it. 

Yes, that is correct.

Then another question is:

Why not always requeue requests in this way, so that it can be simplified
into one code path? (A rough sketch follows the points below.)

1) in the legacy block code, blk_requeue_request() doesn't insert the
request into the scheduler queue; it simply puts the request into
q->queue_head.

2) blk_mq_requeue_request() is basically run from completion context for
handling very unusual cases (partial completion, error, timeout, ...),
and there shouldn't be any benefit in scheduling/merging a requeued
request.

3) RQF_DONTPREP is like a driver-private flag, and before this patch it
was only read/written by the driver.
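
Just for illustration (this is not a real patch), the requeue loop in such
a single path could look like the following; blk_mq_requeue_all_bypass()
is a made-up name:

static void blk_mq_requeue_all_bypass(struct request_queue *q,
                                      struct list_head *rq_list)
{
        struct request *rq, *next;

        list_for_each_entry_safe(rq, next, rq_list, queuelist) {
                rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
                /* straight to the hctx dispatch list, so no chance of a merge */
                blk_mq_request_bypass_insert(rq, false);
        }

        blk_mq_run_hw_queues(q, false);
}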

Thanks,
Ming


Re: [PATCH AUTOSEL 4.20 49/77] block: cover another queue enter recursion via BIO_QUEUE_ENTERED

2019-02-14 Thread Ming Lei
On Thu, Feb 14, 2019 at 09:08:27PM -0500, Sasha Levin wrote:
> From: Ming Lei 
> 
> [ Upstream commit 698cef173983b086977e633e46476e0f925ca01e ]
> 
> Except for blk_queue_split(), bio_split() is used for splitting bio too,
> then the remained bio is often resubmit to queue via generic_make_request().
> So the same queue enter recursion exits in this case too. Unfortunatley
> commit cd4a4ae4683dc2 doesn't help this case.
> 
> This patch covers the above case by setting BIO_QUEUE_ENTERED before calling
> q->make_request_fn.
> 
> In theory the per-bio flag is used to simulate one stack variable, it is
> just fine to clear it after q->make_request_fn is returned. Especially
> the same bio can't be submitted from another context.
> 
> Fixes: cd4a4ae4683dc2 ("block: don't use blocking queue entered for recursive 
> bio submits")
> Cc: Tetsuo Handa 
> Cc: NeilBrown 
> Reviewed-by:  Mike Snitzer 
> Signed-off-by: Ming Lei 
> Signed-off-by: Jens Axboe 
> Signed-off-by: Sasha Levin 
> ---
>  block/blk-core.c  | 11 +++
>  block/blk-merge.c | 10 --
>  2 files changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index deb56932f8c4..22260339f66f 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2449,7 +2449,18 @@ blk_qc_t generic_make_request(struct bio *bio)
>   /* Create a fresh bio_list for all subordinate requests 
> */
>   bio_list_on_stack[1] = bio_list_on_stack[0];
>   bio_list_init(&bio_list_on_stack[0]);
> +
> + /*
> +  * Since we're recursing into make_request here, ensure
> +  * that we mark this bio as already having entered the 
> queue.
> +  * If not, and the queue is going away, we can get stuck
> +  * forever on waiting for the queue reference to drop. 
> But
> +  * that will never happen, as we're already holding a
> +  * reference to it.
> +  */
> + bio_set_flag(bio, BIO_QUEUE_ENTERED);
>   ret = q->make_request_fn(q, bio);
> + bio_clear_flag(bio, BIO_QUEUE_ENTERED);
>  
>   /* sort new bios into those for a lower level
>* and those for the same level
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 7695034f4b87..adfe835ab258 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -272,16 +272,6 @@ void blk_queue_split(struct request_queue *q, struct bio 
> **bio)
>   /* there isn't chance to merge the splitted bio */
>   split->bi_opf |= REQ_NOMERGE;
>  
> - /*
> -  * Since we're recursing into make_request here, ensure
> -  * that we mark this bio as already having entered the queue.
> -  * If not, and the queue is going away, we can get stuck
> -  * forever on waiting for the queue reference to drop. But
> -  * that will never happen, as we're already holding a
> -  * reference to it.
> -  */
> - bio_set_flag(*bio, BIO_QUEUE_ENTERED);
> -
>   bio_chain(split, *bio);
>   trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
>   generic_make_request(*bio);
> -- 
> 2.19.1
> 

This one should be dropped, given it has been reverted.

-- 
Ming


Re: [PATCH V2] blk-mq: insert rq with DONTPREP to hctx dispatch list when requeue

2019-02-14 Thread Ming Lei
On Tue, Feb 12, 2019 at 09:56:25AM +0800, Jianchao Wang wrote:
> When requeue, if RQF_DONTPREP, rq has contained some driver
> specific data, so insert it to hctx dispatch list to avoid any
> merge. Take scsi as example, here is the trace event log (no
> io scheduler, because RQF_STARTED would prevent merging),
> 
>kworker/0:1H-339   [000] ...1  2037.209289: block_rq_insert: 8,0 R 4096 () 
> 32768 + 8 [kworker/0:1H]
> scsi_inert_test-1987  [000]   2037.220465: block_bio_queue: 8,0 R 32776 + 
> 8 [scsi_inert_test]
> scsi_inert_test-1987  [000] ...2  2037.220466: block_bio_backmerge: 8,0 R 
> 32776 + 8 [scsi_inert_test]
>kworker/0:1H-339   [000]   2047.220913: block_rq_issue: 8,0 R 8192 () 
> 32768 + 16 [kworker/0:1H]
> scsi_inert_test-1996  [000] ..s1  2047.221007: block_rq_complete: 8,0 R () 
> 32768 + 8 [0]
> scsi_inert_test-1996  [000] .Ns1  2047.221045: block_rq_requeue: 8,0 R () 
> 32776 + 8 [0]
>kworker/0:1H-339   [000] ...1  2047.221054: block_rq_insert: 8,0 R 4096 () 
> 32776 + 8 [kworker/0:1H]
>kworker/0:1H-339   [000] ...1  2047.221056: block_rq_issue: 8,0 R 4096 () 
> 32776 + 8 [kworker/0:1H]
> scsi_inert_test-1986  [000] ..s1  2047.221119: block_rq_complete: 8,0 R () 
> 32776 + 8 [0]
> 
> (32768 + 8) was requeued by scsi_queue_insert and had RQF_DONTPREP.

scsi_mq_requeue_cmd() does uninit the request before requeuing, but
__scsi_queue_insert doesn't do that.


> Then it was merged with (32776 + 8) and issued. Due to RQF_DONTPREP,
> the sdb only contained the part of (32768 + 8), then only that part
> was completed. The lucky thing was that scsi_io_completion detected
> it and requeued the remaining part. So we didn't get corrupted data.
> However, the requeue of (32776 + 8) is not expected.
> 
> Suggested-by: Jens Axboe 
> Signed-off-by: Jianchao Wang 
> ---
> V2:
>  - refactor the code based on Jens' suggestion
> 
>  block/blk-mq.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 8f5b533..9437a5e 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -737,12 +737,20 @@ static void blk_mq_requeue_work(struct work_struct 
> *work)
>   spin_unlock_irq(&q->requeue_lock);
>  
>   list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
> - if (!(rq->rq_flags & RQF_SOFTBARRIER))
> + if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
>   continue;
>  
>   rq->rq_flags &= ~RQF_SOFTBARRIER;
>   list_del_init(&rq->queuelist);
> - blk_mq_sched_insert_request(rq, true, false, false);
> + /*
> +  * If RQF_DONTPREP, rq has contained some driver specific
> +  * data, so insert it to hctx dispatch list to avoid any
> +  * merge.
> +  */
> + if (rq->rq_flags & RQF_DONTPREP)
> + blk_mq_request_bypass_insert(rq, false);
> + else
> + blk_mq_sched_insert_request(rq, true, false, false);
>   }

Suppose it is a WRITE request to a zoned device; this way might break
the ordering.


Thanks,
Ming


Re: [patch V5 4/8] nvme-pci: Simplify interrupt allocation

2019-02-14 Thread Ming Lei
On Thu, Feb 14, 2019 at 09:47:59PM +0100, Thomas Gleixner wrote:
> From: Ming Lei 
> 
> The NVME PCI driver contains a tedious mechanism for interrupt
> allocation, which is necessary to adjust the number and size of interrupt
> sets to the maximum available number of interrupts which depends on the
> underlying PCI capabilities and the available CPU resources.
> 
> It works around the former short comings of the PCI and core interrupt
> allocation mechanims in combination with interrupt sets.
> 
> The PCI interrupt allocation function allows to provide a maximum and a
> minimum number of interrupts to be allocated and tries to allocate as
> many as possible. This worked without driver interaction as long as there
> was only a single set of interrupts to handle.
> 
> With the addition of support for multiple interrupt sets in the generic
> affinity spreading logic, which is invoked from the PCI interrupt
> allocation, the adaptive loop in the PCI interrupt allocation did not
> work for multiple interrupt sets. The reason is that depending on the
> total number of interrupts which the PCI allocation adaptive loop tries
> to allocate in each step, the number and the size of the interrupt sets
> need to be adapted as well. Due to the way the interrupt sets support was
> implemented there was no way for the PCI interrupt allocation code or the
> core affinity spreading mechanism to invoke a driver specific function
> for adapting the interrupt sets configuration.
> 
> As a consequence the driver had to implement another adaptive loop around
> the PCI interrupt allocation function and calling that with maximum and
> minimum interrupts set to the same value. This ensured that the
> allocation either succeeded or immediately failed without any attempt to
> adjust the number of interrupts in the PCI code.
> 
> The core code now allows drivers to provide a callback to recalculate the
> number and the size of interrupt sets during PCI interrupt allocation,
> which in turn allows the PCI interrupt allocation function to be called
> in the same way as with a single set of interrupts. The PCI code handles
> the adaptive loop and the interrupt affinity spreading mechanism invokes
> the driver callback to adapt the interrupt set configuration to the
> current loop value. This replaces the adaptive loop in the driver
> completely.
> 
> Implement the NVME specific callback which adjusts the interrupt sets
> configuration and remove the adaptive allocation loop.
> 
> [ tglx: Simplify the callback further and restore the dropped adjustment of
>   number of sets ]
> 
> Signed-off-by: Ming Lei 
> Signed-off-by: Thomas Gleixner 
> 
> ---
>  drivers/nvme/host/pci.c |  108 
> 
>  1 file changed, 28 insertions(+), 80 deletions(-)
> 
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2041,41 +2041,32 @@ static int nvme_setup_host_mem(struct nv
>   return ret;
>  }
>  
> -/* irq_queues covers admin queue */
> -static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int 
> irq_queues)
> +/*
> + * nirqs is the number of interrupts available for write and read
> + * queues. The core already reserved an interrupt for the admin queue.
> + */
> +static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int 
> nrirqs)
>  {
> - unsigned int this_w_queues = write_queues;
> -
> - WARN_ON(!irq_queues);
> -
> - /*
> -  * Setup read/write queue split, assign admin queue one independent
> -  * irq vector if irq_queues is > 1.
> -  */
> - if (irq_queues <= 2) {
> - dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
> - dev->io_queues[HCTX_TYPE_READ] = 0;
> - return;
> - }
> + struct nvme_dev *dev = affd->priv;
> + unsigned int nr_read_queues;
>  
>   /*
> -  * If 'write_queues' is set, ensure it leaves room for at least
> -  * one read queue and one admin queue
> -  */
> - if (this_w_queues >= irq_queues)
> - this_w_queues = irq_queues - 2;
> -
> - /*
> -  * If 'write_queues' is set to zero, reads and writes will share
> -  * a queue set.
> -  */
> - if (!this_w_queues) {
> - dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues - 1;
> - dev->io_queues[HCTX_TYPE_READ] = 0;
> - } else {
> - dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
> - dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues - 1;
> - }
> +  * If only one interrupt is available, combine write and read
> +  * queues. If 'write_queues' is

Re: blktests nvme/012 triggering LOCKDEP warning

2019-02-14 Thread Ming Lei
On Thu, Feb 14, 2019 at 1:36 AM Bart Van Assche  wrote:
>
> On Wed, 2019-02-13 at 12:27 +0800, Ming Lei wrote:
> > On Wed, Feb 13, 2019 at 12:33 AM Theodore Y. Ts'o  wrote:
> > >
> > > Is this a known issue?  nvme/012 is triggering the following lockdep 
> > > warning:
> > >
> > > Thanks,
> > >
> > > - Ted
> > >
> > > [ 1964.751910] run blktests nvme/012 at 2019-02-11 20:58:31
> > > [ 1964.977624] nvmet: adding nsid 1 to subsystem blktests-subsystem-1
> > > [ 1965.006395] nvmet: creating controller 1 for subsystem 
> > > blktests-subsystem-1 for NQN 
> > > nqn.2014-08.org.nvmexpress:uuid:8a58b187-6d09-4c5d-bc03-593896d2d80d.
> > > [ 1965.011811] nvme nvme0: ANA group 1: optimized.
> > > [ 1965.011899] nvme nvme0: creating 2 I/O queues.
> > > [ 1965.013966] nvme nvme0: new ctrl: "blktests-subsystem-1"
> > >
> > > [ 1965.282478] 
> > > [ 1965.287922] WARNING: possible recursive locking detected
> > > [ 1965.293364] 5.0.0-rc3-xfstests-00015-g1236f7d60242 #841 Not tainted
> > > [ 1965.299762] 
> > > [ 1965.305207] ksoftirqd/1/16 is trying to acquire lock:
> > > [ 1965.310389] 0282032e (&(&fq->mq_flush_lock)->rlock){..-.}, at: 
> > > flush_end_io+0x4e/0x1d0
> > > [ 1965.319146]
> > >but task is already holding lock:
> > > [ 1965.325106] cbadcbc2 (&(&fq->mq_flush_lock)->rlock){..-.}, at: 
> > > flush_end_io+0x4e/0x1d0
> > > [ 1965.333957]
> > >other info that might help us debug this:
> > > [ 1965.340615]  Possible unsafe locking scenario:
> > >
> > > [ 1965.346664]CPU0
> > > [ 1965.349248]
> > > [ 1965.351820]   lock(&(&fq->mq_flush_lock)->rlock);
> > > [ 1965.356654]   lock(&(&fq->mq_flush_lock)->rlock);
> > > [ 1965.361490]
> > > *** DEADLOCK ***
> > >
> > > [ 1965.367541]  May be due to missing lock nesting notation
> > >
> > > [ 1965.374636] 1 lock held by ksoftirqd/1/16:
> > > [ 1965.378890]  #0: cbadcbc2 
> > > (&(&fq->mq_flush_lock)->rlock){..-.}, at: flush_end_io+0x4e/0x1d0
> > > [ 1965.388080]
> > >stack backtrace:
> > > [ 1965.392570] CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 
> > > 5.0.0-rc3-xfstests-00015-g1236f7d60242 #841
> > > [ 1965.402002] Hardware name: Google Google Compute Engine/Google Compute 
> > > Engine, BIOS Google 01/01/2011
> > > [ 1965.411411] Call Trace:
> > > [ 1965.413996]  dump_stack+0x67/0x90
> > > [ 1965.417433]  __lock_acquire.cold.45+0x2b4/0x313
> > > [ 1965.422194]  lock_acquire+0x98/0x160
> > > [ 1965.425894]  ? flush_end_io+0x4e/0x1d0
> > > [ 1965.429817]  _raw_spin_lock_irqsave+0x3b/0x80
> > > [ 1965.434299]  ? flush_end_io+0x4e/0x1d0
> > > [ 1965.438162]  flush_end_io+0x4e/0x1d0
> > > [ 1965.441909]  blk_mq_complete_request+0x76/0x110
> > > [ 1965.446580]  nvmet_req_complete+0x15/0x110 [nvmet]
> > > [ 1965.452098]  nvmet_bio_done+0x27/0x50 [nvmet]
> > > [ 1965.456634]  blk_update_request+0xd7/0x2d0
> > > [ 1965.460869]  blk_mq_end_request+0x1a/0x100
> > > [ 1965.465091]  blk_flush_complete_seq+0xe5/0x350
> > > [ 1965.469660]  flush_end_io+0x12f/0x1d0
> > > [ 1965.473436]  blk_done_softirq+0x9f/0xd0
> > > [ 1965.477398]  __do_softirq+0xca/0x440
> > > [ 1965.481092]  ? smpboot_thread_fn+0x2f/0x1e0
> > > [ 1965.485512]  ? smpboot_thread_fn+0x74/0x1e0
> > > [ 1965.489813]  ? smpboot_thread_fn+0x118/0x1e0
> > > [ 1965.494379]  run_ksoftirqd+0x24/0x50
> > > [ 1965.498081]  smpboot_thread_fn+0x113/0x1e0
> > > [ 1965.502399]  ? sort_range+0x20/0x20
> > > [ 1965.506008]  kthread+0x121/0x140
> > > [ 1965.509395]  ? kthread_park+0x80/0x80
> > > [ 1965.513290]  ret_from_fork+0x3a/0x50
> > > [ 1965.527032] XFS (nvme0n1): Mounting V5 Filesystem
> > > [ 1965.541778] XFS (nvme0n1): Ending clean mount
> > > [ 2064.142830] XFS (nvme0n1): Unmounting Filesystem
> > > [ 2064.171432] nvme nvme0: Removing ctrl: NQN "blktests-subsystem-1"
> >
> > That is a false positive.
> >
> > It is caused by calling host request's completion handler from target
> > IO's completion
> > handler di

[PATCH V4 4/4] PCI: Document .calc_sets as required in case of multiple interrupt sets

2019-02-14 Thread Ming Lei
NVMe now implements the .calc_sets callback for calculating the size of
each interrupt set.

For other cases of multiple IRQ sets, pre-calculating each set's size
before allocating the IRQ vectors can't work either, because the number
of interrupts available to the device is unknown at that time.

So explicitly document .calc_sets as required for multiple interrupt sets.
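
As an illustration only (a hypothetical foo driver, not part of this
series), a user of multiple interrupt sets is now expected to look
roughly like this:

static void foo_calc_sets(struct irq_affinity *affd, int nvecs)
{
        /* purely illustrative split of the available vectors into two sets */
        affd->nr_sets = 2;
        affd->set_size[0] = nvecs / 2;
        affd->set_size[1] = nvecs - nvecs / 2;
}

static int foo_setup_irqs(struct pci_dev *pdev, unsigned int max_vecs)
{
        struct irq_affinity affd = {
                .calc_sets = foo_calc_sets,     /* required: more than one set */
        };

        /* min_vecs != max_vecs is fine now, no driver side retry loop */
        return pci_alloc_irq_vectors_affinity(pdev, 1, max_vecs,
                        PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}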

Reviewed-by: Jens Axboe 
Acked-by: Bjorn Helgaas 
Signed-off-by: Ming Lei 
---
 drivers/pci/msi.c | 14 --
 include/linux/interrupt.h |  3 ++-
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 96978459e2a0..64383ab5f53f 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1036,10 +1036,11 @@ static int __pci_enable_msi_range(struct pci_dev *dev, 
int minvec, int maxvec,
return -ERANGE;
 
/*
-* If the caller is passing in sets, we can't support a range of
-* vectors. The caller needs to handle that.
+* If the caller requests multiple sets of IRQs where each set
+* requires different affinity, it must also supply a ->calc_sets()
+* callback to compute size of each interrupt set
 */
-   if (affd && affd->nr_sets && minvec != maxvec)
+   if (affd && affd->nr_sets > 1 && !affd->calc_sets)
return -EINVAL;
 
if (WARN_ON_ONCE(dev->msi_enabled))
@@ -1094,10 +1095,11 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
return -ERANGE;
 
/*
-* If the caller is passing in sets, we can't support a range of
-* supported vectors. The caller needs to handle that.
+* If the caller requests multiple sets of IRQs where each set
+* requires different affinity, it must also supply a ->calc_sets()
+* callback to compute size of each interrupt set
 */
-   if (affd && affd->nr_sets && minvec != maxvec)
+   if (affd && affd->nr_sets > 1 && !affd->calc_sets)
return -EINVAL;
 
if (WARN_ON_ONCE(dev->msix_enabled))
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index acb9c5ad6bc1..0da2a5cea8f3 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -269,7 +269,8 @@ struct irq_affinity_notify {
  * the MSI(-X) vector space
  * @nr_sets:   Length of passed in *sets array
  * @set_size:  Number of affinitized sets
- * @calc_sets: Callback for caculating interrupt sets size
+ * @calc_sets: Callback for caculating set size, required for
+ * multiple interrupt sets.
  * @priv:  Private data of @calc_sets
  */
 struct irq_affinity {
-- 
2.9.5



[PATCH V4 3/4] nvme-pci: Simplify interrupt allocation

2019-02-14 Thread Ming Lei
The NVME PCI driver contains a tedious mechanism for interrupt
allocation, which is necessary to adjust the number and size of interrupt
sets to the maximum available number of interrupts which depends on the
underlying PCI capabilities and the available CPU resources.

It works around the former shortcomings of the PCI and core interrupt
allocation mechanisms in combination with interrupt sets.

The PCI interrupt allocation function allows the caller to provide a
maximum and a minimum number of interrupts to be allocated and tries to
allocate as
many as possible. This worked without driver interaction as long as there
was only a single set of interrupts to handle.

With the addition of support for multiple interrupt sets in the generic
affinity spreading logic, which is invoked from the PCI interrupt
allocation, the adaptive loop in the PCI interrupt allocation did not
work for multiple interrupt sets. The reason is that depending on the
total number of interrupts which the PCI allocation adaptive loop tries
to allocate in each step, the number and the size of the interrupt sets
need to be adapted as well. Due to the way the interrupt sets support was
implemented there was no way for the PCI interrupt allocation code or the
core affinity spreading mechanism to invoke a driver specific function
for adapting the interrupt sets configuration.

As a consequence the driver had to implement another adaptive loop around
the PCI interrupt allocation function and calling that with maximum and
minimum interrupts set to the same value. This ensured that the
allocation either succeeded or immediately failed without any attempt to
adjust the number of interrupts in the PCI code.

The core code now allows drivers to provide a callback to recalculate the
number and the size of interrupt sets during PCI interrupt allocation,
which in turn allows the PCI interrupt allocation function to be called
in the same way as with a single set of interrupts. The PCI code handles
the adaptive loop and the interrupt affinity spreading mechanism invokes
the driver callback to adapt the interrupt set configuration to the
current loop value. This replaces the adaptive loop in the driver
completely.

Implement the NVME specific callback which adjusts the interrupt sets
configuration and remove the adaptive allocation loop.

Reviewed-by: Jens Axboe 
Reviewed-by: Keith Busch 
Signed-off-by: Ming Lei 
---
 drivers/nvme/host/pci.c | 62 +
 1 file changed, 16 insertions(+), 46 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 193d94caf457..02ae653bf2c3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2078,14 +2078,25 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, 
unsigned int irq_queues)
}
 }
 
+static void nvme_calc_irq_sets(struct irq_affinity *affd, int nvecs)
+{
+   struct nvme_dev *dev = affd->priv;
+
+   nvme_calc_io_queues(dev, nvecs);
+
+   affd->set_size[HCTX_TYPE_DEFAULT] = dev->io_queues[HCTX_TYPE_DEFAULT];
+   affd->set_size[HCTX_TYPE_READ] = dev->io_queues[HCTX_TYPE_READ];
+   affd->nr_sets = 2;
+}
+
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
struct pci_dev *pdev = to_pci_dev(dev->dev);
struct irq_affinity affd = {
.pre_vectors = 1,
-   .nr_sets = 2,
+   .calc_sets = nvme_calc_irq_sets,
+   .priv = dev,
};
-   int *irq_sets = affd.set_size;
int result = 0;
unsigned int irq_queues, this_p_queues;
 
@@ -2102,50 +2113,8 @@ static int nvme_setup_irqs(struct nvme_dev *dev, 
unsigned int nr_io_queues)
}
dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
 
-   /*
-* For irq sets, we have to ask for minvec == maxvec. This passes
-* any reduction back to us, so we can adjust our queue counts and
-* IRQ vector needs.
-*/
-   do {
-   nvme_calc_io_queues(dev, irq_queues);
-   irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
-   irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
-   if (!irq_sets[1])
-   affd.nr_sets = 1;
-
-   /*
-* If we got a failure and we're down to asking for just
-* 1 + 1 queues, just ask for a single vector. We'll share
-* that between the single IO queue and the admin queue.
-* Otherwise, we assign one independent vector to admin queue.
-*/
-   if (irq_queues > 1)
-   irq_queues = irq_sets[0] + irq_sets[1] + 1;
-
-   result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
-   irq_queues,
-   PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
-
-   /*
-* Need 

[PATCH V4 2/4] genirq/affinity: add new callback for caculating interrupt sets size

2019-02-14 Thread Ming Lei
The interrupt affinity spreading mechanism supports spreading out
affinities for one or more interrupt sets. An interrupt set contains one
or more interrupts. Each set is mapped to a specific functionality of a
device, e.g. general I/O queues and read I/O queues of multiqueue block
devices.

The number of interrupts per set is defined by the driver. It depends on
the total number of available interrupts for the device, which is
determined by the PCI capabilities and the availability of underlying CPU
resources, and the number of queues which the device provides and the
driver wants to instantiate.

The driver passes the initial configuration for the interrupt allocation
via a pointer to struct irq_affinity.

Right now the allocation mechanism is complex as it requires a loop in
the driver to determine the maximum number of interrupts which are
provided by the PCI capabilities and the underlying CPU resources. This
loop would have to be replicated in every driver which wants to utilize
this mechanism. That's unwanted code duplication and error prone.

In order to move this into generic facilities it is required to have a
mechanism, which allows the recalculation of the interrupt sets and
their size, in the core code. As the core code does not have any
knowledge about the underlying device, a driver specific callback will
be added to struct irq_affinity, which will be invoked by the core
code. The callback will get the number of available interrupts as an
argument, so the driver can calculate the corresponding number and size
of interrupt sets.

To support this, two modifications for the handling of struct
irq_affinity are required:

1) The (optional) interrupt sets size information is contained in a
   separate array of integers and struct irq_affinity contains a
   pointer to it.

   This is cumbersome and as the maximum number of interrupt sets is
   small, there is no reason to have separate storage. Moving the size
   array into struct irq_affinity avoids indirections and makes the
   code simpler.

2) At the moment the struct irq_affinity pointer which is handed in from
   the driver and passed through to several core functions is marked
   'const'.

This patch adds a callback to recalculate the number and size of
interrupt sets, and also removes the 'const' qualifier from 'affd'.
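
For reference, the resulting structure, pieced together from patches 1
and 2 of this series (field types as posted here, they may still change):

#define IRQ_AFFINITY_MAX_SETS  4

struct irq_affinity {
        int     pre_vectors;    /* don't spread the first vectors */
        int     post_vectors;   /* don't spread the last vectors */
        int     nr_sets;        /* number of interrupt sets */
        int     set_size[IRQ_AFFINITY_MAX_SETS];
        void    (*calc_sets)(struct irq_affinity *, int nvecs);
        void    *priv;          /* private data for calc_sets */
};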

Reviewed-by: Jens Axboe 
Signed-off-by: Ming Lei 
---
 drivers/pci/msi.c | 18 +-
 include/linux/interrupt.h |  6 +-
 include/linux/pci.h   |  4 ++--
 kernel/irq/affinity.c | 25 -
 4 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 4c0b47867258..96978459e2a0 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -532,7 +532,7 @@ static int populate_msi_sysfs(struct pci_dev *pdev)
 }
 
 static struct msi_desc *
-msi_setup_entry(struct pci_dev *dev, int nvec, const struct irq_affinity *affd)
+msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd)
 {
struct irq_affinity_desc *masks = NULL;
struct msi_desc *entry;
@@ -597,7 +597,7 @@ static int msi_verify_entries(struct pci_dev *dev)
  * which could have been allocated.
  */
 static int msi_capability_init(struct pci_dev *dev, int nvec,
-  const struct irq_affinity *affd)
+  struct irq_affinity *affd)
 {
struct msi_desc *entry;
int ret;
@@ -669,7 +669,7 @@ static void __iomem *msix_map_region(struct pci_dev *dev, 
unsigned nr_entries)
 
 static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
  struct msix_entry *entries, int nvec,
- const struct irq_affinity *affd)
+ struct irq_affinity *affd)
 {
struct irq_affinity_desc *curmsk, *masks = NULL;
struct msi_desc *entry;
@@ -736,7 +736,7 @@ static void msix_program_entries(struct pci_dev *dev,
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev, struct msix_entry 
*entries,
-   int nvec, const struct irq_affinity *affd)
+   int nvec, struct irq_affinity *affd)
 {
int ret;
u16 control;
@@ -932,7 +932,7 @@ int pci_msix_vec_count(struct pci_dev *dev)
 EXPORT_SYMBOL(pci_msix_vec_count);
 
 static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
-int nvec, const struct irq_affinity *affd)
+int nvec, struct irq_affinity *affd)
 {
int nr_entries;
int i, j;
@@ -1018,7 +1018,7 @@ int pci_msi_enabled(void)
 EXPORT_SYMBOL(pci_msi_enabled);
 
 static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
- const struct irq_affinity *affd)
+ struct irq_

[PATCH V4 1/4] genirq/affinity: store interrupt sets size in 'struct irq_affinity'

2019-02-14 Thread Ming Lei
The interrupt affinity spreading mechanism supports spreading out
affinities for one or more interrupt sets. An interrupt set contains one
or more interrupts. Each set is mapped to a specific functionality of a
device, e.g. general I/O queues and read I/O queues of multiqueue block
devices.

The number of interrupts per set is defined by the driver. It depends on
the total number of available interrupts for the device, which is
determined by the PCI capabilities and the availability of underlying CPU
resources, and the number of queues which the device provides and the
driver wants to instantiate.

The driver passes the initial configuration for the interrupt allocation
via a pointer to struct irq_affinity.

Right now the allocation mechanism is complex as it requires a loop in
the driver to determine the maximum number of interrupts which are
provided by the PCI capabilities and the underlying CPU resources. This
loop would have to be replicated in every driver which wants to utilize
this mechanism. That's unwanted code duplication and error prone.

In order to move this into generic facilities it is required to have a
mechanism, which allows the recalculation of the interrupt sets and
their size, in the core code. As the core code does not have any
knowledge about the underlying device, a driver specific callback will
be added to struct irq_affinity, which will be invoked by the core
code. The callback will get the number of available interrupts as an
argument, so the driver can calculate the corresponding number and size
of interrupt sets.

To support this, two modifications for the handling of struct
irq_affinity are required:

1) The (optional) interrupt sets size information is contained in a
   separate array of integers and struct irq_affinity contains a
   pointer to it.

   This is cumbersome and as the maximum number of interrupt sets is
   small, there is no reason to have separate storage. Moving the size
   array into struct irq_affinity avoids indirections and makes the
   code simpler.

2) At the moment the struct irq_affinity pointer which is handed in from
   the driver and passed through to several core functions is marked
   'const'.

With the upcoming callback to recalculate the number and size of
interrupt sets, it's necessary to remove the 'const' qualifier.
Otherwise the callback would not be able to update the data.

This patch does the first part and stores the interrupt set sizes in
'struct irq_affinity'.

Reviewed-by: Jens Axboe 
Signed-off-by: Ming Lei 
---
 drivers/nvme/host/pci.c   |  5 ++---
 include/linux/interrupt.h |  6 --
 kernel/irq/affinity.c | 15 ---
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 022ea1ee63f8..193d94caf457 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2081,12 +2081,11 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, 
unsigned int irq_queues)
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
struct pci_dev *pdev = to_pci_dev(dev->dev);
-   int irq_sets[2];
struct irq_affinity affd = {
.pre_vectors = 1,
-   .nr_sets = ARRAY_SIZE(irq_sets),
-   .sets = irq_sets,
+   .nr_sets = 2,
};
+   int *irq_sets = affd.set_size;
int result = 0;
unsigned int irq_queues, this_p_queues;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7c9434652f36..d9dd5bd61e36 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -259,6 +259,8 @@ struct irq_affinity_notify {
void (*release)(struct kref *ref);
 };
 
+#defineIRQ_AFFINITY_MAX_SETS  4
+
 /**
  * struct irq_affinity - Description for automatic irq affinity assignements
  * @pre_vectors:   Don't apply affinity to @pre_vectors at beginning of
@@ -266,13 +268,13 @@ struct irq_affinity_notify {
  * @post_vectors:  Don't apply affinity to @post_vectors at end of
  * the MSI(-X) vector space
  * @nr_sets:   Length of passed in *sets array
- * @sets:  Number of affinitized sets
+ * @set_size:  Number of affinitized sets
  */
 struct irq_affinity {
int pre_vectors;
int post_vectors;
int nr_sets;
-   int *sets;
+   int set_size[IRQ_AFFINITY_MAX_SETS];
 };
 
 /**
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 118b66d64a53..a5e3e5fb3b92 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -245,6 +245,7 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
int curvec, usedvecs;
struct irq_affinity_desc *masks = NULL;
int i, nr_sets;
+   int set_size[IRQ_AFFINITY_MAX_SETS];
 
/*
 * If there aren't any vectors left after applying the pre/post
@@ -253,6 +254,9 @@ irq_c

[PATCH V4 0/4] genirq/affinity: add .calc_sets for improving IRQ allocation & spread

2019-02-14 Thread Ming Lei
Hi,

Currently pre-calculated set vectors are provided by the driver for
allocating & spreading vectors. This only works when the driver passes
the same 'max_vecs' and 'min_vecs' to pci_alloc_irq_vectors_affinity(),
and it also requires the driver to retry the allocation & spreading.

As Bjorn and Keith mentioned, the current usage & interface for irq sets
is a bit awkward because the retrying should have been avoided by
providing one reasonable 'min_vecs'. However, if 'min_vecs' isn't the
same as 'max_vecs', the number of allocated vectors is unknown before
calling pci_alloc_irq_vectors_affinity(), so each set's vectors can't be
pre-calculated.

Add a new callback, .calc_sets, to 'struct irq_affinity' so that the
driver can calculate each set's vectors after the IRQ vectors are
allocated and before they are spread.

V4:
- re-writen changelogs by Thomas Gleixner
- merge patch of removing 'const' into the patch
for adding new callback
 
V3:
- don't mark 'affd' as const in related functions
- add sanity check on affd->nr_sets
- remove the local variable of 'nr_sets' in irq_create_affinity_masks
- set .nr_sets as 2 in nvme
- update comment in msi.c

V2:
- add .calc_sets instead of .setup_affinity() which is easy to
be abused by drivers

Ming Lei (4):
  genirq/affinity: store interrupt sets size in 'struct irq_affinity'
  genirq/affinity: add new callback for caculating interrupt sets size
  nvme-pci: Simplify interrupt allocation
  PCI: Document .calc_sets as required in case of multiple interrupt
sets

 drivers/nvme/host/pci.c   | 63 ---
 drivers/pci/msi.c | 32 +---
 include/linux/interrupt.h | 13 +++---
 include/linux/pci.h   |  4 +--
 kernel/irq/affinity.c | 26 ---
 5 files changed, 62 insertions(+), 76 deletions(-)

-- 
2.9.5



[PATCH V3 1/5] genirq/affinity: don't mark 'affd' as const

2019-02-13 Thread Ming Lei
Currently all parameters in 'affd' are read-only, so 'affd' is marked
as const in both pci_alloc_irq_vectors_affinity() and
irq_create_affinity_masks().

We will have to ask the driver to re-calculate the set vectors after the
whole set of IRQ vectors has been allocated, and the result needs to be
stored in 'affd'. Also, both interfaces are core APIs, which should be
trusted.

So don't mark 'affd' as const in either pci_alloc_irq_vectors_affinity()
or irq_create_affinity_masks().

Signed-off-by: Ming Lei 
---
 drivers/pci/msi.c | 18 +-
 include/linux/interrupt.h |  2 +-
 include/linux/pci.h   |  4 ++--
 kernel/irq/affinity.c |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 4c0b47867258..96978459e2a0 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -532,7 +532,7 @@ static int populate_msi_sysfs(struct pci_dev *pdev)
 }
 
 static struct msi_desc *
-msi_setup_entry(struct pci_dev *dev, int nvec, const struct irq_affinity *affd)
+msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd)
 {
struct irq_affinity_desc *masks = NULL;
struct msi_desc *entry;
@@ -597,7 +597,7 @@ static int msi_verify_entries(struct pci_dev *dev)
  * which could have been allocated.
  */
 static int msi_capability_init(struct pci_dev *dev, int nvec,
-  const struct irq_affinity *affd)
+  struct irq_affinity *affd)
 {
struct msi_desc *entry;
int ret;
@@ -669,7 +669,7 @@ static void __iomem *msix_map_region(struct pci_dev *dev, 
unsigned nr_entries)
 
 static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
  struct msix_entry *entries, int nvec,
- const struct irq_affinity *affd)
+ struct irq_affinity *affd)
 {
struct irq_affinity_desc *curmsk, *masks = NULL;
struct msi_desc *entry;
@@ -736,7 +736,7 @@ static void msix_program_entries(struct pci_dev *dev,
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev, struct msix_entry 
*entries,
-   int nvec, const struct irq_affinity *affd)
+   int nvec, struct irq_affinity *affd)
 {
int ret;
u16 control;
@@ -932,7 +932,7 @@ int pci_msix_vec_count(struct pci_dev *dev)
 EXPORT_SYMBOL(pci_msix_vec_count);
 
 static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
-int nvec, const struct irq_affinity *affd)
+int nvec, struct irq_affinity *affd)
 {
int nr_entries;
int i, j;
@@ -1018,7 +1018,7 @@ int pci_msi_enabled(void)
 EXPORT_SYMBOL(pci_msi_enabled);
 
 static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
- const struct irq_affinity *affd)
+ struct irq_affinity *affd)
 {
int nvec;
int rc;
@@ -1086,7 +1086,7 @@ EXPORT_SYMBOL(pci_enable_msi);
 
 static int __pci_enable_msix_range(struct pci_dev *dev,
   struct msix_entry *entries, int minvec,
-  int maxvec, const struct irq_affinity *affd)
+  int maxvec, struct irq_affinity *affd)
 {
int rc, nvec = maxvec;
 
@@ -1165,9 +1165,9 @@ EXPORT_SYMBOL(pci_enable_msix_range);
  */
 int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
   unsigned int max_vecs, unsigned int flags,
-  const struct irq_affinity *affd)
+  struct irq_affinity *affd)
 {
-   static const struct irq_affinity msi_default_affd;
+   struct irq_affinity msi_default_affd = {0};
int msix_vecs = -ENOSPC;
int msi_vecs = -ENOSPC;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7c9434652f36..1ed1014c9684 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -332,7 +332,7 @@ extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify 
*notify);
 
 struct irq_affinity_desc *
-irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+irq_create_affinity_masks(int nvec, struct irq_affinity *affd);
 
 int irq_calc_affinity_vectors(int minvec, int maxvec, const struct 
irq_affinity *affd);
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 40b327b814aa..4eca42cf611b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1396,7 +1396,7 @@ static inline int pci_enable_msix_exact(struct pci_dev 
*dev,
 }
 int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
   unsigned int max_vecs, unsigned int fl

[PATCH V3 5/5] genirq/affinity: Document .calc_sets as required in case of multiple sets

2019-02-13 Thread Ming Lei
NVMe now implements the .calc_sets callback for calculating each set's
vectors.

For other cases of multiple IRQ sets, pre-calculating each set's vectors
before allocating the IRQ vectors can't work because the total number of
vectors is unknown at that time.

So explicitly document .calc_sets as required for multiple sets.

Acked-by: Bjorn Helgaas 
Signed-off-by: Ming Lei 
---
 drivers/pci/msi.c | 16 ++--
 include/linux/interrupt.h |  3 ++-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 96978459e2a0..199d708b4099 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1036,10 +1036,12 @@ static int __pci_enable_msi_range(struct pci_dev *dev, 
int minvec, int maxvec,
return -ERANGE;
 
/*
-* If the caller is passing in sets, we can't support a range of
-* vectors. The caller needs to handle that.
+* If the caller requests multiple sets of IRQs where each set
+* requires different affinity, it must also supply a ->calc_sets()
+* callback to compute vectors for each set after whole vectors are
+* allocated.
 */
-   if (affd && affd->nr_sets && minvec != maxvec)
+   if (affd && affd->nr_sets > 1 && !affd->calc_sets)
return -EINVAL;
 
if (WARN_ON_ONCE(dev->msi_enabled))
@@ -1094,10 +1096,12 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
return -ERANGE;
 
/*
-* If the caller is passing in sets, we can't support a range of
-* supported vectors. The caller needs to handle that.
+* If the caller requests multiple sets of IRQs where each set
+* requires different affinity, it must also supply a ->calc_sets()
+* callback to compute vectors for each set after whole vectors are
+* allocated.
 */
-   if (affd && affd->nr_sets && minvec != maxvec)
+   if (affd && affd->nr_sets > 1 && !affd->calc_sets)
return -EINVAL;
 
if (WARN_ON_ONCE(dev->msix_enabled))
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7a27f6ba1f2f..a053f7fb0ff1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -269,7 +269,8 @@ struct irq_affinity_notify {
  * the MSI(-X) vector space
  * @nr_sets:   Length of passed in *sets array
  * @set_vectors:   Number of affinitized sets
- * @calc_sets: Callback for caculating set vectors
+ * @calc_sets: Callback for caculating set vectors, required for
+ * multiple irq sets.
  * @priv:  Private data of @calc_sets
  */
 struct irq_affinity {
-- 
2.9.5



[PATCH V3 0/5] genirq/affinity: add .calc_sets for improving IRQ allocation & spread

2019-02-13 Thread Ming Lei
Hi,

Currently pre-calculated set vectors are provided by the driver for
allocating & spreading vectors. This only works when the driver passes
the same 'max_vecs' and 'min_vecs' to pci_alloc_irq_vectors_affinity(),
and it also requires the driver to retry the allocation & spreading.

As Bjorn and Keith mentioned, the current usage & interface for irq sets
is a bit awkward because the retrying should have been avoided by
providing one reasonable 'min_vecs'. However, if 'min_vecs' isn't the
same as 'max_vecs', the number of allocated vectors is unknown before
calling pci_alloc_irq_vectors_affinity(), so each set's vectors can't be
pre-calculated.

Add a new callback, .calc_sets, to 'struct irq_affinity' so that the
driver can calculate each set's vectors after the IRQ vectors are
allocated and before they are spread.
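
For clarity, the callback added in patch 3/5 of this series has the
following prototype:

        void (*calc_sets)(struct irq_affinity *affd, int nvecs);

The core invokes it once, after the final number of vectors is known and
before the vectors are spread.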

V3:
- don't mark 'affd' as const in related functions
- add sanity check on affd->nr_sets
- remove the local variable of 'nr_sets' in irq_create_affinity_masks
- set .nr_sets as 2 in nvme
- update comment in msi.c

V2:
- add .calc_sets instead of .setup_affinity() which is easy to
be abused by drivers

Ming Lei (5):
  genirq/affinity: don't mark 'affd' as const
  genirq/affinity: store irq set vectors in 'struct irq_affinity'
  genirq/affinity: add new callback for caculating set vectors
  nvme-pci: avoid irq allocation retrying via .calc_sets
  genirq/affinity: Document .calc_sets as required in case of multiple
sets

 drivers/nvme/host/pci.c   | 63 ---
 drivers/pci/msi.c | 34 ++---
 include/linux/interrupt.h | 13 +++---
 include/linux/pci.h   |  4 +--
 kernel/irq/affinity.c | 26 ---
 5 files changed, 64 insertions(+), 76 deletions(-)

-- 
2.9.5



[PATCH V3 3/5] genirq/affinity: add new callback for caculating set vectors

2019-02-13 Thread Ming Lei
Currently pre-calculated set vectors are provided by the driver for
allocating & spreading vectors. This only works when the driver passes
the same 'max_vecs' and 'min_vecs' to pci_alloc_irq_vectors_affinity(),
and it also requires the driver to retry the allocation & spreading.

As Bjorn and Keith mentioned, the current usage & interface for irq sets
is a bit awkward because the retrying should have been avoided by
providing one reasonable 'min_vecs'. However, if 'min_vecs' isn't the
same as 'max_vecs', the number of allocated vectors is unknown before
calling pci_alloc_irq_vectors_affinity(), so each set's vectors can't be
pre-calculated.

Add a new callback, .calc_sets, to 'struct irq_affinity' so that the
driver can calculate each set's vectors after the IRQ vectors are
allocated and before they are spread. Add 'priv' so that the driver can
retrieve its private data via 'struct irq_affinity'.

Suggested-by: Thomas Gleixner 
Signed-off-by: Ming Lei 
---
 include/linux/interrupt.h | 4 
 kernel/irq/affinity.c | 8 ++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a20150627a32..7a27f6ba1f2f 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -269,12 +269,16 @@ struct irq_affinity_notify {
  * the MSI(-X) vector space
  * @nr_sets:   Length of passed in *sets array
  * @set_vectors:   Number of affinitized sets
+ * @calc_sets: Callback for caculating set vectors
+ * @priv:  Private data of @calc_sets
  */
 struct irq_affinity {
int pre_vectors;
int post_vectors;
int nr_sets;
int set_vectors[IRQ_MAX_SETS];
+   void(*calc_sets)(struct irq_affinity *, int nvecs);
+   void*priv;
 };
 
 /**
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index b868b9d3df7f..2341b1f005fa 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -267,7 +267,9 @@ irq_create_affinity_masks(int nvecs, struct irq_affinity 
*affd)
 * Spread on present CPUs starting from affd->pre_vectors. If we
 * have multiple sets, build each sets affinity mask separately.
 */
-   if (!affd->nr_sets) {
+   if (affd->calc_sets) {
+   affd->calc_sets(affd, nvecs);
+   } else if (!affd->nr_sets) {
affd->nr_sets = 1;
affd->set_vectors[0] = affvecs;
}
@@ -316,7 +318,9 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const 
struct irq_affinity
if (resv > minvec)
return 0;
 
-   if (affd->nr_sets) {
+   if (affd->calc_sets) {
+   set_vecs = vecs;
+   } else if (affd->nr_sets) {
int i;
 
for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
-- 
2.9.5



[PATCH V3 4/5] nvme-pci: avoid irq allocation retrying via .calc_sets

2019-02-13 Thread Ming Lei
Currently each set's vectors are pre-calculated, and this requires the
same 'max_vecs' and 'min_vecs' to be passed to
pci_alloc_irq_vectors_affinity(), so nvme_setup_irqs() has to retry in
case of allocation failure.

This usage & interface is a bit awkward because the retry should have
been avoided by providing one reasonable 'min_vecs'.

Implement the .calc_sets callback, so that
pci_alloc_irq_vectors_affinity() can calculate each set's vectors after
the IRQ vectors are allocated and before they are spread; NVMe's retry
in case of irq allocation failure can then be removed.

Reviewed-by: Keith Busch 
Signed-off-by: Ming Lei 
---
 drivers/nvme/host/pci.c | 62 +
 1 file changed, 16 insertions(+), 46 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0086bdf80ea1..8c51252a897e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2078,14 +2078,25 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, 
unsigned int irq_queues)
}
 }
 
+static void nvme_calc_irq_sets(struct irq_affinity *affd, int nvecs)
+{
+   struct nvme_dev *dev = affd->priv;
+
+   nvme_calc_io_queues(dev, nvecs);
+
+   affd->set_vectors[HCTX_TYPE_DEFAULT] = 
dev->io_queues[HCTX_TYPE_DEFAULT];
+   affd->set_vectors[HCTX_TYPE_READ] = dev->io_queues[HCTX_TYPE_READ];
+   affd->nr_sets = 2;
+}
+
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
struct pci_dev *pdev = to_pci_dev(dev->dev);
struct irq_affinity affd = {
.pre_vectors = 1,
-   .nr_sets = 2,
+   .calc_sets = nvme_calc_irq_sets,
+   .priv = dev,
};
-   int *irq_sets = affd.set_vectors;
int result = 0;
unsigned int irq_queues, this_p_queues;
 
@@ -2102,50 +2113,8 @@ static int nvme_setup_irqs(struct nvme_dev *dev, 
unsigned int nr_io_queues)
}
dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
 
-   /*
-* For irq sets, we have to ask for minvec == maxvec. This passes
-* any reduction back to us, so we can adjust our queue counts and
-* IRQ vector needs.
-*/
-   do {
-   nvme_calc_io_queues(dev, irq_queues);
-   irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
-   irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
-   if (!irq_sets[1])
-   affd.nr_sets = 1;
-
-   /*
-* If we got a failure and we're down to asking for just
-* 1 + 1 queues, just ask for a single vector. We'll share
-* that between the single IO queue and the admin queue.
-* Otherwise, we assign one independent vector to admin queue.
-*/
-   if (irq_queues > 1)
-   irq_queues = irq_sets[0] + irq_sets[1] + 1;
-
-   result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
-   irq_queues,
-   PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
-
-   /*
-* Need to reduce our vec counts. If we get ENOSPC, the
-* platform should support mulitple vecs, we just need
-* to decrease our ask. If we get EINVAL, the platform
-* likely does not. Back down to ask for just one vector.
-*/
-   if (result == -ENOSPC) {
-   irq_queues--;
-   if (!irq_queues)
-   return result;
-   continue;
-   } else if (result == -EINVAL) {
-   irq_queues = 1;
-   continue;
-   } else if (result <= 0)
-   return -EIO;
-   break;
-   } while (1);
-
+   result = pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
+   PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
return result;
 }
 
@@ -3021,6 +2990,7 @@ static struct pci_driver nvme_driver = {
 
 static int __init nvme_init(void)
 {
+   BUILD_BUG_ON(2 > IRQ_MAX_SETS);
return pci_register_driver(&nvme_driver);
 }
 
-- 
2.9.5



[PATCH V3 2/5] genirq/affinity: store irq set vectors in 'struct irq_affinity'

2019-02-13 Thread Ming Lei
Currently the array of irq set vectors is provided by the driver.

irq_create_affinity_masks() can be simplified a bit by treating the
non-irq-set case as a single irq set.

So move this array into 'struct irq_affinity', and pre-define the
maximum number of sets as 4, which should be enough for normal cases.

Signed-off-by: Ming Lei 
---
 drivers/nvme/host/pci.c   |  5 ++---
 include/linux/interrupt.h |  6 --
 kernel/irq/affinity.c | 18 +++---
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 022ea1ee63f8..0086bdf80ea1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2081,12 +2081,11 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, 
unsigned int irq_queues)
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
struct pci_dev *pdev = to_pci_dev(dev->dev);
-   int irq_sets[2];
struct irq_affinity affd = {
.pre_vectors = 1,
-   .nr_sets = ARRAY_SIZE(irq_sets),
-   .sets = irq_sets,
+   .nr_sets = 2,
};
+   int *irq_sets = affd.set_vectors;
int result = 0;
unsigned int irq_queues, this_p_queues;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 1ed1014c9684..a20150627a32 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -259,6 +259,8 @@ struct irq_affinity_notify {
void (*release)(struct kref *ref);
 };
 
+#defineIRQ_MAX_SETS  4
+
 /**
  * struct irq_affinity - Description for automatic irq affinity assignements
  * @pre_vectors:   Don't apply affinity to @pre_vectors at beginning of
@@ -266,13 +268,13 @@ struct irq_affinity_notify {
  * @post_vectors:  Don't apply affinity to @post_vectors at end of
  * the MSI(-X) vector space
  * @nr_sets:   Length of passed in *sets array
- * @sets:  Number of affinitized sets
+ * @set_vectors:   Number of affinitized sets
  */
 struct irq_affinity {
int pre_vectors;
int post_vectors;
int nr_sets;
-   int *sets;
+   int set_vectors[IRQ_MAX_SETS];
 };
 
 /**
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 9200d3b26f7d..b868b9d3df7f 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -244,7 +244,7 @@ irq_create_affinity_masks(int nvecs, struct irq_affinity 
*affd)
int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
int curvec, usedvecs;
struct irq_affinity_desc *masks = NULL;
-   int i, nr_sets;
+   int i;
 
/*
 * If there aren't any vectors left after applying the pre/post
@@ -253,6 +253,9 @@ irq_create_affinity_masks(int nvecs, struct irq_affinity 
*affd)
if (nvecs == affd->pre_vectors + affd->post_vectors)
return NULL;
 
+   if (affd->nr_sets > IRQ_MAX_SETS)
+   return NULL;
+
masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
if (!masks)
return NULL;
@@ -264,12 +267,13 @@ irq_create_affinity_masks(int nvecs, struct irq_affinity 
*affd)
 * Spread on present CPUs starting from affd->pre_vectors. If we
 * have multiple sets, build each sets affinity mask separately.
 */
-   nr_sets = affd->nr_sets;
-   if (!nr_sets)
-   nr_sets = 1;
+   if (!affd->nr_sets) {
+   affd->nr_sets = 1;
+   affd->set_vectors[0] = affvecs;
+   }
 
-   for (i = 0, usedvecs = 0; i < nr_sets; i++) {
-   int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+   for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+   int this_vecs = affd->set_vectors[i];
int ret;
 
ret = irq_build_affinity_masks(affd, curvec, this_vecs,
@@ -316,7 +320,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const 
struct irq_affinity
int i;
 
for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
-   set_vecs += affd->sets[i];
+   set_vecs += affd->set_vectors[i];
} else {
get_online_cpus();
set_vecs = cpumask_weight(cpu_possible_mask);
-- 
2.9.5



Re: blktests nvme/012 triggering LOCKDEP warning

2019-02-12 Thread Ming Lei
On Wed, Feb 13, 2019 at 12:33 AM Theodore Y. Ts'o  wrote:
>
> Is this a known issue?  nvme/012 is triggering the following lockdep warning:
>
> Thanks,
>
> - Ted
>
> [ 1964.751910] run blktests nvme/012 at 2019-02-11 20:58:31
> [ 1964.977624] nvmet: adding nsid 1 to subsystem blktests-subsystem-1
> [ 1965.006395] nvmet: creating controller 1 for subsystem 
> blktests-subsystem-1 for NQN 
> nqn.2014-08.org.nvmexpress:uuid:8a58b187-6d09-4c5d-bc03-593896d2d80d.
> [ 1965.011811] nvme nvme0: ANA group 1: optimized.
> [ 1965.011899] nvme nvme0: creating 2 I/O queues.
> [ 1965.013966] nvme nvme0: new ctrl: "blktests-subsystem-1"
>
> [ 1965.282478] 
> [ 1965.287922] WARNING: possible recursive locking detected
> [ 1965.293364] 5.0.0-rc3-xfstests-00015-g1236f7d60242 #841 Not tainted
> [ 1965.299762] 
> [ 1965.305207] ksoftirqd/1/16 is trying to acquire lock:
> [ 1965.310389] 0282032e (&(&fq->mq_flush_lock)->rlock){..-.}, at: 
> flush_end_io+0x4e/0x1d0
> [ 1965.319146]
>but task is already holding lock:
> [ 1965.325106] cbadcbc2 (&(&fq->mq_flush_lock)->rlock){..-.}, at: 
> flush_end_io+0x4e/0x1d0
> [ 1965.333957]
>other info that might help us debug this:
> [ 1965.340615]  Possible unsafe locking scenario:
>
> [ 1965.346664]CPU0
> [ 1965.349248]
> [ 1965.351820]   lock(&(&fq->mq_flush_lock)->rlock);
> [ 1965.356654]   lock(&(&fq->mq_flush_lock)->rlock);
> [ 1965.361490]
> *** DEADLOCK ***
>
> [ 1965.367541]  May be due to missing lock nesting notation
>
> [ 1965.374636] 1 lock held by ksoftirqd/1/16:
> [ 1965.378890]  #0: cbadcbc2 (&(&fq->mq_flush_lock)->rlock){..-.}, 
> at: flush_end_io+0x4e/0x1d0
> [ 1965.388080]
>stack backtrace:
> [ 1965.392570] CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 
> 5.0.0-rc3-xfstests-00015-g1236f7d60242 #841
> [ 1965.402002] Hardware name: Google Google Compute Engine/Google Compute 
> Engine, BIOS Google 01/01/2011
> [ 1965.411411] Call Trace:
> [ 1965.413996]  dump_stack+0x67/0x90
> [ 1965.417433]  __lock_acquire.cold.45+0x2b4/0x313
> [ 1965.422194]  lock_acquire+0x98/0x160
> [ 1965.425894]  ? flush_end_io+0x4e/0x1d0
> [ 1965.429817]  _raw_spin_lock_irqsave+0x3b/0x80
> [ 1965.434299]  ? flush_end_io+0x4e/0x1d0
> [ 1965.438162]  flush_end_io+0x4e/0x1d0
> [ 1965.441909]  blk_mq_complete_request+0x76/0x110
> [ 1965.446580]  nvmet_req_complete+0x15/0x110 [nvmet]
> [ 1965.452098]  nvmet_bio_done+0x27/0x50 [nvmet]
> [ 1965.456634]  blk_update_request+0xd7/0x2d0
> [ 1965.460869]  blk_mq_end_request+0x1a/0x100
> [ 1965.465091]  blk_flush_complete_seq+0xe5/0x350
> [ 1965.469660]  flush_end_io+0x12f/0x1d0
> [ 1965.473436]  blk_done_softirq+0x9f/0xd0
> [ 1965.477398]  __do_softirq+0xca/0x440
> [ 1965.481092]  ? smpboot_thread_fn+0x2f/0x1e0
> [ 1965.485512]  ? smpboot_thread_fn+0x74/0x1e0
> [ 1965.489813]  ? smpboot_thread_fn+0x118/0x1e0
> [ 1965.494379]  run_ksoftirqd+0x24/0x50
> [ 1965.498081]  smpboot_thread_fn+0x113/0x1e0
> [ 1965.502399]  ? sort_range+0x20/0x20
> [ 1965.506008]  kthread+0x121/0x140
> [ 1965.509395]  ? kthread_park+0x80/0x80
> [ 1965.513290]  ret_from_fork+0x3a/0x50
> [ 1965.527032] XFS (nvme0n1): Mounting V5 Filesystem
> [ 1965.541778] XFS (nvme0n1): Ending clean mount
> [ 2064.142830] XFS (nvme0n1): Unmounting Filesystem
> [ 2064.171432] nvme nvme0: Removing ctrl: NQN "blktests-subsystem-1"

That is a false positive.

It is caused by calling the host request's completion handler directly
from the target IO's completion handler, which should only happen with
nvme-loop.

We may need to annotate the locks in .end_io of blk-flush to avoid this
warning.

BTW, this way of handling IO completion in nvme-loop may trigger a soft
lockup too.
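
A minimal sketch of such an annotation, assuming the nested acquisition is
on fq->mq_flush_lock in blk-flush's completion path; spin_lock_irqsave_nested()
and SINGLE_DEPTH_NESTING are existing kernel primitives, but the exact spot
in blk-flush where this would go is only an assumption here:

/*
 * Sketch only: use a lockdep subclass for the nested acquisition so the
 * nvme-loop path (host completion called from target completion) is not
 * reported as recursive locking.
 */
static void flush_end_io_annotated(struct request *flush_rq, blk_status_t error)
{
	struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->q,
							 flush_rq->mq_ctx);
	unsigned long flags;

	/* nested subclass instead of plain spin_lock_irqsave() */
	spin_lock_irqsave_nested(&fq->mq_flush_lock, flags,
				 SINGLE_DEPTH_NESTING);
	/* ... original flush sequence handling goes here ... */
	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}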

Thanks,
Ming Lei


Re: qla2xxx init warning with blk_mq at drivers/pci/msi.c:1273 pci_irq_get_affinity+0xf4/0x120

2019-02-12 Thread Ming Lei
On Wed, Feb 13, 2019 at 5:36 AM Meelis Roos  wrote:
>
> >  I tested todays 5.0.0-rc5-00358-gdf3865f on my sparcs and a couple of 
> > servers that have qla2xxx
> >  FC adapter gave me this warning:
>
> Now I got a very similar one on an x86-64 server:
>
>
> [   18.472568] qla2xxx [:00:00.0]-0005: : QLogic Fibre Channel HBA 
> Driver: 10.00.00.12-k.
> [   18.474272] PCI Interrupt Link [LNKD] enabled at IRQ 19
> [   18.474272] qla2xxx [:04:00.0]-001d: : Found an ISP2432 irq 19 iobase 
> 0x(ptrval).
> [   18.917293] scsi host3: qla2xxx
> [   18.917412] WARNING: CPU: 0 PID: 1650 at drivers/pci/msi.c:1273 
> pci_irq_get_affinity+0x35/0x80
> [   18.917554] Modules linked in: qla2xxx(+) scsi_transport_fc i2c_nforce2 
> e1000 pata_amd k8temp libata i2c_core hwmon forcedeth(+) powernow_k8 pcspkr 
> autofs4
> [   18.917708] CPU: 0 PID: 1650 Comm: kworker/0:3 Not tainted 
> 5.0.0-rc6-00013-gaa0c38cf39de #14
> [   18.917848] Hardware name: Sun Microsystems Sun Fire X4200 M2/Sun Fire 
> X4200 M2, BIOS 0ABJX104 04/09/2009
> [   18.918002] Workqueue: events work_for_cpu_fn
> [   18.918082] RIP: 0010:pci_irq_get_affinity+0x35/0x80
> [   18.918164] Code: 2e 48 8b 87 b8 02 00 00 48 8d 8f b8 02 00 00 48 39 c1 74 
> 16 85 f6 74 4e 31 d2 eb 04 39 d6 74 46 48 8b 00 ff c2 48 39 c8 75 f2 <0f> 0b 
> 31 c0 c3 83 e2 02 48 c7 c0 c0 46 d5 b8 74 29 48 8b 97 b8 02
> [   18.918374] RSP: 0018:a15f4108fcb0 EFLAGS: 00010246
> [   18.918460] RAX: 95e7995f52b8 RBX:  RCX: 
> 95e7995f52b8
> [   18.918547] RDX: 0002 RSI: 0002 RDI: 
> 95e7995f5000
> [   18.918629] RBP:  R08: ff80 R09: 
> 95e799008e60
> [   18.918712] R10: b90b0f01 R11: a15f4108fa90 R12: 
> 0002
> [   18.918795] R13: 95e7995f5000 R14:  R15: 
> 95e7969890a8
> [   18.918878] FS:  () GS:95e79b40() 
> knlGS:
> [   18.919013] CS:  0010 DS:  ES:  CR0: 80050033
> [   18.919103] CR2: 7f213fb62000 CR3: 000217dc CR4: 
> 06f0
> [   18.919185] Call Trace:
> [   18.919269]  blk_mq_pci_map_queues+0x32/0xc0
> [   18.919354]  blk_mq_alloc_tag_set+0x12e/0x340
> [   18.919437]  scsi_add_host_with_dma+0xa1/0x310
> [   18.919549]  qla2x00_probe_one+0x1272/0x2400 [qla2xxx]
> [   18.919632]  ? try_to_wake_up+0x2c8/0x6f0
> [   18.919710]  local_pci_probe+0x4b/0xb0
> [   18.919787]  work_for_cpu_fn+0x11/0x20
> [   18.919866]  process_one_work+0x1d1/0x360
> [   18.919943]  worker_thread+0x20e/0x3f0
> [   18.920020]  ? wq_calc_node_cpumask+0x110/0x110
> [   18.920100]  kthread+0x109/0x120
> [   18.920176]  ? kthread_create_on_node+0x60/0x60
> [   18.920256]  ret_from_fork+0x35/0x40
> [   18.920334] ---[ end trace 56ed281ce2c61e69 ]---
> [   18.926988] qla2xxx [:04:00.0]-00fb:3: QLogic QLE2460 - 
> SG-(X)PCIE1FC-QF4, Sun StorageTek 4 Gb FC Enterprise PCI-Express Single 
> Channel.
> [   18.927148] qla2xxx [:04:00.0]-00fc:3: ISP2432: PCIe (2.5GT/s x4) @ 
> :04:00.0 hdma+ host#=3 fw=8.07.00 (9496).
> [   18.927757] qla2xxx [:83:00.0]-001d: : Found an ISP2432 irq 30 iobase 
> 0x(ptrval).
> [   19.350303] qla2xxx [:83:00.0]-00fb:4: QLogic QLE2460 - 
> SG-(X)PCIE1FC-QF4, Sun StorageTek 4 Gb FC Enterprise PCI-Express Single 
> Channel.
> [   19.350474] qla2xxx [:83:00.0]-00fc:4: ISP2432: PCIe (2.5GT/s x4) @ 
> :83:00.0 hdma+ host#=4 fw=8.07.00 (9496).
> [   40.421151] qla2xxx [:04:00.0]-8038:3: Cable is unplugged...
> [   40.837159] qla2xxx [:83:00.0]-8038:4: Cable is unplugged...

We have seen such an issue too, in which .msix_count is too small and
equal to .pre_vectors; Himanshu is working on it.
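
For reference, a hedged sketch (not the actual qla2xxx fix) of the kind of
driver-side guard that avoids this: when no affinity-managed vectors were
granted beyond the reserved ones, fall back to the plain CPU mapping
instead of asking pci_irq_get_affinity() for masks that were never
allocated. The helper name and the msix_count/pre_vectors parameters are
illustrative; the blk_mq_* calls are existing APIs.

static int qla_map_queues_sketch(struct blk_mq_queue_map *qmap,
				 struct pci_dev *pdev, int offset,
				 int msix_count, int pre_vectors)
{
	/* no managed vectors granted -> pci_irq_get_affinity() would WARN */
	if (msix_count <= pre_vectors)
		return blk_mq_map_queues(qmap);

	return blk_mq_pci_map_queues(qmap, pdev, offset);
}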

Thanks,
Ming Lei


[PATCH V2 3/4] nvme-pci: avoid irq allocation retrying via .calc_sets

2019-02-12 Thread Ming Lei
Currently each set's vector count is pre-calculated, which requires the
same 'max_vecs' and 'min_vecs' to be passed to
pci_alloc_irq_vectors_affinity(); nvme_setup_irqs() then has to retry in
case of allocation failure.

This usage & interface is a bit awkward because the retry should have
been avoided by providing a reasonable 'min_vecs'.

Implement the .calc_sets callback, so that pci_alloc_irq_vectors_affinity()
can calculate each set's vector count after the IRQ vectors are allocated
and before they are spread; NVMe's retry in case of irq allocation
failure can then be removed.

Signed-off-by: Ming Lei 
---
 drivers/nvme/host/pci.c | 62 +
 1 file changed, 16 insertions(+), 46 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0086bdf80ea1..ca381894542a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2078,14 +2078,25 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, 
unsigned int irq_queues)
}
 }
 
+static void nvme_calc_irq_sets(struct irq_affinity *affd, int nvecs)
+{
+   struct nvme_dev *dev = affd->priv;
+
+   nvme_calc_io_queues(dev, nvecs);
+
+   affd->set_vectors[HCTX_TYPE_DEFAULT] = 
dev->io_queues[HCTX_TYPE_DEFAULT];
+   affd->set_vectors[HCTX_TYPE_READ] = dev->io_queues[HCTX_TYPE_READ];
+   affd->nr_sets = HCTX_TYPE_POLL;
+}
+
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
struct pci_dev *pdev = to_pci_dev(dev->dev);
struct irq_affinity affd = {
.pre_vectors = 1,
-   .nr_sets = 2,
+   .calc_sets = nvme_calc_irq_sets,
+   .priv = dev,
};
-   int *irq_sets = affd.set_vectors;
int result = 0;
unsigned int irq_queues, this_p_queues;
 
@@ -2102,50 +2113,8 @@ static int nvme_setup_irqs(struct nvme_dev *dev, 
unsigned int nr_io_queues)
}
dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
 
-   /*
-* For irq sets, we have to ask for minvec == maxvec. This passes
-* any reduction back to us, so we can adjust our queue counts and
-* IRQ vector needs.
-*/
-   do {
-   nvme_calc_io_queues(dev, irq_queues);
-   irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
-   irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
-   if (!irq_sets[1])
-   affd.nr_sets = 1;
-
-   /*
-* If we got a failure and we're down to asking for just
-* 1 + 1 queues, just ask for a single vector. We'll share
-* that between the single IO queue and the admin queue.
-* Otherwise, we assign one independent vector to admin queue.
-*/
-   if (irq_queues > 1)
-   irq_queues = irq_sets[0] + irq_sets[1] + 1;
-
-   result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
-   irq_queues,
-   PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
-
-   /*
-* Need to reduce our vec counts. If we get ENOSPC, the
-* platform should support mulitple vecs, we just need
-* to decrease our ask. If we get EINVAL, the platform
-* likely does not. Back down to ask for just one vector.
-*/
-   if (result == -ENOSPC) {
-   irq_queues--;
-   if (!irq_queues)
-   return result;
-   continue;
-   } else if (result == -EINVAL) {
-   irq_queues = 1;
-   continue;
-   } else if (result <= 0)
-   return -EIO;
-   break;
-   } while (1);
-
+   result = pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
+   PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
return result;
 }
 
@@ -3021,6 +2990,7 @@ static struct pci_driver nvme_driver = {
 
 static int __init nvme_init(void)
 {
+   BUILD_BUG_ON(HCTX_TYPE_POLL > IRQ_MAX_SETS);
return pci_register_driver(&nvme_driver);
 }
 
-- 
2.9.5



[PATCH V2 4/4] genirq/affinity: Document .calc_sets as required in case of multiple sets

2019-02-12 Thread Ming Lei
Now NVMe has implemented the .calc_sets callback for calculating each
set's vectors.

For other users of multiple irq sets, pre-calculating each set's vectors
before allocating the IRQ vectors isn't a good approach either, because
the same issue NVMe had exists there too.

Document .calc_sets as explicitly required for multiple sets.

Signed-off-by: Ming Lei 
---
 drivers/pci/msi.c | 4 ++--
 include/linux/interrupt.h | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 4c0b47867258..9f91fa713141 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1039,7 +1039,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, 
int minvec, int maxvec,
 * If the caller is passing in sets, we can't support a range of
 * vectors. The caller needs to handle that.
 */
-   if (affd && affd->nr_sets && minvec != maxvec)
+   if (affd && affd->nr_sets > 1 && !affd->calc_sets)
return -EINVAL;
 
if (WARN_ON_ONCE(dev->msi_enabled))
@@ -1097,7 +1097,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
 * If the caller is passing in sets, we can't support a range of
 * supported vectors. The caller needs to handle that.
 */
-   if (affd && affd->nr_sets && minvec != maxvec)
+   if (affd && affd->nr_sets > 1 && !affd->calc_sets)
return -EINVAL;
 
if (WARN_ON_ONCE(dev->msix_enabled))
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7a27f6ba1f2f..a053f7fb0ff1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -269,7 +269,8 @@ struct irq_affinity_notify {
  * the MSI(-X) vector space
  * @nr_sets:   Length of passed in *sets array
  * @set_vectors:   Number of affinitized sets
- * @calc_sets: Callback for caculating set vectors
+ * @calc_sets: Callback for caculating set vectors, required for
+ * multiple irq sets.
  * @priv:  Private data of @calc_sets
  */
 struct irq_affinity {
-- 
2.9.5



[PATCH V2 2/4] genirq/affinity: add new callback for caculating set vectors

2019-02-12 Thread Ming Lei
Currently pre-calculated set vectors are provided by the driver for
allocating & spreading vectors. This only works when the driver passes
the same 'max_vecs' and 'min_vecs' to pci_alloc_irq_vectors_affinity(),
and it also requires the driver to retry the allocation & spread.

As Bjorn and Keith mentioned, the current usage & interface for irq sets
is a bit awkward because the retrying should have been avoided by
providing a reasonable 'min_vecs'. However, if 'min_vecs' isn't the same
as 'max_vecs', the number of allocated vectors is unknown before calling
pci_alloc_irq_vectors_affinity(), so each set's vectors can't be
pre-calculated.

Add a new .calc_sets callback to 'struct irq_affinity' so that the driver
can calculate the set vectors after the IRQ vectors are allocated and
before they are spread. Add 'priv' so that the driver may retrieve its
private data via the 'struct irq_affinity'.

Suggested-by: Thomas Gleixner 
Signed-off-by: Ming Lei 
---
 include/linux/interrupt.h |  4 
 kernel/irq/affinity.c | 13 +
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a20150627a32..7a27f6ba1f2f 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -269,12 +269,16 @@ struct irq_affinity_notify {
  * the MSI(-X) vector space
  * @nr_sets:   Length of passed in *sets array
  * @set_vectors:   Number of affinitized sets
+ * @calc_sets: Callback for caculating set vectors
+ * @priv:  Private data of @calc_sets
  */
 struct irq_affinity {
int pre_vectors;
int post_vectors;
int nr_sets;
int set_vectors[IRQ_MAX_SETS];
+   void(*calc_sets)(struct irq_affinity *, int nvecs);
+   void*priv;
 };
 
 /**
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index a97b7c33d2db..34abba63df4d 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -264,11 +264,14 @@ irq_create_affinity_masks(int nvecs, struct irq_affinity 
*affd)
 * Spread on present CPUs starting from affd->pre_vectors. If we
 * have multiple sets, build each sets affinity mask separately.
 */
-   nr_sets = affd->nr_sets;
-   if (!nr_sets) {
+   if (affd->calc_sets) {
+   affd->calc_sets(affd, nvecs);
+   nr_sets = affd->nr_sets;
+   } else if (!affd->nr_sets) {
nr_sets = 1;
affd->set_vectors[0] = affvecs;
-   }
+   } else
+   nr_sets = affd->nr_sets;
 
for (i = 0, usedvecs = 0; i < nr_sets; i++) {
int this_vecs = affd->set_vectors[i];
@@ -314,7 +317,9 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const 
struct irq_affinity
if (resv > minvec)
return 0;
 
-   if (affd->nr_sets) {
+   if (affd->calc_sets) {
+   set_vecs = vecs;
+   } else if (affd->nr_sets) {
int i;
 
for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
-- 
2.9.5



[PATCH V2 1/4] genirq/affinity: store irq set vectors in 'struct irq_affinity'

2019-02-12 Thread Ming Lei
Currently the array of irq set vectors is provided by the driver.

irq_create_affinity_masks() can be simplified a bit by treating the
non-irq-set case as a single irq set.

So move this array into 'struct irq_affinity', and pre-define the maximum
number of sets as 4, which should be enough for normal cases.

Signed-off-by: Ming Lei 
---
 drivers/nvme/host/pci.c   |  5 ++---
 include/linux/interrupt.h |  8 +---
 kernel/irq/affinity.c | 10 ++
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 022ea1ee63f8..0086bdf80ea1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2081,12 +2081,11 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, 
unsigned int irq_queues)
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
struct pci_dev *pdev = to_pci_dev(dev->dev);
-   int irq_sets[2];
struct irq_affinity affd = {
.pre_vectors = 1,
-   .nr_sets = ARRAY_SIZE(irq_sets),
-   .sets = irq_sets,
+   .nr_sets = 2,
};
+   int *irq_sets = affd.set_vectors;
int result = 0;
unsigned int irq_queues, this_p_queues;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7c9434652f36..a20150627a32 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -259,6 +259,8 @@ struct irq_affinity_notify {
void (*release)(struct kref *ref);
 };
 
+#define IRQ_MAX_SETS  4
+
 /**
  * struct irq_affinity - Description for automatic irq affinity assignements
  * @pre_vectors:   Don't apply affinity to @pre_vectors at beginning of
@@ -266,13 +268,13 @@ struct irq_affinity_notify {
  * @post_vectors:  Don't apply affinity to @post_vectors at end of
  * the MSI(-X) vector space
  * @nr_sets:   Length of passed in *sets array
- * @sets:  Number of affinitized sets
+ * @set_vectors:   Number of affinitized sets
  */
 struct irq_affinity {
int pre_vectors;
int post_vectors;
int nr_sets;
-   int *sets;
+   int set_vectors[IRQ_MAX_SETS];
 };
 
 /**
@@ -332,7 +334,7 @@ extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify 
*notify);
 
 struct irq_affinity_desc *
-irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+irq_create_affinity_masks(int nvec, struct irq_affinity *affd);
 
 int irq_calc_affinity_vectors(int minvec, int maxvec, const struct 
irq_affinity *affd);
 
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 118b66d64a53..a97b7c33d2db 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -239,7 +239,7 @@ static int irq_build_affinity_masks(const struct 
irq_affinity *affd,
  * Returns the irq_affinity_desc pointer or NULL if allocation failed.
  */
 struct irq_affinity_desc *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+irq_create_affinity_masks(int nvecs, struct irq_affinity *affd)
 {
int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
int curvec, usedvecs;
@@ -265,11 +265,13 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
 * have multiple sets, build each sets affinity mask separately.
 */
nr_sets = affd->nr_sets;
-   if (!nr_sets)
+   if (!nr_sets) {
nr_sets = 1;
+   affd->set_vectors[0] = affvecs;
+   }
 
for (i = 0, usedvecs = 0; i < nr_sets; i++) {
-   int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+   int this_vecs = affd->set_vectors[i];
int ret;
 
ret = irq_build_affinity_masks(affd, curvec, this_vecs,
@@ -316,7 +318,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const 
struct irq_affinity
int i;
 
for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
-   set_vecs += affd->sets[i];
+   set_vecs += affd->set_vectors[i];
} else {
get_online_cpus();
set_vecs = cpumask_weight(cpu_possible_mask);
-- 
2.9.5



[PATCH V2 0/4] genirq/affinity: add .calc_sets for improving IRQ allocation & spread

2019-02-12 Thread Ming Lei
Hi,

Currently pre-calculated set vectors are provided by the driver for
allocating & spreading vectors. This only works when the driver passes
the same 'max_vecs' and 'min_vecs' to pci_alloc_irq_vectors_affinity(),
and it also requires the driver to retry the allocation & spread.

As Bjorn and Keith mentioned, the current usage & interface for irq sets
is a bit awkward because the retrying should have been avoided by
providing a reasonable 'min_vecs'. However, if 'min_vecs' isn't the same
as 'max_vecs', the number of allocated vectors is unknown before calling
pci_alloc_irq_vectors_affinity(), so each set's vectors can't be
pre-calculated.

Add a new .calc_sets callback to 'struct irq_affinity' so that the driver
can calculate the set vectors after the IRQ vectors are allocated and
before they are spread. Add 'priv' so that the driver may retrieve its
private data via the 'struct irq_affinity'.
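
To make the proposed interface concrete, below is a minimal usage sketch
assuming the fields added in this series (.calc_sets, .priv, .nr_sets and
.set_vectors[]). The 'foo' driver, its fields and its 50/50 split policy
are purely illustrative, not part of the series:

struct foo_dev {
	struct pci_dev *pdev;
	unsigned int nr_default;
	unsigned int nr_read;
};

static void foo_calc_sets(struct irq_affinity *affd, int nvecs)
{
	struct foo_dev *foo = affd->priv;
	int iovecs = nvecs - affd->pre_vectors;	/* vectors that get spread */

	if (iovecs <= 1) {
		/* too few vectors left: collapse into a single set */
		affd->nr_sets = 1;
		affd->set_vectors[0] = iovecs;
		foo->nr_default = iovecs;
		foo->nr_read = 0;
		return;
	}

	/* recompute the per-set counts for the vectors actually granted */
	affd->nr_sets = 2;
	affd->set_vectors[0] = iovecs / 2;
	affd->set_vectors[1] = iovecs - iovecs / 2;
	foo->nr_default = affd->set_vectors[0];
	foo->nr_read = affd->set_vectors[1];
}

static int foo_setup_irqs(struct foo_dev *foo, int max_vecs)
{
	struct irq_affinity affd = {
		.pre_vectors	= 1,		/* admin vector, not managed */
		.calc_sets	= foo_calc_sets,
		.priv		= foo,
	};

	/* a real min/max range works now: no driver-side retry loop */
	return pci_alloc_irq_vectors_affinity(foo->pdev, 2, max_vecs,
			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}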


V2:
    - add .calc_sets instead of .setup_affinity(), which is easy for
      drivers to abuse

Ming Lei (4):
  genirq/affinity: store irq set vectors in 'struct irq_affinity'
  genirq/affinity: add new callback for caculating set vectors
  nvme-pci: avoid irq allocation retrying via .calc_sets
  genirq/affinity: Document .calc_sets as required in case of multiple
sets

 drivers/nvme/host/pci.c   | 63 ---
 drivers/pci/msi.c |  4 +--
 include/linux/interrupt.h | 13 +++---
 kernel/irq/affinity.c | 19 +-
 4 files changed, 41 insertions(+), 58 deletions(-)

-- 
2.9.5



Re: [PATCH 2/5] genirq/affinity: allow driver to setup managed IRQ's affinity

2019-02-12 Thread Ming Lei
Hi Thomas,

On Mon, Feb 11, 2019 at 11:38:07PM +0100, Thomas Gleixner wrote:
> Ming,
> 
> On Mon, 11 Feb 2019, Bjorn Helgaas wrote:
> 
> > On Mon, Feb 11, 2019 at 11:54:00AM +0800, Ming Lei wrote:
> > > On Sun, Feb 10, 2019 at 05:30:41PM +0100, Thomas Gleixner wrote:
> > > > On Fri, 25 Jan 2019, Ming Lei wrote:
> > > > 
> > > > > This patch introduces callback of .setup_affinity into 'struct
> > > > > irq_affinity', so that:
> > > > 
> > > > Please see Documentation/process/submitting-patches.rst. Search for 
> > > > 'This
> > > > patch' 
> > > 
> > > Sorry for that, because I am not a native English speaker and it looks a 
> > > bit
> > > difficult for me to understand the subtle difference.
> 
> Sorry I was a bit terse.
> 
> > I think Thomas is saying that instead of "This patch introduces
> > callback ...", you could say "Introduce callback of ...".
> > 
> > The changelog is *part* of the patch, so the context is obvious and
> > there's no need to include the words "This patch".
> > 
> > I make the same changes to patches I receive.  In fact, I would go
> > even further and say "Add callback .setup_affinity() ..." because "add"
> > means the same as "introduce" but is shorter and simpler.
> 
> Thanks for the explanation, Bjorn!
> 
> There is another point here. It's not only the 'This patch introduces ...'
> part. It's also good practice to structure the changelog so it provides
> context and reasoning first and then tells what the change is, e.g.:
> 
>The current handling of multiple interrupt sets in the core interrupt
>affinity logic, requires the driver to do ...  This is necessary
>because 
> 
>This handling should be in the core code, but the core implementation
>has no way to recompute the interrupt sets for a given number of
>vectors.
> 
>Add an optional callback to struct affd, which lets the driver recompute
>the interrupt set before the interrupt affinity spreading takes place.
> 
> The first paragraph provides context, i.e. the status quo, The second
> paragraph provides reasoning what is missing and the last one tells what's
> done to solve it.
> 
> That's pretty much the same for all changelogs, even if you fix a bug. Just
> in the bug case, the second paragraph describes the details of the bug and
> the possible consequences.
> 
> You really need to look at the changelog as a stand alone information
> source. That's important when you look at a commit as an outsider or even
> if you look at your own patch 6 month down the road when you already paged
> out all the details.
> 
> Hope that clarifies it.

Your description of how to write a changelog is really helpful and
useful for me, thanks!

Maybe you can add it to Documentation/SubmittingPatches, so that lots of
people can benefit from the guide.


Thanks,
Ming


Re: [PATCH 4/5] nvme-pci: simplify nvme_setup_irqs() via .setup_affinity callback

2019-02-10 Thread Ming Lei
On Sun, Feb 10, 2019 at 07:49:12PM +0100, Thomas Gleixner wrote:
> On Fri, 25 Jan 2019, Ming Lei wrote:
> > +static int nvme_setup_affinity(const struct irq_affinity *affd,
> > +  struct irq_affinity_desc *masks,
> > +  unsigned int nmasks)
> > +{
> > +   struct nvme_dev *dev = affd->priv;
> > +   int affvecs = nmasks - affd->pre_vectors - affd->post_vectors;
> > +   int curvec, usedvecs;
> > +   int i;
> > +
> > +   nvme_calc_io_queues(dev, nmasks);
> 
> So this is the only NVME specific information. Everything else can be done
> in generic code. So what you really want is:
> 
>   struct affd {
>   ...
> + calc_sets(struct affd *, unsigned int nvecs);
>   ...
>   }
> 
> And sets want to be actually inside of the affinity descriptor structure:
> 
> unsigned int num_sets;
>   unsigned int set_vectors[MAX_SETS];
> 
> We surely can define a sensible maximum of sets for now. If that ever turns
> out to be insufficient, then struct affd might become to large for the
> stack, but for now, using e.g. 8, there is no need to do so.
> 
> So then the logic in the generic code becomes exactly the same as what you
> added to nvme_setup_affinity():
> 
>   if (affd->calc_sets) {
>   affd->calc_sets(affd, nvecs);
>   } else if (!affd->num_sets) {
>   affd->num_sets = 1;
>   affd->set_vectors[0] = affvecs;
>   }
> 
>   for (i = 0; i < affd->num_sets; i++) {
>   
>   }
> 
> See?

OK, I will do it this way in V2; then we can keep drivers from abusing
the callback.

Thanks,
Ming


Re: [PATCH 4/5] nvme-pci: simplify nvme_setup_irqs() via .setup_affinity callback

2019-02-10 Thread Ming Lei
On Sun, Feb 10, 2019 at 05:39:20PM +0100, Thomas Gleixner wrote:
> On Fri, 25 Jan 2019, Ming Lei wrote:
> 
> > Use the callback of .setup_affinity() to re-caculate number
> > of queues, and build irqs affinity with help of irq_build_affinity().
> > 
> > Then nvme_setup_irqs() gets simplified a lot.
> 
> I'm pretty sure you can achieve the same by reworking the core code without
> that callback.

Could you share the idea a bit? As I mentioned, the re-distribution
needs the driver's knowledge.

>  
> > +   /* Fill out vectors at the beginning that don't need affinity */
> > +   for (curvec = 0; curvec < affd->pre_vectors; curvec++)
> > +   cpumask_copy(&masks[curvec].mask, cpu_possible_mask);
> 
> cpu_possible_mask is wrong.  Why are you deliberately trying to make this
> special? There is absolutely no reason to do so.

It is just to avoid exporting 'irq_default_affinity'.

> 
> These interrupts are not managed and therefore the initial affinity has to
> be irq_default_affinity. Setting them to cpu_possible_mask can and probably
> will evade a well thought out default affinity mask, which was set to
> isolate a set of cores from general purpose interrupts.
> 
> This is exactly the thing which happens with driver special stuff and which
> needs to be avoided. There is nothing special about this NVME setup and
> yes, I can see why the current core code is a bit tedious to work with, but
> that does not justify that extra driver magic by any means.

OK, thanks for your explanation.

Thanks,
Ming


Re: [PATCH 2/5] genirq/affinity: allow driver to setup managed IRQ's affinity

2019-02-10 Thread Ming Lei
Hello Thomas,

On Sun, Feb 10, 2019 at 05:30:41PM +0100, Thomas Gleixner wrote:
> Ming,
> 
> On Fri, 25 Jan 2019, Ming Lei wrote:
> 
> > This patch introduces callback of .setup_affinity into 'struct
> > irq_affinity', so that:
> 
> Please see Documentation/process/submitting-patches.rst. Search for 'This
> patch' 

Sorry for that, because I am not a native English speaker and it looks a bit
difficult for me to understand the subtle difference.

> 
> > 
> > 1) allow drivers to customize the affinity for managed IRQ, for
> > example, now NVMe has special requirement for read queues & poll
> > queues
> 
> That's nothing new and already handled today.
> 
> > 2) 6da4b3ab9a6e9 ("genirq/affinity: Add support for allocating interrupt 
> > sets")
> > makes pci_alloc_irq_vectors_affinity() a bit difficult to use for
> > allocating interrupt sets: 'max_vecs' is required to same with 'min_vecs'.
> 
> So it's a bit difficult, but you fail to explain why it's not sufficient.

The limit introduced by commit 6da4b3ab9a6e9 is that 'max_vecs' has to be
the same as 'min_vecs' for pci_alloc_irq_vectors_affinity() in NVMe's use
case, so NVMe has to deal with irq vector allocation failure in the
awkward way of retrying.

And the topic has been discussed in the following links:

https://marc.info/?l=linux-pci&m=154655595615575&w=2
https://marc.info/?l=linux-pci&m=154646930922174&w=2

Bjorn and Keith thought this usage/interface is a bit awkward because
the passed 'min_vecs' should have made the driver's retrying unnecessary.

For NVMe, when pci_alloc_irq_vectors_affinity() runs out of irq vectors,
the requested number has to be decreased and the allocation retried until
it succeeds; the allocated irq vectors then have to be re-distributed
among the irq sets. It turns out the re-distribution needs the driver's
knowledge, which is why the callback is introduced.

> 
> > With this patch, driver can implement their own .setup_affinity to
> > customize the affinity, then the above thing can be solved easily.
> 
> Well, I don't really understand what is solved easily and you are merily
> describing the fact that the new callback allows drivers to customize
> something. What's the rationale? If it's just the 'bit difficult' part,
> then what is the reason for not making the core functionality easier to use
> instead of moving stuff into driver space again?

Another solution mentioned in the previous discussion is to split
building & setting up the affinities from allocating the irq vectors, but
one big problem is that allocating an 'irq_desc' needs the affinity mask
for figuring out its 'node'; see alloc_descs() and the sketch below.
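
Roughly (simplified, not the exact kernel code), the dependency looks
like this: the NUMA node for each descriptor is derived from the vector's
affinity mask at allocation time, so the mask has to exist before the
descriptors are allocated.

/* Simplified sketch of the alloc_descs() dependency mentioned above. */
static int node_for_vector(const struct irq_affinity_desc *desc)
{
	if (!desc)
		return NUMA_NO_NODE;

	/* the descriptor is allocated on the node of the first CPU in the mask */
	return cpu_to_node(cpumask_first(&desc->mask));
}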

> 
> NVME is not special and all this achieves is that all drivers writers will

I mean that NVMe is the only user of irq sets.

> claim that their device is special and needs its own affinity setter
> routine. The whole point of having the generic code is to exactly avoid
> that. If it has shortcomings, then they need to be addressed, but not
> worked around with random driver callbacks.

Understood.

Thanks,
Ming


Re: [PATCH 2/5] genirq/affinity: allow driver to setup managed IRQ's affinity

2019-02-10 Thread Ming Lei
On Thu, Feb 07, 2019 at 04:21:30PM -0600, Bjorn Helgaas wrote:
> On Fri, Jan 25, 2019 at 05:53:44PM +0800, Ming Lei wrote:
> > This patch introduces callback of .setup_affinity into 'struct
> > irq_affinity', so that:
> > 
> > 1) allow drivers to customize the affinity for managed IRQ, for
> > example, now NVMe has special requirement for read queues & poll
> > queues
> > 
> > 2) 6da4b3ab9a6e9 ("genirq/affinity: Add support for allocating interrupt 
> > sets")
> > makes pci_alloc_irq_vectors_affinity() a bit difficult to use for
> > allocating interrupt sets: 'max_vecs' is required to same with 'min_vecs'.
> 
> s/is required to same with/is required to be equal to/
> 
> > With this patch, driver can implement their own .setup_affinity to
> > customize the affinity, then the above thing can be solved easily.
> 
> s/driver/drivers/
> 
> > Signed-off-by: Ming Lei 
> > ---
> >  include/linux/interrupt.h | 26 +-
> >  kernel/irq/affinity.c |  6 ++
> >  2 files changed, 23 insertions(+), 9 deletions(-)
> > 
> > diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> > index c672f34235e7..f6cea778cf50 100644
> > --- a/include/linux/interrupt.h
> > +++ b/include/linux/interrupt.h
> > @@ -242,30 +242,38 @@ struct irq_affinity_notify {
> >  };
> >  
> >  /**
> > + * struct irq_affinity_desc - Interrupt affinity descriptor
> > + * @mask:  cpumask to hold the affinity assignment
> > + */
> > +struct irq_affinity_desc {
> > +   struct cpumask  mask;
> > +   unsigned intis_managed : 1;
> > +};
> 
> I was going to comment that "managed" already has a common usage
> related to the devm_*() functions, but I don't think that's what you
> mean here.  But then I noticed that you're only *moving* this code,
> so you couldn't change it anyway.
> 
> But I still wonder what "managed" means here.

A 'managed' irq's affinity is managed by the kernel, and user space can
no longer change it via /proc/irq/...
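
Roughly, the /proc side rejects the write for managed irqs; a simplified
sketch of that check (not the exact proc.c code, but irq_get_irq_data()
and irqd_affinity_is_managed() are existing helpers):

/* Sketch: user-space affinity changes are refused for managed irqs. */
static bool user_may_change_affinity(unsigned int irq)
{
	struct irq_data *d = irq_get_irq_data(irq);

	return d && !irqd_affinity_is_managed(d);
}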

> 
> > +
> > +/**
> >   * struct irq_affinity - Description for automatic irq affinity 
> > assignements
> >   * @pre_vectors:   Don't apply affinity to @pre_vectors at beginning of
> >   * the MSI(-X) vector space
> >   * @post_vectors:  Don't apply affinity to @post_vectors at end of
> >   * the MSI(-X) vector space
> > + * @setup_affinity:Use driver's method to setup irq vectors 
> > affinity,
> > + * and driver has to handle pre_vectors & 
> > post_vectors
> > + * correctly, set 'is_managed' flag correct too
> 
> s/irq vectors/irq vector/
> s/correct/correctly/
> 
> In general I don't think "correctly" is very useful in changelogs
> and comments.  Usually it just means "how *I* think it should be
> done", but it doesn't tell anybody else exactly *how* it should be
> done.

That is a good question.

So far there are at least two rules related to correctness:

1) the 'is_managed' flag needs to be set

2) for all managed irq vectors in one interrupt set of a device, all
possible CPUs should be covered by the configured affinities, and one CPU
can't be assigned to two irq vectors (see the sketch below). Interrupt
sets are currently unique to NVMe: all READ queues' interrupts belong to
one set, and the other queues belong to another set. For other devices,
we can assume there is only one set per device.
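
A hedged sketch of what checking those two rules for one interrupt set
could look like; the helper itself and the idea of adding such a check to
genirq/affinity are assumptions, while the cpumask calls are existing
kernel APIs:

static bool irq_set_affinity_is_valid(const struct irq_affinity_desc *masks,
				      unsigned int first, unsigned int nr)
{
	cpumask_var_t seen;
	unsigned int i;
	bool ok = true;

	if (!zalloc_cpumask_var(&seen, GFP_KERNEL))
		return false;

	for (i = first; i < first + nr; i++) {
		/* rule 1: every vector in the set must be managed */
		if (!masks[i].is_managed)
			ok = false;
		/* rule 2: no CPU may be assigned to two vectors of the set */
		if (cpumask_intersects(seen, &masks[i].mask))
			ok = false;
		cpumask_or(seen, seen, &masks[i].mask);
	}

	/* rule 2: all possible CPUs must be covered by the set */
	if (!cpumask_subset(cpu_possible_mask, seen))
		ok = false;

	free_cpumask_var(seen);
	return ok;
}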

> 
> What does it mean for a driver to handle pre_vectors & post_vectors
> "correctly"?  The driver's .setup_affinity() method receives an array
> of struct irq_affinity; maybe it means that method should set the
> cpumask for each element as it desires.  For @pre_vectors and
> @post_vectors, I suppose that means their cpumask would be
> irq_default_affinity?

So far the default affinity for pre_vectors & post_vectors is
irq_default_affinity, and this patch changes it to cpu_possible_mask;
that change shouldn't be a big deal from the driver's point of view.

> 
> But I guess the .setup_affinity() method means the driver would have
> complete flexibility for each vector, and it could use

Yeah, together with simplification in both driver and genirq/affinity
code.

> irq_default_affinity for arbitrary vectors, not just the first few
> (@pre_vectors) and the last few (@post_vectors)?
> 
> What's the rule for how a driver sets "is_managed"?

Please see above; I plan to enhance the rule checking in the
genirq/affinity code if this approach is accepted.

Thanks,
Ming


Re: [PATCH 0/5] genirq/affinity: introduce .setup_affinity to support allocating interrupt sets

2019-01-25 Thread Ming Lei
On Fri, Jan 25, 2019 at 05:53:42PM +0800, Ming Lei wrote:
> Hi,
> 
> The current support for allocating interrupt sets requires that same 'max_vec'
> and 'min_vec' is passed to pci_alloc_irq_vectors_affinity(), then driver has 
> to
> try to allocate again and again until it succeeds.
> 
> This patch introduces .setup_affinity callback, and we can use it to
> re-caculate interrupt sets and build affinity for each set after
> irq vectors are allocated.
> 
> Turns out both genirq/affinity and nvme code get simplified a lot.
> 
> Please review and comment!
> 
> Ming Lei (5):
>   genirq/affinity: move allocation of 'node_to_cpumask' to
> irq_build_affinity_masks
>   genirq/affinity: allow driver to setup managed IRQ's affinity
>   genirq/affinity: introduce irq_build_affinity()
>   nvme-pci: simplify nvme_setup_irqs() via .setup_affinity callback
>   genirq/affinity: remove support for allocating interrupt sets
> 
>  drivers/nvme/host/pci.c   |  97 +
>  drivers/pci/msi.c |  14 --
>  include/linux/interrupt.h |  42 --
>  kernel/irq/affinity.c | 108 
> --
>  4 files changed, 133 insertions(+), 128 deletions(-)

Oops, I forgot to CC Keith.

Thanks,
Ming


[PATCH 3/5] genirq/affinity: introduce irq_build_affinity()

2019-01-25 Thread Ming Lei
Drivers may use this API to build a customized irq affinity. One example
is NVMe, which needs to build multiple irq sets, across each of which all
CPUs are spread.

Signed-off-by: Ming Lei 
---
 include/linux/interrupt.h | 12 
 kernel/irq/affinity.c | 27 +++
 2 files changed, 39 insertions(+)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index f6cea778cf50..b820b07f3b55 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -323,6 +323,10 @@ irq_set_affinity_notifier(unsigned int irq, struct 
irq_affinity_notify *notify);
 struct irq_affinity_desc *
 irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
 
+int irq_build_affinity(const struct irq_affinity *affd, int startvec,
+  int numvecs, int firstvec,
+  struct irq_affinity_desc *masks, int nmasks);
+
 int irq_calc_affinity_vectors(int minvec, int maxvec, const struct 
irq_affinity *affd);
 
 #else /* CONFIG_SMP */
@@ -368,6 +372,14 @@ irq_calc_affinity_vectors(int minvec, int maxvec, const 
struct irq_affinity *aff
return maxvec;
 }
 
+static inline int
+irq_build_affinity(const struct irq_affinity *affd, int startvec,
+  int numvecs, int firstvec,
+  struct irq_affinity_desc *masks, int nmasks)
+{
+   return 0;
+}
+
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 7b77cbdf739c..524fdcda9f85 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -232,6 +232,33 @@ static int irq_build_affinity_masks(const struct 
irq_affinity *affd,
 }
 
 /**
+ * irq_build_affinity - build affinity masks for multiqueue spreading
+ * @affd:  Description of the affinity requirements
+ * @startvec:  The start vector for building affinity masks
+ * @numvecs:   The number of vectors needed for building affinity
+ * @firstvec:  It is the IRQ vector which we jump to for continue spread
+ * after the last vector(@startvec + @numvec - 1) is built.
+ * @masks: The mask array for storing the affinity masks
+ * @nmasks:The total number of @masks
+ *
+ * Both @startvec and @firstvec are relative to the 1st irq vector
+ * allocated to the device.
+ *
+ * Returns 0 if the affinity masks are built successfully.
+ */
+int irq_build_affinity(const struct irq_affinity *affd, int startvec,
+  int numvecs, int firstvec,
+  struct irq_affinity_desc *masks, int nmasks)
+{
+   if (startvec >= nmasks || firstvec >= nmasks || numvecs > nmasks)
+   return -EINVAL;
+
+   return irq_build_affinity_masks(affd, startvec, numvecs, firstvec,
+   masks);
+}
+EXPORT_SYMBOL_GPL(irq_build_affinity);
+
+/**
  * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
  * @nvecs: The total number of vectors
  * @affd:  Description of the affinity requirements
-- 
2.9.5



[PATCH 5/5] genirq/affinity: remove support for allocating interrupt sets

2019-01-25 Thread Ming Lei
Now that allocating interrupt sets can easily be done via
.setup_affinity(), remove the core support for allocating interrupt sets.

With this change, the 'minvec == maxvec' limit is no longer needed in
pci_alloc_irq_vectors_affinity().

Meanwhile, irq_create_affinity_masks() gets simplified a lot.

Signed-off-by: Ming Lei 
---
 drivers/pci/msi.c | 14 -
 include/linux/interrupt.h |  4 
 kernel/irq/affinity.c | 52 +++
 3 files changed, 12 insertions(+), 58 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 4c0b47867258..331483de1294 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1035,13 +1035,6 @@ static int __pci_enable_msi_range(struct pci_dev *dev, 
int minvec, int maxvec,
if (maxvec < minvec)
return -ERANGE;
 
-   /*
-* If the caller is passing in sets, we can't support a range of
-* vectors. The caller needs to handle that.
-*/
-   if (affd && affd->nr_sets && minvec != maxvec)
-   return -EINVAL;
-
if (WARN_ON_ONCE(dev->msi_enabled))
return -EINVAL;
 
@@ -1093,13 +1086,6 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
if (maxvec < minvec)
return -ERANGE;
 
-   /*
-* If the caller is passing in sets, we can't support a range of
-* supported vectors. The caller needs to handle that.
-*/
-   if (affd && affd->nr_sets && minvec != maxvec)
-   return -EINVAL;
-
if (WARN_ON_ONCE(dev->msix_enabled))
return -EINVAL;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index b820b07f3b55..a035e165f405 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -260,8 +260,6 @@ struct irq_affinity_desc {
  * and driver has to handle pre_vectors & post_vectors
  * correctly, set 'is_managed' flag correct too
  * @priv:  Private data of @setup_affinity
- * @nr_sets:   Length of passed in *sets array
- * @sets:  Number of affinitized sets
  */
 struct irq_affinity {
int pre_vectors;
@@ -270,8 +268,6 @@ struct irq_affinity {
  struct irq_affinity_desc *,
  unsigned int);
void*priv;
-   int nr_sets;
-   int *sets;
 };
 
 #if defined(CONFIG_SMP)
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 524fdcda9f85..e8fea65325d9 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -269,9 +269,9 @@ struct irq_affinity_desc *
 irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
-   int curvec, usedvecs;
+   int curvec;
struct irq_affinity_desc *masks = NULL;
-   int i, nr_sets;
+   int i;
 
/*
 * If there aren't any vectors left after applying the pre/post
@@ -293,34 +293,14 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
/* Fill out vectors at the beginning that don't need affinity */
for (curvec = 0; curvec < affd->pre_vectors; curvec++)
cpumask_copy(&masks[curvec].mask, irq_default_affinity);
-   /*
-* Spread on present CPUs starting from affd->pre_vectors. If we
-* have multiple sets, build each sets affinity mask separately.
-*/
-   nr_sets = affd->nr_sets;
-   if (!nr_sets)
-   nr_sets = 1;
-
-   for (i = 0, usedvecs = 0; i < nr_sets; i++) {
-   int this_vecs = affd->sets ? affd->sets[i] : affvecs;
-   int ret;
-
-   ret = irq_build_affinity_masks(affd, curvec, this_vecs,
-   curvec, masks);
-   if (ret) {
-   kfree(masks);
-   return NULL;
-   }
-   curvec += this_vecs;
-   usedvecs += this_vecs;
+
+   if (irq_build_affinity_masks(affd, curvec, affvecs, curvec, masks)) {
+   kfree(masks);
+   return NULL;
}
 
/* Fill out vectors at the end that don't need affinity */
-   if (usedvecs >= affvecs)
-   curvec = affd->pre_vectors + affvecs;
-   else
-   curvec = affd->pre_vectors + usedvecs;
-   for (; curvec < nvecs; curvec++)
+   for (curvec = affd->pre_vectors + affvecs; curvec < nvecs; curvec++)
cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 
/* Mark the managed interrupts */
@@ -340,21 +320,13 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, 
const struct irq_affinity
 {
	int resv = affd->pre_vectors + affd->post_vectors;

[PATCH 4/5] nvme-pci: simplify nvme_setup_irqs() via .setup_affinity callback

2019-01-25 Thread Ming Lei
Use the .setup_affinity() callback to re-calculate the number of queues,
and build the irq affinity with the help of irq_build_affinity().

nvme_setup_irqs() then gets simplified a lot.

Signed-off-by: Ming Lei 
---
 drivers/nvme/host/pci.c | 97 -
 1 file changed, 48 insertions(+), 49 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9bc585415d9b..24496de0a29b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2078,17 +2078,58 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, 
unsigned int irq_queues)
}
 }
 
+static int nvme_setup_affinity(const struct irq_affinity *affd,
+  struct irq_affinity_desc *masks,
+  unsigned int nmasks)
+{
+   struct nvme_dev *dev = affd->priv;
+   int affvecs = nmasks - affd->pre_vectors - affd->post_vectors;
+   int curvec, usedvecs;
+   int i;
+
+   nvme_calc_io_queues(dev, nmasks);
+
+   /* Fill out vectors at the beginning that don't need affinity */
+   for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+   cpumask_copy(&masks[curvec].mask, cpu_possible_mask);
+
+   for (i = 0, usedvecs = 0; i < HCTX_TYPE_POLL; i++) {
+   int this_vecs = dev->io_queues[i];
+   int ret;
+
+   if (!this_vecs)
+   break;
+
+   ret = irq_build_affinity(affd, curvec, this_vecs, curvec,
+masks, nmasks);
+   if (ret)
+   return ret;
+
+   curvec += this_vecs;
+   usedvecs += this_vecs;
+   }
+
+   /* Fill out vectors at the end that don't need affinity */
+   curvec = affd->pre_vectors + min(usedvecs, affvecs);
+   for (; curvec < nmasks; curvec++)
+   cpumask_copy(&masks[curvec].mask, cpu_possible_mask);
+
+   /* Mark the managed interrupts */
+   for (i = affd->pre_vectors; i < nmasks - affd->post_vectors; i++)
+   masks[i].is_managed = 1;
+
+   return 0;
+}
+
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
struct pci_dev *pdev = to_pci_dev(dev->dev);
-   int irq_sets[2];
struct irq_affinity affd = {
.pre_vectors = 1,
-   .nr_sets = ARRAY_SIZE(irq_sets),
-   .sets = irq_sets,
+   .setup_affinity = nvme_setup_affinity,
+   .priv = dev,
};
-   int result = 0;
-   unsigned int irq_queues, this_p_queues;
+   int result, irq_queues, this_p_queues;
 
/*
 * Poll queues don't need interrupts, but we need at least one IO
@@ -2103,50 +2144,8 @@ static int nvme_setup_irqs(struct nvme_dev *dev, 
unsigned int nr_io_queues)
}
dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
 
-   /*
-* For irq sets, we have to ask for minvec == maxvec. This passes
-* any reduction back to us, so we can adjust our queue counts and
-* IRQ vector needs.
-*/
-   do {
-   nvme_calc_io_queues(dev, irq_queues);
-   irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
-   irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
-   if (!irq_sets[1])
-   affd.nr_sets = 1;
-
-   /*
-* If we got a failure and we're down to asking for just
-* 1 + 1 queues, just ask for a single vector. We'll share
-* that between the single IO queue and the admin queue.
-* Otherwise, we assign one independent vector to admin queue.
-*/
-   if (irq_queues > 1)
-   irq_queues = irq_sets[0] + irq_sets[1] + 1;
-
-   result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
-   irq_queues,
-   PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
-
-   /*
-* Need to reduce our vec counts. If we get ENOSPC, the
-* platform should support mulitple vecs, we just need
-* to decrease our ask. If we get EINVAL, the platform
-* likely does not. Back down to ask for just one vector.
-*/
-   if (result == -ENOSPC) {
-   irq_queues--;
-   if (!irq_queues)
-   return result;
-   continue;
-   } else if (result == -EINVAL) {
-   irq_queues = 1;
-   continue;
-   } else if (result <= 0)
-   return -EIO;
-   break;
-   } while (1);
-
+   result = pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
+   PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
return result;
 }
 
-- 
2.9.5



[PATCH 1/5] genirq/affinity: move allocation of 'node_to_cpumask' to irq_build_affinity_masks

2019-01-25 Thread Ming Lei
'node_to_cpumask' is just a temporary variable for
irq_build_affinity_masks(), so move it into irq_build_affinity_masks().

No functional change.

Signed-off-by: Ming Lei 
---
 kernel/irq/affinity.c | 27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 45b68b4ea48b..118b66d64a53 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -175,18 +175,22 @@ static int __irq_build_affinity_masks(const struct 
irq_affinity *affd,
  */
 static int irq_build_affinity_masks(const struct irq_affinity *affd,
int startvec, int numvecs, int firstvec,
-   cpumask_var_t *node_to_cpumask,
struct irq_affinity_desc *masks)
 {
int curvec = startvec, nr_present, nr_others;
int ret = -ENOMEM;
cpumask_var_t nmsk, npresmsk;
+   cpumask_var_t *node_to_cpumask;
 
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
return ret;
 
if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
-   goto fail;
+   goto fail_nmsk;
+
+   node_to_cpumask = alloc_node_to_cpumask();
+   if (!node_to_cpumask)
+   goto fail_npresmsk;
 
ret = 0;
/* Stabilize the cpumasks */
@@ -217,9 +221,12 @@ static int irq_build_affinity_masks(const struct 
irq_affinity *affd,
if (nr_present < numvecs)
WARN_ON(nr_present + nr_others < numvecs);
 
+   free_node_to_cpumask(node_to_cpumask);
+
+ fail_npresmsk:
free_cpumask_var(npresmsk);
 
- fail:
+ fail_nmsk:
free_cpumask_var(nmsk);
return ret;
 }
@@ -236,7 +243,6 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
 {
int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
int curvec, usedvecs;
-   cpumask_var_t *node_to_cpumask;
struct irq_affinity_desc *masks = NULL;
int i, nr_sets;
 
@@ -247,13 +253,9 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
if (nvecs == affd->pre_vectors + affd->post_vectors)
return NULL;
 
-   node_to_cpumask = alloc_node_to_cpumask();
-   if (!node_to_cpumask)
-   return NULL;
-
masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
if (!masks)
-   goto outnodemsk;
+   return NULL;
 
/* Fill out vectors at the beginning that don't need affinity */
for (curvec = 0; curvec < affd->pre_vectors; curvec++)
@@ -271,11 +273,10 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
int ret;
 
ret = irq_build_affinity_masks(affd, curvec, this_vecs,
-   curvec, node_to_cpumask, masks);
+   curvec, masks);
if (ret) {
kfree(masks);
-   masks = NULL;
-   goto outnodemsk;
+   return NULL;
}
curvec += this_vecs;
usedvecs += this_vecs;
@@ -293,8 +294,6 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
masks[i].is_managed = 1;
 
-outnodemsk:
-   free_node_to_cpumask(node_to_cpumask);
return masks;
 }
 
-- 
2.9.5



[PATCH 0/5] genirq/affinity: introduce .setup_affinity to support allocating interrupt sets

2019-01-25 Thread Ming Lei
Hi,

The current support for allocating interrupt sets requires that the same
'max_vec' and 'min_vec' be passed to pci_alloc_irq_vectors_affinity(), so
the driver has to try the allocation again and again until it succeeds.

This patch introduces the .setup_affinity callback, which can be used to
re-calculate the interrupt sets and build the affinity for each set after
the irq vectors are allocated.

It turns out both the genirq/affinity and nvme code get simplified a lot.

Please review and comment!

Ming Lei (5):
  genirq/affinity: move allocation of 'node_to_cpumask' to
irq_build_affinity_masks
  genirq/affinity: allow driver to setup managed IRQ's affinity
  genirq/affinity: introduce irq_build_affinity()
  nvme-pci: simplify nvme_setup_irqs() via .setup_affinity callback
  genirq/affinity: remove support for allocating interrupt sets

 drivers/nvme/host/pci.c   |  97 +
 drivers/pci/msi.c |  14 --
 include/linux/interrupt.h |  42 --
 kernel/irq/affinity.c | 108 --
 4 files changed, 133 insertions(+), 128 deletions(-)

-- 
2.9.5



[PATCH 2/5] genirq/affinity: allow driver to setup managed IRQ's affinity

2019-01-25 Thread Ming Lei
This patch introduces callback of .setup_affinity into 'struct
irq_affinity', so that:

1) allow drivers to customize the affinity for managed IRQ, for
example, now NVMe has special requirement for read queues & poll
queues

2) 6da4b3ab9a6e9 ("genirq/affinity: Add support for allocating interrupt sets")
makes pci_alloc_irq_vectors_affinity() a bit difficult to use for
allocating interrupt sets: 'max_vecs' is required to same with 'min_vecs'.

With this patch, driver can implement their own .setup_affinity to
customize the affinity, then the above thing can be solved easily.

Signed-off-by: Ming Lei 
---
 include/linux/interrupt.h | 26 +-
 kernel/irq/affinity.c |  6 ++
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c672f34235e7..f6cea778cf50 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -242,30 +242,38 @@ struct irq_affinity_notify {
 };
 
 /**
+ * struct irq_affinity_desc - Interrupt affinity descriptor
+ * @mask:  cpumask to hold the affinity assignment
+ */
+struct irq_affinity_desc {
+   struct cpumask  mask;
+   unsigned intis_managed : 1;
+};
+
+/**
  * struct irq_affinity - Description for automatic irq affinity assignements
  * @pre_vectors:   Don't apply affinity to @pre_vectors at beginning of
  * the MSI(-X) vector space
  * @post_vectors:  Don't apply affinity to @post_vectors at end of
  * the MSI(-X) vector space
+ * @setup_affinity:Use driver's method to setup irq vectors affinity,
+ * and driver has to handle pre_vectors & post_vectors
+ * correctly, set 'is_managed' flag correct too
+ * @priv:  Private data of @setup_affinity
  * @nr_sets:   Length of passed in *sets array
  * @sets:  Number of affinitized sets
  */
 struct irq_affinity {
int pre_vectors;
int post_vectors;
+   int (*setup_affinity)(const struct irq_affinity *,
+ struct irq_affinity_desc *,
+ unsigned int);
+   void*priv;
int nr_sets;
int *sets;
 };
 
-/**
- * struct irq_affinity_desc - Interrupt affinity descriptor
- * @mask:  cpumask to hold the affinity assignment
- */
-struct irq_affinity_desc {
-   struct cpumask  mask;
-   unsigned intis_managed : 1;
-};
-
 #if defined(CONFIG_SMP)
 
 extern cpumask_var_t irq_default_affinity;
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 118b66d64a53..7b77cbdf739c 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -257,6 +257,12 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
if (!masks)
return NULL;
 
+   if (affd->setup_affinity) {
+   if (affd->setup_affinity(affd, masks, nvecs))
+   return NULL;
+   return masks;
+   }
+
/* Fill out vectors at the beginning that don't need affinity */
for (curvec = 0; curvec < affd->pre_vectors; curvec++)
cpumask_copy(&masks[curvec].mask, irq_default_affinity);
-- 
2.9.5



Re: fsync hangs after scsi rejected a request

2019-01-24 Thread Ming Lei
On Fri, Jan 25, 2019 at 8:45 AM Florian Stecker  wrote:
>
>
>
> On 1/22/19 4:22 AM, Ming Lei wrote:
> > On Tue, Jan 22, 2019 at 5:13 AM Florian Stecker  
> > wrote:
> >>
> >> Hi everyone,
> >>
> >> on my laptop, I am experiencing occasional hangs of applications during
> >> fsync(), which are sometimes up to 30 seconds long. I'm using a BTRFS
> >> which spans two partitions on the same SSD (one of them used to contain
> >> a Windows, but I removed it and added the partition to the BTRFS volume
> >> instead). Also, the problem only occurs when an I/O scheduler
> >> (mq-deadline) is in use. I'm running kernel version 4.20.3.
> >>
> >>   From what I understand so far, what happens is that a sync request
> >> fails in the SCSI/ATA layer, in ata_std_qc_defer(), because it is a
> >> "Non-NCQ command" and can not be queued together with other commands.
> >> This propagates up into blk_mq_dispatch_rq_list(), where the call
> >>
> >> ret = q->mq_ops->queue_rq(hctx, &bd);
> >>
> >> returns BLK_STS_DEV_RESOURCE. Later in blk_mq_dispatch_rq_list(), there
> >> is the piece of code
> >>
> >> needs_restart = blk_mq_sched_needs_restart(hctx);
> >> if (!needs_restart ||
> >>  (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
> >>  blk_mq_run_hw_queue(hctx, true);
> >> else if (needs_restart && (ret == BLK_STS_RESOURCE))
> >>  blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
> >>
> >> which restarts the queue after a delay if BLK_STS_RESOURCE was returned,
> >> but somehow not for BLK_STS_DEV_RESOURCE. Instead, nothing happens and
> >> fsync() seems to hang until some other process wants to do I/O.
> >>
> >> So if I do
> >>
> >> - else if (needs_restart && (ret == BLK_STS_RESOURCE))
> >> + else if (needs_restart && (ret == BLK_STS_RESOURCE || ret ==
> >> BLK_STS_DEV_RESOURCE))
> >>
> >> it fixes my problem. But was there a reason why BLK_STS_DEV_RESOURCE was
> >> treated differently that BLK_STS_RESOURCE here?
> >
> > Please see the comment:
> >
> > /*
> >   * BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if
> >   * device related resources are unavailable, but the driver can guarantee
> >   * that the queue will be rerun in the future once resources become
> >   * available again. This is typically the case for device specific
> >   * resources that are consumed for IO. If the driver fails allocating these
> >   * resources, we know that inflight (or pending) IO will free these
> >   * resource upon completion.
> >   *
> >   * This is different from BLK_STS_RESOURCE in that it explicitly references
> >   * a device specific resource. For resources of wider scope, allocation
> >   * failure can happen without having pending IO. This means that we can't
> >   * rely on request completions freeing these resources, as IO may not be in
> >   * flight. Examples of that are kernel memory allocations, DMA mappings, or
> >   * any other system wide resources.
> >   */
> > #define BLK_STS_DEV_RESOURCE((__force blk_status_t)13)
> >
> >>
> >> In any case, it seems wrong to me that ret is used here at all, as it
> >> just contains the return value of the last request in the list, and
> >> whether we rerun the queue should probably not only depend on the last
> >> request?
> >>
> >> Can anyone of the experts tell me whether this makes sense or I got
> >> something completely wrong?
> >
> > Sounds a bug in SCSI or ata driver.
> >
> > I remember there is hole in SCSI wrt. returning BLK_STS_DEV_RESOURCE,
> > but I never get lucky to reproduce it.
> >
> > scsi_queue_rq():
> >  ..
> >  case BLK_STS_RESOURCE:
> >  if (atomic_read(&sdev->device_busy) ||
> >  scsi_device_blocked(sdev))
> >  ret = BLK_STS_DEV_RESOURCE;
> >
>
> OK, please tell me if I understand this right now. What is supposed to
> happen is
>
> - request 1 is being processed by the device
> - request 2 (sync) fails with STS_DEV_RESOURCE because request 1 is
> still being processed
> - request 2 is put into hctx->dispatch inside blk_mq_dispatch_rq_list *
> - queue does not get rerun because of STS_DEV_RESOURCE
> - request 1 finishes
> - scsi_end_request calls blk_mq_run_hw_queues

[PATCH] block: cover another queue enter recursion via BIO_QUEUE_ENTERED

2019-01-22 Thread Ming Lei
Besides blk_queue_split(), bio_split() is used for splitting bios too, and
the remaining bio is often resubmitted to the queue via generic_make_request().
So the same queue enter recursion exists in this case too. Unfortunately
commit cd4a4ae4683dc2 doesn't help this case.

This patch covers the above case by setting BIO_QUEUE_ENTERED before calling
q->make_request_fn.

In theory the per-bio flag simulates a stack variable, so it is fine to clear
it after q->make_request_fn returns, especially since the same bio can't be
submitted from another context.
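
For reference, the resubmission pattern in question is roughly the following
(a minimal sketch of a typical bio_split() caller, not any specific driver):

        struct bio *split;

        /* split off the first chunk and resubmit the remainder */
        split = bio_split(bio, sectors, GFP_NOIO, bs);
        bio_chain(split, bio);
        generic_make_request(bio);      /* re-enters the same queue */
        bio = split;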

Fixes: cd4a4ae4683dc2 ("block: don't use blocking queue entered for recursive 
bio submits")
Cc: Tetsuo Handa 
Cc: Mike Snitzer 
Cc: NeilBrown 
Signed-off-by: Ming Lei 
---
 block/blk-core.c  | 11 +++
 block/blk-merge.c | 10 --
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3c5f61ceeb67..1ccec27d20c3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1083,7 +1083,18 @@ blk_qc_t generic_make_request(struct bio *bio)
/* Create a fresh bio_list for all subordinate requests 
*/
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
+
+   /*
+* Since we're recursing into make_request here, ensure
+* that we mark this bio as already having entered the 
queue.
+* If not, and the queue is going away, we can get stuck
+* forever on waiting for the queue reference to drop. 
But
+* that will never happen, as we're already holding a
+* reference to it.
+*/
+   bio_set_flag(bio, BIO_QUEUE_ENTERED);
ret = q->make_request_fn(q, bio);
+   bio_clear_flag(bio, BIO_QUEUE_ENTERED);
 
/* sort new bios into those for a lower level
 * and those for the same level
diff --git a/block/blk-merge.c b/block/blk-merge.c
index b990853f6de7..8777e286bd3f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -339,16 +339,6 @@ void blk_queue_split(struct request_queue *q, struct bio 
**bio)
/* there isn't chance to merge the splitted bio */
split->bi_opf |= REQ_NOMERGE;
 
-   /*
-* Since we're recursing into make_request here, ensure
-* that we mark this bio as already having entered the queue.
-* If not, and the queue is going away, we can get stuck
-* forever on waiting for the queue reference to drop. But
-* that will never happen, as we're already holding a
-* reference to it.
-*/
-   bio_set_flag(*bio, BIO_QUEUE_ENTERED);
-
bio_chain(split, *bio);
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
generic_make_request(*bio);
-- 
2.9.5



Re: [PATCH 3/4] dm: fix missing bio_split() pattern code in __split_and_process_bio()

2019-01-21 Thread Ming Lei
On Mon, Jan 21, 2019 at 10:35:11PM -0500, Mike Snitzer wrote:
> On Mon, Jan 21 2019 at 10:17pm -0500,
> Mike Snitzer  wrote:
> 
> > On Mon, Jan 21 2019 at  9:46pm -0500,
> > Ming Lei  wrote:
> > 
> > > On Mon, Jan 21, 2019 at 11:02:04AM -0500, Mike Snitzer wrote:
> > > > On Sun, Jan 20 2019 at 10:21P -0500,
> > > > Ming Lei  wrote:
> > > > 
> > > > > On Sat, Jan 19, 2019 at 01:05:05PM -0500, Mike Snitzer wrote:
> > > > > > Use the same BIO_QUEUE_ENTERED pattern that was established by 
> > > > > > commit
> > > > > > cd4a4ae4683dc ("block: don't use blocking queue entered for 
> > > > > > recursive
> > > > > > bio submits") by setting BIO_QUEUE_ENTERED after bio_split() and 
> > > > > > before
> > > > > > recursing via generic_make_request().
> > > > > > 
> > > > > > Also add trace_block_split() because it provides useful context 
> > > > > > about
> > > > > > bio splits in blktrace.
> > > > > > 
> > > > > > Depends-on: cd4a4ae4683dc ("block: don't use blocking queue entered 
> > > > > > for recursive bio submits")
> > > > > > Fixes: 18a25da84354 ("dm: ensure bio submission follows a 
> > > > > > depth-first tree walk")
> > > > > > Cc: sta...@vger.kernel.org # 4.16+
> > > > > > Signed-off-by: Mike Snitzer 
> > > > > > ---
> > > > > >  drivers/md/dm.c | 2 ++
> > > > > >  1 file changed, 2 insertions(+)
> > > > > > 
> > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > > > > > index fbadda68e23b..6e29c2d99b99 100644
> > > > > > --- a/drivers/md/dm.c
> > > > > > +++ b/drivers/md/dm.c
> > > > > > @@ -1654,7 +1654,9 @@ static blk_qc_t 
> > > > > > __split_and_process_bio(struct mapped_device *md,
> > > > > >
> > > > > > sectors[op_stat_group(bio_op(bio))], ci.sector_count);
> > > > > > part_stat_unlock();
> > > > > >  
> > > > > > +   bio_set_flag(bio, BIO_QUEUE_ENTERED);
> > > > > > bio_chain(b, bio);
> > > > > > +   trace_block_split(md->queue, b, 
> > > > > > bio->bi_iter.bi_sector);
> > > > > > ret = generic_make_request(bio);
> > > > > > break;
> > > > > > }
> > > > > 
> > > > > In theory, BIO_QUEUE_ENTERED is only required when 
> > > > > __split_and_process_bio() is
> > > > > called from generic_make_request(). However, it may be called from 
> > > > > dm_wq_work(),
> > > > > this way might cause trouble on operation to q->q_usage_counter.
> > > > 
> > > > Good point, I've tweaked this patch to clear BIO_QUEUE_ENTERED in
> > > > dm_make_request().
> > > > 
> > > > And to Neil's point: yes, these changes really do need to made
> > > > common since it appears all bio_split() callers do go on to call
> > > > generic_make_request().
> > > > 
> > > > Anyway, here is the updated patch that is now staged in linux-next:
> > > > 
> > > > From: Mike Snitzer 
> > > > Date: Fri, 18 Jan 2019 01:21:11 -0500
> > > > Subject: [PATCH v2] dm: fix missing bio_split() pattern code in 
> > > > __split_and_process_bio()
> > > > 
> > > > Use the same BIO_QUEUE_ENTERED pattern that was established by commit
> > > > cd4a4ae4683dc ("block: don't use blocking queue entered for recursive
> > > > bio submits") by setting BIO_QUEUE_ENTERED after bio_split() and before
> > > > recursing via generic_make_request().
> > > > 
> > > > Also add trace_block_split() because it provides useful context about
> > > > bio splits in blktrace.
> > > > 
> > > > Depends-on: cd4a4ae4683dc ("block: don't use blocking queue entered for 
> > > > recursive bio submits")
> > > > Fixes: 18a25da84354 ("dm: ensure bio submission follows a depth-first 
> > &

Re: fsync hangs after scsi rejected a request

2019-01-21 Thread Ming Lei
On Tue, Jan 22, 2019 at 5:13 AM Florian Stecker  wrote:
>
> Hi everyone,
>
> on my laptop, I am experiencing occasional hangs of applications during
> fsync(), which are sometimes up to 30 seconds long. I'm using a BTRFS
> which spans two partitions on the same SSD (one of them used to contain
> a Windows, but I removed it and added the partition to the BTRFS volume
> instead). Also, the problem only occurs when an I/O scheduler
> (mq-deadline) is in use. I'm running kernel version 4.20.3.
>
>  From what I understand so far, what happens is that a sync request
> fails in the SCSI/ATA layer, in ata_std_qc_defer(), because it is a
> "Non-NCQ command" and can not be queued together with other commands.
> This propagates up into blk_mq_dispatch_rq_list(), where the call
>
> ret = q->mq_ops->queue_rq(hctx, &bd);
>
> returns BLK_STS_DEV_RESOURCE. Later in blk_mq_dispatch_rq_list(), there
> is the piece of code
>
> needs_restart = blk_mq_sched_needs_restart(hctx);
> if (!needs_restart ||
> (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
> blk_mq_run_hw_queue(hctx, true);
> else if (needs_restart && (ret == BLK_STS_RESOURCE))
> blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
>
> which restarts the queue after a delay if BLK_STS_RESOURCE was returned,
> but somehow not for BLK_STS_DEV_RESOURCE. Instead, nothing happens and
> fsync() seems to hang until some other process wants to do I/O.
>
> So if I do
>
> - else if (needs_restart && (ret == BLK_STS_RESOURCE))
> + else if (needs_restart && (ret == BLK_STS_RESOURCE || ret ==
> BLK_STS_DEV_RESOURCE))
>
> it fixes my problem. But was there a reason why BLK_STS_DEV_RESOURCE was
> treated differently that BLK_STS_RESOURCE here?

Please see the comment:

/*
 * BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if
 * device related resources are unavailable, but the driver can guarantee
 * that the queue will be rerun in the future once resources become
 * available again. This is typically the case for device specific
 * resources that are consumed for IO. If the driver fails allocating these
 * resources, we know that inflight (or pending) IO will free these
 * resource upon completion.
 *
 * This is different from BLK_STS_RESOURCE in that it explicitly references
 * a device specific resource. For resources of wider scope, allocation
 * failure can happen without having pending IO. This means that we can't
 * rely on request completions freeing these resources, as IO may not be in
 * flight. Examples of that are kernel memory allocations, DMA mappings, or
 * any other system wide resources.
 */
#define BLK_STS_DEV_RESOURCE((__force blk_status_t)13)

>
> In any case, it seems wrong to me that ret is used here at all, as it
> just contains the return value of the last request in the list, and
> whether we rerun the queue should probably not only depend on the last
> request?
>
> Can anyone of the experts tell me whether this makes sense or I got
> something completely wrong?

Sounds like a bug in the SCSI or ATA driver.

I remember there is a hole in SCSI wrt. returning BLK_STS_DEV_RESOURCE,
but I never got lucky enough to reproduce it.

scsi_queue_rq():
..
case BLK_STS_RESOURCE:
if (atomic_read(&sdev->device_busy) ||
scsi_device_blocked(sdev))
ret = BLK_STS_DEV_RESOURCE;

All in-flight requests may complete between reading 'sdev->device_busy'
and setting ret to 'BLK_STS_DEV_RESOURCE', and then this IO hang may
be triggered.
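
A rough timeline of the suspected window, sketched from the description
above (not taken from an actual trace):

        scsi_queue_rq() on CPU0                 completion path on CPU1
        -----------------------                 -----------------------
        queue_rq() fails with BLK_STS_RESOURCE
        atomic_read(&sdev->device_busy) != 0
                                                last in-flight request completes,
                                                reruns the hw queue (nothing to
                                                dispatch there yet)
        ret = BLK_STS_DEV_RESOURCE
        request stays in hctx->dispatch and the queue is never rerun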

Thanks,
Ming Lei


Re: [PATCH 3/4] dm: fix missing bio_split() pattern code in __split_and_process_bio()

2019-01-21 Thread Ming Lei
On Mon, Jan 21, 2019 at 11:02:04AM -0500, Mike Snitzer wrote:
> On Sun, Jan 20 2019 at 10:21P -0500,
> Ming Lei  wrote:
> 
> > On Sat, Jan 19, 2019 at 01:05:05PM -0500, Mike Snitzer wrote:
> > > Use the same BIO_QUEUE_ENTERED pattern that was established by commit
> > > cd4a4ae4683dc ("block: don't use blocking queue entered for recursive
> > > bio submits") by setting BIO_QUEUE_ENTERED after bio_split() and before
> > > recursing via generic_make_request().
> > > 
> > > Also add trace_block_split() because it provides useful context about
> > > bio splits in blktrace.
> > > 
> > > Depends-on: cd4a4ae4683dc ("block: don't use blocking queue entered for 
> > > recursive bio submits")
> > > Fixes: 18a25da84354 ("dm: ensure bio submission follows a depth-first 
> > > tree walk")
> > > Cc: sta...@vger.kernel.org # 4.16+
> > > Signed-off-by: Mike Snitzer 
> > > ---
> > >  drivers/md/dm.c | 2 ++
> > >  1 file changed, 2 insertions(+)
> > > 
> > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > > index fbadda68e23b..6e29c2d99b99 100644
> > > --- a/drivers/md/dm.c
> > > +++ b/drivers/md/dm.c
> > > @@ -1654,7 +1654,9 @@ static blk_qc_t __split_and_process_bio(struct 
> > > mapped_device *md,
> > >  
> > > sectors[op_stat_group(bio_op(bio))], ci.sector_count);
> > >   part_stat_unlock();
> > >  
> > > + bio_set_flag(bio, BIO_QUEUE_ENTERED);
> > >   bio_chain(b, bio);
> > > + trace_block_split(md->queue, b, 
> > > bio->bi_iter.bi_sector);
> > >   ret = generic_make_request(bio);
> > >   break;
> > >   }
> > 
> > In theory, BIO_QUEUE_ENTERED is only required when 
> > __split_and_process_bio() is
> > called from generic_make_request(). However, it may be called from 
> > dm_wq_work(),
> > this way might cause trouble on operation to q->q_usage_counter.
> 
> Good point, I've tweaked this patch to clear BIO_QUEUE_ENTERED in
> dm_make_request().
> 
> And to Neil's point: yes, these changes really do need to made
> common since it appears all bio_split() callers do go on to call
> generic_make_request().
> 
> Anyway, here is the updated patch that is now staged in linux-next:
> 
> From: Mike Snitzer 
> Date: Fri, 18 Jan 2019 01:21:11 -0500
> Subject: [PATCH v2] dm: fix missing bio_split() pattern code in 
> __split_and_process_bio()
> 
> Use the same BIO_QUEUE_ENTERED pattern that was established by commit
> cd4a4ae4683dc ("block: don't use blocking queue entered for recursive
> bio submits") by setting BIO_QUEUE_ENTERED after bio_split() and before
> recursing via generic_make_request().
> 
> Also add trace_block_split() because it provides useful context about
> bio splits in blktrace.
> 
> Depends-on: cd4a4ae4683dc ("block: don't use blocking queue entered for 
> recursive bio submits")
> Fixes: 18a25da84354 ("dm: ensure bio submission follows a depth-first tree 
> walk")
> Cc: sta...@vger.kernel.org # 4.16+
> Signed-off-by: Mike Snitzer 
> ---
>  drivers/md/dm.c | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index fbadda68e23b..25884f833a32 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1654,7 +1654,9 @@ static blk_qc_t __split_and_process_bio(struct 
> mapped_device *md,
>  
> sectors[op_stat_group(bio_op(bio))], ci.sector_count);
>   part_stat_unlock();
>  
> + bio_set_flag(bio, BIO_QUEUE_ENTERED);
>   bio_chain(b, bio);
> + trace_block_split(md->queue, b, 
> bio->bi_iter.bi_sector);
>   ret = generic_make_request(bio);
>   break;
>   }
> @@ -1734,6 +1736,13 @@ static blk_qc_t dm_make_request(struct request_queue 
> *q, struct bio *bio)
>  
>   map = dm_get_live_table(md, &srcu_idx);
>  
> + /*
> +  * Clear the bio-reentered-generic_make_request() flag,
> +  * will be set again as needed if bio needs to be split.
> +  */
> + if (bio_flagged(bio, BIO_QUEUE_ENTERED))
> + bio_clear_flag(bio, BIO_QUEUE_ENTERED);
> +
>   /

Re: [PATCH V14 00/18] block: support multi-page bvec

2019-01-21 Thread Ming Lei
On Mon, Jan 21, 2019 at 01:43:21AM -0800, Sagi Grimberg wrote:
> 
> > V14:
> > - drop patch(patch 4 in V13) for renaming bvec helpers, as suggested by 
> > Jens
> > - use mp_bvec_* as multi-page bvec helper name
> > - fix one build issue, which is caused by missing one converion of
> > bio_for_each_segment_all in fs/gfs2
> > - fix one 32bit ARCH specific issue caused by segment boundary mask
> > overflow
> 
> Hey Ming,
> 
> So is nvme-tcp also affected here? The only point where I see nvme-tcp
> can be affected is when initializing a bvec iter using bio_segments() as
> everywhere else we use iters which should transparently work..
> 
> I see that loop was converted, does it mean that nvme-tcp needs to
> call something like?
> --
>   bio_for_each_mp_bvec(bv, bio, iter)
>   nr_bvecs++;

bio_for_each_segment()/bio_segments() still work, just not as efficiently
as bio_for_each_mp_bvec(), given that each multi-page bvec (very similar to
a scatterlist entry) is returned in each loop iteration.

I haven't looked at the nvme-tcp code yet. But if nvme-tcp can work this way,
it can benefit from bio_for_each_mp_bvec().
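
For reference, a minimal sketch of the two counting styles with the helpers
added in this series:

        struct bio_vec bv;
        struct bvec_iter iter;
        unsigned int nr_segs = 0, nr_bvecs = 0;

        bio_for_each_segment(bv, bio, iter)     /* single-page segments */
                nr_segs++;

        bio_for_each_mp_bvec(bv, bio, iter)     /* multi-page bvecs, <= nr_segs */
                nr_bvecs++;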

Thanks,
Ming


Re: [PATCH V14 00/18] block: support multi-page bvec

2019-01-21 Thread Ming Lei
On Mon, Jan 21, 2019 at 09:38:10AM +0100, Christoph Hellwig wrote:
> On Mon, Jan 21, 2019 at 04:37:12PM +0800, Ming Lei wrote:
> > On Mon, Jan 21, 2019 at 09:22:46AM +0100, Christoph Hellwig wrote:
> > > On Mon, Jan 21, 2019 at 04:17:47PM +0800, Ming Lei wrote:
> > > > V14:
> > > > - drop patch(patch 4 in V13) for renaming bvec helpers, as 
> > > > suggested by Jens
> > > > - use mp_bvec_* as multi-page bvec helper name
> > > 
> > > WTF?  Where is this coming from?  mp is just a nightmare of a name,
> > > and I also didn't see any comments like that.
> > 
> > You should see the recent discussion in which Jens doesn't agree on
> > renaming bvec helper name, so the previous patch of 'block: rename bvec 
> > helpers'
> 
> Where is that discussion?

https://marc.info/?l=linux-next&m=154777954232109&w=2


Thanks,
Ming


Re: [PATCH V14 00/18] block: support multi-page bvec

2019-01-21 Thread Ming Lei
On Mon, Jan 21, 2019 at 09:22:46AM +0100, Christoph Hellwig wrote:
> On Mon, Jan 21, 2019 at 04:17:47PM +0800, Ming Lei wrote:
> > V14:
> > - drop patch(patch 4 in V13) for renaming bvec helpers, as suggested by 
> > Jens
> > - use mp_bvec_* as multi-page bvec helper name
> 
> WTF?  Where is this coming from?  mp is just a nightmare of a name,
> and I also didn't see any comments like that.

You should see the recent discussion in which Jens doesn't agree on
renaming the bvec helpers, so the previous patch 'block: rename bvec helpers'
has to be dropped, and we have to find a new name for the multi-page bvec
helpers.

Jens mentioned we can do the rename in the next release, after the whole
patchset is merged first, so I think mp_bvec_* should be fine since
its lifetime is short.

Or do you have another suggestion?

Thanks,
Ming


[PATCH V14 09/18] fs/buffer.c: use bvec iterator to truncate the bio

2019-01-21 Thread Ming Lei
Once multi-page bvec is enabled, the last bvec may include more than one
page, so this patch uses mp_bvec_last_segment() to truncate the bio.

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 fs/buffer.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 52d024bfdbc1..817871274c77 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3032,7 +3032,10 @@ void guard_bio_eod(int op, struct bio *bio)
 
/* ..and clear the end of the buffer for reads */
if (op == REQ_OP_READ) {
-   zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+   struct bio_vec bv;
+
+   mp_bvec_last_segment(bvec, &bv);
+   zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
truncated_bytes);
}
 }
-- 
2.9.5



[PATCH V14 17/18] block: kill QUEUE_FLAG_NO_SG_MERGE

2019-01-21 Thread Ming Lei
Since bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting"),
the physical segment number is mainly figured out in blk_queue_split() for
the fast path, and the BIO_SEG_VALID flag is set there too.

Now only blk_recount_segments() and blk_recalc_rq_segments() use this
flag.

Basically blk_recount_segments() is bypassed in the fast path given that
BIO_SEG_VALID is set in blk_queue_split().

As for the other user, blk_recalc_rq_segments(), it:

- runs in the partial completion branch of blk_update_request(), which is
an unusual case

- runs in blk_cloned_rq_check_limits(); still not a big problem if the flag
is killed, since dm-rq is the only user

Multi-page bvec is enabled now, and not doing S/G merging is rather pointless
with the current setup of the I/O path, as it isn't going to save a
significant amount of cycles.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-merge.c  | 31 ++-
 block/blk-mq-debugfs.c |  1 -
 block/blk-mq.c |  3 ---
 drivers/md/dm-table.c  | 13 -
 include/linux/blkdev.h |  1 -
 5 files changed, 6 insertions(+), 43 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8a498f29636f..b990853f6de7 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -358,8 +358,7 @@ void blk_queue_split(struct request_queue *q, struct bio 
**bio)
 EXPORT_SYMBOL(blk_queue_split);
 
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
-struct bio *bio,
-bool no_sg_merge)
+struct bio *bio)
 {
struct bio_vec bv, bvprv = { NULL };
int prev = 0;
@@ -385,13 +384,6 @@ static unsigned int __blk_recalc_rq_segments(struct 
request_queue *q,
nr_phys_segs = 0;
for_each_bio(bio) {
bio_for_each_mp_bvec(bv, bio, iter) {
-   /*
-* If SG merging is disabled, each bio vector is
-* a segment
-*/
-   if (no_sg_merge)
-   goto new_segment;
-
if (prev) {
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
@@ -421,27 +413,16 @@ static unsigned int __blk_recalc_rq_segments(struct 
request_queue *q,
 
 void blk_recalc_rq_segments(struct request *rq)
 {
-   bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
-   &rq->q->queue_flags);
-
-   rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
-   no_sg_merge);
+   rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-   unsigned short seg_cnt = bio_segments(bio);
-
-   if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
-   (seg_cnt < queue_max_segments(q)))
-   bio->bi_phys_segments = seg_cnt;
-   else {
-   struct bio *nxt = bio->bi_next;
+   struct bio *nxt = bio->bi_next;
 
-   bio->bi_next = NULL;
-   bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
-   bio->bi_next = nxt;
-   }
+   bio->bi_next = NULL;
+   bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+   bio->bi_next = nxt;
 
bio_set_flag(bio, BIO_SEG_VALID);
 }
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 90d68760af08..2f9a11ef5bad 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -128,7 +128,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
-   QUEUE_FLAG_NAME(NO_SG_MERGE),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3ba37b9e15e9..fa45817a7e62 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2829,9 +2829,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct 
blk_mq_tag_set *set,
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
 
-   if (!(set->flags & BLK_MQ_F_SG_MERGE))
-   blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-
q->sg_reserved_size = INT_MAX;
 
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4b1be754cc41..ba9481f1bf3c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1698,14 +1698,6 @@ static int device_is_not_random(struct dm_target *ti, 
struct dm_dev *dev,
return q && !blk_queue_add_random(q);
 }
 
-static 

[PATCH V14 16/18] block: document usage of bio iterator helpers

2019-01-21 Thread Ming Lei
Now that multi-page bvec is supported, some helpers may return data page by
page, while others may return it segment by segment. This patch documents
the usage.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 Documentation/block/biovecs.txt | 25 +
 1 file changed, 25 insertions(+)

diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt
index 25689584e6e0..4a466bcb9611 100644
--- a/Documentation/block/biovecs.txt
+++ b/Documentation/block/biovecs.txt
@@ -117,3 +117,28 @@ Other implications:
size limitations and the limitations of the underlying devices. Thus
there's no need to define ->merge_bvec_fn() callbacks for individual block
drivers.
+
+Usage of helpers:
+=
+
+* The following helpers whose names have the suffix of "_all" can only be used
+on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
+shouldn't use them because the bio may have been split before it reached the
+driver.
+
+   bio_for_each_segment_all()
+   bio_first_bvec_all()
+   bio_first_page_all()
+   bio_last_bvec_all()
+
+* The following helpers iterate over single-page segment. The passed 'struct
+bio_vec' will contain a single-page IO vector during the iteration
+
+   bio_for_each_segment()
+   bio_for_each_segment_all()
+
+* The following helpers iterate over multi-page bvec. The passed 'struct
+bio_vec' will contain a multi-page IO vector during the iteration
+
+   bio_for_each_mp_bvec()
+   rq_for_each_mp_bvec()
-- 
2.9.5



[PATCH V14 18/18] block: kill BLK_MQ_F_SG_MERGE

2019-01-21 Thread Ming Lei
QUEUE_FLAG_NO_SG_MERGE has been killed, so kill BLK_MQ_F_SG_MERGE too.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-mq-debugfs.c   | 1 -
 drivers/block/loop.c | 2 +-
 drivers/block/nbd.c  | 2 +-
 drivers/block/rbd.c  | 2 +-
 drivers/block/skd_main.c | 1 -
 drivers/block/xen-blkfront.c | 2 +-
 drivers/md/dm-rq.c   | 2 +-
 drivers/mmc/core/queue.c | 3 +--
 drivers/scsi/scsi_lib.c  | 2 +-
 include/linux/blk-mq.h   | 1 -
 10 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 2f9a11ef5bad..2ba0aa05ce13 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -250,7 +250,6 @@ static const char *const alloc_policy_name[] = {
 static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_SHARED),
-   HCTX_FLAG_NAME(SG_MERGE),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
 };
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 168a151aba49..714924cfb050 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1937,7 +1937,7 @@ static int loop_add(struct loop_device **l, int i)
lo->tag_set.queue_depth = 128;
lo->tag_set.numa_node = NUMA_NO_NODE;
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
lo->tag_set.driver_data = lo;
 
err = blk_mq_alloc_tag_set(&lo->tag_set);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 08696f5f00bb..999c94de78e5 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1570,7 +1570,7 @@ static int nbd_dev_add(int index)
nbd->tag_set.numa_node = NUMA_NO_NODE;
nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
-   BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
+   BLK_MQ_F_BLOCKING;
nbd->tag_set.driver_data = nbd;
 
err = blk_mq_alloc_tag_set(&nbd->tag_set);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 1e92b61d0bd5..abe9e1c89227 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3988,7 +3988,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->tag_set.ops = &rbd_mq_ops;
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
-   rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
rbd_dev->tag_set.nr_hw_queues = 1;
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
 
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index ab893a7571a2..7d3ad6c22ee5 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -2843,7 +2843,6 @@ static int skd_cons_disk(struct skd_device *skdev)
skdev->sgs_per_request * sizeof(struct scatterlist);
skdev->tag_set.numa_node = NUMA_NO_NODE;
skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
-   BLK_MQ_F_SG_MERGE |
BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO);
skdev->tag_set.driver_data = skdev;
rc = blk_mq_alloc_tag_set(&skdev->tag_set);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0ed4b200fa58..d43a5677ccbc 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -977,7 +977,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 
sector_size,
} else
info->tag_set.queue_depth = BLK_RING_SIZE(info);
info->tag_set.numa_node = NUMA_NO_NODE;
-   info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
info->tag_set.cmd_size = sizeof(struct blkif_req);
info->tag_set.driver_data = info;
 
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 4eb5f8c56535..b2f8eb2365ee 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -527,7 +527,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, 
struct dm_table *t)
md->tag_set->ops = &dm_mq_ops;
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
md->tag_set->numa_node = md->numa_node_id;
-   md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
md->tag_set->driver_data = md;
 
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 35cc138b096d..cc19e71c71d4 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -410,8 +410,7 @@ int m

[PATCH V14 13/18] block: allow bio_for_each_segment_all() to iterate over multi-page bvec

2019-01-21 Thread Ming Lei
This patch introduces one extra iterator variable to bio_for_each_segment_all(),
so that bio_for_each_segment_all() can iterate over multi-page bvecs.

Given it is just a mechanical & simple change on all bio_for_each_segment_all()
users, this patch does the tree-wide change in one single patch, so that we can
avoid using a temporary helper for this conversion.
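
The conversion pattern is the same everywhere; a minimal sketch, where
do_something() is just a placeholder:

        struct bio_vec *bvec;
        int i;
        struct bvec_iter_all iter_all;  /* new per-iteration state */

        bio_for_each_segment_all(bvec, bio, i, iter_all)
                do_something(bvec->bv_page);    /* still sees single-page bvecs */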

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 block/bio.c   | 27 ++-
 block/bounce.c|  6 --
 drivers/md/bcache/btree.c |  3 ++-
 drivers/md/dm-crypt.c |  3 ++-
 drivers/md/raid1.c|  3 ++-
 drivers/staging/erofs/data.c  |  3 ++-
 drivers/staging/erofs/unzip_vle.c |  3 ++-
 fs/block_dev.c|  6 --
 fs/btrfs/compression.c|  3 ++-
 fs/btrfs/disk-io.c|  3 ++-
 fs/btrfs/extent_io.c  |  9 ++---
 fs/btrfs/inode.c  |  6 --
 fs/btrfs/raid56.c |  3 ++-
 fs/crypto/bio.c   |  3 ++-
 fs/direct-io.c|  4 +++-
 fs/exofs/ore.c|  3 ++-
 fs/exofs/ore_raid.c   |  3 ++-
 fs/ext4/page-io.c |  3 ++-
 fs/ext4/readpage.c|  3 ++-
 fs/f2fs/data.c|  9 ++---
 fs/gfs2/lops.c|  9 ++---
 fs/gfs2/meta_io.c |  3 ++-
 fs/iomap.c|  6 --
 fs/mpage.c|  3 ++-
 fs/xfs/xfs_aops.c |  5 +++--
 include/linux/bio.h   | 11 +--
 include/linux/bvec.h  | 30 ++
 27 files changed, 127 insertions(+), 46 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 4db1008309ed..968b12fea564 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1072,8 +1072,9 @@ static int bio_copy_from_iter(struct bio *bio, struct 
iov_iter *iter)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
 
ret = copy_page_from_iter(bvec->bv_page,
@@ -1103,8 +1104,9 @@ static int bio_copy_to_iter(struct bio *bio, struct 
iov_iter iter)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
 
ret = copy_page_to_iter(bvec->bv_page,
@@ -1126,8 +1128,9 @@ void bio_free_pages(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i)
+   bio_for_each_segment_all(bvec, bio, i, iter_all)
__free_page(bvec->bv_page);
 }
 EXPORT_SYMBOL(bio_free_pages);
@@ -1295,6 +1298,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
struct bio *bio;
int ret;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
if (!iov_iter_count(iter))
return ERR_PTR(-EINVAL);
@@ -1368,7 +1372,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
return bio;
 
  out_unmap:
-   bio_for_each_segment_all(bvec, bio, j) {
+   bio_for_each_segment_all(bvec, bio, j, iter_all) {
put_page(bvec->bv_page);
}
bio_put(bio);
@@ -1379,11 +1383,12 @@ static void __bio_unmap_user(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
/*
 * make sure we dirty pages we wrote to
 */
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (bio_data_dir(bio) == READ)
set_page_dirty_lock(bvec->bv_page);
 
@@ -1475,8 +1480,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
char *p = bio->bi_private;
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
p += bvec->bv_len;
}
@@ -1585,8 +1591,9 @@ void bio_set_pages_dirty(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
}
@@ -1596,8 +1603,9 @@ static void bio_release_pages(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 

[PATCH V14 15/18] block: always define BIO_MAX_PAGES as 256

2019-01-21 Thread Ming Lei
Now multi-page bvec can cover CONFIG_THP_SWAP, so we don't need to
increase BIO_MAX_PAGES for it.

CONFIG_THP_SWAP needs to split one THP into normal pages and add
them all to one bio. With multi-page bvec, it just takes one bvec to
hold them all.
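
For example, assuming 4K pages and a 2MB THP (HPAGE_PMD_NR = 512), the old
code had to raise BIO_MAX_PAGES to 512 so that the 512 single-page bvecs of
one THP could fit in a single bio; with multi-page bvec, the physically
contiguous THP fits in one bvec, so the default of 256 is enough.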

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 include/linux/bio.h | 8 
 1 file changed, 8 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index af288f6e8ab0..1d279a6ae737 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -34,15 +34,7 @@
 #define BIO_BUG_ON
 #endif
 
-#ifdef CONFIG_THP_SWAP
-#if HPAGE_PMD_NR > 256
-#define BIO_MAX_PAGES  HPAGE_PMD_NR
-#else
 #define BIO_MAX_PAGES  256
-#endif
-#else
-#define BIO_MAX_PAGES  256
-#endif
 
 #define bio_prio(bio)  (bio)->bi_ioprio
 #define bio_set_prio(bio, prio)((bio)->bi_ioprio = prio)
-- 
2.9.5



[PATCH V14 11/18] block: loop: pass multi-page bvec to iov_iter

2019-01-21 Thread Ming Lei
iov_iter is implemented on top of the bvec iterator helpers, so it is safe to
pass multi-page bvecs to it, and this way is much more efficient than passing
one page in each bvec.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 drivers/block/loop.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cf5538942834..168a151aba49 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -511,21 +511,22 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
 loff_t pos, bool rw)
 {
struct iov_iter iter;
+   struct req_iterator rq_iter;
struct bio_vec *bvec;
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct bio *bio = rq->bio;
struct file *file = lo->lo_backing_file;
+   struct bio_vec tmp;
unsigned int offset;
-   int segments = 0;
+   int nr_bvec = 0;
int ret;
 
+   rq_for_each_mp_bvec(tmp, rq, rq_iter)
+   nr_bvec++;
+
if (rq->bio != rq->biotail) {
-   struct req_iterator iter;
-   struct bio_vec tmp;
 
-   __rq_for_each_bio(bio, rq)
-   segments += bio_segments(bio);
-   bvec = kmalloc_array(segments, sizeof(struct bio_vec),
+   bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
 GFP_NOIO);
if (!bvec)
return -EIO;
@@ -534,10 +535,10 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
/*
 * The bios of the request may be started from the middle of
 * the 'bvec' because of bio splitting, so we can't directly
-* copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
+* copy bio->bi_iov_vec to new bvec. The rq_for_each_mp_bvec
 * API will take care of all details for us.
 */
-   rq_for_each_segment(tmp, rq, iter) {
+   rq_for_each_mp_bvec(tmp, rq, rq_iter) {
*bvec = tmp;
bvec++;
}
@@ -551,11 +552,10 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
 */
offset = bio->bi_iter.bi_bvec_done;
bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
-   segments = bio_segments(bio);
}
atomic_set(&cmd->ref, 2);
 
-   iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq));
+   iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
iter.iov_offset = offset;
 
cmd->iocb.ki_pos = pos;
-- 
2.9.5



[PATCH V14 14/18] block: enable multipage bvecs

2019-01-21 Thread Ming Lei
This patch pulls the trigger for multi-page bvecs.

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/bio.c | 22 +++---
 fs/iomap.c  |  4 ++--
 fs/xfs/xfs_aops.c   |  4 ++--
 include/linux/bio.h |  2 +-
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 968b12fea564..83a2dfa417ca 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * @page: page to add
  * @len: length of the data to add
  * @off: offset of the data in @page
+ * @same_page: if %true only merge if the new data is in the same physical
+ * page as the last segment of the bio.
  *
  * Try to add the data at @page + @off to the last bvec of @bio.  This is a
  * a useful optimisation for file systems with a block size smaller than the
@@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * Return %true on success or %false on failure.
  */
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-   unsigned int len, unsigned int off)
+   unsigned int len, unsigned int off, bool same_page)
 {
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
 
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+   phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
+   bv->bv_offset + bv->bv_len - 1;
+   phys_addr_t page_addr = page_to_phys(page);
 
-   if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
-   bv->bv_len += len;
-   bio->bi_iter.bi_size += len;
-   return true;
-   }
+   if (vec_end_addr + 1 != page_addr + off)
+   return false;
+   if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
+   return false;
+
+   bv->bv_len += len;
+   bio->bi_iter.bi_size += len;
+   return true;
}
return false;
 }
@@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
 int bio_add_page(struct bio *bio, struct page *page,
 unsigned int len, unsigned int offset)
 {
-   if (!__bio_try_merge_page(bio, page, len, offset)) {
+   if (!__bio_try_merge_page(bio, page, len, offset, false)) {
if (bio_full(bio))
return 0;
__bio_add_page(bio, page, len, offset);
diff --git a/fs/iomap.c b/fs/iomap.c
index af736acd9006..0c350e658b7f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -318,7 +318,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
loff_t length, void *data,
 */
sector = iomap_sector(iomap, pos);
if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
-   if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+   if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
goto done;
is_contig = true;
}
@@ -349,7 +349,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
loff_t length, void *data,
ctx->bio->bi_end_io = iomap_read_end_io;
}
 
-   __bio_add_page(ctx->bio, page, plen, poff);
+   bio_add_page(ctx->bio, page, plen, poff);
 done:
/*
 * Move the caller beyond our range so that it keeps making progress.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1f1829e506e8..b9fd44168f61 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -616,12 +616,12 @@ xfs_add_to_ioend(
bdev, sector);
}
 
-   if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+   if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
if (iop)
atomic_inc(&iop->write_count);
if (bio_full(wpc->ioend->io_bio))
xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
-   __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+   bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
 
wpc->ioend->io_size += len;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index e6a6f3d78afd..af288f6e8ab0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -441,7 +441,7 @@ extern int bio_add_page(struct bio *, struct page *, 
unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
   unsigned int, unsigned int);
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-   unsigned int len, unsigned int off);
+   unsigned int len, unsigned int off, bool same_page);
 void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-- 
2.9.5



[PATCH V14 12/18] bcache: avoid to use bio_for_each_segment_all() in bch_bio_alloc_pages()

2019-01-21 Thread Ming Lei
bch_bio_alloc_pages() is always called on a freshly allocated bio, so it is
safe to access the bvec table directly. Given it is the only case of this
kind, open code the bvec table access, since bio_for_each_segment_all()
will be changed to support iterating over multi-page bvecs.

Acked-by: Coly Li 
Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 drivers/md/bcache/util.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 20eddeac1531..62fb917f7a4f 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -270,7 +270,11 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
int i;
struct bio_vec *bv;
 
-   bio_for_each_segment_all(bv, bio, i) {
+   /*
+* This is called on freshly new bio, so it is safe to access the
+* bvec table directly.
+*/
+   for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)
-- 
2.9.5



[PATCH V14 10/18] btrfs: use mp_bvec_last_segment to get bio's last page

2019-01-21 Thread Ming Lei
Prepare for supporting multi-page bvec.

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 fs/btrfs/extent_io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dc8ba3ee515d..986ef49b0269 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2697,11 +2697,12 @@ static int __must_check submit_one_bio(struct bio *bio, 
int mirror_num,
 {
blk_status_t ret = 0;
struct bio_vec *bvec = bio_last_bvec_all(bio);
-   struct page *page = bvec->bv_page;
+   struct bio_vec bv;
struct extent_io_tree *tree = bio->bi_private;
u64 start;
 
-   start = page_offset(page) + bvec->bv_offset;
+   mp_bvec_last_segment(bvec, &bv);
+   start = page_offset(bv.bv_page) + bv.bv_offset;
 
bio->bi_private = NULL;
 
-- 
2.9.5



[PATCH V14 07/18] block: use bio_for_each_mp_bvec() to map sg

2019-01-21 Thread Ming Lei
It is more efficient to use bio_for_each_mp_bvec() to map the sg list; meantime
we have to consider splitting the multi-page bvec as done in blk_bio_segment_split().

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 70 +++
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2dfc30d8bc77..8a498f29636f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -464,6 +464,54 @@ static int blk_phys_contig_segment(struct request_queue 
*q, struct bio *bio,
return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
 }
 
+static struct scatterlist *blk_next_sg(struct scatterlist **sg,
+   struct scatterlist *sglist)
+{
+   if (!*sg)
+   return sglist;
+
+   /*
+* If the driver previously mapped a shorter list, we could see a
+* termination bit prematurely unless it fully inits the sg table
+* on each mapping. We KNOW that there must be more entries here
+* or the driver would be buggy, so force clear the termination bit
+* to avoid doing a full sg_init_table() in drivers for each command.
+*/
+   sg_unmark_end(*sg);
+   return sg_next(*sg);
+}
+
+static unsigned blk_bvec_map_sg(struct request_queue *q,
+   struct bio_vec *bvec, struct scatterlist *sglist,
+   struct scatterlist **sg)
+{
+   unsigned nbytes = bvec->bv_len;
+   unsigned nsegs = 0, total = 0, offset = 0;
+
+   while (nbytes > 0) {
+   unsigned seg_size;
+   struct page *pg;
+   unsigned idx;
+
+   *sg = blk_next_sg(sg, sglist);
+
+   seg_size = get_max_segment_size(q, bvec->bv_offset + total);
+   seg_size = min(nbytes, seg_size);
+
+   offset = (total + bvec->bv_offset) % PAGE_SIZE;
+   idx = (total + bvec->bv_offset) / PAGE_SIZE;
+   pg = nth_page(bvec->bv_page, idx);
+
+   sg_set_page(*sg, pg, seg_size, offset);
+
+   total += seg_size;
+   nbytes -= seg_size;
+   nsegs++;
+   }
+
+   return nsegs;
+}
+
 static inline void
 __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 struct scatterlist *sglist, struct bio_vec *bvprv,
@@ -481,25 +529,7 @@ __blk_segment_map_sg(struct request_queue *q, struct 
bio_vec *bvec,
(*sg)->length += nbytes;
} else {
 new_segment:
-   if (!*sg)
-   *sg = sglist;
-   else {
-   /*
-* If the driver previously mapped a shorter
-* list, we could see a termination bit
-* prematurely unless it fully inits the sg
-* table on each mapping. We KNOW that there
-* must be more entries here or the driver
-* would be buggy, so force clear the
-* termination bit to avoid doing a full
-* sg_init_table() in drivers for each command.
-*/
-   sg_unmark_end(*sg);
-   *sg = sg_next(*sg);
-   }
-
-   sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
-   (*nsegs)++;
+   (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
}
*bvprv = *bvec;
 }
@@ -521,7 +551,7 @@ static int __blk_bios_map_sg(struct request_queue *q, 
struct bio *bio,
int nsegs = 0;
 
for_each_bio(bio)
-   bio_for_each_segment(bvec, bio, iter)
+   bio_for_each_mp_bvec(bvec, bio, iter)
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
 &nsegs);
 
-- 
2.9.5



[PATCH V14 06/18] block: use bio_for_each_mp_bvec() to compute multi-page bvec count

2019-01-21 Thread Ming Lei
First, it is more efficient to use bio_for_each_mp_bvec() in both
blk_bio_segment_split() and __blk_recalc_rq_segments() to compute how
many multi-page bvecs there are in the bio.

Secondly, once bio_for_each_mp_bvec() is used, the bvec may need to be
split because its length can be much longer than the max segment size,
so we have to split the big bvec into several segments.

Thirdly, when splitting a multi-page bvec into segments, the max segment
limit may be reached, so the bio split needs to be considered in
this situation too.
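
As a worked example (assuming a max segment size of 4096 bytes, no virt
boundary and enough free segment slots), a 10240-byte bvec is split by
bvec_split_segs() into three segments of 4096, 4096 and 2048 bytes, i.e.
new_nsegs = 3 and total_len = 10240.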

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 103 +++---
 1 file changed, 83 insertions(+), 20 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index f85d878f313d..2dfc30d8bc77 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -161,6 +161,73 @@ static inline unsigned get_max_io_size(struct 
request_queue *q,
return sectors;
 }
 
+static unsigned get_max_segment_size(struct request_queue *q,
+unsigned offset)
+{
+   unsigned long mask = queue_segment_boundary(q);
+
+   /* default segment boundary mask means no boundary limit */
+   if (mask == BLK_SEG_BOUNDARY_MASK)
+   return queue_max_segment_size(q);
+
+   return min_t(unsigned long, mask - (mask & offset) + 1,
+queue_max_segment_size(q));
+}
+
+/*
+ * Split the bvec @bv into segments, and update all kinds of
+ * variables.
+ */
+static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
+   unsigned *nsegs, unsigned *last_seg_size,
+   unsigned *front_seg_size, unsigned *sectors)
+{
+   unsigned len = bv->bv_len;
+   unsigned total_len = 0;
+   unsigned new_nsegs = 0, seg_size = 0;
+
+   /*
+* Multi-page bvec may be too big to hold in one segment, so the
+* current bvec has to be splitted as multiple segments.
+*/
+   while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
+   seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
+   seg_size = min(seg_size, len);
+
+   new_nsegs++;
+   total_len += seg_size;
+   len -= seg_size;
+
+   if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+   break;
+   }
+
+   if (!new_nsegs)
+   return !!len;
+
+   /* update front segment size */
+   if (!*nsegs) {
+   unsigned first_seg_size;
+
+   if (new_nsegs == 1)
+   first_seg_size = get_max_segment_size(q, bv->bv_offset);
+   else
+   first_seg_size = queue_max_segment_size(q);
+
+   if (*front_seg_size < first_seg_size)
+   *front_seg_size = first_seg_size;
+   }
+
+   /* update other varibles */
+   *last_seg_size = seg_size;
+   *nsegs += new_nsegs;
+   if (sectors)
+   *sectors += total_len >> 9;
+
+   /* split in the middle of the bvec if len != 0 */
+   return !!len;
+}
+
 static struct bio *blk_bio_segment_split(struct request_queue *q,
 struct bio *bio,
 struct bio_set *bs,
@@ -174,7 +241,7 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio);
 
-   bio_for_each_segment(bv, bio, iter) {
+   bio_for_each_mp_bvec(bv, bio, iter) {
/*
 * If the queue doesn't support SG gaps and adding this
 * offset would create a gap, disallow it.
@@ -189,8 +256,12 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
 */
if (nsegs < queue_max_segments(q) &&
sectors < max_sectors) {
-   nsegs++;
-   sectors = max_sectors;
+   /* split in the middle of bvec */
+   bv.bv_len = (max_sectors - sectors) << 9;
+   bvec_split_segs(q, &bv, &nsegs,
+   &seg_size,
+   &front_seg_size,
+   &sectors);
}
goto split;
}
@@ -212,14 +283,12 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
if (nsegs == queue_max_segments(q))
goto split;
 
-   if (nsegs == 1 && seg_size > front_seg_size)
-   front_seg_size = seg_size;
-
-   ns

[PATCH V14 08/18] block: introduce mp_bvec_last_segment()

2019-01-21 Thread Ming Lei
BTRFS and guard_bio_eod() need to get the last single-page segment
from one multi-page bvec, so introduce this helper to make them happy.
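
As a quick worked example (4K pages assumed, values made up): for a bvec with
bv_offset = 512 and bv_len = 8192 starting at page P0, total = 8704 and
last_page = 2, so the returned segment is { .bv_page = nth_page(P0, 2),
.bv_offset = 0, .bv_len = 8704 - 2 * 4096 = 512 }.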

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 0ae729b1c9fe..21f76bad7be2 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -131,4 +131,26 @@ static inline bool bvec_iter_advance(const struct bio_vec 
*bv,
.bi_bvec_done   = 0,\
 }
 
+/*
+ * Get the last single-page segment from the multi-page bvec and store it
+ * in @seg
+ */
+static inline void mp_bvec_last_segment(const struct bio_vec *bvec,
+   struct bio_vec *seg)
+{
+   unsigned total = bvec->bv_offset + bvec->bv_len;
+   unsigned last_page = (total - 1) / PAGE_SIZE;
+
+   seg->bv_page = nth_page(bvec->bv_page, last_page);
+
+   /* the whole segment is inside the last page */
+   if (bvec->bv_offset >= last_page * PAGE_SIZE) {
+   seg->bv_offset = bvec->bv_offset % PAGE_SIZE;
+   seg->bv_len = bvec->bv_len;
+   } else {
+   seg->bv_offset = 0;
+   seg->bv_len = total - last_page * PAGE_SIZE;
+   }
+}
+
 #endif /* __LINUX_BVEC_ITER_H */
-- 
2.9.5



[PATCH V14 05/18] block: introduce bio_for_each_mp_bvec() and rq_for_each_mp_bvec()

2019-01-21 Thread Ming Lei
bio_for_each_mp_bvec() is used for iterating over multi-page bvecs in the bio
split & merge code.

rq_for_each_mp_bvec() can be used by drivers which may handle the
multi-page bvec directly; so far the loop driver is one perfect use case.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/bio.h| 10 ++
 include/linux/blkdev.h |  4 
 2 files changed, 14 insertions(+)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 72b4f7be2106..730288145568 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -156,6 +156,16 @@ static inline void bio_advance_iter(struct bio *bio, 
struct bvec_iter *iter,
 #define bio_for_each_segment(bvl, bio, iter)   \
__bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter)
 
+#define __bio_for_each_mp_bvec(bvl, bio, iter, start)  \
+   for (iter = (start);\
+(iter).bi_size &&  \
+   ((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \
+bio_advance_iter((bio), &(iter), (bvl).bv_len))
+
+/* iterate over multi-page bvec */
+#define bio_for_each_mp_bvec(bvl, bio, iter)   \
+   __bio_for_each_mp_bvec(bvl, bio, iter, (bio)->bi_iter)
+
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
 static inline unsigned bio_segments(struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 338604dff7d0..6ebae3ee8f44 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -797,6 +797,10 @@ struct req_iterator {
__rq_for_each_bio(_iter.bio, _rq)   \
bio_for_each_segment(bvl, _iter.bio, _iter.iter)
 
+#define rq_for_each_mp_bvec(bvl, _rq, _iter)   \
+   __rq_for_each_bio(_iter.bio, _rq)   \
+   bio_for_each_mp_bvec(bvl, _iter.bio, _iter.iter)
+
 #define rq_iter_last(bvec, _iter)  \
(_iter.bio->bi_next == NULL &&  \
 bio_iter_last(bvec, _iter.iter))
-- 
2.9.5



[PATCH V14 03/18] block: remove bvec_iter_rewind()

2019-01-21 Thread Ming Lei
Commit 7759eb23fd980 ("block: remove bio_rewind_iter()") removed
bio_rewind_iter(), so no one uses bvec_iter_rewind() any more;
remove it.

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 24 
 1 file changed, 24 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 02c73c6aa805..ba0ae40e77c9 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -92,30 +92,6 @@ static inline bool bvec_iter_advance(const struct bio_vec 
*bv,
return true;
 }
 
-static inline bool bvec_iter_rewind(const struct bio_vec *bv,
-struct bvec_iter *iter,
-unsigned int bytes)
-{
-   while (bytes) {
-   unsigned len = min(bytes, iter->bi_bvec_done);
-
-   if (iter->bi_bvec_done == 0) {
-   if (WARN_ONCE(iter->bi_idx == 0,
- "Attempted to rewind iter beyond "
- "bvec's boundaries\n")) {
-   return false;
-   }
-   iter->bi_idx--;
-   iter->bi_bvec_done = __bvec_iter_bvec(bv, 
*iter)->bv_len;
-   continue;
-   }
-   bytes -= len;
-   iter->bi_size += len;
-   iter->bi_bvec_done -= len;
-   }
-   return true;
-}
-
 #define for_each_bvec(bvl, bio_vec, iter, start)   \
for (iter = (start);\
 (iter).bi_size &&  \
-- 
2.9.5



[PATCH V14 04/18] block: introduce multi-page bvec helpers

2019-01-21 Thread Ming Lei
This patch introduces helpers of 'mp_bvec_iter_*' for multi-page bvec
support.

The introduced helpers treat one bvec as a real multi-page segment,
which may include more than one page.

The existing bvec_iter_* helpers are interfaces for supporting the current
bvec iterator, which is treated as single-page by drivers, fs, dm and
etc. These helpers will build single-page bvecs in flight, so this way
won't break current bio/bvec users, which need no change.

Follows some multi-page bvec background:

- bvecs stored in bio->bi_io_vec is always multi-page style

- bvec (struct bio_vec) represents one physically contiguous I/O
  buffer; now the buffer may include more than one page once
  multi-page bvec is supported, and all the pages represented
  by one bvec are physically contiguous. Before multi-page bvec
  support, at most one page was included in one bvec, which we call
  a single-page bvec.

- .bv_page of the bvec points to the 1st page in the multi-page bvec

- .bv_offset of the bvec is the offset of the buffer in the bvec

The effect on the current drivers/filesystem/dm/bcache/...:

- almost everyone supposes that one bvec only includes one single
  page, so we keep the single-page (sp) interface unchanged; for example,
  bio_for_each_segment() still returns a single-page bvec

- bio_for_each_segment_all() will return single-page bvecs too

- during iteration, the iterator variable (struct bvec_iter) is always
  updated in multi-page bvec style, and bvec_iter_advance() is kept
  unchanged

- the returned (copied) single-page bvec is built in flight by the bvec
  helpers from the stored multi-page bvec, as shown in the example below
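
As a worked example of the in-flight construction (my own illustration, not
part of the patch, assuming PAGE_SIZE == 4096): take a stored multi-page bvec
with bv_offset = 512 and bv_len = 12288, and an iterator with bi_bvec_done = 0.

    mp_bvec_iter_offset = 512,  mp_bvec_iter_len = 12288, page_idx = 0
    bvec_iter_offset    = 512,  bvec_iter_len = min(12288, 4096 - 512) = 3584
    bvec_iter_page      = nth_page(bv_page, 0)

After advancing by those 3584 bytes, bi_bvec_done = 3584, so:

    mp_bvec_iter_offset = 4096, page_idx = 1
    bvec_iter_offset    = 0,    bvec_iter_len = min(8704, 4096) = 4096
    bvec_iter_page      = nth_page(bv_page, 1)

i.e. the single-page view walks the same stored multi-page bvec one page at a
time.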

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 30 +++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index ba0ae40e77c9..0ae729b1c9fe 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * was unsigned short, but we might as well be ready for > 64kB I/O pages
@@ -50,16 +51,39 @@ struct bvec_iter {
  */
 #define __bvec_iter_bvec(bvec, iter)   (&(bvec)[(iter).bi_idx])
 
-#define bvec_iter_page(bvec, iter) \
+/* multi-page (mp_bvec) helpers */
+#define mp_bvec_iter_page(bvec, iter)  \
(__bvec_iter_bvec((bvec), (iter))->bv_page)
 
-#define bvec_iter_len(bvec, iter)  \
+#define mp_bvec_iter_len(bvec, iter)   \
min((iter).bi_size, \
__bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
 
-#define bvec_iter_offset(bvec, iter)   \
+#define mp_bvec_iter_offset(bvec, iter)\
(__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
 
+#define mp_bvec_iter_page_idx(bvec, iter)  \
+   (mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)
+
+#define mp_bvec_iter_bvec(bvec, iter)  \
+((struct bio_vec) {\
+   .bv_page= mp_bvec_iter_page((bvec), (iter)),\
+   .bv_len = mp_bvec_iter_len((bvec), (iter)), \
+   .bv_offset  = mp_bvec_iter_offset((bvec), (iter)),  \
+})
+
+/* For building single-page bvec in flight */
+ #define bvec_iter_offset(bvec, iter)  \
+   (mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)
+
+#define bvec_iter_len(bvec, iter)  \
+   min_t(unsigned, mp_bvec_iter_len((bvec), (iter)),   \
+ PAGE_SIZE - bvec_iter_offset((bvec), (iter)))
+
+#define bvec_iter_page(bvec, iter) \
+   nth_page(mp_bvec_iter_page((bvec), (iter)), \
+mp_bvec_iter_page_idx((bvec), (iter)))
+
 #define bvec_iter_bvec(bvec, iter) \
 ((struct bio_vec) {\
.bv_page= bvec_iter_page((bvec), (iter)),   \
-- 
2.9.5



[PATCH V14 02/18] block: don't use bio->bi_vcnt to figure out segment number

2019-01-21 Thread Ming Lei
It is wrong to use bio->bi_vcnt to figure out how many segments
there are in the bio, even though the CLONED flag isn't set on this bio,
because this bio may have been split or advanced.
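
A minimal sketch of the problem (illustration only, not from the patch; the
function name is made up): bi_vcnt keeps counting the whole vector table,
while bio_segments() walks only what bi_iter still describes.

static void show_vcnt_vs_segments(struct bio *bio)
{
	/* e.g. a bio with 4 single-page vecs: bi_vcnt == 4, bio_segments() == 4 */
	pr_info("vcnt=%u segments=%u\n",
		(unsigned int)bio->bi_vcnt, bio_segments(bio));

	bio_advance(bio, PAGE_SIZE);	/* consume one page worth of data */

	/* bi_vcnt still reads 4, but bio_segments() now returns 3 */
	pr_info("vcnt=%u segments=%u\n",
		(unsigned int)bio->bi_vcnt, bio_segments(bio));
}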

So always use bio_segments() in blk_recount_segments(). It shouldn't
cause any performance loss now, because the physical segment number is figured
out in blk_queue_split(), where BIO_SEG_VALID has also been set since
bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting").

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Fixes: 76d8137a3113 ("blk-merge: recaculate segment if it isn't less than max 
segments")
Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 71e9ac03f621..f85d878f313d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -367,13 +367,7 @@ void blk_recalc_rq_segments(struct request *rq)
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-   unsigned short seg_cnt;
-
-   /* estimate segment number by bi_vcnt for non-cloned bio */
-   if (bio_flagged(bio, BIO_CLONED))
-   seg_cnt = bio_segments(bio);
-   else
-   seg_cnt = bio->bi_vcnt;
+   unsigned short seg_cnt = bio_segments(bio);
 
if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
(seg_cnt < queue_max_segments(q)))
-- 
2.9.5



[PATCH V14 00/18] block: support multi-page bvec

2019-01-21 Thread Ming Lei
ize io vec, covered by patch 13
V8:
- remove prepare patches which all are merged to linus tree
- rebase on for-4.21/block
- address comments on V7
- add patches of killing NO_SG_MERGE

V7:
- include Christoph and Mike's bio_clone_bioset() patches, which is
  actually prepare patches for multipage bvec
- address Christoph's comments

V6:
- avoid introducing lots of renaming; follow Jens' suggestion of
using the name of chunk for the multipage io vector
- include Christoph's three prepare patches
- decrease stack usage for using bio_for_each_chunk_segment_all()
- address Kent's comment

V5:
- remove some of prepare patches, which have been merged already
- add bio_clone_seg_bioset() to fix DM's bio clone, which
is introduced by 18a25da84354c6b (dm: ensure bio submission follows
a depth-first tree walk)
- rebase on the latest block for-v4.18

V4:
- rename bio_for_each_segment*() as bio_for_each_page*(), rename
bio_segments() as bio_pages(), rename rq_for_each_segment() as
rq_for_each_pages(), because these helpers never return real
segment, and they always return single page bvec

- introducing segment_for_each_page_all()

- introduce new 
bio_for_each_segment*()/rq_for_each_segment()/bio_segments()
for returning real multipage segment

- rewrite segment_last_page()

- rename bvec iterator helper as suggested by Christoph

- replace comment with applying bio helpers as suggested by Christoph

- document usage of bio iterator helpers

- redefine BIO_MAX_PAGES as 256 to make the biggest bvec table
accommodated in 4K page

- move bio_alloc_pages() into bcache as suggested by Christoph

V3:
- rebase on v4.13-rc3 with for-next of block tree
- run more xfstests: xfs/ext4 over NVMe, Sata, DM(linear),
MD(raid1), and not see regressions triggered
- add Reviewed-by on some btrfs patches
- remove two MD patches because both are merged to linus tree
  already

V2:
- bvec table direct access in raid has been cleaned, so NO_MP
flag is dropped
- rebase on recent Neil Brown's change on bio and bounce code
- reorganize the patchset

V1:
- against v4.10-rc1 and some cleanup in V0 are in -linus already
- handle queue_virt_boundary() in mp bvec change and make NVMe happy
- further BTRFS cleanup
- remove QUEUE_FLAG_SPLIT_MP
- rename for two new helpers of bio_for_each_segment_all()
- fix bounce convertion
- address comments in V0

[1], http://marc.info/?l=linux-kernel&m=141680246629547&w=2
[2], https://patchwork.kernel.org/patch/9451523/
[3], http://marc.info/?t=14773544711&r=1&w=2
[4], http://marc.info/?l=linux-mm&m=147745525801433&w=2
[5], http://marc.info/?t=14956948457&r=1&w=2
[6], http://marc.info/?t=14982021534&r=1&w=2


Christoph Hellwig (1):
  btrfs: look at bi_size for repair decisions

Ming Lei (17):
  block: don't use bio->bi_vcnt to figure out segment number
  block: remove bvec_iter_rewind()
  block: introduce multi-page bvec helpers
  block: introduce bio_for_each_mp_bvec() and rq_for_each_mp_bvec()
  block: use bio_for_each_mp_bvec() to compute multi-page bvec count
  block: use bio_for_each_mp_bvec() to map sg
  block: introduce mp_bvec_last_segment()
  fs/buffer.c: use bvec iterator to truncate the bio
  btrfs: use mp_bvec_last_segment to get bio's last page
  block: loop: pass multi-page bvec to iov_iter
  bcache: avoid to use bio_for_each_segment_all() in
bch_bio_alloc_pages()
  block: allow bio_for_each_segment_all() to iterate over multi-page
bvec
  block: enable multipage bvecs
  block: always define BIO_MAX_PAGES as 256
  block: document usage of bio iterator helpers
  block: kill QUEUE_FLAG_NO_SG_MERGE
  block: kill BLK_MQ_F_SG_MERGE

 Documentation/block/biovecs.txt   |  25 +
 block/bio.c   |  49 ++---
 block/blk-merge.c | 210 +-
 block/blk-mq-debugfs.c|   2 -
 block/blk-mq.c|   3 -
 block/bounce.c|   6 +-
 drivers/block/loop.c  |  22 ++--
 drivers/block/nbd.c   |   2 +-
 drivers/block/rbd.c   |   2 +-
 drivers/block/skd_main.c  |   1 -
 drivers/block/xen-blkfront.c  |   2 +-
 drivers/md/bcache/btree.c |   3 +-
 drivers/md/bcache/util.c  |   6 +-
 drivers/md/dm-crypt.c |   3 +-
 drivers/md/dm-rq.c|   2 +-
 drivers/md/dm-table.c |  13 ---
 drivers/md/raid1.c|   3 +-
 drivers/mmc/core/queue.c  |   3 +-
 drivers/scsi/scsi_lib.c   |   2 +-
 dri

[PATCH V14 01/18] btrfs: look at bi_size for repair decisions

2019-01-21 Thread Ming Lei
From: Christoph Hellwig 

bio_readpage_error currently uses bi_vcnt to decide if it is worth
retrying an I/O.  But the vector count is mostly an implementation
artifact - it really should figure out if there is more than a
single sector worth retrying.  Use bi_size for that and shift by
PAGE_SHIFT.  This really should be blocks/sectors, but given that
btrfs doesn't support a sector size different from the PAGE_SIZE
using the page size keeps the changes to a minimum.

Reviewed-by: Omar Sandoval 
Reviewed-by: David Sterba 
Signed-off-by: Christoph Hellwig 
---
 fs/btrfs/extent_io.c | 2 +-
 include/linux/bio.h  | 6 --
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 52abe4082680..dc8ba3ee515d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2350,7 +2350,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 
phy_offset,
int read_mode = 0;
blk_status_t status;
int ret;
-   unsigned failed_bio_pages = bio_pages_all(failed_bio);
+   unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
 
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7380b094dcca..72b4f7be2106 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -263,12 +263,6 @@ static inline void bio_get_last_bvec(struct bio *bio, 
struct bio_vec *bv)
bv->bv_len = iter.bi_bvec_done;
 }
 
-static inline unsigned bio_pages_all(struct bio *bio)
-{
-   WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-   return bio->bi_vcnt;
-}
-
 static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
 {
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-- 
2.9.5



Re: [PATCH 1/4] dm: fix clone_bio() to trigger blk_recount_segments()

2019-01-21 Thread Ming Lei
On Sat, Jan 19, 2019 at 01:05:03PM -0500, Mike Snitzer wrote:
> DM's clone_bio() now benefits from using bio_trim() by fixing the fact
> that clone_bio() wasn't clearing BIO_SEG_VALID like bio_trim() does;
> which triggers blk_recount_segments() via bio_phys_segments().
> 
> Signed-off-by: Mike Snitzer 
> ---
>  drivers/md/dm.c | 8 ++--
>  1 file changed, 2 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index d67c95ef8d7e..fcb97b0a5743 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1320,7 +1320,7 @@ static int clone_bio(struct dm_target_io *tio, struct 
> bio *bio,
>  
>   __bio_clone_fast(clone, bio);
>  
> - if (unlikely(bio_integrity(bio) != NULL)) {
> + if (bio_integrity(bio)) {
>   int r;
>  
>   if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
> @@ -1336,11 +1336,7 @@ static int clone_bio(struct dm_target_io *tio, struct 
> bio *bio,
>   return r;
>   }
>  
> - bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
> - clone->bi_iter.bi_size = to_bytes(len);
> -
> - if (unlikely(bio_integrity(bio) != NULL))
> - bio_integrity_trim(clone);
> + bio_trim(clone, sector - clone->bi_iter.bi_sector, len);
>  
>   return 0;
>  }
> -- 
> 2.15.0
> 

Reviewed-by: Ming Lei 

Thanks,
Ming


Re: [PATCH 3/4] dm: fix missing bio_split() pattern code in __split_and_process_bio()

2019-01-21 Thread Ming Lei
On Sat, Jan 19, 2019 at 01:05:05PM -0500, Mike Snitzer wrote:
> Use the same BIO_QUEUE_ENTERED pattern that was established by commit
> cd4a4ae4683dc ("block: don't use blocking queue entered for recursive
> bio submits") by setting BIO_QUEUE_ENTERED after bio_split() and before
> recursing via generic_make_request().
> 
> Also add trace_block_split() because it provides useful context about
> bio splits in blktrace.
> 
> Depends-on: cd4a4ae4683dc ("block: don't use blocking queue entered for 
> recursive bio submits")
> Fixes: 18a25da84354 ("dm: ensure bio submission follows a depth-first tree 
> walk")
> Cc: sta...@vger.kernel.org # 4.16+
> Signed-off-by: Mike Snitzer 
> ---
>  drivers/md/dm.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index fbadda68e23b..6e29c2d99b99 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1654,7 +1654,9 @@ static blk_qc_t __split_and_process_bio(struct 
> mapped_device *md,
>  
> sectors[op_stat_group(bio_op(bio))], ci.sector_count);
>   part_stat_unlock();
>  
> + bio_set_flag(bio, BIO_QUEUE_ENTERED);
>   bio_chain(b, bio);
> + trace_block_split(md->queue, b, 
> bio->bi_iter.bi_sector);
>   ret = generic_make_request(bio);
>   break;
>   }

In theory, BIO_QUEUE_ENTERED is only required when __split_and_process_bio() is
called from generic_make_request(). However, it may also be called from
dm_wq_work(), and in that case setting the flag might cause trouble with the
handling of q->q_usage_counter.

Thanks,
Ming


Re: [PATCH 2/4] dm: fix redundant IO accounting for bios that need splitting

2019-01-20 Thread Ming Lei
On Sat, Jan 19, 2019 at 01:05:04PM -0500, Mike Snitzer wrote:
> The risk of redundant IO accounting was not taken into consideration
> when commit 18a25da84354 ("dm: ensure bio submission follows a
> depth-first tree walk") introduced IO splitting in terms of recursion
> via generic_make_request().
> 
> Fix this by subtracting the split bio's payload from the IO stats that
> were already accounted for by start_io_acct() upon dm_make_request()
> entry.  This repeat oscillation of the IO accounting, up then down,
> isn't ideal but refactoring DM core's IO splitting to pre-split bios
> _before_ they are accounted turned out to be an excessive amount of
> change that will need a full development cycle to refine and verify.
> 
> Before this fix:
> 
>   /dev/mapper/stripe_dev is a 4-way stripe using a 32k chunksize, so
>   bios are split on 32k boundaries.
> 
>   # fio --name=16M --filename=/dev/mapper/stripe_dev --rw=write --bs=64k 
> --size=16M \
>   --iodepth=1 --ioengine=libaio --direct=1 --refill_buffers
> 
>   with debugging added:
>   [103898.310264] device-mapper: core: start_io_acct: dm-2 WRITE 
> bio->bi_iter.bi_sector=0 len=128
>   [103898.318704] device-mapper: core: __split_and_process_bio: recursing for 
> following split bio:
>   [103898.329136] device-mapper: core: start_io_acct: dm-2 WRITE 
> bio->bi_iter.bi_sector=64 len=64
>   ...
> 
>   16M written yet 136M (278528 * 512b) accounted:
>   # cat /sys/block/dm-2/stat | awk '{ print $7 }'
>   278528
> 
> After this fix:
> 
>   16M written and 16M (32768 * 512b) accounted:
>   # cat /sys/block/dm-2/stat | awk '{ print $7 }'
>   32768
> 
> Fixes: 18a25da84354 ("dm: ensure bio submission follows a depth-first tree 
> walk")
> Cc: sta...@vger.kernel.org # 4.16+
> Reported-by: Bryan Gurney 
> Signed-off-by: Mike Snitzer 
> ---
>  drivers/md/dm.c | 16 
>  1 file changed, 16 insertions(+)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index fcb97b0a5743..fbadda68e23b 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1584,6 +1584,9 @@ static void init_clone_info(struct clone_info *ci, 
> struct mapped_device *md,
>   ci->sector = bio->bi_iter.bi_sector;
>  }
>  
> +#define __dm_part_stat_sub(part, field, subnd)   \
> + (part_stat_get(part, field) -= (subnd))
> +
>  /*
>   * Entry point to split a bio into clones and submit them to the targets.
>   */
> @@ -1638,6 +1641,19 @@ static blk_qc_t __split_and_process_bio(struct 
> mapped_device *md,
>   struct bio *b = bio_split(bio, bio_sectors(bio) 
> - ci.sector_count,
> GFP_NOIO, 
> &md->queue->bio_split);
>   ci.io->orig_bio = b;
> +
> + /*
> +  * Adjust IO stats for each split, otherwise 
> upon queue
> +  * reentry there will be redundant IO 
> accounting.
> +  * NOTE: this is a stop-gap fix, a proper fix 
> involves
> +  * significant refactoring of DM core's bio 
> splitting
> +  * (by eliminating DM's splitting and just 
> using bio_split)
> +  */
> + part_stat_lock();
> + __dm_part_stat_sub(&dm_disk(md)->part0,
> +
> sectors[op_stat_group(bio_op(bio))], ci.sector_count);
> + part_stat_unlock();
> +
>   bio_chain(b, bio);
>   ret = generic_make_request(bio);
>   break;

This way is a bit ugly, but it looks like it works and it is simple, especially
since a DM target may accept a partial bio, so:

Reviewed-by: Ming Lei 

Thanks,
Ming


Re: dd hangs when reading large partitions

2019-01-18 Thread Ming Lei
On Fri, Jan 18, 2019 at 8:11 PM Marc Gonzalez  wrote:
>
> Hello,
>
> I'm running into an issue which I don't know how to debug.
> So I'm open to ideas and suggestions :-)
>
> On my arm64 board, I have enabled Universal Flash Storage support.
>
> I wanted to benchmark read performance, and noticed that the system
> locks up when I read partitions larger than 3.5 GB, unless I tell
> dd to use direct IO:
>
> *** WITH O_DIRECT ***
> # dd if=/dev/sda of=/dev/null bs=1M iflag=direct status=progress
> 57892929536 bytes (58 GB, 54 GiB) copied, 697.006 s, 83.1 MB/s
> 55256+0 records in
> 55256+0 records out
> 57940115456 bytes (58 GB, 54 GiB) copied, 697.575 s, 83.1 MB/s
>
> *** WITHOUT O_DIRECT ***
> # dd if=/dev/sda of=/dev/null bs=1M status=progress
> 3853516800 bytes (3.9 GB, 3.6 GiB) copied, 49.0002 s, 78.6 MB/s
>
>
> rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
> rcu:1-...0: (8242 ticks this GP) idle=106/1/0x4000 
> softirq=168/171 fqs=2626
> rcu:6-...0: (99 GPs behind) idle=ec2/1/0x4000 softirq=71/71 
> fqs=2626
> rcu:(detected by 7, t=5254 jiffies, g=-275, q=2)
> Task dump for CPU 1:
> kworker/1:1HR  running task0   675  2 0x002a
> Workqueue: kblockd blk_mq_run_work_fn
> Call trace:
>  __switch_to+0x168/0x1d0
>  0xffc0f6efbbc8
>  blk_mq_run_work_fn+0x28/0x40
>  process_one_work+0x208/0x470
>  worker_thread+0x48/0x460
>  kthread+0x128/0x130
>  ret_from_fork+0x10/0x1c
> Task dump for CPU 6:
> kthreaddR  running task0 2  0 0x002a
> Call trace:
>  __switch_to+0x168/0x1d0
>  0x5b36396f4e7d4000
>
>
> rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
> rcu:1-...0: (8242 ticks this GP) idle=106/1/0x4000 
> softirq=168/171 fqs=10500
> rcu:6-...0: (99 GPs behind) idle=ec2/1/0x4000 softirq=71/71 
> fqs=10500
> rcu:(detected by 7, t=21009 jiffies, g=-275, q=2)
> Task dump for CPU 1:
> kworker/1:1HR  running task0   675  2 0x002a
> Workqueue: kblockd blk_mq_run_work_fn
> Call trace:
>  __switch_to+0x168/0x1d0
>  0xffc0f6efbbc8
>  blk_mq_run_work_fn+0x28/0x40
>  process_one_work+0x208/0x470
>  worker_thread+0x48/0x460
>  kthread+0x128/0x130
>  ret_from_fork+0x10/0x1c
> Task dump for CPU 6:
> kthreaddR  running task0 2  0 0x002a
> Call trace:
>  __switch_to+0x168/0x1d0
>  0x5b36396f4e7d4000
>
>
> The system always hangs around the 3.6 GiB mark, wherever I start from.
> How can I debug this issue?

Maybe you can try to collect debugfs log first via the following command?

(cd /sys/kernel/debug/block/sda && find . -type f -exec grep -aH . {} \;)

BTW,  I have several questions on this report:

- what is the kernel version in your test?
- can you reproduce this issue on other disk(not UFS)?
- are there any tasks in 'D' state shown via 'ps -ax'? If yes, please dump their
stack trace.

Thanks,
Ming Lei


Re: [BUG bisect] kernel BUG at block/bio.c:1833 and fail to mount disk

2019-01-17 Thread Ming Lei
On Wed, Jan 16, 2019 at 09:54:05AM +0100, Krzysztof Kozlowski wrote:
> On Wed, 16 Jan 2019 at 09:52, Krzysztof Kozlowski  wrote:
> >
> > Hi,
> >
> > On today's next-20190116 I see a bug during boot:
> > [ 6.843308] kernel BUG at ../block/bio.c:1833!
> > [ 6.847723] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM
> > ...
> > [ 7.543824] [] (bio_split) from [<>] ( (null))
> > [ 7.549881] Code: 13833b01 11c630bc e1a6 e8bd8070 (e7f001f2)
> >
> > (not much in the calltrace)
> > On all my boards. Also QEMU-arm fails.
> >
> 
> I forgot the bisect commit:
> 
> 258cfdfaf7bd729e759a0a91fd00ac9794796ad3 is the first bad commit
> commit 258cfdfaf7bd729e759a0a91fd00ac9794796ad3
> Author: Ming Lei 
> Date:   Fri Jan 11 19:01:15 2019 +0800
> 
> block: use bio_for_each_bvec() to compute multi-page bvec count
> 
> :04 04 d79b2e71d308650df4764ff644f29d3a24dbab96
> 0a5d624843b805ee0c9fd9a7e2d5163f5b15b167 M block

It should be an issue specific to 32-bit architectures, I guess.

The following patch should fix this issue:

diff --git a/block/blk-merge.c b/block/blk-merge.c
index dc4877eaf9f9..4dd7183de849 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -166,6 +166,9 @@ static unsigned get_max_segment_size(struct request_queue 
*q,
 {
unsigned long mask = queue_segment_boundary(q);
 
+   if (mask == BLK_SEG_BOUNDARY_MASK)
+   return queue_max_segment_size(q);
+
return min_t(unsigned long, mask - (mask & offset) + 1,
 queue_max_segment_size(q));
 }
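
For reference, my reading of why this only bites 32-bit (not part of the
patch): BLK_SEG_BOUNDARY_MASK is 0xffffffff, so with the default boundary and
e.g. offset = 0 the expression mask - (mask & offset) + 1 becomes
0xffffffff + 1, which wraps to 0 in a 32-bit unsigned long; min_t() then
yields a segment size of 0, and the resulting zero-length split trips the
BUG_ON() in bio_split() seen in the report. Returning
queue_max_segment_size() directly for the default mask avoids the overflow.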

Thanks,
Ming


[PATCH] block: 028: block integrity function test

2019-01-16 Thread Ming Lei
Use scsi_debug's dif/dix to cover the block layer's integrity function,
so that it can serve as a block integrity regression test.

Signed-off-by: Ming Lei 
---
 tests/block/028 | 42 ++
 tests/block/028.out |  9 +
 2 files changed, 51 insertions(+)
 create mode 100755 tests/block/028
 create mode 100644 tests/block/028.out

diff --git a/tests/block/028 b/tests/block/028
new file mode 100755
index ..9b76b93d9eb6
--- /dev/null
+++ b/tests/block/028
@@ -0,0 +1,42 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2019 Ming Lei 
+#
+# Basic DIF/DIX test. Regression test for commit 7809167da5c86fd6
+# ("block: don't lose track of REQ_INTEGRITY flag")
+
+. tests/block/rc
+. common/scsi_debug
+
+DESCRIPTION="do scsi_debug dif/dix function test"
+
+requires() {
+   _have_scsi_debug
+}
+
+test_pi() {
+   if ! _init_scsi_debug dev_size_mb=128 dix=$1 dif=$2 delay=0; then
+   return 1
+   fi
+
+   local dev="/dev/${SCSI_DEBUG_DEVICES[0]}"
+   local nr_sects=`blockdev --getsz $dev`
+
+   dd if=/dev/urandom of=$dev bs=512 count=$nr_sects status=none
+   dd if=$dev of=/dev/null bs=512 status=none
+
+   _exit_scsi_debug
+}
+
+test() {
+   echo "Running ${TEST_NAME}"
+
+   for ((dix = 0; dix <= 1; dix++)); do
+   for ((dif = 0; dif <= 3; dif++)); do
+   test_pi $dix $dif
+   echo "Test(dix:$dix dif:$dif) complete"
+   done
+   done
+
+   rm -f "$FULL"
+}
diff --git a/tests/block/028.out b/tests/block/028.out
new file mode 100644
index ..d2512334b9f5
--- /dev/null
+++ b/tests/block/028.out
@@ -0,0 +1,9 @@
+Running block/028
+Test(dix:0 dif:0) complete
+Test(dix:0 dif:1) complete
+Test(dix:0 dif:2) complete
+Test(dix:0 dif:3) complete
+Test(dix:1 dif:0) complete
+Test(dix:1 dif:1) complete
+Test(dix:1 dif:2) complete
+Test(dix:1 dif:3) complete
-- 
2.9.5



[PATCH] block: don't lose track of REQ_INTEGRITY flag

2019-01-16 Thread Ming Lei
We need to capture bio->bi_opf after bio integrity preparation, otherwise
the REQ_INTEGRITY flag may not be set on the allocated request, which
breaks block integrity.

Fixes: f9afca4d367b ("blk-mq: pass in request/bio flags to queue mapping")
Cc: Hannes Reinecke 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3ba37b9e15e9..8f5b533764ca 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1906,7 +1906,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue 
*q, struct bio *bio)
 {
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
-   struct blk_mq_alloc_data data = { .flags = 0, .cmd_flags = bio->bi_opf 
};
+   struct blk_mq_alloc_data data = { .flags = 0};
struct request *rq;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
@@ -1928,6 +1928,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue 
*q, struct bio *bio)
 
rq_qos_throttle(q, bio);
 
+   data.cmd_flags = bio->bi_opf;
rq = blk_mq_get_request(q, bio, &data);
if (unlikely(!rq)) {
rq_qos_cleanup(q, bio);
-- 
2.9.5



[PATCH] fs: gfs2: fix build failure

2019-01-15 Thread Ming Lei
There is one bio_for_each_segment_all() call that was not converted, so fix it.

Reported-by: kbuild test robot 
Fixes: eb754eb2a953ead0 ("block: allow bio_for_each_segment_all() to iterate 
over multi-page bvec")
Cc: Omar Sandoval 
Cc: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 fs/gfs2/lops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index ef68bf6232e7..15deefeaafd0 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -390,8 +390,9 @@ static void gfs2_end_log_read(struct bio *bio)
struct page *page;
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
page = bvec->bv_page;
if (bio->bi_status) {
int err = blk_status_to_errno(bio->bi_status);
-- 
2.14.4



[PATCH] sbitmap: Protect swap_lock from hardirq

2019-01-14 Thread Ming Lei
The original report is actually a real deadlock:

[  106.132865]  Possible interrupt unsafe locking scenario:
[  106.132865]
[  106.133659]CPU0CPU1
[  106.134194]
[  106.134733]   lock(&(&sb->map[i].swap_lock)->rlock);
[  106.135318]local_irq_disable();
[  106.136014]lock(&sbq->ws[i].wait);
[  106.136747]
lock(&(&hctx->dispatch_wait_lock)->rlock);
[  106.137742]   
[  106.138110] lock(&sbq->ws[i].wait);

Because we may call blk_mq_get_driver_tag() directly from
blk_mq_dispatch_rq_list() without holding any lock, a HARDIRQ may come in
and the above DEADLOCK is triggered.

Commit ab53dcfb3e7b ("sbitmap: Protect swap_lock from hardirq") tried to fix
this issue by using 'spin_lock_bh', which isn't enough because we complete
requests directly from hardirq context in the multiqueue case.

Cc: Clark Williams 
Fixes: ab53dcfb3e7b ("sbitmap: Protect swap_lock from hardirq")
Cc: Jens Axboe 
Cc: Ming Lei 
Cc: Guenter Roeck 
Cc: Steven Rostedt (VMware) 
Cc: Linus Torvalds 
Cc: linux-ker...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 lib/sbitmap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 864354000e04..5b382c1244ed 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -27,8 +27,9 @@ static inline bool sbitmap_deferred_clear(struct sbitmap *sb, 
int index)
 {
unsigned long mask, val;
bool ret = false;
+   unsigned long flags;
 
-   spin_lock_bh(&sb->map[index].swap_lock);
+   spin_lock_irqsave(&sb->map[index].swap_lock, flags);
 
if (!sb->map[index].cleared)
goto out_unlock;
@@ -49,7 +50,7 @@ static inline bool sbitmap_deferred_clear(struct sbitmap *sb, 
int index)
 
ret = true;
 out_unlock:
-   spin_unlock_bh(&sb->map[index].swap_lock);
+   spin_unlock_irqrestore(&sb->map[index].swap_lock, flags);
return ret;
 }
 
-- 
2.14.4



[PATCH V13 17/19] block: document usage of bio iterator helpers

2019-01-11 Thread Ming Lei
Now that multi-page bvec is supported, some helpers may return data page by
page, while others may return it segment by segment. This patch documents the
usage.
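
A minimal sketch of the distinction being documented (illustration only, not
part of the patch; the function name is made up):

static void walk_bio_both_ways(struct bio *bio)
{
	struct bvec_iter iter;
	struct bio_vec bv;

	/* single-page interface: each bv covers at most one page */
	bio_for_each_segment(bv, bio, iter)
		pr_info("single-page segment, len=%u\n", bv.bv_len);

	/* multi-page interface: each bv may cover several contiguous pages */
	bio_for_each_bvec(bv, bio, iter)
		pr_info("multi-page bvec, len=%u\n", bv.bv_len);
}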

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 Documentation/block/biovecs.txt | 25 +
 1 file changed, 25 insertions(+)

diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt
index 25689584e6e0..ce6eccaf5df7 100644
--- a/Documentation/block/biovecs.txt
+++ b/Documentation/block/biovecs.txt
@@ -117,3 +117,28 @@ Other implications:
size limitations and the limitations of the underlying devices. Thus
there's no need to define ->merge_bvec_fn() callbacks for individual block
drivers.
+
+Usage of helpers:
+=
+
+* The following helpers whose names have the suffix of "_all" can only be used
+on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
+shouldn't use them because the bio may have been split before it reached the
+driver.
+
+   bio_for_each_segment_all()
+   bio_first_bvec_all()
+   bio_first_page_all()
+   bio_last_bvec_all()
+
+* The following helpers iterate over single-page segment. The passed 'struct
+bio_vec' will contain a single-page IO vector during the iteration
+
+   bio_for_each_segment()
+   bio_for_each_segment_all()
+
+* The following helpers iterate over multi-page bvec. The passed 'struct
+bio_vec' will contain a multi-page IO vector during the iteration
+
+   bio_for_each_bvec()
+   rq_for_each_bvec()
-- 
2.9.5



[PATCH V13 16/19] block: always define BIO_MAX_PAGES as 256

2019-01-11 Thread Ming Lei
Now that multi-page bvec can cover CONFIG_THP_SWAP, we don't need to
increase BIO_MAX_PAGES for it.

CONFIG_THP_SWAP needs to split one THP into normal pages and add
them all to one bio. With multi-page bvec, it just takes one bvec to
hold them all.
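
For scale (my own arithmetic, assuming x86-64 with 4 KiB pages): a 2 MiB THP
is HPAGE_PMD_NR = 512 pages, which is why BIO_MAX_PAGES used to be raised to
512 under CONFIG_THP_SWAP. With multi-page bvec, a physically contiguous THP
is described by a single bvec, so a 256-entry vector table is always enough.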

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 include/linux/bio.h | 8 
 1 file changed, 8 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1ece9f30294b..54ef81f11f83 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -34,15 +34,7 @@
 #define BIO_BUG_ON
 #endif
 
-#ifdef CONFIG_THP_SWAP
-#if HPAGE_PMD_NR > 256
-#define BIO_MAX_PAGES  HPAGE_PMD_NR
-#else
 #define BIO_MAX_PAGES  256
-#endif
-#else
-#define BIO_MAX_PAGES  256
-#endif
 
 #define bio_prio(bio)  (bio)->bi_ioprio
 #define bio_set_prio(bio, prio)((bio)->bi_ioprio = prio)
-- 
2.9.5



[PATCH V13 15/19] block: enable multipage bvecs

2019-01-11 Thread Ming Lei
This patch pulls the trigger for multi-page bvecs.
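
A worked example of the merge condition below (made-up physical addresses, not
part of the patch): if the last bvec has bv_page at 0x10000000, bv_offset = 0
and bv_len = 4096, then vec_end_addr = 0x10000fff. Adding data at a page whose
physical address is 0x10001000 with off = 0 satisfies
vec_end_addr + 1 == page_addr + off, so bv_len simply grows to 8192; a page at
0x10002000 fails the check and needs a new bvec. With same_page set, the first
case is still rejected, because 0x10000fff and 0x10001000 lie in different
pages.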

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/bio.c | 22 +++---
 fs/iomap.c  |  4 ++--
 fs/xfs/xfs_aops.c   |  4 ++--
 include/linux/bio.h |  2 +-
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 968b12fea564..83a2dfa417ca 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * @page: page to add
  * @len: length of the data to add
  * @off: offset of the data in @page
+ * @same_page: if %true only merge if the new data is in the same physical
+ * page as the last segment of the bio.
  *
  * Try to add the data at @page + @off to the last bvec of @bio.  This is a
  * a useful optimisation for file systems with a block size smaller than the
@@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * Return %true on success or %false on failure.
  */
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-   unsigned int len, unsigned int off)
+   unsigned int len, unsigned int off, bool same_page)
 {
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
 
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+   phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
+   bv->bv_offset + bv->bv_len - 1;
+   phys_addr_t page_addr = page_to_phys(page);
 
-   if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
-   bv->bv_len += len;
-   bio->bi_iter.bi_size += len;
-   return true;
-   }
+   if (vec_end_addr + 1 != page_addr + off)
+   return false;
+   if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
+   return false;
+
+   bv->bv_len += len;
+   bio->bi_iter.bi_size += len;
+   return true;
}
return false;
 }
@@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
 int bio_add_page(struct bio *bio, struct page *page,
 unsigned int len, unsigned int offset)
 {
-   if (!__bio_try_merge_page(bio, page, len, offset)) {
+   if (!__bio_try_merge_page(bio, page, len, offset, false)) {
if (bio_full(bio))
return 0;
__bio_add_page(bio, page, len, offset);
diff --git a/fs/iomap.c b/fs/iomap.c
index af736acd9006..0c350e658b7f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -318,7 +318,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
loff_t length, void *data,
 */
sector = iomap_sector(iomap, pos);
if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
-   if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+   if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
goto done;
is_contig = true;
}
@@ -349,7 +349,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
loff_t length, void *data,
ctx->bio->bi_end_io = iomap_read_end_io;
}
 
-   __bio_add_page(ctx->bio, page, plen, poff);
+   bio_add_page(ctx->bio, page, plen, poff);
 done:
/*
 * Move the caller beyond our range so that it keeps making progress.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1f1829e506e8..b9fd44168f61 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -616,12 +616,12 @@ xfs_add_to_ioend(
bdev, sector);
}
 
-   if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+   if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
if (iop)
atomic_inc(&iop->write_count);
if (bio_full(wpc->ioend->io_bio))
xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
-   __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+   bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
 
wpc->ioend->io_size += len;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c5231e5c7e85..1ece9f30294b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -441,7 +441,7 @@ extern int bio_add_page(struct bio *, struct page *, 
unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
   unsigned int, unsigned int);
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-   unsigned int len, unsigned int off);
+   unsigned int len, unsigned int off, bool same_page);
 void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-- 
2.9.5



[PATCH V13 18/19] block: kill QUEUE_FLAG_NO_SG_MERGE

2019-01-11 Thread Ming Lei
Since bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting"),
physical segment number is mainly figured out in blk_queue_split() for
fast path, and the flag of BIO_SEG_VALID is set there too.

Now only blk_recount_segments() and blk_recalc_rq_segments() use this
flag.

Basically blk_recount_segments() is bypassed in fast path given BIO_SEG_VALID
is set in blk_queue_split().

For another user of blk_recalc_rq_segments():

- run in partial completion branch of blk_update_request, which is an unusual 
case

- run in blk_cloned_rq_check_limits(), still not a big problem if the flag is 
killed
since dm-rq is the only user.

Multi-page bvec is enabled now; not doing S/G merging is rather pointless with
the current setup of the I/O path, as it isn't going to save you a significant
amount of cycles.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-merge.c  | 31 ++-
 block/blk-mq-debugfs.c |  1 -
 block/blk-mq.c |  3 ---
 drivers/md/dm-table.c  | 13 -
 include/linux/blkdev.h |  1 -
 5 files changed, 6 insertions(+), 43 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index bf736d2b3710..dc4877eaf9f9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -354,8 +354,7 @@ void blk_queue_split(struct request_queue *q, struct bio 
**bio)
 EXPORT_SYMBOL(blk_queue_split);
 
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
-struct bio *bio,
-bool no_sg_merge)
+struct bio *bio)
 {
struct bio_vec bv, bvprv = { NULL };
int prev = 0;
@@ -381,13 +380,6 @@ static unsigned int __blk_recalc_rq_segments(struct 
request_queue *q,
nr_phys_segs = 0;
for_each_bio(bio) {
bio_for_each_bvec(bv, bio, iter) {
-   /*
-* If SG merging is disabled, each bio vector is
-* a segment
-*/
-   if (no_sg_merge)
-   goto new_segment;
-
if (prev) {
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
@@ -417,27 +409,16 @@ static unsigned int __blk_recalc_rq_segments(struct 
request_queue *q,
 
 void blk_recalc_rq_segments(struct request *rq)
 {
-   bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
-   &rq->q->queue_flags);
-
-   rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
-   no_sg_merge);
+   rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-   unsigned short seg_cnt = bio_segments(bio);
-
-   if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
-   (seg_cnt < queue_max_segments(q)))
-   bio->bi_phys_segments = seg_cnt;
-   else {
-   struct bio *nxt = bio->bi_next;
+   struct bio *nxt = bio->bi_next;
 
-   bio->bi_next = NULL;
-   bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
-   bio->bi_next = nxt;
-   }
+   bio->bi_next = NULL;
+   bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+   bio->bi_next = nxt;
 
bio_set_flag(bio, BIO_SEG_VALID);
 }
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 90d68760af08..2f9a11ef5bad 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -128,7 +128,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
-   QUEUE_FLAG_NAME(NO_SG_MERGE),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3ba37b9e15e9..fa45817a7e62 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2829,9 +2829,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct 
blk_mq_tag_set *set,
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
 
-   if (!(set->flags & BLK_MQ_F_SG_MERGE))
-   blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-
q->sg_reserved_size = INT_MAX;
 
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4b1be754cc41..ba9481f1bf3c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1698,14 +1698,6 @@ static int device_is_not_random(struct dm_target *ti, 
struct dm_dev *dev,
return q && !blk_queue_add_random(q);
 }
 
-static 
