Re: [block regression] kernel oops triggered by removing scsi device during IO

2018-04-08 Thread Ming Lei
On Sun, Apr 08, 2018 at 05:25:42PM +0800, Ming Lei wrote:
> On Sun, Apr 08, 2018 at 04:11:51PM +0800, Joseph Qi wrote:
> > This is because scsi_remove_device() will call blk_cleanup_queue(), and
> > then all blkgs have been destroyed and root_blkg is NULL.
> > Thus tg is NULL and trigger NULL pointer dereference when get td from
> > tg (tg->td).
> > It seems that we cannot simply move blkcg_exit_queue() up to
> > blk_cleanup_queue().
> 
> Maybe one per-queue blkcg should be introduced, which seems reasonable
> too.

Sorry, I mean one per-queue blkcg lock.

-- 
Ming


[PATCH 8/8] blk-mq: remove code for dealing with remapping queue

2018-04-08 Thread Ming Lei
Firstly, from commit 4b855ad37194 ("blk-mq: Create hctx for each present CPU"),
blk-mq doesn't remap queues any more after the CPU topology is changed.

Secondly, set->nr_hw_queues can't be bigger than nr_cpu_ids, and now we map
all possible CPUs to hw queues, so at least one CPU is mapped to each hctx.

So the queue mapping has become static and fixed, just like a percpu
variable, and we don't need to handle queue remapping any more.
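
To make the coverage argument concrete, here is a minimal user-space sketch
(plain C, not kernel code; the sizes are just the examples discussed elsewhere
in this thread) showing that the static modulo mapping leaves no hctx without
a possible CPU as long as nr_hw_queues <= nr_cpu_ids:

#include <assert.h>
#include <stdio.h>

/* Mirrors the pure modulo mapping of cpu_to_queue_index() after patch 2/8. */
static int cpu_to_queue_index(unsigned int nr_queues, int cpu)
{
	return cpu % nr_queues;
}

int main(void)
{
	enum { NR_CPUS = 282, NR_QUEUES = 64 };	/* example sizes only */
	int mapped[NR_QUEUES] = { 0 };

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		mapped[cpu_to_queue_index(NR_QUEUES, cpu)]++;

	/* Since NR_QUEUES <= NR_CPUS, every queue index in [0, NR_QUEUES) is hit. */
	for (int q = 0; q < NR_QUEUES; q++)
		assert(mapped[q] > 0);

	printf("every hctx has at least one possible CPU mapped\n");
	return 0;
}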

Cc: Christian Borntraeger <borntrae...@de.ibm.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Stefan Haberland <s...@linux.vnet.ibm.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq.c | 34 +++-------------------------------
 1 file changed, 3 insertions(+), 31 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3b4ce83a0ba2..c3964a79638e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2330,7 +2330,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 
 static void blk_mq_map_swqueue(struct request_queue *q)
 {
-   unsigned int i, hctx_idx;
+   unsigned int i;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
@@ -2347,23 +2347,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
/*
 * Map software to hardware queues.
-*
-* If the cpu isn't present, the cpu is mapped to first hctx.
 */
for_each_possible_cpu(i) {
-   hctx_idx = q->mq_map[i];
-   /* unmapped hw queue can be remapped after CPU topo changed */
-   if (!set->tags[hctx_idx] &&
-   !__blk_mq_alloc_rq_map(set, hctx_idx)) {
-   /*
-* If tags initialization fail for some hctx,
-* that hctx won't be brought online.  In this
-* case, remap the current ctx to hctx[0] which
-* is guaranteed to always have tags allocated
-*/
-   q->mq_map[i] = 0;
-   }
-
ctx = per_cpu_ptr(q->queue_ctx, i);
hctx = blk_mq_map_queue(q, i);
 
@@ -2375,21 +2360,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
mutex_unlock(&q->sysfs_lock);
 
queue_for_each_hw_ctx(q, hctx, i) {
-   /*
-* If no software queues are mapped to this hardware queue,
-* disable it and free the request entries.
-*/
-   if (!hctx->nr_ctx) {
-   /* Never unmap queue 0.  We need it as a
-* fallback in case of a new remap fails
-* allocation
-*/
-   if (i && set->tags[i])
-   blk_mq_free_map_and_requests(set, i);
-
-   hctx->tags = NULL;
-   continue;
-   }
+   /* every hctx should get mapped by at least one CPU */
+   WARN_ON(!hctx->nr_ctx);
 
hctx->tags = set->tags[i];
WARN_ON(!hctx->tags);
-- 
2.9.5



[PATCH 6/8] blk-mq: don't check queue mapped in __blk_mq_delay_run_hw_queue()

2018-04-08 Thread Ming Lei
There are several reasons for removing the check:

1) blk_mq_hw_queue_mapped() now always returns true, since each hctx
is mapped by at least one CPU

2) when there isn't any online CPU mapped to this hctx, there won't
be any IO queued to it, and blk_mq_run_hw_queue() only runs the queue
if there is IO queued to the hctx

3) when __blk_mq_delay_run_hw_queue() is called via
blk_mq_delay_run_hw_queue(), which runs from blk_mq_dispatch_rq_list()
or scsi_mq_get_budget(), the hctx to be handled has to be mapped.

Cc: Christian Borntraeger <borntrae...@de.ibm.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Stefan Haberland <s...@linux.vnet.ibm.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8743e9105612..3b4ce83a0ba2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1394,9 +1394,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
unsigned long msecs)
 {
-   if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
-   return;
-
if (unlikely(blk_mq_hctx_stopped(hctx)))
return;
 
-- 
2.9.5



[PATCH 7/8] blk-mq: reimplement blk_mq_hw_queue_mapped

2018-04-08 Thread Ming Lei
Now 'queue mapped' actually means that there is at least one online CPU
mapped to this hctx, so implement blk_mq_hw_queue_mapped() in that way.

Cc: Christian Borntraeger <borntrae...@de.ibm.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Stefan Haberland <s...@linux.vnet.ibm.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.h b/block/blk-mq.h
index 88c558f71819..502af371b83b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -181,7 +181,7 @@ static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 
 static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 {
-   return hctx->nr_ctx && hctx->tags;
+   return cpumask_first_and(hctx->cpumask, cpu_online_mask) < nr_cpu_ids;
 }
 
 void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
-- 
2.9.5



[PATCH 4/8] blk-mq: introduce blk_mq_hw_queue_first_cpu() to figure out first cpu

2018-04-08 Thread Ming Lei
This patch introduces the helper blk_mq_first_mapped_cpu() for figuring
out an hctx's first mapped CPU, so that code duplication can be avoided.

Cc: Christian Borntraeger <borntrae...@de.ibm.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Stefan Haberland <s...@linux.vnet.ibm.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a16efa6f2e7f..e3d02af79010 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1336,6 +1336,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
hctx_unlock(hctx, srcu_idx);
 }
 
+static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
+{
+   int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
+
+   if (cpu >= nr_cpu_ids)
+   cpu = cpumask_first(hctx->cpumask);
+   return cpu;
+}
+
 /*
  * It'd be great if the workqueue API had a way to pass
  * in a mask and had some smarts for more clever placement.
@@ -1355,14 +1364,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
-   next_cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
-
-   /*
-* No online CPU is found, so have to make sure hctx->next_cpu
-* is set correctly for not breaking workqueue.
-*/
-   if (next_cpu >= nr_cpu_ids)
-   next_cpu = cpumask_first(hctx->cpumask);
+   next_cpu = blk_mq_first_mapped_cpu(hctx);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
 
@@ -2431,10 +2433,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
/*
 * Initialize batch roundrobin counts
 */
-   hctx->next_cpu = cpumask_first_and(hctx->cpumask,
-   cpu_online_mask);
-   if (hctx->next_cpu >= nr_cpu_ids)
-   hctx->next_cpu = cpumask_first(hctx->cpumask);
+   hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
 }
-- 
2.9.5



[PATCH 3/8] blk-mq: avoid to write intermediate result to hctx->next_cpu

2018-04-08 Thread Ming Lei
This patch figures out the finally selected CPU first, then writes it to
hctx->next_cpu once, so that other dispatch paths can no longer observe an
intermediate next_cpu value.

Cc: Christian Borntraeger <borntrae...@de.ibm.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Stefan Haberland <s...@linux.vnet.ibm.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9b220dc415ac..a16efa6f2e7f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1345,26 +1345,24 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
bool tried = false;
+   int next_cpu = hctx->next_cpu;
 
if (hctx->queue->nr_hw_queues == 1)
return WORK_CPU_UNBOUND;
 
if (--hctx->next_cpu_batch <= 0) {
-   int next_cpu;
 select_cpu:
-   next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+   next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
-   next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
+   next_cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
 
/*
 * No online CPU is found, so have to make sure hctx->next_cpu
 * is set correctly for not breaking workqueue.
 */
if (next_cpu >= nr_cpu_ids)
-   hctx->next_cpu = cpumask_first(hctx->cpumask);
-   else
-   hctx->next_cpu = next_cpu;
+   next_cpu = cpumask_first(hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
 
@@ -1372,7 +1370,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 * Do unbound schedule if we can't find a online CPU for this hctx,
 * and it should only happen in the path of handling CPU DEAD.
 */
-   if (!cpu_online(hctx->next_cpu)) {
+   if (!cpu_online(next_cpu)) {
if (!tried) {
tried = true;
goto select_cpu;
@@ -1382,10 +1380,13 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 * Make sure to re-select CPU next time once after CPUs
 * in hctx->cpumask become online again.
 */
+   hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = 1;
return WORK_CPU_UNBOUND;
}
-   return hctx->next_cpu;
+
+   hctx->next_cpu = next_cpu;
+   return next_cpu;
 }
 
 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
-- 
2.9.5



[PATCH 2/8] blk-mq: don't keep offline CPUs mapped to hctx 0

2018-04-08 Thread Ming Lei
From commit 4b855ad37194 ("blk-mq: Create hctx for each present CPU"),
blk-mq doesn't remap queues after the CPU topology is changed, which means
that when some of these offline CPUs become online, they are still mapped
to hctx 0, and then hctx 0 may become the bottleneck of IO dispatch and
completion.

This patch sets up the mapping from the beginning, and aligns it with the
queue mapping for PCI devices (blk_mq_pci_map_queues()).

Fixes: 4b855ad37194 ("blk-mq: Create hctx for each present CPU")
Cc: Christian Borntraeger <borntrae...@de.ibm.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Stefan Haberland <s...@linux.vnet.ibm.com>
Cc: Keith Busch <keith.bu...@intel.com>
Cc: sta...@vger.kernel.org
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq-cpumap.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 9f8cffc8a701..3eb169f15842 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -16,11 +16,6 @@
 
 static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
 {
-   /*
-* Non present CPU will be mapped to queue index 0.
-*/
-   if (!cpu_present(cpu))
-   return 0;
return cpu % nr_queues;
 }
 
-- 
2.9.5



[PATCH 1/8] blk-mq: make sure that correct hctx->next_cpu is set

2018-04-08 Thread Ming Lei
From commit 20e4d81393196 ("blk-mq: simplify queue mapping & schedule
with each possisble CPU"), all the CPUs mapped to one hctx may be offline,
and then hctx->next_cpu can be set wrongly.

This patch fixes the issue by making hctx->next_cpu point to the
first CPU in hctx->cpumask if all CPUs in hctx->cpumask are offline.

Cc: Christian Borntraeger <borntrae...@de.ibm.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Stefan Haberland <s...@linux.vnet.ibm.com>
Fixes: 20e4d81393196 ("blk-mq: simplify queue mapping & schedule with each possisble CPU")
Cc: sta...@vger.kernel.org
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f5c7dbcb954f..9b220dc415ac 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2432,6 +2432,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 */
hctx->next_cpu = cpumask_first_and(hctx->cpumask,
cpu_online_mask);
+   if (hctx->next_cpu >= nr_cpu_ids)
+   hctx->next_cpu = cpumask_first(hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
 }
-- 
2.9.5



[PATCH 0/8] blk-mq: fix and improve queue mapping

2018-04-08 Thread Ming Lei
Hi Jens,

The first two patches fix issues in queue mapping.

The other six patches improve queue mapping for blk-mq.

Christian, this patchset should fix your issue, so please give it a test;
the patches can be found in the following tree:

https://github.com/ming1/linux/commits/v4.17-rc-blk-fix_mapping_v1

Thanks,
Ming

Ming Lei (8):
  blk-mq: make sure that correct hctx->next_cpu is set
  blk-mq: don't keep offline CPUs mapped to hctx 0
  blk-mq: avoid to write intermediate result to hctx->next_cpu
  blk-mq: introduce blk_mq_hw_queue_first_cpu() to figure out first cpu
  blk-mq: remove blk_mq_delay_queue()
  blk-mq: don't check queue mapped in __blk_mq_delay_run_hw_queue()
  blk-mq: reimplement blk_mq_hw_queue_mapped
  blk-mq: remove code for dealing with remapping queue

 block/blk-mq-cpumap.c  |   5 ---
 block/blk-mq-debugfs.c |   1 -
 block/blk-mq.c | 101 +++--
 block/blk-mq.h |   2 +-
 include/linux/blk-mq.h |   2 -
 5 files changed, 24 insertions(+), 87 deletions(-)

-- 
2.9.5



Re: [block regression] kernel oops triggered by removing scsi device during IO

2018-04-08 Thread Ming Lei
On Sun, Apr 08, 2018 at 04:11:51PM +0800, Joseph Qi wrote:
> This is because scsi_remove_device() will call blk_cleanup_queue(), and
> then all blkgs have been destroyed and root_blkg is NULL.
> Thus tg is NULL and trigger NULL pointer dereference when get td from
> tg (tg->td).
> It seems that we cannot simply move blkcg_exit_queue() up to
> blk_cleanup_queue().

Maybe one per-queue blkcg should be introduced, which seems reasonable
too.

Thanks,
Ming


[block regression] kernel oops triggered by removing scsi device during IO

2018-04-07 Thread Ming Lei
Hi,

The following kernel oops is triggered by 'removing scsi device' during
heavy IO.

'git bisect' shows that commit a063057d7c731cffa7d10740 ("block: Fix a race
between request queue removal and the block cgroup controller")
introduced this regression:

[   42.268257] BUG: unable to handle kernel NULL pointer dereference at 
0028
[   42.269339] PGD 26bd9f067 P4D 26bd9f067 PUD 26bfec067 PMD 0 
[   42.270077] Oops:  [#1] PREEMPT SMP NOPTI
[   42.270681] Dumping ftrace buffer:
[   42.271141](ftrace buffer empty)
[   42.271641] Modules linked in: scsi_debug iTCO_wdt iTCO_vendor_support 
crc32c_intel i2c_i801 i2c_core lpc_ich mfd_core usb_storage nvme shpchp 
nvme_core virtio_scsi qemu_fw_cfg ip_tables
[   42.273770] CPU: 5 PID: 1076 Comm: fio Not tainted 4.16.0+ #49
[   42.274530] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
1.10.2-2.fc27 04/01/2014
[   42.275634] RIP: 0010:blk_throtl_bio+0x41/0x904
[   42.276225] RSP: 0018:c900033cfaa0 EFLAGS: 00010246
[   42.276907] RAX: 8000 RBX: 8801bdcc5118 RCX: 0001
[   42.277818] RDX: 8801bdcc5118 RSI:  RDI: 8802641f8870
[   42.278733] RBP:  R08: 0001 R09: c900033cfb94
[   42.279651] R10: c900033cfc00 R11: 06ea R12: 8802641f8870
[   42.280567] R13: 88026f34f000 R14:  R15: 8801bdcc5118
[   42.281489] FS:  7fc123922d40() GS:880272f4() 
knlGS:
[   42.282525] CS:  0010 DS:  ES:  CR0: 80050033
[   42.283270] CR2: 0028 CR3: 00026d7ac004 CR4: 007606e0
[   42.284194] DR0:  DR1:  DR2: 
[   42.285116] DR3:  DR6: fffe0ff0 DR7: 0400
[   42.286036] PKRU: 5554
[   42.286393] Call Trace:
[   42.286725]  ? try_to_wake_up+0x3a3/0x3c9
[   42.287255]  ? blk_mq_hctx_notify_dead+0x135/0x135
[   42.287880]  ? gup_pud_range+0xb5/0x7e1
[   42.288381]  generic_make_request_checks+0x3cf/0x539
[   42.289027]  ? gup_pgd_range+0x8e/0xaa
[   42.289515]  generic_make_request+0x38/0x25b
[   42.290078]  ? submit_bio+0x103/0x11f
[   42.290555]  submit_bio+0x103/0x11f
[   42.291018]  ? bio_iov_iter_get_pages+0xe4/0x104
[   42.291620]  blkdev_direct_IO+0x2a3/0x3af
[   42.292151]  ? kiocb_free+0x34/0x34
[   42.292607]  ? ___preempt_schedule+0x16/0x18
[   42.293168]  ? preempt_schedule_common+0x4c/0x65
[   42.293771]  ? generic_file_read_iter+0x96/0x110
[   42.294377]  generic_file_read_iter+0x96/0x110
[   42.294962]  aio_read+0xca/0x13b
[   42.295388]  ? preempt_count_add+0x6d/0x8c
[   42.295926]  ? aio_read_events+0x287/0x2d6
[   42.296460]  ? do_io_submit+0x4d2/0x62c
[   42.296964]  do_io_submit+0x4d2/0x62c
[   42.297446]  ? do_syscall_64+0x9d/0x15e
[   42.297950]  do_syscall_64+0x9d/0x15e
[   42.298431]  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[   42.299090] RIP: 0033:0x7fc12244e687
[   42.299556] RSP: 002b:7ffe18388a68 EFLAGS: 0202 ORIG_RAX: 
00d1
[   42.300528] RAX: ffda RBX: 7fc0fde08670 RCX: 7fc12244e687
[   42.301442] RDX: 01d1b388 RSI: 0001 RDI: 7fc123782000
[   42.302359] RBP: 22d8 R08: 0001 R09: 01c461e0
[   42.303275] R10:  R11: 0202 R12: 7fc0fde08670
[   42.304195] R13:  R14: 01d1d0c0 R15: 01b872f0
[   42.305117] Code: 48 85 f6 48 89 7c 24 10 75 0e 48 8b b7 b8 05 00 00 31 ed 
48 85 f6 74 0f 48 63 05 75 a4 e4 00 48 8b ac c6 28 02 00 00 f6 43 15 02 <48> 8b 
45 28 48 89 04 24 0f 85 28 08 00 00 8b 43 10 45 31 e4 83 
[   42.307553] RIP: blk_throtl_bio+0x41/0x904 RSP: c900033cfaa0
[   42.308328] CR2: 0028
[   42.308920] ---[ end trace f53a144979f63b29 ]---
[   42.309520] Kernel panic - not syncing: Fatal exception
[   42.310635] Dumping ftrace buffer:
[   42.311087](ftrace buffer empty)
[   42.311583] Kernel Offset: disabled
[   42.312163] ---[ end Kernel panic - not syncing: Fatal exception ]---

-- 
Ming


Re: [PATCH V3 4/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-04-07 Thread Ming Lei
On Fri, Apr 06, 2018 at 11:49:47PM +0200, Thomas Gleixner wrote:
> On Fri, 6 Apr 2018, Thomas Gleixner wrote:
> 
> > On Fri, 6 Apr 2018, Ming Lei wrote:
> > > 
> > > I will post V4 soon by using cpu_present_mask in the 1st stage irq spread.
> > > And it should work fine for Kashyap's case in normal cases.
> > 
> > No need to resend. I've changed it already and will push it out after
> > lunch.
> 
> No. Lunch did not last that long :)
> 
> I pushed out the lot to
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
> 
> Please double check the modifications I did. The first related commit fixes
> an existing error handling bug.

I think your modification is better, especially the comment added in
irq_create_affinity_masks().

I also tested these patches again, and they just work fine.

Thanks,
Ming


Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-04-06 Thread Ming Lei
On Fri, Apr 06, 2018 at 05:11:53PM +0200, Christian Borntraeger wrote:
> 
> 
> On 04/06/2018 04:58 PM, Ming Lei wrote:
> > On Fri, Apr 06, 2018 at 04:26:49PM +0200, Christian Borntraeger wrote:
> >>
> >>
> >> On 04/06/2018 03:41 PM, Ming Lei wrote:
> >>> On Fri, Apr 06, 2018 at 12:19:19PM +0200, Christian Borntraeger wrote:
> >>>>
> >>>>
> >>>> On 04/06/2018 11:23 AM, Ming Lei wrote:
> >>>>> On Fri, Apr 06, 2018 at 10:51:28AM +0200, Christian Borntraeger wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 04/06/2018 10:41 AM, Ming Lei wrote:
> >>>>>>> On Thu, Apr 05, 2018 at 07:39:56PM +0200, Christian Borntraeger wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> On 04/05/2018 06:11 PM, Ming Lei wrote:
> >>>>>>>>>>
> >>>>>>>>>> Could you please apply the following patch and provide the dmesg 
> >>>>>>>>>> boot log?
> >>>>>>>>>
> >>>>>>>>> And please post out the 'lscpu' log together from the test machine 
> >>>>>>>>> too.
> >>>>>>>>
> >>>>>>>> attached.
> >>>>>>>>
> >>>>>>>> As I said before this seems to go way with CONFIG_NR_CPUS=64 or 
> >>>>>>>> smaller.
> >>>>>>>> We have 282 nr_cpu_ids here (max 141CPUs on that z13 with SMT2) but 
> >>>>>>>> only 8 Cores
> >>>>>>>> == 16 threads.
> >>>>>>>
> >>>>>>> OK, thanks!
> >>>>>>>
> >>>>>>> The most weird thing is that hctx->next_cpu is computed as 512 since
> >>>>>>> nr_cpu_id is 282, and hctx->next_cpu should have pointed to one of
> >>>>>>> possible CPU.
> >>>>>>>
> >>>>>>> Looks like it is a s390 specific issue, since I can setup one queue
> >>>>>>> which has same mapping with yours:
> >>>>>>>
> >>>>>>>   - nr_cpu_id is 282
> >>>>>>>   - CPU 0~15 is online
> >>>>>>>   - 64 queues null_blk
> >>>>>>>   - still run all hw queues in .complete handler
> >>>>>>>
> >>>>>>> But can't reproduce this issue at all.
> >>>>>>>
> >>>>>>> So please test the following patch, which may tell us why 
> >>>>>>> hctx->next_cpu
> >>>>>>> is computed wrong:
> >>>>>>
> >>>>>> I see things like
> >>>>>>
> >>>>>> [8.196907] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>>> [8.196910] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>>> [8.196912] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>>> [8.196913] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>>> [8.196914] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>>> [8.196915] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>>> [8.196916] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>>> [8.196916] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>>> [8.196917] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>>> [8.196918] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>>>
> >>>>>> which is exactly what happens if the find and and operation fails 
> >>>>>> (returns size of bitmap).
> >>>>>
> >>>>> Given both 'cpu_online_mask' and 'hctx->cpumask' are shown as correct
> >>>>> in your previous debug log, it means the following function returns
> >>>>> totally wrong result on S390.
> >>>>>
> >>>>> cpumask_first_and(hctx->cpumask, cpu_online_mask);
> >>>>>
> >>>>> The debugfs log shows that each hctx->cpumask includes one online
> >>>>> CPU(0~15).
> >>>>
> >>>> Really? the last log (with the latest patch applied  shows a lot of 
> >>>> contexts
> >>>> that do not have CPUs in 0-15:
> >>>>
> >>>> e.g. 
> >>>> [4.049828] dump CPUs mapped to this hctx:
> >>>> [4.049829] 18 
> >>>> [4.049829] 82 
> >>>> [4.049830] 146 
> >>>> [4.049830] 210 
> >>>> [4.049831] 274 
> >>>
> >>> That won't be an issue, since no IO can be submitted from these offline
> >>> CPUs, then these hctx shouldn't have been run at all.
> >>>
> >>> But hctx->next_cpu can be set as 512 for these inactive hctx in
> >>> blk_mq_map_swqueue(), then please test the attached patch, and if
> >>> hctx->next_cpu is still set as 512, something is still wrong.
> >>
> >>
> >> WIth this patch I no longer see the "run queue from wrong CPU x, hctx 
> >> active" messages.
> >> your debug code still triggers, though.
> >>
> >> wrong next_cpu 512, blk_mq_hctx_next_cpu, first_and
> >> wrong next_cpu 512, blk_mq_hctx_next_cpu, next_and
> >>
> >> If we would remove the debug code then dmesg would be clean it seems.
> > 
> > That is still a bit strange, since for any inactive hctx(without online
> > CPU mapped), blk_mq_run_hw_queue() will check blk_mq_hctx_has_pending()
> 
> I think for next_and it is reasonable to see this, as the next_and will return
> 512 after we have used the last one. In fact the code does call first_and in
> that case for a reason, no?

It is possible to see the 'first_and' dump when there aren't any online CPUs
mapped to this hctx.

But my point is that in this case there shouldn't be any IO queued
for this hctx, and blk_mq_hctx_has_pending() has been called to check
that, so blk_mq_hctx_next_cpu() should only have been called when
blk_mq_hctx_has_pending() in blk_mq_run_hw_queue() is true.


Thanks,
Ming


Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-04-06 Thread Ming Lei
On Fri, Apr 06, 2018 at 04:26:49PM +0200, Christian Borntraeger wrote:
> 
> 
> On 04/06/2018 03:41 PM, Ming Lei wrote:
> > On Fri, Apr 06, 2018 at 12:19:19PM +0200, Christian Borntraeger wrote:
> >>
> >>
> >> On 04/06/2018 11:23 AM, Ming Lei wrote:
> >>> On Fri, Apr 06, 2018 at 10:51:28AM +0200, Christian Borntraeger wrote:
> >>>>
> >>>>
> >>>> On 04/06/2018 10:41 AM, Ming Lei wrote:
> >>>>> On Thu, Apr 05, 2018 at 07:39:56PM +0200, Christian Borntraeger wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 04/05/2018 06:11 PM, Ming Lei wrote:
> >>>>>>>>
> >>>>>>>> Could you please apply the following patch and provide the dmesg 
> >>>>>>>> boot log?
> >>>>>>>
> >>>>>>> And please post out the 'lscpu' log together from the test machine 
> >>>>>>> too.
> >>>>>>
> >>>>>> attached.
> >>>>>>
> >>>>>> As I said before this seems to go way with CONFIG_NR_CPUS=64 or 
> >>>>>> smaller.
> >>>>>> We have 282 nr_cpu_ids here (max 141CPUs on that z13 with SMT2) but 
> >>>>>> only 8 Cores
> >>>>>> == 16 threads.
> >>>>>
> >>>>> OK, thanks!
> >>>>>
> >>>>> The most weird thing is that hctx->next_cpu is computed as 512 since
> >>>>> nr_cpu_id is 282, and hctx->next_cpu should have pointed to one of
> >>>>> possible CPU.
> >>>>>
> >>>>> Looks like it is a s390 specific issue, since I can setup one queue
> >>>>> which has same mapping with yours:
> >>>>>
> >>>>> - nr_cpu_id is 282
> >>>>> - CPU 0~15 is online
> >>>>> - 64 queues null_blk
> >>>>> - still run all hw queues in .complete handler
> >>>>>
> >>>>> But can't reproduce this issue at all.
> >>>>>
> >>>>> So please test the following patch, which may tell us why hctx->next_cpu
> >>>>> is computed wrong:
> >>>>
> >>>> I see things like
> >>>>
> >>>> [8.196907] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>> [8.196910] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>> [8.196912] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>> [8.196913] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>> [8.196914] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>> [8.196915] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>> [8.196916] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>> [8.196916] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>> [8.196917] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>> [8.196918] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>>>
> >>>> which is exactly what happens if the find and and operation fails 
> >>>> (returns size of bitmap).
> >>>
> >>> Given both 'cpu_online_mask' and 'hctx->cpumask' are shown as correct
> >>> in your previous debug log, it means the following function returns
> >>> totally wrong result on S390.
> >>>
> >>>   cpumask_first_and(hctx->cpumask, cpu_online_mask);
> >>>
> >>> The debugfs log shows that each hctx->cpumask includes one online
> >>> CPU(0~15).
> >>
> >> Really? the last log (with the latest patch applied  shows a lot of 
> >> contexts
> >> that do not have CPUs in 0-15:
> >>
> >> e.g. 
> >> [4.049828] dump CPUs mapped to this hctx:
> >> [4.049829] 18 
> >> [4.049829] 82 
> >> [4.049830] 146 
> >> [4.049830] 210 
> >> [4.049831] 274 
> > 
> > That won't be an issue, since no IO can be submitted from these offline
> > CPUs, then these hctx shouldn't have been run at all.
> > 
> > But hctx->next_cpu can be set as 512 for these inactive hctx in
> > blk_mq_map_swqueue(), then please test the attached patch, and if
> > hctx->next_cpu is still set as 512, something is still wrong.
> 
> 
> WIth this patch I no longer see the "run queue from wrong CPU x, hctx active" 
> messages.
> your debug code still triggers, though.
> 
> wrong next_cpu 512, blk_mq_hctx_next_cpu, first_and
> wrong next_cpu 512, blk_mq_hctx_next_cpu, next_and
> 
> If we would remove the debug code then dmesg would be clean it seems.

That is still a bit strange, since for any inactive hctx (without an online
CPU mapped), blk_mq_run_hw_queue() will check blk_mq_hctx_has_pending()
first. And there shouldn't be any pending IO for any inactive hctx
in your case, so it looks like blk_mq_hctx_next_cpu() shouldn't be called
for an inactive hctx at all.

I will prepare one patchset and post it out soon; hopefully it covers all
of these issues.

Thanks,
Ming


Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-04-06 Thread Ming Lei
On Fri, Apr 06, 2018 at 12:19:19PM +0200, Christian Borntraeger wrote:
> 
> 
> On 04/06/2018 11:23 AM, Ming Lei wrote:
> > On Fri, Apr 06, 2018 at 10:51:28AM +0200, Christian Borntraeger wrote:
> >>
> >>
> >> On 04/06/2018 10:41 AM, Ming Lei wrote:
> >>> On Thu, Apr 05, 2018 at 07:39:56PM +0200, Christian Borntraeger wrote:
> >>>>
> >>>>
> >>>> On 04/05/2018 06:11 PM, Ming Lei wrote:
> >>>>>>
> >>>>>> Could you please apply the following patch and provide the dmesg boot 
> >>>>>> log?
> >>>>>
> >>>>> And please post out the 'lscpu' log together from the test machine too.
> >>>>
> >>>> attached.
> >>>>
> >>>> As I said before this seems to go way with CONFIG_NR_CPUS=64 or smaller.
> >>>> We have 282 nr_cpu_ids here (max 141CPUs on that z13 with SMT2) but only 
> >>>> 8 Cores
> >>>> == 16 threads.
> >>>
> >>> OK, thanks!
> >>>
> >>> The most weird thing is that hctx->next_cpu is computed as 512 since
> >>> nr_cpu_id is 282, and hctx->next_cpu should have pointed to one of
> >>> possible CPU.
> >>>
> >>> Looks like it is a s390 specific issue, since I can setup one queue
> >>> which has same mapping with yours:
> >>>
> >>>   - nr_cpu_id is 282
> >>>   - CPU 0~15 is online
> >>>   - 64 queues null_blk
> >>>   - still run all hw queues in .complete handler
> >>>
> >>> But can't reproduce this issue at all.
> >>>
> >>> So please test the following patch, which may tell us why hctx->next_cpu
> >>> is computed wrong:
> >>
> >> I see things like
> >>
> >> [8.196907] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >> [8.196910] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >> [8.196912] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >> [8.196913] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >> [8.196914] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >> [8.196915] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >> [8.196916] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >> [8.196916] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >> [8.196917] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >> [8.196918] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> >>
> >> which is exactly what happens if the find and and operation fails (returns 
> >> size of bitmap).
> > 
> > Given both 'cpu_online_mask' and 'hctx->cpumask' are shown as correct
> > in your previous debug log, it means the following function returns
> > totally wrong result on S390.
> > 
> > cpumask_first_and(hctx->cpumask, cpu_online_mask);
> > 
> > The debugfs log shows that each hctx->cpumask includes one online
> > CPU(0~15).
> 
> Really? the last log (with the latest patch applied  shows a lot of contexts
> that do not have CPUs in 0-15:
> 
> e.g. 
> [4.049828] dump CPUs mapped to this hctx:
> [4.049829] 18 
> [4.049829] 82 
> [4.049830] 146 
> [4.049830] 210 
> [4.049831] 274 

That won't be an issue, since no IO can be submitted from these offline
CPUs, so these hctxs shouldn't have been run at all.

But hctx->next_cpu can be set to 512 for these inactive hctxs in
blk_mq_map_swqueue(), so please test the attached patch, and if
hctx->next_cpu is still set to 512, something is still wrong.

---

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 9f8cffc8a701..638ab5c11b3c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -14,13 +14,12 @@
 #include "blk.h"
 #include "blk-mq.h"
 
+/*
+ * Given there isn't CPU hotplug handler in blk-mq, map all CPUs to
+ * queues even it isn't present yet.
+ */
 static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
 {
-   /*
-* Non present CPU will be mapped to queue index 0.
-*/
-   if (!cpu_present(cpu))
-   return 0;
return cpu % nr_queues;
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 90838e998f66..1a834d96a718 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1343,6 +1343,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
hctx_unlock(hctx, srcu_idx);
 }
 
+static void check_next_cpu(int next_cpu, const char *str1, const char *str2)
+{
+   if (next_cpu > nr_cpu_ids)
+   printk_ratelimited("wrong next_cpu %d, %s, %s\n",
+   next_cpu, str1, str2);
+}

Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-04-06 Thread Ming Lei
On Fri, Apr 06, 2018 at 10:51:28AM +0200, Christian Borntraeger wrote:
> 
> 
> On 04/06/2018 10:41 AM, Ming Lei wrote:
> > On Thu, Apr 05, 2018 at 07:39:56PM +0200, Christian Borntraeger wrote:
> >>
> >>
> >> On 04/05/2018 06:11 PM, Ming Lei wrote:
> >>>>
> >>>> Could you please apply the following patch and provide the dmesg boot 
> >>>> log?
> >>>
> >>> And please post out the 'lscpu' log together from the test machine too.
> >>
> >> attached.
> >>
> >> As I said before this seems to go way with CONFIG_NR_CPUS=64 or smaller.
> >> We have 282 nr_cpu_ids here (max 141CPUs on that z13 with SMT2) but only 8 
> >> Cores
> >> == 16 threads.
> > 
> > OK, thanks!
> > 
> > The most weird thing is that hctx->next_cpu is computed as 512 since
> > nr_cpu_id is 282, and hctx->next_cpu should have pointed to one of
> > possible CPU.
> > 
> > Looks like it is a s390 specific issue, since I can setup one queue
> > which has same mapping with yours:
> > 
> > - nr_cpu_id is 282
> > - CPU 0~15 is online
> > - 64 queues null_blk
> > - still run all hw queues in .complete handler
> > 
> > But can't reproduce this issue at all.
> > 
> > So please test the following patch, which may tell us why hctx->next_cpu
> > is computed wrong:
> 
> I see things like
> 
> [8.196907] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> [8.196910] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> [8.196912] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> [8.196913] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> [8.196914] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> [8.196915] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> [8.196916] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> [8.196916] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> [8.196917] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> [8.196918] wrong next_cpu 512, blk_mq_map_swqueue, first_and
> 
> which is exactly what happens if the find and and operation fails (returns 
> size of bitmap).

Given both 'cpu_online_mask' and 'hctx->cpumask' are shown as correct
in your previous debug log, it means the following function returns
a totally wrong result on S390:

	cpumask_first_and(hctx->cpumask, cpu_online_mask);

The debugfs log shows that each hctx->cpumask includes one online
CPU (0~15).

So it looks like this isn't an issue in the blk-mq core.
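
For reference, a small user-space sketch of the suspected mechanism
(assumptions: a fixed-width bitmap like the kernel's cpumask with
CONFIG_CPUMASK_OFFSTACK=n and NR_CPUS=512; first_and() below is an analogue,
not the kernel implementation): a find-first-and over an empty intersection
returns the bitmap width, so 512 can show up even though nr_cpu_ids is 282,
and callers must treat anything >= nr_cpu_ids as "not found":

#include <stdio.h>

#define NR_CPUS 512				/* assumed configured bitmap width */
#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define NR_LONGS (NR_CPUS / BITS_PER_LONG)

/* User-space analogue of cpumask_first_and(): returns nbits when no bit matches. */
static unsigned int first_and(const unsigned long *a, const unsigned long *b,
			      unsigned int nbits)
{
	for (unsigned int i = 0; i < nbits; i++)
		if (a[i / BITS_PER_LONG] & b[i / BITS_PER_LONG] &
		    (1UL << (i % BITS_PER_LONG)))
			return i;
	return nbits;				/* the "not found" sentinel */
}

int main(void)
{
	unsigned long hctx_mask[NR_LONGS] = { 0 }, online_mask[NR_LONGS] = { 0 };

	hctx_mask[0] |= 1UL << 18;		/* hctx mapped only to offline CPU 18 */
	online_mask[0] |= 0xffffUL;		/* only CPUs 0-15 are online */

	/* Prints 512: callers must compare against nr_cpu_ids, not the raw value. */
	printf("%u\n", first_and(hctx_mask, online_mask, NR_CPUS));
	return 0;
}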

Thanks,
Ming


Re: [PATCH V3 4/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-04-06 Thread Ming Lei
Hi Thomas,

On Wed, Apr 04, 2018 at 09:38:26PM +0200, Thomas Gleixner wrote:
> On Wed, 4 Apr 2018, Ming Lei wrote:
> > On Wed, Apr 04, 2018 at 10:25:16AM +0200, Thomas Gleixner wrote:
> > > In the example above:
> > > 
> > > > > > irq 39, cpu list 0,4
> > > > > > irq 40, cpu list 1,6
> > > > > > irq 41, cpu list 2,5
> > > > > > irq 42, cpu list 3,7
> > > 
> > > and assumed that at driver init time only CPU 0-3 are online then the
> > > hotplug of CPU 4-7 will not result in any interrupt delivered to CPU 4-7.
> > 
> > Indeed, and I just tested this case, and found that no interrupts are
> > delivered to CPU 4-7.
> > 
> > In theory, the affinity has been assigned to these irq vectors, and
> > programmed to interrupt controller, I understand it should work.
> > 
> > Could you explain it a bit why interrupts aren't delivered to CPU 4-7?
> 
> As I explained before:
> 
> "If the device is already in use when the offline CPUs get hot plugged, then
>  the interrupts still stay on cpu 0-3 because the effective affinity of
>  interrupts on X86 (and other architectures) is always a single CPU."
> 
> IOW. If you set the affinity mask so it contains more than one CPU then the
> kernel selects a single CPU as target. The selected CPU must be online and
> if there is more than one online CPU in the mask then the kernel picks the
> one which has the least number of interrupts targeted at it. This selected
> CPU target is programmed into the corresponding interrupt chip
> (IOAPIC/MSI/MSIX) and it stays that way until the selected target CPU
> goes offline or the affinity mask changes.
> 
> The reasons why we use single target delivery on X86 are:
> 
>1) Not all X86 systems support multi target delivery
> 
>2) If a system supports multi target delivery then the interrupt is
>   preferrably delivered to the CPU with the lowest APIC ID (which
>   usually corresponds to the lowest CPU number) due to hardware magic
>   and only a very small percentage of interrupts are delivered to the
>   other CPUs in the multi target set. So the benefit is rather dubious
>   and extensive performance testing did not show any significant
>   difference.
> 
>3) The management of multi targets on the software side is painful as
>   the same low level vector number has to be allocated on all possible
>   target CPUs. That's making a lot of things including hotplug more
>   complex for very little - if at all - benefit.
> 
> So at some point we ripped out the multi target support on X86 and moved
> everything to single target delivery mode.
> 
> Other architectures never supported multi target delivery either due to
> hardware restrictions or for similar reasons why X86 dropped it. There
> might be a few architectures which support it, but I have no overview at
> the moment.
> 
> The information is in procfs
> 
> # cat /proc/irq/9/smp_affinity_list 
> 0-3
> # cat /proc/irq/9/effective_affinity_list 
> 1
> 
> # cat /proc/irq/10/smp_affinity_list 
> 0-3
> # cat /proc/irq/10/effective_affinity_list 
> 2
> 
> smp_affinity[_list] is the affinity which is set either by the kernel or by
> writing to /proc/irq/$N/smp_affinity[_list]
> 
> effective_affinity[_list] is the affinity which is effective, i.e. the
> single target CPU to which the interrupt is affine at this point.
> 
> As you can see in the above examples the target CPU is selected from the
> given possible target set and the internal spreading of the low level x86
> vector allocation code picks a CPU which has the lowest number of
> interrupts targeted at it.
> 
> Let's assume for the example below
> 
> # cat /proc/irq/10/smp_affinity_list 
> 0-3
> # cat /proc/irq/10/effective_affinity_list 
> 2
> 
> that CPU 3 was offline when the device was initialized. So there was no way
> to select it and when CPU 3 comes online there is no reason to change the
> affinity of that interrupt, at least not from the kernel POV. Actually we
> don't even have a mechanism to do so automagically.
> 
> If I offline CPU 2 after onlining CPU 3 then the kernel has to move the
> interrupt away from CPU 2, so it selects CPU 3 as it's the one with the
> lowest number of interrupts targeted at it.
> 
> Now this is a bit different if you use affinity managed interrupts like
> NVME and other devices do.
> 
> Many of these devices create one queue per possible CPU, so the spreading
> is simple; One interrupt per possible cpu. Pretty boring.
> 
> When the device has less queues than possible CPUs, then stuff gets more

Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-04-06 Thread Ming Lei
On Thu, Apr 05, 2018 at 07:39:56PM +0200, Christian Borntraeger wrote:
> 
> 
> On 04/05/2018 06:11 PM, Ming Lei wrote:
> >>
> >> Could you please apply the following patch and provide the dmesg boot log?
> > 
> > And please post out the 'lscpu' log together from the test machine too.
> 
> attached.
> 
> As I said before this seems to go way with CONFIG_NR_CPUS=64 or smaller.
> We have 282 nr_cpu_ids here (max 141CPUs on that z13 with SMT2) but only 8 
> Cores
> == 16 threads.

OK, thanks!

The weirdest thing is that hctx->next_cpu is computed as 512 even though
nr_cpu_ids is 282, while hctx->next_cpu should have pointed to one of the
possible CPUs.

Looks like it is an s390-specific issue, since I can set up one queue
which has the same mapping as yours:

- nr_cpu_ids is 282
- CPUs 0~15 are online
- 64 queues on null_blk
- still run all hw queues in the .complete handler

But I can't reproduce this issue at all.

So please test the following patch, which may tell us why hctx->next_cpu
is computed wrongly:

---
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 9f8cffc8a701..638ab5c11b3c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -14,13 +14,12 @@
 #include "blk.h"
 #include "blk-mq.h"
 
+/*
+ * Given there isn't CPU hotplug handler in blk-mq, map all CPUs to
+ * queues even it isn't present yet.
+ */
 static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
 {
-   /*
-* Non present CPU will be mapped to queue index 0.
-*/
-   if (!cpu_present(cpu))
-   return 0;
return cpu % nr_queues;
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 90838e998f66..9b130e4b87df 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1343,6 +1343,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
hctx_unlock(hctx, srcu_idx);
 }
 
+static void check_next_cpu(int next_cpu, const char *str1, const char *str2)
+{
+   if (next_cpu > nr_cpu_ids)
+   printk_ratelimited("wrong next_cpu %d, %s, %s\n",
+   next_cpu, str1, str2);
+}
+
 /*
  * It'd be great if the workqueue API had a way to pass
  * in a mask and had some smarts for more clever placement.
@@ -1352,26 +1359,29 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
bool tried = false;
+   int next_cpu = hctx->next_cpu;
 
if (hctx->queue->nr_hw_queues == 1)
return WORK_CPU_UNBOUND;
 
if (--hctx->next_cpu_batch <= 0) {
-   int next_cpu;
 select_cpu:
-   next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+   next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
cpu_online_mask);
-   if (next_cpu >= nr_cpu_ids)
+   check_next_cpu(next_cpu, __func__, "next_and");
+   if (next_cpu >= nr_cpu_ids) {
next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
+   check_next_cpu(next_cpu, __func__, "first_and");
+   }
 
/*
 * No online CPU is found, so have to make sure hctx->next_cpu
 * is set correctly for not breaking workqueue.
 */
-   if (next_cpu >= nr_cpu_ids)
-   hctx->next_cpu = cpumask_first(hctx->cpumask);
-   else
-   hctx->next_cpu = next_cpu;
+   if (next_cpu >= nr_cpu_ids) {
+   next_cpu = cpumask_first(hctx->cpumask);
+   check_next_cpu(next_cpu, __func__, "first");
+   }
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
 
@@ -1379,7 +1389,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 * Do unbound schedule if we can't find a online CPU for this hctx,
 * and it should only happen in the path of handling CPU DEAD.
 */
-   if (!cpu_online(hctx->next_cpu)) {
+   if (!cpu_online(next_cpu)) {
if (!tried) {
tried = true;
goto select_cpu;
@@ -1392,7 +1402,9 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
hctx->next_cpu_batch = 1;
return WORK_CPU_UNBOUND;
}
-   return hctx->next_cpu;
+
+   hctx->next_cpu = next_cpu;
+   return next_cpu;
 }
 
 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
@@ -2408,6 +2420,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
mutex_unlock(&q->sysfs_lock);
 
queue_for_each_hw_ctx(q, hctx, i) {
+   int next_cpu;
+
/*
  

Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-04-05 Thread Ming Lei
On Fri, Apr 06, 2018 at 12:05:03AM +0800, Ming Lei wrote:
> On Wed, Apr 04, 2018 at 10:18:13AM +0200, Christian Borntraeger wrote:
> > 
> > 
> > On 03/30/2018 04:53 AM, Ming Lei wrote:
> > > On Thu, Mar 29, 2018 at 01:49:29PM +0200, Christian Borntraeger wrote:
> > >>
> > >>
> > >> On 03/29/2018 01:43 PM, Ming Lei wrote:
> > >>> On Thu, Mar 29, 2018 at 12:49:55PM +0200, Christian Borntraeger wrote:
> > >>>>
> > >>>>
> > >>>> On 03/29/2018 12:48 PM, Ming Lei wrote:
> > >>>>> On Thu, Mar 29, 2018 at 12:10:11PM +0200, Christian Borntraeger wrote:
> > >>>>>>
> > >>>>>>
> > >>>>>> On 03/29/2018 11:40 AM, Ming Lei wrote:
> > >>>>>>> On Thu, Mar 29, 2018 at 11:09:08AM +0200, Christian Borntraeger 
> > >>>>>>> wrote:
> > >>>>>>>>
> > >>>>>>>>
> > >>>>>>>> On 03/29/2018 09:23 AM, Christian Borntraeger wrote:
> > >>>>>>>>>
> > >>>>>>>>>
> > >>>>>>>>> On 03/29/2018 04:00 AM, Ming Lei wrote:
> > >>>>>>>>>> On Wed, Mar 28, 2018 at 05:36:53PM +0200, Christian Borntraeger 
> > >>>>>>>>>> wrote:
> > >>>>>>>>>>>
> > >>>>>>>>>>>
> > >>>>>>>>>>> On 03/28/2018 05:26 PM, Ming Lei wrote:
> > >>>>>>>>>>>> Hi Christian,
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> On Wed, Mar 28, 2018 at 09:45:10AM +0200, Christian 
> > >>>>>>>>>>>> Borntraeger wrote:
> > >>>>>>>>>>>>> FWIW, this patch does not fix the issue for me:
> > >>>>>>>>>>>>>
> > >>>>>>>>>>>>> ostname=? addr=? terminal=? res=success'
> > >>>>>>>>>>>>> [   21.454961] WARNING: CPU: 3 PID: 1882 at 
> > >>>>>>>>>>>>> block/blk-mq.c:1410 __blk_mq_delay_run_hw_queue+0xbe/0xd8
> > >>>>>>>>>>>>> [   21.454968] Modules linked in: scsi_dh_rdac scsi_dh_emc 
> > >>>>>>>>>>>>> scsi_dh_alua dm_mirror dm_region_hash dm_log dm_multipath 
> > >>>>>>>>>>>>> dm_mod autofs4
> > >>>>>>>>>>>>> [   21.454984] CPU: 3 PID: 1882 Comm: dasdconf.sh Not tainted 
> > >>>>>>>>>>>>> 4.16.0-rc7+ #26
> > >>>>>>>>>>>>> [   21.454987] Hardware name: IBM 2964 NC9 704 (LPAR)
> > >>>>>>>>>>>>> [   21.454990] Krnl PSW : c0131ea3 3ea2f7bf 
> > >>>>>>>>>>>>> (__blk_mq_delay_run_hw_queue+0xbe/0xd8)
> > >>>>>>>>>>>>> [   21.454996]R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 
> > >>>>>>>>>>>>> AS:3 CC:0 PM:0 RI:0 EA:3
> > >>>>>>>>>>>>> [   21.455005] Krnl GPRS: 013abb69a000 013a 
> > >>>>>>>>>>>>> 013ac6c0dc00 0001
> > >>>>>>>>>>>>> [   21.455008] 013abb69a710 
> > >>>>>>>>>>>>> 013a 0001b691fd98
> > >>>>>>>>>>>>> [   21.455011]0001b691fd98 013ace4775c8 
> > >>>>>>>>>>>>> 0001 
> > >>>>>>>>>>>>> [   21.455014]013ac6c0dc00 00b47238 
> > >>>>>>>>>>>>> 0001b691fc08 0001b691fbd0
> > >>>>>>>>>>>>> [   21.455032] Krnl Code: 0069c596: ebaff0a4  
> > >>>>>>>>>>>>> lmg %r10,%r15,160(%r15)
> > >>>>>>>>>>>>>   0069c59c: c0f47a5e  
> > >>>>>>>>>>>>> brcl15,68ba5

Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-04-05 Thread Ming Lei
On Wed, Apr 04, 2018 at 10:18:13AM +0200, Christian Borntraeger wrote:
> 
> 
> On 03/30/2018 04:53 AM, Ming Lei wrote:
> > On Thu, Mar 29, 2018 at 01:49:29PM +0200, Christian Borntraeger wrote:
> >>
> >>
> >> On 03/29/2018 01:43 PM, Ming Lei wrote:
> >>> On Thu, Mar 29, 2018 at 12:49:55PM +0200, Christian Borntraeger wrote:
> >>>>
> >>>>
> >>>> On 03/29/2018 12:48 PM, Ming Lei wrote:
> >>>>> On Thu, Mar 29, 2018 at 12:10:11PM +0200, Christian Borntraeger wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 03/29/2018 11:40 AM, Ming Lei wrote:
> >>>>>>> On Thu, Mar 29, 2018 at 11:09:08AM +0200, Christian Borntraeger wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> On 03/29/2018 09:23 AM, Christian Borntraeger wrote:
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> On 03/29/2018 04:00 AM, Ming Lei wrote:
> >>>>>>>>>> On Wed, Mar 28, 2018 at 05:36:53PM +0200, Christian Borntraeger 
> >>>>>>>>>> wrote:
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> On 03/28/2018 05:26 PM, Ming Lei wrote:
> >>>>>>>>>>>> Hi Christian,
> >>>>>>>>>>>>
> >>>>>>>>>>>> On Wed, Mar 28, 2018 at 09:45:10AM +0200, Christian Borntraeger 
> >>>>>>>>>>>> wrote:
> >>>>>>>>>>>>> FWIW, this patch does not fix the issue for me:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> ostname=? addr=? terminal=? res=success'
> >>>>>>>>>>>>> [   21.454961] WARNING: CPU: 3 PID: 1882 at block/blk-mq.c:1410 
> >>>>>>>>>>>>> __blk_mq_delay_run_hw_queue+0xbe/0xd8
> >>>>>>>>>>>>> [   21.454968] Modules linked in: scsi_dh_rdac scsi_dh_emc 
> >>>>>>>>>>>>> scsi_dh_alua dm_mirror dm_region_hash dm_log dm_multipath 
> >>>>>>>>>>>>> dm_mod autofs4
> >>>>>>>>>>>>> [   21.454984] CPU: 3 PID: 1882 Comm: dasdconf.sh Not tainted 
> >>>>>>>>>>>>> 4.16.0-rc7+ #26
> >>>>>>>>>>>>> [   21.454987] Hardware name: IBM 2964 NC9 704 (LPAR)
> >>>>>>>>>>>>> [   21.454990] Krnl PSW : c0131ea3 3ea2f7bf 
> >>>>>>>>>>>>> (__blk_mq_delay_run_hw_queue+0xbe/0xd8)
> >>>>>>>>>>>>> [   21.454996]R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 
> >>>>>>>>>>>>> AS:3 CC:0 PM:0 RI:0 EA:3
> >>>>>>>>>>>>> [   21.455005] Krnl GPRS: 013abb69a000 013a 
> >>>>>>>>>>>>> 013ac6c0dc00 0001
> >>>>>>>>>>>>> [   21.455008] 013abb69a710 
> >>>>>>>>>>>>> 013a 0001b691fd98
> >>>>>>>>>>>>> [   21.455011]0001b691fd98 013ace4775c8 
> >>>>>>>>>>>>> 0001 
> >>>>>>>>>>>>> [   21.455014]013ac6c0dc00 00b47238 
> >>>>>>>>>>>>> 0001b691fc08 0001b691fbd0
> >>>>>>>>>>>>> [   21.455032] Krnl Code: 0069c596: ebaff0a4
> >>>>>>>>>>>>> lmg %r10,%r15,160(%r15)
> >>>>>>>>>>>>>   0069c59c: c0f47a5e
> >>>>>>>>>>>>> brcl15,68ba58
> >>>>>>>>>>>>>  #0069c5a2: a7f40001
> >>>>>>>>>>>>> brc 15,69c5a4
> >>>>>>>>>>>>>  >0069c5a6: e340f0c4
> >>>>>>>>>>>>> lg  %r4,192(%r15)
> >>

BUG: KASAN: use-after-free in bt_for_each+0x1ea/0x29f

2018-04-04 Thread Ming Lei
Hi,

The following warning is observed once when running dbench on NVMe with
the Linus tree (top commit 642e7fd23353).

[ 1446.882043] ==================================================================
[ 1446.886884] BUG: KASAN: use-after-free in bt_for_each+0x1ea/0x29f
[ 1446.888045] Read of size 8 at addr 880055a60a00 by task dbench/13443
[ 1446.889660]
[ 1446.889892] CPU: 1 PID: 13443 Comm: dbench Not tainted 
4.16.0_642e7fd23353_master+ #1
[ 1446.891007] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
1.10.2-2.fc27 04/01/2014
[ 1446.892290] Call Trace:
[ 1446.892641]  
[ 1446.892937]  dump_stack+0xf0/0x191
[ 1446.893600]  ? dma_direct_map_page+0x6f/0x6f
[ 1446.894425]  ? show_regs_print_info+0xa/0xa
[ 1446.895247]  ? ext4_writepages+0x196d/0x1e6d
[ 1446.896063]  ? do_writepages+0x57/0xa3
[ 1446.896810]  print_address_description+0x6e/0x23b
[ 1446.897882]  ? bt_for_each+0x1ea/0x29f
[ 1446.898693]  kasan_report+0x247/0x285
[ 1446.899484]  bt_for_each+0x1ea/0x29f
[ 1446.900233]  ? blk_mq_tagset_busy_iter+0xa3/0xa3
[ 1446.901190]  ? generic_file_buffered_read+0x14b1/0x14b1
[ 1446.903097]  ? blk_mq_hctx_mark_pending.isra.0+0x5c/0x5c
[ 1446.904418]  ? bio_free+0x64/0xaa
[ 1446.905113]  ? debug_lockdep_rcu_enabled+0x26/0x52
[ 1446.906332]  ? bio_put+0x7a/0x10e
[ 1446.906811]  ? debug_lockdep_rcu_enabled+0x26/0x52
[ 1446.907527]  ? blk_mq_hctx_mark_pending.isra.0+0x5c/0x5c
[ 1446.908334]  blk_mq_queue_tag_busy_iter+0xd0/0xde
[ 1446.909023]  blk_mq_in_flight+0xb4/0xdb
[ 1446.909619]  ? blk_mq_exit_hctx+0x190/0x190
[ 1446.910281]  ? ext4_end_bio+0x25d/0x2a1
[ 1446.911713]  part_in_flight+0xc0/0x2ac
[ 1446.912470]  ? ext4_put_io_end_defer+0x277/0x277
[ 1446.913465]  ? part_dec_in_flight+0x8f/0x8f
[ 1446.914375]  ? __lock_acquire+0x38/0x8e5
[ 1446.915182]  ? bio_endio+0x3d9/0x41c
[ 1446.915936]  ? __rcu_read_unlock+0x134/0x180
[ 1446.916796]  ? lock_acquire+0x2ba/0x32d
[ 1446.917570]  ? blk_account_io_done+0xea/0x572
[ 1446.918424]  part_round_stats+0x167/0x1a3
[ 1446.919188]  ? part_round_stats_single.isra.1+0xc7/0xc7
[ 1446.920187]  blk_account_io_done+0x34d/0x572
[ 1446.921056]  ? blk_update_bidi_request+0x8f/0x8f
[ 1446.921923]  ? blk_mq_run_hw_queue+0x13d/0x187
[ 1446.922803]  blk_mq_end_request+0x3f/0xbf
[ 1446.923631]  nvme_complete_rq+0x305/0x348 [nvme_core]
[ 1446.924612]  ? nvme_delete_ctrl_sync+0x5c/0x5c [nvme_core]
[ 1446.925696]  ? nvme_pci_complete_rq+0x1f6/0x20c [nvme]
[ 1446.926673]  ? kfree+0x21c/0x2ab
[ 1446.927317]  ? nvme_pci_complete_rq+0x1f6/0x20c [nvme]
[ 1446.928239]  __blk_mq_complete_request+0x391/0x3ee
[ 1446.928938]  ? blk_mq_free_request+0x479/0x479
[ 1446.929588]  ? rcu_read_lock_bh_held+0x3a/0x3a
[ 1446.930321]  ? enqueue_hrtimer+0x252/0x29a
[ 1446.930938]  ? do_raw_spin_lock+0xd8/0xd8
[ 1446.931532]  ? debug_lockdep_rcu_enabled+0x26/0x52
[ 1446.932425]  blk_mq_complete_request+0x10e/0x159
[ 1446.933341]  ? hctx_lock+0xe8/0xe8
[ 1446.933985]  ? lock_contended+0x680/0x680
[ 1446.934707]  ? lock_downgrade+0x338/0x338
[ 1446.935463]  nvme_process_cq+0x26a/0x34d [nvme]
[ 1446.936297]  ? nvme_init_hctx+0xa6/0xa6 [nvme]
[ 1446.937150]  nvme_irq+0x23/0x51 [nvme]
[ 1446.937864]  ? nvme_process_cq+0x34d/0x34d [nvme]
[ 1446.938713]  __handle_irq_event_percpu+0x29d/0x568
[ 1446.939516]  ? __irq_wake_thread+0x99/0x99
[ 1446.940241]  ? rcu_user_enter+0x72/0x72
[ 1446.940978]  ? do_timer+0x25/0x25
[ 1446.941650]  ? do_raw_spin_unlock+0x146/0x179
[ 1446.942514]  ? __lock_acquire+0x38/0x8e5
[ 1446.943305]  ? debug_lockdep_rcu_enabled+0x26/0x52
[ 1446.944242]  ? lock_acquire+0x32d/0x32d
[ 1446.944995]  ? lock_contended+0x680/0x680
[ 1446.945718]  handle_irq_event_percpu+0x7c/0xf7
[ 1446.946438]  ? __handle_irq_event_percpu+0x568/0x568
[ 1446.947124]  ? rcu_user_exit+0xa/0xa
[ 1446.947781]  handle_irq_event+0x53/0x83
[ 1446.948553]  handle_edge_irq+0x1f2/0x279
[ 1446.949397]  handle_irq+0x1d8/0x1e9
[ 1446.950094]  do_IRQ+0x90/0x12d
[ 1446.950750]  common_interrupt+0xf/0xf
[ 1446.951507]  
[ 1446.951953] RIP: 0010:__blk_mq_get_tag+0x201/0x22d
[ 1446.952894] RSP: 0018:880055b467a0 EFLAGS: 0246 ORIG_RAX: 
ffdc
[ 1446.954295] RAX:  RBX: 88005952f648 RCX: 
[ 1446.955641] RDX: 0259 RSI:  RDI: ed000ab68d06
[ 1446.956972] RBP: ed000ab68cf6 R08: 0007 R09: 
[ 1446.958356] R10: ed000a0ec0f2 R11: ed000a0ec0f1 R12: 88007f113978
[ 1446.959737] R13: 880055b46ce8 R14: dc00 R15: 880058bf60c0
[ 1446.961184]  ? modules_open+0x5e/0x5e
[ 1446.961922]  ? blk_mq_unique_tag+0xc5/0xc5
[ 1446.962748]  ? lock_acquire+0x32d/0x32d
[ 1446.963534]  ? __rcu_read_unlock+0x134/0x180
[ 1446.964393]  ? rcu_read_lock_bh_held+0x3a/0x3a
[ 1446.965282]  blk_mq_get_tag+0x1ad/0x67a
[ 1446.966079]  ? __blk_mq_tag_idle+0x44/0x44
[ 1446.966891]  ? wait_woken+0x13c/0x13c
[ 1446.967638]  ? debug_lockdep_rcu_enabled+0x26/0x52
[ 1446.968566]  ? lock_acquire+0x32d/0x32d

[PATCH] blk-mq: order getting budget and driver tag

2018-04-04 Thread Ming Lei
This patch orders getting the budget and the driver tag by making sure the
driver tag is acquired after the budget; this helps to avoid the following
race:

1) before dispatching a request from the scheduler queue, one budget is
acquired first, then a request is dequeued, call it request A.

2) in another IO path, for dispatching request B which comes from
hctx->dispatch, the driver tag is acquired, then blk_mq_dispatch_rq_list()
tries to acquire the budget, but unfortunately the budget is held by
request A.

3) meantime blk_mq_dispatch_rq_list() is called for dispatching request
A, and tries to acquire the driver tag first, but unfortunately no driver
tag is available because the driver tag is held by request B.

4) neither IO path can move on, and an IO stall is caused.

This issue can be observed when running dbench on USB storage.

This patch fixes the issue by always acquiring the budget before acquiring
the driver tag.
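
Schematically, the rule is "budget first, tag second, release in reverse on
failure". Below is a user-space sketch with stubbed stand-ins
(get_budget()/get_driver_tag() are hypothetical placeholders, not the real
blk-mq API) of the ordering both dispatch paths now follow, which prevents
the circular wait above:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for blk_mq_get_dispatch_budget()/blk_mq_get_driver_tag(). */
static bool get_budget(void)      { return true; }
static bool get_driver_tag(void)  { return true; }
static void put_budget(void)      { }

/* Every dispatch path acquires the budget first, then the driver tag. */
static bool dispatch_one(void)
{
	if (!get_budget())
		return false;		/* nothing held, so no stall is possible */

	if (!get_driver_tag()) {
		put_budget();		/* back off completely instead of holding one resource */
		return false;
	}

	printf("request issued\n");
	return true;
}

int main(void)
{
	return dispatch_one() ? 0 : 1;
}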

Cc: sta...@vger.kernel.org
Fixes: de1482974080ec9e ("blk-mq: introduce .get_budget and .put_budget in 
blk_mq_ops")
Cc: Christoph Hellwig <h...@lst.de>
Cc: Bart Van Assche <bart.vanass...@wdc.com>
Cc: Omar Sandoval <osan...@fb.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 16e83e6df404..90838e998f66 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1188,7 +1188,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
struct blk_mq_queue_data bd;
 
rq = list_first_entry(list, struct request, queuelist);
-   if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+
+   hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+   if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
+   break;
+
+   if (!blk_mq_get_driver_tag(rq, NULL, false)) {
/*
 * The initial allocation attempt failed, so we need to
 * rerun the hardware queue when a tag is freed. The
@@ -1197,8 +1202,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 * we'll re-run it below.
 */
if (!blk_mq_mark_tag_wait(&hctx, rq)) {
-   if (got_budget)
-   blk_mq_put_dispatch_budget(hctx);
+   blk_mq_put_dispatch_budget(hctx);
/*
 * For non-shared tags, the RESTART check
 * will suffice.
@@ -1209,11 +1213,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
}
}
 
-   if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
-   blk_mq_put_driver_tag(rq);
-   break;
-   }
-
list_del_init(&rq->queuelist);
 
bd.rq = rq;
@@ -1812,11 +1811,11 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (q->elevator && !bypass_insert)
goto insert;
 
-   if (!blk_mq_get_driver_tag(rq, NULL, false))
+   if (!blk_mq_get_dispatch_budget(hctx))
goto insert;
 
-   if (!blk_mq_get_dispatch_budget(hctx)) {
-   blk_mq_put_driver_tag(rq);
+   if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+   blk_mq_put_dispatch_budget(hctx);
goto insert;
}
 
-- 
2.9.5



Re: [PATCH V3 4/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-04-04 Thread Ming Lei
On Wed, Apr 04, 2018 at 02:45:18PM +0200, Thomas Gleixner wrote:
> On Wed, 4 Apr 2018, Thomas Gleixner wrote:
> > I'm aware how that hw-queue stuff works. But that only works if the
> > spreading algorithm makes the interrupts affine to offline/not-present CPUs
> > when the block device is initialized.
> > 
> > In the example above:
> > 
> > > > >   irq 39, cpu list 0,4
> > > > >   irq 40, cpu list 1,6
> > > > >   irq 41, cpu list 2,5
> > > > >   irq 42, cpu list 3,7
> > 
> > and assumed that at driver init time only CPU 0-3 are online then the
> > hotplug of CPU 4-7 will not result in any interrupt delivered to CPU 4-7.
> > 
> > So the extra assignment to CPU 4-7 in the affinity mask has no effect
> > whatsoever and even if the spreading result is 'perfect' it just looks
> > perfect as it is not making any difference versus the original result:
> > 
> > > > >   irq 39, cpu list 0
> > > > >   irq 40, cpu list 1
> > > > >   irq 41, cpu list 2
> > > > >   irq 42, cpu list 3
> 
> And looking deeper into the changes, I think that the first spreading step
> has to use cpu_present_mask and not cpu_online_mask.
> 
> Assume the following scenario:
> 
> Machine with 8 present CPUs is booted, the 4 last CPUs are
> unplugged. Device with 4 queues is initialized.
> 
> The resulting spread is going to be exactly your example:
> 
>   irq 39, cpu list 0,4
>   irq 40, cpu list 1,6
>   irq 41, cpu list 2,5
>   irq 42, cpu list 3,7
> 
> Now the 4 offline CPUs are plugged in again. These CPUs won't ever get an
> interrupt as all interrupts stay on CPU 0-3 unless one of these CPUs is
> unplugged. Using cpu_present_mask the spread would be:
> 
>   irq 39, cpu list 0,1
>   irq 40, cpu list 2,3
>   irq 41, cpu list 4,5
>   irq 42, cpu list 6,7

Given physical CPU hotplug isn't common, this way would leave only irq 39
and irq 40 active most of the time, causing exactly the performance
regression Kashyap reported.

> 
> while on a machine where CPU 4-7 are NOT present, but advertised as
> possible the spread would be:
> 
>   irq 39, cpu list 0,4
>   irq 40, cpu list 1,6
>   irq 41, cpu list 2,5
>   irq 42, cpu list 3,7

I think this way is still better, since the performance regression can be
avoided, and there is at least one CPU covering each irq vector, which in
reality is often enough.
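
For illustration, here is a tiny standalone sketch (userspace C, not the
kernel code) of spreading 4 vectors over 8 possible CPUs. The real
irq_create_affinity_masks() also groups CPUs by NUMA node and siblings,
which is why the lists above read 0,4 / 1,6 / 2,5 / 3,7 rather than the
plain round-robin result printed below:

	#include <stdio.h>

	int main(void)
	{
		const int nr_vecs = 4, nr_possible_cpus = 8;

		for (int v = 0; v < nr_vecs; v++) {
			printf("irq %d, cpu list", 39 + v);
			for (int cpu = v; cpu < nr_possible_cpus; cpu += nr_vecs)
				printf(" %d", cpu);
			printf("\n");
		}
		return 0;
	}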

As I mentioned in another email, I still don't understand why interrupts
can't be delivered to CPU 4-7 after these CPUs become present & online.
In theory, interrupts should be deliverable to these CPUs, since the
affinity info has already been programmed into the interrupt controller.

Or do we still need a CPU hotplug handler in the device driver to tell
the device about the CPU hotplug change, so interrupts can be delivered
to the newly added CPUs?


Thanks,
Ming


Re: [PATCH V3 4/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-04-04 Thread Ming Lei
On Wed, Apr 04, 2018 at 10:25:16AM +0200, Thomas Gleixner wrote:
> On Wed, 4 Apr 2018, Ming Lei wrote:
> > On Tue, Apr 03, 2018 at 03:32:21PM +0200, Thomas Gleixner wrote:
> > > On Thu, 8 Mar 2018, Ming Lei wrote:
> > > > 1) before 84676c1f21 ("genirq/affinity: assign vectors to all possible 
> > > > CPUs")
> > > > irq 39, cpu list 0
> > > > irq 40, cpu list 1
> > > > irq 41, cpu list 2
> > > > irq 42, cpu list 3
> > > > 
> > > > 2) after 84676c1f21 ("genirq/affinity: assign vectors to all possible 
> > > > CPUs")
> > > > irq 39, cpu list 0-2
> > > > irq 40, cpu list 3-4,6
> > > > irq 41, cpu list 5
> > > > irq 42, cpu list 7
> > > > 
> > > > 3) after applying this patch against V4.15+:
> > > > irq 39, cpu list 0,4
> > > > irq 40, cpu list 1,6
> > > > irq 41, cpu list 2,5
> > > > irq 42, cpu list 3,7
> > > 
> > > That's more or less window dressing. If the device is already in use when
> > > the offline CPUs get hot plugged, then the interrupts still stay on cpu 
> > > 0-3
> > > because the effective affinity of interrupts on X86 (and other
> > > architectures) is always a single CPU.
> > > 
> > > So this only might move interrupts to the hotplugged CPUs when the device
> > > is initialized after CPU hotplug and the actual vector allocation moves an
> > > interrupt out to the higher numbered CPUs if they have less vectors
> > > allocated than the lower numbered ones.
> > 
> > It works for blk-mq devices, such as NVMe.
> > 
> > Now NVMe driver creates num_possible_cpus() hw queues, and each
> > hw queue is assigned one msix irq vector.
> > 
> > Storage is Client/Server model, that means the interrupt is only
> > delivered to CPU after one IO request is submitted to hw queue and
> > it is completed by this hw queue.
> > 
> > When CPUs is hotplugged, and there will be IO submitted from these
> > CPUs, then finally IOs complete and irq events are generated from
> > hw queues, and notify these submission CPU by IRQ finally.
> 
> I'm aware how that hw-queue stuff works. But that only works if the
> spreading algorithm makes the interrupts affine to offline/not-present CPUs
> when the block device is initialized.
> 
> In the example above:
> 
> > > > irq 39, cpu list 0,4
> > > > irq 40, cpu list 1,6
> > > > irq 41, cpu list 2,5
> > > > irq 42, cpu list 3,7
> 
> and assumed that at driver init time only CPU 0-3 are online then the
> hotplug of CPU 4-7 will not result in any interrupt delivered to CPU 4-7.

Indeed. I just tested this case and found that no interrupts are
delivered to CPU 4-7.

In theory, the affinity has been assigned to these irq vectors and
programmed into the interrupt controller, so I would expect it to work.

Could you explain a bit why interrupts aren't delivered to CPU 4-7?


Thanks,
Ming


Re: [PATCH V3 4/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-04-03 Thread Ming Lei
On Tue, Apr 03, 2018 at 03:32:21PM +0200, Thomas Gleixner wrote:
> On Thu, 8 Mar 2018, Ming Lei wrote:
> > 1) before 84676c1f21 ("genirq/affinity: assign vectors to all possible 
> > CPUs")
> > irq 39, cpu list 0
> > irq 40, cpu list 1
> > irq 41, cpu list 2
> > irq 42, cpu list 3
> > 
> > 2) after 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs")
> > irq 39, cpu list 0-2
> > irq 40, cpu list 3-4,6
> > irq 41, cpu list 5
> > irq 42, cpu list 7
> > 
> > 3) after applying this patch against V4.15+:
> > irq 39, cpu list 0,4
> > irq 40, cpu list 1,6
> > irq 41, cpu list 2,5
> > irq 42, cpu list 3,7
> 
> That's more or less window dressing. If the device is already in use when
> the offline CPUs get hot plugged, then the interrupts still stay on cpu 0-3
> because the effective affinity of interrupts on X86 (and other
> architectures) is always a single CPU.
> 
> So this only might move interrupts to the hotplugged CPUs when the device
> is initialized after CPU hotplug and the actual vector allocation moves an
> interrupt out to the higher numbered CPUs if they have less vectors
> allocated than the lower numbered ones.

It works for blk-mq devices, such as NVMe.

Now the NVMe driver creates num_possible_cpus() hw queues, and each
hw queue is assigned one MSI-X irq vector.

Storage follows a client/server model, which means an interrupt is only
delivered to a CPU after an IO request has been submitted to a hw queue
and completed by that hw queue.

When CPUs are hotplugged and IO is submitted from them, the IOs
eventually complete, irq events are generated by the hw queues, and the
submitting CPUs are finally notified by IRQ.

Thanks,
Ming


Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-29 Thread Ming Lei
Hi Thomas,

On Fri, Mar 09, 2018 at 04:08:19PM +0100, Thomas Gleixner wrote:
> On Fri, 9 Mar 2018, Ming Lei wrote:
> > On Fri, Mar 09, 2018 at 11:08:54AM +0100, Thomas Gleixner wrote:
> > > > > So my understanding is that these irq patches are enhancements and 
> > > > > not bug
> > > > > fixes. I'll queue them for 4.17 then.
> > > > 
> > > > Wrt. this IO hang issue, these patches shouldn't be bug fix, but they 
> > > > may
> > > > fix performance regression[1] for some systems caused by 84676c1f21 
> > > > ("genirq/affinity:
> > > > assign vectors to all possible CPUs").
> > > > 
> > > > [1] https://marc.info/?l=linux-block=152050347831149=2
> > > 
> > > Hmm. The patches are rather large for urgent and evtl. backporting. Is
> > > there a simpler way to address that performance issue?
> > 
> > Not thought of a simpler solution. The problem is that number of active 
> > msix vector
> > is decreased a lot by commit 84676c1f21.
> 
> It's reduced in cases where the number of possible CPUs is way larger than
> the number of online CPUs.
> 
> Now, if you look at the number of present CPUs on such systems it's
> probably the same as the number of online CPUs.
> 
> It only differs on machines which support physical hotplug, but that's not
> the normal case. Those systems are more special and less wide spread.
> 
> So the obvious simple fix for this regression issue is to spread out the
> vectors accross present CPUs and not accross possible CPUs.
> 
> I'm not sure if there is a clear indicator whether physcial hotplug is
> supported or not, but the ACPI folks (x86) and architecture maintainers
> should be able to answer that question. I have a machine which says:
> 
>smpboot: Allowing 128 CPUs, 96 hotplug CPUs
> 
> There is definitely no way to hotplug anything on that machine and sure the
> existing spread algorithm will waste vectors to no end.

Percpu variables may waste space too if the possible CPU number provided
by ACPI is not accurate.

> 
> Sure then there is virt, which can pretend to have a gazillion of possible
> hotpluggable CPUs, but virt is an insanity on its own. Though someone might
> come up with reasonable heuristics for that as well.

There is also IBM s390, on which physical CPU hotplug is a normal use
case.

I haven't seen any other solution posted for virt, and re-introducing a
CPU hotplug handler for blk-mq may cause complicated queue dependency
issues.

> 
> Thoughts?

Given this patchset has no effect on normal machines that don't support
physical CPU hotplug, it can fix the performance regression on machines
which might support physical CPU hotplug (cpu_present_mask !=
cpu_possible_mask), at the cost of some extra memory allocation.

So is there any chance to get it into v4.17?

Thanks,
Ming


Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-03-29 Thread Ming Lei
On Thu, Mar 29, 2018 at 01:49:29PM +0200, Christian Borntraeger wrote:
> 
> 
> On 03/29/2018 01:43 PM, Ming Lei wrote:
> > On Thu, Mar 29, 2018 at 12:49:55PM +0200, Christian Borntraeger wrote:
> >>
> >>
> >> On 03/29/2018 12:48 PM, Ming Lei wrote:
> >>> On Thu, Mar 29, 2018 at 12:10:11PM +0200, Christian Borntraeger wrote:
> >>>>
> >>>>
> >>>> On 03/29/2018 11:40 AM, Ming Lei wrote:
> >>>>> On Thu, Mar 29, 2018 at 11:09:08AM +0200, Christian Borntraeger wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 03/29/2018 09:23 AM, Christian Borntraeger wrote:
> >>>>>>>
> >>>>>>>
> >>>>>>> On 03/29/2018 04:00 AM, Ming Lei wrote:
> >>>>>>>> On Wed, Mar 28, 2018 at 05:36:53PM +0200, Christian Borntraeger 
> >>>>>>>> wrote:
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> On 03/28/2018 05:26 PM, Ming Lei wrote:
> >>>>>>>>>> Hi Christian,
> >>>>>>>>>>
> >>>>>>>>>> On Wed, Mar 28, 2018 at 09:45:10AM +0200, Christian Borntraeger 
> >>>>>>>>>> wrote:
> >>>>>>>>>>> FWIW, this patch does not fix the issue for me:
> >>>>>>>>>>>
> >>>>>>>>>>> ostname=? addr=? terminal=? res=success'
> >>>>>>>>>>> [   21.454961] WARNING: CPU: 3 PID: 1882 at block/blk-mq.c:1410 
> >>>>>>>>>>> __blk_mq_delay_run_hw_queue+0xbe/0xd8
> >>>>>>>>>>> [   21.454968] Modules linked in: scsi_dh_rdac scsi_dh_emc 
> >>>>>>>>>>> scsi_dh_alua dm_mirror dm_region_hash dm_log dm_multipath dm_mod 
> >>>>>>>>>>> autofs4
> >>>>>>>>>>> [   21.454984] CPU: 3 PID: 1882 Comm: dasdconf.sh Not tainted 
> >>>>>>>>>>> 4.16.0-rc7+ #26
> >>>>>>>>>>> [   21.454987] Hardware name: IBM 2964 NC9 704 (LPAR)
> >>>>>>>>>>> [   21.454990] Krnl PSW : c0131ea3 3ea2f7bf 
> >>>>>>>>>>> (__blk_mq_delay_run_hw_queue+0xbe/0xd8)
> >>>>>>>>>>> [   21.454996]R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 
> >>>>>>>>>>> AS:3 CC:0 PM:0 RI:0 EA:3
> >>>>>>>>>>> [   21.455005] Krnl GPRS: 013abb69a000 013a 
> >>>>>>>>>>> 013ac6c0dc00 0001
> >>>>>>>>>>> [   21.455008] 013abb69a710 
> >>>>>>>>>>> 013a 0001b691fd98
> >>>>>>>>>>> [   21.455011]0001b691fd98 013ace4775c8 
> >>>>>>>>>>> 0001 
> >>>>>>>>>>> [   21.455014]013ac6c0dc00 00b47238 
> >>>>>>>>>>> 0001b691fc08 0001b691fbd0
> >>>>>>>>>>> [   21.455032] Krnl Code: 0069c596: ebaff0a4  lmg 
> >>>>>>>>>>> %r10,%r15,160(%r15)
> >>>>>>>>>>>   0069c59c: c0f47a5e  brcl
> >>>>>>>>>>> 15,68ba58
> >>>>>>>>>>>  #0069c5a2: a7f40001  
> >>>>>>>>>>> brc 15,69c5a4
> >>>>>>>>>>>  >0069c5a6: e340f0c4  lg  
> >>>>>>>>>>> %r4,192(%r15)
> >>>>>>>>>>>   0069c5ac: ebaff0a4  lmg 
> >>>>>>>>>>> %r10,%r15,160(%r15)
> >>>>>>>>>>>   0069c5b2: 07f4  bcr 
> >>>>>>>>>>> 15,%r4
> >>>>>>>>>>>   0069c5b4: c0e5feea  brasl   
> >>>>>>>>>>> %r14,69c388
> >>>>>>>>>>>   0069c5ba: a

Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-03-29 Thread Ming Lei
On Thu, Mar 29, 2018 at 12:49:55PM +0200, Christian Borntraeger wrote:
> 
> 
> On 03/29/2018 12:48 PM, Ming Lei wrote:
> > On Thu, Mar 29, 2018 at 12:10:11PM +0200, Christian Borntraeger wrote:
> >>
> >>
> >> On 03/29/2018 11:40 AM, Ming Lei wrote:
> >>> On Thu, Mar 29, 2018 at 11:09:08AM +0200, Christian Borntraeger wrote:
> >>>>
> >>>>
> >>>> On 03/29/2018 09:23 AM, Christian Borntraeger wrote:
> >>>>>
> >>>>>
> >>>>> On 03/29/2018 04:00 AM, Ming Lei wrote:
> >>>>>> On Wed, Mar 28, 2018 at 05:36:53PM +0200, Christian Borntraeger wrote:
> >>>>>>>
> >>>>>>>
> >>>>>>> On 03/28/2018 05:26 PM, Ming Lei wrote:
> >>>>>>>> Hi Christian,
> >>>>>>>>
> >>>>>>>> On Wed, Mar 28, 2018 at 09:45:10AM +0200, Christian Borntraeger 
> >>>>>>>> wrote:
> >>>>>>>>> FWIW, this patch does not fix the issue for me:
> >>>>>>>>>
> >>>>>>>>> ostname=? addr=? terminal=? res=success'
> >>>>>>>>> [   21.454961] WARNING: CPU: 3 PID: 1882 at block/blk-mq.c:1410 
> >>>>>>>>> __blk_mq_delay_run_hw_queue+0xbe/0xd8
> >>>>>>>>> [   21.454968] Modules linked in: scsi_dh_rdac scsi_dh_emc 
> >>>>>>>>> scsi_dh_alua dm_mirror dm_region_hash dm_log dm_multipath dm_mod 
> >>>>>>>>> autofs4
> >>>>>>>>> [   21.454984] CPU: 3 PID: 1882 Comm: dasdconf.sh Not tainted 
> >>>>>>>>> 4.16.0-rc7+ #26
> >>>>>>>>> [   21.454987] Hardware name: IBM 2964 NC9 704 (LPAR)
> >>>>>>>>> [   21.454990] Krnl PSW : c0131ea3 3ea2f7bf 
> >>>>>>>>> (__blk_mq_delay_run_hw_queue+0xbe/0xd8)
> >>>>>>>>> [   21.454996]R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 
> >>>>>>>>> CC:0 PM:0 RI:0 EA:3
> >>>>>>>>> [   21.455005] Krnl GPRS: 013abb69a000 013a 
> >>>>>>>>> 013ac6c0dc00 0001
> >>>>>>>>> [   21.455008] 013abb69a710 
> >>>>>>>>> 013a 0001b691fd98
> >>>>>>>>> [   21.455011]0001b691fd98 013ace4775c8 
> >>>>>>>>> 0001 
> >>>>>>>>> [   21.455014]013ac6c0dc00 00b47238 
> >>>>>>>>> 0001b691fc08 0001b691fbd0
> >>>>>>>>> [   21.455032] Krnl Code: 0069c596: ebaff0a4lmg 
> >>>>>>>>> %r10,%r15,160(%r15)
> >>>>>>>>>   0069c59c: c0f47a5ebrcl
> >>>>>>>>> 15,68ba58
> >>>>>>>>>  #0069c5a2: a7f40001
> >>>>>>>>> brc 15,69c5a4
> >>>>>>>>>  >0069c5a6: e340f0c4lg  
> >>>>>>>>> %r4,192(%r15)
> >>>>>>>>>   0069c5ac: ebaff0a4lmg 
> >>>>>>>>> %r10,%r15,160(%r15)
> >>>>>>>>>   0069c5b2: 07f4bcr 
> >>>>>>>>> 15,%r4
> >>>>>>>>>   0069c5b4: c0e5feeabrasl   
> >>>>>>>>> %r14,69c388
> >>>>>>>>>   0069c5ba: a7f4fff6
> >>>>>>>>> brc 15,69c5a6
> >>>>>>>>> [   21.455067] Call Trace:
> >>>>>>>>> [   21.455072] ([<0001b691fd98>] 0x1b691fd98)
> >>>>>>>>> [   21.455079]  [<0069c692>] blk_mq_run_hw_queue+0xba/0x100 
> >>>>>>>>> [   21.455083]  [<0069c740>] blk_mq_run_hw_queues+0x68/0x88 
> >>>>>>>>> [   21.455089]  [<0069b956>] 
> >>>>>>>>> __blk_mq_complete

Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-03-29 Thread Ming Lei
On Thu, Mar 29, 2018 at 12:10:11PM +0200, Christian Borntraeger wrote:
> 
> 
> On 03/29/2018 11:40 AM, Ming Lei wrote:
> > On Thu, Mar 29, 2018 at 11:09:08AM +0200, Christian Borntraeger wrote:
> >>
> >>
> >> On 03/29/2018 09:23 AM, Christian Borntraeger wrote:
> >>>
> >>>
> >>> On 03/29/2018 04:00 AM, Ming Lei wrote:
> >>>> On Wed, Mar 28, 2018 at 05:36:53PM +0200, Christian Borntraeger wrote:
> >>>>>
> >>>>>
> >>>>> On 03/28/2018 05:26 PM, Ming Lei wrote:
> >>>>>> Hi Christian,
> >>>>>>
> >>>>>> On Wed, Mar 28, 2018 at 09:45:10AM +0200, Christian Borntraeger wrote:
> >>>>>>> FWIW, this patch does not fix the issue for me:
> >>>>>>>
> >>>>>>> ostname=? addr=? terminal=? res=success'
> >>>>>>> [   21.454961] WARNING: CPU: 3 PID: 1882 at block/blk-mq.c:1410 
> >>>>>>> __blk_mq_delay_run_hw_queue+0xbe/0xd8
> >>>>>>> [   21.454968] Modules linked in: scsi_dh_rdac scsi_dh_emc 
> >>>>>>> scsi_dh_alua dm_mirror dm_region_hash dm_log dm_multipath dm_mod 
> >>>>>>> autofs4
> >>>>>>> [   21.454984] CPU: 3 PID: 1882 Comm: dasdconf.sh Not tainted 
> >>>>>>> 4.16.0-rc7+ #26
> >>>>>>> [   21.454987] Hardware name: IBM 2964 NC9 704 (LPAR)
> >>>>>>> [   21.454990] Krnl PSW : c0131ea3 3ea2f7bf 
> >>>>>>> (__blk_mq_delay_run_hw_queue+0xbe/0xd8)
> >>>>>>> [   21.454996]R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 
> >>>>>>> CC:0 PM:0 RI:0 EA:3
> >>>>>>> [   21.455005] Krnl GPRS: 013abb69a000 013a 
> >>>>>>> 013ac6c0dc00 0001
> >>>>>>> [   21.455008] 013abb69a710 
> >>>>>>> 013a 0001b691fd98
> >>>>>>> [   21.455011]0001b691fd98 013ace4775c8 
> >>>>>>> 0001 
> >>>>>>> [   21.455014]013ac6c0dc00 00b47238 
> >>>>>>> 0001b691fc08 0001b691fbd0
> >>>>>>> [   21.455032] Krnl Code: 0069c596: ebaff0a4  lmg 
> >>>>>>> %r10,%r15,160(%r15)
> >>>>>>>   0069c59c: c0f47a5e  brcl
> >>>>>>> 15,68ba58
> >>>>>>>  #0069c5a2: a7f40001  brc 
> >>>>>>> 15,69c5a4
> >>>>>>>  >0069c5a6: e340f0c4  lg  
> >>>>>>> %r4,192(%r15)
> >>>>>>>   0069c5ac: ebaff0a4  lmg 
> >>>>>>> %r10,%r15,160(%r15)
> >>>>>>>   0069c5b2: 07f4  bcr 
> >>>>>>> 15,%r4
> >>>>>>>   0069c5b4: c0e5feea  brasl   
> >>>>>>> %r14,69c388
> >>>>>>>   0069c5ba: a7f4fff6  brc 
> >>>>>>> 15,69c5a6
> >>>>>>> [   21.455067] Call Trace:
> >>>>>>> [   21.455072] ([<0001b691fd98>] 0x1b691fd98)
> >>>>>>> [   21.455079]  [<0069c692>] blk_mq_run_hw_queue+0xba/0x100 
> >>>>>>> [   21.455083]  [<0069c740>] blk_mq_run_hw_queues+0x68/0x88 
> >>>>>>> [   21.455089]  [<0069b956>] 
> >>>>>>> __blk_mq_complete_request+0x11e/0x1d8 
> >>>>>>> [   21.455091]  [<0069ba9c>] 
> >>>>>>> blk_mq_complete_request+0x8c/0xc8 
> >>>>>>> [   21.455103]  [<008aa250>] dasd_block_tasklet+0x158/0x490 
> >>>>>>> [   21.455110]  [<0014c742>] tasklet_hi_action+0x92/0x120 
> >>>>>>> [   21.455118]  [<00a7cfc0>] __do_softirq+0x120/0x348 
> >>>>>>> [   21.455122]  [<0014c212>] irq_exit+0xba/0xd0 
> >>>>>>> [   21.455130]  [<0010bf92>] do_IRQ+0x8a/0x

Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-03-29 Thread Ming Lei
On Thu, Mar 29, 2018 at 05:52:16PM +0800, Ming Lei wrote:
> On Thu, Mar 29, 2018 at 09:23:10AM +0200, Christian Borntraeger wrote:
> > 
> > 
> > On 03/29/2018 04:00 AM, Ming Lei wrote:
> > > On Wed, Mar 28, 2018 at 05:36:53PM +0200, Christian Borntraeger wrote:
> > >>
> > >>
> > >> On 03/28/2018 05:26 PM, Ming Lei wrote:
> > >>> Hi Christian,
> > >>>
> > >>> On Wed, Mar 28, 2018 at 09:45:10AM +0200, Christian Borntraeger wrote:
> > >>>> FWIW, this patch does not fix the issue for me:
> > >>>>
> > >>>> ostname=? addr=? terminal=? res=success'
> > >>>> [   21.454961] WARNING: CPU: 3 PID: 1882 at block/blk-mq.c:1410 
> > >>>> __blk_mq_delay_run_hw_queue+0xbe/0xd8
> > >>>> [   21.454968] Modules linked in: scsi_dh_rdac scsi_dh_emc 
> > >>>> scsi_dh_alua dm_mirror dm_region_hash dm_log dm_multipath dm_mod 
> > >>>> autofs4
> > >>>> [   21.454984] CPU: 3 PID: 1882 Comm: dasdconf.sh Not tainted 
> > >>>> 4.16.0-rc7+ #26
> > >>>> [   21.454987] Hardware name: IBM 2964 NC9 704 (LPAR)
> > >>>> [   21.454990] Krnl PSW : c0131ea3 3ea2f7bf 
> > >>>> (__blk_mq_delay_run_hw_queue+0xbe/0xd8)
> > >>>> [   21.454996]R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 
> > >>>> CC:0 PM:0 RI:0 EA:3
> > >>>> [   21.455005] Krnl GPRS: 013abb69a000 013a 
> > >>>> 013ac6c0dc00 0001
> > >>>> [   21.455008] 013abb69a710 
> > >>>> 013a 0001b691fd98
> > >>>> [   21.455011]0001b691fd98 013ace4775c8 
> > >>>> 0001 
> > >>>> [   21.455014]013ac6c0dc00 00b47238 
> > >>>> 0001b691fc08 0001b691fbd0
> > >>>> [   21.455032] Krnl Code: 0069c596: ebaff0a4   lmg 
> > >>>> %r10,%r15,160(%r15)
> > >>>>   0069c59c: c0f47a5e   brcl
> > >>>> 15,68ba58
> > >>>>  #0069c5a2: a7f40001   brc 
> > >>>> 15,69c5a4
> > >>>>  >0069c5a6: e340f0c4   lg  
> > >>>> %r4,192(%r15)
> > >>>>   0069c5ac: ebaff0a4   lmg 
> > >>>> %r10,%r15,160(%r15)
> > >>>>   0069c5b2: 07f4   bcr 
> > >>>> 15,%r4
> > >>>>   0069c5b4: c0e5feea   brasl   
> > >>>> %r14,69c388
> > >>>>   0069c5ba: a7f4fff6   brc 
> > >>>> 15,69c5a6
> > >>>> [   21.455067] Call Trace:
> > >>>> [   21.455072] ([<0001b691fd98>] 0x1b691fd98)
> > >>>> [   21.455079]  [<0069c692>] blk_mq_run_hw_queue+0xba/0x100 
> > >>>> [   21.455083]  [<0069c740>] blk_mq_run_hw_queues+0x68/0x88 
> > >>>> [   21.455089]  [<0069b956>] 
> > >>>> __blk_mq_complete_request+0x11e/0x1d8 
> > >>>> [   21.455091]  [<0069ba9c>] blk_mq_complete_request+0x8c/0xc8 
> > >>>> [   21.455103]  [<008aa250>] dasd_block_tasklet+0x158/0x490 
> > >>>> [   21.455110]  [<0014c742>] tasklet_hi_action+0x92/0x120 
> > >>>> [   21.455118]  [<00a7cfc0>] __do_softirq+0x120/0x348 
> > >>>> [   21.455122]  [<0014c212>] irq_exit+0xba/0xd0 
> > >>>> [   21.455130]  [<0010bf92>] do_IRQ+0x8a/0xb8 
> > >>>> [   21.455133]  [<00a7c298>] io_int_handler+0x130/0x298 
> > >>>> [   21.455136] Last Breaking-Event-Address:
> > >>>> [   21.455138]  [<0069c5a2>] 
> > >>>> __blk_mq_delay_run_hw_queue+0xba/0xd8
> > >>>> [   21.455140] ---[ end trace be43f99a5d1e553e ]---
> > >>>> [   21.510046] dasdconf.sh Warning: 0.0.241e is already online, not 
> > >>>> configuring
> > >>>
> > >>> Thinking about this issue further, I can't understand the root c

Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-03-29 Thread Ming Lei
On Thu, Mar 29, 2018 at 09:23:10AM +0200, Christian Borntraeger wrote:
> 
> 
> On 03/29/2018 04:00 AM, Ming Lei wrote:
> > On Wed, Mar 28, 2018 at 05:36:53PM +0200, Christian Borntraeger wrote:
> >>
> >>
> >> On 03/28/2018 05:26 PM, Ming Lei wrote:
> >>> Hi Christian,
> >>>
> >>> On Wed, Mar 28, 2018 at 09:45:10AM +0200, Christian Borntraeger wrote:
> >>>> FWIW, this patch does not fix the issue for me:
> >>>>
> >>>> ostname=? addr=? terminal=? res=success'
> >>>> [   21.454961] WARNING: CPU: 3 PID: 1882 at block/blk-mq.c:1410 
> >>>> __blk_mq_delay_run_hw_queue+0xbe/0xd8
> >>>> [   21.454968] Modules linked in: scsi_dh_rdac scsi_dh_emc scsi_dh_alua 
> >>>> dm_mirror dm_region_hash dm_log dm_multipath dm_mod autofs4
> >>>> [   21.454984] CPU: 3 PID: 1882 Comm: dasdconf.sh Not tainted 
> >>>> 4.16.0-rc7+ #26
> >>>> [   21.454987] Hardware name: IBM 2964 NC9 704 (LPAR)
> >>>> [   21.454990] Krnl PSW : c0131ea3 3ea2f7bf 
> >>>> (__blk_mq_delay_run_hw_queue+0xbe/0xd8)
> >>>> [   21.454996]R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:0 
> >>>> PM:0 RI:0 EA:3
> >>>> [   21.455005] Krnl GPRS: 013abb69a000 013a 
> >>>> 013ac6c0dc00 0001
> >>>> [   21.455008] 013abb69a710 
> >>>> 013a 0001b691fd98
> >>>> [   21.455011]0001b691fd98 013ace4775c8 
> >>>> 0001 
> >>>> [   21.455014]013ac6c0dc00 00b47238 
> >>>> 0001b691fc08 0001b691fbd0
> >>>> [   21.455032] Krnl Code: 0069c596: ebaff0a4 lmg 
> >>>> %r10,%r15,160(%r15)
> >>>>   0069c59c: c0f47a5e brcl
> >>>> 15,68ba58
> >>>>  #0069c5a2: a7f40001 brc 
> >>>> 15,69c5a4
> >>>>  >0069c5a6: e340f0c4 lg  
> >>>> %r4,192(%r15)
> >>>>   0069c5ac: ebaff0a4 lmg 
> >>>> %r10,%r15,160(%r15)
> >>>>   0069c5b2: 07f4 bcr 15,%r4
> >>>>   0069c5b4: c0e5feea brasl   
> >>>> %r14,69c388
> >>>>   0069c5ba: a7f4fff6 brc 
> >>>> 15,69c5a6
> >>>> [   21.455067] Call Trace:
> >>>> [   21.455072] ([<0001b691fd98>] 0x1b691fd98)
> >>>> [   21.455079]  [<0069c692>] blk_mq_run_hw_queue+0xba/0x100 
> >>>> [   21.455083]  [<0069c740>] blk_mq_run_hw_queues+0x68/0x88 
> >>>> [   21.455089]  [<0069b956>] 
> >>>> __blk_mq_complete_request+0x11e/0x1d8 
> >>>> [   21.455091]  [<0069ba9c>] blk_mq_complete_request+0x8c/0xc8 
> >>>> [   21.455103]  [<008aa250>] dasd_block_tasklet+0x158/0x490 
> >>>> [   21.455110]  [<0014c742>] tasklet_hi_action+0x92/0x120 
> >>>> [   21.455118]  [<00a7cfc0>] __do_softirq+0x120/0x348 
> >>>> [   21.455122]  [<0014c212>] irq_exit+0xba/0xd0 
> >>>> [   21.455130]  [<0010bf92>] do_IRQ+0x8a/0xb8 
> >>>> [   21.455133]  [<00a7c298>] io_int_handler+0x130/0x298 
> >>>> [   21.455136] Last Breaking-Event-Address:
> >>>> [   21.455138]  [<0069c5a2>] 
> >>>> __blk_mq_delay_run_hw_queue+0xba/0xd8
> >>>> [   21.455140] ---[ end trace be43f99a5d1e553e ]---
> >>>> [   21.510046] dasdconf.sh Warning: 0.0.241e is already online, not 
> >>>> configuring
> >>>
> >>> Thinking about this issue further, I can't understand the root cause for
> >>> this issue.
> >>>
> >>> After commit 20e4d813931961fe ("blk-mq: simplify queue mapping & schedule 
> >>> with
> >>> each possisble CPU"), each hw queue should be mapped to at least one CPU, 
> >>> that
> >>> means this issue shouldn't happen. Maybe blk_mq_map_queues() works wrong?
> >>>
> >>> Could you dump 'lscpu' an

Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-03-29 Thread Ming Lei
On Thu, Mar 29, 2018 at 11:09:08AM +0200, Christian Borntraeger wrote:
> 
> 
> On 03/29/2018 09:23 AM, Christian Borntraeger wrote:
> > 
> > 
> > On 03/29/2018 04:00 AM, Ming Lei wrote:
> >> On Wed, Mar 28, 2018 at 05:36:53PM +0200, Christian Borntraeger wrote:
> >>>
> >>>
> >>> On 03/28/2018 05:26 PM, Ming Lei wrote:
> >>>> Hi Christian,
> >>>>
> >>>> On Wed, Mar 28, 2018 at 09:45:10AM +0200, Christian Borntraeger wrote:
> >>>>> FWIW, this patch does not fix the issue for me:
> >>>>>
> >>>>> ostname=? addr=? terminal=? res=success'
> >>>>> [   21.454961] WARNING: CPU: 3 PID: 1882 at block/blk-mq.c:1410 
> >>>>> __blk_mq_delay_run_hw_queue+0xbe/0xd8
> >>>>> [   21.454968] Modules linked in: scsi_dh_rdac scsi_dh_emc scsi_dh_alua 
> >>>>> dm_mirror dm_region_hash dm_log dm_multipath dm_mod autofs4
> >>>>> [   21.454984] CPU: 3 PID: 1882 Comm: dasdconf.sh Not tainted 
> >>>>> 4.16.0-rc7+ #26
> >>>>> [   21.454987] Hardware name: IBM 2964 NC9 704 (LPAR)
> >>>>> [   21.454990] Krnl PSW : c0131ea3 3ea2f7bf 
> >>>>> (__blk_mq_delay_run_hw_queue+0xbe/0xd8)
> >>>>> [   21.454996]R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:0 
> >>>>> PM:0 RI:0 EA:3
> >>>>> [   21.455005] Krnl GPRS: 013abb69a000 013a 
> >>>>> 013ac6c0dc00 0001
> >>>>> [   21.455008] 013abb69a710 
> >>>>> 013a 0001b691fd98
> >>>>> [   21.455011]0001b691fd98 013ace4775c8 
> >>>>> 0001 
> >>>>> [   21.455014]013ac6c0dc00 00b47238 
> >>>>> 0001b691fc08 0001b691fbd0
> >>>>> [   21.455032] Krnl Code: 0069c596: ebaff0a4lmg 
> >>>>> %r10,%r15,160(%r15)
> >>>>>   0069c59c: c0f47a5ebrcl
> >>>>> 15,68ba58
> >>>>>  #0069c5a2: a7f40001brc 
> >>>>> 15,69c5a4
> >>>>>  >0069c5a6: e340f0c4lg  
> >>>>> %r4,192(%r15)
> >>>>>   0069c5ac: ebaff0a4lmg 
> >>>>> %r10,%r15,160(%r15)
> >>>>>   0069c5b2: 07f4bcr 
> >>>>> 15,%r4
> >>>>>   0069c5b4: c0e5feeabrasl   
> >>>>> %r14,69c388
> >>>>>   0069c5ba: a7f4fff6brc 
> >>>>> 15,69c5a6
> >>>>> [   21.455067] Call Trace:
> >>>>> [   21.455072] ([<0001b691fd98>] 0x1b691fd98)
> >>>>> [   21.455079]  [<0069c692>] blk_mq_run_hw_queue+0xba/0x100 
> >>>>> [   21.455083]  [<0069c740>] blk_mq_run_hw_queues+0x68/0x88 
> >>>>> [   21.455089]  [<0069b956>] 
> >>>>> __blk_mq_complete_request+0x11e/0x1d8 
> >>>>> [   21.455091]  [<0069ba9c>] blk_mq_complete_request+0x8c/0xc8 
> >>>>> [   21.455103]  [<008aa250>] dasd_block_tasklet+0x158/0x490 
> >>>>> [   21.455110]  [<0014c742>] tasklet_hi_action+0x92/0x120 
> >>>>> [   21.455118]  [<00a7cfc0>] __do_softirq+0x120/0x348 
> >>>>> [   21.455122]  [<0014c212>] irq_exit+0xba/0xd0 
> >>>>> [   21.455130]  [<0010bf92>] do_IRQ+0x8a/0xb8 
> >>>>> [   21.455133]  [<00a7c298>] io_int_handler+0x130/0x298 
> >>>>> [   21.455136] Last Breaking-Event-Address:
> >>>>> [   21.455138]  [<0069c5a2>] 
> >>>>> __blk_mq_delay_run_hw_queue+0xba/0xd8
> >>>>> [   21.455140] ---[ end trace be43f99a5d1e553e ]---
> >>>>> [   21.510046] dasdconf.sh Warning: 0.0.241e is already online, not 
> >>>>> configuring
> >>>>
> >>>> Thinking about this issue further, I can't understand the root cause for
> >>>> this issue.
> 
> FWIW, Limiting CONFIG_NR_CPUS to 64 seems to make the problem go away.

I think the following patch is needed; this way at least aligns with the
mapping created via managed IRQs.

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 9f8cffc8a701..638ab5c11b3c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -14,13 +14,12 @@
 #include "blk.h"
 #include "blk-mq.h"
 
+/*
+ * Given there isn't a CPU hotplug handler in blk-mq, map all possible CPUs
+ * to queues even if a CPU isn't present yet.
+ */
 static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
 {
-   /*
-* Non present CPU will be mapped to queue index 0.
-*/
-   if (!cpu_present(cpu))
-   return 0;
return cpu % nr_queues;
 }
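
For illustration, a tiny standalone sketch (userspace C, not kernel code)
of the mapping this change produces, assuming 4 queues and 8 possible
CPUs:

	#include <stdio.h>

	int main(void)
	{
		const unsigned int nr_queues = 4, nr_possible_cpus = 8;

		/* every possible CPU gets a queue, present or not */
		for (unsigned int cpu = 0; cpu < nr_possible_cpus; cpu++)
			printf("cpu %u -> hctx %u\n", cpu, cpu % nr_queues);
		return 0;
	}

With the removed check, any non-present CPU would instead collapse onto
queue index 0.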

Thanks,
Ming


Re: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-03-28 Thread Ming Lei
Hi Christian,

On Wed, Mar 28, 2018 at 09:45:10AM +0200, Christian Borntraeger wrote:
> FWIW, this patch does not fix the issue for me:
> 
> ostname=? addr=? terminal=? res=success'
> [   21.454961] WARNING: CPU: 3 PID: 1882 at block/blk-mq.c:1410 
> __blk_mq_delay_run_hw_queue+0xbe/0xd8
> [   21.454968] Modules linked in: scsi_dh_rdac scsi_dh_emc scsi_dh_alua 
> dm_mirror dm_region_hash dm_log dm_multipath dm_mod autofs4
> [   21.454984] CPU: 3 PID: 1882 Comm: dasdconf.sh Not tainted 4.16.0-rc7+ #26
> [   21.454987] Hardware name: IBM 2964 NC9 704 (LPAR)
> [   21.454990] Krnl PSW : c0131ea3 3ea2f7bf 
> (__blk_mq_delay_run_hw_queue+0xbe/0xd8)
> [   21.454996]R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 
> RI:0 EA:3
> [   21.455005] Krnl GPRS: 013abb69a000 013a 013ac6c0dc00 
> 0001
> [   21.455008] 013abb69a710 013a 
> 0001b691fd98
> [   21.455011]0001b691fd98 013ace4775c8 0001 
> 
> [   21.455014]013ac6c0dc00 00b47238 0001b691fc08 
> 0001b691fbd0
> [   21.455032] Krnl Code: 0069c596: ebaff0a4  lmg 
> %r10,%r15,160(%r15)
>   0069c59c: c0f47a5e  brcl
> 15,68ba58
>  #0069c5a2: a7f40001  brc 
> 15,69c5a4
>  >0069c5a6: e340f0c4  lg  
> %r4,192(%r15)
>   0069c5ac: ebaff0a4  lmg 
> %r10,%r15,160(%r15)
>   0069c5b2: 07f4  bcr 15,%r4
>   0069c5b4: c0e5feea  brasl   
> %r14,69c388
>   0069c5ba: a7f4fff6  brc 
> 15,69c5a6
> [   21.455067] Call Trace:
> [   21.455072] ([<0001b691fd98>] 0x1b691fd98)
> [   21.455079]  [<0069c692>] blk_mq_run_hw_queue+0xba/0x100 
> [   21.455083]  [<0069c740>] blk_mq_run_hw_queues+0x68/0x88 
> [   21.455089]  [<0069b956>] __blk_mq_complete_request+0x11e/0x1d8 
> [   21.455091]  [<0069ba9c>] blk_mq_complete_request+0x8c/0xc8 
> [   21.455103]  [<008aa250>] dasd_block_tasklet+0x158/0x490 
> [   21.455110]  [<0014c742>] tasklet_hi_action+0x92/0x120 
> [   21.455118]  [<00a7cfc0>] __do_softirq+0x120/0x348 
> [   21.455122]  [<0014c212>] irq_exit+0xba/0xd0 
> [   21.455130]  [<0010bf92>] do_IRQ+0x8a/0xb8 
> [   21.455133]  [<00a7c298>] io_int_handler+0x130/0x298 
> [   21.455136] Last Breaking-Event-Address:
> [   21.455138]  [<0069c5a2>] __blk_mq_delay_run_hw_queue+0xba/0xd8
> [   21.455140] ---[ end trace be43f99a5d1e553e ]---
> [   21.510046] dasdconf.sh Warning: 0.0.241e is already online, not 
> configuring

Thinking about this issue further, I can't understand its root cause.

After commit 20e4d813931961fe ("blk-mq: simplify queue mapping & schedule
with each possisble CPU"), each hw queue should be mapped to at least one
CPU, which means this issue shouldn't happen. Maybe blk_mq_map_queues()
is doing the mapping wrong?

Could you dump 'lscpu' and provide the blk-mq debugfs info for your DASD
via the following command?

(cd /sys/kernel/debug/block/$DASD && find . -type f -exec grep -aH . {} \;)

Thanks,
Ming


Re: [PATCH 3/3] nvme-pci: Separate IO and admin queue IRQ vectors

2018-03-27 Thread Ming Lei
On Tue, Mar 27, 2018 at 09:39:08AM -0600, Keith Busch wrote:
> The admin and first IO queues shared the first irq vector, which has an
> affinity mask including cpu0. If a system allows cpu0 to be offlined,
> the admin queue may not be usable if no other CPUs in the affinity mask
> are online. This is a problem since unlike IO queues, there is only
> one admin queue that always needs to be usable.
> 
> To fix, this patch allocates one pre_vector for the admin queue that
> is assigned all CPUs, so will always be accessible. The IO queues are
> assigned the remaining managed vectors.
> 
> In case a controller has only one interrupt vector available, the admin
> and IO queues will share the pre_vector with all CPUs assigned.
> 
> Cc: Jianchao Wang <jianchao.w.w...@oracle.com>
> Cc: Ming Lei <ming@redhat.com>
> Signed-off-by: Keith Busch <keith.bu...@intel.com>
> ---
> v1 -> v2:
> 
>   Update to use new blk-mq API.
> 
>   Removed unnecessary braces, inline functions, and temp variables.
> 
>   Amended author (this has evolved significantly from the original).
> 
>  drivers/nvme/host/pci.c | 23 +--
>  1 file changed, 17 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index cc47fbe32ea5..50c8eaf51d92 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -84,6 +84,7 @@ struct nvme_dev {
>   struct dma_pool *prp_small_pool;
>   unsigned online_queues;
>   unsigned max_qid;
> + unsigned int num_vecs;
>   int q_depth;
>   u32 db_stride;
>   void __iomem *bar;
> @@ -414,7 +415,8 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
>  {
>   struct nvme_dev *dev = set->driver_data;
>  
> - return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), 0);
> + return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev),
> +  dev->num_vecs > 1);
>  }
>  
>  /**
> @@ -1455,7 +1457,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
>   nvmeq->sq_cmds_io = dev->cmb + offset;
>   }
>  
> - nvmeq->cq_vector = qid - 1;
> + /*
> +  * A queue's vector matches the queue identifier unless the controller
> +  * has only one vector available.
> +  */
> + nvmeq->cq_vector = dev->num_vecs == 1 ? 0 : qid;
>   result = adapter_alloc_cq(dev, qid, nvmeq);
>   if (result < 0)
>   goto release_vector;
> @@ -1909,6 +1915,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>   int result, nr_io_queues;
>   unsigned long size;
>  
> + struct irq_affinity affd = {
> + .pre_vectors = 1
> + };
> +
>   nr_io_queues = num_present_cpus();
>   result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
>   if (result < 0)
> @@ -1944,11 +1954,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>* setting up the full range we need.
>*/
>   pci_free_irq_vectors(pdev);
> - nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
> - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
> - if (nr_io_queues <= 0)
> + result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1,
> + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
> + if (result <= 0)
>   return -EIO;
> - dev->max_qid = nr_io_queues;
> + dev->num_vecs = result;
> + dev->max_qid = max(result - 1, 1);
>  
>   /*
>* Should investigate if there's a performance win from allocating
> -- 
> 2.14.3
> 

Reviewed-by: Ming Lei <ming@redhat.com>

-- 
Ming


Re: [PATCH 2/3] nvme-pci: Remove unused queue parameter

2018-03-27 Thread Ming Lei
On Tue, Mar 27, 2018 at 09:39:07AM -0600, Keith Busch wrote:
> All the queue memory is allocated up front. We don't take the node
> into consideration when creating queues anymore, so removing the unused
> parameter.
> 
> Signed-off-by: Keith Busch <keith.bu...@intel.com>
> Reviewed-by: Christoph Hellwig <h...@lst.de>
> ---
> v1 -> v2:
> 
>Added review.
> 
>  drivers/nvme/host/pci.c | 10 +++---
>  1 file changed, 3 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index e3b9efca0571..cc47fbe32ea5 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1379,8 +1379,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
>   return 0;
>  }
>  
> -static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
> - int depth, int node)
> +static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
>  {
>   struct nvme_queue *nvmeq = &dev->queues[qid];
>  
> @@ -1595,8 +1594,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
>   if (result < 0)
>   return result;
>  
> - result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
> - dev_to_node(dev->dev));
> + result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
>   if (result)
>   return result;
>  
> @@ -1629,9 +1627,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
>   int ret = 0;
>  
>   for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
> - /* vector == qid - 1, match nvme_create_queue */
> - if (nvme_alloc_queue(dev, i, dev->q_depth,
> -  pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
> +     if (nvme_alloc_queue(dev, i, dev->q_depth)) {
>   ret = -ENOMEM;
>   break;
>   }
> -- 
> 2.14.3
> 

Reviewed-by: Ming Lei <ming@redhat.com>

-- 
Ming


Re: [PATCH 1/3] blk-mq: Allow PCI vector offset for mapping queues

2018-03-27 Thread Ming Lei
On Tue, Mar 27, 2018 at 09:39:06AM -0600, Keith Busch wrote:
> The PCI interrupt vectors intended to be associated with a queue may
> not start at 0; a driver may allocate pre_vectors for special use. This
> patch adds an offset parameter so blk-mq may find the intended affinity
> mask and updates all drivers using this API accordingly.
> 
> Cc: Don Brace <don.br...@microsemi.com>
> Cc: <qla2xxx-upstr...@qlogic.com>
> Cc: <linux-s...@vger.kernel.org>
> Signed-off-by: Keith Busch <keith.bu...@intel.com>
> ---
> v1 -> v2:
> 
>   Update blk-mq API directly instead of chaining a default parameter to
>   a new API, and update all drivers accordingly.
> 
>  block/blk-mq-pci.c| 6 --
>  drivers/nvme/host/pci.c   | 2 +-
>  drivers/scsi/qla2xxx/qla_os.c | 2 +-
>  drivers/scsi/smartpqi/smartpqi_init.c | 2 +-
>  include/linux/blk-mq-pci.h| 3 ++-
>  5 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
> index 76944e3271bf..e233996bb76f 100644
> --- a/block/blk-mq-pci.c
> +++ b/block/blk-mq-pci.c
> @@ -21,6 +21,7 @@
>   * blk_mq_pci_map_queues - provide a default queue mapping for PCI device
>   * @set: tagset to provide the mapping for
>   * @pdev:PCI device associated with @set.
> + * @offset:  Offset to use for the pci irq vector
>   *
>   * This function assumes the PCI device @pdev has at least as many available
>   * interrupt vectors as @set has queues.  It will then query the vector
> @@ -28,13 +29,14 @@
>   * that maps a queue to the CPUs that have irq affinity for the corresponding
>   * vector.
>   */
> -int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev)
> +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
> + int offset)
>  {
>   const struct cpumask *mask;
>   unsigned int queue, cpu;
>  
>   for (queue = 0; queue < set->nr_hw_queues; queue++) {
> - mask = pci_irq_get_affinity(pdev, queue);
> + mask = pci_irq_get_affinity(pdev, queue + offset);
>   if (!mask)
>   goto fallback;
>  
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index cef5ce851a92..e3b9efca0571 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -414,7 +414,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
>  {
>   struct nvme_dev *dev = set->driver_data;
>  
> - return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev));
> + return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), 0);
>  }
>  
>  /**
> diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
> index 12ee6e02d146..2c705f3dd265 100644
> --- a/drivers/scsi/qla2xxx/qla_os.c
> +++ b/drivers/scsi/qla2xxx/qla_os.c
> @@ -6805,7 +6805,7 @@ static int qla2xxx_map_queues(struct Scsi_Host *shost)
>   if (USER_CTRL_IRQ(vha->hw))
>   rc = blk_mq_map_queues(&shost->tag_set);
>   else
> - rc = blk_mq_pci_map_queues(&shost->tag_set, vha->hw->pdev);
> + rc = blk_mq_pci_map_queues(&shost->tag_set, vha->hw->pdev, 0);
>   return rc;
>  }
>  
> diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
> index b2880c7709e6..10c94011c8a8 100644
> --- a/drivers/scsi/smartpqi/smartpqi_init.c
> +++ b/drivers/scsi/smartpqi/smartpqi_init.c
> @@ -5348,7 +5348,7 @@ static int pqi_map_queues(struct Scsi_Host *shost)
>  {
>   struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost);
>  
> - return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev);
> + return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev, 0);
>  }
>  
>  static int pqi_getpciinfo_ioctl(struct pqi_ctrl_info *ctrl_info,
> diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h
> index 6338551e0fb9..9f4c17f0d2d8 100644
> --- a/include/linux/blk-mq-pci.h
> +++ b/include/linux/blk-mq-pci.h
> @@ -5,6 +5,7 @@
>  struct blk_mq_tag_set;
>  struct pci_dev;
>  
> -int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev);
> +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
> +   int offset);
>  
>  #endif /* _LINUX_BLK_MQ_PCI_H */
> -- 
> 2.14.3
> 

Reviewed-by: Ming Lei <ming@redhat.com>

-- 
Ming


[PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

2018-03-27 Thread Ming Lei
From commit 20e4d813931961fe ("blk-mq: simplify queue mapping & schedule
with each possisble CPU") on, it is easier to end up with an unmapped hctx
in some CPU topologies; for example, an hctx may not be mapped to any CPU.

This patch avoids the warning in __blk_mq_delay_run_hw_queue() by
checking whether the hctx is mapped in blk_mq_run_hw_queues().

blk_mq_run_hw_queues() is often run in SCSI or some driver's completion
path, so this warning has to be addressed.
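
For reference, blk_mq_hw_queue_mapped() is roughly the following helper
from block/blk-mq.h (quoted from memory, so please double-check your
tree); an hctx without any ctx mapped to it fails this test:

	static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
	{
		return hctx->nr_ctx && hctx->tags;
	}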

Reported-by: Stefan Haberland <s...@linux.vnet.ibm.com>
Cc: Christoph Hellwig <h...@lst.de>
Fixes: 20e4d813931961fe ("blk-mq: simplify queue mapping & schedule with each possisble CPU")
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 16e83e6df404..48f25a63833b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1459,7 +1459,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
int i;
 
queue_for_each_hw_ctx(q, hctx, i) {
-   if (blk_mq_hctx_stopped(hctx))
+   if (blk_mq_hctx_stopped(hctx) || !blk_mq_hw_queue_mapped(hctx))
continue;
 
blk_mq_run_hw_queue(hctx, async);
-- 
2.9.5



Re: 4.16-RC7 WARNING: CPU: 2 PID: 0 at block/blk-mq.c:1400 __blk_mq_delay_run_hw_queue

2018-03-27 Thread Ming Lei
Hi Stefan,

On Tue, Mar 27, 2018 at 12:04:20PM +0200, Stefan Haberland wrote:
> Hi,
> 
> I get the following warning in __blk_mq_delay_run_hw_queue when the
> scheduler is set to mq-deadline for DASD devices on s390.
> 
> What I see is that for whatever reason there is a hctx nr 0 which has no
> hctx->tags pointer set.
> From my observation it is always hctx nr 0 which has a tags NULL pointer in
> it and I see other hctx which have the hctx->tags pointer set correctly.
> 
> 
> [    2.169986] WARNING: CPU: 0 PID: 0 at block/blk-mq.c:1402
> __blk_mq_delay_run_hw_queue+0xe8/0x118
> [    2.170007] Modules linked in:
> [    2.170014] CPU: 0 PID: 0 Comm: swapper/0 Not tainted
> 4.16.0-rc7-04107-g91a05d9e1d6b-dirty #147
> [    2.170019] Hardware name: IBM 2964 N96 702 (z/VM 6.4.0)
> [    2.170024] Krnl PSW : 76fd6c7f c244c24d
> (__blk_mq_delay_run_hw_queue+0xe8/0x118)
> [    2.170035]    R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0
> RI:0 EA:3
> [    2.170041] Krnl GPRS: 599ec58a 02a94000 02a94000
> 0001
> [    2.170047]     6e761e98 02a96980
> 02a96800
> [    2.170052]    02d87ce0 737fbda8 
> 0001
> [    2.170058]    0001 00aedd10 737fbc38
> 737fbc00
> [    2.170069] Krnl Code: 006ea3c8: ebaff0a4 lmg
> %r10,%r15,160(%r15)
>   006ea3ce: c0f45e0d brcl    15,6d5fe8
>  #006ea3d4: a7f40001 brc 15,6ea3d6
>  >006ea3d8: e340f0c4    lg 
> %r4,192(%r15)
>   006ea3de: ebaff0a4 lmg
> %r10,%r15,160(%r15)
>   006ea3e4: 07f4 bcr 15,%r4
>   006ea3e6: 41b01100 la  %r11,256(%r1)
>   006ea3ea: 182a lr  %r2,%r10
> [    2.170158] Call Trace:
> [    2.170205] ([<02a96800>] 0x2a96800)
> [    2.170248]  [<006ea4c0>] blk_mq_run_hw_queue+0xa0/0x100
> [    2.170262]  [<006ea59c>] blk_mq_run_hw_queues+0x7c/0x98
> [    2.170295]  [<006e88f6>] __blk_mq_complete_request+0x10e/0x1e0
> [    2.170300]  [<006e9e30>] blk_mq_complete_request+0x80/0xa0
> [    2.170307]  [<0087fad0>] dasd_block_tasklet+0x218/0x480
> [    2.170415]  [<0017c3f8>] tasklet_hi_action+0xa0/0x138
> [    2.170434]  [<00a91c10>] __do_softirq+0xc8/0x540
> [    2.170471]  [<0017bd4e>] irq_exit+0x136/0x140
> [    2.170478]  [<0010c912>] do_IRQ+0x8a/0xb8
> [    2.170518]  [<00a90ee0>] io_int_handler+0x138/0x2e0
> [    2.170524]  [<00102cd0>] enabled_wait+0x58/0x128
> [    2.170562] ([<00102cb8>] enabled_wait+0x40/0x128)
> [    2.170577]  [<0010319a>] arch_cpu_idle+0x32/0x48
> [    2.170604]  [<00a8f636>] default_idle_call+0x3e/0x58
> [    2.170613]  [<001cd5d2>] do_idle+0xda/0x190
> [    2.170621]  [<001cd93e>] cpu_startup_entry+0x3e/0x48
> [    2.170633]  [<00e5ebf4>] start_kernel+0x47c/0x490
> [    2.170641]  [<00100020>] _stext+0x20/0x80
> [    2.170650] 2 locks held by swapper/0/0:
> [    2.170658]  #0:  (&(>lock)->rlock){..-.}, at: [<b45eaf9e>]
> dasd_block_tasklet+0x1cc/0x480
> [    2.170676]  #1:  (rcu_read_lock){}, at: [<bc7fa045>]
> hctx_lock+0x34/0x110
> [    2.170750] Last Breaking-Event-Address:
> [    2.170758]  [<006ea3d4>] __blk_mq_delay_run_hw_queue+0xe4/0x118
> [    2.170803] ---[ end trace 1073cf0de1fd32d0 ]---

This warning is harmless, please try the following patch:

--
From 7b2b5139bfef80f44d1b1424e09ab35b715fbfdb Mon Sep 17 00:00:00 2001
From: Ming Lei <ming@redhat.com>
Date: Tue, 27 Mar 2018 19:54:23 +0800
Subject: [PATCH] blk-mq: only run mapped hw queues in blk_mq_run_hw_queues()

From commit 20e4d813931961fe ("blk-mq: simplify queue mapping & schedule
with each possisble CPU") on, it is easier to end up with an unmapped hctx
in some CPU topologies; for example, an hctx may not be mapped to any CPU.

This patch avoids the warning in __blk_mq_delay_run_hw_queue() by
checking if the hctx is mapped in blk_mq_run_hw_queues().

Reported-by: Stefan Haberland <s...@linux.vnet.ibm.com>
Cc: Christoph Hellwig <h...@lst.de>
Fixes: 20e4d813931961fe ("blk-mq: simplify queue mapping & schedule with each possisble CPU")
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq.c | 2 +-

Re: [PATCH 1/3] blk-mq: Allow PCI vector offset for mapping queues

2018-03-25 Thread Ming Lei
On Fri, Mar 23, 2018 at 04:19:21PM -0600, Keith Busch wrote:
> The PCI interrupt vectors intended to be associated with a queue may
> not start at 0. This patch adds an offset parameter so blk-mq may find
> the intended affinity mask. The default value is 0 so existing drivers
> that don't care about this parameter don't need to change.
> 
> Signed-off-by: Keith Busch 
> ---
>  block/blk-mq-pci.c | 12 ++--
>  include/linux/blk-mq-pci.h |  2 ++
>  2 files changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
> index 76944e3271bf..1040a7705c13 100644
> --- a/block/blk-mq-pci.c
> +++ b/block/blk-mq-pci.c
> @@ -21,6 +21,7 @@
>   * blk_mq_pci_map_queues - provide a default queue mapping for PCI device
>   * @set: tagset to provide the mapping for
>   * @pdev:PCI device associated with @set.
> + * @offset:  PCI irq starting vector offset
>   *
>   * This function assumes the PCI device @pdev has at least as many available
>   * interrupt vectors as @set has queues.  It will then query the vector
> @@ -28,13 +29,14 @@
>   * that maps a queue to the CPUs that have irq affinity for the corresponding
>   * vector.
>   */
> -int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev)
> +int __blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
> + int offset)
>  {
>   const struct cpumask *mask;
>   unsigned int queue, cpu;
>  
>   for (queue = 0; queue < set->nr_hw_queues; queue++) {
> - mask = pci_irq_get_affinity(pdev, queue);
> + mask = pci_irq_get_affinity(pdev, queue + offset);
>   if (!mask)
>   goto fallback;
>  
> @@ -50,4 +52,10 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev)
>   set->mq_map[cpu] = 0;
>   return 0;
>  }
> +EXPORT_SYMBOL_GPL(__blk_mq_pci_map_queues);
> +
> +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev)
> +{
> + return __blk_mq_pci_map_queues(set, pdev, 0);
> +}
>  EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
> diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h
> index 6338551e0fb9..5a92ecdbd78e 100644
> --- a/include/linux/blk-mq-pci.h
> +++ b/include/linux/blk-mq-pci.h
> @@ -5,6 +5,8 @@
>  struct blk_mq_tag_set;
>  struct pci_dev;
>  
> +int __blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
> + int offset);
>  int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev);
>  
>  #endif /* _LINUX_BLK_MQ_PCI_H */
> -- 
> 2.14.3
> 

Given there aren't many callers of blk_mq_pci_map_queues(), I suggest
adding the 'offset' parameter to this API directly; then people will keep
the '.pre_vectors' stuff in mind and avoid misusing it.
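
For example, with the 'offset' parameter folded into the API (as the v2
posting above does), a driver reserving one pre_vector for its admin
queue would map its IO queues like this (sketch):

	/* IO queue vectors start right after the reserved admin vector */
	return blk_mq_pci_map_queues(set, pdev, 1);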

Thanks,
Ming


Re: [PATCH 2/3] nvme-pci: Remove unused queue parameter

2018-03-25 Thread Ming Lei
On Fri, Mar 23, 2018 at 04:19:22PM -0600, Keith Busch wrote:
> All nvme queue memory is allocated up front. We don't take the node
> into consideration when creating queues anymore, so removing the unused
> parameter.
> 
> Signed-off-by: Keith Busch 
> ---
>  drivers/nvme/host/pci.c | 10 +++---
>  1 file changed, 3 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index cef5ce851a92..632166f7d8f2 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1379,8 +1379,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
>   return 0;
>  }
>  
> -static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
> - int depth, int node)
> +static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
>  {
>   struct nvme_queue *nvmeq = &dev->queues[qid];
>  
> @@ -1595,8 +1594,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
>   if (result < 0)
>   return result;
>  
> - result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
> - dev_to_node(dev->dev));
> + result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
>   if (result)
>   return result;
>  
> @@ -1629,9 +1627,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
>   int ret = 0;
>  
>   for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
> - /* vector == qid - 1, match nvme_create_queue */
> - if (nvme_alloc_queue(dev, i, dev->q_depth,
> -  pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
> + if (nvme_alloc_queue(dev, i, dev->q_depth)) {
>   ret = -ENOMEM;
>   break;
>   }

nvme_create_io_queues() is called after pci_alloc_irq_vectors() returns,
so the above pci_irq_get_node() should return the correct node info,
right?

Thanks,
Ming


Re: [PATCH V5 1/5] scsi: hpsa: fix selection of reply queue

2018-03-19 Thread Ming Lei
On Mon, Mar 19, 2018 at 04:42:09PM +0200, Artem Bityutskiy wrote:
> On Mon, 2018-03-19 at 08:31 -0600, Jens Axboe wrote:
> > I'm assuming that Martin will eventually queue this up. But probably
> > for 4.17, then we can always flag it for a backport to stable once
> > it's been thoroughly tested.
> 
> Jens, thanks for reply.
> 
> I wonder if folks agree that in this case we should revert 
> 
> 84676c1f21e8 genirq/affinity: assign vectors to all possible CPUs
> 
> for v4.16.

Even if 84676c1f21e8 is reverted, IO failures or hangs can still be
triggered easily when doing CPU online/offline tests.

So this patchset is really needed.

Thanks,
Ming


Re: [Possible REGRESSION, 4.16-rc4] Error updating SMART data during runtime and could not connect to lvmetad at some boot attempts

2018-03-13 Thread Ming Lei
On Tue, Mar 13, 2018 at 02:08:23PM +0100, Martin Steigerwald wrote:
> Hans de Goede - 11.03.18, 15:37:
> > Hi Martin,
> > 
> > On 11-03-18 09:20, Martin Steigerwald wrote:
> > > Hello.
> > > 
> > > Since 4.16-rc4 (upgraded from 4.15.2 which worked) I have an issue
> > > with SMART checks occassionally failing like this:
> > > 
> > > smartd[28017]: Device: /dev/sdb [SAT], is in SLEEP mode, suspending checks
> > > udisksd[24408]: Error performing housekeeping for drive
> > > /org/freedesktop/UDisks2/drives/INTEL_SSDSA2CW300G3_[…]: Error updating
> > > SMART data: Error sending ATA command CHECK POWER MODE: Unexpected sense
> > > data returned:#012: 0e 09 0c 00  00 00 ff 00  00 00 00 00  00 00 50
> > > 00..P.#0120010: 00 00 00 00  00 00 00 00  00 00 00 00  00
> > > 00 00 00#012 (g-io-error-quark, 0) merkaba
> > > udisksd[24408]: Error performing housekeeping for drive
> > > /org/freedesktop/UDisks2/drives/Crucial_CT480M500SSD3_[…]: Error updating
> > > SMART dat a: Error sending ATA command CHECK POWER MODE: Unexpected sense
> > > data returned:#012: 01 00 1d 00  00 00 0e 09  0c 00 00 00  ff 00 00
> > > 00#0120010: 00 0 0 00 00  50 00 00 00  00 00 00 00 
> > > 00 00 00 00P...#012 (g-io-error-quark, 0)
> > > 
> > > (Intel SSD is connected via SATA, Crucial via mSATA in a ThinkPad T520)
> > > 
> > > However when I then check manually with smartctl -a | -x | -H the device
> > > reports SMART data just fine.
> > > 
> > > As smartd correctly detects that device is in sleep mode, this may be an
> > > userspace issue in udisksd.
> > > 
> > > Also at some boot attempts the boot hangs with a message like "could not
> > > connect to lvmetad, scanning manually for devices". I use BTRFS RAID 1
> > > on to LVs (each on one of the SSDs). A configuration that requires a
> > > manual
> > > adaption to InitRAMFS in order to boot (basically vgchange -ay before
> > > btrfs device scan).
> > > 
> > > I wonder whether that has to do with the new SATA LPM policy stuff, but as
> > > I had issues with
> > > 
> > >   3 => Medium power with Device Initiated PM enabled
> > > 
> > > (machine did not boot, which could also have been caused by me
> > > accidentally
> > > removing all TCP/IP network support in the kernel with that setting)
> > > 
> > > I set it back to
> > > 
> > > CONFIG_SATA_MOBILE_LPM_POLICY=0
> > > 
> > > (firmware settings)
> > 
> > Right, so at that settings the LPM policy changes are effectively
> > disabled and cannot explain your SMART issues.
> 
> Yes, I now good a photo of one of those boot failures I mentioned, at it 
> seems 
> to be related to blk-mq, as the backtrace contains "blk_mq_terminate_expired".
> 
> I add the screenshot to my bug report.
> 
> [Possible REGRESSION, 4.16-rc4] Error updating SMART data during runtime and 
> boot failures with blk_mq_terminate_expired in backtrace
> https://bugzilla.kernel.org/show_bug.cgi?id=199077
> 
> Hans, I will test your LPM policy horkage for Crucial m500 patch at a later 
> time. I first wanted to add the photo of the boot failure to the bug report.
> 
> Ming and Bart, I added you to cc, cause I had to do with you about another 
> blk-mq report, please feel free to adapt.

It looks like the RIP points to scsi_times_out+0x17/0x1d0; maybe a SCSI regression?

Thanks,
Ming


[PATCH V5 4/5] scsi: virtio_scsi: fix IO hang caused by irq vector automatic affinity

2018-03-13 Thread Ming Lei
Now that 84676c1f21e8ff5 ("genirq/affinity: assign vectors to all possible
CPUs") has been merged into v4.16-rc, it is easy for some irq vectors to
be assigned only offline CPUs; this can't be avoided even though the
allocation has been improved.

For example, on an 8-core VM where CPUs 4-7 are not present/offline and
virtio-scsi has 4 queues, the assigned irq affinity can take the
following shape:

irq 36, cpu list 0-7
irq 37, cpu list 0-7
irq 38, cpu list 0-7
irq 39, cpu list 0-1
irq 40, cpu list 4,6
irq 41, cpu list 2-3
irq 42, cpu list 5,7

An IO hang is then triggered in the non-SCSI_MQ case.

Given storage IO always follows a client/server model, there is no such
issue with SCSI_MQ (blk-mq), because no IO can be submitted to a hw queue
if the hw queue isn't mapped to any online CPU.

Fix this issue by forcing virtio-scsi to use blk-mq.

BTW, I have been using virtio-scsi (scsi_mq) for several years, and it
has been quite stable, so this shouldn't add extra risk.

Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Mike Snitzer <snit...@redhat.com>
Cc: Laurence Oberman <lober...@redhat.com>
Reviewed-by: Hannes Reinecke <h...@suse.de>
Acked-by: Paolo Bonzini <pbonz...@redhat.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible CPUs")
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/virtio_scsi.c | 59 +++---
 1 file changed, 3 insertions(+), 56 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 7c28e8d4955a..54e3a0f6844c 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -91,9 +91,6 @@ struct virtio_scsi_vq {
 struct virtio_scsi_target_state {
seqcount_t tgt_seq;
 
-   /* Count of outstanding requests. */
-   atomic_t reqs;
-
/* Currently active virtqueue for requests sent to this target. */
struct virtio_scsi_vq *req_vq;
 };
@@ -152,8 +149,6 @@ static void virtscsi_complete_cmd(struct virtio_scsi 
*vscsi, void *buf)
struct virtio_scsi_cmd *cmd = buf;
struct scsi_cmnd *sc = cmd->sc;
struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
-   struct virtio_scsi_target_state *tgt =
-   scsi_target(sc->device)->hostdata;
 
dev_dbg(&sc->device->sdev_gendev,
"cmd %p response %u status %#02x sense_len %u\n",
@@ -210,8 +205,6 @@ static void virtscsi_complete_cmd(struct virtio_scsi 
*vscsi, void *buf)
}
 
sc->scsi_done(sc);
-
-   atomic_dec(&tgt->reqs);
 }
 
 static void virtscsi_vq_done(struct virtio_scsi *vscsi,
@@ -580,10 +573,7 @@ static int virtscsi_queuecommand_single(struct Scsi_Host 
*sh,
struct scsi_cmnd *sc)
 {
struct virtio_scsi *vscsi = shost_priv(sh);
-   struct virtio_scsi_target_state *tgt =
-   scsi_target(sc->device)->hostdata;
 
-   atomic_inc(&tgt->reqs);
return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
 }
 
@@ -596,55 +586,11 @@ static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct 
virtio_scsi *vscsi,
return &vscsi->req_vqs[hwq];
 }
 
-static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
-  struct virtio_scsi_target_state 
*tgt)
-{
-   struct virtio_scsi_vq *vq;
-   unsigned long flags;
-   u32 queue_num;
-
-   local_irq_save(flags);
-   if (atomic_inc_return(&tgt->reqs) > 1) {
-   unsigned long seq;
-
-   do {
-   seq = read_seqcount_begin(&tgt->tgt_seq);
-   vq = tgt->req_vq;
-   } while (read_seqcount_retry(&tgt->tgt_seq, seq));
-   } else {
-   /* no writes can be concurrent because of atomic_t */
-   write_seqcount_begin(&tgt->tgt_seq);
-
-   /* keep previous req_vq if a reader just arrived */
-   if (unlikely(atomic_read(&tgt->reqs) > 1)) {
-   vq = tgt->req_vq;
-   goto unlock;
-   }
-
-   queue_num = smp_processor_id();
-   while (unlikely(queue_num >= vscsi->num_queues))
-   queue_num -= vscsi->num_queues;
-   tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
- unlock:
-   write_seqcount_end(&tgt->tgt_seq);
-   }
-   local_irq_restore(flags);
-
-   return vq;
-}
-
 static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
 

[PATCH V5 2/5] scsi: megaraid_sas: fix selection of reply queue

2018-03-13 Thread Ming Lei
From 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs"),
one msix vector can be created without any online CPU mapped; a command
may then be queued and never notified of its completion.

This patch sets up a mapping between cpu and reply queue according to the
irq affinity info retrieved by pci_irq_get_affinity(), and uses this info
to choose the reply queue when queuing a command.

The chosen reply queue is then guaranteed to be active, which fixes the IO
hang caused by using an inactive reply queue that has no online CPU mapped.

Cc: Hannes Reinecke <h...@suse.de>
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Laurence Oberman <lober...@redhat.com>
Cc: Mike Snitzer <snit...@redhat.com>
Cc: Meelis Roos <mr...@linux.ee>
Cc: Artem Bityutskiy <artem.bityuts...@intel.com>
Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible CPUs")
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/megaraid/megaraid_sas.h|  1 +
 drivers/scsi/megaraid/megaraid_sas_base.c   | 39 ++---
 drivers/scsi/megaraid/megaraid_sas_fusion.c | 12 +++--
 3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/drivers/scsi/megaraid/megaraid_sas.h 
b/drivers/scsi/megaraid/megaraid_sas.h
index ba6503f37756..27fab8235ea5 100644
--- a/drivers/scsi/megaraid/megaraid_sas.h
+++ b/drivers/scsi/megaraid/megaraid_sas.h
@@ -2128,6 +2128,7 @@ enum MR_PD_TYPE {
 
 struct megasas_instance {
 
+   unsigned int *reply_map;
__le32 *producer;
dma_addr_t producer_h;
__le32 *consumer;
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c 
b/drivers/scsi/megaraid/megaraid_sas_base.c
index a71ee67df084..dde0798b8a91 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -5165,6 +5165,26 @@ megasas_setup_jbod_map(struct megasas_instance *instance)
instance->use_seqnum_jbod_fp = false;
 }
 
+static void megasas_setup_reply_map(struct megasas_instance *instance)
+{
+   const struct cpumask *mask;
+   unsigned int queue, cpu;
+
+   for (queue = 0; queue < instance->msix_vectors; queue++) {
+   mask = pci_irq_get_affinity(instance->pdev, queue);
+   if (!mask)
+   goto fallback;
+
+   for_each_cpu(cpu, mask)
+   instance->reply_map[cpu] = queue;
+   }
+   return;
+
+fallback:
+   for_each_possible_cpu(cpu)
+   instance->reply_map[cpu] = cpu % instance->msix_vectors;
+}
+
 /**
  * megasas_init_fw -   Initializes the FW
  * @instance:  Adapter soft state
@@ -5343,6 +5363,8 @@ static int megasas_init_fw(struct megasas_instance 
*instance)
goto fail_setup_irqs;
}
 
+   megasas_setup_reply_map(instance);
+
dev_info(&instance->pdev->dev,
"firmware supports msix\t: (%d)", fw_msix_count);
dev_info(&instance->pdev->dev,
@@ -6123,20 +6145,29 @@ static inline int megasas_alloc_mfi_ctrl_mem(struct 
megasas_instance *instance)
  */
 static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
 {
+   instance->reply_map = kzalloc(sizeof(unsigned int) * nr_cpu_ids,
+ GFP_KERNEL);
+   if (!instance->reply_map)
+   return -ENOMEM;
+
switch (instance->adapter_type) {
case MFI_SERIES:
if (megasas_alloc_mfi_ctrl_mem(instance))
-   return -ENOMEM;
+   goto fail;
break;
case VENTURA_SERIES:
case THUNDERBOLT_SERIES:
case INVADER_SERIES:
if (megasas_alloc_fusion_context(instance))
-   return -ENOMEM;
+   goto fail;
break;
}
 
return 0;
+ fail:
+   kfree(instance->reply_map);
+   instance->reply_map = NULL;
+   return -ENOMEM;
 }
 
 /*
@@ -6148,6 +6179,7 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance 
*instance)
  */
 static inline void megasas_free_ctrl_mem(struct megasas_instance *instance)
 {
+   kfree(instance->reply_map);
if (instance->adapter_type == MFI_SERIES) {
if (instance->producer)
pci_free_consistent(instance->pdev, sizeof(u32),
@@ -6540,7 +6572,6 @@ static int megasas_probe_one(struct pci_dev *pdev,
pci_free_irq_vectors(instance->pdev);
 fail_init_mfi:
scsi_host_put(host);
-
 fail_alloc_instance:
pci_disable_device(pdev);
 
@@ -6746,6 +6777,8 @@ megasas_resume(struct pci_dev *pdev)
if (rval < 0)
   

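The megaraid_sas_fusion.c hunk is cut off above. For orientation, a hedged
sketch of the consumer side -- not the verbatim fusion hunk: instead of
picking the reply queue round-robin over all MSI-x vectors, submission
looks it up through the per-CPU table built by megasas_setup_reply_map(),
mirroring the hpsa change quoted later in this digest. The helper name
below is illustrative only:

	/* sketch: reply queue selection after this patch */
	static u32 sketch_pick_reply_queue(struct megasas_instance *instance)
	{
		/* reply_map[] was filled from pci_irq_get_affinity() above */
		return instance->reply_map[raw_smp_processor_id()];
	}
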
[PATCH V5 5/5] scsi: virtio_scsi: unify scsi_host_template

2018-03-13 Thread Ming Lei
Now that we always switch to use_blk_mq, both the single-queue and
multi-queue cases can be handled in one .queuecommand callback, and it is
no longer necessary to use two scsi_host_templates.

Suggested-by: Christoph Hellwig <h...@lst.de>,
Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Mike Snitzer <snit...@redhat.com>
Cc: Laurence Oberman <lober...@redhat.com>
Cc: Hannes Reinecke <h...@suse.de>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/virtio_scsi.c | 74 ++
 1 file changed, 15 insertions(+), 59 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 54e3a0f6844c..45d04631888a 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -522,11 +522,20 @@ static void virtio_scsi_init_hdr_pi(struct virtio_device 
*vdev,
 }
 #endif
 
-static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
-struct virtio_scsi_vq *req_vq,
+static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct virtio_scsi *vscsi,
+ struct scsi_cmnd *sc)
+{
+   u32 tag = blk_mq_unique_tag(sc->request);
+   u16 hwq = blk_mq_unique_tag_to_hwq(tag);
+
+   return &vscsi->req_vqs[hwq];
+}
+
+static int virtscsi_queuecommand(struct Scsi_Host *shost,
 struct scsi_cmnd *sc)
 {
-   struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
+   struct virtio_scsi *vscsi = shost_priv(shost);
+   struct virtio_scsi_vq *req_vq = virtscsi_pick_vq_mq(vscsi, sc);
struct virtio_scsi_cmd *cmd = scsi_cmd_priv(sc);
unsigned long flags;
int req_size;
@@ -569,32 +578,6 @@ static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
return 0;
 }
 
-static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
-   struct scsi_cmnd *sc)
-{
-   struct virtio_scsi *vscsi = shost_priv(sh);
-
-   return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
-}
-
-static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct virtio_scsi *vscsi,
- struct scsi_cmnd *sc)
-{
-   u32 tag = blk_mq_unique_tag(sc->request);
-   u16 hwq = blk_mq_unique_tag_to_hwq(tag);
-
-   return &vscsi->req_vqs[hwq];
-}
-
-static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
-  struct scsi_cmnd *sc)
-{
-   struct virtio_scsi *vscsi = shost_priv(sh);
-   struct virtio_scsi_vq *req_vq = virtscsi_pick_vq_mq(vscsi, sc);
-
-   return virtscsi_queuecommand(vscsi, req_vq, sc);
-}
-
 static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
 {
DECLARE_COMPLETION_ONSTACK(comp);
@@ -750,34 +733,13 @@ static enum blk_eh_timer_return 
virtscsi_eh_timed_out(struct scsi_cmnd *scmnd)
return BLK_EH_RESET_TIMER;
 }
 
-static struct scsi_host_template virtscsi_host_template_single = {
-   .module = THIS_MODULE,
-   .name = "Virtio SCSI HBA",
-   .proc_name = "virtio_scsi",
-   .this_id = -1,
-   .cmd_size = sizeof(struct virtio_scsi_cmd),
-   .queuecommand = virtscsi_queuecommand_single,
-   .change_queue_depth = virtscsi_change_queue_depth,
-   .eh_abort_handler = virtscsi_abort,
-   .eh_device_reset_handler = virtscsi_device_reset,
-   .eh_timed_out = virtscsi_eh_timed_out,
-   .slave_alloc = virtscsi_device_alloc,
-
-   .dma_boundary = UINT_MAX,
-   .use_clustering = ENABLE_CLUSTERING,
-   .target_alloc = virtscsi_target_alloc,
-   .target_destroy = virtscsi_target_destroy,
-   .track_queue_depth = 1,
-   .force_blk_mq = 1,
-};
-
-static struct scsi_host_template virtscsi_host_template_multi = {
+static struct scsi_host_template virtscsi_host_template = {
.module = THIS_MODULE,
.name = "Virtio SCSI HBA",
.proc_name = "virtio_scsi",
.this_id = -1,
.cmd_size = sizeof(struct virtio_scsi_cmd),
-   .queuecommand = virtscsi_queuecommand_multi,
+   .queuecommand = virtscsi_queuecommand,
.change_queue_depth = virtscsi_change_queue_depth,
.eh_abort_handler = virtscsi_abort,
.eh_device_reset_handler = virtscsi_device_reset,
@@ -883,7 +845,6 @@ static int virtscsi_probe(struct virtio_device *vdev)
u32 sg_elems, num_targets;
u32 cmd_per_lun;
u32 num_queues;
-   struct scsi_host_template *hostt;
 
if (!vdev->config->get) {
dev_err(>dev, "%s failure: config access disabled\n",
@@ -896,12 +857,7 @@ static int virtscsi_probe(struct virtio_device *vdev)
 
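The probe hunk is truncated above. As background for virtscsi_pick_vq_mq():
blk_mq_unique_tag() packs the hw queue index into the upper 16 bits of the
tag, so the hwq can be recovered without any per-target state. A minimal
sketch of the decode, equivalent to what blk_mq_unique_tag_to_hwq() does:

	#include <linux/blk-mq.h>

	static u16 sketch_unique_tag_to_hwq(u32 unique_tag)
	{
		/* upper bits: hw queue index; lower bits: per-queue tag */
		return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
	}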

[PATCH V5 3/5] scsi: introduce force_blk_mq

2018-03-13 Thread Ming Lei
From the scsi driver's view, it is a bit troublesome to support both
blk-mq and non-blk-mq at the same time, especially when drivers need to
support multiple hw queues.

This patch introduces 'force_blk_mq' to scsi_host_template so that drivers
can provide blk-mq only support, so driver code can avoid the trouble
for supporting both.

Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Mike Snitzer <snit...@redhat.com>
Cc: Laurence Oberman <lober...@redhat.com>
Reviewed-by: Hannes Reinecke <h...@suse.de>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/hosts.c | 1 +
 include/scsi/scsi_host.h | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 57bf43e34863..cbbc32df7595 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -477,6 +477,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template 
*sht, int privsize)
shost->dma_boundary = 0xffffffff;
 
shost->use_blk_mq = scsi_use_blk_mq;
+   shost->use_blk_mq = scsi_use_blk_mq || shost->hostt->force_blk_mq;
 
device_initialize(>shost_gendev);
dev_set_name(>shost_gendev, "host%d", shost->host_no);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 1a1df0d21ee3..6c6366f0bd15 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -452,6 +452,9 @@ struct scsi_host_template {
/* True if the controller does not support WRITE SAME */
unsigned no_write_same:1;
 
+   /* True if the low-level driver supports blk-mq only */
+   unsigned force_blk_mq:1;
+
/*
 * Countdown for host blocking with no commands outstanding.
 */
-- 
2.9.5

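For illustration, a hedged sketch of how a blk-mq-only LLD would consume
the new flag; the template and callback names are made up, but the
.force_blk_mq = 1 line matches how the virtio_scsi patch in this series
uses it:

	static struct scsi_host_template example_blk_mq_only_template = {
		.module		= THIS_MODULE,
		.name		= "example",
		.queuecommand	= example_queuecommand,
		/* request scsi_mq even when scsi_use_blk_mq is off globally */
		.force_blk_mq	= 1,
	};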


[PATCH V5 1/5] scsi: hpsa: fix selection of reply queue

2018-03-13 Thread Ming Lei
From 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs"),
one msix vector can be created without any online CPU mapped, and then a
command's completion may never be notified.

This patch sets up a mapping between cpu and reply queue according to the
irq affinity info retrieved by pci_irq_get_affinity(), and uses this
mapping table to choose the reply queue when queuing a command.

The chosen reply queue is then guaranteed to be active, which fixes the IO
hang caused by using an inactive reply queue that has no online CPU mapped.

Cc: Hannes Reinecke <h...@suse.de>
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Laurence Oberman <lober...@redhat.com>
Cc: Meelis Roos <mr...@linux.ee>
Cc: Artem Bityutskiy <artem.bityuts...@intel.com>
Cc: Mike Snitzer <snit...@redhat.com>
Tested-by: Laurence Oberman <lober...@redhat.com>
Tested-by: Don Brace <don.br...@microsemi.com>
Tested-by: Artem Bityutskiy <artem.bityuts...@intel.com>
Acked-by: Don Brace <don.br...@microsemi.com>
Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible CPUs")
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/hpsa.c | 73 +++--
 drivers/scsi/hpsa.h |  1 +
 2 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 5293e6827ce5..3a9eca163db8 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -1045,11 +1045,7 @@ static void set_performant_mode(struct ctlr_info *h, 
struct CommandList *c,
c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
if (unlikely(!h->msix_vectors))
return;
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   c->Header.ReplyQueue =
-   raw_smp_processor_id() % h->nreply_queues;
-   else
-   c->Header.ReplyQueue = reply_queue % h->nreply_queues;
+   c->Header.ReplyQueue = reply_queue;
}
 }
 
@@ -1063,10 +1059,7 @@ static void set_ioaccel1_performant_mode(struct 
ctlr_info *h,
 * Tell the controller to post the reply to the queue for this
 * processor.  This seems to give the best I/O throughput.
 */
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   cp->ReplyQueue = smp_processor_id() % h->nreply_queues;
-   else
-   cp->ReplyQueue = reply_queue % h->nreply_queues;
+   cp->ReplyQueue = reply_queue;
/*
 * Set the bits in the address sent down to include:
 *  - performant mode bit (bit 0)
@@ -1087,10 +1080,7 @@ static void set_ioaccel2_tmf_performant_mode(struct 
ctlr_info *h,
/* Tell the controller to post the reply to the queue for this
 * processor.  This seems to give the best I/O throughput.
 */
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   cp->reply_queue = smp_processor_id() % h->nreply_queues;
-   else
-   cp->reply_queue = reply_queue % h->nreply_queues;
+   cp->reply_queue = reply_queue;
/* Set the bits in the address sent down to include:
 *  - performant mode bit not used in ioaccel mode 2
 *  - pull count (bits 0-3)
@@ -1109,10 +1099,7 @@ static void set_ioaccel2_performant_mode(struct 
ctlr_info *h,
 * Tell the controller to post the reply to the queue for this
 * processor.  This seems to give the best I/O throughput.
 */
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   cp->reply_queue = smp_processor_id() % h->nreply_queues;
-   else
-   cp->reply_queue = reply_queue % h->nreply_queues;
+   cp->reply_queue = reply_queue;
/*
 * Set the bits in the address sent down to include:
 *  - performant mode bit not used in ioaccel mode 2
@@ -1157,6 +1144,8 @@ static void __enqueue_cmd_and_start_io(struct ctlr_info 
*h,
 {
dial_down_lockup_detection_during_fw_flash(h, c);
atomic_inc(&h->commands_outstanding);
+
+   reply_queue = h->reply_map[raw_smp_processor_id()];
switch (c->cmd_type) {
case CMD_IOACCEL1:
set_ioaccel1_performant_mode(h, c, reply_queue);
@@ -7376,6 +7365,26 @@ static void hpsa_disable_interrupt_mode(struct ctlr_info 
*h)
h->msix_vectors = 0;
 }
 
+static void hpsa_setup_reply_map(struct ctlr_info *h)
+{
+   const struct cpumask *mask;
+   unsigned int queue, cpu;
+
+   for (queue = 0; queue < h->msix_vectors; queue++) {
+   mask = pci_irq_get_
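
The message is truncated above mid-hunk. For completeness, the body of
hpsa_setup_reply_map() as quoted in full later in this digest continues:

	for (queue = 0; queue < h->msix_vectors; queue++) {
		mask = pci_irq_get_affinity(h->pdev, queue);
		if (!mask)
			goto fallback;

		for_each_cpu(cpu, mask)
			h->reply_map[cpu] = queue;
	}
	return;

fallback:
	for_each_possible_cpu(cpu)
		h->reply_map[cpu] = 0;
}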

[PATCH V5 0/5] SCSI: fix selection of reply(hw) queue

2018-03-13 Thread Ming Lei
Hi All,

The patches fixes reply queue(virt-queue on virtio-scsi) selection on hpsa,
megaraid_sa and virtio-scsi, and IO hang can be caused easily by this issue.

This issue is triggered by 84676c1f21e8 ("genirq/affinity: assign vectors
to all possible CPUs"). After 84676c1f21e8, it is easy to see one msix
vector mapped to all offline CPUs. If the reply queue is selected from
all allocated msix vectors (reply queues) in a round-robin way, the selected
reply queue may not have any online CPU mapped, and an IO hang is caused.

Both hpsa and megaraid_sas use a host-wide tagset, so we can't convert the
reply queue to a blk_mq hw queue directly, otherwise IO performance degrades
badly according to Kashyap's test. This patchset instead sets up one mapping
table for selecting the reply queue (see the sketch below), an approach that
mpt3sas uses already.
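
A minimal sketch of the change in selection logic, distilled from the hpsa
diff later in this posting (names follow hpsa; megaraid_sas is analogous):

	/* before: round-robin over all reply queues -- may pick a vector
	 * whose affinity mask contains only offline CPUs */
	reply_queue = raw_smp_processor_id() % h->nreply_queues;

	/* after: per-CPU table built from pci_irq_get_affinity(), so the
	 * chosen queue always covers the submitting (online) CPU */
	reply_queue = h->reply_map[raw_smp_processor_id()];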

For virtio-scsi, the virt-queue really is a hw queue from the blk-mq point
of view, so we introduce 'force_blk_mq' to fix this issue because: 1) virtio-blk
has been used for years in blk-mq mode; 2) we have discussed recently
that scsi_mq will be enabled by default soon.


gitweb:
https://github.com/ming1/linux/tree/v4.16-rc-select-reply-queue-fix-V5

V5:
- cover legacy vector for megaraid_sas(2/5)
- patch style change (4/5)
- add one virtio-scsi cleanup patch(5/5)

V4:
- splitted from previous patchset
- handle virtio-scsi by force_blk_mq

Ming Lei (5):
  scsi: hpsa: fix selection of reply queue
  scsi: megaraid_sas: fix selection of reply queue
  scsi: introduce force_blk_mq
  scsi: virtio_scsi: fix IO hang caused by irq vector automatic affinity
  scsi: virtio_scsi: unify scsi_host_template

 drivers/scsi/hosts.c|   1 +
 drivers/scsi/hpsa.c |  73 
 drivers/scsi/hpsa.h |   1 +
 drivers/scsi/megaraid/megaraid_sas.h|   1 +
 drivers/scsi/megaraid/megaraid_sas_base.c   |  39 -
 drivers/scsi/megaraid/megaraid_sas_fusion.c |  12 +--
 drivers/scsi/virtio_scsi.c  | 129 
 include/scsi/scsi_host.h|   3 +
 8 files changed, 116 insertions(+), 143 deletions(-)

-- 
2.9.5



Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-13 Thread Ming Lei
On Tue, Mar 13, 2018 at 09:38:41AM +0200, Artem Bityutskiy wrote:
> On Tue, 2018-03-13 at 11:11 +0800, Dou Liyang wrote:
> >  I also
> > met the situation where the BIOS told ACPI that it could support
> > physical
> > CPU hotplug, but actually there were no hardware slots in the
> > machine.
> > The ACPI tables are like user inputs, which should be validated when
> > we use them.
> 
> This is exactly what happens on Skylake Xeon systems. When I check
> dmesg or this file:
> 
> /sys/devices/system/cpu/possible
> 
> on 2S (two socket) and 4S (four socket) systems, I see the same number
> 432.
> 
> This number comes from ACPI MADT. I will speculate (did not see myself)
> that 8S systems will report the same number as well, because of the
> Skylake-SP (Scalable Platform) architecture.
> 
> Number 432 is good for 8S systems, but it is way too large for 2S and
> 4S systems - 4x or 2x larger than the theoretical maximum.
> 
> I do not know why BIOSes have to report unrealistically high numbers, I
> am just sharing my observation.
> 
> So yes, Linux kernel's possible CPU count knowledge may be too large.
> If we use that number to evenly spread IRQ vectors among the CPUs, we
> end up with wasted vectors, and even bugs, as I observe on a 2S
> Skylake.

Then it looks like this issue needs to be fixed by making the possible CPU
count accurate, because other resources are also allocated according to
num_possible_cpus(), such as percpu variables.

Thanks,
Ming
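
To make the cost concrete: anything sized by the possible-CPU count is
allocated at the inflated size. The hpsa fix quoted later in this digest
is one example -- with ACPI reporting 432 possible CPUs on a two-socket
box, such a table carries hundreds of entries for CPUs that can never
come online:

	/* from the hpsa patch: sized by nr_cpu_ids, not by online CPUs */
	h->reply_map = kzalloc(sizeof(*h->reply_map) * nr_cpu_ids, GFP_KERNEL);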


Re: [PATCH V4 1/4] scsi: hpsa: fix selection of reply queue

2018-03-12 Thread Ming Lei
On Mon, Mar 12, 2018 at 08:52:02AM +0100, Christoph Hellwig wrote:
> On Sat, Mar 10, 2018 at 11:01:43PM +0800, Ming Lei wrote:
> > > I really dislike this being open coded in drivers.  It really should
> > > be helper chared with the blk-mq map building that drivers just use.
> > > 
> > > For now just have a low-level blk_pci_map_queues that
> > > blk_mq_pci_map_queues, hpsa and megaraid can share.  In the long run
> > > it might make sense to change the blk-mq callout to that low-level
> > > prototype as well.
> > 
This way of selecting the reply queue is needed for non-scsi_mq too.
> 
> Which still doesn't prevent you from using a common helper.

The only common code is the following part:

+   for (queue = 0; queue < instance->msix_vectors; queue++) {
+   mask = pci_irq_get_affinity(instance->pdev, queue);
+   if (!mask)
+   goto fallback;
+
+   for_each_cpu(cpu, mask)
+   instance->reply_map[cpu] = queue;
+   }

For megaraid_sas, the fallback code needs to handle the mapping in the
following way for legacy vectors:

   for_each_possible_cpu(cpu)
   instance->reply_map[cpu] = cpu % instance->msix_vectors;


So I'm not sure it is worth a common helper, given there may not be any
other potential users of it.

Thanks,
Ming
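
For reference, a hedged sketch of the shared low-level helper Christoph
suggests; the name blk_pci_map_queues and the exact signature are
assumptions, but the loop body is the common code quoted above:

	/* sketch: build a per-CPU queue map from PCI irq affinity */
	static int blk_pci_map_queues(unsigned int *map, unsigned int nr_queues,
				      struct pci_dev *pdev)
	{
		const struct cpumask *mask;
		unsigned int queue, cpu;

		for (queue = 0; queue < nr_queues; queue++) {
			mask = pci_irq_get_affinity(pdev, queue);
			if (!mask)
				return -EINVAL;	/* caller applies its own fallback */

			for_each_cpu(cpu, mask)
				map[cpu] = queue;
		}
		return 0;
	}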


Re: [PATCH V2] nvme-pci: assign separate irq vectors for adminq and ioq0

2018-03-12 Thread Ming Lei
On Fri, Mar 09, 2018 at 10:24:45AM -0700, Keith Busch wrote:
> On Thu, Mar 08, 2018 at 08:42:20AM +0100, Christoph Hellwig wrote:
> > 
> > So I suspect we'll need to go with a patch like this, just with a way
> > better changelog.
> 
> I have to agree this is required for that use case. I'll run some
> quick tests and propose an alternate changelog.
> 
> Longer term, the current way we're including offline present cpus either
> (a) has the driver allocate resources it can't use or (b) spreads the
> ones it can use thinner than they need to be. Why don't we rerun the
> irq spread under a hot cpu notifier for only online CPUs?

4b855ad371 ("blk-mq: Create hctx for each present CPU") removes handling
mapping change via hot cpu notifier. Not only code is cleaned up, but
also fixes very complicated queue dependency issue:

- loop/dm-rq queue depends on underlying queue
- for NVMe, IO queue depends on admin queue

If freezing the queue can be avoided in the CPU notifier, it should be fine
to do that; otherwise it needs to be avoided.

Thanks,
Ming


Re: [PATCH V4 4/4] scsi: virtio_scsi: fix IO hang caused by irq vector automatic affinity

2018-03-12 Thread Ming Lei
On Sat, Mar 10, 2018 at 11:15:20AM +0100, Christoph Hellwig wrote:
> This looks generally fine to me:
> 
> Reviewed-by: Christoph Hellwig 
> 
> As a follow on we should probably kill virtscsi_queuecommand_single and
> thus virtscsi_host_template_single as well.
> > Given storage IO is always C/S model, there isn't such issue with 
> > SCSI_MQ(blk-mq),
> 
> What does C/S mean here?

Client–Server.

> 
> > @@ -580,10 +573,7 @@ static int virtscsi_queuecommand_single(struct 
> > Scsi_Host *sh,
> > struct scsi_cmnd *sc)
> >  {
> > struct virtio_scsi *vscsi = shost_priv(sh);
> > -   struct virtio_scsi_target_state *tgt =
> > -   scsi_target(sc->device)->hostdata;
> >  
> > -   atomic_inc(&tgt->reqs);
> > return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
> >  }
> 
> >  static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
> >struct scsi_cmnd *sc)
> >  {
> > struct virtio_scsi *vscsi = shost_priv(sh);
> > -   struct virtio_scsi_target_state *tgt =
> > -   scsi_target(sc->device)->hostdata;
> > -   struct virtio_scsi_vq *req_vq;
> > -
> > -   if (shost_use_blk_mq(sh))
> > -   req_vq = virtscsi_pick_vq_mq(vscsi, sc);
> > -   else
> > -   req_vq = virtscsi_pick_vq(vscsi, tgt);
> > +   struct virtio_scsi_vq *req_vq = virtscsi_pick_vq_mq(vscsi, sc);
> >  
> > return virtscsi_queuecommand(vscsi, req_vq, sc);
> 
> Given how virtscsi_pick_vq_mq works virtscsi_queuecommand_single and
> virtscsi_queuecommand_multi now have identical behavior.  That means
> virtscsi_queuecommand_single should be removed, and
> virtscsi_queuecommand_multi should be merged into virtscsi_queuecommand,

OK.

> 
> > @@ -823,6 +768,7 @@ static struct scsi_host_template 
> > virtscsi_host_template_single = {
> > .target_alloc = virtscsi_target_alloc,
> > .target_destroy = virtscsi_target_destroy,
> > .track_queue_depth = 1,
> > +   .force_blk_mq = 1,
> 
> This probably isn't strictly needed.  That being said with your
> change we could probably just drop virtscsi_host_template_single entirely.
> 

OK.

Thanks,
Ming


Re: [PATCH V4 1/4] scsi: hpsa: fix selection of reply queue

2018-03-10 Thread Ming Lei
On Sat, Mar 10, 2018 at 11:09:59AM +0100, Christoph Hellwig wrote:
> > +static void hpsa_setup_reply_map(struct ctlr_info *h)
> > +{
> > +   const struct cpumask *mask;
> > +   unsigned int queue, cpu;
> > +
> > +   for (queue = 0; queue < h->msix_vectors; queue++) {
> > +   mask = pci_irq_get_affinity(h->pdev, queue);
> > +   if (!mask)
> > +   goto fallback;
> > +
> > +   for_each_cpu(cpu, mask)
> > +   h->reply_map[cpu] = queue;
> > +   }
> > +   return;
> > +
> > +fallback:
> > +   for_each_possible_cpu(cpu)
> > +   h->reply_map[cpu] = 0;
> > +}
> 
> > +   h->reply_map = kzalloc(sizeof(*h->reply_map) * nr_cpu_ids, GFP_KERNEL);
> > +   if (!h->reply_map) {
> > +   kfree(h);
> > +   return NULL;
> > +   }
> > +   return h;
> 
> I really dislike this being open coded in drivers.  It really should
> be helper chared with the blk-mq map building that drivers just use.
> 
> For now just have a low-level blk_pci_map_queues that
> blk_mq_pci_map_queues, hpsa and megaraid can share.  In the long run
> it might make sense to change the blk-mq callout to that low-level
> prototype as well.

This way of selecting the reply queue is needed for non-scsi_mq too.

Thanks,
Ming


Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-09 Thread Ming Lei
On Fri, Mar 09, 2018 at 11:08:54AM +0100, Thomas Gleixner wrote:
> On Fri, 9 Mar 2018, Ming Lei wrote:
> > On Fri, Mar 09, 2018 at 12:20:09AM +0100, Thomas Gleixner wrote:
> > > On Thu, 8 Mar 2018, Ming Lei wrote:
> > > > Actually, it isn't a real fix, the real one is in the following two:
> > > > 
> > > > 0c20244d458e scsi: megaraid_sas: fix selection of reply queue
> > > > ed6d043be8cd scsi: hpsa: fix selection of reply queue
> > > 
> > > Where are these commits? Neither Linus tree not -next know anything about
> > > them
> > 
> > Both aren't merged yet, but they should land V4.16, IMO.
> > 
> > > 
> > > > This patchset can't guarantee that all IRQ vectors are assigned by one
> > > > online CPU, for example, in a quad-socket system, if only one processor
> > > > is present, then some of vectors are still assigned by all offline CPUs,
> > > > and it is a valid case, but still may cause io hang if drivers(hpsa,
> > > > megaraid_sas) select reply queue in current way.
> > > 
> > > So my understanding is that these irq patches are enhancements and not bug
> > > fixes. I'll queue them for 4.17 then.
> > 
> > Wrt. this IO hang issue, these patches shouldn't be bug fix, but they may
> > fix performance regression[1] for some systems caused by 84676c1f21 
> > ("genirq/affinity:
> > assign vectors to all possible CPUs").
> > 
> > [1] https://marc.info/?l=linux-block=152050347831149=2
> 
> Hmm. The patches are rather large for urgent and evtl. backporting. Is
> there a simpler way to address that performance issue?

I haven't thought of a simpler solution. The problem is that the number of
active msix vectors is decreased a lot by commit 84676c1f21.

However, if someone wants to backport, this patchset can be applied cleanly,
without any conflict.

Thanks,
Ming


Re: [PATCH V4 2/4] scsi: megaraid_sas: fix selection of reply queue

2018-03-09 Thread Ming Lei
On Fri, Mar 09, 2018 at 04:37:56PM +0530, Kashyap Desai wrote:
> > -Original Message-
> > From: Ming Lei [mailto:ming@redhat.com]
> > Sent: Friday, March 9, 2018 9:02 AM
> > To: James Bottomley; Jens Axboe; Martin K . Petersen
> > Cc: Christoph Hellwig; linux-s...@vger.kernel.org; linux-
> > bl...@vger.kernel.org; Meelis Roos; Don Brace; Kashyap Desai; Laurence
> > Oberman; Mike Snitzer; Ming Lei; Hannes Reinecke; James Bottomley; Artem
> > Bityutskiy
> > Subject: [PATCH V4 2/4] scsi: megaraid_sas: fix selection of reply queue
> >
> > From 84676c1f21 (genirq/affinity: assign vectors to all possible CPUs),
> one
> > msix vector can be created without any online CPU mapped, then command
> > may be queued, and won't be notified after its completion.
> >
> > This patch setups mapping between cpu and reply queue according to irq
> > affinity info retrived by pci_irq_get_affinity(), and uses this info to
> choose
> > reply queue for queuing one command.
> >
> > Then the chosen reply queue has to be active, and fixes IO hang caused
> by
> > using inactive reply queue which doesn't have any online CPU mapped.
> 
> Also megaraid FW will use reply queue 0 for any async notification.  We
> want to set pre_vectors = 1 and make sure reply queue 0 is not part of
> affinity hint.
> To meet that requirement, I have to make some more changes, like adding an
> extra queue.
> For example, if the FW supports 96 reply queues and there are 16 online
> CPUs, the current driver will allocate 16 msix vectors. We may have to
> allocate 17 msix vectors and reserve reply queue 0 for async replies from FW.
> 
> I will be sending follow up patch soon.

OK, but the above extra change doesn't belong in this patch, which
focuses on fixing the IO hang caused by reply queue selection.

> 
> >
> > Cc: Hannes Reinecke <h...@suse.de>
> > Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
> > Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
> > Cc: Christoph Hellwig <h...@lst.de>,
> > Cc: Don Brace <don.br...@microsemi.com>
> > Cc: Kashyap Desai <kashyap.de...@broadcom.com>
> > Cc: Laurence Oberman <lober...@redhat.com>
> > Cc: Mike Snitzer <snit...@redhat.com>
> > Cc: Meelis Roos <mr...@linux.ee>
> > Cc: Artem Bityutskiy <artem.bityuts...@intel.com>
> > Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible
> CPUs")
> > Signed-off-by: Ming Lei <ming@redhat.com>
> > ---
> >  drivers/scsi/megaraid/megaraid_sas.h|  2 +-
> >  drivers/scsi/megaraid/megaraid_sas_base.c   | 34
> > -
> >  drivers/scsi/megaraid/megaraid_sas_fusion.c | 12 --
> >  3 files changed, 38 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/scsi/megaraid/megaraid_sas.h
> > b/drivers/scsi/megaraid/megaraid_sas.h
> > index ba6503f37756..a644d2be55b6 100644
> > --- a/drivers/scsi/megaraid/megaraid_sas.h
> > +++ b/drivers/scsi/megaraid/megaraid_sas.h
> > @@ -2127,7 +2127,7 @@ enum MR_PD_TYPE {
> >  #define MR_NVME_PAGE_SIZE_MASK 0x00FF
> >
> >  struct megasas_instance {
> > -
> > +   unsigned int *reply_map;
> > __le32 *producer;
> > dma_addr_t producer_h;
> > __le32 *consumer;
> > diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c
> > b/drivers/scsi/megaraid/megaraid_sas_base.c
> > index a71ee67df084..065956cb2aeb 100644
> > --- a/drivers/scsi/megaraid/megaraid_sas_base.c
> > +++ b/drivers/scsi/megaraid/megaraid_sas_base.c
> > @@ -5165,6 +5165,26 @@ megasas_setup_jbod_map(struct
> > megasas_instance *instance)
> > instance->use_seqnum_jbod_fp = false;  }
> >
> > +static void megasas_setup_reply_map(struct megasas_instance *instance)
> > +{
> > +   const struct cpumask *mask;
> > +   unsigned int queue, cpu;
> > +
> > +   for (queue = 0; queue < instance->msix_vectors; queue++) {
> > +   mask = pci_irq_get_affinity(instance->pdev, queue);
> > +   if (!mask)
> > +   goto fallback;
> > +
> > +   for_each_cpu(cpu, mask)
> > +   instance->reply_map[cpu] = queue;
> > +   }
> > +   return;
> > +
> > +fallback:
> > +   for_each_possible_cpu(cpu)
> > +   instance->reply_map[cpu] = 0;
> 
> The fallback should be better than just assigning a single reply queue.
> Maybe something like below.
> 
>for_each_possible_cpu(cpu)
>instance->reply_map[cpu] = 
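
Kashyap's message is truncated above. For orientation, a hedged sketch of
the follow-up he describes -- keeping vector 0 (used by the FW for async
notifications) out of the affinity spread via pre_vectors and allocating
one extra vector for it; the exact call site and bounds are assumptions,
not taken from a posted patch:

	struct irq_affinity desc = { .pre_vectors = 1 };
	int nvec;

	/* vector 0: FW async events; vectors 1..N: spread reply queues */
	nvec = pci_alloc_irq_vectors_affinity(instance->pdev, 2,
					      instance->msix_vectors + 1,
					      PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
					      &desc);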

Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance via .host_tagset

2018-03-09 Thread Ming Lei
On Fri, Mar 09, 2018 at 12:26:57PM +0530, Kashyap Desai wrote:
> > -Original Message-
> > From: Ming Lei [mailto:ming@redhat.com]
> > Sent: Thursday, March 8, 2018 4:54 PM
> > To: Kashyap Desai
> > Cc: Jens Axboe; linux-block@vger.kernel.org; Christoph Hellwig; Mike
> Snitzer;
> > linux-s...@vger.kernel.org; Hannes Reinecke; Arun Easi; Omar Sandoval;
> > Martin K . Petersen; James Bottomley; Christoph Hellwig; Don Brace;
> Peter
> > Rivera; Laurence Oberman
> > Subject: Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance
> via
> > .host_tagset
> >
> > On Thu, Mar 08, 2018 at 07:06:25PM +0800, Ming Lei wrote:
> > > On Thu, Mar 08, 2018 at 03:34:31PM +0530, Kashyap Desai wrote:
> > > > > -Original Message-
> > > > > From: Ming Lei [mailto:ming@redhat.com]
> > > > > Sent: Thursday, March 8, 2018 6:46 AM
> > > > > To: Kashyap Desai
> > > > > Cc: Jens Axboe; linux-block@vger.kernel.org; Christoph Hellwig;
> > > > > Mike
> > > > Snitzer;
> > > > > linux-s...@vger.kernel.org; Hannes Reinecke; Arun Easi; Omar
> > > > > Sandoval; Martin K . Petersen; James Bottomley; Christoph Hellwig;
> > > > > Don Brace;
> > > > Peter
> > > > > Rivera; Laurence Oberman
> > > > > Subject: Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq
> > > > > performance
> > > > via
> > > > > .host_tagset
> > > > >
> > > > > On Wed, Mar 07, 2018 at 10:58:34PM +0530, Kashyap Desai wrote:
> > > > > > > >
> > > > > > > > Also one observation using V3 series patch. I am seeing
> > > > > > > > below Affinity mapping whereas I have only 72 logical CPUs.
> > > > > > > > It means we are really not going to use all reply queues.
> > > > > > > > e.g. If I bind fio jobs on CPU 18-20, I am seeing only one
> > > > > > > > reply queue is used and that may lead to performance drop as
> well.
> > > > > > >
> > > > > > > If the mapping is in such shape, I guess it should be quite
> > > > > > > difficult to
> > > > > > figure out
> > > > > > > one perfect way to solve this situation because one reply
> > > > > > > queue has to
> > > > > > handle
> > > > > > > IOs submitted from 4~5 CPUs at average.
> > > > > >
> > > > > > 4.15.0-rc1 kernel has below mapping - I am not sure which commit
> > > > > > id in
> > > > "
> > > > > > linux_4.16-rc-host-tags-v3.2" is changing the mapping of IRQ to
> CPU.
> > > > > > It
> > > > >
> > > > > I guess the mapping you posted is read from
> /proc/irq/126/smp_affinity.
> > > > >
> > > > > If yes, no any patch in linux_4.16-rc-host-tags-v3.2 should change
> > > > > IRQ
> > > > affinity
> > > > > code, which is done in irq_create_affinity_masks(), as you saw, no
> > > > > any
> > > > patch
> > > > > in linux_4.16-rc-host-tags-v3.2 touches that code.
> > > > >
> > > > > Could you simply apply the patches in linux_4.16-rc-host-tags-v3.2
> > > > against
> > > > > 4.15-rc1 kernel and see any difference?
> > > > >
> > > > > > will be really good if we can fall back to below mapping once
> again.
> > > > > > Current repo linux_4.16-rc-host-tags-v3.2 is giving lots of
> > > > > > random mapping of CPU - MSIx. And that will be problematic in
> > > > > > performance
> > > > run.
> > > > > >
> > > > > > As I posted earlier, latest repo will only allow us to use *18*
> > > > > > reply
> > > > >
> > > > > Looks not see this report before, could you share us how you
> > > > > conclude
> > > > that?
> > > > > The only patch changing reply queue is the following one:
> > > > >
> > > > >   https://marc.info/?l=linux-block=151972611911593=2
> > > > >
> > > > > But not see any issue in this patch yet, can you recover to 72
> > > > > reply
> > > > queues
> > > > > after reverting the patch in above link?
> > > &g

Re: [PATCH V4 0/4] SCSI: fix selection of reply(hw) queue

2018-03-08 Thread Ming Lei
On Fri, Mar 09, 2018 at 08:00:52AM +0100, Hannes Reinecke wrote:
> On 03/09/2018 04:32 AM, Ming Lei wrote:
> > Hi All,
> > 
> > The patches fixes reply queue(virt-queue on virtio-scsi) selection on hpsa,
> > megaraid_sa and virtio-scsi, and IO hang can be caused easily by this issue.
> > 
> > This issue is triggered by 84676c1f21e8 ("genirq/affinity: assign vectors
> > to all possible CPUs"). After 84676c1f21e8, it is easy to see one msix
> > vector mapped to all offline CPUs. If the reply queue is seleteced from
> > all allocated msix vectors(reply queues) in round-roin way, the selected
> > replay queue may not have any online CPU mapped, IO hang is caused.
> > 
> > Both hpsa and megaraid_sas uses host-wide tagset, we can't convert the
> > reply queue to blk_mq hw queue directly, otherwise IO performance is 
> > degraded
> > much, according to Kashyap's test, so this patchset sets up one mapping 
> > talbe
> > for selecting reply queue, and this approach has been used by mpt3sas 
> > already.
> > 
> > For virtio-scsi, the virt-queue is really hw queue wrt. blk-mq view, so
> > we introduce 'force_blk_mq' for fix this issue because: 1) virtio-blk
> > has been used for years in blk-mq mode; 2) we have discussed recently
> > that scsi_mq will be enabled at default soon. 
> > 
> > gitweb:
> > https://github.com/ming1/linux/tree/v4.16-rc-select-reply-queue-fix-V4
> > 
> > V4:
> > - splitted from previous patchset
> > - handle virtio-scsi by force_blk_mq
> > 
> > Ming Lei (4):
> >   scsi: hpsa: fix selection of reply queue
> >   scsi: megaraid_sas: fix selection of reply queue
> >   scsi: introduce force_blk_mq
> >   scsi: virtio_scsi: fix IO hang caused by irq vector automatic affinity
> > 
> >  drivers/scsi/hosts.c|  1 +
> >  drivers/scsi/hpsa.c | 73 
> > +
> >  drivers/scsi/hpsa.h |  1 +
> >  drivers/scsi/megaraid/megaraid_sas.h|  2 +-
> >  drivers/scsi/megaraid/megaraid_sas_base.c   | 34 +-
> >  drivers/scsi/megaraid/megaraid_sas_fusion.c | 12 ++---
> >  drivers/scsi/virtio_scsi.c  | 59 ++-
> >  include/scsi/scsi_host.h|  3 ++
> >  8 files changed, 100 insertions(+), 85 deletions(-)
> > 
> Well ... while this looks good in principle, what happens on cpu hotplug?
> Don't we have to redo the map then?

Each item in the table is used for mapping one CPU id to the hw queue index,
and the size of the table is 'nr_cpu_ids', so there is no need to redo the
map on cpu hotplug, just like the usage of set->mq_map in blk-mq (see the
sketch below).

Thanks,
Ming
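
A minimal sketch of why the lookup stays hotplug-safe, reusing the hpsa
naming from this series: every possible CPU id already has an entry, so a
CPU onlined after probe indexes a valid slot without any remapping:

	/* table was sized nr_cpu_ids and fully populated at init time */
	static inline unsigned int pick_reply_queue(struct ctlr_info *h)
	{
		return h->reply_map[raw_smp_processor_id()];
	}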


Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-08 Thread Ming Lei
On Fri, Mar 09, 2018 at 09:00:08AM +0200, Artem Bityutskiy wrote:
> On Fri, 2018-03-09 at 09:24 +0800, Ming Lei wrote:
> > Hi Thomas,
> > 
> > On Fri, Mar 09, 2018 at 12:20:09AM +0100, Thomas Gleixner wrote:
> > > On Thu, 8 Mar 2018, Ming Lei wrote:
> > > > Actually, it isn't a real fix, the real one is in the following
> > > > two:
> > > > 
> > > > 0c20244d458e scsi: megaraid_sas: fix selection of reply queue
> > > > ed6d043be8cd scsi: hpsa: fix selection of reply queue
> > > 
> > > Where are these commits? Neither Linus tree not -next know anything
> > > about
> > > them
> > 
> > Both aren't merged yet, but they should land V4.16, IMO.
> 
> Is it a secret where they are? If not, could you please give me a
> pointer and I'll give them a test.

  https://marc.info/?l=linux-block=152056636717380=2

Thanks,
Ming


[PATCH V4 3/4] scsi: introduce force_blk_mq

2018-03-08 Thread Ming Lei
From the scsi driver's view, it is a bit troublesome to support both
blk-mq and non-blk-mq at the same time, especially when drivers need to
support multiple hw queues.

This patch introduces 'force_blk_mq' to scsi_host_template so that drivers
can provide blk-mq only support, so driver code can avoid the trouble
for supporting both.

Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Mike Snitzer <snit...@redhat.com>
Cc: Laurence Oberman <lober...@redhat.com>
Reviewed-by: Hannes Reinecke <h...@suse.de>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/hosts.c | 1 +
 include/scsi/scsi_host.h | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 57bf43e34863..10f04b089392 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -477,6 +477,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template 
*sht, int privsize)
shost->dma_boundary = 0xffffffff;
 
shost->use_blk_mq = scsi_use_blk_mq;
+   shost->use_blk_mq = scsi_use_blk_mq || !!shost->hostt->force_blk_mq;
 
device_initialize(>shost_gendev);
dev_set_name(>shost_gendev, "host%d", shost->host_no);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 1a1df0d21ee3..6c6366f0bd15 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -452,6 +452,9 @@ struct scsi_host_template {
/* True if the controller does not support WRITE SAME */
unsigned no_write_same:1;
 
+   /* True if the low-level driver supports blk-mq only */
+   unsigned force_blk_mq:1;
+
/*
 * Countdown for host blocking with no commands outstanding.
 */
-- 
2.9.5



[PATCH V4 4/4] scsi: virtio_scsi: fix IO hang caused by irq vector automatic affinity

2018-03-08 Thread Ming Lei
Now 84676c1f21e8ff5 ("genirq/affinity: assign vectors to all possible CPUs")
has been merged into V4.16-rc, and it is easy for some irq vectors to be
allocated all offline CPUs; this can't be avoided even though the allocation
is improved.

For example, on an 8-core VM where CPUs 4~7 are not present/offline, with 4 queues of
virtio-scsi, the irq affinity assigned can become the following shape:

irq 36, cpu list 0-7
irq 37, cpu list 0-7
irq 38, cpu list 0-7
irq 39, cpu list 0-1
irq 40, cpu list 4,6
irq 41, cpu list 2-3
irq 42, cpu list 5,7

Then an IO hang is triggered in the non-SCSI_MQ case.

Given that storage IO is always a client/server (C/S) model, there isn't
such an issue with SCSI_MQ (blk-mq), because no IO can be submitted to a
hw queue if that hw queue isn't mapped to any online CPU.

Fix this issue by forcing use of blk-mq.

BTW, I have been using virtio-scsi (scsi_mq) for several years, and it has
been quite stable, so this shouldn't add extra risk.

Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Mike Snitzer <snit...@redhat.com>
Cc: Laurence Oberman <lober...@redhat.com>
Reviewed-by: Hannes Reinecke <h...@suse.de>
Acked-by: Paolo Bonzini <pbonz...@redhat.com>
Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible CPUs")
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/virtio_scsi.c | 59 +++---
 1 file changed, 3 insertions(+), 56 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 7c28e8d4955a..54e3a0f6844c 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -91,9 +91,6 @@ struct virtio_scsi_vq {
 struct virtio_scsi_target_state {
seqcount_t tgt_seq;
 
-   /* Count of outstanding requests. */
-   atomic_t reqs;
-
/* Currently active virtqueue for requests sent to this target. */
struct virtio_scsi_vq *req_vq;
 };
@@ -152,8 +149,6 @@ static void virtscsi_complete_cmd(struct virtio_scsi 
*vscsi, void *buf)
struct virtio_scsi_cmd *cmd = buf;
struct scsi_cmnd *sc = cmd->sc;
struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
-   struct virtio_scsi_target_state *tgt =
-   scsi_target(sc->device)->hostdata;
 
dev_dbg(&sc->device->sdev_gendev,
"cmd %p response %u status %#02x sense_len %u\n",
@@ -210,8 +205,6 @@ static void virtscsi_complete_cmd(struct virtio_scsi 
*vscsi, void *buf)
}
 
sc->scsi_done(sc);
-
-   atomic_dec(&tgt->reqs);
 }
 
 static void virtscsi_vq_done(struct virtio_scsi *vscsi,
@@ -580,10 +573,7 @@ static int virtscsi_queuecommand_single(struct Scsi_Host 
*sh,
struct scsi_cmnd *sc)
 {
struct virtio_scsi *vscsi = shost_priv(sh);
-   struct virtio_scsi_target_state *tgt =
-   scsi_target(sc->device)->hostdata;
 
-   atomic_inc(&tgt->reqs);
return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
 }
 
@@ -596,55 +586,11 @@ static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct 
virtio_scsi *vscsi,
return &vscsi->req_vqs[hwq];
 }
 
-static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
-  struct virtio_scsi_target_state 
*tgt)
-{
-   struct virtio_scsi_vq *vq;
-   unsigned long flags;
-   u32 queue_num;
-
-   local_irq_save(flags);
-   if (atomic_inc_return(&tgt->reqs) > 1) {
-   unsigned long seq;
-
-   do {
-   seq = read_seqcount_begin(&tgt->tgt_seq);
-   vq = tgt->req_vq;
-   } while (read_seqcount_retry(&tgt->tgt_seq, seq));
-   } else {
-   /* no writes can be concurrent because of atomic_t */
-   write_seqcount_begin(&tgt->tgt_seq);
-
-   /* keep previous req_vq if a reader just arrived */
-   if (unlikely(atomic_read(&tgt->reqs) > 1)) {
-   vq = tgt->req_vq;
-   goto unlock;
-   }
-
-   queue_num = smp_processor_id();
-   while (unlikely(queue_num >= vscsi->num_queues))
-   queue_num -= vscsi->num_queues;
-   tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
- unlock:
-   write_seqcount_end(&tgt->tgt_seq);
-   }
-   local_irq_restore(flags);
-
-   return vq;
-}
-
 static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
   struct scsi_cmnd *sc)
 {
s

[PATCH V4 1/4] scsi: hpsa: fix selection of reply queue

2018-03-08 Thread Ming Lei
From 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs"),
one msix vector can be created without any online CPU mapped, and then a
command's completion may never be notified.

This patch sets up a mapping between cpu and reply queue according to the
irq affinity info retrieved by pci_irq_get_affinity(), and uses this
mapping table to choose the reply queue when queuing a command.

The chosen reply queue is then guaranteed to be active, which fixes the IO
hang caused by using an inactive reply queue that has no online CPU mapped.

Cc: Hannes Reinecke <h...@suse.de>
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Laurence Oberman <lober...@redhat.com>
Cc: Meelis Roos <mr...@linux.ee>
Cc: Artem Bityutskiy <artem.bityuts...@intel.com>
Cc: Mike Snitzer <snit...@redhat.com>
Tested-by: Laurence Oberman <lober...@redhat.com>
Tested-by: Don Brace <don.br...@microsemi.com>
Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible CPUs")
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/hpsa.c | 73 +++--
 drivers/scsi/hpsa.h |  1 +
 2 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 5293e6827ce5..3a9eca163db8 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -1045,11 +1045,7 @@ static void set_performant_mode(struct ctlr_info *h, 
struct CommandList *c,
c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
if (unlikely(!h->msix_vectors))
return;
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   c->Header.ReplyQueue =
-   raw_smp_processor_id() % h->nreply_queues;
-   else
-   c->Header.ReplyQueue = reply_queue % h->nreply_queues;
+   c->Header.ReplyQueue = reply_queue;
}
 }
 
@@ -1063,10 +1059,7 @@ static void set_ioaccel1_performant_mode(struct 
ctlr_info *h,
 * Tell the controller to post the reply to the queue for this
 * processor.  This seems to give the best I/O throughput.
 */
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   cp->ReplyQueue = smp_processor_id() % h->nreply_queues;
-   else
-   cp->ReplyQueue = reply_queue % h->nreply_queues;
+   cp->ReplyQueue = reply_queue;
/*
 * Set the bits in the address sent down to include:
 *  - performant mode bit (bit 0)
@@ -1087,10 +1080,7 @@ static void set_ioaccel2_tmf_performant_mode(struct 
ctlr_info *h,
/* Tell the controller to post the reply to the queue for this
 * processor.  This seems to give the best I/O throughput.
 */
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   cp->reply_queue = smp_processor_id() % h->nreply_queues;
-   else
-   cp->reply_queue = reply_queue % h->nreply_queues;
+   cp->reply_queue = reply_queue;
/* Set the bits in the address sent down to include:
 *  - performant mode bit not used in ioaccel mode 2
 *  - pull count (bits 0-3)
@@ -1109,10 +1099,7 @@ static void set_ioaccel2_performant_mode(struct 
ctlr_info *h,
 * Tell the controller to post the reply to the queue for this
 * processor.  This seems to give the best I/O throughput.
 */
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   cp->reply_queue = smp_processor_id() % h->nreply_queues;
-   else
-   cp->reply_queue = reply_queue % h->nreply_queues;
+   cp->reply_queue = reply_queue;
/*
 * Set the bits in the address sent down to include:
 *  - performant mode bit not used in ioaccel mode 2
@@ -1157,6 +1144,8 @@ static void __enqueue_cmd_and_start_io(struct ctlr_info 
*h,
 {
dial_down_lockup_detection_during_fw_flash(h, c);
atomic_inc(&h->commands_outstanding);
+
+   reply_queue = h->reply_map[raw_smp_processor_id()];
switch (c->cmd_type) {
case CMD_IOACCEL1:
set_ioaccel1_performant_mode(h, c, reply_queue);
@@ -7376,6 +7365,26 @@ static void hpsa_disable_interrupt_mode(struct ctlr_info 
*h)
h->msix_vectors = 0;
 }
 
+static void hpsa_setup_reply_map(struct ctlr_info *h)
+{
+   const struct cpumask *mask;
+   unsigned int queue, cpu;
+
+   for (queue = 0; queue < h->msix_vectors; queue++) {
+   mask = pci_irq_get_affinity(h->pdev, queue);
+   if (!mask)
+   goto fallback;
+
+   for_eac

[PATCH V4 0/4] SCSI: fix selection of reply(hw) queue

2018-03-08 Thread Ming Lei
Hi All,

The patches fixes reply queue(virt-queue on virtio-scsi) selection on hpsa,
megaraid_sa and virtio-scsi, and IO hang can be caused easily by this issue.

This issue is triggered by 84676c1f21e8 ("genirq/affinity: assign vectors
to all possible CPUs"). After 84676c1f21e8, it is easy to see one msix
vector mapped to all offline CPUs. If the reply queue is seleteced from
all allocated msix vectors(reply queues) in round-roin way, the selected
replay queue may not have any online CPU mapped, IO hang is caused.

Both hpsa and megaraid_sas use a host-wide tagset, so we can't convert the
reply queue to a blk_mq hw queue directly, otherwise IO performance degrades
badly according to Kashyap's test. This patchset instead sets up one mapping
table for selecting the reply queue, an approach that mpt3sas uses already.

For virtio-scsi, the virt-queue really is a hw queue from the blk-mq point
of view, so we introduce 'force_blk_mq' to fix this issue because: 1) virtio-blk
has been used for years in blk-mq mode; 2) we have discussed recently
that scsi_mq will be enabled by default soon.

gitweb:
https://github.com/ming1/linux/tree/v4.16-rc-select-reply-queue-fix-V4

V4:
- splitted from previous patchset
- handle virtio-scsi by force_blk_mq

Ming Lei (4):
  scsi: hpsa: fix selection of reply queue
  scsi: megaraid_sas: fix selection of reply queue
  scsi: introduce force_blk_mq
  scsi: virtio_scsi: fix IO hang caused by irq vector automatic affinity

 drivers/scsi/hosts.c|  1 +
 drivers/scsi/hpsa.c | 73 +
 drivers/scsi/hpsa.h |  1 +
 drivers/scsi/megaraid/megaraid_sas.h|  2 +-
 drivers/scsi/megaraid/megaraid_sas_base.c   | 34 +-
 drivers/scsi/megaraid/megaraid_sas_fusion.c | 12 ++---
 drivers/scsi/virtio_scsi.c  | 59 ++-
 include/scsi/scsi_host.h|  3 ++
 8 files changed, 100 insertions(+), 85 deletions(-)

-- 
2.9.5



Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue

2018-03-08 Thread Ming Lei
On Wed, Mar 07, 2018 at 09:11:37AM -0500, Laurence Oberman wrote:
> On Tue, 2018-03-06 at 14:24 -0500, Martin K. Petersen wrote:
> > Ming,
> > 
> > > Given both Don and Laurence have verified that patch 1 and patch 2
> > > does fix IO hang, could you consider to merge the two first?
> > 
> > Oh, and I would still need a formal Acked-by: from Don and Tested-by:
> > from Laurence.
> > 
> > Also, for 4.16/scsi-fixes I would prefer verification to be done with
> > just patch 1/8 and none of the subsequent changes in place. Just to
> > make
> > sure we're testing the right thing.
> > 
> > Thanks!
> > 
> 
> Hello Martin
> 
> I tested just Patch 1/8 from the V3 series.
> No issues running workload and no issues booting on the DL380G7.
> Don can you ack this so we can at least get this one in.
> 
> Against: 4.16.0-rc4.v31of8+ on an x86_64
> 
> Tested-by: Laurence Oberman 

Hi Laurence,

Thanks for your test!

Could you test patch 2 too, since you have a megaraid_sas controller?

It looks better to split the fix patches out of the current patchset,
since these fixes should go into V4.16.

Thanks
Ming


Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-08 Thread Ming Lei
On Thu, Mar 08, 2018 at 03:18:33PM +0200, Artem Bityutskiy wrote:
> On Thu, 2018-03-08 at 18:53 +0800, Ming Lei wrote:
> > Hi,
> > 
> > This patchset tries to spread among online CPUs as far as possible, so
> > that we can avoid to allocate too less irq vectors with online CPUs
> > mapped.
> > 
> > For example, in a 8cores system, 4 cpu cores(4~7) are offline/non present,
> > on a device with 4 queues:
> > 
> > 1) before this patchset
> > irq 39, cpu list 0-2
> > irq 40, cpu list 3-4,6
> > irq 41, cpu list 5
> > irq 42, cpu list 7
> > 
> > 2) after this patchset
> > irq 39, cpu list 0,4
> > irq 40, cpu list 1,6
> > irq 41, cpu list 2,5
> > irq 42, cpu list 3,7
> > 
> > Without this patchset, only two vectors(39, 40) can be active, but there
> > can be 4 active irq vectors after applying this patchset.
> 
> Tested-by: Artem Bityutskiy <artem.bityuts...@intel.com>
> Link: https://lkml.kernel.org/r/1519311270.2535.53.ca...@intel.com

Hi Artem,

Thanks for your test!

> 
> Ming,
> 
> this patchset fixes the v4.16-rcX regression that I reported few weeks
> ago. I applied it and verified that Dell R640 server that I mentioned
> in the bug report boots up and the disk works.
> 
> So this is not just an improvement, it also includes a bugfix. 

Actually, it isn't a real fix; the real fixes are the following two:

0c20244d458e scsi: megaraid_sas: fix selection of reply queue
ed6d043be8cd scsi: hpsa: fix selection of reply queue

This patchset can't guarantee that every IRQ vector is assigned at least
one online CPU. For example, in a quad-socket system where only one
processor is present, some vectors are still assigned only offline CPUs.
That is a valid case, but it may still cause an IO hang if drivers (hpsa,
megaraid_sas) select the reply queue in the current way.

Thanks,
Ming
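
A small sketch of the condition implied above, using plain kernel cpumask
helpers -- a vector whose affinity mask intersects no online CPU is
exactly the case the per-driver reply-map fallback has to tolerate:

	const struct cpumask *mask = pci_irq_get_affinity(pdev, queue);

	/* valid, but no online CPU will ever submit via this vector */
	if (mask && !cpumask_intersects(mask, cpu_online_mask))
		use_fallback_mapping();	/* hypothetical handler */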


Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance via .host_tagset

2018-03-08 Thread Ming Lei
On Thu, Mar 08, 2018 at 07:06:25PM +0800, Ming Lei wrote:
> On Thu, Mar 08, 2018 at 03:34:31PM +0530, Kashyap Desai wrote:
> > > -Original Message-
> > > From: Ming Lei [mailto:ming@redhat.com]
> > > Sent: Thursday, March 8, 2018 6:46 AM
> > > To: Kashyap Desai
> > > Cc: Jens Axboe; linux-block@vger.kernel.org; Christoph Hellwig; Mike
> > Snitzer;
> > > linux-s...@vger.kernel.org; Hannes Reinecke; Arun Easi; Omar Sandoval;
> > > Martin K . Petersen; James Bottomley; Christoph Hellwig; Don Brace;
> > Peter
> > > Rivera; Laurence Oberman
> > > Subject: Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance
> > via
> > > .host_tagset
> > >
> > > On Wed, Mar 07, 2018 at 10:58:34PM +0530, Kashyap Desai wrote:
> > > > > >
> > > > > > Also one observation using V3 series patch. I am seeing below
> > > > > > Affinity mapping whereas I have only 72 logical CPUs.  It means we
> > > > > > are really not going to use all reply queues.
> > > > > > e.a If I bind fio jobs on CPU 18-20, I am seeing only one reply
> > > > > > queue is used and that may lead to performance drop as well.
> > > > >
> > > > > If the mapping is in such shape, I guess it should be quite
> > > > > difficult to
> > > > figure out
> > > > > one perfect way to solve this situation because one reply queue has
> > > > > to
> > > > handle
> > > > > IOs submitted from 4~5 CPUs at average.
> > > >
> > > > 4.15.0-rc1 kernel has below mapping - I am not sure which commit id in
> > "
> > > > linux_4.16-rc-host-tags-v3.2" is changing the mapping of IRQ to CPU.
> > > > It
> > >
> > > I guess the mapping you posted is read from /proc/irq/126/smp_affinity.
> > >
> > > If yes, no any patch in linux_4.16-rc-host-tags-v3.2 should change IRQ
> > affinity
> > > code, which is done in irq_create_affinity_masks(), as you saw, no any
> > patch
> > > in linux_4.16-rc-host-tags-v3.2 touches that code.
> > >
> > > Could you simply apply the patches in linux_4.16-rc-host-tags-v3.2
> > against
> > > 4.15-rc1 kernel and see any difference?
> > >
> > > > will be really good if we can fall back to below mapping once again.
> > > > Current repo linux_4.16-rc-host-tags-v3.2 is giving lots of random
> > > > mapping of CPU - MSIx. And that will be problematic in performance
> > run.
> > > >
> > > > As I posted earlier, latest repo will only allow us to use *18* reply
> > >
> > > Looks not see this report before, could you share us how you conclude
> > that?
> > > The only patch changing reply queue is the following one:
> > >
> > >   https://marc.info/?l=linux-block=151972611911593=2
> > >
> > > But not see any issue in this patch yet, can you recover to 72 reply
> > queues
> > > after reverting the patch in above link?
> > Ming -
> > 
> > While testing, my system went bad. I debug further and understood that
> > affinity mapping was changed due to below commit -
> > 84676c1f21e8ff54befe985f4f14dc1edc10046b
> > 
> > [PATCH] genirq/affinity: assign vectors to all possible CPUs
> > 
> > Because of above change, we end up using very less reply queue. Many reply
> > queues on my setup was mapped to offline/not-available CPUs. This may be
> > primary contributing to odd performance impact and it may not be truly due
> > to V3/V4 patch series.
> 
> Seems a good news, :-)
> 
> > 
> > I am planning to check your V3 and V4 series after removing above commit
> > ID (for performance impact.).
> 
> You can run your test on a server in which all CPUs are kept as online
> for avoiding this issue.
> 
> Or you can apply the following patchset for avoiding this issue:
> 
>   https://marc.info/?l=linux-block=152050646332092=2

If you want to go this way, all patches have been put into the following
tree (V4):

https://github.com/ming1/linux/commits/v4.16-rc-host-tags-v4

#in reverse order
genirq/affinity: irq vector spread among online CPUs as far as possible
genirq/affinity: support to do irq vectors spread starting from any vector
genirq/affinity: move actual irq vector spread into one helper
genirq/affinity: rename *node_to_possible_cpumask as *node_to_cpumask
scsi: megaraid: improve scsi_mq performance via .host_tagset
scsi: hpsa: improve scsi_mq performance via .host_tagset
block: null_blk: introduce module parameter of 'g_host_tags'
scsi: Add template flag 'host_tagset'
blk-mq: introduce BLK_MQ_F_HOST_TAGS
blk-mq: introduce 'start_tag' field to 'struct blk_mq_tags'
scsi: avoid to hold host_busy for scsi_mq
scsi: read host_busy via scsi_host_busy()
scsi: introduce scsi_host_busy()
scsi: virtio_scsi: fix IO hang caused by irq vector automatic affinity
scsi: introduce force_blk_mq
scsi: megaraid_sas: fix selection of reply queue
scsi: hpsa: fix selection of reply queue


Thanks,
Ming


Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance via .host_tagset

2018-03-08 Thread Ming Lei
On Thu, Mar 08, 2018 at 03:34:31PM +0530, Kashyap Desai wrote:
> > -Original Message-
> > From: Ming Lei [mailto:ming@redhat.com]
> > Sent: Thursday, March 8, 2018 6:46 AM
> > To: Kashyap Desai
> > Cc: Jens Axboe; linux-block@vger.kernel.org; Christoph Hellwig; Mike
> Snitzer;
> > linux-s...@vger.kernel.org; Hannes Reinecke; Arun Easi; Omar Sandoval;
> > Martin K . Petersen; James Bottomley; Christoph Hellwig; Don Brace;
> Peter
> > Rivera; Laurence Oberman
> > Subject: Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance
> via
> > .host_tagset
> >
> > On Wed, Mar 07, 2018 at 10:58:34PM +0530, Kashyap Desai wrote:
> > > > >
> > > > > Also one observation using V3 series patch. I am seeing below
> > > > > Affinity mapping whereas I have only 72 logical CPUs.  It means we
> > > > > are really not going to use all reply queues.
> > > > > e.a If I bind fio jobs on CPU 18-20, I am seeing only one reply
> > > > > queue is used and that may lead to performance drop as well.
> > > >
> > > > If the mapping is in such shape, I guess it should be quite
> > > > difficult to
> > > figure out
> > > > one perfect way to solve this situation because one reply queue has
> > > > to
> > > handle
> > > > IOs submitted from 4~5 CPUs at average.
> > >
> > > 4.15.0-rc1 kernel has below mapping - I am not sure which commit id in
> "
> > > linux_4.16-rc-host-tags-v3.2" is changing the mapping of IRQ to CPU.
> > > It
> >
> > I guess the mapping you posted is read from /proc/irq/126/smp_affinity.
> >
> > If yes, no any patch in linux_4.16-rc-host-tags-v3.2 should change IRQ
> affinity
> > code, which is done in irq_create_affinity_masks(), as you saw, no any
> patch
> > in linux_4.16-rc-host-tags-v3.2 touches that code.
> >
> > Could you simply apply the patches in linux_4.16-rc-host-tags-v3.2
> against
> > 4.15-rc1 kernel and see any difference?
> >
> > > will be really good if we can fall back to below mapping once again.
> > > Current repo linux_4.16-rc-host-tags-v3.2 is giving lots of random
> > > mapping of CPU - MSIx. And that will be problematic in performance
> run.
> > >
> > > As I posted earlier, latest repo will only allow us to use *18* reply
> >
> > Looks not see this report before, could you share us how you conclude
> that?
> > The only patch changing reply queue is the following one:
> >
> > https://marc.info/?l=linux-block=151972611911593=2
> >
> > But not see any issue in this patch yet, can you recover to 72 reply
> queues
> > after reverting the patch in above link?
> Ming -
> 
> While testing, my system went bad. I debug further and understood that
> affinity mapping was changed due to below commit -
> 84676c1f21e8ff54befe985f4f14dc1edc10046b
> 
> [PATCH] genirq/affinity: assign vectors to all possible CPUs
> 
> Because of above change, we end up using very less reply queue. Many reply
> queues on my setup was mapped to offline/not-available CPUs. This may be
> primary contributing to odd performance impact and it may not be truly due
> to V3/V4 patch series.

That seems like good news, :-)

> 
> I am planning to check your V3 and V4 series after removing above commit
> ID (for performance impact.).

You can run your test on a server in which all CPUs are kept online,
to avoid this issue.

Or you can apply the following patchset to avoid this issue:

https://marc.info/?l=linux-block=152050646332092=2

> 
> It is good if we spread possible CPUs (instead of online cpus) to all irq
> vectors  considering -  We should have at least *one* online CPU mapped to
> the vector.

Right, that is exactly what the above patchset does.

Thanks,
Ming


Re: [PATCH V3 7/8] scsi: hpsa: improve scsi_mq performance via .host_tagset

2018-03-08 Thread Ming Lei
On Thu, Mar 08, 2018 at 08:54:43AM +0100, Christoph Hellwig wrote:
> > +   /* 256 tags should be high enough to saturate device */
> > +   int max_queues = DIV_ROUND_UP(h->scsi_host->can_queue, 256);
> > +
> > +   /* per NUMA node hw queue */
> > +   h->scsi_host->nr_hw_queues = min_t(int, nr_node_ids, max_queues);
> 
> I don't think this magic should be in a driver.  The per-node hw_queue
> selection seems like something we'd better do in the core code.

The thing is that driver code may need to know whether multiple queues
are used; the driver can then partition its own resources across the hw
queues and improve its .queuecommand and .complete_command paths. That
seems to be what megaraid_sas should do next.

> 
> Also the whole idea to use nr_hw_queues for just partitioning tag
> space on hardware that doesn't really support multiple hardware queues
> seems more than odd.

The per-node hw queue is used together with BLK_MQ_F_HOST_TAGS, which is
really for improving the single-queue case (single tagset). If the
driver/device supports real multiple hw queues, this approach isn't needed.
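
As a worked example of the per-node sizing rule quoted above (the
numbers are assumptions for illustration only):

/* assume can_queue = 1024 on a dual-socket box (nr_node_ids = 2) */
int max_queues   = DIV_ROUND_UP(1024, 256);   /* = 4 */
int nr_hw_queues = min_t(int, 2, max_queues); /* = 2, one per node */
/* each per-node hw queue then still has 1024 / 2 = 512 tags */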

Thanks,
Ming


[PATCH V3 2/4] genirq/affinity: move actual irq vector spread into one helper

2018-03-08 Thread Ming Lei
No functional change; just preparation for converting to the 2-stage
irq vector spread.

Cc: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 kernel/irq/affinity.c | 97 +--
 1 file changed, 55 insertions(+), 42 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4b1c4763212d..e119e86bed48 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,50 +94,19 @@ static int get_nodes_in_cpumask(cpumask_var_t 
*node_to_cpumask,
return nodes;
 }
 
-/**
- * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
- * @nvecs: The total number of vectors
- * @affd:  Description of the affinity requirements
- *
- * Returns the masks pointer or NULL if allocation failed.
- */
-struct cpumask *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd,
+   cpumask_var_t *node_to_cpumask,
+   const struct cpumask *cpu_mask,
+   struct cpumask *nmsk,
+   struct cpumask *masks)
 {
-   int n, nodes, cpus_per_vec, extra_vecs, curvec;
int affv = nvecs - affd->pre_vectors - affd->post_vectors;
int last_affv = affv + affd->pre_vectors;
+   int curvec = affd->pre_vectors;
nodemask_t nodemsk = NODE_MASK_NONE;
-   struct cpumask *masks;
-   cpumask_var_t nmsk, *node_to_cpumask;
-
-   /*
-* If there aren't any vectors left after applying the pre/post
-* vectors don't bother with assigning affinity.
-*/
-   if (!affv)
-   return NULL;
-
-   if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
-   return NULL;
-
-   masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
-   if (!masks)
-   goto out;
+   int n, nodes, cpus_per_vec, extra_vecs;
 
-   node_to_cpumask = alloc_node_to_cpumask();
-   if (!node_to_cpumask)
-   goto out;
-
-   /* Fill out vectors at the beginning that don't need affinity */
-   for (curvec = 0; curvec < affd->pre_vectors; curvec++)
-   cpumask_copy(masks + curvec, irq_default_affinity);
-
-   /* Stabilize the cpumasks */
-   get_online_cpus();
-   build_node_to_cpumask(node_to_cpumask);
-   nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask,
-                                &nodemsk);
+   nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
 
/*
 * If the number of nodes in the mask is greater than or equal the
@@ -150,7 +119,7 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
if (++curvec == last_affv)
break;
}
-   goto done;
+   goto out;
}
 
for_each_node_mask(n, nodemsk) {
@@ -160,7 +129,7 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
 
/* Get the cpus on this node which are in the mask */
-   cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]);
+   cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
 
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
@@ -186,7 +155,51 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
--nodes;
}
 
-done:
+out:
+   return curvec - affd->pre_vectors;
+}
+
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @nvecs: The total number of vectors
+ * @affd:  Description of the affinity requirements
+ *
+ * Returns the masks pointer or NULL if allocation failed.
+ */
+struct cpumask *
+irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+{
+   int curvec;
+   struct cpumask *masks;
+   cpumask_var_t nmsk, *node_to_cpumask;
+
+   /*
+* If there aren't any vectors left after applying the pre/post
+* vectors don't bother with assigning affinity.
+*/
+   if (nvecs == affd->pre_vectors + affd->post_vectors)
+   return NULL;
+
+   if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+   return NULL;
+
+   masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+   if (!masks)
+   goto out;
+
+   node_to_cpumask = alloc_node_to_cpumask();
+   if (!node_to_cpumask)
+   goto out;
+
+   /* Fill out vectors at the beginning that don't need affinity */
+   for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+   cpumask_copy(masks + curvec, irq_

[PATCH V3 4/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-08 Thread Ming Lei
84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs")
may cause an irq vector to be assigned to only offline CPUs, and this kind
of assignment may leave far fewer irq vectors mapped to online CPUs, so
performance may suffer.

For example, in an 8-core system with CPUs 0~3 online and 4~7 offline/not present,
see 'lscpu':

[ming@box]$lscpu
Architecture:  x86_64
CPU op-mode(s):32-bit, 64-bit
Byte Order:Little Endian
CPU(s):4
On-line CPU(s) list:   0-3
Thread(s) per core:1
Core(s) per socket:2
Socket(s): 2
NUMA node(s):  2
...
NUMA node0 CPU(s): 0-3
NUMA node1 CPU(s):
...

For example, one device has 4 queues:

1) before 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs")
irq 39, cpu list 0
irq 40, cpu list 1
irq 41, cpu list 2
irq 42, cpu list 3

2) after 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs")
irq 39, cpu list 0-2
irq 40, cpu list 3-4,6
irq 41, cpu list 5
irq 42, cpu list 7

3) after applying this patch against V4.15+:
irq 39, cpu list 0,4
irq 40, cpu list 1,6
irq 41, cpu list 2,5
irq 42, cpu list 3,7

This patch tries to spread irq vectors among online CPUs as far as
possible by doing the spread in two stages.

The above assignment 3) isn't the optimal result from a NUMA point of view,
but it yields more irq vectors with an online CPU mapped. Given that in
reality one CPU should be enough to handle one irq vector, it is better to
do it this way.

Cc: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reported-by: Laurence Oberman <lober...@redhat.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 kernel/irq/affinity.c | 35 +--
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 616f040c5d02..253c5bf85d18 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -106,6 +106,9 @@ static int irq_build_affinity_masks(const struct 
irq_affinity *affd,
nodemask_t nodemsk = NODE_MASK_NONE;
int n, nodes, cpus_per_vec, extra_vecs, done = 0;
 
+   if (!cpumask_weight(cpu_mask))
+   return 0;
+
nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
 
/*
@@ -175,9 +178,9 @@ struct cpumask *
 irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
int affv = nvecs - affd->pre_vectors - affd->post_vectors;
-   int curvec;
+   int curvec, vecs_offline, vecs_online;
struct cpumask *masks;
-   cpumask_var_t nmsk, *node_to_cpumask;
+   cpumask_var_t nmsk, cpu_mask, *node_to_cpumask;
 
/*
 * If there aren't any vectors left after applying the pre/post
@@ -193,9 +196,12 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
if (!masks)
goto out;
 
+   if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+   goto out;
+
node_to_cpumask = alloc_node_to_cpumask();
if (!node_to_cpumask)
-   goto out;
+   goto out_free_cpu_mask;
 
/* Fill out vectors at the beginning that don't need affinity */
for (curvec = 0; curvec < affd->pre_vectors; curvec++)
@@ -204,15 +210,32 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
/* Stabilize the cpumasks */
get_online_cpus();
build_node_to_cpumask(node_to_cpumask);
-   curvec += irq_build_affinity_masks(affd, curvec, affv,
-  node_to_cpumask,
-  cpu_possible_mask, nmsk, masks);
+   /* spread on online CPUs starting from the vector of affd->pre_vectors 
*/
+   vecs_online = irq_build_affinity_masks(affd, curvec, affv,
+  node_to_cpumask,
+  cpu_online_mask, nmsk, masks);
+
+   /* spread on offline CPUs starting from the next vector to be handled */
+   if (vecs_online >= affv)
+   curvec = affd->pre_vectors;
+   else
+   curvec = affd->pre_vectors + vecs_online;
+   cpumask_andnot(cpu_mask, cpu_possible_mask, cpu_online_mask);
+   vecs_offline = irq_build_affinity_masks(affd, curvec, affv,
+   node_to_cpumask,
+   cpu_mask, nmsk, masks);
put_online_cpus();
 
/* Fill out vectors at the end that don't need affinity */
+   if (vecs_online + vecs_offline >= affv)
+   curvec = affv + affd->pre_vectors;
+   else
+   curvec = affd->pre_vectors + vecs_online + vecs_offlin

[PATCH V3 3/4] genirq/affinity: support to do irq vectors spread starting from any vector

2018-03-08 Thread Ming Lei
Two parameters (start_vec, affv) are introduced to irq_build_affinity_masks(),
so that this helper can build the affinity of each irq vector starting from
the vector 'start_vec', handling at most 'affv' vectors.

This is required to do the 2-stage irq vector spread among all
possible CPUs.
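
A minimal sketch of how the 2-stage caller uses the new parameters
(mirroring patch 4/4; 'offline_mask' is an illustrative name, and the
clamping of the start vector is omitted for brevity):

/* stage 1: spread over online CPUs, starting at affd->pre_vectors */
done = irq_build_affinity_masks(affd, affd->pre_vectors, affv,
				node_to_cpumask, cpu_online_mask,
				nmsk, masks);

/* stage 2: continue from the next unhandled vector over the CPUs that
 * are possible but not online; curvec wraps inside the helper */
cpumask_andnot(offline_mask, cpu_possible_mask, cpu_online_mask);
irq_build_affinity_masks(affd, affd->pre_vectors + done, affv,
			 node_to_cpumask, offline_mask, nmsk, masks);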

Cc: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 kernel/irq/affinity.c | 23 +++
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e119e86bed48..616f040c5d02 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,17 +94,17 @@ static int get_nodes_in_cpumask(cpumask_var_t 
*node_to_cpumask,
return nodes;
 }
 
-static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd,
+static int irq_build_affinity_masks(const struct irq_affinity *affd,
+   int start_vec, int affv,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk,
struct cpumask *masks)
 {
-   int affv = nvecs - affd->pre_vectors - affd->post_vectors;
int last_affv = affv + affd->pre_vectors;
-   int curvec = affd->pre_vectors;
+   int curvec = start_vec;
nodemask_t nodemsk = NODE_MASK_NONE;
-   int n, nodes, cpus_per_vec, extra_vecs;
+   int n, nodes, cpus_per_vec, extra_vecs, done = 0;
 
nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
 
@@ -116,8 +116,10 @@ static int irq_build_affinity_masks(int nvecs, const 
struct irq_affinity *affd,
for_each_node_mask(n, nodemsk) {
cpumask_copy(masks + curvec,
 node_to_cpumask[n]);
-   if (++curvec == last_affv)
+   if (++done == affv)
break;
+   if (++curvec == last_affv)
+   curvec = affd->pre_vectors;
}
goto out;
}
@@ -150,13 +152,16 @@ static int irq_build_affinity_masks(int nvecs, const 
struct irq_affinity *affd,
irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
}
 
-   if (curvec >= last_affv)
+   done += v;
+   if (done >= affv)
break;
+   if (curvec >= last_affv)
+   curvec = affd->pre_vectors;
--nodes;
}
 
 out:
-   return curvec - affd->pre_vectors;
+   return done;
 }
 
 /**
@@ -169,6 +174,7 @@ static int irq_build_affinity_masks(int nvecs, const struct 
irq_affinity *affd,
 struct cpumask *
 irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
+   int affv = nvecs - affd->pre_vectors - affd->post_vectors;
int curvec;
struct cpumask *masks;
cpumask_var_t nmsk, *node_to_cpumask;
@@ -198,7 +204,8 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
/* Stabilize the cpumasks */
get_online_cpus();
build_node_to_cpumask(node_to_cpumask);
-   curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask,
+   curvec += irq_build_affinity_masks(affd, curvec, affv,
+  node_to_cpumask,
   cpu_possible_mask, nmsk, masks);
put_online_cpus();
 
-- 
2.9.5



[PATCH V3 1/4] genirq/affinity: rename *node_to_possible_cpumask as *node_to_cpumask

2018-03-08 Thread Ming Lei
The following patches will introduce a two-stage irq spread to improve
the spread across all possible CPUs.

No functional change.

Cc: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 kernel/irq/affinity.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index a37a3b4b6342..4b1c4763212d 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, 
struct cpumask *nmsk,
}
 }
 
-static cpumask_var_t *alloc_node_to_possible_cpumask(void)
+static cpumask_var_t *alloc_node_to_cpumask(void)
 {
cpumask_var_t *masks;
int node;
@@ -62,7 +62,7 @@ static cpumask_var_t *alloc_node_to_possible_cpumask(void)
return NULL;
 }
 
-static void free_node_to_possible_cpumask(cpumask_var_t *masks)
+static void free_node_to_cpumask(cpumask_var_t *masks)
 {
int node;
 
@@ -71,7 +71,7 @@ static void free_node_to_possible_cpumask(cpumask_var_t 
*masks)
kfree(masks);
 }
 
-static void build_node_to_possible_cpumask(cpumask_var_t *masks)
+static void build_node_to_cpumask(cpumask_var_t *masks)
 {
int cpu;
 
@@ -79,14 +79,14 @@ static void build_node_to_possible_cpumask(cpumask_var_t 
*masks)
cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
 }
 
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
+static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
const struct cpumask *mask, nodemask_t *nodemsk)
 {
int n, nodes = 0;
 
/* Calculate the number of nodes in the supplied affinity mask */
for_each_node(n) {
-   if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
+   if (cpumask_intersects(mask, node_to_cpumask[n])) {
node_set(n, *nodemsk);
nodes++;
}
@@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
int last_affv = affv + affd->pre_vectors;
nodemask_t nodemsk = NODE_MASK_NONE;
struct cpumask *masks;
-   cpumask_var_t nmsk, *node_to_possible_cpumask;
+   cpumask_var_t nmsk, *node_to_cpumask;
 
/*
 * If there aren't any vectors left after applying the pre/post
@@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
if (!masks)
goto out;
 
-   node_to_possible_cpumask = alloc_node_to_possible_cpumask();
-   if (!node_to_possible_cpumask)
+   node_to_cpumask = alloc_node_to_cpumask();
+   if (!node_to_cpumask)
goto out;
 
/* Fill out vectors at the beginning that don't need affinity */
@@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
 
/* Stabilize the cpumasks */
get_online_cpus();
-   build_node_to_possible_cpumask(node_to_possible_cpumask);
-   nodes = get_nodes_in_cpumask(node_to_possible_cpumask, 
cpu_possible_mask,
+   build_node_to_cpumask(node_to_cpumask);
+   nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask,
&nodemsk);
 
/*
@@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
if (affv <= nodes) {
for_each_node_mask(n, nodemsk) {
cpumask_copy(masks + curvec,
-node_to_possible_cpumask[n]);
+node_to_cpumask[n]);
if (++curvec == last_affv)
break;
}
@@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
 
/* Get the cpus on this node which are in the mask */
-   cpumask_and(nmsk, cpu_possible_mask, 
node_to_possible_cpumask[n]);
+   cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]);
 
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
@@ -192,7 +192,7 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
/* Fill out vectors at the end that don't need affinity */
for (; curvec < nvecs; curvec++)
cpumask_copy(masks + curvec, irq_default_affinity);
-   free_node_to_possible_cpumask(node_to_possible_cpumask);
+   free_node_to_cpumask(node_to_cpumask);
 out:
free_cpumask_var(nmsk);
return masks;
-- 
2.9.5



Re: [PATCH V2 3/5] genirq/affinity: move actual irq vector spread into one helper

2018-03-08 Thread Ming Lei
On Tue, Mar 06, 2018 at 12:28:32AM +0800, kbuild test robot wrote:
> Hi Ming,
> 
> Thank you for the patch! Perhaps something to improve:
> 
> [auto build test WARNING on tip/irq/core]
> [also build test WARNING on v4.16-rc4 next-20180305]
> [if your patch is applied to the wrong git tree, please drop us a note to 
> help improve the system]
> 
> url:
> https://github.com/0day-ci/linux/commits/Ming-Lei/genirq-affinity-irq-vector-spread-among-online-CPUs-as-far-as-possible/20180305-184912
> config: i386-randconfig-a1-201809 (attached as .config)
> compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=i386 
> 
> All warnings (new ones prefixed by >>):
> 
>kernel/irq/affinity.c: In function 'irq_create_affinity_masks':
> >> kernel/irq/affinity.c:201:50: warning: passing argument 3 of 
> >> 'irq_build_affinity_masks' from incompatible pointer type
>  curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask,
>  ^
>kernel/irq/affinity.c:97:12: note: expected 'const struct cpumask (*)[1]' 
> but argument is of type 'struct cpumask (*)[1]'
> static int irq_build_affinity_masks(int nvecs, const struct irq_affinity 
> *affd,

It looks like this warning can only be triggered on ARCH=i386 with gcc-4.X.

I can't reproduce it when building on other architectures, and can't
reproduce it with gcc-6 either.


Thanks,
Ming


Re: [PATCH V3 4/8] blk-mq: introduce BLK_MQ_F_HOST_TAGS

2018-03-08 Thread Ming Lei
On Thu, Mar 08, 2018 at 08:52:52AM +0100, Christoph Hellwig wrote:
> On Tue, Feb 27, 2018 at 06:07:46PM +0800, Ming Lei wrote:
> > This patch can support to partition host-wide tags to multiple hw queues,
> > so each hw queue related data structures(tags, hctx) can be accessed in
> > NUMA locality way, for example, the hw queue can be per NUMA node.
> > 
> > It is observed IOPS can be improved much in this way on null_blk test.
> 
> null_blk isn't too interesting, so some real hardware number would
> be very useful here.

About a 10~20% IOPS improvement can be observed on scsi_debug too, set up
on one dual-socket system.

It needs an hpsa or megaraid_sas host with dozens of SSDs, which is not
easy for me to set up.

And Kashyap has been very cooperative in testing patches; it looks like V3
is much better than before thanks to the per-node hw queue.

If the atomic operations on scsi_host->host_busy are removed, and the
megaraid_sas IO path can be optimized a bit, we should see some improvement
from the per-node hw queue with BLK_MQ_F_HOST_TAGS on megaraid_sas.

> 
> Also the documentation should be a lot less sparse.  When are we going
> to set this flag?  What help are we going to give driver authors to
> guide chosing the option?

OK, will do that in next version.

Thanks,
Ming


Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue

2018-03-08 Thread Ming Lei
On Thu, Mar 08, 2018 at 09:41:16AM +0100, Hannes Reinecke wrote:
> On 03/08/2018 09:15 AM, Ming Lei wrote:
> > On Thu, Mar 08, 2018 at 08:50:35AM +0100, Christoph Hellwig wrote:
> >>> +static void hpsa_setup_reply_map(struct ctlr_info *h)
> >>> +{
> >>> + const struct cpumask *mask;
> >>> + unsigned int queue, cpu;
> >>> +
> >>> + for (queue = 0; queue < h->msix_vectors; queue++) {
> >>> + mask = pci_irq_get_affinity(h->pdev, queue);
> >>> + if (!mask)
> >>> + goto fallback;
> >>> +
> >>> + for_each_cpu(cpu, mask)
> >>> + h->reply_map[cpu] = queue;
> >>> + }
> >>> + return;
> >>> +
> >>> +fallback:
> >>> + for_each_possible_cpu(cpu)
> >>> + h->reply_map[cpu] = 0;
> >>> +}
> >>
> >> It seems a little annoying that we have to duplicate this in the driver.
> >> Wouldn't this be solved by your force_blk_mq flag and relying on the
> >> hw_ctx id?
> > 
> > This issue can be solved by force_blk_mq, but may cause performance
> > regression for host-wide tagset drivers:
> > 
> > - If the whole tagset is partitioned into each hw queue, each hw queue's
> > depth may not be high enough, especially SCSI's IO path may be not
> > efficient enough. Even though we keep each queue's depth as 256, which
> > should be high enough to exploit parallelism from device internal view,
> > but still can't get good performance.
> > 
> > - If the whole tagset is still shared among all hw queues, the shared
> > tags can be accessed from all CPUs, and IOPS is degraded.
> > 
> > Kashyap has tested the above two approaches, both hurts IOPS on 
> > megaraid_sas.
> > 
> This is precisely the issue I have been worried about, too.
> 
> The problem is not so much the tagspace (which actually is quite small
> memory footprint-wise), but rather the _requests_ indexed by the tags.

But V1 is done this way: one shared tag set is used, and requests are
allocated for each hw queue with NUMA locality. Kashyap confirmed that
IOPS recovers to normal after V1 is applied, once iostats is set to 0:

https://marc.info/?l=linux-scsi=151815231026789=2

That means the shared tag set does have a big effect on performance.

> 
> We have this:
> 
> struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
> unsigned int hctx_idx,
> unsigned int nr_tags,
> unsigned int reserved_tags)
> {
> struct blk_mq_tags *tags;
> int node;
> 
> node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
> if (node == NUMA_NO_NODE)
> node = set->numa_node;
> 
> tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
>  BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
> if (!tags)
> return NULL;
> 
> tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
>   GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
> 
> 
> IE the _entire_ request set is allocated as _one_ array, making it quite
> hard to handle from the lower-level CPU caches.
> Also the 'node' indicator doesn't really help us here, as the requests
> have to be access by all CPUs in the shared tag case.
> 
> Would it be possible move tags->rqs to become a _double_ pointer?
> Then we would have only a shared lookup table, but the requests
> themselves can be allocated per node, depending on the CPU map.
> _And_ it should be easier on the CPU cache ...

That is basically the same as the approach in V1, and similar to V3, in
which the per-node hw queue is introduced; from Kashyap's test, the
performance isn't bad. I believe IOPS can eventually be improved if the
scsi_host->host_busy operations are removed from the IO path and the
megaraid_sas driver is improved, as I mentioned earlier.
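
A hedged sketch of the double-pointer idea quoted above (hypothetical,
not part of the posted series): the tag -> request lookup table stays
shared, while each hw queue's slice of requests is allocated on its own
NUMA node. 'nodes[i]' (the NUMA node of hw queue i) is an assumed input.

static int alloc_rqs_per_node(struct request **rqs, int nr_tags,
			      int nr_hw_queues, const int *nodes)
{
	int i, j, per_hctx = nr_tags / nr_hw_queues;

	for (i = 0; i < nr_hw_queues; i++) {
		/* per-node chunk; the shared table just points into it */
		struct request *chunk =
			kzalloc_node(per_hctx * sizeof(*chunk),
				     GFP_KERNEL, nodes[i]);
		if (!chunk)
			return -ENOMEM;
		for (j = 0; j < per_hctx; j++)
			rqs[i * per_hctx + j] = &chunk[j];
	}
	return 0;
}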

Thanks,
Ming


Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue

2018-03-08 Thread Ming Lei
On Thu, Mar 08, 2018 at 08:50:35AM +0100, Christoph Hellwig wrote:
> > +static void hpsa_setup_reply_map(struct ctlr_info *h)
> > +{
> > +   const struct cpumask *mask;
> > +   unsigned int queue, cpu;
> > +
> > +   for (queue = 0; queue < h->msix_vectors; queue++) {
> > +   mask = pci_irq_get_affinity(h->pdev, queue);
> > +   if (!mask)
> > +   goto fallback;
> > +
> > +   for_each_cpu(cpu, mask)
> > +   h->reply_map[cpu] = queue;
> > +   }
> > +   return;
> > +
> > +fallback:
> > +   for_each_possible_cpu(cpu)
> > +   h->reply_map[cpu] = 0;
> > +}
> 
> It seems a little annoying that we have to duplicate this in the driver.
> Wouldn't this be solved by your force_blk_mq flag and relying on the
> hw_ctx id?

This issue can be solved by force_blk_mq, but that may cause a performance
regression for host-wide tagset drivers:

- If the whole tagset is partitioned across the hw queues, each hw queue's
depth may not be high enough, especially since SCSI's IO path may not be
efficient enough. Even if we keep each queue's depth at 256, which should
be high enough to exploit parallelism from the device's internal view, we
still can't get good performance.

- If the whole tagset is still shared among all hw queues, the shared
tags can be accessed from all CPUs, and IOPS is degraded.

Kashyap has tested the above two approaches; both hurt IOPS on megaraid_sas.
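
For illustration of the first trade-off, with assumed example numbers
(not taken from this thread):

/* host-wide tag space partitioned across per-node hw queues */
int can_queue    = 1024;                      /* host-wide tags    */
int nr_hw_queues = 4;                         /* e.g. 4 NUMA nodes */
int per_hctx     = can_queue / nr_hw_queues;  /* 256 tags per hctx */
/* 256 can still underperform if the per-queue SCSI IO path is slow */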


thanks,
Ming


Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance via .host_tagset

2018-03-07 Thread Ming Lei
On Wed, Mar 07, 2018 at 10:58:34PM +0530, Kashyap Desai wrote:
> > >
> > > Also one observation using V3 series patch. I am seeing below Affinity
> > > mapping whereas I have only 72 logical CPUs.  It means we are really
> > > not going to use all reply queues.
> > > e.a If I bind fio jobs on CPU 18-20, I am seeing only one reply queue
> > > is used and that may lead to performance drop as well.
> >
> > If the mapping is in such shape, I guess it should be quite difficult to
> figure out
> > one perfect way to solve this situation because one reply queue has to
> handle
> > IOs submitted from 4~5 CPUs at average.
> 
> 4.15.0-rc1 kernel has below mapping - I am not sure which commit id in "
> linux_4.16-rc-host-tags-v3.2" is changing the mapping of IRQ to CPU.  It

I guess the mapping you posted is read from /proc/irq/126/smp_affinity.

If so, no patch in linux_4.16-rc-host-tags-v3.2 should change the IRQ
affinity code, which is done in irq_create_affinity_masks(); as you saw, no
patch in linux_4.16-rc-host-tags-v3.2 touches that code.

Could you simply apply the patches in linux_4.16-rc-host-tags-v3.2 against
the 4.15-rc1 kernel and see if there is any difference?

> will be really good if we can fall back to below mapping once again.
> Current repo linux_4.16-rc-host-tags-v3.2 is giving lots of random mapping
> of CPU - MSIx. And that will be problematic in performance run.
> 
> As I posted earlier, latest repo will only allow us to use *18* reply

I haven't seen this report before; could you share how you reached that conclusion?
The only patch changing reply queue is the following one:

https://marc.info/?l=linux-block=151972611911593=2

But I don't see any issue in this patch yet; can you recover the 72 reply
queues after reverting the patch in the above link?

> queue instead of *72*.  Lots of performance related issue can be pop up on
> different setup due to inconsistency in CPU - MSIx mapping. BTW, changes
> in this area is intentional @" linux_4.16-rc-host-tags-v3.2". ?

As you mentioned in the following link, you didn't see a big performance
drop with linux_4.16-rc-host-tags-v3.2, right?

https://marc.info/?l=linux-block=151982993810092=2


Thanks,
Ming


Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance via .host_tagset

2018-03-07 Thread Ming Lei
On Wed, Mar 07, 2018 at 08:31:31PM +0530, Kashyap Desai wrote:
> > -Original Message-
> > From: Ming Lei [mailto:ming@redhat.com]
> > Sent: Wednesday, March 7, 2018 10:58 AM
> > To: Kashyap Desai
> > Cc: Jens Axboe; linux-block@vger.kernel.org; Christoph Hellwig; Mike
> Snitzer;
> > linux-s...@vger.kernel.org; Hannes Reinecke; Arun Easi; Omar Sandoval;
> > Martin K . Petersen; James Bottomley; Christoph Hellwig; Don Brace;
> Peter
> > Rivera; Laurence Oberman
> > Subject: Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance
> via
> > .host_tagset
> >
> > On Wed, Feb 28, 2018 at 08:28:48PM +0530, Kashyap Desai wrote:
> > > Ming -
> > >
> > > Quick testing on my setup -  Performance slightly degraded (4-5%
> > > drop)for megaraid_sas driver with this patch. (From 1610K IOPS it goes
> > > to 1544K) I confirm that after applying this patch, we have #queue =
> #numa
> > node.
> > >
> > > ls -l
> > >
> >
> /sys/devices/pci:80/:80:02.0/:83:00.0/host10/target10:2:23/10:
> > > 2:23:0/block/sdy/mq
> > > total 0
> > > drwxr-xr-x. 18 root root 0 Feb 28 09:53 0 drwxr-xr-x. 18 root root 0
> > > Feb 28 09:53 1
> > >
> > >
> > > I would suggest to skip megaraid_sas driver changes using
> > > shared_tagset until and unless there is obvious gain. If overall
> > > interface of using shared_tagset is commit in kernel tree, we will
> > > investigate (megaraid_sas
> > > driver) in future about real benefit of using it.
> >
> > Hi Kashyap,
> >
> > Now I have put patches for removing operating on scsi_host->host_busy in
> > V4[1], especially which are done in the following 3 patches:
> >
> > 9221638b9bc9 scsi: avoid to hold host_busy for scsi_mq
> > 1ffc8c0ffbe4 scsi: read host_busy via scsi_host_busy()
> > e453d3983243 scsi: introduce scsi_host_busy()
> >
> >
> > Could you run your test on V4 and see if IOPS can be improved on
> > megaraid_sas?
> >
> >
> > [1] https://github.com/ming1/linux/commits/v4.16-rc-host-tags-v4
> 
> I will be doing testing soon.

Today I revisited your previous perf trace too; it seems the following
samples take a bit more CPU:

   4.64%  [megaraid_sas]   [k] complete_cmd_fusion
   ...
   2.22%  [megaraid_sas]   [k] megasas_build_io_fusion
   ...
   1.33%  [megaraid_sas]   [k] megasas_build_and_issue_cmd_fusion

But V4 should give a bit of improvement in theory.

And if some host-wide resources of megaraid_sas can be partitioned across
the per-node hw queues, I guess some improvement can be gained there too.

> 
> BTW - Performance impact is due below patch only -
> "[PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance via
> .host_tagset"
> 
> Below patch is really needed -
> "[PATCH V3 2/8] scsi: megaraid_sas: fix selection of reply queue"
> 
> I am currently doing review on my setup.  I think above patch is fixing
> real issue of performance (for megaraid_sas) as driver may not be sending
> IO to optimal reply queue.

The ideal way is to map the reply queue to blk-mq's hw queue, but it seems
the SCSI/driver IO path is too slow, so even a high hw queue depth (from
the device's internal view, for example 256) still can't reach good
performance, as you observed.

> Having CPU to MSIx mapping will solve that. Megaraid_sas driver always
> create max MSIx as min (online CPU, # MSIx HW support).
> I will do more review and testing for that particular patch as well.

OK, thanks!

> 
> Also one observation using V3 series patch. I am seeing below Affinity
> mapping whereas I have only 72 logical CPUs.  It means we are really not
> going to use all reply queues.
> e.a If I bind fio jobs on CPU 18-20, I am seeing only one reply queue is
> used and that may lead to performance drop as well.

If the mapping is in such a shape, I guess it would be quite difficult to
figure out one perfect way to solve this situation, because one reply
queue has to handle IOs submitted from 4~5 CPUs on average.

The application should have the knowledge to avoid this kind of usage.


Thanks,
Ming


Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance via .host_tagset

2018-03-06 Thread Ming Lei
On Wed, Feb 28, 2018 at 08:28:48PM +0530, Kashyap Desai wrote:
> Ming -
> 
> Quick testing on my setup -  Performance slightly degraded (4-5% drop)for
> megaraid_sas driver with this patch. (From 1610K IOPS it goes to 1544K)
> I confirm that after applying this patch, we have #queue = #numa node.
> 
> ls -l
> /sys/devices/pci:80/:80:02.0/:83:00.0/host10/target10:2:23/10:
> 2:23:0/block/sdy/mq
> total 0
> drwxr-xr-x. 18 root root 0 Feb 28 09:53 0
> drwxr-xr-x. 18 root root 0 Feb 28 09:53 1
> 
> 
> I would suggest to skip megaraid_sas driver changes using shared_tagset
> until and unless there is obvious gain. If overall interface of using
> shared_tagset is commit in kernel tree, we will investigate (megaraid_sas
> driver) in future about real benefit of using it.

Hi Kashyap,

Now I have put the patches that remove operations on scsi_host->host_busy
into V4[1]; in particular, this is done in the following 3 patches:

9221638b9bc9 scsi: avoid to hold host_busy for scsi_mq
1ffc8c0ffbe4 scsi: read host_busy via scsi_host_busy()
e453d3983243 scsi: introduce scsi_host_busy()
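
For context, a hedged sketch of what the accessor introduced by the last
patch plausibly looks like (an assumption based on the patch title, not
the posted diff):

/* wrap the read behind an accessor so callers stop touching the field
 * directly, which lets its implementation change later */
static inline unsigned int scsi_host_busy(struct Scsi_Host *shost)
{
	return atomic_read(&shost->host_busy);
}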


Could you run your test on V4 and see if IOPS can be improved on
megaraid_sas?


[1] https://github.com/ming1/linux/commits/v4.16-rc-host-tags-v4

Thanks,
Ming


Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue

2018-03-06 Thread Ming Lei
On Tue, Mar 06, 2018 at 02:24:25PM -0500, Martin K. Petersen wrote:
> 
> Ming,
> 
> > Given both Don and Laurence have verified that patch 1 and patch 2
> > does fix IO hang, could you consider to merge the two first?
> 
> Oh, and I would still need a formal Acked-by: from Don and Tested-by:
> from Laurence.
> 
> Also, for 4.16/scsi-fixes I would prefer verification to be done with
> just patch 1/8 and none of the subsequent changes in place. Just to make
> sure we're testing the right thing.

Hi Martin,

Please consider 2/8 too, since it is also a fix.

Thanks,
Ming


[PATCH V2] block: null_blk: fix 'Invalid parameters' when loading module

2018-03-05 Thread Ming Lei
On ARM64, the default page size is 64K on some distributions, and we
should allow ARM64 people to play with null_blk.

This patch fixes the issue by extending the page bitmap size to support
non-4KB PAGE_SIZE.
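
For illustration, the bitmap-size arithmetic the patch relies on
(a worked example; SECTOR_SHIFT is 9):

/* bits needed per nullb_page: one per sector, plus LOCK and FREE */
#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2)
/*
 *  4K page: (4096  >> 9) + 2 =  10 bits -> fits in one unsigned long
 * 64K page: (65536 >> 9) + 2 = 130 bits -> overflows a single long,
 * hence DECLARE_BITMAP(bitmap, MAP_SZ) in the patch below
 */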

Cc: Bart Van Assche <bart.vanass...@wdc.com>
Cc: Shaohua Li <s...@kernel.org>
Cc: Kyungchan Koh <kkc6...@fb.com>,
Cc: weiping zhang <zhangweip...@didichuxing.com>
Cc: Yi Zhang <yi.zh...@redhat.com>
Reported-by: Yi Zhang <yi.zh...@redhat.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/block/null_blk.c | 46 +-
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 51b16249028a..3c5a684de170 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -72,6 +72,7 @@ enum nullb_device_flags {
NULLB_DEV_FL_CACHE  = 3,
 };
 
+#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2)
 /*
  * nullb_page is a page in memory for nullb devices.
  *
@@ -86,10 +87,10 @@ enum nullb_device_flags {
  */
 struct nullb_page {
struct page *page;
-   unsigned long bitmap;
+   DECLARE_BITMAP(bitmap, MAP_SZ);
 };
-#define NULLB_PAGE_LOCK (sizeof(unsigned long) * 8 - 1)
-#define NULLB_PAGE_FREE (sizeof(unsigned long) * 8 - 2)
+#define NULLB_PAGE_LOCK (MAP_SZ - 1)
+#define NULLB_PAGE_FREE (MAP_SZ - 2)
 
 struct nullb_device {
struct nullb *nullb;
@@ -732,7 +733,7 @@ static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
if (!t_page->page)
goto out_freepage;
 
-   t_page->bitmap = 0;
+   memset(t_page->bitmap, 0, sizeof(t_page->bitmap));
return t_page;
 out_freepage:
kfree(t_page);
@@ -742,13 +743,20 @@ static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
 
 static void null_free_page(struct nullb_page *t_page)
 {
-   __set_bit(NULLB_PAGE_FREE, &t_page->bitmap);
-   if (test_bit(NULLB_PAGE_LOCK, &t_page->bitmap))
+   __set_bit(NULLB_PAGE_FREE, t_page->bitmap);
+   if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap))
return;
__free_page(t_page->page);
kfree(t_page);
 }
 
+static bool null_page_empty(struct nullb_page *page)
+{
+   int size = MAP_SZ - 2;
+
+   return find_first_bit(page->bitmap, size) == size;
+}
+
 static void null_free_sector(struct nullb *nullb, sector_t sector,
bool is_cache)
 {
@@ -763,9 +771,9 @@ static void null_free_sector(struct nullb *nullb, sector_t 
sector,
 
t_page = radix_tree_lookup(root, idx);
if (t_page) {
-   __clear_bit(sector_bit, &t_page->bitmap);
+   __clear_bit(sector_bit, t_page->bitmap);
 
-   if (!t_page->bitmap) {
+   if (null_page_empty(t_page)) {
ret = radix_tree_delete_item(root, idx, t_page);
WARN_ON(ret != t_page);
null_free_page(ret);
@@ -836,7 +844,7 @@ static struct nullb_page *__null_lookup_page(struct nullb 
*nullb,
t_page = radix_tree_lookup(root, idx);
WARN_ON(t_page && t_page->page->index != idx);
 
-   if (t_page && (for_write || test_bit(sector_bit, &t_page->bitmap)))
+   if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
return t_page;
 
return NULL;
@@ -899,10 +907,10 @@ static int null_flush_cache_page(struct nullb *nullb, 
struct nullb_page *c_page)
 
t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
 
-   __clear_bit(NULLB_PAGE_LOCK, &c_page->bitmap);
-   if (test_bit(NULLB_PAGE_FREE, &c_page->bitmap)) {
+   __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
+   if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
null_free_page(c_page);
-   if (t_page && t_page->bitmap == 0) {
+   if (t_page && null_page_empty(t_page)) {
ret = radix_tree_delete_item(&nullb->dev->data,
idx, t_page);
null_free_page(t_page);
@@ -918,11 +926,11 @@ static int null_flush_cache_page(struct nullb *nullb, 
struct nullb_page *c_page)
 
for (i = 0; i < PAGE_SECTORS;
i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
-   if (test_bit(i, &c_page->bitmap)) {
+   if (test_bit(i, c_page->bitmap)) {
offset = (i << SECTOR_SHIFT);
memcpy(dst + offset, src + offset,
nullb->dev->blocksize);
-   __set_bit(i, &t_page->bitmap);
+   __set_bit(i, t_page->bitmap);
}
}
 
@@ -959,10 +967,10 @@ static int null_make_cache_space(struct nullb *nullb, 
unsigned long n)
 * We found t

Re: [PATCH] block: null_blk: fix 'Invalid parameters' failure when loading module

2018-03-05 Thread Ming Lei
On Mon, Mar 05, 2018 at 03:57:07PM +, Bart Van Assche wrote:
> On Sat, 2018-03-03 at 10:24 +0800, Ming Lei wrote:
> >  struct nullb_page {
> > struct page *page;
> > -   unsigned long bitmap;
> > +   unsigned long bitmap[DIV_ROUND_UP(MAP_SZ, sizeof(unsigned long) * 8)];
> >  };
> 
> Could DECLARE_BITMAP() have been used here?

Indeed, will do it in V2.

Thanks,
Ming


[PATCH V2 3/5] genirq/affinity: move actual irq vector spread into one helper

2018-03-04 Thread Ming Lei
No functional change; just preparation for converting to the 2-stage
irq vector spread.

Cc: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 kernel/irq/affinity.c | 97 +--
 1 file changed, 55 insertions(+), 42 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 9f49d6ef0dc8..256adf92ec62 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,50 +94,19 @@ static int get_nodes_in_cpumask(const cpumask_var_t 
*node_to_cpumask,
return nodes;
 }
 
-/**
- * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
- * @nvecs: The total number of vectors
- * @affd:  Description of the affinity requirements
- *
- * Returns the masks pointer or NULL if allocation failed.
- */
-struct cpumask *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd,
+   const cpumask_var_t *node_to_cpumask,
+   const struct cpumask *cpu_mask,
+   struct cpumask *nmsk,
+   struct cpumask *masks)
 {
-   int n, nodes, cpus_per_vec, extra_vecs, curvec;
int affv = nvecs - affd->pre_vectors - affd->post_vectors;
int last_affv = affv + affd->pre_vectors;
+   int curvec = affd->pre_vectors;
nodemask_t nodemsk = NODE_MASK_NONE;
-   struct cpumask *masks;
-   cpumask_var_t nmsk, *node_to_cpumask;
-
-   /*
-* If there aren't any vectors left after applying the pre/post
-* vectors don't bother with assigning affinity.
-*/
-   if (!affv)
-   return NULL;
-
-   if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
-   return NULL;
-
-   masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
-   if (!masks)
-   goto out;
+   int n, nodes, cpus_per_vec, extra_vecs;
 
-   node_to_cpumask = alloc_node_to_cpumask();
-   if (!node_to_cpumask)
-   goto out;
-
-   /* Fill out vectors at the beginning that don't need affinity */
-   for (curvec = 0; curvec < affd->pre_vectors; curvec++)
-   cpumask_copy(masks + curvec, irq_default_affinity);
-
-   /* Stabilize the cpumasks */
-   get_online_cpus();
-   build_node_to_cpumask(node_to_cpumask);
-   nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask,
-                                &nodemsk);
+   nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
 
/*
 * If the number of nodes in the mask is greater than or equal the
@@ -150,7 +119,7 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
if (++curvec == last_affv)
break;
}
-   goto done;
+   goto out;
}
 
for_each_node_mask(n, nodemsk) {
@@ -160,7 +129,7 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
 
/* Get the cpus on this node which are in the mask */
-   cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]);
+   cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
 
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
@@ -186,7 +155,51 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
--nodes;
}
 
-done:
+out:
+   return curvec - affd->pre_vectors;
+}
+
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @nvecs: The total number of vectors
+ * @affd:  Description of the affinity requirements
+ *
+ * Returns the masks pointer or NULL if allocation failed.
+ */
+struct cpumask *
+irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+{
+   int curvec;
+   struct cpumask *masks;
+   cpumask_var_t nmsk, *node_to_cpumask;
+
+   /*
+* If there aren't any vectors left after applying the pre/post
+* vectors don't bother with assigning affinity.
+*/
+   if (nvecs == affd->pre_vectors + affd->post_vectors)
+   return NULL;
+
+   if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+   return NULL;
+
+   masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+   if (!masks)
+   goto out;
+
+   node_to_cpumask = alloc_node_to_cpumask();
+   if (!node_to_cpumask)
+   goto out;
+
+   /* Fill out vectors at the beginning that don't need affinity */
+   for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+   cpumask_copy(masks + curvec, irq_

[PATCH V2 2/5] genirq/affinity: mark 'node_to_cpumask' as const for get_nodes_in_cpumask()

2018-03-04 Thread Ming Lei
Inside irq_create_affinity_masks(), once 'node_to_cpumask' is created,
it is accessed read-only, so mark it as const for
get_nodes_in_cpumask().

Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Christoph Hellwig <h...@lst.de>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 kernel/irq/affinity.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4b1c4763212d..9f49d6ef0dc8 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -79,7 +79,7 @@ static void build_node_to_cpumask(cpumask_var_t *masks)
cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
 }
 
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
+static int get_nodes_in_cpumask(const cpumask_var_t *node_to_cpumask,
const struct cpumask *mask, nodemask_t *nodemsk)
 {
int n, nodes = 0;
-- 
2.9.5



[PATCH V2 4/5] genirq/affinity: support to do irq vectors spread starting from any vector

2018-03-04 Thread Ming Lei
Two parameters (start_vec, affv) are introduced to irq_build_affinity_masks(),
so that this helper can build the affinity of each irq vector starting from
the vector 'start_vec', handling at most 'affv' vectors.

This is required to do the 2-stage irq vector spread among all
possible CPUs.

Cc: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 kernel/irq/affinity.c | 23 +++
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 256adf92ec62..a8c5d07890a6 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,17 +94,17 @@ static int get_nodes_in_cpumask(const cpumask_var_t 
*node_to_cpumask,
return nodes;
 }
 
-static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd,
+static int irq_build_affinity_masks(const struct irq_affinity *affd,
+   const int start_vec, const int affv,
const cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk,
struct cpumask *masks)
 {
-   int affv = nvecs - affd->pre_vectors - affd->post_vectors;
int last_affv = affv + affd->pre_vectors;
-   int curvec = affd->pre_vectors;
+   int curvec = start_vec;
nodemask_t nodemsk = NODE_MASK_NONE;
-   int n, nodes, cpus_per_vec, extra_vecs;
+   int n, nodes, cpus_per_vec, extra_vecs, done = 0;
 
nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
 
@@ -116,8 +116,10 @@ static int irq_build_affinity_masks(int nvecs, const 
struct irq_affinity *affd,
for_each_node_mask(n, nodemsk) {
cpumask_copy(masks + curvec,
 node_to_cpumask[n]);
-   if (++curvec == last_affv)
+   if (++done == affv)
break;
+   if (++curvec == last_affv)
+   curvec = affd->pre_vectors;
}
goto out;
}
@@ -150,13 +152,16 @@ static int irq_build_affinity_masks(int nvecs, const 
struct irq_affinity *affd,
irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
}
 
-   if (curvec >= last_affv)
+   done += v;
+   if (done >= affv)
break;
+   if (curvec >= last_affv)
+   curvec = affd->pre_vectors;
--nodes;
}
 
 out:
-   return curvec - affd->pre_vectors;
+   return done;
 }
 
 /**
@@ -169,6 +174,7 @@ static int irq_build_affinity_masks(int nvecs, const struct 
irq_affinity *affd,
 struct cpumask *
 irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
+   int affv = nvecs - affd->pre_vectors - affd->post_vectors;
int curvec;
struct cpumask *masks;
cpumask_var_t nmsk, *node_to_cpumask;
@@ -198,7 +204,8 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
/* Stabilize the cpumasks */
get_online_cpus();
build_node_to_cpumask(node_to_cpumask);
-   curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask,
+   curvec += irq_build_affinity_masks(affd, curvec, affv,
+  node_to_cpumask,
   cpu_possible_mask, nmsk, masks);
put_online_cpus();
 
-- 
2.9.5



[PATCH V2 5/5] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-04 Thread Ming Lei
84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs")
may cause an irq vector to be assigned to only offline CPUs, and this kind
of assignment may leave far fewer irq vectors mapped to online CPUs, so
performance may suffer.

For example, in an 8-core system with CPUs 0~3 online and 4~7 offline/not present,
see 'lscpu':

[ming@box]$lscpu
Architecture:  x86_64
CPU op-mode(s):32-bit, 64-bit
Byte Order:Little Endian
CPU(s):4
On-line CPU(s) list:   0-3
Thread(s) per core:1
Core(s) per socket:2
Socket(s): 2
NUMA node(s):  2
...
NUMA node0 CPU(s): 0-3
NUMA node1 CPU(s):
...

For example, one device has 4 queues:

1) before 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs")
irq 39, cpu list 0
irq 40, cpu list 1
irq 41, cpu list 2
irq 42, cpu list 3

2) after 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs")
irq 39, cpu list 0-2
irq 40, cpu list 3-4,6
irq 41, cpu list 5
irq 42, cpu list 7

3) after applying this patch against V4.15+:
irq 39, cpu list 0,4
irq 40, cpu list 1,6
irq 41, cpu list 2,5
irq 42, cpu list 3,7

This patch tries to spread irq vectors among online CPUs as far as
possible by doing the spread in two stages.

The above assignment 3) isn't the optimal result from a NUMA point of view,
but it yields more irq vectors with an online CPU mapped. Given that in
reality one CPU should be enough to handle one irq vector, it is better to
do it this way.

Cc: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reported-by: Laurence Oberman <lober...@redhat.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 kernel/irq/affinity.c | 35 +--
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index a8c5d07890a6..aa2635416fc5 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -106,6 +106,9 @@ static int irq_build_affinity_masks(const struct 
irq_affinity *affd,
nodemask_t nodemsk = NODE_MASK_NONE;
int n, nodes, cpus_per_vec, extra_vecs, done = 0;
 
+   if (!cpumask_weight(cpu_mask))
+   return 0;
+
nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
 
/*
@@ -175,9 +178,9 @@ struct cpumask *
 irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
int affv = nvecs - affd->pre_vectors - affd->post_vectors;
-   int curvec;
+   int curvec, vecs_offline, vecs_online;
struct cpumask *masks;
-   cpumask_var_t nmsk, *node_to_cpumask;
+   cpumask_var_t nmsk, cpu_mask, *node_to_cpumask;
 
/*
 * If there aren't any vectors left after applying the pre/post
@@ -193,9 +196,12 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
if (!masks)
goto out;
 
+   if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+   goto out;
+
node_to_cpumask = alloc_node_to_cpumask();
if (!node_to_cpumask)
-   goto out;
+   goto out_free_cpu_mask;
 
/* Fill out vectors at the beginning that don't need affinity */
for (curvec = 0; curvec < affd->pre_vectors; curvec++)
@@ -204,15 +210,32 @@ irq_create_affinity_masks(int nvecs, const struct 
irq_affinity *affd)
/* Stabilize the cpumasks */
get_online_cpus();
build_node_to_cpumask(node_to_cpumask);
-   curvec += irq_build_affinity_masks(affd, curvec, affv,
-  node_to_cpumask,
-  cpu_possible_mask, nmsk, masks);
+   /* spread on online CPUs starting from the vector of affd->pre_vectors 
*/
+   vecs_online = irq_build_affinity_masks(affd, curvec, affv,
+  node_to_cpumask,
+  cpu_online_mask, nmsk, masks);
+
+   /* spread on offline CPUs starting from the next vector to be handled */
+   if (vecs_online >= affv)
+   curvec = affd->pre_vectors;
+   else
+   curvec = affd->pre_vectors + vecs_online;
+   cpumask_andnot(cpu_mask, cpu_possible_mask, cpu_online_mask);
+   vecs_offline = irq_build_affinity_masks(affd, curvec, affv,
+   node_to_cpumask,
+   cpu_mask, nmsk, masks);
put_online_cpus();
 
/* Fill out vectors at the end that don't need affinity */
+   if (vecs_online + vecs_offline >= affv)
+   curvec = affv + affd->pre_vectors;
+   else
+   curvec = affd->pre_vectors + vecs_online + vecs_offline;

[PATCH V2 0/5] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-04 Thread Ming Lei
Hi,

This patchset tries to spread irq vectors among online CPUs as far as
possible, so that we can avoid allocating too few irq vectors with online
CPUs mapped.

For example, in an 8-core system where 4 CPU cores (4~7) are offline/not present,
on a device with 4 queues:

1) before this patchset
irq 39, cpu list 0-2
irq 40, cpu list 3-4,6
irq 41, cpu list 5
irq 42, cpu list 7

2) after this patchset
irq 39, cpu list 0,4
irq 40, cpu list 1,6
irq 41, cpu list 2,5
irq 42, cpu list 3,7

Without this patchset, only two vectors (39, 40) can be active, but all
4 irq vectors can be active after applying this patchset.

One disadvantage is that CPUs from different NUMA nodes can be mapped to
the same irq vector. Given that in general one CPU should be enough to
handle one irq vector, this shouldn't be a big deal; otherwise more
vectors would have to be allocated, or performance can be hurt by the
current assignment.
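
For illustration, below is a userspace toy model of the two-stage spread
(purely a sketch: the CPU and vector counts are made up, and it ignores the
NUMA-aware grouping the real code does, which is why its cpu lists come out
as 0,4 / 1,5 / 2,6 / 3,7 instead of exactly the listing above):

#include <stdio.h>

#define NR_CPUS	8
#define NR_VECS	4

int main(void)
{
	int online[NR_CPUS] = { 1, 1, 1, 1, 0, 0, 0, 0 };	/* CPUs 0-3 online */
	int vec_of_cpu[NR_CPUS];
	int stage, cpu, vec = 0;

	/* stage 0: spread vectors over online CPUs; stage 1: over offline ones */
	for (stage = 0; stage < 2; stage++)
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			if (online[cpu] == !stage)
				vec_of_cpu[cpu] = vec++ % NR_VECS;

	for (vec = 0; vec < NR_VECS; vec++) {
		printf("vector %d, cpu list", vec);
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			if (vec_of_cpu[cpu] == vec)
				printf(" %d", cpu);
		printf("\n");
	}
	return 0;
}

Because the online pass runs first, every vector ends up with at least one
online CPU mapped, which is the whole point of the two-stage spread.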

V2:
- address comments from Christoph
- mark irq_build_affinity_masks as static
- move constification of get_nodes_in_cpumask's parameter into one
  prep patch
- add Reviewed-by tag

Thanks
Ming

Ming Lei (5):
  genirq/affinity: rename *node_to_possible_cpumask as *node_to_cpumask
  genirq/affinity: mark 'node_to_cpumask' as const for
get_nodes_in_cpumask()
  genirq/affinity: move actual irq vector spread into one helper
  genirq/affinity: support to do irq vectors spread starting from any
vector
  genirq/affinity: irq vector spread among online CPUs as far as
possible

 kernel/irq/affinity.c | 145 --
 1 file changed, 94 insertions(+), 51 deletions(-)

-- 
2.9.5



Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue

2018-03-04 Thread Ming Lei
On Fri, Mar 02, 2018 at 04:53:21PM -0500, Laurence Oberman wrote:
> On Fri, 2018-03-02 at 15:03 +, Don Brace wrote:
> > > -----Original Message-----
> > > From: Laurence Oberman [mailto:lober...@redhat.com]
> > > Sent: Friday, March 02, 2018 8:09 AM
> > > To: Ming Lei <ming@redhat.com>
> > > Cc: Don Brace <don.br...@microsemi.com>; Jens Axboe <axboe@kernel.dk>;
> > > linux-block@vger.kernel.org; Christoph Hellwig <h...@infradead.org>;
> > > Mike Snitzer <snit...@redhat.com>; linux-s...@vger.kernel.org;
> > > Hannes Reinecke <h...@suse.de>; Arun Easi <arun.e...@cavium.com>;
> > > Omar Sandoval <osan...@fb.com>; Martin K . Petersen <martin.peter...@oracle.com>;
> > > James Bottomley <james.bottom...@hansenpartnership.com>;
> > > Christoph Hellwig <h...@lst.de>; Kashyap Desai <kashyap.de...@broadcom.com>;
> > > Peter Rivera <peter.riv...@broadcom.com>; Meelis Roos <mr...@linux.ee>
> > > Subject: Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue
> > > 
> > > EXTERNAL EMAIL
> > > 
> > > 
> > > On Fri, 2018-03-02 at 10:16 +0800, Ming Lei wrote:
> > > > On Thu, Mar 01, 2018 at 04:19:34PM -0500, Laurence Oberman wrote:
> > > > > On Thu, 2018-03-01 at 14:01 -0500, Laurence Oberman wrote:
> > > > > > On Thu, 2018-03-01 at 16:18 +, Don Brace wrote:
> > > > > > > > -----Original Message-----
> > > > > > > > From: Ming Lei [mailto:ming@redhat.com]
> > > > > > > > Sent: Tuesday, February 27, 2018 4:08 AM
> > > > > > > > To: Jens Axboe <axboe@kernel.dk>; linux-block@vger.kernel.org;
> > > > > > > > Christoph Hellwig <h...@infradead.org>; Mike Snitzer <snitzer@redhat.com>
> > > > > > > > Cc: linux-s...@vger.kernel.org; Hannes Reinecke <hare@suse.de>;
> > > > > > > > Arun Easi <arun.e...@cavium.com>; Omar Sandoval <osan...@fb.com>;
> > > > > > > > Martin K . Petersen <martin.peter...@oracle.com>; James Bottomley
> > > > > > > > <james.bottom...@hansenpartnership.com>; Christoph Hellwig <hch@lst.de>;
> > > > > > > > Don Brace <don.br...@microsemi.com>; Kashyap Desai
> > > > > > > > <kashyap.de...@broadcom.com>; Peter Rivera <peter.rivera@broadcom.com>;
> > > > > > > > Laurence Oberman <lober...@redhat.com>; Ming Lei
> > > > > > > > <ming@redhat.com>; Meelis Roos <mr...@linux.ee>
> > > > > > > > Subject: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue
> > > > > > > > 
> > > > 
> > > > Seems Don ran into an IO failure without blk-mq, could you run your
> > > > tests again in legacy mode?
> > > > 
> > > > Thanks,
> > > > Ming
> > > 
> > > Hello Ming
> > > I ran multiple passes on Legacy and still see no issues in my test
> > > bed
> > > 
> > > BOOT_IMAGE=/vmlinuz-4.16.0-rc2.ming+ root=UUID=43f86d71-b1bf-4789-
> > > a28e-
> > > 21c6ddc90195 ro crashkernel=256M@64M log_buf_len=64M
> > > console=ttyS1,115200n8
> > > 
> > > HEAD of the git kernel I am using
> > > 
> > > 694e16f scsi: megaraid: improve scsi_mq performance via
> > > .host_tagset
> > > 793686c scsi: hpsa: improve scsi_mq performance via .host_tagset
> > > 60d5b36 block: null_blk: introduce module parameter of
> > > 'g_host_tags'
> > > 8847067 scsi: Add template flag 'host_tagset'
> > > a8fbdd6 blk-mq: introduce BLK_MQ_F_HOST_TAGS
> > > 4710fab blk-mq: introduce 'start_tag' field to 'struct blk_mq_tags'
> > > 09bb153 scsi: megaraid_sas: fix selection of reply queue
> > > 52700d8 scsi: hpsa: fix selection of reply queue
> > 
> > I checked out Linus's tree (4.16.0-rc3+) and re-applied the above
> > patches, and have been running 24 hours with no issues.
> > Evidently my forked copy was corrupted. 
> > 
> > So, my I/O testing has gone well. 
> > 
> > I'll run some performance numbers next.
> > 
> > Thanks,
> > Don
> 
> Unless Kashyap is unhappy, we need to consider getting this in to
> Linus now, because we are seeing HPE servers that keep hanging with
> the original commit upstream.

Hi Martin,

Given that both Don and Laurence have verified that patch 1 and patch 2
do fix the IO hang, could you consider merging those two first?

Thanks,
Ming


Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue

2018-03-01 Thread Ming Lei
On Thu, Mar 01, 2018 at 04:19:34PM -0500, Laurence Oberman wrote:
> On Thu, 2018-03-01 at 14:01 -0500, Laurence Oberman wrote:
> > On Thu, 2018-03-01 at 16:18 +, Don Brace wrote:
> > > > -----Original Message-----
> > > > From: Ming Lei [mailto:ming@redhat.com]
> > > > Sent: Tuesday, February 27, 2018 4:08 AM
> > > > To: Jens Axboe <ax...@kernel.dk>; linux-block@vger.kernel.org;
> > > > Christoph Hellwig <h...@infradead.org>; Mike Snitzer <snit...@redhat.com>
> > > > Cc: linux-s...@vger.kernel.org; Hannes Reinecke <h...@suse.de>;
> > > > Arun Easi <arun.e...@cavium.com>; Omar Sandoval <osan...@fb.com>;
> > > > Martin K . Petersen <martin.peter...@oracle.com>; James Bottomley
> > > > <james.bottom...@hansenpartnership.com>; Christoph Hellwig <hch@lst.de>;
> > > > Don Brace <don.br...@microsemi.com>; Kashyap Desai
> > > > <kashyap.de...@broadcom.com>; Peter Rivera <peter.rivera@broadcom.com>;
> > > > Laurence Oberman <lober...@redhat.com>; Ming Lei
> > > > <ming@redhat.com>; Meelis Roos <mr...@linux.ee>
> > > > Subject: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue
> > > > 
> > > > EXTERNAL EMAIL
> > > > 
> > > > 
> > > > From 84676c1f21 (genirq/affinity: assign vectors to all possible
> > > > CPUs), one msix vector can be created without any online CPU mapped,
> > > > then one command's completion may not be notified.
> > > > 
> > > > This patch sets up the mapping between cpu and reply queue according
> > > > to irq affinity info retrieved by pci_irq_get_affinity(), and uses
> > > > this mapping table to choose the reply queue for queuing one command.
> > > > 
> > > > Then the chosen reply queue has to be active, and this fixes the IO
> > > > hang caused by using an inactive reply queue which doesn't have any
> > > > online CPU mapped.
> > > > 
> > > > Cc: Hannes Reinecke <h...@suse.de>
> > > > Cc: Arun Easi <arun.e...@cavium.com>
> > > > Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
> > > > Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
> > > > Cc: Christoph Hellwig <h...@lst.de>,
> > > > Cc: Don Brace <don.br...@microsemi.com>
> > > > Cc: Kashyap Desai <kashyap.de...@broadcom.com>
> > > > Cc: Peter Rivera <peter.riv...@broadcom.com>
> > > > Cc: Laurence Oberman <lober...@redhat.com>
> > > > Cc: Meelis Roos <mr...@linux.ee>
> > > > Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all
> > > > possible CPUs")
> > > > Signed-off-by: Ming Lei <ming@redhat.com>
> > > 
> > > I am getting some issues that need to be tracked down:
> > > 
> > > [ 1636.032984] hpsa 0000:87:00.0: Acknowledging event: 0xc032 (HP SSD Smart Path configuration change)
> > > [ 1638.510656] hpsa 0000:87:00.0: scsi 3:0:8:0: updated Direct-Access HP   MO0400JDVEU  PHYS DRV SSDSmartPathCap- En- Exp=0
> > > [ 1653.967695] hpsa 0000:87:00.0: Acknowledging event: 0x8020 (HP SSD Smart Path configuration change)
> > > [ 1656.770377] hpsa 0000:87:00.0: scsi 3:0:8:0: updated Direct-Access HP   MO0400JDVEU  PHYS DRV SSDSmartPathCap- En- Exp=0
> > > [ 2839.762267] hpsa 0000:87:00.0: Acknowledging event: 0x8020 (HP SSD Smart Path configuration change)
> > > [ 2840.841290] hpsa 0000:87:00.0: scsi 3:0:8:0: updated Direct-Access HP   MO0400JDVEU  PHYS DRV SSDSmartPathCap- En- Exp=0
> > > [ 2917.582653] hpsa 0000:87:00.0: Acknowledging event: 0xc020 (HP SSD Smart Path configuration change)
> > > [ 2919.087191] hpsa 0000:87:00.0: scsi 3:1:0:1: updated Direct-Access HP   LOGICAL VOLUME   RAID-5 SSDSmartPathCap+ En+ Exp=1
> > > [ 2919.142527] hpsa 0000:87:00.0: hpsa_figure_phys_disk_ptrs: [3:1:0:2] A phys disk component of LV is missing

Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue

2018-03-01 Thread Ming Lei
Hi Don,

Thanks for your test!

On Thu, Mar 01, 2018 at 04:18:17PM +, Don Brace wrote:
> > -Original Message-
> > From: Ming Lei [mailto:ming@redhat.com]
> > Sent: Tuesday, February 27, 2018 4:08 AM
> > To: Jens Axboe <ax...@kernel.dk>; linux-block@vger.kernel.org; Christoph
> > Hellwig <h...@infradead.org>; Mike Snitzer <snit...@redhat.com>
> > Cc: linux-s...@vger.kernel.org; Hannes Reinecke <h...@suse.de>; Arun Easi
> > <arun.e...@cavium.com>; Omar Sandoval <osan...@fb.com>; Martin K .
> > Petersen <martin.peter...@oracle.com>; James Bottomley
> > <james.bottom...@hansenpartnership.com>; Christoph Hellwig <h...@lst.de>;
> > Don Brace <don.br...@microsemi.com>; Kashyap Desai
> > <kashyap.de...@broadcom.com>; Peter Rivera <peter.riv...@broadcom.com>;
> > Laurence Oberman <lober...@redhat.com>; Ming Lei
> > <ming@redhat.com>; Meelis Roos <mr...@linux.ee>
> > Subject: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue
> > 
> > EXTERNAL EMAIL
> > 
> > 
> > From 84676c1f21 (genirq/affinity: assign vectors to all possible CPUs),
> > one msix vector can be created without any online CPU mapped, then one
> > command's completion may not be notified.
> > 
> > This patch sets up the mapping between cpu and reply queue according to irq
> > affinity info retrieved by pci_irq_get_affinity(), and uses this mapping
> > table to choose the reply queue for queuing one command.
> > 
> > Then the chosen reply queue has to be active, and this fixes the IO hang
> > caused by using an inactive reply queue which doesn't have any online CPU
> > mapped.
> > 
> > Cc: Hannes Reinecke <h...@suse.de>
> > Cc: Arun Easi <arun.e...@cavium.com>
> > Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
> > Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
> > Cc: Christoph Hellwig <h...@lst.de>,
> > Cc: Don Brace <don.br...@microsemi.com>
> > Cc: Kashyap Desai <kashyap.de...@broadcom.com>
> > Cc: Peter Rivera <peter.riv...@broadcom.com>
> > Cc: Laurence Oberman <lober...@redhat.com>
> > Cc: Meelis Roos <mr...@linux.ee>
> > Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible CPUs")
> > Signed-off-by: Ming Lei <ming@redhat.com>
> 
> I am getting some issues that need to be tracked down:

I checked the patch one more time and didn't find anything odd. The only
thing is that inside hpsa_do_reset(), wait_for_device_to_become_ready() is
called to send 'test unit ready' always via reply queue 0. Do you know
if something bad may happen if another non-zero reply queue is used?

Could you share with us how you reproduce this issue?

It looks like you can boot successfully, so could you please provide the
following output?

1) what is your server type? We may find one in our lab, so that I can
try to reproduce it.

2) lscpu

3) irq affinity info; you need to pass the 1st column of 'lspci' output
for your hpsa PCI device to this script:

#!/bin/sh
# usage: sh <this script> [pci-id], e.g. 87:00.0
if [ $# -ge 1 ]; then
	PCID=$1
else
	# default to the first NVMe controller if no PCI ID is given
	PCID=`lspci | grep "Non-Volatile memory" | cut -c1-7`
fi
# quote the pattern so the shell doesn't expand the glob itself
PCIP=`find /sys/devices -name "*$PCID" | grep pci`
IRQS=`ls $PCIP/msi_irqs`

echo "kernel version: "
uname -a

echo "PCI name is $PCID, dump its irq affinity:"
for IRQ in $IRQS; do
	CPUS=`cat /proc/irq/$IRQ/smp_affinity_list`
	echo "\tirq $IRQ, cpu list $CPUS"
done


Thanks,
Ming


Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance via .host_tagset

2018-02-28 Thread Ming Lei
On Thu, Mar 01, 2018 at 10:54:17AM +0530, Kashyap Desai wrote:
> > -----Original Message-----
> > From: Laurence Oberman [mailto:lober...@redhat.com]
> > Sent: Wednesday, February 28, 2018 9:52 PM
> > To: Ming Lei; Kashyap Desai
> > Cc: Jens Axboe; linux-block@vger.kernel.org; Christoph Hellwig;
> > Mike Snitzer; linux-s...@vger.kernel.org; Hannes Reinecke; Arun Easi;
> > Omar Sandoval; Martin K . Petersen; James Bottomley; Christoph Hellwig;
> > Don Brace; Peter Rivera
> > Subject: Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance
> > via .host_tagset
> >
> > On Wed, 2018-02-28 at 23:21 +0800, Ming Lei wrote:
> > > On Wed, Feb 28, 2018 at 08:28:48PM +0530, Kashyap Desai wrote:
> > > > Ming -
> > > >
> > > > Quick testing on my setup - performance slightly degraded (4-5%
> > > > drop) for the megaraid_sas driver with this patch. (From 1610K IOPS
> > > > it goes to 1544K.)
> > > > I confirm that after applying this patch, we have #queue = #numa
> > > > node.
> > > >
> > > > ls -l
> > > > /sys/devices/pci0000:80/0000:80:02.0/0000:83:00.0/host10/target10:2:23/10:2:23:0/block/sdy/mq
> > > > total 0
> > > > drwxr-xr-x. 18 root root 0 Feb 28 09:53 0 drwxr-xr-x. 18 root root 0
> > > > Feb 28 09:53 1
> > >
> > > OK, thanks for your test.
> > >
> > > As I mentioned to you, this patch should have improved performance on
> > > megaraid_sas, but the current slight degradation might be caused by
> > > scsi_host_queue_ready() in scsi_queue_rq(), I guess.
> > >
> > > With .host_tagset enabled and a per-NUMA-node hw queue used, requests
> > > can be queued to the LLD more frequently/quickly than with a single
> > > queue, so the cost of atomic_inc_return(&shost->host_busy) may increase
> > > much meantime; think about millions of such operations, and finally a
> > > slight IOPS drop is observed when the hw queue depth becomes half of
> > > .can_queue.
> > >
> > > >
> > > >
> > > > I would suggest skipping the megaraid_sas driver changes using
> > > > shared_tagset until and unless there is an obvious gain. If the
> > > > overall interface of using shared_tagset is committed to the kernel
> > > > tree, we will investigate the real benefit of using it (for the
> > > > megaraid_sas driver) in the future.
> > >
> > > I'd suggest not merging it until it is proved that performance can be
> > > improved on real devices.
> 
> Noted.
> 
> > >
> > > I will try to work to remove the expensive
> > > atomic_inc_return(&shost->host_busy) from scsi_queue_rq(), since it
> > > isn't needed for SCSI_MQ; once it is done, I will ask you to test again.
> 
> Ming - Do you mean that removing the host_busy accounting from
> scsi_queue_rq() will still provide a correct value in host_busy whenever
> IO reaches the LLD?

The host queue depth has already been respected by blk-mq before
scsi_queue_rq() is called, so it isn't necessary to enforce it again in
scsi_queue_rq(); but this counter is needed in the error handler, so we
have to figure out a way to not break the error handler.

Also, the megaraid_sas driver needs to be checked for any host-wide lock
used in .queuecommand or the completion path.

Thanks,
Ming


Re: [PATCH] mq-deadline: Make sure to always unlock zones

2018-02-28 Thread Ming Lei
On Wed, Feb 28, 2018 at 09:35:29AM -0800, Bart Van Assche wrote:
> From: Damien Le Moal <damien.lem...@wdc.com>
> 
> In case of a failed write request (all retries failed) and when using
> libata, the SCSI error handler calls scsi_finish_command(). In the
> case of blk-mq this means that scsi_mq_done() does not get called,
> that blk_mq_complete_request() does not get called and also that the
> mq-deadline .completed_request() method is not called. This results in
> the target zone of the failed write request being left in a locked
> state, preventing that any new write requests are issued to the same
> zone.
> 
> Fix this by replacing the .completed_request() method with the
> .finish_request() method as this method is always called whether or
> not a request completes successfully. Since the .finish_request()
> method is only called by the blk-mq core if a .prepare_request()
> method exists, add a dummy .prepare_request() method.
> 
> Fixes: 5700f69178e9 ("mq-deadline: Introduce zone locking support")
> Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
> Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
> [ bvanassche: edited patch description ]
> Cc: Hannes Reinecke <h...@suse.com>
> Cc: Ming Lei <ming@redhat.com>
> ---
>  block/mq-deadline.c | 16 +---
>  1 file changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index c56f211c8440..8ec0ba9f5386 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -535,13 +535,22 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
>   spin_unlock(>lock);
>  }
>  
> +/*
> + * Nothing to do here. This is defined only to ensure that .finish_request
> + * method is called upon request completion.
> + */
> +static void dd_prepare_request(struct request *rq, struct bio *bio)
> +{
> +}
> +
>  /*
>   * For zoned block devices, write unlock the target zone of
>   * completed write requests. Do this while holding the zone lock
>   * spinlock so that the zone is never unlocked while deadline_fifo_request()
> - * while deadline_next_request() are executing.
> + * or deadline_next_request() are executing. This function is called for
> + * all requests, whether or not these requests complete successfully.
>   */
> -static void dd_completed_request(struct request *rq)
> +static void dd_finish_request(struct request *rq)
>  {
>   struct request_queue *q = rq->q;
>  
> @@ -756,7 +765,8 @@ static struct elevator_type mq_deadline = {
>   .ops.mq = {
>   .insert_requests= dd_insert_requests,
>   .dispatch_request   = dd_dispatch_request,
> - .completed_request  = dd_completed_request,
> + .prepare_request= dd_prepare_request,
> + .finish_request = dd_finish_request,
>   .next_request   = elv_rb_latter_request,
>   .former_request = elv_rb_former_request,
>   .bio_merge  = dd_bio_merge,
> -- 
> 2.16.2
> 

Looks fine:

Reviewed-by: Ming Lei <ming@redhat.com>

Thanks,
Ming


Re: [PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance via .host_tagset

2018-02-28 Thread Ming Lei
On Wed, Feb 28, 2018 at 08:28:48PM +0530, Kashyap Desai wrote:
> Ming -
> 
> Quick testing on my setup - performance slightly degraded (4-5% drop) for
> the megaraid_sas driver with this patch. (From 1610K IOPS it goes to 1544K.)
> I confirm that after applying this patch, we have #queue = #numa node.
> 
> ls -l
> /sys/devices/pci0000:80/0000:80:02.0/0000:83:00.0/host10/target10:2:23/10:2:23:0/block/sdy/mq
> total 0
> drwxr-xr-x. 18 root root 0 Feb 28 09:53 0
> drwxr-xr-x. 18 root root 0 Feb 28 09:53 1

OK, thanks for your test.

As I mentioned to you, this patch should have improved performance on
megaraid_sas, but the current slight degradation might be caused by
scsi_host_queue_ready() in scsi_queue_rq(), I guess.

With .host_tagset enabled and a per-NUMA-node hw queue used, requests can be
queued to the LLD more frequently/quickly than with a single queue, so the
cost of atomic_inc_return(&shost->host_busy) may increase much meantime;
think about millions of such operations, and finally a slight IOPS drop is
observed when the hw queue depth becomes half of .can_queue.
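
The cost is easy to demonstrate outside the kernel with C11 atomics; the
sketch below is illustrative only ('host_busy' is just a userspace stand-in
for the per-host counter, not the SCSI one), and timing it with 1 vs. 4
threads shows the cache-line bouncing (build with 'gcc -O2 -pthread'):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

#define NR_THREADS	4
#define NR_ITERS	(1 << 22)

static atomic_int host_busy;	/* stand-in for the shared per-host counter */

static void *submit_path(void *arg)
{
	int i;

	(void)arg;
	for (i = 0; i < NR_ITERS; i++) {
		/* every 'queued command' bounces this cache line between cores */
		atomic_fetch_add(&host_busy, 1);
		atomic_fetch_sub(&host_busy, 1);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NR_THREADS];
	int i;

	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&t[i], NULL, submit_path, NULL);
	for (i = 0; i < NR_THREADS; i++)
		pthread_join(t[i], NULL);
	printf("final host_busy = %d\n", atomic_load(&host_busy));
	return 0;
}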

> 
> 
> I would suggest skipping the megaraid_sas driver changes using shared_tagset
> until and unless there is an obvious gain. If the overall interface of using
> shared_tagset is committed to the kernel tree, we will investigate the real
> benefit of using it (for the megaraid_sas driver) in the future.

I'd suggest not merging it until it is proved that performance can be
improved on real devices.

I will try to work to remove the expensive atomic_inc_return(&shost->host_busy)
from scsi_queue_rq(), since it isn't needed for SCSI_MQ; once it is done, I
will ask you to test again.


Thanks,
Ming


Re: [PATCH] blk-mq: Make sure that the affected zone is unlocked if a request times out

2018-02-27 Thread Ming Lei
Hi Damien,

On Wed, Feb 28, 2018 at 02:21:49AM +, Damien Le Moal wrote:
> Ming,
> 
> On 2018/02/27 17:35, Ming Lei wrote:
> > On Tue, Feb 27, 2018 at 04:28:30PM -0800, Bart Van Assche wrote:
> >> If a request times out the .completed_request() method is not called
> > 
> > If BLK_EH_HANDLED is returned from .timeout(), __blk_mq_complete_request()
> > should have called .completed_request(). Otherwise, somewhere may be
> > wrong about timeout handling. Could you investigate why .completed_request
> > isn't called under this situation?
> 
> Actually, the commit message is a little misleading. The problem is not only
> for timeout but also for commands completing with a failure. This is very easy
> to reproduce by simply doing an unaligned write to a sequential zone on an ATA
> zoned block device. In this case, the scheduler .completed_request method is
> not called, which results in the target zone of the failed write remaining
> locked.

Actually the request should have been completed in the case of timeout,
otherwise the race between timeout and normal completion can't be
avoided.

But for a dispatch failure, we deal with that via blk_mq_end_request(IOERR)
directly, please see blk_mq_dispatch_rq_list(), so the failed request is
freed without going through completion.

> 
> Hence the addition of a .finish_request method in mq-deadline pointing to the
> same function as .completed_request to ensure that the command target zone is
> unlocked. To ensure that the .finish_request method is called, the RQF_ELVPRIV
> flag is set when the request is dispatched after the target zone was locked.

> 
> >> and the .finish_request() method is only called if RQF_ELVPRIV has
> > 
> > .finish_request() is counter-pair of .prepare_request(), and both
> > aren't implemented by mq-deadline, so RQF_ELVPRIV needn't to be set,
> > and the current rule is that this flag is managed by block core.
> 
> Indeed. So do you think it would be better to force a call to
> .completed_request for failed commands in the ATA case? Currently, it is not
> called after all retries for the command have failed.

Now we know the reason; either way seems fine:

1) handle it before calling blk_mq_end_request(IOERR)

2) introduce both .prepare_request()/.finish_request(), and take the request's
zone write lock in .dispatch_request but release it in .finish_request, just
like what kyber does.


Thanks,
Ming


Re: [PATCH] blk-mq: Make sure that the affected zone is unlocked if a request times out

2018-02-27 Thread Ming Lei
On Tue, Feb 27, 2018 at 04:28:30PM -0800, Bart Van Assche wrote:
> If a request times out the .completed_request() method is not called

If BLK_EH_HANDLED is returned from .timeout(), __blk_mq_complete_request()
should have called .completed_request(). Otherwise, something may be
wrong in the timeout handling. Could you investigate why .completed_request()
isn't called in this situation?

> and the .finish_request() method is only called if RQF_ELVPRIV has

.finish_request() is the counter-pair of .prepare_request(), and neither
is implemented by mq-deadline, so RQF_ELVPRIV needn't be set; the current
rule is that this flag is managed by the block core.

> been set. Hence this patch that sets RQF_ELVPRIV and that adds a
> .finish_request() method. Without this patch, if a request times out
> the zone that request applies to remains locked forever and no further
> writes are accepted for that zone.
> 
> Fixes: 5700f69178e9 ("mq-deadline: Introduce zone locking support")
> Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
> Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
> Cc: Hannes Reinecke <h...@suse.com>
> Cc: Ming Lei <ming@redhat.com>
> ---
>  block/mq-deadline.c | 8 ++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index c56f211c8440..55d5b7a02d62 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -367,7 +367,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd)
>* If the request needs its target zone locked, do it.
>*/
>   blk_req_zone_write_lock(rq);
> - rq->rq_flags |= RQF_STARTED;
> + /* Set RQF_ELVPRIV to ensure that .finish_request() gets called */
> + rq->rq_flags |= RQF_STARTED | RQF_ELVPRIV;
>   return rq;
>  }
>  
> @@ -539,7 +540,9 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
>   * For zoned block devices, write unlock the target zone of
>   * completed write requests. Do this while holding the zone lock
>   * spinlock so that the zone is never unlocked while deadline_fifo_request()
> - * while deadline_next_request() are executing.
> + * while deadline_next_request() are executing. This function is called twice
> + * for requests that complete in a normal way and once for requests that time
> + * out.
>   */
>  static void dd_completed_request(struct request *rq)
>  {
> @@ -757,6 +760,7 @@ static struct elevator_type mq_deadline = {
>   .insert_requests= dd_insert_requests,
>   .dispatch_request   = dd_dispatch_request,
>   .completed_request  = dd_completed_request,
> + .finish_request = dd_completed_request,
>   .next_request   = elv_rb_latter_request,
>   .former_request = elv_rb_former_request,
>   .bio_merge  = dd_bio_merge,
> -- 
> 2.16.2
> 

-- 
Ming


[PATCH V3 8/8] scsi: megaraid: improve scsi_mq performance via .host_tagset

2018-02-27 Thread Ming Lei
It is observed on null_blk that IOPS can be improved much by simply making
the hw queues per NUMA node, so this patch applies the introduced
.host_tagset for improving performance.

In reality, .can_queue is quite big, and the NUMA node number is often
small, so each hw queue's depth should be high enough to saturate the
device.
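
To make the arithmetic in the hunk below concrete, here is a toy
computation with hypothetical numbers (the adapter queue depth and node
count are made up; DIV_ROUND_UP and min_t are re-expressed in plain C):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int can_queue = 1024;	/* hypothetical adapter queue depth */
	int nr_node_ids = 2;	/* hypothetical NUMA node count */
	int max_queues = DIV_ROUND_UP(can_queue, 256);		/* 4 */
	int nr_hw_queues = nr_node_ids < max_queues ?
				nr_node_ids : max_queues;	/* 2 */

	printf("nr_hw_queues=%d, per-queue depth=%d\n",
	       nr_hw_queues, can_queue / nr_hw_queues);		/* 2, 512 */
	return 0;
}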

Cc: Arun Easi <arun.e...@cavium.com>
Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Peter Rivera <peter.riv...@broadcom.com>
Cc: Laurence Oberman <lober...@redhat.com>
Cc: Hannes Reinecke <h...@suse.de>
Cc: Mike Snitzer <snit...@redhat.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/megaraid/megaraid_sas_base.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 065956cb2aeb..0b46f97cbfdb 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -3177,6 +3177,7 @@ static struct scsi_host_template megasas_template = {
.use_clustering = ENABLE_CLUSTERING,
.change_queue_depth = scsi_change_queue_depth,
.no_write_same = 1,
+   .host_tagset = 1,
 };
 
 /**
@@ -5947,6 +5948,8 @@ static int megasas_start_aen(struct megasas_instance *instance)
 static int megasas_io_attach(struct megasas_instance *instance)
 {
struct Scsi_Host *host = instance->host;
+   /* 256 tags should be high enough to saturate device */
+   int max_queues = DIV_ROUND_UP(host->can_queue, 256);
 
/*
 * Export parameters required by SCSI mid-layer
@@ -5987,6 +5990,9 @@ static int megasas_io_attach(struct megasas_instance *instance)
host->max_lun = MEGASAS_MAX_LUN;
host->max_cmd_len = 16;
 
+   /* per NUMA node hw queue */
+   host->nr_hw_queues = min_t(int, nr_node_ids, max_queues);
+
/*
 * Notify the mid-layer about the new controller
 */
-- 
2.9.5



[PATCH V3 7/8] scsi: hpsa: improve scsi_mq performance via .host_tagset

2018-02-27 Thread Ming Lei
It is observed that IOPS can be improved much by simply making the hw
queues per NUMA node on null_blk, so this patch applies the introduced
.host_tagset for improving performance.

In reality, .can_queue is quite big, and the NUMA node number is often
small, so each hw queue's depth should be high enough to saturate the
device.

Cc: Arun Easi <arun.e...@cavium.com>
Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Peter Rivera <peter.riv...@broadcom.com>
Cc: Laurence Oberman <lober...@redhat.com>
Cc: Hannes Reinecke <h...@suse.de>
Cc: Mike Snitzer <snit...@redhat.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/hpsa.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 3a9eca163db8..0747751b7e1c 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -978,6 +978,7 @@ static struct scsi_host_template hpsa_driver_template = {
.shost_attrs = hpsa_shost_attrs,
.max_sectors = 1024,
.no_write_same = 1,
+   .host_tagset = 1,
 };
 
 static inline u32 next_command(struct ctlr_info *h, u8 q)
@@ -5761,6 +5762,11 @@ static int hpsa_scsi_host_alloc(struct ctlr_info *h)
 static int hpsa_scsi_add_host(struct ctlr_info *h)
 {
int rv;
+   /* 256 tags should be high enough to saturate device */
+   int max_queues = DIV_ROUND_UP(h->scsi_host->can_queue, 256);
+
+   /* per NUMA node hw queue */
+   h->scsi_host->nr_hw_queues = min_t(int, nr_node_ids, max_queues);
 
rv = scsi_add_host(h->scsi_host, >pdev->dev);
if (rv) {
-- 
2.9.5



[PATCH V3 5/8] scsi: Add template flag 'host_tagset'

2018-02-27 Thread Ming Lei
From: Hannes Reinecke <h...@suse.com>

Add a host template flag 'host_tagset' to enable the use of a global
tagset for block-mq.

Cc: Hannes Reinecke <h...@suse.de>
Cc: Arun Easi <arun.e...@cavium.com>
Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Peter Rivera <peter.riv...@broadcom.com>
Cc: Mike Snitzer <snit...@redhat.com>
Signed-off-by: Hannes Reinecke <h...@suse.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/scsi/scsi_lib.c  | 2 ++
 include/scsi/scsi_host.h | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a86df9ca7d1c..8e6f118f1066 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2291,6 +2291,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
shost->tag_set.flags |=
BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
+   if (shost->hostt->host_tagset)
+   shost->tag_set.flags |= BLK_MQ_F_HOST_TAGS;
shost->tag_set.driver_data = shost;
 
return blk_mq_alloc_tag_set(>tag_set);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 1a1df0d21ee3..1b35d9cb59b3 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -457,6 +457,9 @@ struct scsi_host_template {
 */
unsigned int max_host_blocked;
 
+   /* True if the host supports a host-wide tagspace */
+   unsigned host_tagset:1;
+
/*
 * Default value for the blocking.  If the queue is empty,
 * host_blocked counts down in the request_fn until it restarts
-- 
2.9.5



[PATCH V3 6/8] block: null_blk: introduce module parameter of 'g_host_tags'

2018-02-27 Thread Ming Lei
This patch introduces the parameter 'g_host_tags' so that we can test
this feature via null_blk easily.

With host_tags enabled and the total hw queue depth kept the same, it is
observed that IOPS can be improved by ~50% on a dual-socket (16 CPU cores
in total) system:

1) no 'host_tags', each hw queue depth is 16, and 1 hw queue
modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 host_tags=0 submit_queues=1 hw_queue_depth=16

IOPS: 1382K

2) 'host_tags', each hw queue depth is 8, and 2 hw queues
modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 host_tags=1 submit_queues=2 hw_queue_depth=16

IOPS: 2124K

3) fio test done in above two settings:
fio --bs=4k --size=512G --rw=randread --norandommap --direct=1 --ioengine=libaio --iodepth=4 --runtime=$RUNTIME --group_reporting=1 --name=nullb0 --filename=/dev/nullb0 --name=nullb1 --filename=/dev/nullb1 --name=nullb2 --filename=/dev/nullb2 --name=nullb3 --filename=/dev/nullb3

Cc: Arun Easi <arun.e...@cavium.com>
Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Peter Rivera <peter.riv...@broadcom.com>
Cc: Laurence Oberman <lober...@redhat.com>
Cc: Hannes Reinecke <h...@suse.de>
Cc: Mike Snitzer <snit...@redhat.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 drivers/block/null_blk.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 287a09611c0f..51b16249028a 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -163,6 +163,10 @@ static int g_submit_queues = 1;
 module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
 MODULE_PARM_DESC(submit_queues, "Number of submission queues");
 
+static int g_host_tags = 0;
+module_param_named(host_tags, g_host_tags, int, S_IRUGO);
+MODULE_PARM_DESC(host_tags, "All submission queues share one tag set");
+
 static int g_home_node = NUMA_NO_NODE;
 module_param_named(home_node, g_home_node, int, S_IRUGO);
 MODULE_PARM_DESC(home_node, "Home node for the device");
@@ -1622,6 +1626,8 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
set->flags = BLK_MQ_F_SHOULD_MERGE;
if (g_no_sched)
set->flags |= BLK_MQ_F_NO_SCHED;
+   if (g_host_tags)
+   set->flags |= BLK_MQ_F_HOST_TAGS;
set->driver_data = NULL;
 
if ((nullb && nullb->dev->blocking) || g_blocking)
-- 
2.9.5



[PATCH V3 4/8] blk-mq: introduce BLK_MQ_F_HOST_TAGS

2018-02-27 Thread Ming Lei
This patch supports partitioning host-wide tags into multiple hw queues, so
that each hw queue's related data structures (tags, hctx) can be accessed
with NUMA locality; for example, the hw queues can be made per NUMA node.

It is observed that IOPS can be improved much this way in null_blk tests.
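
A rough sketch of the resulting layout, assuming an even split of the
host-wide tag space across hw queues (both the even split and the numbers
are illustrative assumptions; the start_tag bookkeeping itself is in the
'start_tag' patch of this series):

#include <stdio.h>

int main(void)
{
	unsigned int can_queue = 256;	/* hypothetical host-wide tag count */
	unsigned int nr_hw_queues = 2;	/* e.g. one hctx per NUMA node */
	unsigned int depth = can_queue / nr_hw_queues;
	unsigned int i;

	for (i = 0; i < nr_hw_queues; i++)
		printf("hctx %u: start_tag=%u, depth=%u, global tags %u..%u\n",
		       i, i * depth, depth, i * depth, i * depth + depth - 1);
	return 0;
}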

Cc: Hannes Reinecke <h...@suse.de>
Cc: Arun Easi <arun.e...@cavium.com>
Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Peter Rivera <peter.riv...@broadcom.com>
Cc: Mike Snitzer <snit...@redhat.com>
Cc: Laurence Oberman <lober...@redhat.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq-debugfs.c |  2 ++
 block/blk-mq-sched.c   |  2 +-
 block/blk-mq-tag.c | 10 +++---
 block/blk-mq-tag.h |  5 -
 block/blk-mq.c | 43 ++-
 block/blk-mq.h |  3 ++-
 include/linux/blk-mq.h |  2 ++
 7 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 21cbc1f071c6..56b4a572f233 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -206,6 +206,7 @@ static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_SHARED),
HCTX_FLAG_NAME(SG_MERGE),
+   HCTX_FLAG_NAME(HOST_TAGS),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
 };
@@ -434,6 +435,7 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
seq_printf(m, "active_queues=%d\n",
   atomic_read(&tags->active_queues));
+   seq_printf(m, "start_tag=%u\n", tags->start_tag);
 
seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(>bitmap_tags, m);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 25c14c58385c..d895a57f945a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -497,7 +497,7 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
int ret;
 
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
-  set->reserved_tags);
+  set->reserved_tags, 0);
if (!hctx->sched_tags)
return -ENOMEM;
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 5014d7343ea9..cc8886f82c71 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -380,9 +380,11 @@ static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
return NULL;
 }
 
-struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
+struct blk_mq_tags *blk_mq_init_tags(struct blk_mq_tag_set *set,
+unsigned int total_tags,
 unsigned int reserved_tags,
-int node, int alloc_policy)
+int node, int alloc_policy,
+unsigned int start_tag)
 {
struct blk_mq_tags *tags;
 
@@ -397,6 +399,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
+   tags->start_tag = start_tag;
 
return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
 }
@@ -438,7 +441,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > 16 * BLKDEV_MAX_RQ)
return -EINVAL;
 
-   new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0);
+   new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0,
+   tags->start_tag);
if (!new)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 1d629920db69..9cd195cb15d0 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -14,6 +14,7 @@ struct blk_mq_tags {
atomic_t active_queues;
 
unsigned int start_tag;
+   boolhost_wide;
 
struct sbitmap_queue bitmap_tags;
struct sbitmap_queue breserved_tags;
@@ -24,7 +25,9 @@ struct blk_mq_tags {
 };
 
 
-extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy);
+extern struct blk_mq_tags *blk_mq_init_tags(struct blk_mq_tag_set *set,
+   unsigned int nr_tags, unsigned int reserved_tags, int node,
+   int alloc_policy, unsigned int start_tag);
 extern void blk_mq_free_tags(struct bl

[PATCH V3 3/8] blk-mq: introduce 'start_tag' field to 'struct blk_mq_tags'

2018-02-27 Thread Ming Lei
This patch introduces a 'start_tag' field to 'struct blk_mq_tags' so that a
host-wide tagset can be supported easily in the following patches by
partitioning the host-wide tags into multiple hw queues.

No functional change.
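
A tiny worked example of the translation this field enables, with
hypothetical numbers (say the second of two hw queues owns global tags
128..255):

#include <stdio.h>

int main(void)
{
	unsigned int start_tag = 128;	/* hypothetical: hctx 1 owns 128..255 */
	unsigned int global_tag = 130;	/* value handed out by blk_mq_get_tag() */
	unsigned int local_idx = global_tag - start_tag;

	/* rqs[]/static_rqs[] stay per-hctx and are indexed locally */
	printf("global tag %u -> local index %u in the hctx with start_tag %u\n",
	       global_tag, local_idx, start_tag);
	return 0;
}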

Cc: Hannes Reinecke <h...@suse.de>
Cc: Arun Easi <arun.e...@cavium.com>
Cc: Omar Sandoval <osan...@fb.com>,
Cc: "Martin K. Petersen" <martin.peter...@oracle.com>,
Cc: James Bottomley <james.bottom...@hansenpartnership.com>,
Cc: Christoph Hellwig <h...@lst.de>,
Cc: Don Brace <don.br...@microsemi.com>
Cc: Kashyap Desai <kashyap.de...@broadcom.com>
Cc: Peter Rivera <peter.riv...@broadcom.com>
Cc: Mike Snitzer <snit...@redhat.com>
Signed-off-by: Ming Lei <ming@redhat.com>
---
 block/blk-mq-tag.c | 3 ++-
 block/blk-mq-tag.h | 6 --
 block/blk-mq.c | 7 ---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 336dde07b230..5014d7343ea9 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -179,12 +179,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
finish_wait(&ws->wait, &wait);
 
 found_tag:
-   return tag + tag_offset;
+   return tag + tag_offset + tags->start_tag;
 }
 
 void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
struct blk_mq_ctx *ctx, unsigned int tag)
 {
+   tag -= tags->start_tag;
if (!blk_mq_tag_is_reserved(tags, tag)) {
const int real_tag = tag - tags->nr_reserved_tags;
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..1d629920db69 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -13,6 +13,8 @@ struct blk_mq_tags {
 
atomic_t active_queues;
 
+   unsigned int start_tag;
+
struct sbitmap_queue bitmap_tags;
struct sbitmap_queue breserved_tags;
 
@@ -78,13 +80,13 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
unsigned int tag, struct request *rq)
 {
-   hctx->tags->rqs[tag] = rq;
+   hctx->tags->rqs[tag - hctx->tags->start_tag] = rq;
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
  unsigned int tag)
 {
-   return tag < tags->nr_reserved_tags;
+   return (tag - tags->start_tag) < tags->nr_reserved_tags;
 }
 
 #endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 357492712b0e..5ea11d087f7b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -270,7 +270,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
unsigned int tag, unsigned int op)
 {
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
-   struct request *rq = tags->static_rqs[tag];
+   struct request *rq = tags->static_rqs[tag - tags->start_tag];
req_flags_t rq_flags = 0;
 
if (data->flags & BLK_MQ_REQ_INTERNAL) {
@@ -283,7 +283,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
}
rq->tag = tag;
rq->internal_tag = -1;
-   data->hctx->tags->rqs[rq->tag] = rq;
+   data->hctx->tags->rqs[rq->tag - tags->start_tag] = rq;
}
 
/* csd/requeue_work/fifo_time is initialized before use */
@@ -801,6 +801,7 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
+   tag -= tags->start_tag;
if (tag < tags->nr_tags) {
prefetch(tags->rqs[tag]);
return tags->rqs[tag];
@@ -1076,7 +1077,7 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
rq->rq_flags |= RQF_MQ_INFLIGHT;
atomic_inc(&data.hctx->nr_active);
}
-   data.hctx->tags->rqs[rq->tag] = rq;
+   data.hctx->tags->rqs[rq->tag - data.hctx->tags->start_tag] = rq;
}
 
 done:
-- 
2.9.5


