Re: [PATCH v3 15/15] blk-mq: use hk cpus only when isolcpus=io_queue is enabled

2024-08-13 Thread Ming Lei
On Tue, Aug 06, 2024 at 02:06:47PM +0200, Daniel Wagner wrote:
> When isolcpus=io_queue is enabled all hardware queues should run on the
> housekeeping CPUs only. Thus ignore the affinity mask provided by the
> driver. Also we can't use blk_mq_map_queues because it will map all CPUs
> to first hctx unless, the CPU is the same as the hctx has the affinity
> set to, e.g. 8 CPUs with isolcpus=io_queue,2-3,6-7 config
> 
>   queue mapping for /dev/nvme0n1
> hctx0: default 2 3 4 6 7
> hctx1: default 5
> hctx2: default 0
> hctx3: default 1
> 
>   PCI name is 00:05.0: nvme0n1
> irq 57 affinity 0-1 effective 1 is_managed:0 nvme0q0
> irq 58 affinity 4 effective 4 is_managed:1 nvme0q1
> irq 59 affinity 5 effective 5 is_managed:1 nvme0q2
> irq 60 affinity 0 effective 0 is_managed:1 nvme0q3
> irq 61 affinity 1 effective 1 is_managed:1 nvme0q4
> 
> whereas with blk_mq_hk_map_queues we get
> 
>   queue mapping for /dev/nvme0n1
> hctx0: default 2 4
> hctx1: default 3 5
> hctx2: default 0 6
> hctx3: default 1 7
> 
>   PCI name is 00:05.0: nvme0n1
> irq 56 affinity 0-1 effective 1 is_managed:0 nvme0q0
> irq 61 affinity 4 effective 4 is_managed:1 nvme0q1
> irq 62 affinity 5 effective 5 is_managed:1 nvme0q2
> irq 63 affinity 0 effective 0 is_managed:1 nvme0q3
> irq 64 affinity 1 effective 1 is_managed:1 nvme0q4
> 
> Signed-off-by: Daniel Wagner 
> ---
>  block/blk-mq-cpumap.c | 56 
> +++
>  1 file changed, 56 insertions(+)
> 
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index c1277763aeeb..7e026c2ffa02 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -60,11 +60,64 @@ unsigned int blk_mq_num_online_queues(unsigned int max_queues)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
>  
> +static bool blk_mq_hk_map_queues(struct blk_mq_queue_map *qmap)
> +{
> + struct cpumask *hk_masks;
> + cpumask_var_t isol_mask;
> +
> + unsigned int queue, cpu;
> +
> + if (!housekeeping_enabled(HK_TYPE_IO_QUEUE))
> + return false;
> +
> + /* map housekeeping cpus to matching hardware context */
> + hk_masks = group_cpus_evenly(qmap->nr_queues);
> + if (!hk_masks)
> + goto fallback;
> +
> + for (queue = 0; queue < qmap->nr_queues; queue++) {
> + for_each_cpu(cpu, &hk_masks[queue])
> + qmap->mq_map[cpu] = qmap->queue_offset + queue;
> + }
> +
> + kfree(hk_masks);
> +
> + /* map isolcpus to hardware context */
> + if (!alloc_cpumask_var(&isol_mask, GFP_KERNEL))
> + goto fallback;
> +
> + queue = 0;
> + cpumask_andnot(isol_mask,
> +cpu_possible_mask,
> +housekeeping_cpumask(HK_TYPE_IO_QUEUE));
> +
> + for_each_cpu(cpu, isol_mask) {
> + qmap->mq_map[cpu] = qmap->queue_offset + queue;
> + queue = (queue + 1) % qmap->nr_queues;
> + }
> +

With patch 14 and the above change, the managed IRQ's affinity no longer
matches the blk-mq mapping.

If the last CPU in the managed IRQ's affinity goes offline, the blk-mq
mapping for that hctx may still contain online isolated CPUs, so IOs in
this hctx won't be drained by blk_mq_hctx_notify_offline() during CPU
offline, yet genirq still shuts down the managed IRQ.

So an IO hang risk is introduced here, which is likely the reason for the
hang you observed.
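
To make the failure mode concrete, here is a minimal user-space sketch of
the condition described above (illustrative only: the masks follow the
8-CPU isolcpus=io_queue,2-3,6-7 example, the helpers are stand-ins for
blk_mq_hctx_has_online_cpu() and the managed-IRQ shutdown check, and none
of this is kernel code):

/*
 * Model of the mismatch: blk-mq maps CPUs 0 and 6 to one hctx (CPU 6 is
 * isolated), while the hctx's managed IRQ is affine to CPU 0 only.
 * Not kernel code; masks are plain bitmasks.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

static const unsigned int hctx_cpumask = (1u << 0) | (1u << 6);
static const unsigned int irq_affinity = (1u << 0);

/* stand-in for blk_mq_hctx_has_online_cpu(): any CPU of the hctx online? */
static bool hctx_has_online_cpu(unsigned int online_mask)
{
        return (hctx_cpumask & online_mask) != 0;
}

/*
 * The managed-IRQ core shuts the vector down once no CPU in its affinity
 * mask remains online.
 */
static bool irq_shut_down(unsigned int online_mask)
{
        return (irq_affinity & online_mask) == 0;
}

int main(void)
{
        /* housekeeping CPU 0 goes offline, isolated CPU 6 stays online */
        unsigned int online_mask = ((1u << NR_CPUS) - 1) & ~(1u << 0);

        printf("hctx considered online: %d\n", hctx_has_online_cpu(online_mask));
        printf("managed IRQ shut down:  %d\n", irq_shut_down(online_mask));
        /*
         * Both print 1: blk-mq skips draining the hctx while genirq has
         * already shut down its interrupt, so in-flight IO cannot complete.
         */
        return 0;
}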


Thanks, 
Ming




Re: [PATCH v3 15/15] blk-mq: use hk cpus only when isolcpus=io_queue is enabled

2024-08-09 Thread Ming Lei
On Tue, Aug 06, 2024 at 02:06:47PM +0200, Daniel Wagner wrote:
> When isolcpus=io_queue is enabled all hardware queues should run on the
> housekeeping CPUs only. Thus ignore the affinity mask provided by the
> driver. Also we can't use blk_mq_map_queues because it will map all CPUs
> to first hctx unless, the CPU is the same as the hctx has the affinity
> set to, e.g. 8 CPUs with isolcpus=io_queue,2-3,6-7 config
> 
>   queue mapping for /dev/nvme0n1
> hctx0: default 2 3 4 6 7
> hctx1: default 5
> hctx2: default 0
> hctx3: default 1
> 
>   PCI name is 00:05.0: nvme0n1
> irq 57 affinity 0-1 effective 1 is_managed:0 nvme0q0
> irq 58 affinity 4 effective 4 is_managed:1 nvme0q1
> irq 59 affinity 5 effective 5 is_managed:1 nvme0q2
> irq 60 affinity 0 effective 0 is_managed:1 nvme0q3
> irq 61 affinity 1 effective 1 is_managed:1 nvme0q4
> 
> whereas with blk_mq_hk_map_queues we get
> 
>   queue mapping for /dev/nvme0n1
> hctx0: default 2 4
> hctx1: default 3 5
> hctx2: default 0 6
> hctx3: default 1 7
> 
>   PCI name is 00:05.0: nvme0n1
> irq 56 affinity 0-1 effective 1 is_managed:0 nvme0q0
> irq 61 affinity 4 effective 4 is_managed:1 nvme0q1
> irq 62 affinity 5 effective 5 is_managed:1 nvme0q2
> irq 63 affinity 0 effective 0 is_managed:1 nvme0q3
> irq 64 affinity 1 effective 1 is_managed:1 nvme0q4
> 
> Signed-off-by: Daniel Wagner 
> ---
>  block/blk-mq-cpumap.c | 56 
> +++
>  1 file changed, 56 insertions(+)
> 
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index c1277763aeeb..7e026c2ffa02 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -60,11 +60,64 @@ unsigned int blk_mq_num_online_queues(unsigned int max_queues)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
>  
> +static bool blk_mq_hk_map_queues(struct blk_mq_queue_map *qmap)
> +{
> + struct cpumask *hk_masks;
> + cpumask_var_t isol_mask;
> +
> + unsigned int queue, cpu;
> +
> + if (!housekeeping_enabled(HK_TYPE_IO_QUEUE))
> + return false;
> +
> + /* map housekeeping cpus to matching hardware context */
> + hk_masks = group_cpus_evenly(qmap->nr_queues);
> + if (!hk_masks)
> + goto fallback;
> +
> + for (queue = 0; queue < qmap->nr_queues; queue++) {
> + for_each_cpu(cpu, &hk_masks[queue])
> + qmap->mq_map[cpu] = qmap->queue_offset + queue;
> + }
> +
> + kfree(hk_masks);
> +
> + /* map isolcpus to hardware context */
> + if (!alloc_cpumask_var(&isol_mask, GFP_KERNEL))
> + goto fallback;
> +
> + queue = 0;
> + cpumask_andnot(isol_mask,
> +cpu_possible_mask,
> +housekeeping_cpumask(HK_TYPE_IO_QUEUE));
> +
> + for_each_cpu(cpu, isol_mask) {
> + qmap->mq_map[cpu] = qmap->queue_offset + queue;
> + queue = (queue + 1) % qmap->nr_queues;
> + }
> +
> + free_cpumask_var(isol_mask);
> +
> + return true;
> +
> +fallback:
> + /* map all cpus to hardware context ignoring any affinity */
> + queue = 0;
> + for_each_possible_cpu(cpu) {
> + qmap->mq_map[cpu] = qmap->queue_offset + queue;
> + queue = (queue + 1) % qmap->nr_queues;
> + }
> + return true;
> +}
> +
>  void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
>  {
>   const struct cpumask *masks;
>   unsigned int queue, cpu;
>  
> + if (blk_mq_hk_map_queues(qmap))
> + return;
> +
>   masks = group_cpus_evenly(qmap->nr_queues);
>   if (!masks) {
>   for_each_possible_cpu(cpu)
> @@ -118,6 +171,9 @@ void blk_mq_dev_map_queues(struct blk_mq_queue_map *qmap,
>   const struct cpumask *mask;
>   unsigned int queue, cpu;
>  
> + if (blk_mq_hk_map_queues(qmap))
> + return;
> +
>   for (queue = 0; queue < qmap->nr_queues; queue++) {
>   mask = get_queue_affinity(dev_data, dev_off, queue);
>   if (!mask)

From the above implementation, "isolcpus=io_queue" is really just an
optimization on top of "isolcpus=managed_irq", and there is no essential
difference between the two.

I'd suggest optimizing 'isolcpus=managed_irq' directly instead, for
example (a rough sketch of the idea follows below):

- reduce nr_queues or numgrps for group_cpus_evenly() according to the
housekeeping CPU mask

- spread the housekeeping and isolated CPU masks evenly over each queue;
the existing two-stage spread can be reused for that
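
A rough user-space sketch of that two-stage idea for the 8-CPU
isolcpus=io_queue,2-3,6-7 example (this only models the suggestion; it is
not the kernel's group_cpus_evenly()):

/*
 * Two-stage spread: assign housekeeping CPUs round-robin to the queues
 * first, then layer the isolated CPUs on top, so no queue ends up with an
 * all-isolated mask.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS   8
#define NR_QUEUES 4

int main(void)
{
        /* isolcpus=io_queue,2-3,6-7 from the example above */
        const bool isolated[NR_CPUS] = { 0, 0, 1, 1, 0, 0, 1, 1 };
        int cpu_to_queue[NR_CPUS];
        int queue = 0;

        /* stage 1: housekeeping CPUs anchor the queues */
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (isolated[cpu])
                        continue;
                cpu_to_queue[cpu] = queue;
                queue = (queue + 1) % NR_QUEUES;
        }

        /* stage 2: isolated CPUs share the already-anchored queues */
        queue = 0;
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!isolated[cpu])
                        continue;
                cpu_to_queue[cpu] = queue;
                queue = (queue + 1) % NR_QUEUES;
        }

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu %d -> queue %d%s\n", cpu, cpu_to_queue[cpu],
                       isolated[cpu] ? " (isolated)" : "");
        return 0;
}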


thanks,
Ming




Re: [PATCH v3 15/15] blk-mq: use hk cpus only when isolcpus=io_queue is enabled

2024-08-09 Thread Ming Lei
On Fri, Aug 09, 2024 at 09:22:11AM +0200, Daniel Wagner wrote:
> On Thu, Aug 08, 2024 at 01:26:41PM GMT, Ming Lei wrote:
> > Isolated CPUs are removed from queue mapping in this patchset, when someone
> > submit IOs from the isolated CPU, what is the correct hctx used for handling
> > these IOs?
> 
> No, every possible CPU gets a mapping. What this patch series does, is
> to limit/aligns the number of hardware context to the number of
> housekeeping CPUs. There is still a complete ctx-hctx mapping. So

OK, then I guess patches 1~7 aren't really needed for this series, since
you just want to reduce nr_hw_queues and spread the housekeeping CPUs
first so that no queue ends up with an all-isolated CPU mask.

> whenever an user thread on an isolated CPU is issuing an IO a
> housekeeping CPU will also be involved (with the additional overhead,
> which seems to be okay for these users).
> 
> Without hardware queue on the isolated CPUs ensures we really never get
> any unexpected IO on those CPUs unless userspace does it own its own.
> It's a safety net.
> 
> Just to illustrate it, the non isolcpus configuration (default) map
> for an 8 CPU setup:
> 
> queue mapping for /dev/vda
> hctx0: default 0
> hctx1: default 1
> hctx2: default 2
> hctx3: default 3
> hctx4: default 4
> hctx5: default 5
> hctx6: default 6
> hctx7: default 7
> 
> and with isolcpus=io_queue,2-3,6-7
> 
> queue mapping for /dev/vda
> hctx0: default 0 2
> hctx1: default 1 3
> hctx2: default 4 6
> hctx3: default 5 7

OK, it looks like I missed the point in patch 15, in which you add the
isolated CPUs into the mapping manually. I'm just wondering why not use the
current two-stage policy to cover both housekeeping and isolated CPUs in
group_cpus_evenly()?

For example, spread the housekeeping CPUs first, then the isolated CPUs,
just like what we do for present & non-present CPUs.

Then the whole patchset could be simplified a lot.

> 
> > From current implementation, it depends on implied zero filled
> > tag_set->map[type].mq_map[isolated_cpu], so hctx 0 is used.
> > 
> > During CPU offline, in blk_mq_hctx_notify_offline(),
> > blk_mq_hctx_has_online_cpu() returns true even though the last cpu in
> > hctx 0 is offline because isolated cpus join hctx 0 unexpectedly, so IOs in
> > hctx 0 won't be drained.
> > 
> > However managed irq core code still shutdowns the hw queue's irq because all
> > CPUs in this hctx are offline now. Then IO hang is triggered, isn't
> > it?
> 
> Thanks for the explanation. I was able to reproduce this scenario, that
> is a hardware context with two CPUs which go offline. Initially, I used
> fio for creating the workload but this never hit the hanger. Instead
> some background workload from systemd-journald is pretty reliable to
> trigger the hanger you describe.
> 
> Example:
> 
>   hctx2: default 4 6
> 
> CPU 0 stays online, CPU 1-5 are offline. CPU 6 is offlined:
> 
>   smpboot: CPU 5 is now offline
>   blk_mq_hctx_has_online_cpu:3537 hctx3 offline
>   blk_mq_hctx_has_online_cpu:3537 hctx2 offline
> 
> and there is no forward progress anymore, the cpuhotplug state machine
> is blocked and an IO is hanging:
> 
>   # grep busy /sys/kernel/debug/block/*/hctx*/tags | grep -v busy=0
>   /sys/kernel/debug/block/vda/hctx2/tags:busy=61
> 
> and blk_mq_hctx_notify_offline busy loops forever:
> 
>task:cpuhp/6 state:D stack:0 pid:439   tgid:439   ppid:2  
> flags:0x4000
>Call Trace:
> 
> __schedule+0x79d/0x15c0
> ? lockdep_hardirqs_on_prepare+0x152/0x210
> ? kvm_sched_clock_read+0xd/0x20
> ? local_clock_noinstr+0x28/0xb0
> ? local_clock+0x11/0x30
> ? lock_release+0x122/0x4a0
> schedule+0x3d/0xb0
> schedule_timeout+0x88/0xf0
> ? __pfx_process_timeout+0x10/0x10d
> msleep+0x28/0x40
> blk_mq_hctx_notify_offline+0x1b5/0x200
> ? cpuhp_thread_fun+0x41/0x1f0
> cpuhp_invoke_callback+0x27e/0x780
> ? __pfx_blk_mq_hctx_notify_offline+0x10/0x10
> ? cpuhp_thread_fun+0x42/0x1f0
> cpuhp_thread_fun+0x178/0x1f0
> smpboot_thread_fn+0x12e/0x1c0
> ? __pfx_smpboot_thread_fn+0x10/0x10
> kthread+0xe8/0x110
> ? __pfx_kthread+0x10/0x10
> ret_from_fork+0x33/0x40
> ? __pfx_kthread+0x10/0x10
> ret_from_fork_asm+0x1a/0x30
> 
> 
> I don't think this is a new problem this code introduces. This problem
> exists for any hardware context which has more than one CPU. As far I
> understand it, the problem is that there is no forward progress possible
> for the IO itself (I assume the corresp

Re: [PATCH v3 15/15] blk-mq: use hk cpus only when isolcpus=io_queue is enabled

2024-08-07 Thread Ming Lei
On Wed, Aug 07, 2024 at 02:40:11PM +0200, Daniel Wagner wrote:
> On Tue, Aug 06, 2024 at 10:55:09PM GMT, Ming Lei wrote:
> > On Tue, Aug 06, 2024 at 02:06:47PM +0200, Daniel Wagner wrote:
> > > When isolcpus=io_queue is enabled all hardware queues should run on the
> > > housekeeping CPUs only. Thus ignore the affinity mask provided by the
> > > driver. Also we can't use blk_mq_map_queues because it will map all CPUs
> > > to first hctx unless, the CPU is the same as the hctx has the affinity
> > > set to, e.g. 8 CPUs with isolcpus=io_queue,2-3,6-7 config
> > 
> > What is the expected behavior if someone still tries to submit IO on 
> > isolated
> > CPUs?
> 
> If a user thread is issuing an IO the IO is handled by the housekeeping
> CPU, which will cause some noise on the submitting CPU. As far I was
> told this is acceptable. Our customers really don't want to have any
> IO not from their application ever hitting the isolcpus. When their
> application is issuing an IO.
> 
> > BTW, I don't see any change in blk_mq_get_ctx()/blk_mq_map_queue() in this
> > patchset,
> 
> I was trying to figure out what you tried to explain last time with
> hangs, but didn't really understand what the conditions are for this
> problem to occur.

Isolated CPUs are removed from the queue mapping in this patchset. When
someone submits IOs from an isolated CPU, which hctx is used for handling
these IOs?

From the current implementation, it depends on the implied zero-filled
tag_set->map[type].mq_map[isolated_cpu], so hctx 0 is used.

During CPU offline, in blk_mq_hctx_notify_offline(),
blk_mq_hctx_has_online_cpu() returns true even though the last CPU mapped
to hctx 0 is offline, because isolated CPUs join hctx 0 unexpectedly, so IOs
in hctx 0 won't be drained.

However, the managed IRQ core code still shuts down the hw queue's IRQ
because all CPUs in its affinity are offline now. Then an IO hang is
triggered, isn't it?
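
A small user-space model of that fallback (illustrative only; the real
lookup is blk_mq_map_queue() on tag_set->map[type].mq_map[cpu], and the CPU
layout follows the 8-CPU example above):

/*
 * mq_map[] is allocated zeroed, so any CPU the mapping code skips silently
 * resolves to hctx 0 at submit time.
 */
#include <stdio.h>

#define NR_CPUS   8
#define NR_QUEUES 4

static unsigned int mq_map[NR_CPUS];    /* zero-filled, like kcalloc() */

static void map_housekeeping_only(void)
{
        /*
         * Only housekeeping CPUs 0, 1, 4, 5 get an explicit mapping;
         * isolated CPUs 2, 3, 6, 7 are skipped and keep the value 0.
         */
        const int hk[] = { 0, 1, 4, 5 };

        for (unsigned int q = 0; q < NR_QUEUES; q++)
                mq_map[hk[q]] = q;
}

int main(void)
{
        map_housekeeping_only();

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("IO submitted on CPU %d -> hctx %u\n", cpu, mq_map[cpu]);
        /*
         * Isolated CPUs 2, 3, 6, 7 all print hctx 0, even though hctx 0's
         * cpumask and managed IRQ affinity know nothing about them.
         */
        return 0;
}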

The current blk-mq uses a static & global queue/CPU mapping in which all
CPUs are covered. This patchset removes isolated CPUs from the mapping,
which is a big change from the viewpoint of blk-mq queue mapping.

> 
> > that means one random hctx(or even NULL) may be used for submitting
> > IO from isolated CPUs,
> > then there can be io hang risk during cpu hotplug, or
> > kernel panic when submitting bio.
> 
> Can you elaborate a bit more? I must miss something important here.
> 
> Anyway, my understanding is that when the last CPU of a hctx goes
> offline the affinity is broken and assigned to an online HK CPU. And we
> ensure all flight IO have finished and also ensure we don't submit any
> new IO to a CPU which goes offline.
> 
> FWIW, I tried really hard to get an IO hang with cpu hotplug.
 
Please see above.


thanks,
Ming




Re: [PATCH v3 15/15] blk-mq: use hk cpus only when isolcpus=io_queue is enabled

2024-08-06 Thread Ming Lei
On Tue, Aug 06, 2024 at 02:06:47PM +0200, Daniel Wagner wrote:
> When isolcpus=io_queue is enabled all hardware queues should run on the
> housekeeping CPUs only. Thus ignore the affinity mask provided by the
> driver. Also we can't use blk_mq_map_queues because it will map all CPUs
> to first hctx unless, the CPU is the same as the hctx has the affinity
> set to, e.g. 8 CPUs with isolcpus=io_queue,2-3,6-7 config

What is the expected behavior if someone still tries to submit IO on isolated
CPUs?

BTW, I don't see any change to blk_mq_get_ctx()/blk_mq_map_queue() in this
patchset. That means a random hctx (or even NULL) may be used for submitting
IO from isolated CPUs, so there is an IO hang risk during CPU hotplug, or a
kernel panic when submitting a bio.

Thanks,
Ming




Re: [PATCH v3 14/15] lib/group_cpus.c: honor housekeeping config when grouping CPUs

2024-08-06 Thread Ming Lei
On Tue, Aug 06, 2024 at 02:06:46PM +0200, Daniel Wagner wrote:
> group_cpus_evenly distributes all present CPUs into groups. This ignores
> the isolcpus configuration and assigns isolated CPUs into the groups.
> 
> Make group_cpus_evenly aware of isolcpus configuration and use the
> housekeeping CPU mask as base for distributing the available CPUs into
> groups.
> 
> Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")

This patch doesn't fix anything in commit 11ea68f553e2, so please remove the
above Fixes tag.


Thanks,
Ming




Re: [PATCH] virtio_blk: Fix device surprise removal

2024-02-18 Thread Ming Lei
On Sat, Feb 17, 2024 at 08:08:48PM +0200, Parav Pandit wrote:
> When the PCI device is surprise removed, requests won't complete from
> the device. These IOs are never completed and disk deletion hangs
> indefinitely.
> 
> Fix it by aborting the IOs which the device will never complete
> when the VQ is broken.
> 
> With this fix now fio completes swiftly.
> An alternative of IO timeout has been considered, however
> when the driver knows about unresponsive block device, swiftly clearing
> them enables users and upper layers to react quickly.
> 
> Verified with multiple device unplug cycles with pending IOs in virtio
> used ring and some pending with device.
> 
> In future instead of VQ broken, a more elegant method can be used. At the
> moment the patch is kept to its minimal changes given its urgency to fix
> broken kernels.
> 
> Fixes: 43bb40c5b926 ("virtio_pci: Support surprise removal of virtio pci device")
> Cc: sta...@vger.kernel.org
> Reported-by: lirongq...@baidu.com
> Closes: 
> https://lore.kernel.org/virtualization/c45dd68698cd47238c55fb73ca9b4...@baidu.com/
> Co-developed-by: Chaitanya Kulkarni 
> Signed-off-by: Chaitanya Kulkarni 
> Signed-off-by: Parav Pandit 
> ---
>  drivers/block/virtio_blk.c | 54 ++
>  1 file changed, 54 insertions(+)
> 
> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> index 2bf14a0e2815..59b49899b229 100644
> --- a/drivers/block/virtio_blk.c
> +++ b/drivers/block/virtio_blk.c
> @@ -1562,10 +1562,64 @@ static int virtblk_probe(struct virtio_device *vdev)
>   return err;
>  }
>  
> +static bool virtblk_cancel_request(struct request *rq, void *data)
> +{
> + struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
> +
> + vbr->in_hdr.status = VIRTIO_BLK_S_IOERR;
> + if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq))
> + blk_mq_complete_request(rq);
> +
> + return true;
> +}
> +
> +static void virtblk_cleanup_reqs(struct virtio_blk *vblk)
> +{
> + struct virtio_blk_vq *blk_vq;
> + struct request_queue *q;
> + struct virtqueue *vq;
> + unsigned long flags;
> + int i;
> +
> + vq = vblk->vqs[0].vq;
> + if (!virtqueue_is_broken(vq))
> + return;
> +

What if the surprise removal happens after the above check?


Thanks,
Ming




Re: atomic queue limits updates v3

2024-01-31 Thread Ming Lei
On Wed, Jan 31, 2024 at 02:03:46PM +0100, Christoph Hellwig wrote:
> Hi Jens,
> 
> currently queue limits updates are a mess in that they are updated one
> limit at a time, which makes both cross-checking them against other
> limits hard, and also makes it hard to provide atomicy.
> 
> This series tries to change this by updating the whole set of queue
> limits atomically.   This in done in two ways:
> 
>  - for the initial setup the queue_limits structure is simply passed to
>the queue/disk allocation helpers and applies there after validation.
>  - for the (relatively few) cases that update limits at runtime a pair
>of helpers to take a snapshot of the current limits and to commit it
>after picking up the callers changes are provided.
> 
> As the series is big enough it only converts two drivers - virtio_blk as
> a heavily used driver in virtualized setups, and loop as one that actually
> does runtime updates while being fairly simple.  I plan to update most
> drivers for this merge window, although SCSI will probably have to wait
> for the next one given that it will need extensive API changes in the
> LLDD and ULD interfaces.
> 
> Changes since v2:
>  - fix the physical block size default
>  - use PAGE_SECTORS_SHIFT more 
> 
> Changes since v1:
>  - remove a spurious NULL return in blk_alloc_queue
>  - keep the existing max_discard_sectors == 0 behavior
>  - drop the patch nvme discard limit update hack - it will go into
>the series updating nvme instead
>  - drop a chunk_sector check
>  - use PAGE_SECTORS in a few places
>  - document the checks and defaults in blk_validate_limits
>  - various spelling fixes

For the whole series:

Reviewed-by: Ming Lei 

Thanks,
Ming




Re: [Report] requests are submitted to hardware in reverse order from nvme/virtio-blk queue_rqs()

2024-01-24 Thread Ming Lei
On Thu, Jan 25, 2024 at 07:32:37AM +0900, Damien Le Moal wrote:
> On 1/25/24 00:41, Keith Busch wrote:
> > On Wed, Jan 24, 2024 at 07:59:54PM +0800, Ming Lei wrote:
> >> Requests are added to plug list in reverse order, and both virtio-blk
> >> and nvme retrieves request from plug list in order, so finally requests
> >> are submitted to hardware in reverse order via nvme_queue_rqs() or
> >> virtio_queue_rqs, see:
> >>
> >>io_uring   submit_bio  vdb  6302096 4096
> >>io_uring   submit_bio  vdb 12235072 4096
> >>io_uring   submit_bio  vdb  7682280 4096
> >>io_uring   submit_bio  vdb 11912464 4096
> >>io_uring virtio_queue_rqs  vdb 11912464 4096
> >>io_uring virtio_queue_rqs  vdb  7682280 4096
> >>io_uring virtio_queue_rqs  vdb 12235072 4096
> >>io_uring virtio_queue_rqs  vdb  6302096 4096
> >>
> >>
> >> May this reorder be one problem for virtio-blk and nvme-pci?
> > 
> > For nvme, it depends. Usually it's probably not a problem, though some
> > pci ssd's have optimizations for sequential IO that might not work if
> > these get reordered.
> 
> ZNS and zoned virtio-blk drives... Cannot use io_uring at the moment. But I do
> not think we reliably can anyway, unless the issuer is CPU/ring aware and
> always issues writes to a zone using the same ring.

It isn't related to io_uring.

What matters is the combination of plugging, the "none" scheduler, and
queue_rqs(). If "none" is used, all IOs in a single batch are added to the
plug list and then dispatched to hardware in reverse order via queue_rqs().
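
A minimal sketch of that effect (illustrative only, not the kernel's
request/plug code): requests are pushed onto the plug list head first, and
the driver walks the list from the head in queue_rqs(), so the batch reaches
the hardware in reverse submission order.

/*
 * Model of the reordering: the plug list is a singly linked list where new
 * requests are added at the head (LIFO), and queue_rqs() consumes it from
 * the head, so the driver sees the batch reversed.
 */
#include <stdio.h>
#include <stdlib.h>

struct req {
        long sector;
        struct req *next;
};

static struct req *plug_list;

static void plug_add(long sector)
{
        struct req *rq = malloc(sizeof(*rq));

        rq->sector = sector;
        rq->next = plug_list;   /* insert at head (LIFO) */
        plug_list = rq;
}

static void queue_rqs(void)
{
        /* driver consumes the list from the head onwards */
        for (struct req *rq = plug_list; rq; rq = rq->next)
                printf("dispatch sector %ld\n", rq->sector);
}

int main(void)
{
        /* submission order from the trace above */
        plug_add(6302096);
        plug_add(12235072);
        plug_add(7682280);
        plug_add(11912464);

        queue_rqs();    /* prints 11912464, 7682280, 12235072, 6302096 */
        return 0;
}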

Thanks,
Ming




[Report] requests are submitted to hardware in reverse order from nvme/virtio-blk queue_rqs()

2024-01-24 Thread Ming Lei
Hello,

Requests are added to the plug list in reverse order, and both virtio-blk
and nvme retrieve requests from the plug list in order, so requests end up
being submitted to hardware in reverse order via nvme_queue_rqs() or
virtio_queue_rqs(), see:

io_uring   submit_bio  vdb  6302096 4096
io_uring   submit_bio  vdb 12235072 4096
io_uring   submit_bio  vdb  7682280 4096
io_uring   submit_bio  vdb 11912464 4096
io_uring virtio_queue_rqs  vdb 11912464 4096
io_uring virtio_queue_rqs  vdb  7682280 4096
io_uring virtio_queue_rqs  vdb 12235072 4096
io_uring virtio_queue_rqs  vdb  6302096 4096


Could this reordering be a problem for virtio-blk and nvme-pci?


Thanks,
Ming




Re: [PATCH] virtio_blk: set the default scheduler to none

2023-12-07 Thread Ming Lei
On Thu, Dec 07, 2023 at 07:44:37PM -0700, Keith Busch wrote:
> On Fri, Dec 08, 2023 at 10:00:36AM +0800, Ming Lei wrote:
> > On Thu, Dec 07, 2023 at 12:31:05PM +0800, Li Feng wrote:
> > > virtio-blk is generally used in cloud computing scenarios, where the
> > > performance of virtual disks is very important. The mq-deadline scheduler
> > > has a big performance drop compared to none with single queue. In my 
> > > tests,
> > > mq-deadline 4k readread iops were 270k compared to 450k for none. So here
> > > the default scheduler of virtio-blk is set to "none".
> > 
> > The test result shows you may not have tested HDD backing of virtio-blk.
> > 
> > "none" loses some IO merge capability, so sequential IO performance
> > probably drops in the case of HDD backing.
> 
> More of a curiosity, as I don't immediately even have an HDD to test
> with! Isn't it more useful for the host providing the backing HDD use an
> appropriate IO scheduler? virtio-blk has similiarities with a stacking
> block driver, and we usually don't need to stack IO schedulers.

dm-rq actually uses an IO scheduler at the higher layer, and early merge has
some benefits:

1) virtio-blk in-flight requests are reduced, so there is less chance of
throttling inside the VM; meanwhile fewer (bigger) IOs are handled by QEMU
and submitted to the host-side queue.

2) early merge in the VM is cheaper than on the host side, since more block
IOs originating from different virtio-blk/scsi devices can arrive at the
same time and all images may be stored on a single disk; these IOs then
become interleaved in the host-side queue, so sequential IO may become
random or hard to merge.

As Jens mentioned, it needs actual testing.


Thanks,
Ming




Re: [PATCH] virtio_blk: set the default scheduler to none

2023-12-07 Thread Ming Lei
On Thu, Dec 07, 2023 at 12:31:05PM +0800, Li Feng wrote:
> virtio-blk is generally used in cloud computing scenarios, where the
> performance of virtual disks is very important. The mq-deadline scheduler
> has a big performance drop compared to none with single queue. In my tests,
> mq-deadline 4k readread iops were 270k compared to 450k for none. So here
> the default scheduler of virtio-blk is set to "none".

The test result shows you may not have tested HDD backing of virtio-blk.

"none" loses some IO merge capability, so sequential IO performance probably
drops in the case of HDD backing.

Thanks,
Ming




Re: [PATCH V4 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-22 Thread Ming Lei
On Tue, Oct 22, 2019 at 12:19:17PM +0100, John Garry wrote:
> On 22/10/2019 01:16, Ming Lei wrote:
> > On Mon, Oct 21, 2019 at 03:02:56PM +0100, John Garry wrote:
> > > On 21/10/2019 13:53, Ming Lei wrote:
> > > > On Mon, Oct 21, 2019 at 12:49:53PM +0100, John Garry wrote:
> > > > > > > > 
> > > > > > > 
> > > > > > > Yes, we share tags among all queues, but we generate the tag - 
> > > > > > > known as IPTT
> > > > > > > - in the LLDD now, as we can no longer use the request tag (as it 
> > > > > > > is not
> > > > > > > unique per all queues):
> > > > > > > 
> > > > > > > https://github.com/hisilicon/kernel-dev/commit/087b95af374be6965583c1673032fb33bc8127e8#diff-f5d8fff19bc539a7387af5230d4e5771R188
> > > > > > > 
> > > > > > > As I said, the branch is messy and I did have to fix 087b95af374.
> > > > > > 
> > > > > > Firstly this way may waste lots of memory, especially the queue 
> > > > > > depth is
> > > > > > big, such as, hisilicon V3's queue depth is 4096.
> > > > > > 
> > > > > > Secondly, you have to deal with queue busy efficiently and 
> > > > > > correctly,
> > > > > > for example, your real hw tags(IPTT) can be used up easily, and how
> > > > > > will you handle these dispatched request?
> > > > > 
> > > > > I have not seen scenario of exhausted IPTT. And IPTT count is same as 
> > > > > SCSI
> > > > > host.can_queue, so SCSI midlayer should ensure that this does not 
> > > > > occur.
> > > > 
> > > 
> > > Hi Ming,
> 
> Hi Ming,
> 
> > > 
> > > > That check isn't correct, and each hw queue should have allowed
> > > > .can_queue in-flight requests.
> > > 
> > > There always seems to be some confusion or disagreement on this topic.
> > > 
> > > I work according to the comment in scsi_host.h:
> > > 
> > > "Note: it is assumed that each hardware queue has a queue depth of
> > >   can_queue. In other words, the total queue depth per host
> > >   is nr_hw_queues * can_queue."
> > > 
> > > So I set Scsi_host.can_queue = HISI_SAS_MAX_COMMANDS (=4096)
> > 
> > I believe all current drivers set .can_queue as single hw queue's depth.
> > If you set .can_queue as HISI_SAS_MAX_COMMANDS which is HBA's queue
> > depth, the hisilicon sas driver will HISI_SAS_MAX_COMMANDS * nr_hw_queues
> > in-flight requests.
> 
> Yeah, but the SCSI host should still limit max IOs over all queues to
> .can_queue:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/scsi/scsi_mid_low_api.txt#n1083
> 

That limit is actually from the legacy single-queue era; you can see that
I am removing it:

https://lore.kernel.org/linux-scsi/20191009093241.21481-2-ming@redhat.com/

With this change, IOPS can be improved a lot on some fast SCSI storage.


Thanks,
Ming



Re: [PATCH 2/4] block: Fix a race between blk_poll() and blk_mq_update_nr_hw_queues()

2019-10-22 Thread Ming Lei
On Mon, Oct 21, 2019 at 03:42:57PM -0700, Bart Van Assche wrote:
> If blk_poll() is called if no requests are in progress, it may happen that
> blk_mq_update_nr_hw_queues() modifies the data structures used by blk_poll(),
> e.g. q->queue_hw_ctx[]. Fix this race by serializing blk_poll() against
> blk_mq_update_nr_hw_queues().
> 
> Cc: Christoph Hellwig 
> Cc: Ming Lei 
> Cc: Hannes Reinecke 
> Cc: Johannes Thumshirn 
> Signed-off-by: Bart Van Assche 
> ---
>  block/blk-mq.c | 38 +-
>  1 file changed, 25 insertions(+), 13 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 7528678ef41f..ea64d951f411 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3439,19 +3439,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
>   return blk_mq_poll_hybrid_sleep(q, hctx, rq);
>  }
>  
> -/**
> - * blk_poll - poll for IO completions
> - * @q:  the queue
> - * @cookie: cookie passed back at IO submission time
> - * @spin: whether to spin for completions
> - *
> - * Description:
> - *Poll for completions on the passed in queue. Returns number of
> - *completed entries found. If @spin is true, then blk_poll will continue
> - *looping until at least one completion is found, unless the task is
> - *otherwise marked running (or we need to reschedule).
> - */
> -int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
> +static int __blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
>  {
>   struct blk_mq_hw_ctx *hctx;
>   long state;
> @@ -3503,6 +3491,30 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
>   __set_current_state(TASK_RUNNING);
>   return 0;
>  }
> +
> +/**
> + * blk_poll - poll for IO completions
> + * @q:  the queue
> + * @cookie: cookie passed back at IO submission time
> + * @spin: whether to spin for completions
> + *
> + * Description:
> + *Poll for completions on the passed in queue. Returns number of
> + *completed entries found. If @spin is true, then blk_poll will continue
> + *looping until at least one completion is found, unless the task is
> + *otherwise marked running (or we need to reschedule).
> + */
> +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
> +{
> + int ret;
> +
> + if (!percpu_ref_tryget(&q->q_usage_counter))
> + return 0;
> + ret = __blk_poll(q, cookie, spin);
> + blk_queue_exit(q);
> +
> + return ret;
> +}

IMO, this change isn't required. The caller of blk_poll() is supposed to
hold a refcount on the request queue, so the related hctx data structures
won't go away. When the hctx is in a transient state, there can't be any IO
to be polled, and it is safe to call into the IO path.

BTW, .poll is very much the fast path; we should be careful about adding
code to it.

Thanks,
Ming



Re: [PATCH 1/4] block: Remove the synchronize_rcu() call from __blk_mq_update_nr_hw_queues()

2019-10-22 Thread Ming Lei
On Mon, Oct 21, 2019 at 03:42:56PM -0700, Bart Van Assche wrote:
> Since the blk_mq_{,un}freeze_queue() calls in __blk_mq_update_nr_hw_queues()
> already serialize __blk_mq_update_nr_hw_queues() against
> blk_mq_queue_tag_busy_iter(), the synchronize_rcu() call in
> __blk_mq_update_nr_hw_queues() is not necessary. Hence remove it.
> 
> Note: the synchronize_rcu() call in __blk_mq_update_nr_hw_queues() was
> introduced by commit f5e4d635 ("blk-mq: sync the update nr_hw_queues with
> blk_mq_queue_tag_busy_iter"). Commit 530ca2c9bd69 ("blk-mq: Allow blocking
> queue tag iter callbacks") removed the rcu_read_{,un}lock() calls that
> correspond to the synchronize_rcu() call in __blk_mq_update_nr_hw_queues().
> 
> Cc: Christoph Hellwig 
> Cc: Ming Lei 
> Cc: Hannes Reinecke 
> Cc: Johannes Thumshirn 
> Signed-off-by: Bart Van Assche 
> ---
>  block/blk-mq.c | 4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 8538dc415499..7528678ef41f 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3242,10 +3242,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  
>   list_for_each_entry(q, &set->tag_list, tag_set_list)
>   blk_mq_freeze_queue(q);
> - /*
> -  * Sync with blk_mq_queue_tag_busy_iter.
> -  */
> - synchronize_rcu();
>   /*
>* Switch IO scheduler to 'none', cleaning up the data associated
>* with the previous scheduler. We will switch back once we are done
> -- 
> 2.23.0.866.gb869b98d4c-goog
> 

Reviewed-by: Ming Lei 

-- 
Ming



Re: [PATCH V4 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-21 Thread Ming Lei
On Mon, Oct 21, 2019 at 03:02:56PM +0100, John Garry wrote:
> On 21/10/2019 13:53, Ming Lei wrote:
> > On Mon, Oct 21, 2019 at 12:49:53PM +0100, John Garry wrote:
> > > > > > 
> > > > > 
> > > > > Yes, we share tags among all queues, but we generate the tag - known 
> > > > > as IPTT
> > > > > - in the LLDD now, as we can no longer use the request tag (as it is 
> > > > > not
> > > > > unique per all queues):
> > > > > 
> > > > > https://github.com/hisilicon/kernel-dev/commit/087b95af374be6965583c1673032fb33bc8127e8#diff-f5d8fff19bc539a7387af5230d4e5771R188
> > > > > 
> > > > > As I said, the branch is messy and I did have to fix 087b95af374.
> > > > 
> > > > Firstly this way may waste lots of memory, especially the queue depth is
> > > > big, such as, hisilicon V3's queue depth is 4096.
> > > > 
> > > > Secondly, you have to deal with queue busy efficiently and correctly,
> > > > for example, your real hw tags(IPTT) can be used up easily, and how
> > > > will you handle these dispatched request?
> > > 
> > > I have not seen scenario of exhausted IPTT. And IPTT count is same as SCSI
> > > host.can_queue, so SCSI midlayer should ensure that this does not occur.
> > 
> 
> Hi Ming,
> 
> > That check isn't correct, and each hw queue should have allowed
> > .can_queue in-flight requests.
> 
> There always seems to be some confusion or disagreement on this topic.
> 
> I work according to the comment in scsi_host.h:
> 
> "Note: it is assumed that each hardware queue has a queue depth of
>  can_queue. In other words, the total queue depth per host
>  is nr_hw_queues * can_queue."
> 
> So I set Scsi_host.can_queue = HISI_SAS_MAX_COMMANDS (=4096)

I believe all current drivers set .can_queue to a single hw queue's depth.
If you set .can_queue to HISI_SAS_MAX_COMMANDS, which is the HBA's queue
depth, the hisilicon sas driver will allow HISI_SAS_MAX_COMMANDS *
nr_hw_queues in-flight requests.
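
The arithmetic behind that objection, as a sketch (the queue count below is
an assumed example; the IPTT pool size is HISI_SAS_MAX_COMMANDS from the
discussion, and this is plain arithmetic rather than driver code):

/*
 * blk-mq sizes tags per hw queue from .can_queue, so setting .can_queue to
 * the whole HBA depth while exposing multiple hw queues over-commits the
 * shared IPTT pool.
 */
#include <stdio.h>

int main(void)
{
        const unsigned int iptt_pool    = 4096;  /* HISI_SAS_MAX_COMMANDS     */
        const unsigned int can_queue    = 4096;  /* set to the full HBA depth */
        const unsigned int nr_hw_queues = 16;    /* assumed example count     */

        unsigned int max_inflight = can_queue * nr_hw_queues;

        printf("blk-mq may have up to %u requests in flight\n", max_inflight);
        printf("hardware IPTT pool holds %u\n", iptt_pool);
        printf("over-commit factor: %ux\n", max_inflight / iptt_pool);
        return 0;
}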

> 
> > 
> > > 
> > > > 
> > > > Finally, you have to evaluate the performance effect, this is highly
> > > > related with how to deal with out-of-IPTT.
> > > 
> > > Some figures from our previous testing:
> > > 
> > > Managed interrupt without exposing multiple queues: 3M IOPs
> > > Managed interrupt with exposing multiple queues: 2.6M IOPs
> > 
> > Then you see the performance regression.
> 
> Let's discuss this when I send the patches, so we don't get sidetracked on
> this blk-mq improvement topic.

OK, what I meant is to use the correct driver to test the patches; otherwise
it might be hard to investigate.


Thanks,
Ming



Re: [PATCH V4 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-21 Thread Ming Lei
On Mon, Oct 21, 2019 at 12:49:53PM +0100, John Garry wrote:
> > > > 
> > > 
> > > Yes, we share tags among all queues, but we generate the tag - known as 
> > > IPTT
> > > - in the LLDD now, as we can no longer use the request tag (as it is not
> > > unique per all queues):
> > > 
> > > https://github.com/hisilicon/kernel-dev/commit/087b95af374be6965583c1673032fb33bc8127e8#diff-f5d8fff19bc539a7387af5230d4e5771R188
> > > 
> > > As I said, the branch is messy and I did have to fix 087b95af374.
> > 
> > Firstly this way may waste lots of memory, especially the queue depth is
> > big, such as, hisilicon V3's queue depth is 4096.
> > 
> > Secondly, you have to deal with queue busy efficiently and correctly,
> > for example, your real hw tags(IPTT) can be used up easily, and how
> > will you handle these dispatched request?
> 
> I have not seen scenario of exhausted IPTT. And IPTT count is same as SCSI
> host.can_queue, so SCSI midlayer should ensure that this does not occur.

That check isn't correct; each hw queue is allowed to have .can_queue
in-flight requests.

> 
> > 
> > Finally, you have to evaluate the performance effect, this is highly
> > related with how to deal with out-of-IPTT.
> 
> Some figures from our previous testing:
> 
> Managed interrupt without exposing multiple queues: 3M IOPs
> Managed interrupt with exposing multiple queues: 2.6M IOPs

Then you see the performance regression.


Thanks,
Ming



Re: [PATCH V4 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-21 Thread Ming Lei
On Mon, Oct 21, 2019 at 10:47:05AM +0100, John Garry wrote:
> On 21/10/2019 10:34, Ming Lei wrote:
> > On Mon, Oct 21, 2019 at 10:19:18AM +0100, John Garry wrote:
> > > On 20/10/2019 11:14, Ming Lei wrote:
> > > > > > ght? If so, I need to find some simple sysfs entry to
> > > > > > > > tell me of this occurrence, to trigger the capture. Or add 
> > > > > > > > something. My
> > > > > > > > script is pretty dump.
> > > > > > > > 
> > > > > > > > BTW, I did notice that we the dump_stack in 
> > > > > > > > __blk_mq_run_hw_queue()
> > > > > > > > pretty soon before the problem happens - maybe a clue or maybe 
> > > > > > > > coincidence.
> > > > > > > > 
> > > > > > 
> > > > > > I finally got to capture that debugfs dump at the point the SCSI IOs
> > > > > > timeout, as attached. Let me know if any problem receiving it.
> > > > > > 
> > > > > > Here's a kernel log snippet at that point (I added some print for 
> > > > > > the
> > > > > > timeout):
> > > > > > 
> > > > > > 609] psci: CPU6 killed.
> > > > > > [  547.722217] CPU5: shutdown
> > > > > > [  547.724926] psci: CPU5 killed.
> > > > > > [  547.749951] irq_shutdown
> > > > > > [  547.752701] IRQ 800: no longer affine to CPU4
> > > > > > [  547.757265] CPU4: shutdown
> > > > > > [  547.759971] psci: CPU4 killed.
> > > > > > [  547.790348] CPU3: shutdown
> > > > > > [  547.793052] psci: CPU3 killed.
> > > > > > [  547.818330] CPU2: shutdown
> > > > > > [  547.821033] psci: CPU2 killed.
> > > > > > [  547.854285] CPU1: shutdown
> > > > > > [  547.856989] psci: CPU1 killed.
> > > > > > [  575.925307] scsi_timeout req=0x0023b0dd9c00 reserved=0
> > > > > > [  575.930794] scsi_timeout req=0x0023b0df2700 reserved=0
> > > > > From the debugfs log, 66 requests are dumped, and 63 of them has
> > > > been submitted to device, and the other 3 is in ->dispatch list
> > > > via requeue after timeout is handled.
> > > > 
> > > 
> > > Hi Ming,
> > > 
> > > > You mentioned that:
> > > > 
> > > > " - I added some debug prints in blk_mq_hctx_drain_inflight_rqs() for 
> > > > when
> > > >  inflights rqs !=0, and I don't see them for this timeout"
> > > > 
> > > > There might be two reasons:
> > > > 
> > > > 1) You are still testing a multiple reply-queue device?
> > > 
> > > As before, I am testing by exposing mutliple queues to the SCSI midlayer. 
> > > I
> > > had to make this change locally, as on mainline we still only expose a
> > > single queue and use the internal reply queue when enabling managed
> > > interrupts.
> > > 
> > > As I
> > > > mentioned last times, it is hard to map reply-queue into blk-mq
> > > > hctx correctly.
> > > 
> > > Here's my branch, if you want to check:
> > > 
> > > https://github.com/hisilicon/kernel-dev/commits/private-topic-sas-5.4-mq-v4
> > > 
> > > It's a bit messy (sorry), but you can see that the reply-queue in the LLDD
> > > is removed in commit 087b95af374.
> > > 
> > > I am now thinking of actually making this change to the LLDD in mainline 
> > > to
> > > avoid any doubt in future.
> > 
> > As I mentioned last time, you do share tags among all MQ queues on your 
> > hardware
> > given your hardware is actually SQ HBA, so commit 087b95af374 is definitely
> > wrong, isn't it?
> > 
> 
> Yes, we share tags among all queues, but we generate the tag - known as IPTT
> - in the LLDD now, as we can no longer use the request tag (as it is not
> unique per all queues):
> 
> https://github.com/hisilicon/kernel-dev/commit/087b95af374be6965583c1673032fb33bc8127e8#diff-f5d8fff19bc539a7387af5230d4e5771R188
> 
> As I said, the branch is messy and I did have to fix 087b95af374.

Firstly, this approach may waste a lot of memory, especially when the queue
depth is big; for example, hisilicon V3's queue depth is 4096.

Secondly, you have to deal with queue-busy conditions efficiently and
correctly: your real hw tags (IPTT) can be used up easily, and how will you
handle the requests that have already been dispatched?

Finally, you have to evaluate the performance effect; this is highly related
to how you deal with running out of IPTT.

I'd suggest fixing this up and posting the patches for review.

Thanks,
Ming



Re: [PATCH V4 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-21 Thread Ming Lei
On Mon, Oct 21, 2019 at 10:19:18AM +0100, John Garry wrote:
> On 20/10/2019 11:14, Ming Lei wrote:
> > > > ght? If so, I need to find some simple sysfs entry to
> > > > > > tell me of this occurrence, to trigger the capture. Or add 
> > > > > > something. My
> > > > > > script is pretty dump.
> > > > > >
> > > > > > BTW, I did notice that we the dump_stack in __blk_mq_run_hw_queue()
> > > > > > pretty soon before the problem happens - maybe a clue or maybe 
> > > > > > coincidence.
> > > > > >
> > > >
> > > > I finally got to capture that debugfs dump at the point the SCSI IOs
> > > > timeout, as attached. Let me know if any problem receiving it.
> > > >
> > > > Here's a kernel log snippet at that point (I added some print for the
> > > > timeout):
> > > >
> > > > 609] psci: CPU6 killed.
> > > > [  547.722217] CPU5: shutdown
> > > > [  547.724926] psci: CPU5 killed.
> > > > [  547.749951] irq_shutdown
> > > > [  547.752701] IRQ 800: no longer affine to CPU4
> > > > [  547.757265] CPU4: shutdown
> > > > [  547.759971] psci: CPU4 killed.
> > > > [  547.790348] CPU3: shutdown
> > > > [  547.793052] psci: CPU3 killed.
> > > > [  547.818330] CPU2: shutdown
> > > > [  547.821033] psci: CPU2 killed.
> > > > [  547.854285] CPU1: shutdown
> > > > [  547.856989] psci: CPU1 killed.
> > > > [  575.925307] scsi_timeout req=0x0023b0dd9c00 reserved=0
> > > > [  575.930794] scsi_timeout req=0x0023b0df2700 reserved=0
> > > From the debugfs log, 66 requests are dumped, and 63 of them has
> > been submitted to device, and the other 3 is in ->dispatch list
> > via requeue after timeout is handled.
> > 
> 
> Hi Ming,
> 
> > You mentioned that:
> > 
> > " - I added some debug prints in blk_mq_hctx_drain_inflight_rqs() for when
> >  inflights rqs !=0, and I don't see them for this timeout"
> > 
> > There might be two reasons:
> > 
> > 1) You are still testing a multiple reply-queue device?
> 
> As before, I am testing by exposing mutliple queues to the SCSI midlayer. I
> had to make this change locally, as on mainline we still only expose a
> single queue and use the internal reply queue when enabling managed
> interrupts.
> 
> As I
> > mentioned last times, it is hard to map reply-queue into blk-mq
> > hctx correctly.
> 
> Here's my branch, if you want to check:
> 
> https://github.com/hisilicon/kernel-dev/commits/private-topic-sas-5.4-mq-v4
> 
> It's a bit messy (sorry), but you can see that the reply-queue in the LLDD
> is removed in commit 087b95af374.
> 
> I am now thinking of actually making this change to the LLDD in mainline to
> avoid any doubt in future.

As I mentioned last time, you do share tags among all MQ queues on your
hardware, given that your hardware is actually an SQ HBA, so commit
087b95af374 is definitely wrong, isn't it?

It can be very hard to partition the single tag space among multiple hctxs.


Thanks,
Ming



Re: [PATCH V4 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-20 Thread Ming Lei
On Thu, Oct 17, 2019 at 04:40:12PM +0100, John Garry wrote:
> On 16/10/2019 17:19, John Garry wrote:
> > On 16/10/2019 13:07, Ming Lei wrote:
> > > On Wed, Oct 16, 2019 at 09:58:27AM +0100, John Garry wrote:
> > > > On 14/10/2019 02:50, Ming Lei wrote:
> > > > > Hi,
> > > > > 
> > > > > Thomas mentioned:
> > > > > "
> > > > >  That was the constraint of managed interrupts from the very
> > > > > beginning:
> > > > > 
> > > > >   The driver/subsystem has to quiesce the interrupt line and the
> > > > > associated
> > > > >   queue _before_ it gets shutdown in CPU unplug and not fiddle
> > > > > with it
> > > > >   until it's restarted by the core when the CPU is plugged in
> > > > > again.
> > > > > "
> > > > > 
> > > > > But no drivers or blk-mq do that before one hctx becomes dead(all
> > > > > CPUs for one hctx are offline), and even it is worse, blk-mq stills
> > > > > tries
> > > > > to run hw queue after hctx is dead, see blk_mq_hctx_notify_dead().
> > > > > 
> > > > > This patchset tries to address the issue by two stages:
> > > > > 
> > > > > 1) add one new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE
> > > > > 
> > > > > - mark the hctx as internal stopped, and drain all in-flight requests
> > > > > if the hctx is going to be dead.
> > > > > 
> > > > > 2) re-submit IO in the state of CPUHP_BLK_MQ_DEAD after the hctx
> > > > > becomes dead
> > > > > 
> > > > > - steal bios from the request, and resubmit them via
> > > > > generic_make_request(),
> > > > > then these IO will be mapped to other live hctx for dispatch
> > > > > 
> > > > > Please comment & review, thanks!
> > > > > 
> > > > > John, I don't add your tested-by tag since V3 have some changes,
> > > > > and I appreciate if you may run your test on V3.
> > > > 
> > > > Hi Ming,
> > > > 
> > > > So I got around to doing some testing. The good news is that issue
> > > > which we
> > > > were experiencing in v3 series seems to have has gone away - alot more
> > > > stable.
> > > > 
> > > > However, unfortunately, I did notice some SCSI timeouts:
> > > > 
> > > > 15508.615074] CPU2: shutdown
> > > > [15508.617778] psci: CPU2 killed.
> > > > [15508.651220] CPU1: shutdown
> > > > [15508.653924] psci: CPU1 killed.
> > > > [15518.406229] sas: Enter sas_scsi_recover_host busy: 63 failed: 63
> > > > Jobs: 1 (f=1): [R] [0.0% done] [0[15518.412239] sas: sas_scsi_find_task:
> > > > aborting task 0xa7159744
> > > > KB/0KB/0KB /s] [0/0/0 iops] [eta [15518.421708] sas:
> > > > sas_eh_handle_sas_errors: task 0xa7159744 is done
> > > > [15518.431266] sas: sas_scsi_find_task: aborting task 0xd39731eb
> > > > [15518.442539] sas: sas_eh_handle_sas_errors: task 0xd39731eb is
> > > > done
> > > > [15518.449407] sas: sas_scsi_find_task: aborting task 0x9f77c9bd
> > > > [15518.455899] sas: sas_eh_handle_sas_errors: task 0x9f77c9bd is
> > > > done
> > > > 
> > > > A couple of things to note:
> > > > - I added some debug prints in blk_mq_hctx_drain_inflight_rqs() for when
> > > > inflights rqs !=0, and I don't see them for this timeout
> > > > - 0 datarate reported from fio
> > > > 
> > > > I'll have a look...
> > > 
> > > What is the output of the following command?
> > > 
> > > (cd /sys/kernel/debug/block/$SAS_DISK && find . -type f -exec grep -aH
> > > . {} \;)
> > I assume that you want this run at about the time SCSI EH kicks in for
> > the timeout, right? If so, I need to find some simple sysfs entry to
> > tell me of this occurrence, to trigger the capture. Or add something. My
> > script is pretty dump.
> > 
> > BTW, I did notice that we the dump_stack in __blk_mq_run_hw_queue()
> > pretty soon before the problem happens - maybe a clue or maybe coincidence.
> > 
> 
> I finally got to capture that debugfs dump at the point the SCSI IOs
> timeout, as attached. Let me know if any problem receiving it.

Re: [PATCH V4 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-16 Thread Ming Lei
On Wed, Oct 16, 2019 at 09:58:27AM +0100, John Garry wrote:
> On 14/10/2019 02:50, Ming Lei wrote:
> > Hi,
> > 
> > Thomas mentioned:
> > "
> >  That was the constraint of managed interrupts from the very beginning:
> > 
> >   The driver/subsystem has to quiesce the interrupt line and the 
> > associated
> >   queue _before_ it gets shutdown in CPU unplug and not fiddle with it
> >   until it's restarted by the core when the CPU is plugged in again.
> > "
> > 
> > But no drivers or blk-mq do that before one hctx becomes dead(all
> > CPUs for one hctx are offline), and even it is worse, blk-mq stills tries
> > to run hw queue after hctx is dead, see blk_mq_hctx_notify_dead().
> > 
> > This patchset tries to address the issue by two stages:
> > 
> > 1) add one new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE
> > 
> > - mark the hctx as internal stopped, and drain all in-flight requests
> > if the hctx is going to be dead.
> > 
> > 2) re-submit IO in the state of CPUHP_BLK_MQ_DEAD after the hctx becomes 
> > dead
> > 
> > - steal bios from the request, and resubmit them via generic_make_request(),
> > then these IO will be mapped to other live hctx for dispatch
> > 
> > Please comment & review, thanks!
> > 
> > John, I don't add your tested-by tag since V3 have some changes,
> > and I appreciate if you may run your test on V3.
> 
> Hi Ming,
> 
> So I got around to doing some testing. The good news is that issue which we
> were experiencing in v3 series seems to have has gone away - alot more
> stable.
> 
> However, unfortunately, I did notice some SCSI timeouts:
> 
> 15508.615074] CPU2: shutdown
> [15508.617778] psci: CPU2 killed.
> [15508.651220] CPU1: shutdown
> [15508.653924] psci: CPU1 killed.
> [15518.406229] sas: Enter sas_scsi_recover_host busy: 63 failed: 63
> Jobs: 1 (f=1): [R] [0.0% done] [0[15518.412239] sas: sas_scsi_find_task:
> aborting task 0xa7159744
> KB/0KB/0KB /s] [0/0/0 iops] [eta [15518.421708] sas:
> sas_eh_handle_sas_errors: task 0xa7159744 is done
> [15518.431266] sas: sas_scsi_find_task: aborting task 0xd39731eb
> [15518.442539] sas: sas_eh_handle_sas_errors: task 0xd39731eb is
> done
> [15518.449407] sas: sas_scsi_find_task: aborting task 0x9f77c9bd
> [15518.455899] sas: sas_eh_handle_sas_errors: task 0x9f77c9bd is
> done
> 
> A couple of things to note:
> - I added some debug prints in blk_mq_hctx_drain_inflight_rqs() for when
> inflights rqs !=0, and I don't see them for this timeout
> - 0 datarate reported from fio
> 
> I'll have a look...

What is the output of the following command?

(cd /sys/kernel/debug/block/$SAS_DISK && find . -type f -exec grep -aH . {} \;)

Thanks,
Ming


[PATCH V4 5/5] blk-mq: handle requests dispatched from IO scheduler in case that hctx is dead

2019-10-13 Thread Ming Lei
If a hctx becomes dead, all in-queue IO requests aimed at this hctx have to
be re-submitted, so also cover requests queued in the scheduler queue.

Cc: John Garry 
Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Reviewed-by: Hannes Reinecke 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 30 +-
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 17f0a9ef32a8..06081966549f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2305,6 +2305,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
enum hctx_type type;
bool hctx_dead;
struct request *rq;
+   struct elevator_queue *e;
 
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
@@ -2315,12 +2316,31 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
hctx_dead = cpumask_first_and(hctx->cpumask, cpu_online_mask) >=
nr_cpu_ids;
 
-   spin_lock(&ctx->lock);
-   if (!list_empty(&ctx->rq_lists[type])) {
-   list_splice_init(&ctx->rq_lists[type], &tmp);
-   blk_mq_hctx_clear_pending(hctx, ctx);
+   e = hctx->queue->elevator;
+   if (!e) {
+   spin_lock(&ctx->lock);
+   if (!list_empty(&ctx->rq_lists[type])) {
+   list_splice_init(&ctx->rq_lists[type], &tmp);
+   blk_mq_hctx_clear_pending(hctx, ctx);
+   }
+   spin_unlock(&ctx->lock);
+   } else if (hctx_dead) {
+   LIST_HEAD(sched_tmp);
+
+   while ((rq = e->type->ops.dispatch_request(hctx))) {
+   if (rq->mq_hctx != hctx)
+   list_add(&rq->queuelist, &sched_tmp);
+   else
+   list_add(&rq->queuelist, &tmp);
+   }
+
+   while (!list_empty(&sched_tmp)) {
+   rq = list_entry(sched_tmp.next, struct request,
+   queuelist);
+   list_del_init(&rq->queuelist);
+   blk_mq_sched_insert_request(rq, true, true, true);
+   }
}
-   spin_unlock(&ctx->lock);
 
if (!hctx_dead) {
if (list_empty(&tmp))
-- 
2.20.1



[PATCH V4 1/5] blk-mq: add new state of BLK_MQ_S_INTERNAL_STOPPED

2019-10-13 Thread Ming Lei
Add a new hw queue state of BLK_MQ_S_INTERNAL_STOPPED, which prepares
for stopping hw queue before all CPUs of this hctx become offline.

We can't reuse BLK_MQ_S_STOPPED because that state can be cleared during IO
completion.

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Cc: John Garry 
Reviewed-by: Hannes Reinecke 
Signed-off-by: Ming Lei 
---
 block/blk-mq-debugfs.c | 1 +
 block/blk-mq.h | 3 ++-
 include/linux/blk-mq.h | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b3f2ba483992..af40a02c46ee 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(STOPPED),
HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART),
+   HCTX_STATE_NAME(INTERNAL_STOPPED),
 };
 #undef HCTX_STATE_NAME
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 32c62c64e6c2..63717573bc16 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -176,7 +176,8 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
 
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 {
-   return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
+   return test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+   test_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
 }
 
 static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0bf056de5cc3..079c282e4471 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -235,6 +235,9 @@ enum {
BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_S_SCHED_RESTART  = 2,
 
+   /* hw queue is internal stopped, driver do not use it */
+   BLK_MQ_S_INTERNAL_STOPPED   = 3,
+
BLK_MQ_MAX_DEPTH= 10240,
 
BLK_MQ_CPU_WORK_BATCH   = 8,
-- 
2.20.1



[PATCH V4 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-13 Thread Ming Lei
Hi,

Thomas mentioned:
"
 That was the constraint of managed interrupts from the very beginning:

  The driver/subsystem has to quiesce the interrupt line and the associated
  queue _before_ it gets shutdown in CPU unplug and not fiddle with it
  until it's restarted by the core when the CPU is plugged in again.
"

But neither drivers nor blk-mq do that before one hctx becomes dead (all
CPUs for one hctx are offline); even worse, blk-mq still tries to run the
hw queue after the hctx is dead, see blk_mq_hctx_notify_dead().

This patchset tries to address the issue by two stages:

1) add one new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE

- mark the hctx as internally stopped, and drain all in-flight requests
if the hctx is going to become dead.

2) re-submit IO in the state of CPUHP_BLK_MQ_DEAD after the hctx becomes dead

- steal bios from the requests and resubmit them via generic_make_request();
these IOs will then be mapped to other live hctxs for dispatch

Please comment & review, thanks!

John, I didn't add your Tested-by tag since V3 has some changes,
and I'd appreciate it if you could run your test on V3.

V4:
- resubmit IOs in dispatch list in case that this hctx is dead 

V3:
- re-organize patch 2 & 3 a bit for addressing Hannes's comment
- fix patch 4 for avoiding potential deadlock, as found by Hannes

V2:
- patch4 & patch 5 in V1 have been merged to block tree, so remove
  them
- address comments from John Garry and Minwoo



Ming Lei (5):
  blk-mq: add new state of BLK_MQ_S_INTERNAL_STOPPED
  blk-mq: prepare for draining IO when hctx's all CPUs are offline
  blk-mq: stop to handle IO and drain IO before hctx becomes dead
  blk-mq: re-submit IO in case that hctx is dead
  blk-mq: handle requests dispatched from IO scheduler in case that hctx
is dead

 block/blk-mq-debugfs.c |   2 +
 block/blk-mq-tag.c |   2 +-
 block/blk-mq-tag.h |   2 +
 block/blk-mq.c | 137 ++---
 block/blk-mq.h |   3 +-
 drivers/block/loop.c   |   2 +-
 drivers/md/dm-rq.c |   2 +-
 include/linux/blk-mq.h |   5 ++
 include/linux/cpuhotplug.h |   1 +
 9 files changed, 141 insertions(+), 15 deletions(-)

Cc: John Garry 
Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 

-- 
2.20.1



[PATCH V4 2/5] blk-mq: prepare for draining IO when hctx's all CPUs are offline

2019-10-13 Thread Ming Lei
Most blk-mq drivers depend on the managed IRQ's auto-affinity to set up
queue mapping. Thomas mentioned the following point[1]:

"
 That was the constraint of managed interrupts from the very beginning:

  The driver/subsystem has to quiesce the interrupt line and the associated
  queue _before_ it gets shutdown in CPU unplug and not fiddle with it
  until it's restarted by the core when the CPU is plugged in again.
"

However, the current blk-mq implementation doesn't quiesce the hw queue before
the last CPU in the hctx is shut down. Even worse, CPUHP_BLK_MQ_DEAD is
a cpuhp state handled after the CPU is down, so there isn't any chance
for blk-mq to quiesce the hctx wrt. CPU hotplug.

Add new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE for blk-mq to stop queues
and wait for completion of in-flight requests.

In the following patch we will stop the hw queue and wait for completion of
in-flight requests when one hctx is becoming dead. Doing so may cause a
deadlock for some stacking blk-mq drivers, such as dm-rq and loop.

Add the blk-mq flag BLK_MQ_F_NO_MANAGED_IRQ and mark it for dm-rq and
loop, so we needn't wait for completion of in-flight requests from
dm-rq & loop, and the potential deadlock can be avoided.

[1] 
https://lore.kernel.org/linux-block/alpine.deb.2.21.1904051331270.1...@nanos.tec.linutronix.de/

Cc: John Garry 
Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq-debugfs.c |  1 +
 block/blk-mq.c | 13 +
 drivers/block/loop.c   |  2 +-
 drivers/md/dm-rq.c |  2 +-
 include/linux/blk-mq.h |  2 ++
 include/linux/cpuhotplug.h |  1 +
 6 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index af40a02c46ee..24fff8c90942 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -240,6 +240,7 @@ static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(TAG_SHARED),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
+   HCTX_FLAG_NAME(NO_MANAGED_IRQ),
 };
 #undef HCTX_FLAG_NAME
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ec791156e9cc..a664f196782a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2225,6 +2225,11 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
return -ENOMEM;
 }
 
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+   return 0;
+}
+
 /*
  * 'cpu' is going away. splice any existing rq_list entries from this
  * software queue to the hw queue dispatch list, and ensure that it
@@ -2261,6 +2266,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 
 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
+   if (!(hctx->flags & BLK_MQ_F_NO_MANAGED_IRQ))
+   cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+   &hctx->cpuhp_online);
cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
&hctx->cpuhp_dead);
 }
@@ -2320,6 +2328,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
 {
hctx->queue_num = hctx_idx;
 
+   if (!(hctx->flags & BLK_MQ_F_NO_MANAGED_IRQ))
+   cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+   &hctx->cpuhp_online);
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
hctx->tags = set->tags[hctx_idx];
@@ -3547,6 +3558,8 @@ static int __init blk_mq_init(void)
 {
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
+   cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
+   NULL, blk_mq_hctx_notify_online);
return 0;
 }
 subsys_initcall(blk_mq_init);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f6f77eaa7217..751a28a1d4b0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1999,7 +1999,7 @@ static int loop_add(struct loop_device **l, int i)
lo->tag_set.queue_depth = 128;
lo->tag_set.numa_node = NUMA_NO_NODE;
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_NO_MANAGED_IRQ;
lo->tag_set.driver_data = lo;
 
err = blk_mq_alloc_tag_set(&lo->tag_set);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 3f8577e2c13b..5f1ff70ac029 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
md->tag_set->ops = &dm_mq_ops;
md->tag_set

[PATCH V4 4/5] blk-mq: re-submit IO in case that hctx is dead

2019-10-13 Thread Ming Lei
When all CPUs in one hctx are offline, we shouldn't run this hw queue
for completing requests any more.

So steal the bios from the request, resubmit them, and finally free
the request in blk_mq_hctx_notify_dead().

Cc: John Garry 
Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 54 --
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3384242202eb..17f0a9ef32a8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2268,10 +2268,34 @@ static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
return 0;
 }
 
+static void blk_mq_resubmit_io(struct request *rq)
+{
+   struct bio_list list;
+   struct bio *bio;
+
+   bio_list_init(&list);
+   blk_steal_bios(&list, rq);
+
+   /*
+* Free the old empty request before submitting bio for avoiding
+* potential deadlock
+*/
+   blk_mq_cleanup_rq(rq);
+   blk_mq_end_request(rq, 0);
+
+   while (true) {
+   bio = bio_list_pop(&list);
+   if (!bio)
+   break;
+
+   generic_make_request(bio);
+   }
+}
+
 /*
- * 'cpu' is going away. splice any existing rq_list entries from this
- * software queue to the hw queue dispatch list, and ensure that it
- * gets run.
+ * 'cpu' has gone away. If this hctx is dead, we can't dispatch request
+ * to the hctx any more, so steal bios from requests of this hctx, and
+ * re-submit them to the request queue, and free these requests finally.
  */
 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 {
@@ -2279,6 +2303,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
struct blk_mq_ctx *ctx;
LIST_HEAD(tmp);
enum hctx_type type;
+   bool hctx_dead;
+   struct request *rq;
 
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
@@ -2286,6 +2312,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 
clear_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
 
+   hctx_dead = cpumask_first_and(hctx->cpumask, cpu_online_mask) >=
+   nr_cpu_ids;
+
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_lists[type])) {
list_splice_init(&ctx->rq_lists[type], &tmp);
@@ -2293,14 +2322,27 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
}
spin_unlock(&ctx->lock);
 
-   if (list_empty(&tmp))
+   if (!hctx_dead) {
+   if (list_empty(&tmp))
+   return 0;
+   spin_lock(&hctx->lock);
+   list_splice_tail_init(&tmp, &hctx->dispatch);
+   spin_unlock(&hctx->lock);
+   blk_mq_run_hw_queue(hctx, true);
return 0;
+   }
 
+   /* requests in dispatch list has to be re-submitted too */
spin_lock(&hctx->lock);
-   list_splice_tail_init(&tmp, &hctx->dispatch);
+   list_splice_tail_init(&hctx->dispatch, &tmp);
spin_unlock(&hctx->lock);
 
-   blk_mq_run_hw_queue(hctx, true);
+   while (!list_empty(&tmp)) {
+   rq = list_entry(tmp.next, struct request, queuelist);
+   list_del_init(&rq->queuelist);
+   blk_mq_resubmit_io(rq);
+   }
+
return 0;
 }
 
-- 
2.20.1



[PATCH V4 3/5] blk-mq: stop to handle IO and drain IO before hctx becomes dead

2019-10-13 Thread Ming Lei
Before one CPU becomes offline, check if it is the last online CPU of the
hctx. If yes, mark this hctx as BLK_MQ_S_INTERNAL_STOPPED and meanwhile
wait for completion of all in-flight IOs originating from this hctx.

This guarantees that there isn't any in-flight IO before shutting down
the managed IRQ line.

Cc: John Garry 
Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq-tag.c |  2 +-
 block/blk-mq-tag.h |  2 ++
 block/blk-mq.c | 40 
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 008388e82b5c..31828b82552b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -325,7 +325,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
  * true to continue iterating tags, false to stop.
  * @priv:  Will be passed as second argument to @fn.
  */
-static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
busy_tag_iter_fn *fn, void *priv)
 {
if (tags->nr_reserved_tags)
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..321fd6f440e6 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -35,6 +35,8 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv);
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+   busy_tag_iter_fn *fn, void *priv);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a664f196782a..3384242202eb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2225,8 +2225,46 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
return -ENOMEM;
 }
 
+static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
+bool reserved)
+{
+   unsigned *count = data;
+
+   if ((blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT))
+   (*count)++;
+
+   return true;
+}
+
+static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_tags *tags)
+{
+   unsigned count = 0;
+
+   blk_mq_all_tag_busy_iter(tags, blk_mq_count_inflight_rq, &count);
+
+   return count;
+}
+
+static void blk_mq_hctx_drain_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
+   while (1) {
+   if (!blk_mq_tags_inflight_rqs(hctx->tags))
+   break;
+   msleep(5);
+   }
+}
+
 static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
 {
+   struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+   struct blk_mq_hw_ctx, cpuhp_online);
+
+   if ((cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) == cpu) &&
+   (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) >=
+nr_cpu_ids)) {
+   set_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
+   blk_mq_hctx_drain_inflight_rqs(hctx);
+}
return 0;
 }
 
@@ -2246,6 +2284,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
type = hctx->type;
 
+   clear_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
+
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_lists[type])) {
list_splice_init(&ctx->rq_lists[type], &tmp);
-- 
2.20.1



Re: [PATCH V3 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-13 Thread Ming Lei
On Fri, Oct 11, 2019 at 10:10 PM John Garry  wrote:
>
> On 11/10/2019 12:55, Ming Lei wrote:
> > On Fri, Oct 11, 2019 at 4:54 PM John Garry  wrote:
> >>
> >> On 10/10/2019 12:21, John Garry wrote:
> >>>
> >>>>
> >>>> As discussed before, tags of hisilicon V3 is HBA wide. If you switch
> >>>> to real hw queue, each hw queue has to own its independent tags.
> >>>> However, that isn't supported by V3 hardware.
> >>>
> >>> I am generating the tag internally in the driver now, so that hostwide
> >>> tags issue should not be an issue.
> >>>
> >>> And, to be clear, I am not paying too much attention to performance, but
> >>> rather just hotplugging while running IO.
> >>>
> >>> An update on testing:
> >>> I did some scripted overnight testing. The script essentially loops like
> >>> this:
> >>> - online all CPUS
> >>> - run fio binded on a limited bunch of CPUs to cover a hctx mask for 1
> >>> minute
> >>> - offline those CPUs
> >>> - wait 1 minute (> SCSI or NVMe timeout)
> >>> - and repeat
> >>>
> >>> SCSI is actually quite stable, but NVMe isn't. For NVMe I am finding
> >>> some fio processes never dying with IOPS @ 0. I don't see any NVMe
> >>> timeout reported. Did you do any NVMe testing of this sort?
> >>>
> >>
> >> Yeah, so for NVMe, I see some sort of regression, like this:
> >> Jobs: 1 (f=1): [_R] [0.0% done] [0KB/0KB/0KB /s] [0/0/0 iops] [eta
> >> 1158037877d:17h:18m:22s]
> >
> > I can reproduce this issue, and looks there are requests in ->dispatch.
>
> OK, that may match with what I see:
> - the problem occuring coincides with this callpath with
> BLK_MQ_S_INTERNAL_STOPPED set:

Good catch, these requests should have been re-submitted in
blk_mq_hctx_notify_dead() too.

Will do it in V4.

Thanks,
Ming Lei


Re: [PATCH V3 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-11 Thread Ming Lei
On Fri, Oct 11, 2019 at 4:54 PM John Garry  wrote:
>
> On 10/10/2019 12:21, John Garry wrote:
> >
> >>
> >> As discussed before, tags of hisilicon V3 is HBA wide. If you switch
> >> to real hw queue, each hw queue has to own its independent tags.
> >> However, that isn't supported by V3 hardware.
> >
> > I am generating the tag internally in the driver now, so that hostwide
> > tags issue should not be an issue.
> >
> > And, to be clear, I am not paying too much attention to performance, but
> > rather just hotplugging while running IO.
> >
> > An update on testing:
> > I did some scripted overnight testing. The script essentially loops like
> > this:
> > - online all CPUS
> > - run fio binded on a limited bunch of CPUs to cover a hctx mask for 1
> > minute
> > - offline those CPUs
> > - wait 1 minute (> SCSI or NVMe timeout)
> > - and repeat
> >
> > SCSI is actually quite stable, but NVMe isn't. For NVMe I am finding
> > some fio processes never dying with IOPS @ 0. I don't see any NVMe
> > timeout reported. Did you do any NVMe testing of this sort?
> >
>
> Yeah, so for NVMe, I see some sort of regression, like this:
> Jobs: 1 (f=1): [_R] [0.0% done] [0KB/0KB/0KB /s] [0/0/0 iops] [eta
> 1158037877d:17h:18m:22s]

I can reproduce this issue, and it looks like there are requests stuck in
->dispatch. I am a bit busy this week, so please feel free to investigate
it; debugfs can help you a lot here. I may have time next week to look
into this issue.

Thanks,
Ming Lei


Re: [PATCH V3 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-10 Thread Ming Lei
On Wed, Oct 09, 2019 at 09:49:35AM +0100, John Garry wrote:
> > > > > - steal bios from the request, and resubmit them via
> > > > > generic_make_request(),
> > > > > then these IO will be mapped to other live hctx for dispatch
> > > > > 
> > > > > Please comment & review, thanks!
> > > > > 
> > > > > John, I don't add your tested-by tag since V3 have some changes,
> > > > > and I appreciate if you may run your test on V3.
> > > > > 
> > > > 
> > > > Will do, Thanks
> > > 
> > > Hi Ming,
> > > 
> > > I got this warning once:
> > > 
> > > [  162.558185] CPU10: shutdown
> > > [  162.560994] psci: CPU10 killed.
> > > [  162.593939] CPU9: shutdown
> > > [  162.596645] psci: CPU9 killed.
> > > [  162.625838] CPU8: shutdown
> > > [  162.628550] psci: CPU8 killed.
> > > [  162.685790] CPU7: shutdown
> > > [  162.688496] psci: CPU7 killed.
> > > [  162.725771] CPU6: shutdown
> > > [  162.728486] psci: CPU6 killed.
> > > [  162.753884] CPU5: shutdown
> > > [  162.756591] psci: CPU5 killed.
> > > [  162.785584] irq_shutdown
> > > [  162.788277] IRQ 800: no longer affine to CPU4
> > > [  162.793267] CPU4: shutdown
> > > [  162.795975] psci: CPU4 killed.
> > > [  162.849680] run queue from wrong CPU 13, hctx active
> > > [  162.849692] CPU3: shutdown
> > > [  162.854649] CPU: 13 PID: 874 Comm: kworker/3:2H Not tainted
> > > 5.4.0-rc1-00012-gad025dd3d001 #1098
> > > [  162.854653] Hardware name: Huawei D06 /D06, BIOS Hisilicon D06 UEFI 
> > > RC0 -
> > > V1.16.01 03/15/2019
> > > [  162.857362] psci: CPU3 killed.
> > > [  162.866039] Workqueue: kblockd blk_mq_run_work_fn
> > > [  162.882281] Call trace:
> > > [  162.884716]  dump_backtrace+0x0/0x150
> > > [  162.888365]  show_stack+0x14/0x20
> > > [  162.891668]  dump_stack+0xb0/0xf8
> > > [  162.894970]  __blk_mq_run_hw_queue+0x11c/0x128
> > > [  162.899400]  blk_mq_run_work_fn+0x1c/0x28
> > > [  162.903397]  process_one_work+0x1e0/0x358
> > > [  162.907393]  worker_thread+0x40/0x488
> > > [  162.911042]  kthread+0x118/0x120
> > > [  162.914257]  ret_from_fork+0x10/0x18
> > 
> > What is the HBA? If it is Hisilicon SAS, it isn't strange, because
> > this patch can't fix single hw queue with multiple private reply queue
> > yet, that can be one follow-up job of this patchset.
> > 
> 
> Yes, hisi_sas. So, right, it is single queue today on mainline, but I
> manually made it multiqueue on my dev branch just to test this series.
> Otherwise I could not test it for that driver.
> 
> My dev branch is here, if interested:
> https://github.com/hisilicon/kernel-dev/commits/private-topic-sas-5.4-mq

Your conversion shouldn't work, given that you do not change .can_queue in
the 'hisi_sas_v3: multiqueue support' patch.

As discussed before, the tags of Hisilicon V3 are HBA-wide. If you switch
to real hw queues, each hw queue has to own its independent tags; however,
that isn't supported by the V3 hardware (see the illustrative sketch below).

See previous discussion:

https://marc.info/?t=15592886301&r=1&w=2
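
To illustrate the point with made-up numbers (not the actual driver values):
with real hw queues, blk-mq allocates an independent tag space per hctx, so
the host would be advertising nr_hw_queues * queue_depth commands in total.
An HBA whose tag space is host-wide cannot back that, unless the per-queue
depth (and hence .can_queue) is divided down, roughly like:

/* illustration only, hypothetical numbers */
#define HBA_CAN_QUEUE           4096    /* host-wide tag space of the HBA */
#define HBA_NR_HW_QUEUES        16

struct blk_mq_tag_set set = {
        .nr_hw_queues   = HBA_NR_HW_QUEUES,
        .queue_depth    = HBA_CAN_QUEUE / HBA_NR_HW_QUEUES,    /* per hctx */
};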


Thanks,
Ming


Re: [PATCH V3 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-09 Thread Ming Lei
On Tue, Oct 08, 2019 at 06:15:52PM +0100, John Garry wrote:
> On 08/10/2019 10:06, John Garry wrote:
> > On 08/10/2019 05:18, Ming Lei wrote:
> > > Hi,
> > > 
> > > Thomas mentioned:
> > > "
> > >  That was the constraint of managed interrupts from the very
> > > beginning:
> > > 
> > >   The driver/subsystem has to quiesce the interrupt line and the
> > > associated
> > >   queue _before_ it gets shutdown in CPU unplug and not fiddle
> > > with it
> > >   until it's restarted by the core when the CPU is plugged in again.
> > > "
> > > 
> > > But no drivers or blk-mq do that before one hctx becomes dead(all
> > > CPUs for one hctx are offline), and even it is worse, blk-mq stills tries
> > > to run hw queue after hctx is dead, see blk_mq_hctx_notify_dead().
> > > 
> > > This patchset tries to address the issue by two stages:
> > > 
> > > 1) add one new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE
> > > 
> > > - mark the hctx as internal stopped, and drain all in-flight requests
> > > if the hctx is going to be dead.
> > > 
> > > 2) re-submit IO in the state of CPUHP_BLK_MQ_DEAD after the hctx
> > > becomes dead
> > > 
> > > - steal bios from the request, and resubmit them via
> > > generic_make_request(),
> > > then these IO will be mapped to other live hctx for dispatch
> > > 
> > > Please comment & review, thanks!
> > > 
> > > John, I don't add your tested-by tag since V3 have some changes,
> > > and I appreciate if you may run your test on V3.
> > > 
> > 
> > Will do, Thanks
> 
> Hi Ming,
> 
> I got this warning once:
> 
> [  162.558185] CPU10: shutdown
> [  162.560994] psci: CPU10 killed.
> [  162.593939] CPU9: shutdown
> [  162.596645] psci: CPU9 killed.
> [  162.625838] CPU8: shutdown
> [  162.628550] psci: CPU8 killed.
> [  162.685790] CPU7: shutdown
> [  162.688496] psci: CPU7 killed.
> [  162.725771] CPU6: shutdown
> [  162.728486] psci: CPU6 killed.
> [  162.753884] CPU5: shutdown
> [  162.756591] psci: CPU5 killed.
> [  162.785584] irq_shutdown
> [  162.788277] IRQ 800: no longer affine to CPU4
> [  162.793267] CPU4: shutdown
> [  162.795975] psci: CPU4 killed.
> [  162.849680] run queue from wrong CPU 13, hctx active
> [  162.849692] CPU3: shutdown
> [  162.854649] CPU: 13 PID: 874 Comm: kworker/3:2H Not tainted
> 5.4.0-rc1-00012-gad025dd3d001 #1098
> [  162.854653] Hardware name: Huawei D06 /D06, BIOS Hisilicon D06 UEFI RC0 -
> V1.16.01 03/15/2019
> [  162.857362] psci: CPU3 killed.
> [  162.866039] Workqueue: kblockd blk_mq_run_work_fn
> [  162.882281] Call trace:
> [  162.884716]  dump_backtrace+0x0/0x150
> [  162.888365]  show_stack+0x14/0x20
> [  162.891668]  dump_stack+0xb0/0xf8
> [  162.894970]  __blk_mq_run_hw_queue+0x11c/0x128
> [  162.899400]  blk_mq_run_work_fn+0x1c/0x28
> [  162.903397]  process_one_work+0x1e0/0x358
> [  162.907393]  worker_thread+0x40/0x488
> [  162.911042]  kthread+0x118/0x120
> [  162.914257]  ret_from_fork+0x10/0x18

What is the HBA? If it is Hisilicon SAS, it isn't strange, because this
patchset can't yet fix the case of a single hw queue with multiple private
reply queues; that can be a follow-up job of this patchset.

Thanks,
Ming


[PATCH V3 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-07 Thread Ming Lei
Hi,

Thomas mentioned:
"
 That was the constraint of managed interrupts from the very beginning:

  The driver/subsystem has to quiesce the interrupt line and the associated
  queue _before_ it gets shutdown in CPU unplug and not fiddle with it
  until it's restarted by the core when the CPU is plugged in again.
"

But no driver, nor blk-mq itself, does that before one hctx becomes dead (all
CPUs for one hctx are offline). Even worse, blk-mq still tries to run the
hw queue after the hctx is dead, see blk_mq_hctx_notify_dead().
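
A simplified sketch of that existing blk_mq_hctx_notify_dead() flow,
reconstructed from the context lines of the diffs in this thread (locking
and details trimmed); note that it keeps running the hw queue even when
every CPU of the hctx is already offline:

static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
                        struct blk_mq_hw_ctx, cpuhp_dead);
        struct blk_mq_ctx *ctx = __blk_mq_get_ctx(hctx->queue, cpu);
        LIST_HEAD(tmp);

        /* move the dead CPU's pending requests off its software queue */
        spin_lock(&ctx->lock);
        if (!list_empty(&ctx->rq_lists[hctx->type])) {
                list_splice_init(&ctx->rq_lists[hctx->type], &tmp);
                blk_mq_hctx_clear_pending(hctx, ctx);
        }
        spin_unlock(&ctx->lock);

        if (list_empty(&tmp))
                return 0;

        /* ...and run the hw queue again, even if the whole hctx is dead */
        spin_lock(&hctx->lock);
        list_splice_tail_init(&tmp, &hctx->dispatch);
        spin_unlock(&hctx->lock);

        blk_mq_run_hw_queue(hctx, true);
        return 0;
}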

This patchset tries to address the issue by two stages:

1) add one new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE

- mark the hctx as internally stopped, and drain all in-flight requests
if the hctx is going to be dead.

2) re-submit IO in the state of CPUHP_BLK_MQ_DEAD after the hctx becomes dead

- steal bios from the request, and resubmit them via generic_make_request();
these IOs will then be mapped to other live hctxs for dispatch

Please comment & review, thanks!

John, I didn't add your tested-by tag since this version has some changes,
and I'd appreciate it if you could re-run your test on it.

V3:
- re-organize patch 2 & 3 a bit for addressing Hannes's comment
- fix patch 4 for avoiding potential deadlock, as found by Hannes

V2:
- patch4 & patch 5 in V1 have been merged to block tree, so remove
  them
- address comments from John Garry and Minwoo


Ming Lei (5):
  blk-mq: add new state of BLK_MQ_S_INTERNAL_STOPPED
  blk-mq: prepare for draining IO when hctx's all CPUs are offline
  blk-mq: stop to handle IO and drain IO before hctx becomes dead
  blk-mq: re-submit IO in case that hctx is dead
  blk-mq: handle requests dispatched from IO scheduler in case that hctx
is dead

 block/blk-mq-debugfs.c |   2 +
 block/blk-mq-tag.c |   2 +-
 block/blk-mq-tag.h |   2 +
 block/blk-mq.c | 135 +
 block/blk-mq.h |   3 +-
 drivers/block/loop.c   |   2 +-
 drivers/md/dm-rq.c |   2 +-
 include/linux/blk-mq.h |   5 ++
 include/linux/cpuhotplug.h |   1 +
 9 files changed, 138 insertions(+), 16 deletions(-)

Cc: John Garry 
Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
-- 
2.20.1



[PATCH V3 5/5] blk-mq: handle requests dispatched from IO scheduler in case that hctx is dead

2019-10-07 Thread Ming Lei
If the hctx becomes dead, all in-queue IO requests aimed at this hctx have to
be re-submitted, so also cover requests queued in the scheduler queue.

Cc: John Garry 
Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Reviewed-by: Hannes Reinecke 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 30 +-
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4153c1c4e2aa..4625013a4927 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2305,6 +2305,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
enum hctx_type type;
bool hctx_dead;
struct request *rq;
+   struct elevator_queue *e;
 
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
@@ -2315,12 +2316,31 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
hctx_dead = cpumask_first_and(hctx->cpumask, cpu_online_mask) >=
nr_cpu_ids;
 
-   spin_lock(&ctx->lock);
-   if (!list_empty(&ctx->rq_lists[type])) {
-   list_splice_init(&ctx->rq_lists[type], &tmp);
-   blk_mq_hctx_clear_pending(hctx, ctx);
+   e = hctx->queue->elevator;
+   if (!e) {
+   spin_lock(&ctx->lock);
+   if (!list_empty(&ctx->rq_lists[type])) {
+   list_splice_init(&ctx->rq_lists[type], &tmp);
+   blk_mq_hctx_clear_pending(hctx, ctx);
+   }
+   spin_unlock(&ctx->lock);
+   } else if (hctx_dead) {
+   LIST_HEAD(sched_tmp);
+
+   while ((rq = e->type->ops.dispatch_request(hctx))) {
+   if (rq->mq_hctx != hctx)
+   list_add(&rq->queuelist, &sched_tmp);
+   else
+   list_add(&rq->queuelist, &tmp);
+   }
+
+   while (!list_empty(&sched_tmp)) {
+   rq = list_entry(sched_tmp.next, struct request,
+   queuelist);
+   list_del_init(&rq->queuelist);
+   blk_mq_sched_insert_request(rq, true, true, true);
+   }
}
-   spin_unlock(&ctx->lock);
 
if (list_empty(&tmp))
return 0;
-- 
2.20.1



[PATCH V3 1/5] blk-mq: add new state of BLK_MQ_S_INTERNAL_STOPPED

2019-10-07 Thread Ming Lei
Add a new hw queue state of BLK_MQ_S_INTERNAL_STOPPED, which prepares
for stopping hw queue before all CPUs of this hctx become offline.

We can't reuse BLK_MQ_S_STOPPED because that state can be cleared during IO
completion.

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Cc: John Garry 
Reviewed-by: Hannes Reinecke 
Signed-off-by: Ming Lei 
---
 block/blk-mq-debugfs.c | 1 +
 block/blk-mq.h | 3 ++-
 include/linux/blk-mq.h | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b3f2ba483992..af40a02c46ee 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(STOPPED),
HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART),
+   HCTX_STATE_NAME(INTERNAL_STOPPED),
 };
 #undef HCTX_STATE_NAME
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 32c62c64e6c2..63717573bc16 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -176,7 +176,8 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
 
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 {
-   return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
+   return test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+   test_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
 }
 
 static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0bf056de5cc3..079c282e4471 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -235,6 +235,9 @@ enum {
BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_S_SCHED_RESTART  = 2,
 
+   /* hw queue is internal stopped, driver do not use it */
+   BLK_MQ_S_INTERNAL_STOPPED   = 3,
+
BLK_MQ_MAX_DEPTH= 10240,
 
BLK_MQ_CPU_WORK_BATCH   = 8,
-- 
2.20.1



[PATCH V3 3/5] blk-mq: stop to handle IO and drain IO before hctx becomes dead

2019-10-07 Thread Ming Lei
Before one CPU becomes offline, check if it is the last online CPU of the
hctx. If yes, mark this hctx as BLK_MQ_S_INTERNAL_STOPPED and meanwhile
wait for completion of all in-flight IOs originating from this hctx.

This guarantees that there isn't any in-flight IO before shutting down
the managed IRQ line.

Cc: John Garry 
Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq-tag.c |  2 +-
 block/blk-mq-tag.h |  2 ++
 block/blk-mq.c | 40 
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 008388e82b5c..31828b82552b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -325,7 +325,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
  * true to continue iterating tags, false to stop.
  * @priv:  Will be passed as second argument to @fn.
  */
-static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
busy_tag_iter_fn *fn, void *priv)
 {
if (tags->nr_reserved_tags)
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..321fd6f440e6 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -35,6 +35,8 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv);
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+   busy_tag_iter_fn *fn, void *priv);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a664f196782a..3384242202eb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2225,8 +2225,46 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
return -ENOMEM;
 }
 
+static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
+bool reserved)
+{
+   unsigned *count = data;
+
+   if ((blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT))
+   (*count)++;
+
+   return true;
+}
+
+static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_tags *tags)
+{
+   unsigned count = 0;
+
+   blk_mq_all_tag_busy_iter(tags, blk_mq_count_inflight_rq, &count);
+
+   return count;
+}
+
+static void blk_mq_hctx_drain_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
+   while (1) {
+   if (!blk_mq_tags_inflight_rqs(hctx->tags))
+   break;
+   msleep(5);
+   }
+}
+
 static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
 {
+   struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+   struct blk_mq_hw_ctx, cpuhp_online);
+
+   if ((cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) == cpu) &&
+   (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) >=
+nr_cpu_ids)) {
+   set_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
+   blk_mq_hctx_drain_inflight_rqs(hctx);
+}
return 0;
 }
 
@@ -2246,6 +2284,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
type = hctx->type;
 
+   clear_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
+
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_lists[type])) {
list_splice_init(&ctx->rq_lists[type], &tmp);
-- 
2.20.1



[PATCH V3 2/5] blk-mq: prepare for draining IO when hctx's all CPUs are offline

2019-10-07 Thread Ming Lei
Most blk-mq drivers depend on the managed IRQ's auto-affinity to set up
queue mapping. Thomas mentioned the following point[1]:

"
 That was the constraint of managed interrupts from the very beginning:

  The driver/subsystem has to quiesce the interrupt line and the associated
  queue _before_ it gets shutdown in CPU unplug and not fiddle with it
  until it's restarted by the core when the CPU is plugged in again.
"

However, the current blk-mq implementation doesn't quiesce the hw queue before
the last CPU in the hctx is shut down. Even worse, CPUHP_BLK_MQ_DEAD is
a cpuhp state handled after the CPU is down, so there isn't any chance
for blk-mq to quiesce the hctx wrt. CPU hotplug.

Add new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE for blk-mq to stop queues
and wait for completion of in-flight requests.

In the following patch we will stop the hw queue and wait for completion of
in-flight requests when one hctx is becoming dead. Doing so may cause a
deadlock for some stacking blk-mq drivers, such as dm-rq and loop.

Add the blk-mq flag BLK_MQ_F_NO_MANAGED_IRQ and mark it for dm-rq and
loop, so we needn't wait for completion of in-flight requests from
dm-rq & loop, and the potential deadlock can be avoided.

[1] 
https://lore.kernel.org/linux-block/alpine.deb.2.21.1904051331270.1...@nanos.tec.linutronix.de/

Cc: John Garry 
Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq-debugfs.c |  1 +
 block/blk-mq.c | 13 +
 drivers/block/loop.c   |  2 +-
 drivers/md/dm-rq.c |  2 +-
 include/linux/blk-mq.h |  2 ++
 include/linux/cpuhotplug.h |  1 +
 6 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index af40a02c46ee..24fff8c90942 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -240,6 +240,7 @@ static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(TAG_SHARED),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
+   HCTX_FLAG_NAME(NO_MANAGED_IRQ),
 };
 #undef HCTX_FLAG_NAME
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ec791156e9cc..a664f196782a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2225,6 +2225,11 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
return -ENOMEM;
 }
 
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+   return 0;
+}
+
 /*
  * 'cpu' is going away. splice any existing rq_list entries from this
  * software queue to the hw queue dispatch list, and ensure that it
@@ -2261,6 +2266,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 
 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
+   if (!(hctx->flags & BLK_MQ_F_NO_MANAGED_IRQ))
+   cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+   &hctx->cpuhp_online);
cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
&hctx->cpuhp_dead);
 }
@@ -2320,6 +2328,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
 {
hctx->queue_num = hctx_idx;
 
+   if (!(hctx->flags & BLK_MQ_F_NO_MANAGED_IRQ))
+   cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+   &hctx->cpuhp_online);
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
hctx->tags = set->tags[hctx_idx];
@@ -3547,6 +3558,8 @@ static int __init blk_mq_init(void)
 {
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
+   cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
+   NULL, blk_mq_hctx_notify_online);
return 0;
 }
 subsys_initcall(blk_mq_init);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f6f77eaa7217..751a28a1d4b0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1999,7 +1999,7 @@ static int loop_add(struct loop_device **l, int i)
lo->tag_set.queue_depth = 128;
lo->tag_set.numa_node = NUMA_NO_NODE;
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_NO_MANAGED_IRQ;
lo->tag_set.driver_data = lo;
 
err = blk_mq_alloc_tag_set(&lo->tag_set);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 3f8577e2c13b..5f1ff70ac029 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
md->tag_set->ops = &dm_mq_ops;
md->tag_set

[PATCH V3 4/5] blk-mq: re-submit IO in case that hctx is dead

2019-10-07 Thread Ming Lei
When all CPUs in one hctx are offline, we shouldn't run this hw queue
for completing requests any more.

So steal the bios from the request, resubmit them, and finally free
the request in blk_mq_hctx_notify_dead().

Cc: John Garry 
Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 52 +++---
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3384242202eb..4153c1c4e2aa 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2268,10 +2268,34 @@ static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
return 0;
 }
 
+static void blk_mq_resubmit_io(struct request *rq)
+{
+   struct bio_list list;
+   struct bio *bio;
+
+   bio_list_init(&list);
+   blk_steal_bios(&list, rq);
+
+   /*
+* Free the old empty request before submitting bio for avoiding
+* potential deadlock
+*/
+   blk_mq_cleanup_rq(rq);
+   blk_mq_end_request(rq, 0);
+
+   while (true) {
+   bio = bio_list_pop(&list);
+   if (!bio)
+   break;
+
+   generic_make_request(bio);
+   }
+}
+
 /*
- * 'cpu' is going away. splice any existing rq_list entries from this
- * software queue to the hw queue dispatch list, and ensure that it
- * gets run.
+ * 'cpu' has gone away. If this hctx is dead, we can't dispatch request
+ * to the hctx any more, so steal bios from requests of this hctx, and
+ * re-submit them to the request queue, and free these requests finally.
  */
 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 {
@@ -2279,6 +2303,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
struct blk_mq_ctx *ctx;
LIST_HEAD(tmp);
enum hctx_type type;
+   bool hctx_dead;
+   struct request *rq;
 
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
@@ -2286,6 +2312,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 
clear_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
 
+   hctx_dead = cpumask_first_and(hctx->cpumask, cpu_online_mask) >=
+   nr_cpu_ids;
+
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_lists[type])) {
list_splice_init(&ctx->rq_lists[type], &tmp);
@@ -2296,11 +2325,20 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
if (list_empty(&tmp))
return 0;
 
-   spin_lock(&hctx->lock);
-   list_splice_tail_init(&tmp, &hctx->dispatch);
-   spin_unlock(&hctx->lock);
+   if (!hctx_dead) {
+   spin_lock(&hctx->lock);
+   list_splice_tail_init(&tmp, &hctx->dispatch);
+   spin_unlock(&hctx->lock);
+   blk_mq_run_hw_queue(hctx, true);
+   return 0;
+   }
+
+   while (!list_empty(&tmp)) {
+   rq = list_entry(tmp.next, struct request, queuelist);
+   list_del_init(&rq->queuelist);
+   blk_mq_resubmit_io(rq);
+   }
 
-   blk_mq_run_hw_queue(hctx, true);
return 0;
 }
 
-- 
2.20.1



Re: [PATCH V2 RESEND 4/5] blk-mq: re-submit IO in case that hctx is dead

2019-10-07 Thread Ming Lei
On Mon, Oct 07, 2019 at 08:27:38AM +0200, Hannes Reinecke wrote:
> On 10/6/19 4:45 AM, Ming Lei wrote:
> > When all CPUs in one hctx are offline, we shouldn't run this hw queue
> > for completing request any more.
> > 
> > So steal bios from the request, and resubmit them, and finally free
> > the request in blk_mq_hctx_notify_dead().
> > 
> > Cc: Bart Van Assche 
> > Cc: Hannes Reinecke 
> > Cc: Christoph Hellwig 
> > Cc: Thomas Gleixner 
> > Cc: Keith Busch 
> > Signed-off-by: Ming Lei 
> > ---
> >  block/blk-mq.c | 48 +---
> >  1 file changed, 41 insertions(+), 7 deletions(-)
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index d991c122abf2..0b35fdbd1f17 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -2280,10 +2280,30 @@ static int blk_mq_hctx_notify_online(unsigned int 
> > cpu, struct hlist_node *node)
> > return 0;
> >  }
> >  
> > +static void blk_mq_resubmit_io(struct request *rq)
> > +{
> > +   struct bio_list list;
> > +   struct bio *bio;
> > +
> > +   bio_list_init(&list);
> > +   blk_steal_bios(&list, rq);
> > +
> > +   while (true) {
> > +   bio = bio_list_pop(&list);
> > +   if (!bio)
> > +   break;
> > +
> > +   generic_make_request(bio);
> > +   }
> > +
> > +   blk_mq_cleanup_rq(rq);
> > +   blk_mq_end_request(rq, 0);
> > +}
> > +
> Hmm. Not sure if this is a good idea.
> Shouldn't we call 'blk_mq_end_request()' before calling
> generic_make_request()?
> otherwise the cloned request might be completed before original one,
> which looks a bit dodgy to me; and might lead to quite a recursion if we
> have several dead cpus to content with ...

Good catch, we should have freed the old empty request before calling
generic_make_request(), will fix it in V3.


Thanks,
Ming


Re: [PATCH V2 RESEND 3/5] blk-mq: stop to handle IO before hctx's all CPUs become offline

2019-10-07 Thread Ming Lei
On Mon, Oct 07, 2019 at 11:23:22AM +0100, John Garry wrote:
> On 06/10/2019 03:45, Ming Lei wrote:
> > +   }
> > +}
> > +
> > +static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node 
> > *node)
> > +{
> > +   struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
> > +   struct blk_mq_hw_ctx, cpuhp_online);
> > +   unsigned prev_cpu = -1;
> > +
> > +   while (true) {
> > +   unsigned next_cpu = cpumask_next_and(prev_cpu, hctx->cpumask,
> > +   cpu_online_mask);
> > +
> > +   if (next_cpu >= nr_cpu_ids)
> > +   break;
> > +
> > +   /* return if there is other online CPU on this hctx */
> > +   if (next_cpu != cpu)
> > +   return 0;
> > +
> > +   prev_cpu = next_cpu;
> > +   }
> > +
> > +   set_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
> > +   blk_mq_drain_inflight_rqs(hctx);
> > +
> 
> Does this do the same:
> 
> {
>   struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
>   struct blk_mq_hw_ctx, cpuhp_online);
>   cpumask_var_t tmp;
> 
>   cpumask_and(tmp, hctx->cpumask, cpu_online_mask);
> 
>   /* test if there is any other cpu online in the hctx cpu mask */
>   if (cpumask_any_but(tmp, cpu) < nr_cpu_ids)
>   return 0;
> 
>   set_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
>   blk_mq_drain_inflight_rqs(hctx);
> 
>   return 0;
> }
> 
> If so, it's more readable and concise.

Yes, but then we would have to allocate space for 'tmp'; that is what this
patch tries to avoid, given the logic isn't too complicated.
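
Just to illustrate, a minimal sketch of what the cpumask_and() variant would
need (with CONFIG_CPUMASK_OFFSTACK the cpumask_var_t has to be allocated, and
that allocation can fail):

	cpumask_var_t tmp;

	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
		return 0;	/* or fall back to the open-coded loop */

	cpumask_and(tmp, hctx->cpumask, cpu_online_mask);

	/* is any CPU of this hctx other than @cpu still online? */
	if (cpumask_any_but(tmp, cpu) < nr_cpu_ids) {
		free_cpumask_var(tmp);
		return 0;
	}
	free_cpumask_var(tmp);

	set_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
	blk_mq_drain_inflight_rqs(hctx);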

> 
> 
> BTW, You could have added my Tested-by tags...

OK, I will add it in V3.


Thanks,
Ming


Re: [PATCH V2 RESEND 3/5] blk-mq: stop to handle IO before hctx's all CPUs become offline

2019-10-07 Thread Ming Lei
On Mon, Oct 07, 2019 at 08:23:29AM +0200, Hannes Reinecke wrote:
> On 10/6/19 4:45 AM, Ming Lei wrote:
> > Most of blk-mq drivers depend on managed IRQ's auto-affinity to setup
> > up queue mapping. Thomas mentioned the following point[1]:
> > 
> > "
> >  That was the constraint of managed interrupts from the very beginning:
> > 
> >   The driver/subsystem has to quiesce the interrupt line and the associated
> >   queue _before_ it gets shutdown in CPU unplug and not fiddle with it
> >   until it's restarted by the core when the CPU is plugged in again.
> > "
> > 
> > However, current blk-mq implementation doesn't quiesce hw queue before
> > the last CPU in the hctx is shutdown. Even worse, CPUHP_BLK_MQ_DEAD is
> > one cpuhp state handled after the CPU is down, so there isn't any chance
> > to quiesce hctx for blk-mq wrt. CPU hotplug.
> > 
> > Add new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE for blk-mq to stop queues
> > and wait for completion of in-flight requests.
> > 
> > [1] 
> > https://lore.kernel.org/linux-block/alpine.deb.2.21.1904051331270.1...@nanos.tec.linutronix.de/
> > 
> > Cc: Bart Van Assche 
> > Cc: Hannes Reinecke 
> > Cc: Christoph Hellwig 
> > Cc: Thomas Gleixner 
> > Cc: Keith Busch 
> > Signed-off-by: Ming Lei 
> > ---
> >  block/blk-mq-tag.c |  2 +-
> >  block/blk-mq-tag.h |  2 ++
> >  block/blk-mq.c | 65 ++
> >  include/linux/blk-mq.h |  1 +
> >  include/linux/cpuhotplug.h |  1 +
> >  5 files changed, 70 insertions(+), 1 deletion(-)
> > 
> I really don't like the zillions of 'XXX_in_flight()' helper in blk-mq;
> blk_mq_queue_inflight(), blk_mq_in_flight(), blk_mq_in_flight_rw() et al.
> Can't you implement your one on top of the already existing?

This one returns the in-flight rqs on a specific tags (hctx); so far there is
no such interface, which is why blk_mq_all_tag_busy_iter is exported out
in this patch.


Thanks,
Ming


Re: [PATCH V2 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-05 Thread Ming Lei
On Wed, Oct 02, 2019 at 08:36:52AM -0600, Jens Axboe wrote:
> On 10/2/19 3:56 AM, John Garry wrote:
> > On 22/08/2019 18:39, John Garry wrote:
> >> On 12/08/2019 14:43, Ming Lei wrote:
> >>> Hi,
> >>>
> >>> Thomas mentioned:
> >>>  "
> >>>   That was the constraint of managed interrupts from the very
> >>> beginning:
> >>>
> >>>The driver/subsystem has to quiesce the interrupt line and the
> >>> associated
> >>>queue _before_ it gets shutdown in CPU unplug and not fiddle
> >>> with it
> >>>until it's restarted by the core when the CPU is plugged in again.
> >>>  "
> >>>
> >>> But no drivers or blk-mq do that before one hctx becomes dead(all
> >>> CPUs for one hctx are offline), and even it is worse, blk-mq stills tries
> >>> to run hw queue after hctx is dead, see blk_mq_hctx_notify_dead().
> >>>
> >>> This patchset tries to address the issue by two stages:
> >>>
> >>> 1) add one new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE
> >>>
> >>> - mark the hctx as internal stopped, and drain all in-flight requests
> >>> if the hctx is going to be dead.
> >>>
> >>> 2) re-submit IO in the state of CPUHP_BLK_MQ_DEAD after the hctx
> >>> becomes dead
> >>>
> >>> - steal bios from the request, and resubmit them via
> >>> generic_make_request(),
> >>> then these IO will be mapped to other live hctx for dispatch
> >>>
> >>> Please comment & review, thanks!
> >>>
> >>> V2:
> >>>  - patch4 & patch 5 in V1 have been merged to block tree, so remove
> >>>them
> >>>  - address comments from John Garry and Minwoo
> >>>
> >>>
> >>> Ming Lei (5):
> >>>blk-mq: add new state of BLK_MQ_S_INTERNAL_STOPPED
> >>>blk-mq: add blk-mq flag of BLK_MQ_F_NO_MANAGED_IRQ
> >>>blk-mq: stop to handle IO before hctx's all CPUs become offline
> >>>blk-mq: re-submit IO in case that hctx is dead
> >>>blk-mq: handle requests dispatched from IO scheduler in case that hctx
> >>>  is dead
> >>
> >> Hi Ming,
> >>
> >> This looks to fix the hotplug issue for me.
> >>
> >> Previously I could manufacture a scenario while running fio where I got
> >> IO timeouts, like this:
> >>
> >> root@(none)$ echo 0 > ./sys/devices/system/cpu/cpu0/online
> >> [  296.897627] process 891 (fio) no longer affine to cpu0
> >> [  296.898488] process 893 (fio) no longer affine to cpu0
> >> [  296.910270] process 890 (fio) no longer affine to cpu0
> >> [  296.927322] IRQ 775: no longer affine to CPU0
> >> [  296.932762] CPU0: shutdown
> >> [  296.935469] psci: CPU0 killed.
> >> root@(none)$ [  326.971962] sas: Enter sas_scsi_recover_host busy: 61
> >> failed: 61
> >> [  326.977978] sas: sas_scsi_find_task: aborting task 0xe2cdc79b
> >> root@(none)$ [  333.047964] hisi_sas_v3_hw :74:02.0: internal task
> >> abort: timeout and not done.
> >> [  333.055616] hisi_sas_v3_hw :74:02.0: abort task: internal abort (-5)
> >> [  333.062306] sas: sas_scsi_find_task: querying task 0xe2cdc79b
> >> [  333.068776] sas: sas_scsi_find_task: task 0xe2cdc79b not at LU
> >> [  333.075295] sas: task 0xe2cdc79b is not at LU: I_T recover
> >> [  333.081464] sas: I_T nexus reset for dev 5000c500a7b95a49
> >>
> >> Please notice the 30-second delay for the SCSI IO timeout.
> >>
> >> And now I don't see it; here's a sample for irq shutdown:
> >> root@(none)$ echo 0 > ./sys/devices/system/cpu/cpu0/online
> >> [  344.608148] process 849 (fio) no longer affine to cpu0
> >> [  344.608639] process 848 (fio) no longer affine to cpu0
> >> [  344.609454] process 850 (fio) no longer affine to cpu0
> >> [  344.643481] process 847 (fio) no longer affine to cpu0
> >> [  346.213842] IRQ 775: no longer affine to CPU0
> >> [  346.219712] CPU0: shutdown
> >> [  346.222425] psci: CPU0 killed.
> >>
> >> Please notice the ~1.5s pause, which would be the queue draining.
> >>
> >> So FWIW:
> >> Tested-by: John Garry 
> >>
> >> JFYI, I tested on 5.3-rc5 and cherry-picked
> >> https://github.com/ming1/linux/commit/0d2cd3c99bb0fe81d2c0ca5d68e02bdc4521d4d6
> >> and "blk-mq: add callback of .cleanup_rq".
> >>
> >> Cheers,
> >> John
> > 
> > Hi Jens,
> > 
> > I don't mean to be pushy, but can we consider to get these patches from
> > Ming merged?
> > 
> > As above, I tested on my SCSI driver and it works. I also tested on an
> > NVMe disk, and it solves the condition which generates this message:
> > root@(none)$ echo 0 > /sys/devices/system/cpu/cpu2/online
> > [  465.635960] CPU2: shutdown
> > [  465.638662] psci: CPU2 killed.
> > [  111.381653] nvme nvme0: I/O 705 QID 18 timeout, completion polled
> > 
> > (that's on top off v5.4-rc1)
> 
> Ming, can you repost the series?

It has been resent out just now.

Thanks,
Ming


[PATCH V2 RESEND 2/5] blk-mq: add blk-mq flag of BLK_MQ_F_NO_MANAGED_IRQ

2019-10-05 Thread Ming Lei
In the following patch we will stop the hw queue and wait for completion of
in-flight requests when one hctx is becoming dead. Doing so may cause a
deadlock for some stacking blk-mq drivers, such as dm-rq and loop.

Add the blk-mq flag BLK_MQ_F_NO_MANAGED_IRQ and mark it for dm-rq and
loop, so we needn't wait for completion of in-flight requests of
dm-rq & loop, and the potential deadlock can be avoided.

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq-debugfs.c | 1 +
 drivers/block/loop.c   | 2 +-
 drivers/md/dm-rq.c | 2 +-
 include/linux/blk-mq.h | 1 +
 4 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index af40a02c46ee..24fff8c90942 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -240,6 +240,7 @@ static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(TAG_SHARED),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
+   HCTX_FLAG_NAME(NO_MANAGED_IRQ),
 };
 #undef HCTX_FLAG_NAME
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f6f77eaa7217..751a28a1d4b0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1999,7 +1999,7 @@ static int loop_add(struct loop_device **l, int i)
lo->tag_set.queue_depth = 128;
lo->tag_set.numa_node = NUMA_NO_NODE;
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_NO_MANAGED_IRQ;
lo->tag_set.driver_data = lo;
 
err = blk_mq_alloc_tag_set(&lo->tag_set);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 3f8577e2c13b..5f1ff70ac029 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
md->tag_set->ops = &dm_mq_ops;
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
md->tag_set->numa_node = md->numa_node_id;
-   md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
+   md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_NO_MANAGED_IRQ;
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
md->tag_set->driver_data = md;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 079c282e4471..ee60885ec855 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -226,6 +226,7 @@ struct blk_mq_ops {
 enum {
BLK_MQ_F_SHOULD_MERGE   = 1 << 0,
BLK_MQ_F_TAG_SHARED = 1 << 1,
+   BLK_MQ_F_NO_MANAGED_IRQ = 1 << 2,
BLK_MQ_F_BLOCKING   = 1 << 5,
BLK_MQ_F_NO_SCHED   = 1 << 6,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
-- 
2.20.1



[PATCH V2 RESEND 3/5] blk-mq: stop to handle IO before hctx's all CPUs become offline

2019-10-05 Thread Ming Lei
Most blk-mq drivers depend on the managed IRQ's auto-affinity to set up
queue mapping. Thomas mentioned the following point[1]:

"
 That was the constraint of managed interrupts from the very beginning:

  The driver/subsystem has to quiesce the interrupt line and the associated
  queue _before_ it gets shutdown in CPU unplug and not fiddle with it
  until it's restarted by the core when the CPU is plugged in again.
"

However, the current blk-mq implementation doesn't quiesce the hw queue before
the last CPU in the hctx is shut down. Even worse, CPUHP_BLK_MQ_DEAD is
a cpuhp state handled after the CPU is down, so there isn't any chance
for blk-mq to quiesce the hctx wrt. CPU hotplug.

Add new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE for blk-mq to stop queues
and wait for completion of in-flight requests.

[1] 
https://lore.kernel.org/linux-block/alpine.deb.2.21.1904051331270.1...@nanos.tec.linutronix.de/

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq-tag.c |  2 +-
 block/blk-mq-tag.h |  2 ++
 block/blk-mq.c | 65 ++
 include/linux/blk-mq.h |  1 +
 include/linux/cpuhotplug.h |  1 +
 5 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 008388e82b5c..31828b82552b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -325,7 +325,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
  * true to continue iterating tags, false to stop.
  * @priv:  Will be passed as second argument to @fn.
  */
-static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
busy_tag_iter_fn *fn, void *priv)
 {
if (tags->nr_reserved_tags)
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..321fd6f440e6 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -35,6 +35,8 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv);
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+   busy_tag_iter_fn *fn, void *priv);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ec791156e9cc..d991c122abf2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2225,6 +2225,61 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
return -ENOMEM;
 }
 
+static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
+bool reserved)
+{
+   unsigned *count = data;
+
+   if ((blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT))
+   (*count)++;
+
+   return true;
+}
+
+static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_tags *tags)
+{
+   unsigned count = 0;
+
+   blk_mq_all_tag_busy_iter(tags, blk_mq_count_inflight_rq, &count);
+
+   return count;
+}
+
+static void blk_mq_drain_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
+   while (1) {
+   if (!blk_mq_tags_inflight_rqs(hctx->tags))
+   break;
+   msleep(5);
+   }
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+   struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+   struct blk_mq_hw_ctx, cpuhp_online);
+   unsigned prev_cpu = -1;
+
+   while (true) {
+   unsigned next_cpu = cpumask_next_and(prev_cpu, hctx->cpumask,
+   cpu_online_mask);
+
+   if (next_cpu >= nr_cpu_ids)
+   break;
+
+   /* return if there is other online CPU on this hctx */
+   if (next_cpu != cpu)
+   return 0;
+
+   prev_cpu = next_cpu;
+   }
+
+   set_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
+   blk_mq_drain_inflight_rqs(hctx);
+
+   return 0;
+}
+
 /*
  * 'cpu' is going away. splice any existing rq_list entries from this
  * software queue to the hw queue dispatch list, and ensure that it
@@ -2241,6 +2296,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
type = hctx->type;
 
+   clear_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
+
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_lists[type])) {
list_splice_init(&ctx->rq_lists[type], &tmp);
@@ -2261,6 +2318,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, 
struct h

[PATCH V2 RESEND 5/5] blk-mq: handle requests dispatched from IO scheduler in case that hctx is dead

2019-10-05 Thread Ming Lei
If the hctx becomes dead, all in-queue IO requests aimed at this hctx have to
be re-submitted, so also cover requests queued in the scheduler queue.

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 30 +-
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0b35fdbd1f17..94fd47cef1bc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2313,6 +2313,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
enum hctx_type type;
bool hctx_dead;
struct request *rq;
+   struct elevator_queue *e;
 
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
@@ -2323,12 +2324,31 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
hctx_dead = cpumask_first_and(hctx->cpumask, cpu_online_mask) >=
nr_cpu_ids;
 
-   spin_lock(&ctx->lock);
-   if (!list_empty(&ctx->rq_lists[type])) {
-   list_splice_init(&ctx->rq_lists[type], &tmp);
-   blk_mq_hctx_clear_pending(hctx, ctx);
+   e = hctx->queue->elevator;
+   if (!e) {
+   spin_lock(&ctx->lock);
+   if (!list_empty(&ctx->rq_lists[type])) {
+   list_splice_init(&ctx->rq_lists[type], &tmp);
+   blk_mq_hctx_clear_pending(hctx, ctx);
+   }
+   spin_unlock(&ctx->lock);
+   } else if (hctx_dead) {
+   LIST_HEAD(sched_tmp);
+
+   while ((rq = e->type->ops.dispatch_request(hctx))) {
+   if (rq->mq_hctx != hctx)
+   list_add(&rq->queuelist, &sched_tmp);
+   else
+   list_add(&rq->queuelist, &tmp);
+   }
+
+   while (!list_empty(&sched_tmp)) {
+   rq = list_entry(sched_tmp.next, struct request,
+   queuelist);
+   list_del_init(&rq->queuelist);
+   blk_mq_sched_insert_request(rq, true, true, true);
+   }
}
-   spin_unlock(&ctx->lock);
 
if (list_empty(&tmp))
return 0;
-- 
2.20.1



[PATCH V2 RESEND 4/5] blk-mq: re-submit IO in case that hctx is dead

2019-10-05 Thread Ming Lei
When all CPUs in one hctx are offline, we shouldn't run this hw queue
for completing requests any more.

So steal the bios from the request, resubmit them, and finally free
the request in blk_mq_hctx_notify_dead().

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 48 +---
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d991c122abf2..0b35fdbd1f17 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2280,10 +2280,30 @@ static int blk_mq_hctx_notify_online(unsigned int cpu, 
struct hlist_node *node)
return 0;
 }
 
+static void blk_mq_resubmit_io(struct request *rq)
+{
+   struct bio_list list;
+   struct bio *bio;
+
+   bio_list_init(&list);
+   blk_steal_bios(&list, rq);
+
+   while (true) {
+   bio = bio_list_pop(&list);
+   if (!bio)
+   break;
+
+   generic_make_request(bio);
+   }
+
+   blk_mq_cleanup_rq(rq);
+   blk_mq_end_request(rq, 0);
+}
+
 /*
- * 'cpu' is going away. splice any existing rq_list entries from this
- * software queue to the hw queue dispatch list, and ensure that it
- * gets run.
+ * 'cpu' has gone away. If this hctx is dead, we can't dispatch request
+ * to the hctx any more, so steal bios from requests of this hctx, and
+ * re-submit them to the request queue, and free these requests finally.
  */
 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 {
@@ -2291,6 +2311,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, 
struct hlist_node *node)
struct blk_mq_ctx *ctx;
LIST_HEAD(tmp);
enum hctx_type type;
+   bool hctx_dead;
+   struct request *rq;
 
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
@@ -2298,6 +2320,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, 
struct hlist_node *node)
 
clear_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
 
+   hctx_dead = cpumask_first_and(hctx->cpumask, cpu_online_mask) >=
+   nr_cpu_ids;
+
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_lists[type])) {
list_splice_init(&ctx->rq_lists[type], &tmp);
@@ -2308,11 +2333,20 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, 
struct hlist_node *node)
if (list_empty(&tmp))
return 0;
 
-   spin_lock(&hctx->lock);
-   list_splice_tail_init(&tmp, &hctx->dispatch);
-   spin_unlock(&hctx->lock);
+   if (!hctx_dead) {
+   spin_lock(&hctx->lock);
+   list_splice_tail_init(&tmp, &hctx->dispatch);
+   spin_unlock(&hctx->lock);
+   blk_mq_run_hw_queue(hctx, true);
+   return 0;
+   }
+
+   while (!list_empty(&tmp)) {
+   rq = list_entry(tmp.next, struct request, queuelist);
+   list_del_init(&rq->queuelist);
+   blk_mq_resubmit_io(rq);
+   }
 
-   blk_mq_run_hw_queue(hctx, true);
return 0;
 }
 
-- 
2.20.1



[PATCH V2 RESEND 0/5] blk-mq: improvement on handling IO during CPU hotplug

2019-10-05 Thread Ming Lei
Hi,

Thomas mentioned:
"
 That was the constraint of managed interrupts from the very beginning:

  The driver/subsystem has to quiesce the interrupt line and the associated
  queue _before_ it gets shutdown in CPU unplug and not fiddle with it
  until it's restarted by the core when the CPU is plugged in again.
"

But neither drivers nor blk-mq do that before one hctx becomes dead (all
CPUs for one hctx are offline), and even worse, blk-mq still tries
to run the hw queue after the hctx is dead, see blk_mq_hctx_notify_dead().

This patchset tries to address the issue in two stages:

1) add one new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE

- mark the hctx as internally stopped, and drain all in-flight requests
if the hctx is going to be dead.

2) re-submit IO in the state of CPUHP_BLK_MQ_DEAD after the hctx becomes dead

- steal bios from the requests and resubmit them via generic_make_request(),
so these IOs will be mapped to other live hctxs for dispatch (see the
sketch after this list)
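
For illustration, here is a rough sketch of how each hctx could be attached
to the two hotplug states (sketch only; the actual wiring is done in patch 3
of this series and the details there may differ):

/*
 * Sketch, not the actual patch: register every hctx with both hotplug
 * states.  The CPUHP_AP_BLK_MQ_ONLINE callback added in patch 3 then gets
 * the chance to set BLK_MQ_S_INTERNAL_STOPPED and drain in-flight requests
 * while the last CPU of the hctx is going down, and
 * blk_mq_hctx_notify_dead() re-submits whatever is left once that CPU is
 * dead.
 */
static void blk_mq_sketch_register_hctx(struct blk_mq_hw_ctx *hctx)
{
	cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
					 &hctx->cpuhp_online);
	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD,
					 &hctx->cpuhp_dead);
}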

Please comment & review, thanks!

V2:
- patch 4 & patch 5 in V1 have been merged into the block tree, so remove
  them
- address comments from John Garry and Minwoo

Ming Lei (5):
  blk-mq: add new state of BLK_MQ_S_INTERNAL_STOPPED
  blk-mq: add blk-mq flag of BLK_MQ_F_NO_MANAGED_IRQ
  blk-mq: stop to handle IO before hctx's all CPUs become offline
  blk-mq: re-submit IO in case that hctx is dead
  blk-mq: handle requests dispatched from IO scheduler in case that hctx
is dead

 block/blk-mq-debugfs.c |   2 +
 block/blk-mq-tag.c |   2 +-
 block/blk-mq-tag.h |   2 +
 block/blk-mq.c | 143 +
 block/blk-mq.h |   3 +-
 drivers/block/loop.c   |   2 +-
 drivers/md/dm-rq.c |   2 +-
 include/linux/blk-mq.h |   5 ++
 include/linux/cpuhotplug.h |   1 +
 9 files changed, 146 insertions(+), 16 deletions(-)

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
-- 
2.20.1



[PATCH V2 RESEND 1/5] blk-mq: add new state of BLK_MQ_S_INTERNAL_STOPPED

2019-10-05 Thread Ming Lei
Add a new hw queue state of BLK_MQ_S_INTERNAL_STOPPED, which prepares
for stopping the hw queue before all CPUs of this hctx become offline.

We can't reuse BLK_MQ_S_STOPPED because that state can be cleared during IO
completion.

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: Thomas Gleixner 
Cc: Keith Busch 
Signed-off-by: Ming Lei 
---
 block/blk-mq-debugfs.c | 1 +
 block/blk-mq.h | 3 ++-
 include/linux/blk-mq.h | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b3f2ba483992..af40a02c46ee 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(STOPPED),
HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART),
+   HCTX_STATE_NAME(INTERNAL_STOPPED),
 };
 #undef HCTX_STATE_NAME
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 32c62c64e6c2..63717573bc16 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -176,7 +176,8 @@ static inline struct blk_mq_tags 
*blk_mq_tags_from_data(struct blk_mq_alloc_data
 
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 {
-   return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
+   return test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+   test_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
 }
 
 static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0bf056de5cc3..079c282e4471 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -235,6 +235,9 @@ enum {
BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_S_SCHED_RESTART  = 2,
 
+   /* hw queue is internal stopped, driver do not use it */
+   BLK_MQ_S_INTERNAL_STOPPED   = 3,
+
BLK_MQ_MAX_DEPTH= 10240,
 
BLK_MQ_CPU_WORK_BATCH   = 8,
-- 
2.20.1



Re: [PATCH v5] block: fix null pointer dereference in blk_mq_rq_timed_out()

2019-09-27 Thread Ming Lei
On Fri, Sep 27, 2019 at 04:19:55PM +0800, Yufen Yu wrote:
> We got a null pointer deference BUG_ON in blk_mq_rq_timed_out()
> as following:
> 
> [  108.825472] BUG: kernel NULL pointer dereference, address: 0040
> [  108.827059] PGD 0 P4D 0
> [  108.827313] Oops:  [#1] SMP PTI
> [  108.827657] CPU: 6 PID: 198 Comm: kworker/6:1H Not tainted 5.3.0-rc8+ #431
> [  108.829503] Workqueue: kblockd blk_mq_timeout_work
> [  108.829913] RIP: 0010:blk_mq_check_expired+0x258/0x330
> [  108.838191] Call Trace:
> [  108.838406]  bt_iter+0x74/0x80
> [  108.838665]  blk_mq_queue_tag_busy_iter+0x204/0x450
> [  108.839074]  ? __switch_to_asm+0x34/0x70
> [  108.839405]  ? blk_mq_stop_hw_queue+0x40/0x40
> [  108.839823]  ? blk_mq_stop_hw_queue+0x40/0x40
> [  108.840273]  ? syscall_return_via_sysret+0xf/0x7f
> [  108.840732]  blk_mq_timeout_work+0x74/0x200
> [  108.841151]  process_one_work+0x297/0x680
> [  108.841550]  worker_thread+0x29c/0x6f0
> [  108.841926]  ? rescuer_thread+0x580/0x580
> [  108.842344]  kthread+0x16a/0x1a0
> [  108.842666]  ? kthread_flush_work+0x170/0x170
> [  108.843100]  ret_from_fork+0x35/0x40
> 
> The bug is caused by the race between timeout handle and completion for
> flush request.
> 
> When timeout handle function blk_mq_rq_timed_out() try to read
> 'req->q->mq_ops', the 'req' have completed and reinitiated by next
> flush request, which would call blk_rq_init() to clear 'req' as 0.
> 
> After commit 12f5b93145 ("blk-mq: Remove generation seqeunce"),
> normal requests lifetime are protected by refcount. Until 'rq->ref'
> drop to zero, the request can really be free. Thus, these requests
> cannot been reused before timeout handle finish.
> 
> However, flush request has defined .end_io and rq->end_io() is still
> called even if 'rq->ref' doesn't drop to zero. After that, the 'flush_rq'
> can be reused by the next flush request handle, resulting in null
> pointer deference BUG ON.
> 
> We fix this problem by covering flush request with 'rq->ref'.
> If the refcount is not zero, flush_end_io() return and wait the
> last holder recall it. To record the request status, we add a new
> entry 'rq_status', which will be used in flush_end_io().
> 
> Cc: Ming Lei 
> Cc: Christoph Hellwig 
> Cc: Keith Busch 
> Cc: Bart Van Assche 
> Cc: sta...@vger.kernel.org # v4.18+
> Signed-off-by: Yufen Yu 
> 
> ---
> v2:
>  - move rq_status from struct request to struct blk_flush_queue
> v3:
>  - remove unnecessary '{}' pair.
> v4:
>  - let spinlock to protect 'fq->rq_status'
> v5:
>  - move rq_status after flush_running_idx member of struct blk_flush_queue
> ---
>  block/blk-flush.c | 10 ++
>  block/blk-mq.c|  5 -
>  block/blk.h   |  7 +++
>  3 files changed, 21 insertions(+), 1 deletion(-)
> 
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index aedd9320e605..1eec9cbe5a0a 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -214,6 +214,16 @@ static void flush_end_io(struct request *flush_rq, 
> blk_status_t error)
>  
>   /* release the tag's ownership to the req cloned from */
>   spin_lock_irqsave(&fq->mq_flush_lock, flags);
> +
> + if (!refcount_dec_and_test(&flush_rq->ref)) {
> + fq->rq_status = error;
> + spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
> + return;
> + }
> +
> + if (fq->rq_status != BLK_STS_OK)
> + error = fq->rq_status;
> +
>   hctx = flush_rq->mq_hctx;
>   if (!q->elevator) {
>   blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 20a49be536b5..e04fa9ab5574 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -912,7 +912,10 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx 
> *hctx,
>*/
>   if (blk_mq_req_expired(rq, next))
>   blk_mq_rq_timed_out(rq, reserved);
> - if (refcount_dec_and_test(&rq->ref))
> +
> + if (is_flush_rq(rq, hctx))
> + rq->end_io(rq, 0);
> + else if (refcount_dec_and_test(&rq->ref))
>   __blk_mq_free_request(rq);
>  
>   return true;
> diff --git a/block/blk.h b/block/blk.h
> index ed347f7a97b1..2d8cdafee799 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -19,6 +19,7 @@ struct blk_flush_queue {
>   unsigned intflush_queue_delayed:1;
>   unsigned intflush_pending_idx:1;
>   unsigned intflush_run

[PATCH 2/2] blk-mq: apply normal plugging for HDD

2019-09-27 Thread Ming Lei
Some HDD drives may expose multiple hw queues, such as MegaRaid, so
still apply normal plugging for such devices because sequential IO
may benefit a lot from plug merging.

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Damien Le Moal 
Cc: Dave Chinner 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d7aed6518e62..969dfe02fa7c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1983,10 +1983,14 @@ static blk_qc_t blk_mq_make_request(struct 
request_queue *q, struct bio *bio)
/* bypass scheduler for flush rq */
blk_insert_flush(rq);
blk_mq_run_hw_queue(data.hctx, true);
-   } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
+   } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
+   !blk_queue_nonrot(q))) {
/*
 * Use plugging if we have a ->commit_rqs() hook as well, as
 * we know the driver uses bd->last in a smart fashion.
+*
+* Use normal plugging if this disk is slow HDD, as sequential
+* IO may benefit a lot from plug merging.
 */
unsigned int request_count = plug->rq_count;
struct request *last = NULL;
-- 
2.20.1



[PATCH 1/2] blk-mq: respect io scheduler

2019-09-27 Thread Ming Lei
Now, in the case of real MQ, the io scheduler may be bypassed. Not only
may this hurt performance for some slow MQ devices, it also breaks zoned
devices, which depend on mq-deadline for respecting the write order within
one zone.

So don't bypass the io scheduler if one is set up.

This patch can basically double sequential write performance on MQ
scsi_debug when mq-deadline is applied.

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Damien Le Moal 
Cc: Dave Chinner 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 20a49be536b5..d7aed6518e62 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2003,6 +2003,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue 
*q, struct bio *bio)
}
 
blk_add_rq_to_plug(plug, rq);
+   } else if (q->elevator) {
+   blk_mq_sched_insert_request(rq, false, true, true);
} else if (plug && !blk_queue_nomerges(q)) {
/*
 * We do limited plugging. If the bio can be merged, do that.
@@ -2026,8 +2028,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue 
*q, struct bio *bio)
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
-   } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
-   !data.hctx->dispatch_busy)) {
+   } else if ((q->nr_hw_queues > 1 && is_sync) ||
+   !data.hctx->dispatch_busy) {
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
} else {
blk_mq_sched_insert_request(rq, false, true, true);
-- 
2.20.1



[PATCH 0/2] blk-mq: two improvemens on slow MQ devices

2019-09-27 Thread Ming Lei
Hi,

The 1st patch always applies the io scheduler path if 'none' isn't used,
so that sequential IO performance can be improved on slow MQ devices.
Also, write order for zoned devices can be maintained because zoned
devices require mq-deadline to do that.

The 2nd patch applies normal plugging for MQ HDDs.


Ming Lei (2):
  blk-mq: respect io scheduler
  blk-mq: apply normal plugging for HDD

 block/blk-mq.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

Cc: Bart Van Assche 
Cc: Hannes Reinecke 
Cc: Damien Le Moal 
Cc: Dave Chinner 

-- 
2.20.1



[PATCH] blk-mq: move lockdep_assert_held() into elevator_exit

2019-09-25 Thread Ming Lei
Commit c48dac137a62 ("block: don't hold q->sysfs_lock in elevator_init_mq")
removes q->sysfs_lock from elevator_init_mq(), but forgets to deal with
the lockdep_assert_held() called in blk_mq_sched_free_requests(), which is
run in the failure path of elevator_init_mq().

blk_mq_sched_free_requests() is called in the following 3 functions:

elevator_init_mq()
elevator_exit()
blk_cleanup_queue()

In blk_cleanup_queue(), blk_mq_sched_free_requests() is followed exactly
by 'mutex_lock(&q->sysfs_lock)'.

So move the lockdep_assert_held() from blk_mq_sched_free_requests()
into elevator_exit() to fix the report by syzbot.

Cc: Bart Van Assche 
Cc: Damien Le Moal 
Reported-by: syzbot+da3b7677bb913dc1b...@syzkaller.appspotmail.com
Fixes: c48dac137a62 ("block: don't hold q->sysfs_lock in elevator_init_mq")
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 2 --
 block/blk.h  | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c9d183d6c499..ca22afd47b3d 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -555,8 +555,6 @@ void blk_mq_sched_free_requests(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
 
-   lockdep_assert_held(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags)
blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
diff --git a/block/blk.h b/block/blk.h
index ed347f7a97b1..25773d668ec0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -194,6 +194,8 @@ void elv_unregister_queue(struct request_queue *q);
 static inline void elevator_exit(struct request_queue *q,
struct elevator_queue *e)
 {
+   lockdep_assert_held(&q->sysfs_lock);
+
blk_mq_sched_free_requests(q);
__elevator_exit(q, e);
 }
-- 
2.20.1



Re: WARNING in blk_mq_sched_free_requests (2)

2019-09-24 Thread Ming Lei
On Mon, Sep 23, 2019 at 11:26:11AM -0700, syzbot wrote:
> Hello,
> 
> syzbot found the following crash on:
> 
> HEAD commit:574cc453 Merge tag 'drm-next-2019-09-18' of git://anongit...
> git tree:   upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=167c3c7e60
> kernel config:  https://syzkaller.appspot.com/x/.config?x=4c1d6bfa784bebea
> dashboard link: https://syzkaller.appspot.com/bug?extid=da3b7677bb913dc1b737
> compiler:   gcc (GCC) 9.0.0 20181231 (experimental)
> 
> Unfortunately, I don't have any reproducer for this crash yet.
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+da3b7677bb913dc1b...@syzkaller.appspotmail.com
> 
> [ cut here ]
> WARNING: CPU: 1 PID: 9291 at block/blk-mq-sched.c:558
> blk_mq_sched_free_requests.cold+0x11/0x21 block/blk-mq-sched.c:558
> Kernel panic - not syncing: panic_on_warn set ...
> CPU: 1 PID: 9291 Comm: syz-executor.1 Not tainted 5.3.0+ #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x172/0x1f0 lib/dump_stack.c:113
>  panic+0x2dc/0x755 kernel/panic.c:219
>  __warn.cold+0x20/0x4c kernel/panic.c:576
>  report_bug+0x263/0x2b0 lib/bug.c:186
>  fixup_bug arch/x86/kernel/traps.c:179 [inline]
>  fixup_bug arch/x86/kernel/traps.c:174 [inline]
>  do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272
>  do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291
>  invalid_op+0x23/0x30 arch/x86/entry/entry_64.S:1028
> RIP: 0010:blk_mq_sched_free_requests.cold+0x11/0x21 block/blk-mq-sched.c:558
> Code: fe 45 85 f6 0f 84 ab e9 ff ff e9 29 e8 ff ff 48 89 cf e8 43 0e 7d fe
> eb ce e8 bc c3 42 fe 48 c7 c7 00 65 e5 87 e8 84 47 2c fe <0f> 0b e9 47 f3 ff
> ff 90 90 90 90 90 90 90 90 90 55 48 89 e5 41 57
> RSP: 0018:88805aa6f9e0 EFLAGS: 00010286
> RAX: 0024 RBX:  RCX: 
> RDX:  RSI: 815c26d6 RDI: ed100b54df2e
> RBP: 88805aa6fa30 R08: 0024 R09: ed1015d260d1
> R10: ed1015d260d0 R11: 8880ae930687 R12: fff4
> R13: 8880a3100100 R14: 88808521e1d8 R15: 8880a3100100
>  blk_mq_init_sched+0x32c/0x766 block/blk-mq-sched.c:543
>  elevator_init_mq+0x1d3/0x3f0 block/elevator.c:719
>  __device_add_disk+0xd57/0x1230 block/genhd.c:705
>  device_add_disk+0x2b/0x40 block/genhd.c:763
>  add_disk include/linux/genhd.h:429 [inline]
>  loop_add+0x635/0x8d0 drivers/block/loop.c:2051
>  loop_control_ioctl drivers/block/loop.c:2152 [inline]
>  loop_control_ioctl+0x165/0x360 drivers/block/loop.c:2134
>  vfs_ioctl fs/ioctl.c:46 [inline]
>  file_ioctl fs/ioctl.c:509 [inline]
>  do_vfs_ioctl+0xdb6/0x13e0 fs/ioctl.c:696
>  ksys_ioctl+0xab/0xd0 fs/ioctl.c:713
>  __do_sys_ioctl fs/ioctl.c:720 [inline]
>  __se_sys_ioctl fs/ioctl.c:718 [inline]
>  __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718
>  do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x459a09
> Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
> 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff
> 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> RSP: 002b:7f4830110c78 EFLAGS: 0246 ORIG_RAX: 0010
> RAX: ffda RBX: 0003 RCX: 00459a09
> RDX:  RSI: 4c80 RDI: 0005
> RBP: 0075c118 R08:  R09: 
> R10:  R11: 0246 R12: 7f48301116d4
> R13: 004c3118 R14: 004d69f8 R15: 
> Kernel Offset: disabled
> Rebooting in 86400 seconds..
> 

We removed q->sysfs_lock from elevator_init_mq() in commit c48dac137a62a
("block: don't hold q->sysfs_lock in elevator_init_mq").

So the lockdep_assert_held() can be removed from blk_mq_sched_free_requests().

I will post a patch to fix the issue later.


Thanks,
Ming


Re: [PATCH] block: don't release queue's sysfs lock during switching elevator

2019-09-24 Thread Ming Lei
On Tue, Sep 24, 2019 at 11:37:09AM -0700, Bart Van Assche wrote:
> On 9/23/19 8:12 AM, Ming Lei wrote:
> > @@ -523,11 +521,9 @@ void elv_unregister_queue(struct request_queue *q)
> > kobject_uevent(&e->kobj, KOBJ_REMOVE);
> > kobject_del(&e->kobj);
> > -   mutex_lock(&q->sysfs_lock);
> > e->registered = 0;
> > /* Re-enable throttling in case elevator disabled it */
> > wbt_enable_default(q);
> > -   mutex_unlock(&q->sysfs_lock);
> > }
> >   }
> 
> Does this patch cause sysfs_lock to be held around kobject_del(&e->kobj)?

Yes.

> Since sysfs_lock is locked from inside elv_attr_show() and elv_attr_store(),

The request queue's sysfs_lock isn't required in elv_attr_show() and
elv_attr_store(); only the elevator's sysfs_lock is needed in those two
functions.

> does this mean that this patch reintroduces the lock inversion problem that
> was fixed recently?

No.

The lock inversion issue only existed on the kobjects q->kobj & q->mq_kobj,
and it was already fixed, given that the queue's sysfs_lock is required in
the .show/.store callbacks of these two kobjects' attributes.
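
To make that concrete, here is a condensed sketch of the elevator attribute
show path (simplified from block/elevator.c, so details may differ between
kernel versions):

/*
 * Condensed sketch: elevator attributes serialize on the elevator's own
 * sysfs_lock, not on q->sysfs_lock, so holding q->sysfs_lock around
 * kobject_del(&e->kobj) does not invert with this path.
 */
static ssize_t elv_attr_show_sketch(struct kobject *kobj,
				    struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry =
		container_of(attr, struct elv_fs_entry, attr);
	struct elevator_queue *e =
		container_of(kobj, struct elevator_queue, kobj);
	ssize_t error = -EIO;

	if (entry->show) {
		mutex_lock(&e->sysfs_lock);	/* elevator lock only */
		error = e->type ? entry->show(e, page) : -ENOENT;
		mutex_unlock(&e->sysfs_lock);
	}
	return error;
}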


thanks,
Ming


[PATCH] block: don't release queue's sysfs lock during switching elevator

2019-09-23 Thread Ming Lei
cecf5d87ff20 ("block: split .sysfs_lock into two locks") starts to
release & re-acquire sysfs_lock around registering/un-registering the
elevator queue while switching elevators, in order to avoid a potential
deadlock between showing & storing 'queue/iosched' attributes and
removing the elevator's kobject.

Turns out there isn't such a deadlock, because 'q->sysfs_lock' isn't
required in .show & .store of the queue/iosched attributes, and only the
elevator's sysfs lock is acquired in elv_iosched_store() and
elv_iosched_show(). So it is safe to hold the queue's sysfs lock when
registering/un-registering the elevator queue.

The biggest issue is that commit cecf5d87ff20 assumes that concurrent
writes on 'queue/scheduler' can't happen. However, this assumption isn't
true, because kernfs_fop_write() only guarantees that concurrent writes
aren't issued on the same open file; writes can still come from different
opens of the file. So we can't release & re-acquire the queue's sysfs
lock during elevator switching, otherwise a use-after-free on the
elevator could be triggered.

Fix the issue by not releasing the queue's sysfs lock while switching
the elevator.

Fixes: cecf5d87ff20 ("block: split .sysfs_lock into two locks")
Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-sysfs.c | 13 -
 block/elevator.c  | 31 +--
 2 files changed, 5 insertions(+), 39 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b82736c781c5..962fc0c44381 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -989,13 +989,11 @@ int blk_register_queue(struct gendisk *disk)
blk_mq_debugfs_register(q);
}
 
-   /*
-* The flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator
-* switch won't happen at all.
-*/
+   mutex_lock(&q->sysfs_lock);
if (q->elevator) {
ret = elv_register_queue(q, false);
if (ret) {
+   mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
@@ -1005,7 +1003,6 @@ int blk_register_queue(struct gendisk *disk)
has_elevator = true;
}
 
-   mutex_lock(&q->sysfs_lock);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register_queue(q);
@@ -1062,12 +1059,10 @@ void blk_unregister_queue(struct gendisk *disk)
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
 
-   /*
-* q->kobj has been removed, so it is safe to check if elevator
-* exists without holding q->sysfs_lock.
-*/
+   mutex_lock(&q->sysfs_lock);
if (q->elevator)
elv_unregister_queue(q);
+   mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
 
kobject_put(&disk_to_dev(disk)->kobj);
diff --git a/block/elevator.c b/block/elevator.c
index bba10e83478a..5437059c9261 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -503,9 +503,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
if (uevent)
kobject_uevent(&e->kobj, KOBJ_ADD);
 
-   mutex_lock(&q->sysfs_lock);
e->registered = 1;
-   mutex_unlock(&q->sysfs_lock);
}
return error;
 }
@@ -523,11 +521,9 @@ void elv_unregister_queue(struct request_queue *q)
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
 
-   mutex_lock(&q->sysfs_lock);
e->registered = 0;
/* Re-enable throttling in case elevator disabled it */
wbt_enable_default(q);
-   mutex_unlock(&q->sysfs_lock);
}
 }
 
@@ -590,32 +586,11 @@ int elevator_switch_mq(struct request_queue *q,
lockdep_assert_held(&q->sysfs_lock);
 
if (q->elevator) {
-   if (q->elevator->registered) {
-   mutex_unlock(&q->sysfs_lock);
-
-   /*
-* Concurrent elevator switch can't happen becasue
-* sysfs write is always exclusively on same file.
-*
-* Also the elevator queue won't be freed after
-* sysfs_lock is released becasue kobject_del() in
-* blk_unregister_queue() waits for completion of
-* .store & .show on its attributes.
-*/
+   if (q->

Re: [PATCH 2/2] blk-mq: always call into the scheduler in blk_mq_make_request()

2019-09-19 Thread Ming Lei
On Thu, Sep 19, 2019 at 10:21:54AM +, Damien Le Moal wrote:
> On 2019/09/19 11:45, Hannes Reinecke wrote:
> > From: Hannes Reinecke 
> > 
> > A scheduler might be attached even for devices exposing more than
> > one hardware queue, so the check for the number of hardware queue
> > is pointless and should be removed.
> > 
> > Signed-off-by: Hannes Reinecke 
> > ---
> >  block/blk-mq.c | 6 +-
> >  1 file changed, 1 insertion(+), 5 deletions(-)
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 44ff3c1442a4..faab542e4836 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -1931,7 +1931,6 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, 
> > struct request *rq)
> >  
> >  static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio 
> > *bio)
> >  {
> > -   const int is_sync = op_is_sync(bio->bi_opf);
> > const int is_flush_fua = op_is_flush(bio->bi_opf);
> > struct blk_mq_alloc_data data = { .flags = 0};
> > struct request *rq;
> > @@ -1977,7 +1976,7 @@ static blk_qc_t blk_mq_make_request(struct 
> > request_queue *q, struct bio *bio)
> > /* bypass scheduler for flush rq */
> > blk_insert_flush(rq);
> > blk_mq_run_hw_queue(data.hctx, true);
> > -   } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
> > +   } else if (plug && q->mq_ops->commit_rqs) {
> > /*
> >  * Use plugging if we have a ->commit_rqs() hook as well, as
> >  * we know the driver uses bd->last in a smart fashion.
> > @@ -2020,9 +2019,6 @@ static blk_qc_t blk_mq_make_request(struct 
> > request_queue *q, struct bio *bio)
> > blk_mq_try_issue_directly(data.hctx, same_queue_rq,
> > &cookie);
> > }
> > -   } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
> > -   !data.hctx->dispatch_busy)) {
> > -   blk_mq_try_issue_directly(data.hctx, rq, &cookie);
> 
> It may be worth mentioning that blk_mq_sched_insert_request() will do a direct
> insert of the request using __blk_mq_insert_request(). But that insert is
> slightly different from what blk_mq_try_issue_directly() does with
> __blk_mq_issue_directly() as the request in that case is passed along to the
> device using queue->mq_ops->queue_rq() while __blk_mq_insert_request() will 
> put
> the request in ctx->rq_lists[type].
> 
> This removes the optimized case !q->elevator && !data.hctx->dispatch_busy, 
> but I
> am not sure of the actual performance impact yet. We may want to patch
> blk_mq_sched_insert_request() to handle that case.

The optimization did improve IOPS of single-queue SCSI SSDs a lot, see

commit 6ce3dd6eec114930cf2035a8bcb1e80477ed79a8
Author: Ming Lei 
Date:   Tue Jul 10 09:03:31 2018 +0800

blk-mq: issue directly if hw queue isn't busy in case of 'none'

In case of 'none' io scheduler, when hw queue isn't busy, it isn't
necessary to enqueue request to sw queue and dequeue it from
sw queue because request may be submitted to hw queue asap without
extra cost, meantime there shouldn't be much request in sw queue,
and we don't need to worry about effect on IO merge.

There are still some single hw queue SCSI HBAs(HPSA, megaraid_sas, ...)
which may connect high performance devices, so 'none' is often required
for obtaining good performance.

This patch improves IOPS and decreases CPU unilization on megaraid_sas,
per Kashyap's test.
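
For reference, a minimal sketch of the fast path that commit added and that
the removal above drops (condensed from the code being removed in the quoted
hunk; not new code):

	/*
	 * 'none' scheduler and an idle hw queue: issue directly and skip
	 * the sw queue round trip.
	 */
	if (!q->elevator && !data.hctx->dispatch_busy)
		blk_mq_try_issue_directly(data.hctx, rq, &cookie);
	else
		blk_mq_sched_insert_request(rq, false, true, true);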


Thanks,
Ming


Re: [PATCH 1/2] blk-mq: fixup request re-insert in blk_mq_try_issue_list_directly()

2019-09-19 Thread Ming Lei
On Thu, Sep 19, 2019 at 11:45:46AM +0200, Hannes Reinecke wrote:
> From: Hannes Reinecke 
> 
> When blk_mq_request_issue_directly() returns BLK_STS_RESOURCE we
> need to requeue the I/O, but adding it to the global request list
> will mess up with the passed-in request list. So re-add the request

We always add the request to hctx->dispatch_list after .queue_rq() returns
BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE, so what is being messed up?

> to the original list and leave it to the caller to handle situations
> where the list wasn't completely emptied.
> 
> Signed-off-by: Hannes Reinecke 
> ---
>  block/blk-mq.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index b038ec680e84..44ff3c1442a4 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1899,8 +1899,7 @@ void blk_mq_try_issue_list_directly(struct 
> blk_mq_hw_ctx *hctx,
>   if (ret != BLK_STS_OK) {
>   if (ret == BLK_STS_RESOURCE ||
>   ret == BLK_STS_DEV_RESOURCE) {
> - blk_mq_request_bypass_insert(rq,
> - list_empty(list));
> + list_add(list, &rq->queuelist);

This way may let this request (with DONTPREP set) be merged with another rq
or bio, and potential data corruption may be caused; please see commit:

c616cbee97ae blk-mq: punt failed direct issue to dispatch list


Thanks,
Ming


Re: [PATCH v2] block: fix null pointer dereference in blk_mq_rq_timed_out()

2019-09-17 Thread Ming Lei
On Tue, Sep 17, 2019 at 03:03:12PM +0800, Yufen Yu wrote:
> We got a null pointer deference BUG_ON in blk_mq_rq_timed_out()
> as following:
> 
> [  108.825472] BUG: kernel NULL pointer dereference, address: 0040
> [  108.827059] PGD 0 P4D 0
> [  108.827313] Oops:  [#1] SMP PTI
> [  108.827657] CPU: 6 PID: 198 Comm: kworker/6:1H Not tainted 5.3.0-rc8+ #431
> [  108.829503] Workqueue: kblockd blk_mq_timeout_work
> [  108.829913] RIP: 0010:blk_mq_check_expired+0x258/0x330
> [  108.838191] Call Trace:
> [  108.838406]  bt_iter+0x74/0x80
> [  108.838665]  blk_mq_queue_tag_busy_iter+0x204/0x450
> [  108.839074]  ? __switch_to_asm+0x34/0x70
> [  108.839405]  ? blk_mq_stop_hw_queue+0x40/0x40
> [  108.839823]  ? blk_mq_stop_hw_queue+0x40/0x40
> [  108.840273]  ? syscall_return_via_sysret+0xf/0x7f
> [  108.840732]  blk_mq_timeout_work+0x74/0x200
> [  108.841151]  process_one_work+0x297/0x680
> [  108.841550]  worker_thread+0x29c/0x6f0
> [  108.841926]  ? rescuer_thread+0x580/0x580
> [  108.842344]  kthread+0x16a/0x1a0
> [  108.842666]  ? kthread_flush_work+0x170/0x170
> [  108.843100]  ret_from_fork+0x35/0x40
> 
> The bug is caused by the race between timeout handle and completion for
> flush request.
> 
> When timeout handle function blk_mq_rq_timed_out() try to read
> 'req->q->mq_ops', the 'req' have completed and reinitiated by next
> flush request, which would call blk_rq_init() to clear 'req' as 0.
> 
> After commit 12f5b93145 ("blk-mq: Remove generation seqeunce"),
> normal requests lifetime are protected by refcount. Until 'rq->ref'
> drop to zero, the request can really be free. Thus, these requests
> cannot been reused before timeout handle finish.
> 
> However, flush request has defined .end_io and rq->end_io() is still
> called even if 'rq->ref' doesn't drop to zero. After that, the 'flush_rq'
> can be reused by the next flush request handle, resulting in null
> pointer deference BUG ON.
> 
> We fix this problem by covering flush request with 'rq->ref'.
> If the refcount is not zero, flush_end_io() return and wait the
> last holder recall it. To record the request status, we add a new
> entry 'rq_status', which will be used in flush_end_io().
> 
> Cc: Ming Lei 
> Cc: Christoph Hellwig 
> Cc: Keith Busch 
> Signed-off-by: Yufen Yu 
> ---
>  block/blk-flush.c | 8 
>  block/blk-mq.c| 7 +--
>  block/blk.h   | 6 ++
>  3 files changed, 19 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index aedd9320e605..f3ef6ce05c78 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -212,6 +212,14 @@ static void flush_end_io(struct request *flush_rq, 
> blk_status_t error)
>   struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
>   struct blk_mq_hw_ctx *hctx;
>  
> + if (!refcount_dec_and_test(&flush_rq->ref)) {
> + fq->rq_status = error;
> + return;
> + }
> +
> + if (fq->rq_status != BLK_STS_OK)
> + error = fq->rq_status;
> +
>   /* release the tag's ownership to the req cloned from */
>   spin_lock_irqsave(&fq->mq_flush_lock, flags);
>   hctx = flush_rq->mq_hctx;
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0835f4d8d42e..3d2b2c2e9cdf 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -905,9 +905,12 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx 
> *hctx,
>*/
>   if (blk_mq_req_expired(rq, next))
>   blk_mq_rq_timed_out(rq, reserved);
> - if (refcount_dec_and_test(&rq->ref))
> - __blk_mq_free_request(rq);
>  
> + if (is_flush_rq(rq, hctx)) {
> + rq->end_io(rq, 0);
> + } else if (refcount_dec_and_test(&rq->ref)) {
> + __blk_mq_free_request(rq);
> + }

The above two pairs of '{}' can be removed.
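
i.e. something like this, with the same logic and the braces dropped:

	if (is_flush_rq(rq, hctx))
		rq->end_io(rq, 0);
	else if (refcount_dec_and_test(&rq->ref))
		__blk_mq_free_request(rq);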

>   return true;
>  }
>  
> diff --git a/block/blk.h b/block/blk.h
> index de6b2e146d6e..128bb53622ff 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -30,6 +30,7 @@ struct blk_flush_queue {
>*/
>   struct request  *orig_rq;
>   spinlock_t  mq_flush_lock;
> + blk_status_t    rq_status;
>  };
>  
>  extern struct kmem_cache *blk_requestq_cachep;
> @@ -47,6 +48,11 @@ static inline void __blk_get_queue(struct request_queue *q)
>   kobject_get(&q->kobj);
>  }
>  
> +static inline bool
> +is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) {
> + return hctx->fq->flush_rq == req;
> +}

We usually don't put '{' at the end of the function name line.
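
For example, the usual kernel style for the helper added above would be:

static inline bool
is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx)
{
	return hctx->fq->flush_rq == req;
}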

Once the above patch style comments are addressed, feel free to add:

Reviewed-by: Ming Lei 


thanks,
Ming


Re: [PATCH] block: fix null pointer dereference in blk_mq_rq_timed_out()

2019-09-16 Thread Ming Lei
On Mon, Sep 16, 2019 at 05:27:39PM +0800, Yufen Yu wrote:
> 
> 
> On 2019/9/12 18:07, Ming Lei wrote:
> > On Thu, Sep 12, 2019 at 04:49:15PM +0800, Yufen Yu wrote:
> > > 
> > > On 2019/9/12 12:16, Ming Lei wrote:
> > > > On Thu, Sep 12, 2019 at 11:29:18AM +0800, Yufen Yu wrote:
> > > > > On 2019/9/12 10:46, Ming Lei wrote:
> > > > > > On Sat, Sep 07, 2019 at 06:24:50PM +0800, Yufen Yu wrote:
> > > > > > > There is a race condition between timeout check and completion for
> > > > > > > flush request as follow:
> > > > > > > 
> > > > > > > timeout_workissue flush  issue flush
> > > > > > >blk_insert_flush
> > > > > > > blk_insert_flush
> > > > > > > blk_mq_timeout_work
> > > > > > >blk_kick_flush
> > > > > > > 
> > > > > > > blk_mq_queue_tag_busy_iter
> > > > > > > blk_mq_check_expired(flush_rq)
> > > > > > > 
> > > > > > >__blk_mq_end_request
> > > > > > >   flush_end_io
> > > > > > >   blk_kick_flush
> > > > > > >   blk_rq_init(flush_rq)
> > > > > > >   memset(flush_rq, 0)
> > > > > > Not see there is memset(flush_rq, 0) in block/blk-flush.c
> > > > > Call path as follow:
> > > > > 
> > > > > blk_kick_flush
> > > > >   blk_rq_init
> > > > >   memset(rq, 0, sizeof(*rq));
> > > > Looks I miss this one in blk_rq_init(), sorry for that.
> > > > 
> > > > Given there are only two users of blk_rq_init(), one simple fix could be
> > > > not clearing queue in blk_rq_init(), something like below?
> > > > 
> > > > diff --git a/block/blk-core.c b/block/blk-core.c
> > > > index 77807a5d7f9e..25e6a045c821 100644
> > > > --- a/block/blk-core.c
> > > > +++ b/block/blk-core.c
> > > > @@ -107,7 +107,9 @@ EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
> > > >void blk_rq_init(struct request_queue *q, struct request *rq)
> > > >{
> > > > -   memset(rq, 0, sizeof(*rq));
> > > > +   const int offset = offsetof(struct request, q);
> > > > +
> > > > +   memset((void *)rq + offset, 0, sizeof(*rq) - offset);
> > > > INIT_LIST_HEAD(&rq->queuelist);
> > > > rq->q = q;
> > > > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > > > index 1ac790178787..382e71b8787d 100644
> > > > --- a/include/linux/blkdev.h
> > > > +++ b/include/linux/blkdev.h
> > > > @@ -130,7 +130,7 @@ enum mq_rq_state {
> > > > * especially blk_mq_rq_ctx_init() to take care of the added fields.
> > > > */
> > > >struct request {
> > > > -   struct request_queue *q;
> > > > +   struct request_queue *q;/* Must be the 1st field */
> > > > struct blk_mq_ctx *mq_ctx;
> > > > struct blk_mq_hw_ctx *mq_hctx;
> > > Not set req->q as '0' can just avoid BUG_ON for NULL pointer deference.
> > > 
> > > However, the root problem is that 'flush_rq' have been reused while
> > > timeout function handle it currently. That means mq_ops->timeout() may
> > > access old values remained by the last flush request and make the wrong
> > > decision.
> > > 
> > > Take the race condition in the patch as an example.
> > > 
> > > blk_mq_check_expired
> > >  blk_mq_rq_timed_out
> > >  req->q->mq_ops->timeout  // Driver timeout handle may read old 
> > > data
> > >  refcount_dec_and_test(&rq)
> > >  __blk_mq_free_request   // If rq have been reset has '1' in
> > > blk_rq_init(), it will be free here.
> > > 
> > > So, I think we should solve this problem completely. Just like normal
> > > request,
> > > we can prevent flush request to call end_io when timeout handle the 
> > > request.
> > Seems it isn't specific for 'flush_rq', and it should be one generic issue
> > for any request which implements .end_io.
> > 
> > For requests without defining .end_io, rq->ref is applied for protecting
> > its lifetime. However, rq->end_io() is still called even if rq->ref doesn't
> > drop to zero.
> > 
> > If the above is correct, we need to let rq->ref to cover rq->end_io().
> 
> We ignore the fact that we may also need to free 'rq' after calling
> rq->end_io(),
> such as end_clone_request(), mq_flush_data_end_io().
> 
> If  we let 'rq->ref' to cover rq->end_io(), 'rq->ref' have been decreased to
> '0'
> before calling __blk_mq_free_request(). Then, the function will never be
> called.
> 
> So, I think flush request may need to be fixed individually.

Thinking about this issue further: given that the other users of .end_io()
still depend on blk_mq_free_request() for freeing the request, it is fine
to just fix the flush request.


Thanks,
Ming


Re: [PATCH 0/5] block: loop: add file format subsystem and QCOW2 file format driver

2019-09-15 Thread Ming Lei
On Fri, Sep 13, 2019 at 01:57:33PM +0200, Manuel Bentele wrote:
> Hi Ming,
> 
> On 9/12/19 4:24 AM, Ming Lei wrote:
> > On Sat, Aug 24, 2019 at 12:56:14AM +0200, developm...@manuel-bentele.de 
> > wrote:
> >> From: Manuel Bentele 
> >>
> >> Hi
> >>
> >> Regarding to the following discussion [1] on the mailing list I show you 
> >> the result of my work as announced at the end of the discussion [2].
> >>
> >> The discussion was about the project topic of how to implement the 
> >> reading/writing of QCOW2 in the kernel. The project focuses on an 
> >> read-only 
> >> in-kernel QCOW2 implementation to increase the read/write performance 
> >> and tries to avoid nbd. Furthermore, the project is part of a project 
> >> series to develop a in-kernel network boot infrastructure that has no need 
> > I'd suggest you to share more details about this use case first:
> >
> > 1) what is the in-kernel network boot infrastructure? which functions
> > does it provide for user?
> 
> Some time ago, I started to describe the setup a little bit in [1]. Now
> I want to extend the description:
> 
> The boot infrastructure is used in the university environment and
> quarrels with network-related limitations. Step-by-step, the network
> hardware is renewed and improved, but there are still many university
> branches which are spread all over the city and connected by poor uplink
> connections. Sometimes there exist cases where 15 until 20 desktop
> computers have to share only 1 gigabit uplink. To accelerate the network
> boot, the idea came up to use the QCOW2 file format and its compression
> feature for the image content. Tests have shown, that the usage of
> compression is already measurable at gigabit uplinks and clearly
> noticeable at 100 megabit uplinks.

Got it, that looks like a good use case for compression, but it doesn't have to be QCOW2.

> 
> The network boot infrastructure is based on a classical PXE network boot
> to load the Linux kernel and the initramfs. In the initramfs, the
> compressed QCOW2 image is fetched via nfs or cifs or something else. The
> fetched QCOW2 image is now decompressed and read in the kernel. Compared
> to a decompression and read in the user space, like qemu-nbd does, this
> approach does not need any user space process, is faster and avoids
> switchroot problems.

This image can be compressed via xz and fetched via wget or whatever.
'xz' could have a better compression ratio than qcow2, I guess.

> 
> > 2) how does the in kernel QCOW2 interacts with in-kernel network boot
> > infrastructure?
> 
> The in-kernel QCOW2 implementation uses the fetched QCOW2 image and
> exposes it as block device.
> 
> Therefore, my implementation extends the loop device module by a general
> file format subsystem to implement various file format drivers including
> a driver for the QCOW2 and RAW file format. The configuration utility
> losetup is used to set up a loop device and specify the file format
> driver to use.

You would still need to update losetup for that. Instead, xz-utils can be
installed for decompressing the image, and then you can still create a
loop disk over the image.

> 
> > 3) most important thing, what are the exact steps for one user to use
> > the in-kernel network boot infrastructure and in-kernel QCOW2?
> 
> To achieve a running system one have to complete the following items:
> 
>   * Set up a PXE boot server and configure client computers to boot from
> the network
>   * Build a Linux kernel for the network boot with built-in QCOW2
> implementation
>   * Prepare the initramfs for the network boot. Use a network file
> system or copy tool to fetch the compressed QCOW2 image.
>   * Create a compressed QCOW2 image that contains a complete environment
> for the user to work with after a successful network boot
>   * Set up the reading of the fetched QCOW2 image using the in-kernel
> QCOW2 implementation and mount the file systems located in the QCOW2
> image.
>   * Perform a switchroot to change into the mounted environment of the
> QCOW2 image.

As I mentioned above, it seems unnecessary to introduce loop-qcow2.

Thanks,
Ming


Re: [RFC PATCH] blk-mq: Avoid memory reclaim when allocating request map

2019-09-15 Thread Ming Lei
On Sun, Sep 15, 2019 at 05:56:56PM +0530, xiu...@redhat.com wrote:
> From: Xiubo Li 
> 
> For some storage drivers, such as the nbd, when there has new socket
> connections added, it will update the hardware queue number by calling
> blk_mq_update_nr_hw_queues(), in which it will freeze all the queues
> first. And then tries to do the hardware queue updating stuff.
> 
> But int blk_mq_alloc_rq_map()-->blk_mq_init_tags(), when allocating
> memory for tags, it may cause the mm do the memories direct reclaiming,
> since the queues has been freezed, so if needs to flush the page cache
> to disk, it will stuck in generic_make_request()-->blk_queue_enter() by
> waiting the queues to be unfreezed and then cause deadlock here.
> 
> Since the memory size requested here is a small one, which will make
> it not that easy to happen with a large size, but in theory this could
> happen when the OS is running in pressure and out of memory.
> 
> Gabriel Krisman Bertazi has hit the similar issue by fixing it in
> commit 36e1f3d10786 ("blk-mq: Avoid memory reclaim when remapping
> queues"), but might forget this part.
> 
> Signed-off-by: Xiubo Li 
> CC: Gabriel Krisman Bertazi 
> ---
>  block/blk-mq-tag.c | 5 +++--
>  block/blk-mq-tag.h | 5 -
>  block/blk-mq.c | 3 ++-
>  3 files changed, 9 insertions(+), 4 deletions(-)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 008388e82b5c..04ee0e4c3fa1 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -462,7 +462,8 @@ static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct 
> blk_mq_tags *tags,
>  
>  struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
>unsigned int reserved_tags,
> -  int node, int alloc_policy)
> +  int node, int alloc_policy,
> +  gfp_t flags)
>  {
>   struct blk_mq_tags *tags;
>  
> @@ -471,7 +472,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int 
> total_tags,
>   return NULL;
>   }
>  
> - tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
> + tags = kzalloc_node(sizeof(*tags), flags, node);
>   if (!tags)
>   return NULL;
>  
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index 61deab0b5a5a..296e0bc97126 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -22,7 +22,10 @@ struct blk_mq_tags {
>  };
>  
>  
> -extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned 
> int reserved_tags, int node, int alloc_policy);
> +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
> + unsigned int reserved_tags,
> + int node, int alloc_policy,
> + gfp_t flags);
>  extern void blk_mq_free_tags(struct blk_mq_tags *tags);
>  
>  extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 240416057f28..9c52e4dfe132 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2090,7 +2090,8 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct 
> blk_mq_tag_set *set,
>   node = set->numa_node;
>  
>   tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
> - BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
> + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags),
> + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);

Now there are three uses of 'GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY',
and the code gets cleaner if you make it one const variable in
blk_mq_alloc_rq_map().
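
For example, something along these lines (illustrative sketch only, on top
of the patch above; the remaining allocation for tags->static_rqs would use
the same variable):

	/* single spot to adjust the allocation flags for all three users */
	const gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;

	tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags), gfp);
	if (!tags)
		return NULL;

	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), gfp, node);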

Otherwise, looks fine:

Reviewed-by: Ming Lei 


thanks,
Ming


Re: [PATCH] block: fix null pointer dereference in blk_mq_rq_timed_out()

2019-09-12 Thread Ming Lei
On Thu, Sep 12, 2019 at 04:49:15PM +0800, Yufen Yu wrote:
> 
> 
> On 2019/9/12 12:16, Ming Lei wrote:
> > On Thu, Sep 12, 2019 at 11:29:18AM +0800, Yufen Yu wrote:
> > > 
> > > On 2019/9/12 10:46, Ming Lei wrote:
> > > > On Sat, Sep 07, 2019 at 06:24:50PM +0800, Yufen Yu wrote:
> > > > > There is a race condition between timeout check and completion for
> > > > > flush request as follow:
> > > > > 
> > > > > timeout_workissue flush  issue flush
> > > > >   blk_insert_flush
> > > > >blk_insert_flush
> > > > > blk_mq_timeout_work
> > > > >   blk_kick_flush
> > > > > 
> > > > > blk_mq_queue_tag_busy_iter
> > > > > blk_mq_check_expired(flush_rq)
> > > > > 
> > > > >   __blk_mq_end_request
> > > > >  flush_end_io
> > > > >  blk_kick_flush
> > > > >  blk_rq_init(flush_rq)
> > > > >  memset(flush_rq, 0)
> > > > Not see there is memset(flush_rq, 0) in block/blk-flush.c
> > > Call path as follow:
> > > 
> > > blk_kick_flush
> > >  blk_rq_init
> > >  memset(rq, 0, sizeof(*rq));
> > Looks I miss this one in blk_rq_init(), sorry for that.
> > 
> > Given there are only two users of blk_rq_init(), one simple fix could be
> > not clearing queue in blk_rq_init(), something like below?
> > 
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index 77807a5d7f9e..25e6a045c821 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -107,7 +107,9 @@ EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
> >   void blk_rq_init(struct request_queue *q, struct request *rq)
> >   {
> > -   memset(rq, 0, sizeof(*rq));
> > +   const int offset = offsetof(struct request, q);
> > +
> > +   memset((void *)rq + offset, 0, sizeof(*rq) - offset);
> > INIT_LIST_HEAD(&rq->queuelist);
> > rq->q = q;
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index 1ac790178787..382e71b8787d 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -130,7 +130,7 @@ enum mq_rq_state {
> >* especially blk_mq_rq_ctx_init() to take care of the added fields.
> >*/
> >   struct request {
> > -   struct request_queue *q;
> > +   struct request_queue *q;/* Must be the 1st field */
> > struct blk_mq_ctx *mq_ctx;
> > struct blk_mq_hw_ctx *mq_hctx;
> 
> Not set req->q as '0' can just avoid BUG_ON for NULL pointer deference.
> 
> However, the root problem is that 'flush_rq' have been reused while
> timeout function handle it currently. That means mq_ops->timeout() may
> access old values remained by the last flush request and make the wrong
> decision.
> 
> Take the race condition in the patch as an example.
> 
> blk_mq_check_expired
> blk_mq_rq_timed_out
> req->q->mq_ops->timeout  // Driver timeout handle may read old data
> refcount_dec_and_test(&rq)
> __blk_mq_free_request   // If rq have been reset has '1' in
> blk_rq_init(), it will be free here.
> 
> So, I think we should solve this problem completely. Just like normal
> request,
> we can prevent flush request to call end_io when timeout handle the request.

Seems this isn't specific to 'flush_rq'; it should be a generic issue
for any request which implements .end_io.

For requests that don't define .end_io, rq->ref is applied for protecting
their lifetime. However, rq->end_io() is still called even if rq->ref doesn't
drop to zero.

If the above is correct, we need to let rq->ref cover rq->end_io().


Thanks,
Ming


Re: [PATCH] block: fix null pointer dereference in blk_mq_rq_timed_out()

2019-09-11 Thread Ming Lei
On Thu, Sep 12, 2019 at 11:29:18AM +0800, Yufen Yu wrote:
> 
> 
> On 2019/9/12 10:46, Ming Lei wrote:
> > On Sat, Sep 07, 2019 at 06:24:50PM +0800, Yufen Yu wrote:
> > > There is a race condition between timeout check and completion for
> > > flush request as follow:
> > > 
> > > timeout_workissue flush  issue flush
> > >  blk_insert_flush
> > >   blk_insert_flush
> > > blk_mq_timeout_work
> > >  blk_kick_flush
> > > 
> > > blk_mq_queue_tag_busy_iter
> > > blk_mq_check_expired(flush_rq)
> > > 
> > >  __blk_mq_end_request
> > > flush_end_io
> > > blk_kick_flush
> > > blk_rq_init(flush_rq)
> > > memset(flush_rq, 0)
> > Not see there is memset(flush_rq, 0) in block/blk-flush.c
> 
> Call path as follow:
> 
> blk_kick_flush
> blk_rq_init
> memset(rq, 0, sizeof(*rq));

Looks like I missed this one in blk_rq_init(), sorry for that.

Given there are only two users of blk_rq_init(), one simple fix could be
to not clear the queue pointer in blk_rq_init(), something like below?

diff --git a/block/blk-core.c b/block/blk-core.c
index 77807a5d7f9e..25e6a045c821 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -107,7 +107,9 @@ EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
 
 void blk_rq_init(struct request_queue *q, struct request *rq)
 {
-   memset(rq, 0, sizeof(*rq));
+   const int offset = offsetof(struct request, q);
+
+   memset((void *)rq + offset, 0, sizeof(*rq) - offset);
 
INIT_LIST_HEAD(&rq->queuelist);
rq->q = q;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1ac790178787..382e71b8787d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -130,7 +130,7 @@ enum mq_rq_state {
  * especially blk_mq_rq_ctx_init() to take care of the added fields.
  */
 struct request {
-   struct request_queue *q;
+   struct request_queue *q;/* Must be the 1st field */
struct blk_mq_ctx *mq_ctx;
struct blk_mq_hw_ctx *mq_hctx;
 

Thanks,
Ming


[PATCH] block: fix race between switching elevator and removing queues

2019-09-11 Thread Ming Lei
cecf5d87ff20 ("block: split .sysfs_lock into two locks") starts to
release & re-acquire sysfs_lock during elevator switching. So it
isn't enough to prevent an elevator switch from happening by simply
clearing QUEUE_FLAG_REGISTERED while holding sysfs_lock, because an
in-progress switch can still move on after re-acquiring the lock, and
meanwhile the QUEUE_FLAG_REGISTERED flag won't get checked.

Fix this issue by checking 'q->elevator' directly & locklessly after
q->kobj is removed in blk_unregister_queue(); this is safe because
q->elevator can't be changed at that time.

Fixes: cecf5d87ff20 ("block: split .sysfs_lock into two locks")
Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-sysfs.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 107513495220..3af79831e717 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -1030,7 +1030,6 @@ EXPORT_SYMBOL_GPL(blk_register_queue);
 void blk_unregister_queue(struct gendisk *disk)
 {
struct request_queue *q = disk->queue;
-   bool has_elevator;
 
if (WARN_ON(!q))
return;
@@ -1046,7 +1045,6 @@ void blk_unregister_queue(struct gendisk *disk)
 */
mutex_lock(&q->sysfs_lock);
blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
-   has_elevator = !!q->elevator;
mutex_unlock(&q->sysfs_lock);
 
mutex_lock(&q->sysfs_dir_lock);
@@ -1061,7 +1059,11 @@ void blk_unregister_queue(struct gendisk *disk)
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
 
-   if (has_elevator)
+   /*
+* q->kobj has been removed, so it is safe to check if elevator
+* exists without holding q->sysfs_lock.
+*/
+   if (q->elevator)
elv_unregister_queue(q);
mutex_unlock(&q->sysfs_dir_lock);
 
-- 
2.20.1



Re: [PATCH] block: fix null pointer dereference in blk_mq_rq_timed_out()

2019-09-11 Thread Ming Lei
On Sat, Sep 07, 2019 at 06:24:50PM +0800, Yufen Yu wrote:
> There is a race condition between timeout check and completion for
> flush request as follow:
> 
> timeout_workissue flush  issue flush
> blk_insert_flush
>  blk_insert_flush
> blk_mq_timeout_work
> blk_kick_flush
> 
> blk_mq_queue_tag_busy_iter
> blk_mq_check_expired(flush_rq)
> 
> __blk_mq_end_request
>flush_end_io
>blk_kick_flush
>blk_rq_init(flush_rq)
>memset(flush_rq, 0)

I don't see a memset(flush_rq, 0) in block/blk-flush.c.

> 
> blk_mq_timed_out
> BUG_ON flush_rq->q->mq_ops

flush_rq->q won't be changed by blk_rq_init(), and both READ and WRITE
on a variable of machine WORD length are atomic, so how can the BUG_ON()
be triggered? Do you have the actual BUG log?

Also, it is now the driver's responsibility to avoid the race between
normal completion and timeout.
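
As a purely hypothetical illustration of what that looks like on the driver
side (my_cmd, MY_CMD_OUTSTANDING and my_drv_abort() are made-up names, not
taken from any real driver):

/*
 * Sketch of a driver ->timeout() handler: the handler claims the command
 * first, and the driver's normal completion path does the same
 * test_and_clear_bit() on cmd->flags (an unsigned long bitmap in this
 * made-up driver), so only one of the two paths ever acts on the command.
 */
static enum blk_eh_timer_return my_drv_timeout(struct request *rq,
					       bool reserved)
{
	struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);

	if (!test_and_clear_bit(MY_CMD_OUTSTANDING, &cmd->flags))
		return BLK_EH_DONE;	/* normal completion won the race */

	my_drv_abort(cmd);		/* made-up abort helper */
	return BLK_EH_RESET_TIMER;
}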

> 
> For normal request, we need to get a tag and then allocate corresponding 
> request.
> Thus, the request cannot be reallocated before the tag have been free.
> Commit 1d9bd5161ba ("blk-mq: replace timeout synchronization with a RCU and
> generation based scheme") and commit 12f5b93145 ("blk-mq: Remove generation
> seqeunce") can guarantee the consistency of timeout handle and completion.
> 
> However, 'flush_rq' have been forgotten. 'flush_rq' allocation management
> dependents on flush implemention mechanism. Each hctx has only one 'flush_rq'.
> When a flush request have completed, the next flush request will hold the 
> 'flush_rq'.
> In the end, timeout handle may access the cleared 'flush_rq'.
> 
> We fix this problem by checking request refcount 'rq->ref', as normal request.
> If the refcount is not zero, flush_end_io() return and wait the last holder
> recall it. To record the request status, we add a new entry 'rq_status',
> which will be used in flush_end_io().
> 
> Signed-off-by: Yufen Yu 
> ---
>  block/blk-flush.c  | 8 
>  block/blk-mq.c | 7 +--
>  block/blk.h| 5 +
>  include/linux/blkdev.h | 2 ++
>  4 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index aedd9320e605..359a7e1a0925 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -212,6 +212,14 @@ static void flush_end_io(struct request *flush_rq, 
> blk_status_t error)
>   struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
>   struct blk_mq_hw_ctx *hctx;
>  
> + if (!refcount_dec_and_test(&flush_rq->ref)) {
> + flush_rq->rq_status = error;
> + return;
> + }
> +
> + if (flush_rq->rq_status != BLK_STS_OK)
> + error = flush_rq->rq_status;
> +
>   /* release the tag's ownership to the req cloned from */
>   spin_lock_irqsave(&fq->mq_flush_lock, flags);
>   hctx = flush_rq->mq_hctx;
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0835f4d8d42e..3d2b2c2e9cdf 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -905,9 +905,12 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx 
> *hctx,
>*/
>   if (blk_mq_req_expired(rq, next))
>   blk_mq_rq_timed_out(rq, reserved);
> - if (refcount_dec_and_test(&rq->ref))
> - __blk_mq_free_request(rq);
>  
> + if (is_flush_rq(rq, hctx)) {
> + rq->end_io(rq, 0);
> + } else if (refcount_dec_and_test(&rq->ref)) {
> + __blk_mq_free_request(rq);
> + }
>   return true;
>  }
>  
> diff --git a/block/blk.h b/block/blk.h
> index de6b2e146d6e..f503ef9ad3e6 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -47,6 +47,11 @@ static inline void __blk_get_queue(struct request_queue *q)
>   kobject_get(&q->kobj);
>  }
>  
> +static inline bool
> +is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) {
> + return hctx->fq->flush_rq == req;
> +}
> +
>  struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
>   int node, int cmd_size, gfp_t flags);
>  void blk_free_flush_queue(struct blk_flush_queue *q);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 1ef375dafb1c..b1d05077e03f 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -237,6 +237,8 @@ struct request {
>*/
>   rq_end_io_fn *end_io;
>   void *end_io_data;
> +
> + blk_status_t rq_status;
>  };

'rq_status' is only used for the flush request, so it could be added to
'struct blk_flush_queue' instead of 'struct request'.
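
Something like the following minimal sketch is what I mean (field placement
only; this is not a tested patch, and the surrounding code is elided):

    /* block/blk.h: keep the per-flush state next to the flush request */
    struct blk_flush_queue {
            ...
            struct request          *flush_rq;
            /* status stashed by a deferred flush_end_io() call */
            blk_status_t            rq_status;
            spinlock_t              mq_flush_lock;
    };

    /* block/blk-flush.c */
    static void flush_end_io(struct request *flush_rq, blk_status_t error)
    {
            struct request_queue *q = flush_rq->q;
            struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);

            if (!refcount_dec_and_test(&flush_rq->ref)) {
                    /* stash the status in the per-hctx flush queue */
                    fq->rq_status = error;
                    return;
            }

            if (fq->rq_status != BLK_STS_OK)
                    error = fq->rq_status;
            ...
    }

That way 'struct request' stays untouched and the extra field is only paid
for once per hctx (the stashed status would also need to be reset once it
has been consumed).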


Thanks,
Ming


Re: [PATCH 0/5] block: loop: add file format subsystem and QCOW2 file format driver

2019-09-11 Thread Ming Lei
On Sat, Aug 24, 2019 at 12:56:14AM +0200, developm...@manuel-bentele.de wrote:
> From: Manuel Bentele 
> 
> Hi
> 
> Regarding to the following discussion [1] on the mailing list I show you 
> the result of my work as announced at the end of the discussion [2].
> 
> The discussion was about the project topic of how to implement the 
> reading/writing of QCOW2 in the kernel. The project focuses on an read-only 
> in-kernel QCOW2 implementation to increase the read/write performance 
> and tries to avoid nbd. Furthermore, the project is part of a project 
> series to develop a in-kernel network boot infrastructure that has no need 

I'd suggest sharing more details about this use case first:

1) What is the in-kernel network boot infrastructure? Which functions
does it provide to the user?

2) How does the in-kernel QCOW2 implementation interact with the
in-kernel network boot infrastructure?

3) Most importantly, what are the exact steps for a user to use the
in-kernel network boot infrastructure and the in-kernel QCOW2?

Without knowing the motivation/purpose and the exact use case, it doesn't
make sense to discuss implementation details, IMO.

Thanks,
Ming


Re: [PATCH 10/15] nvme-pci: do not build a scatterlist to map metadata

2019-09-11 Thread Ming Lei
On Wed, Aug 28, 2019 at 05:20:57PM +0800, Ming Lei wrote:
> On Thu, Mar 21, 2019 at 04:10:32PM -0700, Christoph Hellwig wrote:
> > We always have exactly one segment, so we can simply call dma_map_bvec.
> > 
> > Signed-off-by: Christoph Hellwig 
> > ---
> >  drivers/nvme/host/pci.c | 23 ++-
> >  1 file changed, 10 insertions(+), 13 deletions(-)
> > 
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index bc4ee869fe82..a7dad24e0406 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -221,7 +221,7 @@ struct nvme_iod {
> > int npages; /* In the PRP list. 0 means small pool in use */
> > int nents;  /* Used in scatterlist */
> > dma_addr_t first_dma;
> > -   struct scatterlist meta_sg; /* metadata requires single contiguous 
> > buffer */
> > +   dma_addr_t meta_dma;
> > struct scatterlist *sg;
> > struct scatterlist inline_sg[0];
> >  };
> > @@ -592,13 +592,16 @@ static void nvme_unmap_data(struct nvme_dev *dev, 
> > struct request *req)
> > dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
> > int i;
> >  
> > +   if (blk_integrity_rq(req)) {
> > +   dma_unmap_page(dev->dev, iod->meta_dma,
> > +   rq_integrity_vec(req)->bv_len, dma_dir);
> > +   }
> > +
> > if (iod->nents) {
> > /* P2PDMA requests do not need to be unmapped */
> > if (!is_pci_p2pdma_page(sg_page(iod->sg)))
> > dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
> >  
> > -   if (blk_integrity_rq(req))
> > -   dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
> > }
> >  
> > if (iod->npages == 0)
> > @@ -861,17 +864,11 @@ static blk_status_t nvme_map_data(struct nvme_dev 
> > *dev, struct request *req,
> >  
> > ret = BLK_STS_IOERR;
> > if (blk_integrity_rq(req)) {
> > -   if (blk_rq_count_integrity_sg(q, req->bio) != 1)
> > -   goto out;
> > -
> > -   sg_init_table(&iod->meta_sg, 1);
> > -   if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
> > -   goto out;
> > -
> > -   if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
> > +   iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
> > +   dma_dir, 0);
> 
> Hi Christoph,
> 
> When one bio is big enough, the generated integrity data can cross more
> than one page even though the data is still in a single segment.
> 
> However, we don't convert to multi-page bvecs for bio_integrity_prep(),
> and each page may consume one bvec, so is it possible for this patch to
> cause issues with NVMe's protection information? This patch assumes that
> there is only one bvec for the integrity data.
> 
> BTW, I haven't seen such a report; it is just a theoretical concern.

Hello Christoph,

Gently ping...


Thanks,
Ming


Re: [PATCH 1/3] block: Respect the device's maximum segment size

2019-09-11 Thread Ming Lei
On Mon, Sep 09, 2019 at 06:13:31PM +0200, Christoph Hellwig wrote:
> On Mon, Sep 09, 2019 at 02:56:56PM +0200, Thierry Reding wrote:
> > From: Thierry Reding 
> > 
> > When enabling the DMA map merging capability for a queue, ensure that
> > the maximum segment size does not exceed the device's limit.
> 
> We can't do that unfortunately.  If we use the virt_boundary setting
> we do aggressive merges that there is no accounting for.  So we can't
> limit the segment size.

Could you explain a bit why we can't do that?

The segment size limit has basically been removed since the following commit
200a9aff7b02 ("block: remove the segment size check in bio_will_gap").

Before that commit, the max segment size limit worked.


Thanks,
Ming


Re: [PATCH V3 blktests] nvme: Add new test case about nvme rescan/reset/remove during IO

2019-09-11 Thread Ming Lei
On Wed, Sep 11, 2019 at 04:53:43PM +0800, Yi Zhang wrote:
> Add one test to cover NVMe SSD rescan/reset/remove operation during
> IO, the steps found several issues during my previous testing, check
> them here:
> http://lists.infradead.org/pipermail/linux-nvme/2017-February/008358.html
> http://lists.infradead.org/pipermail/linux-nvme/2017-May/010259.html
> 
> Signed-off-by: Yi Zhang 
> 
> ---
> 
> changes from v2:
>  - add check_sysfs function for rescan/reset/remove operation
>  - declare all local variables at the start
>  - alignment fix
>  - add udevadm settle
>  - change to QUICK=1
> changes from v1:
>  - add variable for "/sys/bus/pci/devices/${pdev}"
>  - add kill $!; wait; for background fio
>  - add rescan/reset/remove sysfs node check
>  - add loop checking for nvme reinitialized
> 
> ---
> ---
>  tests/nvme/031 | 75 ++
>  tests/nvme/031.out |  2 ++
>  2 files changed, 77 insertions(+)
>  create mode 100755 tests/nvme/031
>  create mode 100644 tests/nvme/031.out
> 
> diff --git a/tests/nvme/031 b/tests/nvme/031
> new file mode 100755
> index 000..31db8a5
> --- /dev/null
> +++ b/tests/nvme/031
> @@ -0,0 +1,75 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-3.0+
> +# Copyright (C) 2019 Yi Zhang 
> +#
> +# Test nvme pci adapter rescan/reset/remove operation during I/O
> +#
> +# Regression test for bellow two commits:
> +# http://lists.infradead.org/pipermail/linux-nvme/2017-May/010367.html
> +# 986f75c876db nvme: avoid to use blk_mq_abort_requeue_list()
> +# 806f026f9b90 nvme: use blk_mq_start_hw_queues() in nvme_kill_queues()
> +
> +. tests/nvme/rc
> +
> +DESCRIPTION="test nvme pci adapter rescan/reset/remove during I/O"
> +QUICK=1
> +
> +requires() {
> + _have_fio
> +}
> +
> +device_requires() {
> + _test_dev_is_nvme
> +}
> +
> +check_sysfs()
> +{
> + local sysfs_attr="$sysfs/$1"
> +
> + if [[ -f "$sysfs_attr" ]]; then
> + echo 1 > "${sysfs_attr}"
> + else
> + # QEMU VM doesn't have the "reset" attribute, skip it
> + [[ "$sysfs_attr" == *reset ]] && return
> + echo "${sysfs_attr} doesn't exist!"
> + fi
> +}
> +
> +test_device() {
> + echo "Running ${TEST_NAME}"
> +
> + local sysfs
> + local m
> +
> + pdev="$(_get_pci_dev_from_blkdev)"
> + sysfs="/sys/bus/pci/devices/${pdev}"
> +
> + # start fio job
> + _run_fio_rand_io --filename="$TEST_DEV" --size=1g \
> + --group_reporting  &> /dev/null &
> +
> + sleep 5
> +
> + # do rescan/reset/remove operation
> + for i in rescan reset remove; do
> + check_sysfs $i
> + done
> +
> + { kill $!; wait; } &> /dev/null
> +
> + echo 1 > /sys/bus/pci/rescan
> +
> + # wait nvme reinitialized
> + for ((m = 0; m < 10; m++)); do
> + if [[ -b "${TEST_DEV}" ]]; then
> +         break
> + fi
> + sleep 0.5
> + done
> + if (( m > 9 )); then
> + echo "nvme still not reinitialized after 5 seconds!"
> + fi
> + udevadm settle
> +
> + echo "Test complete"
> +}
> diff --git a/tests/nvme/031.out b/tests/nvme/031.out
> new file mode 100644
> index 000..ae902bd
> --- /dev/null
> +++ b/tests/nvme/031.out
> @@ -0,0 +1,2 @@
> +Running nvme/031
> +Test complete
> -- 
> 2.17.2
> 

Reviewed-by: Ming Lei 

Thanks,
Ming


Re: [PATCH v3 2/7] block: Change elevator_init_mq() to always succeed

2019-09-04 Thread Ming Lei
On Wed, Sep 04, 2019 at 05:42:42PM +0900, Damien Le Moal wrote:
> If the default elevator chosen is mq-deadline, elevator_init_mq() may
> return an error if mq-deadline initialization fails, leading to
> blk_mq_init_allocated_queue() returning an error, which in turn will
> cause the block device initialization to fail and the device not being
> exposed.
> 
> Instead of taking such extreme measure, handle mq-deadline
> initialization failures in the same manner as when mq-deadline is not
> available (no module to load), that is, default to the "none" scheduler.
> With this change, elevator_init_mq() return type can be changed to void.
> 
> Signed-off-by: Damien Le Moal 
> Reviewed-by: Johannes Thumshirn 
> Reviewed-by: Christoph Hellwig 
> ---
>  block/blk-mq.c   |  8 +---
>  block/blk.h  |  2 +-
>  block/elevator.c | 23 ---
>  3 files changed, 14 insertions(+), 19 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 13923630e00a..ee4caf0c0807 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2842,8 +2842,6 @@ static unsigned int nr_hw_queues(struct blk_mq_tag_set 
> *set)
>  struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
> struct request_queue *q)
>  {
> - int ret = -ENOMEM;
> -
>   /* mark the queue as mq asap */
>   q->mq_ops = set->ops;
>  
> @@ -2904,14 +2902,10 @@ struct request_queue 
> *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>   blk_mq_add_queue_tag_set(set, q);
>   blk_mq_map_swqueue(q);
>  
> - ret = elevator_init_mq(q);
> - if (ret)
> - goto err_tag_set;
> + elevator_init_mq(q);
>  
>   return q;
>  
> -err_tag_set:
> - blk_mq_del_queue_tag_set(q);
>  err_hctxs:
>   kfree(q->queue_hw_ctx);
>   q->nr_hw_queues = 0;
> diff --git a/block/blk.h b/block/blk.h
> index e4619fc5c99a..ed347f7a97b1 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -184,7 +184,7 @@ void blk_account_io_done(struct request *req, u64 now);
>  
>  void blk_insert_flush(struct request *rq);
>  
> -int elevator_init_mq(struct request_queue *q);
> +void elevator_init_mq(struct request_queue *q);
>  int elevator_switch_mq(struct request_queue *q,
> struct elevator_type *new_e);
>  void __elevator_exit(struct request_queue *, struct elevator_queue *);
> diff --git a/block/elevator.c b/block/elevator.c
> index 4721834815bb..2944c129760c 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -628,34 +628,35 @@ static inline bool elv_support_iosched(struct 
> request_queue *q)
>  
>  /*
>   * For blk-mq devices supporting IO scheduling, we default to using 
> mq-deadline,
> - * if available, for single queue devices. If deadline isn't available OR we
> - * have multiple queues, default to "none".
> + * if available, for single queue devices. If deadline isn't available OR
> + * deadline initialization fails OR we have multiple queues, default to 
> "none".
>   */
> -int elevator_init_mq(struct request_queue *q)
> +void elevator_init_mq(struct request_queue *q)
>  {
>   struct elevator_type *e;
> - int err = 0;
> + int err;
>  
>   if (!elv_support_iosched(q))
> - return 0;
> + return;
>  
>   if (q->nr_hw_queues != 1)
> - return 0;
> + return;
>  
>   WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags));
>  
>   if (unlikely(q->elevator))
> - goto out;
> + return;
>  
>   e = elevator_get(q, "mq-deadline", false);
>   if (!e)
> - goto out;
> + return;
>  
>   err = blk_mq_init_sched(q, e);
> - if (err)
> + if (err) {
> + pr_warn("\"%s\" elevator initialization failed, "
> + "falling back to \"none\"\n", e->elevator_name);
>   elevator_put(e);
> -out:
> - return err;
> + }
>  }

Looks fine:

Reviewed-by: Ming Lei 

BTW, blk_mq_init_sched()'s failure path should have restored
q->nr_requests. That could be done in another standalone patch.
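
Roughly something like the following, as an untested sketch only (the saved
value and the error label are illustrative, not the exact code in
block/blk-mq-sched.c):

    /* untested sketch of the idea, not the actual function body */
    int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
    {
            unsigned int prev_nr_requests = q->nr_requests;
            int ret;

            ...
            /* bumped for the scheduler before allocating sched tags */
            q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
                                       BLKDEV_MAX_RQ);
            ...
    err:
            blk_mq_sched_tags_teardown(q);
            q->elevator = NULL;
            /* restore the pre-scheduler queue depth on failure */
            q->nr_requests = prev_nr_requests;
            return ret;
    }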

-- 
Ming


Re: [PATCH v3 1/7] block: Cleanup elevator_init_mq() use

2019-09-04 Thread Ming Lei
On Wed, Sep 04, 2019 at 05:42:41PM +0900, Damien Le Moal wrote:
> Instead of checking a queue tag_set BLK_MQ_F_NO_SCHED flag before
> calling elevator_init_mq() to make sure that the queue supports IO
> scheduling, use the elevator.c function elv_support_iosched() in
> elevator_init_mq(). This does not introduce any functional change but
> ensure that elevator_init_mq() does the right thing based on the queue
> settings.
> 
> Signed-off-by: Damien Le Moal 
> Reviewed-by: Johannes Thumshirn 
> Reviewed-by: Christoph Hellwig 
> ---
>  block/blk-mq.c   |  8 +++-
>  block/elevator.c | 23 +--
>  2 files changed, 16 insertions(+), 15 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index b622029b19ea..13923630e00a 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2904,11 +2904,9 @@ struct request_queue 
> *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>   blk_mq_add_queue_tag_set(set, q);
>   blk_mq_map_swqueue(q);
>  
> - if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
> - ret = elevator_init_mq(q);
> - if (ret)
> - goto err_tag_set;
> - }
> + ret = elevator_init_mq(q);
> + if (ret)
> + goto err_tag_set;
>  
>   return q;
>  
> diff --git a/block/elevator.c b/block/elevator.c
> index 86100de3..4721834815bb 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -619,16 +619,26 @@ int elevator_switch_mq(struct request_queue *q,
>   return ret;
>  }
>  
> +static inline bool elv_support_iosched(struct request_queue *q)
> +{
> + if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
> + return false;
> + return true;
> +}
> +
>  /*
> - * For blk-mq devices, we default to using mq-deadline, if available, for 
> single
> - * queue devices.  If deadline isn't available OR we have multiple queues,
> - * default to "none".
> + * For blk-mq devices supporting IO scheduling, we default to using 
> mq-deadline,
> + * if available, for single queue devices. If deadline isn't available OR we
> + * have multiple queues, default to "none".
>   */
>  int elevator_init_mq(struct request_queue *q)
>  {
>   struct elevator_type *e;
>   int err = 0;
>  
> + if (!elv_support_iosched(q))
> + return 0;
> +
>   if (q->nr_hw_queues != 1)
>   return 0;
>  
> @@ -706,13 +716,6 @@ static int __elevator_change(struct request_queue *q, 
> const char *name)
>   return elevator_switch(q, e);
>  }
>  
> -static inline bool elv_support_iosched(struct request_queue *q)
> -{
> - if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
> - return false;
> - return true;
> -}
> -
>  ssize_t elv_iosched_store(struct request_queue *q, const char *name,
> size_t count)
>  {
> -- 
> 2.21.0
> 

Reviewed-by: Ming Lei 

-- 
Ming


Re: [PATCH v2 2/2] mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two)

2019-09-03 Thread Ming Lei
On Wed, Sep 04, 2019 at 07:19:33AM +0200, Christoph Hellwig wrote:
> On Tue, Sep 03, 2019 at 01:53:12PM -0700, Matthew Wilcox wrote:
> > > Its enabled in all full debug session as far as I know. Fedora for
> > > example has been running this for ages to find breakage in device drivers
> > > etc etc.
> > 
> > Are you telling me nobody uses the ramdisk driver on fedora?  Because
> > that's one of the affected drivers.
> 
> For pmem/brd misaligned memory alone doesn't seem to be the problem.
> Misaligned memory that cross a page barrier is.  And at least XFS
> before my log recovery changes only used kmalloc for smaller than
> page size allocation, so this case probably didn't hit.

BTW, do the sl[aou]b allocators guarantee that a smaller-than-page-size
allocation via kmalloc() never crosses a page boundary?

Thanks,
Ming


Re: [PATCH 10/15] nvme-pci: do not build a scatterlist to map metadata

2019-08-28 Thread Ming Lei
On Thu, Mar 21, 2019 at 04:10:32PM -0700, Christoph Hellwig wrote:
> We always have exactly one segment, so we can simply call dma_map_bvec.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  drivers/nvme/host/pci.c | 23 ++-
>  1 file changed, 10 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index bc4ee869fe82..a7dad24e0406 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -221,7 +221,7 @@ struct nvme_iod {
>   int npages; /* In the PRP list. 0 means small pool in use */
>   int nents;  /* Used in scatterlist */
>   dma_addr_t first_dma;
> - struct scatterlist meta_sg; /* metadata requires single contiguous 
> buffer */
> + dma_addr_t meta_dma;
>   struct scatterlist *sg;
>   struct scatterlist inline_sg[0];
>  };
> @@ -592,13 +592,16 @@ static void nvme_unmap_data(struct nvme_dev *dev, 
> struct request *req)
>   dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
>   int i;
>  
> + if (blk_integrity_rq(req)) {
> + dma_unmap_page(dev->dev, iod->meta_dma,
> + rq_integrity_vec(req)->bv_len, dma_dir);
> + }
> +
>   if (iod->nents) {
>   /* P2PDMA requests do not need to be unmapped */
>   if (!is_pci_p2pdma_page(sg_page(iod->sg)))
>   dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
>  
> - if (blk_integrity_rq(req))
> - dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
>   }
>  
>   if (iod->npages == 0)
> @@ -861,17 +864,11 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, 
> struct request *req,
>  
>   ret = BLK_STS_IOERR;
>   if (blk_integrity_rq(req)) {
> - if (blk_rq_count_integrity_sg(q, req->bio) != 1)
> - goto out;
> -
> - sg_init_table(&iod->meta_sg, 1);
> - if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
> - goto out;
> -
> - if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
> + iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
> + dma_dir, 0);

Hi Christoph,

When one bio is big enough, the generated integrity data can cross more
than one page even though the data is still in a single segment.

However, we don't convert to multi-page bvecs for bio_integrity_prep(),
and each page may consume one bvec, so is it possible for this patch to
cause issues with NVMe's protection information? This patch assumes that
there is only one bvec for the integrity data.

BTW, I haven't seen such a report; it is just a theoretical concern.
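
To make the concern concrete (illustrative numbers only): a 1MB write with
512-byte sectors and 8 bytes of protection information per sector generates
2048 * 8 = 16KB of integrity data, which spans four 4KB pages. If each of
those pages ends up in its own bvec, rq_integrity_vec() only describes the
first page, so dma_map_bvec() would map 4KB of the 16KB buffer.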

thanks,
Ming


[PATCH V4 5/5] block: split .sysfs_lock into two locks

2019-08-27 Thread Ming Lei
 __se_sys_delete_module+0x204/0x337
 ? free_module+0x39f/0x39f
 ? blkcg_maybe_throttle_current+0x8a/0x718
 ? rwlock_bug+0x62/0x62
 ? __blkcg_punt_bio_submit+0xd0/0xd0
 ? trace_hardirqs_on_thunk+0x1a/0x20
 ? mark_held_locks+0x1f/0x7a
 ? do_syscall_64+0x4c/0x295
 do_syscall_64+0xa7/0x295
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7fb696cdbe6b
Code: 73 01 c3 48 8b 0d 1d 20 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 
1f 84 00 00 008
RSP: 002b:7ffec9588788 EFLAGS: 0206 ORIG_RAX: 00b0
RAX: ffda RBX: 559e589137c0 RCX: 7fb696cdbe6b
RDX: 000a RSI: 0800 RDI: 559e58913828
RBP:  R08: 7ffec9587701 R09: 
R10: 7fb696d4eae0 R11: 0206 R12: 7ffec95889b0
R13: 7ffec95896b3 R14: 559e58913260 R15: 0000559e589137c0

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-core.c   |  1 +
 block/blk-mq-sysfs.c   | 12 -
 block/blk-sysfs.c  | 46 +--
 block/blk.h|  2 +-
 block/elevator.c   | 55 --
 include/linux/blkdev.h |  1 +
 6 files changed, 84 insertions(+), 33 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 919629ce4015..2792f7cf7bef 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
mutex_init(&q->blk_trace_mutex);
 #endif
mutex_init(&q->sysfs_lock);
+   mutex_init(&q->sysfs_dir_lock);
spin_lock_init(&q->queue_lock);
 
init_waitqueue_head(&q->mq_freeze_wq);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 6ddde3774ebe..a0d3ce30fa08 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -270,7 +270,7 @@ void blk_mq_unregister_dev(struct device *dev, struct 
request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
 
-   lockdep_assert_held(&q->sysfs_lock);
+   lockdep_assert_held(&q->sysfs_dir_lock);
 
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
@@ -320,7 +320,7 @@ int __blk_mq_register_dev(struct device *dev, struct 
request_queue *q)
int ret, i;
 
WARN_ON_ONCE(!q->kobj.parent);
-   lockdep_assert_held(&q->sysfs_lock);
+   lockdep_assert_held(&q->sysfs_dir_lock);
 
ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
@@ -354,7 +354,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
 
-   mutex_lock(&q->sysfs_lock);
+   mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
 
@@ -362,7 +362,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
blk_mq_unregister_hctx(hctx);
 
 unlock:
-   mutex_unlock(&q->sysfs_lock);
+   mutex_unlock(&q->sysfs_dir_lock);
 }
 
 int blk_mq_sysfs_register(struct request_queue *q)
@@ -370,7 +370,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
 
-   mutex_lock(&q->sysfs_lock);
+   mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
 
@@ -381,7 +381,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
}
 
 unlock:
-   mutex_unlock(&q->sysfs_lock);
+   mutex_unlock(&q->sysfs_dir_lock);
 
return ret;
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5b0b5224cfd4..107513495220 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -938,6 +938,7 @@ int blk_register_queue(struct gendisk *disk)
int ret;
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
+   bool has_elevator = false;
 
if (WARN_ON(!q))
return -ENXIO;
@@ -945,7 +946,6 @@ int blk_register_queue(struct gendisk *disk)
WARN_ONCE(blk_queue_registered(q),
  "%s is registering an already registered queue\n",
  kobject_name(&dev->kobj));
-   blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
 
/*
 * SCSI probing may synchronously create and destroy a lot of
@@ -965,8 +965,7 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
return ret;
 
-   /* Prevent changes through sysfs until registration is completed. */
-   mutex_lock(&q->sysfs_lock);
+   mutex_lock(&q->sysfs_dir_lock);
 
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
if (ret <

[PATCH V4 4/5] block: add helper for checking if queue is registered

2019-08-27 Thread Ming Lei
There are 4 users which check if queue is registered, so add one helper
to check it.

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-sysfs.c  | 4 ++--
 block/blk-wbt.c| 2 +-
 block/elevator.c   | 2 +-
 include/linux/blkdev.h | 1 +
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 977c659dcd18..5b0b5224cfd4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -942,7 +942,7 @@ int blk_register_queue(struct gendisk *disk)
if (WARN_ON(!q))
return -ENXIO;
 
-   WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+   WARN_ONCE(blk_queue_registered(q),
  "%s is registering an already registered queue\n",
  kobject_name(&dev->kobj));
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@@ -1026,7 +1026,7 @@ void blk_unregister_queue(struct gendisk *disk)
return;
 
/* Return early if disk->queue was never registered. */
-   if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+   if (!blk_queue_registered(q))
return;
 
/*
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 313f45a37e9d..c4d3089e47f7 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -656,7 +656,7 @@ void wbt_enable_default(struct request_queue *q)
return;
 
/* Queue not registered? Maybe shutting down... */
-   if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+   if (!blk_queue_registered(q))
return;
 
if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
diff --git a/block/elevator.c b/block/elevator.c
index 33c15fb54ed1..03d923196569 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -656,7 +656,7 @@ static int __elevator_change(struct request_queue *q, const 
char *name)
struct elevator_type *e;
 
/* Make sure queue is not in the middle of being removed */
-   if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+   if (!blk_queue_registered(q))
return -ENOENT;
 
/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 167bf879f072..6041755984f4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -647,6 +647,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct 
request_queue *q);
 #define blk_queue_quiesced(q)  test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
 #define blk_queue_pm_only(q)   atomic_read(&(q)->pm_only)
 #define blk_queue_fua(q)   test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
+#define blk_queue_registered(q)test_bit(QUEUE_FLAG_REGISTERED, 
&(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
-- 
2.20.1



[PATCH V4 3/5] blk-mq: don't hold q->sysfs_lock in blk_mq_map_swqueue

2019-08-27 Thread Ming Lei
blk_mq_map_swqueue() is called from blk_mq_init_allocated_queue()
and blk_mq_update_nr_hw_queues(). For the former caller, the kobject
isn't exposed to userspace yet. For the latter caller, hctx sysfs entries
and debugfs are un-registered before updating nr_hw_queues.

On the other hand, commit 2f8f1336a48b ("blk-mq: always free hctx after
request queue is freed") moves freeing hctx into queue's release
handler, so there won't be a race with the queue release path either.

So don't hold q->sysfs_lock in blk_mq_map_swqueue().

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6968de9d7402..b0ee0cac737f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2456,11 +2456,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
 
-   /*
-* Avoid others reading imcomplete hctx->cpumask through sysfs
-*/
-   mutex_lock(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@@ -2521,8 +2516,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
HCTX_TYPE_DEFAULT, i);
}
 
-   mutex_unlock(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
/*
 * If no software queues are mapped to this hardware queue,
-- 
2.20.1



[PATCH V4 0/5] block: don't acquire .sysfs_lock before removing mq & iosched kobjects

2019-08-27 Thread Ming Lei
Hi,

The 1st 3 patches cleans up current uses on q->sysfs_lock.

The 4th patch adds one helper for checking if queue is registered.

The last patch splits .sysfs_lock into two locks: one is only for
syncing .store/.show from sysfs, the other is for protecting kobject
registering/unregistering. Meanwhile, avoid acquiring .sysfs_lock when
removing the mq & iosched kobjects, so that the reported deadlock can
be fixed.

V4:
- address comments from Bart
- update comments, add comments about releasing sysfs_lock in 
elevator_switch_mq
- fix a race in blk_register_queue by holding sysfs_lock for
  emitting KOBJ_ADD
- only the 5th patch is updated

V3:
- drop the 4th patch in V2, which is wrong and not necessary
  for fixing this deadlock
- replace comment with one WARN_ON_ONCE() in patch 2
- add reviewed-by tag

V2:
- remove several uses on .sysfs_lock
- Remove blk_mq_register_dev()
- add one helper for checking queue registered
- split .sysfs_lock into two locks


Bart Van Assche (1):
  block: Remove blk_mq_register_dev()

Ming Lei (4):
  block: don't hold q->sysfs_lock in elevator_init_mq
  blk-mq: don't hold q->sysfs_lock in blk_mq_map_swqueue
  block: add helper for checking if queue is registered
  block: split .sysfs_lock into two locks

 block/blk-core.c   |  1 +
 block/blk-mq-sysfs.c   | 23 --
 block/blk-mq.c |  7 -
 block/blk-sysfs.c  | 50 +
 block/blk-wbt.c|  2 +-
 block/blk.h|  2 +-
 block/elevator.c   | 71 +++---
 include/linux/blk-mq.h |  1 -
 include/linux/blkdev.h |  2 ++
 9 files changed, 94 insertions(+), 65 deletions(-)

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Cc: Damien Le Moal 

-- 
2.20.1



[PATCH V4 1/5] block: Remove blk_mq_register_dev()

2019-08-27 Thread Ming Lei
From: Bart Van Assche 

This function has no callers. Hence remove it.

Cc: Christoph Hellwig 
Cc: Ming Lei 
Cc: Hannes Reinecke 
Signed-off-by: Bart Van Assche 
---
 block/blk-mq-sysfs.c   | 11 ---
 include/linux/blk-mq.h |  1 -
 2 files changed, 12 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index d6e1a9bd7131..6ddde3774ebe 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -349,17 +349,6 @@ int __blk_mq_register_dev(struct device *dev, struct 
request_queue *q)
return ret;
 }
 
-int blk_mq_register_dev(struct device *dev, struct request_queue *q)
-{
-   int ret;
-
-   mutex_lock(&q->sysfs_lock);
-   ret = __blk_mq_register_dev(dev, q);
-   mutex_unlock(&q->sysfs_lock);
-
-   return ret;
-}
-
 void blk_mq_sysfs_unregister(struct request_queue *q)
 {
struct blk_mq_hw_ctx *hctx;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 21cebe901ac0..62a3bb715899 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -253,7 +253,6 @@ struct request_queue *blk_mq_init_sq_queue(struct 
blk_mq_tag_set *set,
const struct blk_mq_ops *ops,
unsigned int queue_depth,
unsigned int set_flags);
-int blk_mq_register_dev(struct device *, struct request_queue *);
 void blk_mq_unregister_dev(struct device *, struct request_queue *);
 
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
-- 
2.20.1



[PATCH V4 2/5] block: don't hold q->sysfs_lock in elevator_init_mq

2019-08-27 Thread Ming Lei
The original comment says:

q->sysfs_lock must be held to provide mutual exclusion between
elevator_switch() and here.

That comment is simply wrong. elevator_init_mq() is only called from
blk_mq_init_allocated_queue(), which is always called before the request
queue is registered via blk_register_queue(), for both dm-rq and normal
rq-based drivers. However, the queue's kobject is only exposed and added to
sysfs in blk_register_queue(), so there is no such race between
elevator_switch() and elevator_init_mq().

So avoid holding q->sysfs_lock in elevator_init_mq().

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Cc: Damien Le Moal 
Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/elevator.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 2f17d66d0e61..33c15fb54ed1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -607,23 +607,19 @@ int elevator_init_mq(struct request_queue *q)
if (q->nr_hw_queues != 1)
return 0;
 
-   /*
-* q->sysfs_lock must be held to provide mutual exclusion between
-* elevator_switch() and here.
-*/
-   mutex_lock(&q->sysfs_lock);
+   WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags));
+
if (unlikely(q->elevator))
-   goto out_unlock;
+   goto out;
 
e = elevator_get(q, "mq-deadline", false);
if (!e)
-   goto out_unlock;
+   goto out;
 
err = blk_mq_init_sched(q, e);
if (err)
elevator_put(e);
-out_unlock:
-   mutex_unlock(&q->sysfs_lock);
+out:
return err;
 }
 
-- 
2.20.1



Re: [PATCH V3 5/5] block: split .sysfs_lock into two locks

2019-08-27 Thread Ming Lei
On Mon, Aug 26, 2019 at 09:24:03AM -0700, Bart Van Assche wrote:
> On 8/25/19 7:51 PM, Ming Lei wrote:
> > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> > index 5b0b5224cfd4..5941a0176f87 100644
> > --- a/block/blk-sysfs.c
> > +++ b/block/blk-sysfs.c
> > @@ -938,6 +938,7 @@ int blk_register_queue(struct gendisk *disk)
> > int ret;
> > struct device *dev = disk_to_dev(disk);
> > struct request_queue *q = disk->queue;
> > +   bool has_elevator = false;
> > if (WARN_ON(!q))
> > return -ENXIO;
> > @@ -945,7 +946,6 @@ int blk_register_queue(struct gendisk *disk)
> > WARN_ONCE(blk_queue_registered(q),
> >   "%s is registering an already registered queue\n",
> >   kobject_name(&dev->kobj));
> > -   blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
> > /*
> >  * SCSI probing may synchronously create and destroy a lot of
> > @@ -966,7 +966,7 @@ int blk_register_queue(struct gendisk *disk)
> > return ret;
> > /* Prevent changes through sysfs until registration is completed. */
> > -   mutex_lock(&q->sysfs_lock);
> > +   mutex_lock(&q->sysfs_dir_lock);
> 
> Does mutex_lock(&q->sysfs_dir_lock) really protect against changes of the
> I/O scheduler through sysfs or does it only protect against concurrent sysfs
> object creation and removal?

It is only for protecting against concurrent sysfs object creation and removal.

> In other words, should the comment above this
> mutex lock call be updated?

Yeah, it should be removed.

> 
> > @@ -987,26 +987,37 @@ int blk_register_queue(struct gendisk *disk)
> > blk_mq_debugfs_register(q);
> > }
> > -   kobject_uevent(&q->kobj, KOBJ_ADD);
> > -
> > -   wbt_enable_default(q);
> > -
> > -   blk_throtl_register_queue(q);
> > -
> > +   /*
> > +* The queue's kobject ADD uevent isn't sent out, also the
> > +* flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator
> > +* switch won't happen at all.
> > +*/
> > if (q->elevator) {
> > -   ret = elv_register_queue(q);
> > +   ret = elv_register_queue(q, false);
> > if (ret) {
> > -   mutex_unlock(&q->sysfs_lock);
> > -   kobject_uevent(&q->kobj, KOBJ_REMOVE);
> > +   mutex_unlock(&q->sysfs_dir_lock);
> > kobject_del(&q->kobj);
> > blk_trace_remove_sysfs(dev);
> > kobject_put(&dev->kobj);
> > return ret;
> > }
> > +   has_elevator = true;
> > }
> 
> I think the reference to the kobject ADD event in the comment is misleading.
> If e.g. a request queue is registered, unregistered and reregistered
> quickly, can it happen that a udev rule for the ADD event triggered by the
> first registration is executed in the middle of the second registration? Is

It can happen, but this patch doesn't change anything about this
behavior.

> setting the REGISTERED flag later sufficient to fix the race against
> scheduler changes through sysfs?

Yes, it is enough. 

> If so, how about leaving out the reference
> to the kobject ADD event from the above comment?

OK.

> 
> > +   mutex_lock(&q->sysfs_lock);
> > +   blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
> > +   wbt_enable_default(q);
> > +   blk_throtl_register_queue(q);
> > +   mutex_unlock(&q->sysfs_lock);
> > +
> > +   /* Now everything is ready and send out KOBJ_ADD uevent */
> > +   kobject_uevent(&q->kobj, KOBJ_ADD);
> > +   if (has_elevator)
> > +   kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
> 
> Can it happen that immediately after mutex_unlock(&q->sysfs_lock) a script
> removes the I/O scheduler and hence makes the value of the 'has_elevator'
> variable stale? In other words, should emitting KOBJ_ADD also be protected
> by sysfs_lock?

Good catch, it could be fine to hold sysfs_lock for emitting KOBJ_ADD.

> 
> > @@ -1021,6 +1032,7 @@ EXPORT_SYMBOL_GPL(blk_register_queue);
> >   void blk_unregister_queue(struct gendisk *disk)
> >   {
> > struct request_queue *q = disk->queue;
> > +   bool has_elevator;
> > if (WARN_ON(!q))
> > return;
> > @@ -1035,25 +1047,25 @@ void blk_unregister_queue(struct gendisk *disk)
> >  * concurrent elv_iosched_store() calls.
> >  */
> > mutex_lock(&q->sysfs_lock);

[PATCH V3 5/5] block: split .sysfs_lock into two locks

2019-08-25 Thread Ming Lei
 __se_sys_delete_module+0x204/0x337
 ? free_module+0x39f/0x39f
 ? blkcg_maybe_throttle_current+0x8a/0x718
 ? rwlock_bug+0x62/0x62
 ? __blkcg_punt_bio_submit+0xd0/0xd0
 ? trace_hardirqs_on_thunk+0x1a/0x20
 ? mark_held_locks+0x1f/0x7a
 ? do_syscall_64+0x4c/0x295
 do_syscall_64+0xa7/0x295
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7fb696cdbe6b
Code: 73 01 c3 48 8b 0d 1d 20 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 
1f 84 00 00 008
RSP: 002b:7ffec9588788 EFLAGS: 0206 ORIG_RAX: 00b0
RAX: ffda RBX: 559e589137c0 RCX: 7fb696cdbe6b
RDX: 000a RSI: 0800 RDI: 559e58913828
RBP:  R08: 7ffec9587701 R09: 
R10: 7fb696d4eae0 R11: 0206 R12: 7ffec95889b0
R13: 7ffec95896b3 R14: 559e58913260 R15: 0000559e589137c0

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-core.c   |  1 +
 block/blk-mq-sysfs.c   | 12 +--
 block/blk-sysfs.c  | 46 ++
 block/blk.h|  2 +-
 block/elevator.c   | 46 ++
 include/linux/blkdev.h |  1 +
 6 files changed, 76 insertions(+), 32 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 919629ce4015..2792f7cf7bef 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
mutex_init(&q->blk_trace_mutex);
 #endif
mutex_init(&q->sysfs_lock);
+   mutex_init(&q->sysfs_dir_lock);
spin_lock_init(&q->queue_lock);
 
init_waitqueue_head(&q->mq_freeze_wq);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 6ddde3774ebe..a0d3ce30fa08 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -270,7 +270,7 @@ void blk_mq_unregister_dev(struct device *dev, struct 
request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
 
-   lockdep_assert_held(&q->sysfs_lock);
+   lockdep_assert_held(&q->sysfs_dir_lock);
 
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
@@ -320,7 +320,7 @@ int __blk_mq_register_dev(struct device *dev, struct 
request_queue *q)
int ret, i;
 
WARN_ON_ONCE(!q->kobj.parent);
-   lockdep_assert_held(&q->sysfs_lock);
+   lockdep_assert_held(&q->sysfs_dir_lock);
 
ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
@@ -354,7 +354,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
 
-   mutex_lock(&q->sysfs_lock);
+   mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
 
@@ -362,7 +362,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
blk_mq_unregister_hctx(hctx);
 
 unlock:
-   mutex_unlock(&q->sysfs_lock);
+   mutex_unlock(&q->sysfs_dir_lock);
 }
 
 int blk_mq_sysfs_register(struct request_queue *q)
@@ -370,7 +370,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
 
-   mutex_lock(&q->sysfs_lock);
+   mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
 
@@ -381,7 +381,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
}
 
 unlock:
-   mutex_unlock(&q->sysfs_lock);
+   mutex_unlock(&q->sysfs_dir_lock);
 
return ret;
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5b0b5224cfd4..5941a0176f87 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -938,6 +938,7 @@ int blk_register_queue(struct gendisk *disk)
int ret;
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
+   bool has_elevator = false;
 
if (WARN_ON(!q))
return -ENXIO;
@@ -945,7 +946,6 @@ int blk_register_queue(struct gendisk *disk)
WARN_ONCE(blk_queue_registered(q),
  "%s is registering an already registered queue\n",
  kobject_name(&dev->kobj));
-   blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
 
/*
 * SCSI probing may synchronously create and destroy a lot of
@@ -966,7 +966,7 @@ int blk_register_queue(struct gendisk *disk)
return ret;
 
/* Prevent changes through sysfs until registration is completed. */
-   mutex_lock(&q->sysfs_lock);
+   mutex_lock(&q->sysfs_dir_lock);
 
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
if (ret < 0)

[PATCH V3 4/5] block: add helper for checking if queue is registered

2019-08-25 Thread Ming Lei
There are 4 users which check if queue is registered, so add one helper
to check it.

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-sysfs.c  | 4 ++--
 block/blk-wbt.c| 2 +-
 block/elevator.c   | 2 +-
 include/linux/blkdev.h | 1 +
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 977c659dcd18..5b0b5224cfd4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -942,7 +942,7 @@ int blk_register_queue(struct gendisk *disk)
if (WARN_ON(!q))
return -ENXIO;
 
-   WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+   WARN_ONCE(blk_queue_registered(q),
  "%s is registering an already registered queue\n",
  kobject_name(&dev->kobj));
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@@ -1026,7 +1026,7 @@ void blk_unregister_queue(struct gendisk *disk)
return;
 
/* Return early if disk->queue was never registered. */
-   if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+   if (!blk_queue_registered(q))
return;
 
/*
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 313f45a37e9d..c4d3089e47f7 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -656,7 +656,7 @@ void wbt_enable_default(struct request_queue *q)
return;
 
/* Queue not registered? Maybe shutting down... */
-   if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+   if (!blk_queue_registered(q))
return;
 
if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
diff --git a/block/elevator.c b/block/elevator.c
index 33c15fb54ed1..03d923196569 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -656,7 +656,7 @@ static int __elevator_change(struct request_queue *q, const 
char *name)
struct elevator_type *e;
 
/* Make sure queue is not in the middle of being removed */
-   if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+   if (!blk_queue_registered(q))
return -ENOENT;
 
/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 167bf879f072..6041755984f4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -647,6 +647,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct 
request_queue *q);
 #define blk_queue_quiesced(q)  test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
 #define blk_queue_pm_only(q)   atomic_read(&(q)->pm_only)
 #define blk_queue_fua(q)   test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
+#define blk_queue_registered(q)test_bit(QUEUE_FLAG_REGISTERED, 
&(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
-- 
2.20.1



[PATCH V3 0/5] block: don't acquire .sysfs_lock before removing mq & iosched kobjects

2019-08-25 Thread Ming Lei
Hi,

The 1st 3 patches cleans up current uses on q->sysfs_lock.

The 4th patch adds one helper for checking if queue is registered.

The last patch splits .sysfs_lock into two locks: one is only for
syncing .store/.show from sysfs, the other is for protecting kobject
registering/unregistering. Meanwhile, avoid acquiring .sysfs_lock when
removing the mq & iosched kobjects, so that the reported deadlock can
be fixed.

V3:
- drop the 4th patch in V2, which is wrong and not necessary
  for fixing this deadlock
- replace comment with one WARN_ON_ONCE() in patch 2
- add reviewed-by tag

V2:
- remove several uses on .sysfs_lock
- Remove blk_mq_register_dev()
- add one helper for checking queue registered
- split .sysfs_lock into two locks


Bart Van Assche (1):
  block: Remove blk_mq_register_dev()

Ming Lei (4):
  block: don't hold q->sysfs_lock in elevator_init_mq
  blk-mq: don't hold q->sysfs_lock in blk_mq_map_swqueue
  block: add helper for checking if queue is registered
  block: split .sysfs_lock into two locks

 block/blk-core.c   |  1 +
 block/blk-mq-sysfs.c   | 23 
 block/blk-mq.c |  7 -
 block/blk-sysfs.c  | 50 +-
 block/blk-wbt.c|  2 +-
 block/blk.h|  2 +-
 block/elevator.c   | 62 ++
 include/linux/blk-mq.h |  1 -
 include/linux/blkdev.h |  2 ++
 9 files changed, 86 insertions(+), 64 deletions(-)

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Cc: Damien Le Moal 

-- 
2.20.1



[PATCH V3 1/5] block: Remove blk_mq_register_dev()

2019-08-25 Thread Ming Lei
From: Bart Van Assche 

This function has no callers. Hence remove it.

Cc: Christoph Hellwig 
Cc: Ming Lei 
Cc: Hannes Reinecke 
Signed-off-by: Bart Van Assche 
---
 block/blk-mq-sysfs.c   | 11 ---
 include/linux/blk-mq.h |  1 -
 2 files changed, 12 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index d6e1a9bd7131..6ddde3774ebe 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -349,17 +349,6 @@ int __blk_mq_register_dev(struct device *dev, struct 
request_queue *q)
return ret;
 }
 
-int blk_mq_register_dev(struct device *dev, struct request_queue *q)
-{
-   int ret;
-
-   mutex_lock(&q->sysfs_lock);
-   ret = __blk_mq_register_dev(dev, q);
-   mutex_unlock(&q->sysfs_lock);
-
-   return ret;
-}
-
 void blk_mq_sysfs_unregister(struct request_queue *q)
 {
struct blk_mq_hw_ctx *hctx;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 21cebe901ac0..62a3bb715899 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -253,7 +253,6 @@ struct request_queue *blk_mq_init_sq_queue(struct 
blk_mq_tag_set *set,
const struct blk_mq_ops *ops,
unsigned int queue_depth,
unsigned int set_flags);
-int blk_mq_register_dev(struct device *, struct request_queue *);
 void blk_mq_unregister_dev(struct device *, struct request_queue *);
 
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
-- 
2.20.1



[PATCH V3 3/5] blk-mq: don't hold q->sysfs_lock in blk_mq_map_swqueue

2019-08-25 Thread Ming Lei
blk_mq_map_swqueue() is called from blk_mq_init_allocated_queue()
and blk_mq_update_nr_hw_queues(). For the former caller, the kobject
isn't exposed to userspace yet. For the latter caller, hctx sysfs entries
and debugfs are un-registered before updating nr_hw_queues.

On the other hand, commit 2f8f1336a48b ("blk-mq: always free hctx after
request queue is freed") moves freeing hctx into queue's release
handler, so there won't be a race with the queue release path either.

So don't hold q->sysfs_lock in blk_mq_map_swqueue().

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6968de9d7402..b0ee0cac737f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2456,11 +2456,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
 
-   /*
-* Avoid others reading imcomplete hctx->cpumask through sysfs
-*/
-   mutex_lock(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@@ -2521,8 +2516,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
HCTX_TYPE_DEFAULT, i);
}
 
-   mutex_unlock(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
/*
 * If no software queues are mapped to this hardware queue,
-- 
2.20.1



[PATCH V3 2/5] block: don't hold q->sysfs_lock in elevator_init_mq

2019-08-25 Thread Ming Lei
The original comment says:

q->sysfs_lock must be held to provide mutual exclusion between
elevator_switch() and here.

That comment is simply wrong. elevator_init_mq() is only called from
blk_mq_init_allocated_queue(), which is always called before the request
queue is registered via blk_register_queue(), for both dm-rq and normal
rq-based drivers. However, the queue's kobject is only exposed and added to
sysfs in blk_register_queue(), so there is no such race between
elevator_switch() and elevator_init_mq().

So avoid holding q->sysfs_lock in elevator_init_mq().

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Cc: Damien Le Moal 
Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/elevator.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 2f17d66d0e61..33c15fb54ed1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -607,23 +607,19 @@ int elevator_init_mq(struct request_queue *q)
if (q->nr_hw_queues != 1)
return 0;
 
-   /*
-* q->sysfs_lock must be held to provide mutual exclusion between
-* elevator_switch() and here.
-*/
-   mutex_lock(&q->sysfs_lock);
+   WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags));
+
if (unlikely(q->elevator))
-   goto out_unlock;
+   goto out;
 
e = elevator_get(q, "mq-deadline", false);
if (!e)
-   goto out_unlock;
+   goto out;
 
err = blk_mq_init_sched(q, e);
if (err)
elevator_put(e);
-out_unlock:
-   mutex_unlock(&q->sysfs_lock);
+out:
return err;
 }
 
-- 
2.20.1



Re: [PATCH V2 4/6] blk-mq: don't hold q->sysfs_lock in blk_mq_realloc_hw_ctxs()

2019-08-25 Thread Ming Lei
On Wed, Aug 21, 2019 at 08:56:36AM -0700, Bart Van Assche wrote:
> On 8/21/19 2:15 AM, Ming Lei wrote:
> > blk_mq_realloc_hw_ctxs() is called from blk_mq_init_allocated_queue()
> > and blk_mq_update_nr_hw_queues(). For the former caller, the kobject
> > isn't exposed to userspace yet. For the latter caller, sysfs/debugfs
> > is un-registered before updating nr_hw_queues.
> > 
> > On the other hand, commit 2f8f1336a48b ("blk-mq: always free hctx after
> > request queue is freed") moves freeing hctx into queue's release
> > handler, so there won't be race with queue release path too.
> > 
> > So don't hold q->sysfs_lock in blk_mq_realloc_hw_ctxs().
> 
> How about mentioning that the locking at the start of
> blk_mq_update_nr_hw_queues() serializes all blk_mq_realloc_hw_ctxs() calls
> that happen after a queue has been registered in sysfs?

This patch is actually wrong because an elevator switch may still happen
while nr_hw_queues is being updated, since only the hctx sysfs entries are
unregistered and "queue/scheduler" is still visible to userspace.

So I will drop this patch in V3.

Thanks,
Ming


Re: [PATCH V2 3/6] blk-mq: don't hold q->sysfs_lock in blk_mq_map_swqueue

2019-08-25 Thread Ming Lei
On Wed, Aug 21, 2019 at 08:53:52AM -0700, Bart Van Assche wrote:
> On 8/21/19 2:15 AM, Ming Lei wrote:
> > blk_mq_map_swqueue() is called from blk_mq_init_allocated_queue()
> > and blk_mq_update_nr_hw_queues(). For the former caller, the kobject
> > isn't exposed to userspace yet. For the latter caller, sysfs/debugfs
> > is un-registered before updating nr_hw_queues.
> > 
> > On the other hand, commit 2f8f1336a48b ("blk-mq: always free hctx after
> > request queue is freed") moves freeing hctx into queue's release
> > handler, so there won't be race with queue release path too.
> > 
> > So don't hold q->sysfs_lock in blk_mq_map_swqueue().
> > 
> > Cc: Christoph Hellwig 
> > Cc: Hannes Reinecke 
> > Cc: Greg KH 
> > Cc: Mike Snitzer 
> > Cc: Bart Van Assche 
> > Signed-off-by: Ming Lei 
> > ---
> >   block/blk-mq.c | 7 ---
> >   1 file changed, 7 deletions(-)
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 6968de9d7402..b0ee0cac737f 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -2456,11 +2456,6 @@ static void blk_mq_map_swqueue(struct request_queue 
> > *q)
> > struct blk_mq_ctx *ctx;
> > struct blk_mq_tag_set *set = q->tag_set;
> > -   /*
> > -* Avoid others reading imcomplete hctx->cpumask through sysfs
> > -*/
> > -   mutex_lock(&q->sysfs_lock);
> > -
> > queue_for_each_hw_ctx(q, hctx, i) {
> > cpumask_clear(hctx->cpumask);
> > hctx->nr_ctx = 0;
> > @@ -2521,8 +2516,6 @@ static void blk_mq_map_swqueue(struct request_queue 
> > *q)
> > HCTX_TYPE_DEFAULT, i);
> > }
> > -   mutex_unlock(&q->sysfs_lock);
> > -
> > queue_for_each_hw_ctx(q, hctx, i) {
> > /*
> >  * If no software queues are mapped to this hardware queue,
> > 
> 
> How about adding WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED,
> &q->queue_flags)) ?

q->kobj isn't unregistered before updating nr_hw_queues, only
hctx->kobj is, so we can't add the warning here.


Thanks,
Ming


Re: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts

2019-08-24 Thread Ming Lei
On Sat, Aug 24, 2019 at 12:27:18AM +, Long Li wrote:
> >>>Subject: Re: [PATCH 3/3] nvme: complete request in work queue on CPU
> >>>with flooded interrupts
> >>>
> >>>On Tue, Aug 20, 2019 at 10:33:38AM -0700, Sagi Grimberg wrote:
> 
>  > From: Long Li 
>  >
>  > When a NVMe hardware queue is mapped to several CPU queues, it is
>  > possible that the CPU this hardware queue is bound to is flooded by
>  > returning I/O for other CPUs.
>  >
>  > For example, consider the following scenario:
>  > 1. CPU 0, 1, 2 and 3 share the same hardware queue 2. the hardware
>  > queue interrupts CPU 0 for I/O response 3. processes from CPU 1, 2
>  > and 3 keep sending I/Os
>  >
>  > CPU 0 may be flooded with interrupts from NVMe device that are I/O
>  > responses for CPU 1, 2 and 3. Under heavy I/O load, it is possible
>  > that CPU 0 spends all the time serving NVMe and other system
>  > interrupts, but doesn't have a chance to run in process context.
>  >
>  > To fix this, CPU 0 can schedule a work to complete the I/O request
>  > when it detects the scheduler is not making progress. This serves
> >>>multiple purposes:
>  >
>  > 1. This CPU has to be scheduled to complete the request. The other
>  > CPUs can't issue more I/Os until some previous I/Os are completed.
>  > This helps this CPU get out of NVMe interrupts.
>  >
>  > 2. This acts a throttling mechanisum for NVMe devices, in that it
>  > can not starve a CPU while servicing I/Os from other CPUs.
>  >
>  > 3. This CPU can make progress on RCU and other work items on its
> >>>queue.
> 
>  The problem is indeed real, but this is the wrong approach in my mind.
> 
>  We already have irqpoll which takes care proper budgeting polling
>  cycles and not hogging the cpu.
> >>>
> >>>The issue isn't unique to NVMe, and can be any fast devices which
> >>>interrupts CPU too frequently, meantime the interrupt/softirq handler may
> >>>take a bit much time, then CPU is easy to be lockup by the interrupt/sofirq
> >>>handler, especially in case that multiple submission CPUs vs. single
> >>>completion CPU.
> >>>
> >>>Some SCSI devices has the same problem too.
> >>>
> >>>Could we consider to add one generic mechanism to cover this kind of
> >>>problem?
> >>>
> >>>One approach I thought of is to allocate one backup thread for handling 
> >>>such
> >>>interrupt, which can be marked as IRQF_BACKUP_THREAD by drivers.
> >>>
> >>>Inside do_IRQ(), irqtime is accounted, before calling action->handler(),
> >>>check if this CPU has taken too long time for handling IRQ(interrupt or
> >>>softirq) and see if this CPU could be lock up. If yes, wakeup the backup
> 
> How do you know if this CPU is spending all the time in do_IRQ()?
> 
> Is it something like:
> If (IRQ_time /elapsed_time > a threshold value)
>   wake up the backup thread

Yeah, the above could work in theory.

Another approach I thought of is to monitor the average irq gap time on
each CPU.

We could use an EWMA (Exponential Weighted Moving Average) to do it
simply, such as:

curr_irq_gap(cpu) = start time of current do_IRQ() on 'cpu' -
                    end time of previous do_IRQ() on 'cpu'

avg_irq_gap(cpu)  = weight_prev * avg_irq_gap(cpu) +
                    weight_curr * curr_irq_gap(cpu)

note:
weight_prev + weight_curr = 1

When avg_irq_gap(cpu) drops below a small enough threshold, we consider
an irq flood to be detected.

'weight_prev' could be chosen big enough to avoid triggering on
short-lived bursts.
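
To make the formula concrete, here is a minimal userspace sketch of the
EWMA update (not kernel code; the weights, the threshold and all names
below are made up for illustration):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define WEIGHT_PREV_NUM   127     /* weight_prev = 127/128, big enough to ignore short bursts */
#define WEIGHT_DENOM      128
#define GAP_THRESHOLD_NS  2000ULL /* "small enough" average gap suggesting an irq flood */

struct irq_flood_state {
	uint64_t last_irq_end_ns;   /* end time of the previous do_IRQ() on this CPU */
	uint64_t avg_irq_gap_ns;    /* EWMA of the gap between interrupts */
};

bool irq_flood_detected(struct irq_flood_state *st, uint64_t now_ns)
{
	uint64_t gap = now_ns - st->last_irq_end_ns;

	/* avg = weight_prev * avg + weight_curr * curr, with weight_prev + weight_curr = 1 */
	st->avg_irq_gap_ns = (WEIGHT_PREV_NUM * st->avg_irq_gap_ns +
			      (WEIGHT_DENOM - WEIGHT_PREV_NUM) * gap) / WEIGHT_DENOM;

	return st->avg_irq_gap_ns < GAP_THRESHOLD_NS;
}

int main(void)
{
	struct irq_flood_state st = { .last_irq_end_ns = 0, .avg_irq_gap_ns = 100000 };
	uint64_t now = 0;

	/* simulate interrupts arriving every 1us: the average decays toward 1000ns */
	for (int i = 0; i < 2000; i++) {
		now += 1000;
		if (irq_flood_detected(&st, now)) {
			printf("irq flood detected after %d interrupts\n", i + 1);
			break;
		}
		st.last_irq_end_ns = now;   /* treat handler end ~= start, for simplicity */
	}
	return 0;
}

With weight_prev = 127/128 a short burst barely moves the average, while
a sustained flood pulls it below the threshold after a few hundred
interrupts.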


Thanks,
Ming


Re: [PATCH V2 6/6] block: split .sysfs_lock into two locks

2019-08-23 Thread Ming Lei
On Fri, Aug 23, 2019 at 09:46:48AM -0700, Bart Van Assche wrote:
> On 8/21/19 2:15 AM, Ming Lei wrote:
> > @@ -966,7 +966,7 @@ int blk_register_queue(struct gendisk *disk)
> > return ret;
> > /* Prevent changes through sysfs until registration is completed. */
> > -   mutex_lock(&q->sysfs_lock);
> > +   mutex_lock(&q->sysfs_dir_lock);
> > ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
> > if (ret < 0) {
> > @@ -987,26 +987,37 @@ int blk_register_queue(struct gendisk *disk)
> > blk_mq_debugfs_register(q);
> > }
> > -   kobject_uevent(&q->kobj, KOBJ_ADD);
> > -
> > -   wbt_enable_default(q);
> > -
> > -   blk_throtl_register_queue(q);
> > -
> > +   /*
> > +* The queue's kobject ADD uevent isn't sent out, also the
> > +* flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator
> > +* switch won't happen at all.
> > +*/
> > if (q->elevator) {
> > -   ret = elv_register_queue(q);
> > +   ret = elv_register_queue(q, false);
> > if (ret) {
> 
> The above changes seems risky to me. In contrast with what the comment
> suggests, user space code is not required to wait for KOBJ_ADD event to
> start using sysfs attributes. I think user space code *can* write into the
> request queue I/O scheduler sysfs attribute after the kobject_add() call has
> finished and before kobject_uevent(&q->kobj, KOBJ_ADD) is called.

Yeah, a crazy userspace may simply poll the sysfs entries and start to
READ/WRITE before seeing the KOBJ_ADD event.

However, we have another protection via the queue flag
QUEUE_FLAG_REGISTERED, which is set only after everything is done. So
even if such an early store from userspace arrives, the elevator switch
still can't happen, because the flag is checked in __elevator_change().
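
Roughly, the ordering that makes such an early store harmless looks like
the following userspace sketch (all names here are illustrative, not the
actual kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool queue_registered = false;

/* sysfs ->store() path, e.g. an elevator switch request */
static int elevator_change(const char *name)
{
	/* queue not fully registered yet (or being removed): reject early */
	if (!atomic_load(&queue_registered))
		return -2;  /* stands in for -ENOENT */

	printf("switching elevator to %s\n", name);
	return 0;
}

static void register_queue(void)
{
	/* ... add kobjects, register elevator, send KOBJ_ADD ... */
	atomic_store(&queue_registered, true);  /* the flag is published last */
}

int main(void)
{
	printf("early store: %d\n", elevator_change("mq-deadline")); /* rejected */
	register_queue();
	printf("late store:  %d\n", elevator_change("mq-deadline")); /* accepted */
	return 0;
}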

thanks,
Ming


Re: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts

2019-08-22 Thread Ming Lei
On Tue, Aug 20, 2019 at 10:33:38AM -0700, Sagi Grimberg wrote:
> 
> > From: Long Li 
> > 
> > When a NVMe hardware queue is mapped to several CPU queues, it is possible
> > that the CPU this hardware queue is bound to is flooded by returning I/O for
> > other CPUs.
> > 
> > For example, consider the following scenario:
> > 1. CPU 0, 1, 2 and 3 share the same hardware queue
> > 2. the hardware queue interrupts CPU 0 for I/O response
> > 3. processes from CPU 1, 2 and 3 keep sending I/Os
> > 
> > CPU 0 may be flooded with interrupts from NVMe device that are I/O responses
> > for CPU 1, 2 and 3. Under heavy I/O load, it is possible that CPU 0 spends
> > all the time serving NVMe and other system interrupts, but doesn't have a
> > chance to run in process context.
> > 
> > To fix this, CPU 0 can schedule a work to complete the I/O request when it
> > detects the scheduler is not making progress. This serves multiple purposes:
> > 
> > 1. This CPU has to be scheduled to complete the request. The other CPUs 
> > can't
> > issue more I/Os until some previous I/Os are completed. This helps this CPU
> > get out of NVMe interrupts.
> > 
> > 2. This acts a throttling mechanisum for NVMe devices, in that it can not
> > starve a CPU while servicing I/Os from other CPUs.
> > 
> > 3. This CPU can make progress on RCU and other work items on its queue.
> 
> The problem is indeed real, but this is the wrong approach in my mind.
> 
> We already have irqpoll which takes care proper budgeting polling
> cycles and not hogging the cpu.

The issue isn't unique to NVMe; it can happen with any fast device that
interrupts the CPU too frequently. If the interrupt/softirq handler also
takes a fair amount of time, the CPU is easily locked up by the
interrupt/softirq handler, especially in the case of multiple submission
CPUs vs. a single completion CPU.

Some SCSI devices have the same problem too.

Could we consider adding one generic mechanism to cover this kind of
problem?

One approach I thought of is to allocate one backup thread for handling
such interrupts, which drivers could mark with IRQF_BACKUP_THREAD.

Inside do_IRQ(), irqtime is accounted; before calling action->handler(),
check whether this CPU has spent too long handling IRQ work (interrupt
or softirq) and could be about to lock up. If so, wake up the backup
thread to handle the interrupt and avoid locking up this CPU.
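
For illustration only, here is a plain-C sketch of that dispatch
decision; the threshold and all helper names are invented, this is not
the proposed kernel change:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IRQ_TIME_THRESHOLD_PCT 90  /* made-up "too long in IRQ" cut-off */

/* Fraction of the recent window this CPU spent in hardirq/softirq context. */
static unsigned int cpu_irq_time_pct(uint64_t irq_time_ns, uint64_t elapsed_ns)
{
	return elapsed_ns ? (unsigned int)(irq_time_ns * 100 / elapsed_ns) : 0;
}

static void wake_backup_thread(int irq)
{
	printf("irq %d: deferring handler to backup thread\n", irq);
}

static void run_handler_inline(int irq)
{
	printf("irq %d: running handler in interrupt context\n", irq);
}

/* Decision point corresponding to "before calling action->handler()". */
static void dispatch_irq(int irq, uint64_t irq_time_ns, uint64_t elapsed_ns)
{
	if (cpu_irq_time_pct(irq_time_ns, elapsed_ns) > IRQ_TIME_THRESHOLD_PCT)
		wake_backup_thread(irq);   /* CPU looks close to lockup */
	else
		run_handler_inline(irq);   /* normal, low-overhead path */
}

int main(void)
{
	dispatch_irq(57, 95, 100);  /* 95% of the time in IRQ -> defer */
	dispatch_irq(57, 20, 100);  /* 20% -> handle inline */
	return 0;
}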

The threaded interrupt framework is already there, so this way could be
easier to implement. Meanwhile, most of the time the handler still runs
in interrupt context, so we avoid the performance loss when the CPU
isn't busy enough.

Any comment on this approach?

Thanks,
Ming


Re: [PATCH 3/3] xfs: alignment check bio buffers

2019-08-22 Thread Ming Lei
On Thu, Aug 22, 2019 at 05:14:40PM -0700, Christoph Hellwig wrote:
> On Thu, Aug 22, 2019 at 06:20:00PM +0800, Ming Lei wrote:
> > In theory, fs bio shouldn't care any DMA limits, which should have been done
> > on splitted bio for doing IO to device.
> > 
> > Also .dma_alignment isn't considered in blk_stack_limits(), so in case
> > of DM, MD or other stacking drivers, fs code won't know the accurate
> > .dma_alignment of underlying queues at all, and the stacking driver's
> > queue dma alignment is still 512.
> 
> Trying to handling alignment lower down means bounce buffering, so I
> don't think trying to hndle it is a sane idea.  I'd be much happier to
> say non-passthrough bios need 512 byte alignment, period.  That should
> cover all the sane cases and we can easily check for it.  The occasional
> device that would need larger alignment just needs to deal with it.

Yeah, I agree we need to avoid bounce buffering, and it is fine to
simply check for 512.

Also, considering the interface/protocol between fs and block layer, it
could make both sides happy to always align offset & length with the
logical block size. And that is reasonable for an fs bio.
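
As a rough illustration of that contract, a userspace sketch of the
check both sides could agree on ('lbs' stands in for
queue_logical_block_size() and is assumed to be a power of two):

#include <stdbool.h>
#include <stdio.h>

static bool fs_bio_vec_ok(unsigned int offset, unsigned int len,
			  unsigned int lbs)
{
	/* both the in-page offset and the length must be lbs-aligned */
	return !(offset & (lbs - 1)) && !(len & (lbs - 1));
}

int main(void)
{
	printf("%d\n", fs_bio_vec_ok(0, 4096, 512));   /* 1: aligned */
	printf("%d\n", fs_bio_vec_ok(8, 504, 512));    /* 0: sub-sector pieces */
	return 0;
}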


Thanks,
Ming


Re: [PATCH V2 6/6] block: split .sysfs_lock into two locks

2019-08-22 Thread Ming Lei
On Thu, Aug 22, 2019 at 12:52:54PM -0700, Bart Van Assche wrote:
> On 8/21/19 6:28 PM, Ming Lei wrote:
> > On Wed, Aug 21, 2019 at 09:18:08AM -0700, Bart Van Assche wrote:
> > > On 8/21/19 2:15 AM, Ming Lei wrote:
> > > > diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
> > > > index 31bbf10d8149..a4cc40ddda86 100644
> > > > --- a/block/blk-mq-sysfs.c
> > > > +++ b/block/blk-mq-sysfs.c
> > > > @@ -247,7 +247,7 @@ void blk_mq_unregister_dev(struct device *dev, 
> > > > struct request_queue *q)
> > > > struct blk_mq_hw_ctx *hctx;
> > > > int i;
> > > > -   lockdep_assert_held(&q->sysfs_lock);
> > > > +   lockdep_assert_held(&q->sysfs_dir_lock);
> > > > queue_for_each_hw_ctx(q, hctx, i)
> > > > blk_mq_unregister_hctx(hctx);
> > > > @@ -297,7 +297,7 @@ int __blk_mq_register_dev(struct device *dev, 
> > > > struct request_queue *q)
> > > > int ret, i;
> > > > WARN_ON_ONCE(!q->kobj.parent);
> > > > -   lockdep_assert_held(&q->sysfs_lock);
> > > > +   lockdep_assert_held(&q->sysfs_dir_lock);
> > > > ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", 
> > > > "mq");
> > > > if (ret < 0)
> > > 
> > > blk_mq_unregister_dev and __blk_mq_register_dev() are only used by
> > > blk_register_queue() and blk_unregister_queue(). It is the responsibility 
> > > of
> > > the callers of these function to serialize request queue registration and
> > > unregistration. Is it really necessary to hold a mutex around the
> > > blk_mq_unregister_dev and __blk_mq_register_dev() calls? Or in other 
> > > words,
> > > can it ever happen that multiple threads invoke one or both functions
> > > concurrently?
> > 
> > hctx kobjects can be removed and re-added via blk_mq_update_nr_hw_queues()
> > which may be called at the same time when queue is registering or
> > un-registering.
> 
> Shouldn't blk_register_queue() and blk_unregister_queue() be serialized
> against blk_mq_update_nr_hw_queues()? Allowing these calls to proceed

That is easier said than done. We depend on the callers to serialize
blk_register_queue() and blk_unregister_queue(), and there are several
locks involved in blk_mq_update_nr_hw_queues().

Right now the synchronization is done via .sysfs_lock, and so far we
haven't seen issues in this area. This patch just converts .sysfs_lock
into .sysfs_dir_lock for the same purpose.

If you have a simple and workable patch to serialize
blk_register_queue() and blk_unregister_queue() against
blk_mq_update_nr_hw_queues(), I am happy to review it. Otherwise please
consider doing it in the future; it shouldn't be a blocker for fixing
this deadlock, should it?


Thanks,
Ming


Re: [PATCH 3/3] xfs: alignment check bio buffers

2019-08-22 Thread Ming Lei
On Thu, Aug 22, 2019 at 01:08:52AM -0700, Christoph Hellwig wrote:
> On Thu, Aug 22, 2019 at 02:49:05PM +1000, Dave Chinner wrote:
> > On Thu, Aug 22, 2019 at 10:50:02AM +0800, Ming Lei wrote:
> > > It isn't correct to blk_rq_aligned() here because 'len' has to be logical 
> > > block
> > > size aligned, instead of DMA aligned only.
> 
> Even if len would have to be a multiple of the sector size, that doesn't
> mean calling blk_rq_aligned would be incorrect, just possibly not
> catching all issues.

In theory, an fs bio shouldn't care about any DMA limits; those should
be applied to the split bios used for doing IO to the device.

Also, .dma_alignment isn't considered in blk_stack_limits(), so in the
case of DM, MD or other stacking drivers, fs code won't know the
accurate .dma_alignment of the underlying queues at all, and the
stacking driver's queue dma alignment is still 512.

Also, supposing the check is added, I am a bit curious how the fs code
handles the failure; could you explain a bit about the failure handling?

Thanks, 
Ming


Re: [PATCH 3/3] xfs: alignment check bio buffers

2019-08-22 Thread Ming Lei
On Thu, Aug 22, 2019 at 02:49:05PM +1000, Dave Chinner wrote:
> On Thu, Aug 22, 2019 at 10:50:02AM +0800, Ming Lei wrote:
> > On Thu, Aug 22, 2019 at 8:06 AM Christoph Hellwig  
> > wrote:
> > >
> > > On Wed, Aug 21, 2019 at 06:38:20PM +1000, Dave Chinner wrote:
> > > > From: Dave Chinner 
> > > >
> > > > Add memory buffer alignment validation checks to bios built in XFS
> > > > to catch bugs that will result in silent data corruption in block
> > > > drivers that cannot handle unaligned memory buffers but don't
> > > > validate the incoming buffer alignment is correct.
> > > >
> > > > Known drivers with these issues are xenblk, brd and pmem.
> > > >
> > > > Despite there being nothing XFS specific to xfs_bio_add_page(), this
> > > > function was created to do the required validation because the block
> > > > layer developers that keep telling us that is not possible to
> > > > validate buffer alignment in bio_add_page(), and even if it was
> > > > possible it would be too much overhead to do at runtime.
> > >
> > > I really don't think we should life this to XFS, but instead fix it
> > > in the block layer.  And that is not only because I have a pending
> > > series lifting bits you are touching to the block layer..
> > >
> > > > +int
> > > > +xfs_bio_add_page(
> > > > + struct bio  *bio,
> > > > + struct page *page,
> > > > + unsigned intlen,
> > > > + unsigned intoffset)
> > > > +{
> > > > + struct request_queue*q = bio->bi_disk->queue;
> > > > + boolsame_page = false;
> > > > +
> > > > + if (WARN_ON_ONCE(!blk_rq_aligned(q, len, offset)))
> > > > + return -EIO;
> > > > +
> > > > + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
> > > > + if (bio_full(bio, len))
> > > > + return 0;
> > > > + __bio_add_page(bio, page, len, offset);
> > > > + }
> > > > + return len;
> > >
> > > I know Jens disagree, but with the amount of bugs we've been hitting
> > > thangs to slub (and I'm pretty sure we have a more hiding outside of
> > > XFS) I think we need to add the blk_rq_aligned check to bio_add_page.
> > 
> > It isn't correct to blk_rq_aligned() here because 'len' has to be logical 
> > block
> > size aligned, instead of DMA aligned only.
> 
> News to me.
> 
> AFAIA, the overall _IO_ that is being built needs to be a multiple
> of the logical block size in total size (i.e. bio->bi_iter.size)

Right.

> because sub sector IO is not allowed. But queue DMA limits are not
> defined in sectors - they define the scatter/gather DMA capability
> of the hardware, and that's what individual segments (bvecs) need to
> align to.  That's what blk_rq_aligned() checks here - that the bvec

A segment isn't the same thing as a bvec. We build segments via the
scatterlist interface from bvecs when the driver needs segments for DMA
between CPU and HBA. The built segments have to respect all kinds of
queue limits.

Now there are two kinds of bio: the fs bio, and the bio used for doing
IO from/to the device. The block layer splits an fs bio into bios of the
proper size for doing IO.

If one bvec with an un-aligned length is added to an fs bio, and this
bvec can't be merged with the following ones, how can the block layer
handle that? For example, if the bvec is un-aligned with the virt
boundary, a single bio is allocated for doing IO of just this bvec, and
sub-sector IO is generated.

> segment aligns to what the underlying driver(s) requires, not that
> the entire IO is sector sized and aligned.

Not every driver needs to handle segments; some drivers simply handle
single-page bvecs (pmem, brd, zram, ...) or multi-page bvecs (loop).

An un-aligned bvec may then cause trouble for the drivers that handle
single-page bvecs.

> 
> Also, think about multipage bvecs - the pages we are spanning here
> are contiguous pages, so this should end up merging them and turning
> it into a single multipage bvec whose length is sector size
> aligned...

This works for drivers that use segments, and most drivers are of this
type.

> 
> > Also not sure all users may setup bio->bi_disk well before adding page to 
> > bio,
> > since it is allowed to do that now.
> 
> XFS does, so I just don't care about random users of bio_add_page()
> in this patch. Somebody e

Re: [PATCH 3/3] xfs: alignment check bio buffers

2019-08-21 Thread Ming Lei
On Thu, Aug 22, 2019 at 8:06 AM Christoph Hellwig  wrote:
>
> On Wed, Aug 21, 2019 at 06:38:20PM +1000, Dave Chinner wrote:
> > From: Dave Chinner 
> >
> > Add memory buffer alignment validation checks to bios built in XFS
> > to catch bugs that will result in silent data corruption in block
> > drivers that cannot handle unaligned memory buffers but don't
> > validate the incoming buffer alignment is correct.
> >
> > Known drivers with these issues are xenblk, brd and pmem.
> >
> > Despite there being nothing XFS specific to xfs_bio_add_page(), this
> > function was created to do the required validation because the block
> > layer developers that keep telling us that is not possible to
> > validate buffer alignment in bio_add_page(), and even if it was
> > possible it would be too much overhead to do at runtime.
>
> I really don't think we should life this to XFS, but instead fix it
> in the block layer.  And that is not only because I have a pending
> series lifting bits you are touching to the block layer..
>
> > +int
> > +xfs_bio_add_page(
> > + struct bio  *bio,
> > + struct page *page,
> > + unsigned intlen,
> > + unsigned intoffset)
> > +{
> > + struct request_queue*q = bio->bi_disk->queue;
> > + boolsame_page = false;
> > +
> > + if (WARN_ON_ONCE(!blk_rq_aligned(q, len, offset)))
> > + return -EIO;
> > +
> > + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
> > + if (bio_full(bio, len))
> > + return 0;
> > + __bio_add_page(bio, page, len, offset);
> > + }
> > + return len;
>
> I know Jens disagree, but with the amount of bugs we've been hitting
> thangs to slub (and I'm pretty sure we have a more hiding outside of
> XFS) I think we need to add the blk_rq_aligned check to bio_add_page.

It isn't correct to use blk_rq_aligned() here, because 'len' has to be
logical block size aligned, not just DMA aligned.

Also, I'm not sure all users set up bio->bi_disk before adding pages to
the bio, since that is allowed now.

If a slub buffer crosses two pages, the block layer may not handle it at
all, even once the un-aligned 'offset' issue is solved.
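
For illustration, a plain-C sketch of the page-crossing case mentioned
above (PAGE_SIZE hard-coded to 4096 here):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

static bool crosses_page(uintptr_t addr, unsigned int len)
{
	/* true if the buffer straddles a page boundary */
	return (addr / PAGE_SIZE) != ((addr + len - 1) / PAGE_SIZE);
}

int main(void)
{
	/* 1024-byte buffer starting 512 bytes before a page boundary:
	 * sector aligned at the start, but it still spans two pages */
	uintptr_t addr = 2 * PAGE_SIZE - 512;

	printf("crosses page boundary: %d\n", crosses_page(addr, 1024)); /* 1 */
	return 0;
}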

Thanks,
Ming Lei


Re: [PATCH V2 6/6] block: split .sysfs_lock into two locks

2019-08-21 Thread Ming Lei
On Wed, Aug 21, 2019 at 09:18:08AM -0700, Bart Van Assche wrote:
> On 8/21/19 2:15 AM, Ming Lei wrote:
> > diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
> > index 31bbf10d8149..a4cc40ddda86 100644
> > --- a/block/blk-mq-sysfs.c
> > +++ b/block/blk-mq-sysfs.c
> > @@ -247,7 +247,7 @@ void blk_mq_unregister_dev(struct device *dev, struct 
> > request_queue *q)
> > struct blk_mq_hw_ctx *hctx;
> > int i;
> > -   lockdep_assert_held(&q->sysfs_lock);
> > +   lockdep_assert_held(&q->sysfs_dir_lock);
> > queue_for_each_hw_ctx(q, hctx, i)
> > blk_mq_unregister_hctx(hctx);
> > @@ -297,7 +297,7 @@ int __blk_mq_register_dev(struct device *dev, struct 
> > request_queue *q)
> > int ret, i;
> > WARN_ON_ONCE(!q->kobj.parent);
> > -   lockdep_assert_held(&q->sysfs_lock);
> > +   lockdep_assert_held(&q->sysfs_dir_lock);
> > ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
> > if (ret < 0)
> 
> blk_mq_unregister_dev and __blk_mq_register_dev() are only used by
> blk_register_queue() and blk_unregister_queue(). It is the responsibility of
> the callers of these function to serialize request queue registration and
> unregistration. Is it really necessary to hold a mutex around the
> blk_mq_unregister_dev and __blk_mq_register_dev() calls? Or in other words,
> can it ever happen that multiple threads invoke one or both functions
> concurrently?

hctx kobjects can be removed and re-added via blk_mq_update_nr_hw_queues(),
which may be called at the same time as the queue is being registered or
un-registered.

Also, the change is simpler if a new lock replaces the old one.

> 
> > @@ -331,7 +331,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
> > struct blk_mq_hw_ctx *hctx;
> > int i;
> > -   mutex_lock(&q->sysfs_lock);
> > +   mutex_lock(&q->sysfs_dir_lock);
> > if (!q->mq_sysfs_init_done)
> > goto unlock;
> > @@ -339,7 +339,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
> > blk_mq_unregister_hctx(hctx);
> >   unlock:
> > -   mutex_unlock(&q->sysfs_lock);
> > +   mutex_unlock(&q->sysfs_dir_lock);
> >   }
> >   int blk_mq_sysfs_register(struct request_queue *q)
> > @@ -347,7 +347,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
> > struct blk_mq_hw_ctx *hctx;
> > int i, ret = 0;
> > -   mutex_lock(&q->sysfs_lock);
> > +   mutex_lock(&q->sysfs_dir_lock);
> > if (!q->mq_sysfs_init_done)
> > goto unlock;
> > @@ -358,7 +358,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
> > }
> >   unlock:
> > -   mutex_unlock(&q->sysfs_lock);
> > +   mutex_unlock(&q->sysfs_dir_lock);
> > return ret;
> >   }
> 
> blk_mq_sysfs_unregister() and blk_mq_sysfs_register() are only used by
> __blk_mq_update_nr_hw_queues(). Calls to that function are serialized by the
> tag_list_lock mutex. Is it really necessary to use any locking inside these
> functions?

hctx kobjects can be removed and re-added via blk_mq_update_nr_hw_queues(),
which may be called at the same time as the queue is being registered or
un-registered.

Also, the change is simpler if a new lock replaces the old one.

> 
> > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> > index 5b0b5224cfd4..5941a0176f87 100644
> > --- a/block/blk-sysfs.c
> > +++ b/block/blk-sysfs.c
> > @@ -938,6 +938,7 @@ int blk_register_queue(struct gendisk *disk)
> > int ret;
> > struct device *dev = disk_to_dev(disk);
> > struct request_queue *q = disk->queue;
> > +   bool has_elevator = false;
> > if (WARN_ON(!q))
> > return -ENXIO;
> > @@ -945,7 +946,6 @@ int blk_register_queue(struct gendisk *disk)
> > WARN_ONCE(blk_queue_registered(q),
> >   "%s is registering an already registered queue\n",
> >   kobject_name(&dev->kobj));
> > -   blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
> > /*
> >  * SCSI probing may synchronously create and destroy a lot of
> > @@ -966,7 +966,7 @@ int blk_register_queue(struct gendisk *disk)
> > return ret;
> > /* Prevent changes through sysfs until registration is completed. */
> > -   mutex_lock(&q->sysfs_lock);
> > +   mutex_lock(&q->sysfs_dir_lock);
> > ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
>

Re: [PATCH] block: don't acquire .sysfs_lock before removing mq & iosched kobjects

2019-08-21 Thread Ming Lei
On Wed, Aug 21, 2019 at 08:41:32AM -0700, Bart Van Assche wrote:
> On 8/20/19 8:00 PM, Ming Lei wrote:
> > On Tue, Aug 20, 2019 at 02:21:10PM -0700, Bart Van Assche wrote:
> > > - /*
> > > -  * Remove the sysfs attributes before unregistering the queue data
> > > -  * structures that can be modified through sysfs.
> > > -  */
> > >   if (queue_is_mq(q))
> > > - blk_mq_unregister_dev(disk_to_dev(disk), q);
> > > - mutex_unlock(&q->sysfs_lock);
> > > -
> > > + kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
> > 
> > Could you explain why you move the above line here?
> 
> I'm not sure whether kobject_del() deletes any objects attached to the
> deleted kobj. This change ensures that kobject_uevent() is called before the
> parent object of q->mq_kobj is deleted.

From the comment of kernfs_remove(), all subdirectories and files will
be removed.

kobject_del
sysfs_remove_dir
kernfs_remove

/**
 * kernfs_remove - remove a kernfs_node recursively
 * @kn: the kernfs_node to remove
 *
 * Remove @kn along with all its subdirectories and files.
 */
void kernfs_remove(struct kernfs_node *kn)


Thanks,
Ming


[PATCH V2 5/6] block: add helper for checking if queue is registered

2019-08-21 Thread Ming Lei
There are 4 users that check whether a queue is registered, so add one
helper for the check.

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-sysfs.c  | 4 ++--
 block/blk-wbt.c| 2 +-
 block/elevator.c   | 2 +-
 include/linux/blkdev.h | 1 +
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 977c659dcd18..5b0b5224cfd4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -942,7 +942,7 @@ int blk_register_queue(struct gendisk *disk)
if (WARN_ON(!q))
return -ENXIO;
 
-   WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+   WARN_ONCE(blk_queue_registered(q),
  "%s is registering an already registered queue\n",
  kobject_name(&dev->kobj));
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@@ -1026,7 +1026,7 @@ void blk_unregister_queue(struct gendisk *disk)
return;
 
/* Return early if disk->queue was never registered. */
-   if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+   if (!blk_queue_registered(q))
return;
 
/*
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 313f45a37e9d..c4d3089e47f7 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -656,7 +656,7 @@ void wbt_enable_default(struct request_queue *q)
return;
 
/* Queue not registered? Maybe shutting down... */
-   if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+   if (!blk_queue_registered(q))
return;
 
if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
diff --git a/block/elevator.c b/block/elevator.c
index 37b918dc4676..7449a5836b52 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -660,7 +660,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
struct elevator_type *e;
 
/* Make sure queue is not in the middle of being removed */
-   if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+   if (!blk_queue_registered(q))
return -ENOENT;
 
/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 167bf879f072..6041755984f4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -647,6 +647,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_quiesced(q)  test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
 #define blk_queue_pm_only(q)   atomic_read(&(q)->pm_only)
 #define blk_queue_fua(q)   test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
+#define blk_queue_registered(q)	test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
-- 
2.20.1



[PATCH V2 6/6] block: split .sysfs_lock into two locks

2019-08-21 Thread Ming Lei
_sys_delete_module+0x204/0x337
 ? free_module+0x39f/0x39f
 ? blkcg_maybe_throttle_current+0x8a/0x718
 ? rwlock_bug+0x62/0x62
 ? __blkcg_punt_bio_submit+0xd0/0xd0
 ? trace_hardirqs_on_thunk+0x1a/0x20
 ? mark_held_locks+0x1f/0x7a
 ? do_syscall_64+0x4c/0x295
 do_syscall_64+0xa7/0x295
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7fb696cdbe6b
Code: 73 01 c3 48 8b 0d 1d 20 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 
1f 84 00 00 008
RSP: 002b:7ffec9588788 EFLAGS: 0206 ORIG_RAX: 00b0
RAX: ffda RBX: 559e589137c0 RCX: 7fb696cdbe6b
RDX: 000a RSI: 0800 RDI: 559e58913828
RBP:  R08: 7ffec9587701 R09: 
R10: 7fb696d4eae0 R11: 0206 R12: 7ffec95889b0
R13: 7ffec95896b3 R14: 559e58913260 R15: 0000559e589137c0

Cc: Christoph Hellwig 
Cc: Hannes Reinecke 
Cc: Greg KH 
Cc: Mike Snitzer 
Cc: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-core.c   |  1 +
 block/blk-mq-sysfs.c   | 12 +--
 block/blk-sysfs.c  | 46 ++
 block/blk.h|  2 +-
 block/elevator.c   | 46 ++
 include/linux/blkdev.h |  1 +
 6 files changed, 76 insertions(+), 32 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 919629ce4015..2792f7cf7bef 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
mutex_init(&q->blk_trace_mutex);
 #endif
mutex_init(&q->sysfs_lock);
+   mutex_init(&q->sysfs_dir_lock);
spin_lock_init(&q->queue_lock);
 
init_waitqueue_head(&q->mq_freeze_wq);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 31bbf10d8149..a4cc40ddda86 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -247,7 +247,7 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
 
-   lockdep_assert_held(&q->sysfs_lock);
+   lockdep_assert_held(&q->sysfs_dir_lock);
 
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
@@ -297,7 +297,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
int ret, i;
 
WARN_ON_ONCE(!q->kobj.parent);
-   lockdep_assert_held(&q->sysfs_lock);
+   lockdep_assert_held(&q->sysfs_dir_lock);
 
ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
@@ -331,7 +331,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
 
-   mutex_lock(&q->sysfs_lock);
+   mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
 
@@ -339,7 +339,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
blk_mq_unregister_hctx(hctx);
 
 unlock:
-   mutex_unlock(&q->sysfs_lock);
+   mutex_unlock(&q->sysfs_dir_lock);
 }
 
 int blk_mq_sysfs_register(struct request_queue *q)
@@ -347,7 +347,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
 
-   mutex_lock(&q->sysfs_lock);
+   mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
 
@@ -358,7 +358,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
}
 
 unlock:
-   mutex_unlock(&q->sysfs_lock);
+   mutex_unlock(&q->sysfs_dir_lock);
 
return ret;
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5b0b5224cfd4..5941a0176f87 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -938,6 +938,7 @@ int blk_register_queue(struct gendisk *disk)
int ret;
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
+   bool has_elevator = false;
 
if (WARN_ON(!q))
return -ENXIO;
@@ -945,7 +946,6 @@ int blk_register_queue(struct gendisk *disk)
WARN_ONCE(blk_queue_registered(q),
  "%s is registering an already registered queue\n",
  kobject_name(&dev->kobj));
-   blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
 
/*
 * SCSI probing may synchronously create and destroy a lot of
@@ -966,7 +966,7 @@ int blk_register_queue(struct gendisk *disk)
return ret;
 
/* Prevent changes through sysfs until registration is completed. */
-   mutex_lock(&q->sysfs_lock);
+   mutex_lock(&q->sysfs_dir_lock);
 
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
if (ret < 0) {
@@ 
