Re: INFO: task hung in blk_freeze_queue

2018-05-04 Thread Tetsuo Handa
A patch for this specific report is ready. I don't know whether other
"dup" reports will be solved by this patch. Thus, I "undup" this report.

#syz undup
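
For reference, the core of the reproducer is a LOOP_CHANGE_FD ioctl that
passes the loop device's own file descriptor as the new backing file. A
minimal sketch of that sequence (based on the C reproducer linked in the
report; setting up the loop device with LOOP_SET_FD first, and all error
handling, are omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/loop.h>

int main(void)
{
	int loop_fd = open("/dev/loop0", O_RDWR);

	/* Pass the loop device itself as its new backing file; without the
	 * recursion check added below, this leads to the hang in
	 * blk_freeze_queue(). */
	ioctl(loop_fd, LOOP_CHANGE_FD, loop_fd);
	return 0;
}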

>From eed54c6ae475860a9c63b37b58f34735e792eef7 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa 
Date: Sat, 5 May 2018 12:59:12 +0900
Subject: [PATCH] block/loop: Add recursion check for LOOP_CHANGE_FD request.

syzbot is reporting hung tasks at blk_freeze_queue() [1]. This is
caused by an ioctl(loop_fd, LOOP_CHANGE_FD, loop_fd) request, which should
be rejected. Fix this by adding the same recursion check that is already
used by the LOOP_SET_FD request.

Signed-off-by: Tetsuo Handa 
Reported-by: syzbot 
Cc: Jens Axboe 
---
 drivers/block/loop.c | 59 
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5d4e316..cee3c01 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -644,6 +644,34 @@ static void loop_reread_partitions(struct loop_device *lo,
__func__, lo->lo_number, lo->lo_file_name, rc);
 }
 
+static inline int is_loop_device(struct file *file)
+{
+   struct inode *i = file->f_mapping->host;
+
+   return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
+}
+
+static int check_loop_recursion(struct file *f, struct block_device *bdev)
+{
+   /*
+* FIXME: Traversing on other loop devices without corresponding
+* lo_ctl_mutex is not safe. l->lo_state can become Lo_rundown and
+* l->lo_backing_file can become NULL when raced with LOOP_CLR_FD.
+*/
+   while (is_loop_device(f)) {
+   struct loop_device *l;
+
+   if (f->f_mapping->host->i_bdev == bdev)
+   return -EBUSY;
+
+   l = f->f_mapping->host->i_bdev->bd_disk->private_data;
+   if (l->lo_state == Lo_unbound)
+   return -EINVAL;
+   f = l->lo_backing_file;
+   }
+   return 0;
+}
+
 /*
  * loop_change_fd switched the backing store of a loopback device to
  * a new file. This is useful for operating system installers to free up
@@ -673,6 +701,11 @@ static int loop_change_fd(struct loop_device *lo, struct 
block_device *bdev,
if (!file)
goto out;
 
+   /* Avoid recursion */
+   error = check_loop_recursion(file, bdev);
+   if (error)
+   goto out_putf;
+
inode = file->f_mapping->host;
old_file = lo->lo_backing_file;
 
@@ -706,13 +739,6 @@ static int loop_change_fd(struct loop_device *lo, struct 
block_device *bdev,
return error;
 }
 
-static inline int is_loop_device(struct file *file)
-{
-   struct inode *i = file->f_mapping->host;
-
-   return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
-}
-
 /* loop sysfs attributes */
 
 static ssize_t loop_attr_show(struct device *dev, char *page,
@@ -877,7 +903,7 @@ static int loop_prepare_queue(struct loop_device *lo)
 static int loop_set_fd(struct loop_device *lo, fmode_t mode,
   struct block_device *bdev, unsigned int arg)
 {
-   struct file *file, *f;
+   struct file *file;
struct inode*inode;
struct address_space *mapping;
int lo_flags = 0;
@@ -897,20 +923,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t 
mode,
goto out_putf;
 
/* Avoid recursion */
-   f = file;
-   while (is_loop_device(f)) {
-   struct loop_device *l;
-
-   if (f->f_mapping->host->i_bdev == bdev)
-   goto out_putf;
-
-   l = f->f_mapping->host->i_bdev->bd_disk->private_data;
-   if (l->lo_state == Lo_unbound) {
-   error = -EINVAL;
-   goto out_putf;
-   }
-   f = l->lo_backing_file;
-   }
+   error = check_loop_recursion(file, bdev);
+   if (error)
+   goto out_putf;
 
mapping = file->f_mapping;
inode = mapping->host;
-- 
1.8.3.1




Re: INFO: task hung in blk_freeze_queue

2018-05-04 Thread syzbot

syzbot has found a reproducer for the following crash on:

HEAD commit:    cdface520934 Merge tag 'for_linus_stable' of git://git.ker..
git tree:   upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=136c8ee780
kernel config:  https://syzkaller.appspot.com/x/.config?x=61c12b53c2a25ec4
dashboard link: https://syzkaller.appspot.com/bug?extid=2ab52b8d94df5e2eaa01
compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
syzkaller repro: https://syzkaller.appspot.com/x/repro.syz?x=15afa24780
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16f0771780

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+2ab52b8d94df5e2ea...@syzkaller.appspotmail.com

INFO: task syz-executor148:4500 blocked for more than 120 seconds.
  Not tainted 4.17.0-rc2+ #23
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
syz-executor148 D16648  4500   4481 0x
Call Trace:
 context_switch kernel/sched/core.c:2848 [inline]
 __schedule+0x801/0x1e30 kernel/sched/core.c:3490
 schedule+0xef/0x430 kernel/sched/core.c:3549
 blk_mq_freeze_queue_wait+0x1ce/0x460 block/blk-mq.c:136
 blk_freeze_queue+0x4a/0x80 block/blk-mq.c:165
 blk_mq_freeze_queue+0x15/0x20 block/blk-mq.c:174
 loop_clr_fd+0x226/0xb80 drivers/block/loop.c:1047
 lo_ioctl+0x642/0x2130 drivers/block/loop.c:1404
 __blkdev_driver_ioctl block/ioctl.c:303 [inline]
 blkdev_ioctl+0x9b6/0x2020 block/ioctl.c:601
 block_ioctl+0xee/0x130 fs/block_dev.c:1877
 vfs_ioctl fs/ioctl.c:46 [inline]
 file_ioctl fs/ioctl.c:500 [inline]
 do_vfs_ioctl+0x1cf/0x16a0 fs/ioctl.c:684
 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701
 __do_sys_ioctl fs/ioctl.c:708 [inline]
 __se_sys_ioctl fs/ioctl.c:706 [inline]
 __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:706
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x449789
RSP: 002b:7f210fae5da8 EFLAGS: 0297 ORIG_RAX: 0010
RAX: ffda RBX: 006dac3c RCX: 00449789
RDX: 00449789 RSI: 4c01 RDI: 0003
RBP:  R08:  R09: 
R10:  R11: 0297 R12: 006dac38
R13: 0030656c69662f2e R14: 6f6f6c2f7665642f R15: 0007

Showing all locks held in the system:
2 locks held by khungtaskd/893:
 #0: 45f40930 (rcu_read_lock){}, at: check_hung_uninterruptible_tasks kernel/hung_task.c:175 [inline]
 #0: 45f40930 (rcu_read_lock){}, at: watchdog+0x1ff/0xf60 kernel/hung_task.c:249
 #1: 81898718 (tasklist_lock){.+.+}, at: debug_show_all_locks+0xde/0x34a kernel/locking/lockdep.c:4470

1 lock held by rsyslogd/4362:
 #0: 2e322c73 (&f->f_pos_lock){+.+.}, at: __run_timers+0x16e/0xc50 kernel/time/timer.c:1658

2 locks held by getty/4452:
 #0: 3abe4bd2 (&tty->ldisc_sem){}, at: ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
 #1: 35e35fb8 (&ldata->atomic_read_lock){+.+.}, at: n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131

2 locks held by getty/4453:
 #0: 4e78faf9 (&tty->ldisc_sem){}, at: ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
 #1: 44d079f2 (&ldata->atomic_read_lock){+.+.}, at: n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131

2 locks held by getty/4454:
 #0: 37bf7fca (&tty->ldisc_sem){}, at: ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
 #1: fc65c2e0 (&ldata->atomic_read_lock){+.+.}, at: n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131

2 locks held by getty/4455:
 #0: 650b41ff (&tty->ldisc_sem){}, at: ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
 #1: f8a69a89 (&ldata->atomic_read_lock){+.+.}, at: n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131

2 locks held by getty/4456:
 #0: 33547e18 (&tty->ldisc_sem){}, at: ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
 #1: 0c85318d (&ldata->atomic_read_lock){+.+.}, at: n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131

2 locks held by getty/4457:
 #0: e5cb3908 (&tty->ldisc_sem){}, at: ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
 #1: 9fc1aed4 (&ldata->atomic_read_lock){+.+.}, at: n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131

2 locks held by getty/4458:
 #0: 55360c24 (&tty->ldisc_sem){}, at: ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
 #1: 2bcd4fa8 (&ldata->atomic_read_lock){+.+.}, at: n_tty_read+0x321/0x1cc0 drivers/tty/n_tty.c:2131

1 lock held by syz-executor148/4486:
 #0: bf14345a (&lo->lo_ctl_mutex/1){+.+.}, at: lo_ioctl+0x8d/0x2130 drivers/block/loop.c:1391

1 lock held by syz-executor148/4500:
 #0: bf14345a (&lo->lo_ctl_mutex/1){+.+.}, at: lo_ioctl+0x8d/0x2130 drivers/block/loop.c:1391

1 lock held by syz-executor148/4514:
 #0: bf14345a (&lo->lo_ctl_mutex/1){+.+.}, at: lo_ioctl+0x8d/0x2130 drivers/block/loop.c:1391

1 lock held by syz-executor148/4515:
 #0: bf14345a (&lo->lo_ctl_mutex/1){+.+.}, at: lo_ioctl+0x8d/0x2130

Re: [PATCH V3 7/8] nvme: pci: recover controller reliably

2018-05-04 Thread Ming Lei
On Fri, May 4, 2018 at 4:28 PM, jianchao.wang
 wrote:
> Hi ming
>
> On 05/04/2018 04:02 PM, Ming Lei wrote:
>>> nvme_error_handler should invoke nvme_reset_ctrl instead of introducing 
>>> another interface.
>>> Then it is more convenient to ensure that there will be only one resetting 
>>> instance running.
>>>
>> But as you mentioned above, reset_work has to be splitted into two
>> contexts for handling IO timeout during wait_freeze in reset_work,
>> so single instance of nvme_reset_ctrl() may not work well.
>
> I mean the EH kthread and the reset_work which both could reset the ctrl 
> instead of
> the pre and post rest context.
>
> Honestly, I suspect a bit that whether it is worthy to try to recover from 
> [1].
> The Eh kthread solution could make things easier, but the codes for recovery 
> from [1] has
> made code really complicated. It is more difficult to unify the nvme-pci, 
> rdma and fc.

Another choice may be nested EH, which should be easier to implement:

- run the whole recovery procedure (shutdown & reset) in one single context
- and start a new context to handle any new timeout during the last recovery
in the same way

The two approaches are just like sync IO vs AIO.
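
For illustration only, such nested EH could look like the sketch below
(nvme_recover_dev() and dev->eh_work are made-up names, not code from this
patchset):

/* Every recovery runs in its own work context; a timeout that happens
 * during a recovery simply schedules another instance of the same work,
 * nesting the error handling. */
static void nvme_eh_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, eh_work);

	nvme_dev_disable(dev, false);	/* shutdown */
	nvme_recover_dev(dev);		/* reset; if this times out, the
					 * timeout handler queues eh_work
					 * again and the controller is
					 * recovered in the new, nested
					 * context in the same way */
}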

Thanks,
Ming Lei


Re: v4.16-rc1 + dm-mpath + BFQ

2018-05-04 Thread Laurence Oberman
On Fri, 2018-05-04 at 22:11 +0200, Paolo Valente wrote:
> > On 30 Mar 2018, at 18:57, Bart Van Assche <c...@wdc.com> wrote:
> > 
> > On Fri, 2018-03-30 at 10:23 +0200, Paolo Valente wrote:
> > > Still 4.16-rc1, being that the version for which you reported
> > > this
> > > issue in the first place.
> > 
> > A vanilla v4.16-rc1 kernel is not sufficient to run the srp-test
> > software
> > since RDMA/CM support for the SRP target driver is missing from
> > that kernel.
> > That's why I asked you to use the for-next branch from my github
> > repository
> > in a previous e-mail. Anyway, since the necessary patches are now
> > in
> > linux-next, the srp-test software can also be run against linux-
> > next. Here
> > are the results that I obtained with label next-20180329 and the
> > kernel
> > config attached to your previous e-mail:
> > 
> > # while ./srp-test/run_tests -c -d -r 10 -e bfq; do :; done
> > 
> > BUG: unable to handle kernel NULL pointer dereference at
> > 0200
> > PGD 0 P4D 0 
> > Oops: 0002 [#1] SMP PTI
> > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-
> > prebuilt.qemu-project.org 04/01/2014
> > RIP: 0010:rb_erase+0x284/0x380
> > Call Trace:
> > 
> > elv_rb_del+0x24/0x30
> > bfq_remove_request+0x9a/0x2e0 [bfq]
> > ? rcu_read_lock_sched_held+0x64/0x70
> > ? update_load_avg+0x72b/0x760
> > bfq_finish_requeue_request+0x2e1/0x3b0 [bfq]
> > ? __lock_is_held+0x5a/0xa0
> > blk_mq_free_request+0x5f/0x1a0
> > blk_put_request+0x23/0x60
> > multipath_release_clone+0xe/0x10
> > dm_softirq_done+0xe3/0x270
> > __blk_mq_complete_request_remote+0x18/0x20
> > flush_smp_call_function_queue+0xa1/0x150
> > generic_smp_call_function_single_interrupt+0x13/0x30
> > smp_call_function_single_interrupt+0x4d/0x220
> > call_function_single_interrupt+0xf/0x20
> > 
> > 
> 
> Hi Bart,
> I suspect my recent fix [1] might fix your failure too.
> 
> Thanks,
> Paolo
> 
> [1] https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1682
> 264.html
> 
> > Bart.
> > 
> > 
> > 
> 
> 
I was never able to reproduce Bart's original issue using his tree and
actual mlx5/cx4 hardware and ib_srp.
I enabled BFQ with no other special tuning for the mpath and subpaths.
I was waiting for him to come back from vacation to check with him.

Thanks
Laurence


Re: v4.16-rc1 + dm-mpath + BFQ

2018-05-04 Thread Paolo Valente


> On 30 Mar 2018, at 18:57, Bart Van Assche  wrote:
> 
> On Fri, 2018-03-30 at 10:23 +0200, Paolo Valente wrote:
>> Still 4.16-rc1, being that the version for which you reported this
>> issue in the first place.
> 
> A vanilla v4.16-rc1 kernel is not sufficient to run the srp-test software
> since RDMA/CM support for the SRP target driver is missing from that kernel.
> That's why I asked you to use the for-next branch from my github repository
> in a previous e-mail. Anyway, since the necessary patches are now in
> linux-next, the srp-test software can also be run against linux-next. Here
> are the results that I obtained with label next-20180329 and the kernel
> config attached to your previous e-mail:
> 
> # while ./srp-test/run_tests -c -d -r 10 -e bfq; do :; done
> 
> BUG: unable to handle kernel NULL pointer dereference at 0200
> PGD 0 P4D 0 
> Oops: 0002 [#1] SMP PTI
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 1.0.0-prebuilt.qemu-project.org 04/01/2014
> RIP: 0010:rb_erase+0x284/0x380
> Call Trace:
> 
> elv_rb_del+0x24/0x30
> bfq_remove_request+0x9a/0x2e0 [bfq]
> ? rcu_read_lock_sched_held+0x64/0x70
> ? update_load_avg+0x72b/0x760
> bfq_finish_requeue_request+0x2e1/0x3b0 [bfq]
> ? __lock_is_held+0x5a/0xa0
> blk_mq_free_request+0x5f/0x1a0
> blk_put_request+0x23/0x60
> multipath_release_clone+0xe/0x10
> dm_softirq_done+0xe3/0x270
> __blk_mq_complete_request_remote+0x18/0x20
> flush_smp_call_function_queue+0xa1/0x150
> generic_smp_call_function_single_interrupt+0x13/0x30
> smp_call_function_single_interrupt+0x4d/0x220
> call_function_single_interrupt+0xf/0x20
> 
> 

Hi Bart,
I suspect my recent fix [1] might fix your failure too.

Thanks,
Paolo

[1] https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1682264.html

> Bart.
> 
> 
> 



Re: [PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge

2018-05-04 Thread Mike Galbraith
Tentatively, I suspect you've just fixed the nasty stalls I reported a
while back.  Not a hint of stall as yet (should have shown itself by
now), spinning rust buckets are being all they can be, box feels good.

Later mq-deadline (I hope to eventually forget the module dependency
eternities we've spent together;), welcome back bfq (maybe.. I hope).

-Mike


[PATCH BUGFIX] block, bfq: postpone rq preparation to insert or merge

2018-05-04 Thread Paolo Valente
When invoked for an I/O request rq, the prepare_request hook of bfq
increments reference counters in the destination bfq_queue for rq. In
this respect, after this hook has been invoked, rq may still be
transformed into a request with no icq attached, i.e., for bfq, a
request not associated with any bfq_queue. No further hook is invoked
to signal this transformation to bfq (in general, to the destination
elevator for rq). This leads bfq into an inconsistent state, because
bfq has no chance to correctly lower these counters back. This
inconsistency may in turn cause incorrect scheduling and hangs. It
certainly causes memory leaks, by making it impossible for bfq to free
the involved bfq_queue.

On the bright side, no such transformation can happen any longer once rq
has been inserted into bfq, or merged with another, already inserted,
request. Exploiting this fact, this commit addresses the above issue
by delaying the preparation of an I/O request until the request is
inserted or merged.

This change also gives a performance bonus: a lock-contention point
gets removed. To prepare a request, bfq needs to hold its scheduler
lock. After postponing request preparation to insertion or merging, no
lock needs to be grabbed any longer in the prepare_request hook, while
the lock already taken to perform insertion or merging is used to
prepare the request as well.

Signed-off-by: Paolo Valente 
---
 block/bfq-iosched.c | 86 +++--
 1 file changed, 57 insertions(+), 29 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 771ae9730ac6..ea02162df6c7 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1858,6 +1858,8 @@ static int bfq_request_merge(struct request_queue *q, 
struct request **req,
return ELEVATOR_NO_MERGE;
 }
 
+static struct bfq_queue *bfq_init_rq(struct request *rq);
+
 static void bfq_request_merged(struct request_queue *q, struct request *req,
   enum elv_merge type)
 {
@@ -1866,7 +1868,7 @@ static void bfq_request_merged(struct request_queue *q, 
struct request *req,
blk_rq_pos(req) <
	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
struct request, rb_node))) {
-   struct bfq_queue *bfqq = RQ_BFQQ(req);
+   struct bfq_queue *bfqq = bfq_init_rq(req);
struct bfq_data *bfqd = bfqq->bfqd;
struct request *prev, *next_rq;
 
@@ -1894,7 +1896,8 @@ static void bfq_request_merged(struct request_queue *q, 
struct request *req,
 static void bfq_requests_merged(struct request_queue *q, struct request *rq,
struct request *next)
 {
-   struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
+   struct bfq_queue *bfqq = bfq_init_rq(rq),
+   *next_bfqq = bfq_init_rq(next);
 
if (!RB_EMPTY_NODE(>rb_node))
goto end;
@@ -4540,14 +4543,12 @@ static inline void bfq_update_insert_stats(struct 
request_queue *q,
   unsigned int cmd_flags) {}
 #endif
 
-static void bfq_prepare_request(struct request *rq, struct bio *bio);
-
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
   bool at_head)
 {
struct request_queue *q = hctx->queue;
struct bfq_data *bfqd = q->elevator->elevator_data;
-   struct bfq_queue *bfqq = RQ_BFQQ(rq);
+   struct bfq_queue *bfqq;
bool idle_timer_disabled = false;
unsigned int cmd_flags;
 
@@ -4562,24 +4563,13 @@ static void bfq_insert_request(struct blk_mq_hw_ctx 
*hctx, struct request *rq,
blk_mq_sched_request_inserted(rq);
 
	spin_lock_irq(&bfqd->lock);
+   bfqq = bfq_init_rq(rq);
if (at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
	list_add(&rq->queuelist, &bfqd->dispatch);
else
	list_add_tail(&rq->queuelist, &bfqd->dispatch);
-   } else {
-   if (WARN_ON_ONCE(!bfqq)) {
-   /*
-* This should never happen. Most likely rq is
-* a requeued regular request, being
-* re-inserted without being first
-* re-prepared. Do a prepare, to avoid
-* failure.
-*/
-   bfq_prepare_request(rq, rq->bio);
-   bfqq = RQ_BFQQ(rq);
-   }
-
+   } else { /* bfqq is assumed to be non null here */
idle_timer_disabled = __bfq_insert_request(bfqd, rq);
/*
 * Update bfqq, because, if a queue merge has occurred
@@ -4922,11 +4912,48 @@ static struct bfq_queue 
*bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
 }
 
 /*
- * Allocate bfq data structures associated with this request.
+ * Only reset private 

Re: [PATCH v2] loop: remember whether sysfs_create_group() succeeded

2018-05-04 Thread Tetsuo Handa
Jens Axboe wrote:
> On 5/4/18 10:14 AM, Tetsuo Handa wrote:
>  If that's not easily done, then my next suggestion would be to
>  use a loop flag for it, LO_FLAGS_SYSFS_SETUP or something like that.
> >>>
> >>> Yes, that would be possible.
> >>
> >> Let's make that change.
> > 
> > Since LO_FLAGS_* are defined in include/uapi/linux/loop.h as userspace 
> > visible
> > flags, I feel that using "struct loop_device"->lo_flags for recording 
> > whether
> > sysfs entry exists might be strange... Anyway, updated patch is shown below.
> 
> Hmm yes, I forgot about that, I guess that makes the flags approach
> pretty much useless. Let's just go with your v1 in that case.
> 

OK. You can s/sysfs_ready/sysfs_init_done/ before you apply to your tree.


Re: [PATCH v2] loop: remember whether sysfs_create_group() succeeded

2018-05-04 Thread Jens Axboe
On 5/4/18 10:14 AM, Tetsuo Handa wrote:
> Jens Axboe wrote:
>>> The loop module ignores sysfs_create_group() failure and pretends that
>>> LOOP_SET_FD request succeeded. I guess that the author of commit
>>> ee86273062cbb310 ("loop: add some basic read-only sysfs attributes")
>>> assumed that it is not a fatal error enough to abort LOOP_SET_FD request.
>>>
>>> Do we want to abort LOOP_SET_FD request if sysfs_create_group() failed?
>>
>> Probably safer to retain that behavior.
> 
> OK.
> 
>>
 If that's not easily done, then my next suggestion would be to
 use a loop flag for it, LO_FLAGS_SYSFS_SETUP or something like that.
>>>
>>> Yes, that would be possible.
>>
>> Let's make that change.
> 
> Since LO_FLAGS_* are defined in include/uapi/linux/loop.h as userspace visible
> flags, I feel that using "struct loop_device"->lo_flags for recording whether
> sysfs entry exists might be strange... Anyway, updated patch is shown below.

Hmm yes, I forgot about that, I guess that makes the flags approach
pretty much useless. Let's just go with your v1 in that case.

-- 
Jens Axboe



[PATCH v2] loop: remember whether sysfs_create_group() succeeded

2018-05-04 Thread Tetsuo Handa
Jens Axboe wrote:
> > The loop module ignores sysfs_create_group() failure and pretends that
> > LOOP_SET_FD request succeeded. I guess that the author of commit
> > ee86273062cbb310 ("loop: add some basic read-only sysfs attributes")
> > assumed that it is not a fatal error enough to abort LOOP_SET_FD request.
> > 
> > Do we want to abort LOOP_SET_FD request if sysfs_create_group() failed?
> 
> Probably safer to retain that behavior.

OK.

> 
> >> If that's not easily done, then my next suggestion would be to
> >> use a loop flag for it, LO_FLAGS_SYSFS_SETUP or something like that.
> > 
> > Yes, that would be possible.
> 
> Let's make that change.

Since LO_FLAGS_* are defined in include/uapi/linux/loop.h as userspace visible
flags, I feel that using "struct loop_device"->lo_flags for recording whether
sysfs entry exists might be strange... Anyway, updated patch is shown below.



>From c9897b6387b13b533c32dcc624e12a93f23224d0 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa 
Date: Sat, 5 May 2018 00:52:59 +0900
Subject: [PATCH v2] loop: remember whether sysfs_create_group() succeeded

syzbot is hitting WARN() triggered by memory allocation fault
injection [1] because loop module is calling sysfs_remove_group()
when sysfs_create_group() failed.

Fix this by remembering whether sysfs_create_group() succeeded.
To remember it, the userspace visible API flag LO_FLAGS_SYSFS_ENTRY is
introduced. This flag indicates that sysfs entries are available, and
should always be set if CONFIG_SYSFS=y because sysfs entries will be
created unless fault injection prevents it.

[1] 
https://syzkaller.appspot.com/bug?id=3f86c0edf75c86d2633aeb9dd69eccc70bc7e90b

Signed-off-by: Tetsuo Handa 
Reported-by: syzbot 

Cc: Jens Axboe 
---
 drivers/block/loop.c  | 6 --
 include/uapi/linux/loop.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5d4e316..499eafd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -951,7 +951,8 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
loop_update_dio(lo);
set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
-   loop_sysfs_init(lo);
+   if (IS_ENABLED(CONFIG_SYSFS) && loop_sysfs_init(lo) == 0)
+   lo->lo_flags |= LO_FLAGS_SYSFS_ENTRY;
/* let user-space know about the new size */
	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
 
@@ -1070,7 +1071,8 @@ static int loop_clr_fd(struct loop_device *lo)
invalidate_bdev(bdev);
}
set_capacity(lo->lo_disk, 0);
-   loop_sysfs_exit(lo);
+   if (lo->lo_flags & LO_FLAGS_SYSFS_ENTRY)
+   loop_sysfs_exit(lo);
if (bdev) {
bd_set_size(bdev, 0);
/* let user-space know about this change */
diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h
index 080a8df..5de1eaa6 100644
--- a/include/uapi/linux/loop.h
+++ b/include/uapi/linux/loop.h
@@ -23,6 +23,7 @@ enum {
LO_FLAGS_AUTOCLEAR  = 4,
LO_FLAGS_PARTSCAN   = 8,
LO_FLAGS_DIRECT_IO  = 16,
+   LO_FLAGS_SYSFS_ENTRY= 32,
 };
 
 #include <asm/posix_types.h>	/* for __kernel_old_dev_t */
-- 
1.8.3.1


Re: [PATCH v4 00/14] Copy Offload in NVMe Fabrics with P2P PCI Memory

2018-05-04 Thread Logan Gunthorpe


On 04/05/18 08:27 AM, Christian König wrote:
> Are you sure that this is more convenient? At least on first glance it 
> feels overly complicated.
> 
> I mean what's the difference between the two approaches?
> 
>      sum = pci_p2pdma_distance(target, [A, B, C, target]);
> 
> and
> 
>      sum = pci_p2pdma_distance(target, A);
>      sum += pci_p2pdma_distance(target, B);
>      sum += pci_p2pdma_distance(target, C);

Well, it's more for consistency with pci_p2pdma_find(), which has to
take a list of devices to find a resource that matches all of them.
(You can't use multiple calls in that case because all the devices in
the list might not have the same set of compatible providers.) That way
we can use the same list to check the distance (when the user specifies
a device) as we do to find a compatible device (when the user wants to
automatically find one).
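
To make the two calling patterns concrete, a sketch (the prototypes below
are assumptions for illustration, not the final API of this patchset):

/* assumed prototypes, for illustration only */
struct pci_dev *pci_p2pdma_find(struct list_head *clients);
int pci_p2pdma_distance(struct pci_dev *provider, struct list_head *clients);

static int example_setup_p2p(struct list_head *clients, struct pci_dev *chosen)
{
	struct pci_dev *provider;

	if (chosen)
		/* the user specified a provider: check it against the
		 * whole client list in one call */
		return pci_p2pdma_distance(chosen, clients) < 0 ? -EINVAL : 0;

	/* otherwise find a provider compatible with *all* clients, which
	 * is why the interface takes the full list rather than a single
	 * device per call */
	provider = pci_p2pdma_find(clients);
	return provider ? 0 : -ENODEV;
}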

Logan


[GIT PULL] Block IO fixes for 4.17-rc4

2018-05-04 Thread Jens Axboe
Hi Linus,

A collection of fixes that should go into this release. This pull
request contains:

- Set of bcache fixes from Coly, fixing regressions in patches that went
  into this series.

- Set of NVMe fixes by way of Keith.

- Set of bdi related fixes, one from Jan and two from Tetsuo Handa,
  fixing various issues around device addition/removal.

- Two block inflight fixes from Omar, fixing issues around the
  transition to using tags for blk-mq inflight accounting that we did a
  few releases ago.

Please pull!


  git://git.kernel.dk/linux-block.git tags/for-linus-20180504



Chengguang Xu (1):
  nvme: fix potential memory leak in option parsing

Coly Li (6):
  bcache: store disk name in struct cache and struct cached_dev
  bcache: set CACHE_SET_IO_DISABLE in bch_cached_dev_error()
  bcache: count backing device I/O error for writeback I/O
  bcache: add wait_for_kthread_stop() in bch_allocator_thread()
  bcache: set dc->io_disable to true in conditional_stop_bcache_device()
  bcache: use pr_info() to inform duplicated CACHE_SET_IO_DISABLE set

Jan Kara (1):
  bdi: Fix oops in wb_workfn()

Johannes Thumshirn (1):
  nvmet: switch loopback target state to connecting when resetting

Keith Busch (3):
  nvme: Set integrity flag for user passthrough commands
  nvme/multipath: Disable runtime writable enabling parameter
  nvme/multipath: Fix multipath disabled naming collisions

Omar Sandoval (2):
  blk-mq: count allocated but not started requests in iostats inflight
  blk-mq: fix sysfs inflight counter

Tetsuo Handa (2):
  bdi: wake up concurrent wb_shutdown() callers.
  bdi: Fix use after free bug in debugfs_remove()

 block/blk-mq.c| 40 ---
 block/blk-mq.h|  4 ++-
 block/genhd.c | 12 +++
 block/partition-generic.c | 10 +++---
 drivers/md/bcache/alloc.c |  5 ++-
 drivers/md/bcache/bcache.h|  4 +++
 drivers/md/bcache/debug.c |  3 +-
 drivers/md/bcache/io.c|  8 ++---
 drivers/md/bcache/request.c   |  5 +--
 drivers/md/bcache/super.c | 75 ++-
 drivers/md/bcache/writeback.c |  4 ++-
 drivers/nvme/host/core.c  | 27 ++--
 drivers/nvme/host/fabrics.c   |  6 
 drivers/nvme/host/multipath.c | 24 +-
 drivers/nvme/host/nvme.h  | 12 +++
 drivers/nvme/target/loop.c|  6 
 fs/fs-writeback.c |  2 +-
 include/linux/genhd.h |  4 ++-
 include/linux/wait_bit.h  | 17 ++
 mm/backing-dev.c  |  3 +-
 20 files changed, 189 insertions(+), 82 deletions(-)

-- 
Jens Axboe



Re: [PATCH] loop: remember whether sysfs_create_group() succeeded

2018-05-04 Thread Jens Axboe
On 5/4/18 8:40 AM, Tetsuo Handa wrote:
> Jens Axboe wrote:
>> On 5/4/18 8:27 AM, Tetsuo Handa wrote:
>>> Jens Axboe wrote:
 On 5/4/18 5:47 AM, Tetsuo Handa wrote:
> >From 626d33de1b70b11ecaf95a9f83f7644998e54cbb Mon Sep 17 00:00:00 2001
> From: Tetsuo Handa 
> Date: Wed, 2 May 2018 23:03:48 +0900
> Subject: [PATCH] loop: remember whether sysfs_create_group() succeeded
>
> syzbot is hitting WARN() triggered by memory allocation fault
> injection [1] because loop module is calling sysfs_remove_group()
> when sysfs_create_group() failed.
> Fix this by remembering whether sysfs_create_group() succeeded.

 Can we store this locally instead of in the loop_device? Also,
 naming wise, something like sysfs_init_done would be more readily
 understandable.
>>>
>>> Whether sysfs entry for this loop device exists is per "struct loop_device"
>>> flag, isn't it? What does "locally" mean?
>>
>> I'm assuming this is calling remove in an error path when alloc fails.
>> So it should be possible to know locally whether this was done or not,
>> before calling the teardown. Storing this is in the loop_device seems
>> like a bit of a hack.
> 
> The loop module ignores sysfs_create_group() failure and pretends that
> LOOP_SET_FD request succeeded. I guess that the author of commit
> ee86273062cbb310 ("loop: add some basic read-only sysfs attributes")
> assumed that it is not a fatal error enough to abort LOOP_SET_FD request.
> 
> Do we want to abort LOOP_SET_FD request if sysfs_create_group() failed?

Probably safer to retain that behavior.

>> If that's not easily done, then my next suggestion would be to
>> use a loop flag for it, LO_FLAGS_SYSFS_SETUP or something like that.
> 
> Yes, that would be possible.

Let's make that change.

-- 
Jens Axboe



Re: [PATCH] loop: remember whether sysfs_create_group() succeeded

2018-05-04 Thread Tetsuo Handa
Jens Axboe wrote:
> On 5/4/18 8:27 AM, Tetsuo Handa wrote:
> > Jens Axboe wrote:
> >> On 5/4/18 5:47 AM, Tetsuo Handa wrote:
> >>> >From 626d33de1b70b11ecaf95a9f83f7644998e54cbb Mon Sep 17 00:00:00 2001
> >>> From: Tetsuo Handa 
> >>> Date: Wed, 2 May 2018 23:03:48 +0900
> >>> Subject: [PATCH] loop: remember whether sysfs_create_group() succeeded
> >>>
> >>> syzbot is hitting WARN() triggered by memory allocation fault
> >>> injection [1] because loop module is calling sysfs_remove_group()
> >>> when sysfs_create_group() failed.
> >>> Fix this by remembering whether sysfs_create_group() succeeded.
> >>
> >> Can we store this locally instead of in the loop_device? Also,
> >> naming wise, something like sysfs_init_done would be more readily
> >> understandable.
> > 
> > Whether sysfs entry for this loop device exists is per "struct loop_device"
> > flag, isn't it? What does "locally" mean?
> 
> I'm assuming this is calling remove in an error path when alloc fails.
> So it should be possible to know locally whether this was done or not,
> before calling the teardown. Storing this is in the loop_device seems
> like a bit of a hack.

The loop module ignores sysfs_create_group() failure and pretends that
LOOP_SET_FD request succeeded. I guess that the author of commit
ee86273062cbb310 ("loop: add some basic read-only sysfs attributes")
assumed that it is not a fatal error enough to abort LOOP_SET_FD request.

Do we want to abort LOOP_SET_FD request if sysfs_create_group() failed?

> 
> If that's not easily done, then my next suggestion would be to
> use a loop flag for it, LO_FLAGS_SYSFS_SETUP or something like that.

Yes, that would be possible.


Re: [PATCH 1/3] block: don't disable interrupts during kmap_atomic()

2018-05-04 Thread Jens Axboe
On 5/4/18 8:32 AM, Sebastian Andrzej Siewior wrote:
> bounce_copy_vec() disables interrupts around kmap_atomic(). This is a
> leftover from the old kmap_atomic() implementation which relied on fixed
> mapping slots, so the caller had to make sure that the same slot could not
> be reused from an interrupting context.
> 
> kmap_atomic() was changed to dynamic slots long ago and commit 1ec9c5ddc17a
> ("include/linux/highmem.h: remove the second argument of k[un]map_atomic()")
> removed the slot assignements, but the callers were not checked for now
> redundant interrupt disabling.
> 
> Remove the conditional interrupt disable.

All three patches look fine to me, I'll queue them up for 4.18. In
the future, please include a cover letter for block patches when
sending > 1 patch. Makes it easier to reply to the series as a whole.

-- 
Jens Axboe



[PATCH 2/3] block: Remove redundant WARN_ON()

2018-05-04 Thread Sebastian Andrzej Siewior
From: Anna-Maria Gleixner 

Commit 2fff8a924d4c ("block: Check locking assumptions at runtime") added a
lockdep_assert_held(q->queue_lock) which makes the WARN_ON() redundant
because lockdep will detect and warn about context violations.

The unconditional WARN_ON() does not provide real additional value, so it
can be removed.

Signed-off-by: Anna-Maria Gleixner 
Signed-off-by: Sebastian Andrzej Siewior 
---
 block/blk-core.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 85909b431eb0..a3caccaa7d1f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -360,7 +360,6 @@ EXPORT_SYMBOL(blk_start_queue_async);
 void blk_start_queue(struct request_queue *q)
 {
lockdep_assert_held(q->queue_lock);
-   WARN_ON(!in_interrupt() && !irqs_disabled());
WARN_ON_ONCE(q->mq_ops);
 
queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-- 
2.17.0



[PATCH 3/3] block: Shorten interrupt disabled regions

2018-05-04 Thread Sebastian Andrzej Siewior
From: Thomas Gleixner 

Commit 9c40cef2b799 ("sched: Move blk_schedule_flush_plug() out of
__schedule()") moved the blk_schedule_flush_plug() call out of the
interrupt/preempt disabled region in the scheduler. This allows to replace
local_irq_save/restore(flags) by local_irq_disable/enable() in
blk_flush_plug_list().

But it makes more sense to disable interrupts explicitly when the request
queue is locked and reenable them when the request queue is unlocked. This
shortens the interrupt disabled section, which is important when the plug
list contains requests for more than one queue. The comment claiming that
interrupts are disabled around the loop to avoid doing it for every queue
lock is misleading, as the called functions can reenable interrupts
unconditionally anyway, and it obfuscates the scope badly:

 local_irq_save(flags);
   spin_lock(q->queue_lock);
   ...
   queue_unplugged(q...);
 scsi_request_fn();
   spin_unlock_irq(q->queue_lock);

---^^^ 

   spin_lock_irq(q->queue_lock);
 spin_unlock(q->queue_lock);
 local_irq_restore(flags);

Aside of that the detached interrupt disabling is a constant pain for
PREEMPT_RT as it requires patching and special casing when RT is enabled
while with the spin_*_irq() variants this happens automatically.

Signed-off-by: Thomas Gleixner 
Cc: Peter Zijlstra 
Cc: Tejun Heo 
Cc: Jens Axboe 
Cc: Linus Torvalds 
Link: http://lkml.kernel.org/r/20110622174919.025446...@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior 
---
 block/blk-core.c | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a3caccaa7d1f..0573f9226c2d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3629,7 +3629,7 @@ static void queue_unplugged(struct request_queue *q, 
unsigned int depth,
blk_run_queue_async(q);
else
__blk_run_queue(q);
-   spin_unlock(q->queue_lock);
+   spin_unlock_irq(q->queue_lock);
 }
 
 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
@@ -3677,7 +3677,6 @@ EXPORT_SYMBOL(blk_check_plugged);
 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
struct request_queue *q;
-   unsigned long flags;
struct request *rq;
LIST_HEAD(list);
unsigned int depth;
@@ -3697,11 +3696,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool 
from_schedule)
q = NULL;
depth = 0;
 
-   /*
-* Save and disable interrupts here, to avoid doing it for every
-* queue lock we have to take.
-*/
-   local_irq_save(flags);
	while (!list_empty(&list)) {
rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
@@ -3714,7 +3708,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool 
from_schedule)
queue_unplugged(q, depth, from_schedule);
q = rq->q;
depth = 0;
-   spin_lock(q->queue_lock);
+   spin_lock_irq(q->queue_lock);
}
 
/*
@@ -3741,8 +3735,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool 
from_schedule)
 */
if (q)
queue_unplugged(q, depth, from_schedule);
-
-   local_irq_restore(flags);
 }
 
 void blk_finish_plug(struct blk_plug *plug)
-- 
2.17.0



[PATCH 1/3] block: don't disable interrupts during kmap_atomic()

2018-05-04 Thread Sebastian Andrzej Siewior
bounce_copy_vec() disables interrupts around kmap_atomic(). This is a
leftover from the old kmap_atomic() implementation which relied on fixed
mapping slots, so the caller had to make sure that the same slot could not
be reused from an interrupting context.

kmap_atomic() was changed to dynamic slots long ago and commit 1ec9c5ddc17a
("include/linux/highmem.h: remove the second argument of k[un]map_atomic()")
removed the slot assignments, but the callers were not checked for
now-redundant interrupt disabling.

Remove the conditional interrupt disable.

Signed-off-by: Sebastian Andrzej Siewior 
---
 block/bounce.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/block/bounce.c b/block/bounce.c
index dd0b93f2a871..fea9c8146d82 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -63,14 +63,11 @@ __initcall(init_emergency_pool);
  */
 static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
 {
-   unsigned long flags;
unsigned char *vto;
 
-   local_irq_save(flags);
vto = kmap_atomic(to->bv_page);
memcpy(vto + to->bv_offset, vfrom, to->bv_len);
kunmap_atomic(vto);
-   local_irq_restore(flags);
 }
 
 #else /* CONFIG_HIGHMEM */
-- 
2.17.0



Re: [PATCH] loop: remember whether sysfs_create_group() succeeded

2018-05-04 Thread Jens Axboe
On 5/4/18 8:27 AM, Tetsuo Handa wrote:
> Jens Axboe wrote:
>> On 5/4/18 5:47 AM, Tetsuo Handa wrote:
>>> >From 626d33de1b70b11ecaf95a9f83f7644998e54cbb Mon Sep 17 00:00:00 2001
>>> From: Tetsuo Handa 
>>> Date: Wed, 2 May 2018 23:03:48 +0900
>>> Subject: [PATCH] loop: remember whether sysfs_create_group() succeeded
>>>
>>> syzbot is hitting WARN() triggered by memory allocation fault
>>> injection [1] because loop module is calling sysfs_remove_group()
>>> when sysfs_create_group() failed.
>>> Fix this by remembering whether sysfs_create_group() succeeded.
>>
>> Can we store this locally instead of in the loop_device? Also,
>> naming wise, something like sysfs_init_done would be more readily
>> understandable.
> 
> Whether sysfs entry for this loop device exists is per "struct loop_device"
> flag, isn't it? What does "locally" mean?

I'm assuming this is calling remove in an error path when alloc fails.
So it should be possible to know locally whether this was done or not,
before calling the teardown. Storing this is in the loop_device seems
like a bit of a hack.

If that's not easily done, then my next suggestion would be to
use a loop flag for it, LO_FLAGS_SYSFS_SETUP or something like that.

-- 
Jens Axboe



Re: [PATCH v4 00/14] Copy Offload in NVMe Fabrics with P2P PCI Memory

2018-05-04 Thread Christian König

On 03.05.2018 at 20:43, Logan Gunthorpe wrote:


On 03/05/18 11:29 AM, Christian König wrote:

Ok, that is the point where I'm stuck. Why do we need that in one
function call in the PCIe subsystem?

The problem at least with GPUs is that we seriously don't have that
information here, cause the PCI subsystem might not be aware of all the
interconnections.

For example it isn't uncommon to put multiple GPUs on one board. To the
PCI subsystem that looks like separate devices, but in reality all GPUs
are interconnected and can access each others memory directly without
going over the PCIe bus.

I seriously don't want to model that in the PCI subsystem, but rather
the driver. That's why it feels like a mistake to me to push all that
into the PCI function.

Huh? I'm lost. If you have a bunch of PCI devices you can send them as a
list to this API, if you want. If the driver is _sure_ they are all the
same, you only have to send one. In your terminology, you'd just have to
call the interface with:

pci_p2pdma_distance(target, [initiator, target])


Ok, I expected that something like that would do it.

So just to confirm: When I have a bunch of GPUs which could be the 
initiator I only need to do "pci_p2pdma_distance(target, [first GPU, 
target]);" and not "pci_p2pdma_distance(target, [first GPU, second GPU, 
third GPU, fourth, target])" ?



Why can't we model that as two separate transactions?

You could, but this is more convenient for users of the API that need to
deal with multiple devices (and manage devices that may be added or
removed at any time).


Are you sure that this is more convenient? At least on first glance it 
feels overly complicated.


I mean what's the difference between the two approaches?

    sum = pci_p2pdma_distance(target, [A, B, C, target]);

and

    sum = pci_p2pdma_distance(target, A);
    sum += pci_p2pdma_distance(target, B);
    sum += pci_p2pdma_distance(target, C);


Yeah, same for me. If Bjorn is ok with that specialized NVM functions
that I'm fine with that as well.

I think it would just be more convenient when we can come up with
functions which can handle all use cases, cause there still seems to be
a lot of similarities.

The way it's implemented is more general and can handle all use cases.
You are arguing for a function that can handle your case (albeit with a
bit more fuss) but can't handle mine and is therefore less general.
Calling my interface specialized is wrong.


Well at the end of the day you only need to convince Bjorn of the 
interface, so I'm perfectly fine with it as long as it serves my use 
case as well :)


But I still would like to understand your intention, cause that really 
helps not to accidentally break something in the long term.


Now when I take a look at the pure PCI hardware level, what I have is a 
transaction between an initiator and a target, and not multiple devices 
in one operation.


I mean you must have a very good reason that you now want to deal with 
multiple devices in the software layer, but neither from the code nor 
from your explanation does that reason become obvious to me.


Thanks,
Christian.



Logan




Re: [PATCH] loop: remember whether sysfs_create_group() succeeded

2018-05-04 Thread Tetsuo Handa
Jens Axboe wrote:
> On 5/4/18 5:47 AM, Tetsuo Handa wrote:
> >>From 626d33de1b70b11ecaf95a9f83f7644998e54cbb Mon Sep 17 00:00:00 2001
> > From: Tetsuo Handa 
> > Date: Wed, 2 May 2018 23:03:48 +0900
> > Subject: [PATCH] loop: remember whether sysfs_create_group() succeeded
> > 
> > syzbot is hitting WARN() triggered by memory allocation fault
> > injection [1] because loop module is calling sysfs_remove_group()
> > when sysfs_create_group() failed.
> > Fix this by remembering whether sysfs_create_group() succeeded.
> 
> Can we store this locally instead of in the loop_device? Also,
> naming wise, something like sysfs_init_done would be more readily
> understandable.

Whether sysfs entry for this loop device exists is per "struct loop_device"
flag, isn't it? What does "locally" mean?


Re: [PATCH] block: add verifier for cmdline partition

2018-05-04 Thread Jens Axboe
On 5/4/18 1:07 AM, Caizhiyong wrote:
>> -Original Message-
>> From: Wang YanQing [mailto:udkni...@gmail.com]
>> Sent: Thursday, May 03, 2018 7:18 PM
>> To: ax...@kernel.dk
>> Cc: gre...@linuxfoundation.org; pombreda...@nexb.com;
>> t...@linutronix.de; Caizhiyong ; linux-
>> bl...@vger.kernel.org; linux-ker...@vger.kernel.org
>> Subject: [PATCH] block: add verifier for cmdline partition
>>
>> I meet strange filesystem corruption issue recently, the reason
>> is there are overlaps partitions in cmdline partition argument.
>>
>> This patch add verifier for cmdline partition, then if there are
>> overlaps partitions, cmdline_partition will return error and log
>> a error message.
>>
> 
> Partition overlap was intentionally designed in this cmdline partition.
> some time, the cmdline partition save raw data(not filesystem), the overlap 
> makes data  access very convenient.
> 
> reference  
> http://lists.infradead.org/pipermail/linux-mtd/2013-August/048092.html

Might make sense to warn about it at least, it can be very surprising if
it happens inadvertently.

-- 
Jens Axboe



Re: [PATCH] loop: remember whether sysfs_create_group() succeeded

2018-05-04 Thread Jens Axboe
On 5/4/18 5:47 AM, Tetsuo Handa wrote:
>>From 626d33de1b70b11ecaf95a9f83f7644998e54cbb Mon Sep 17 00:00:00 2001
> From: Tetsuo Handa 
> Date: Wed, 2 May 2018 23:03:48 +0900
> Subject: [PATCH] loop: remember whether sysfs_create_group() succeeded
> 
> syzbot is hitting WARN() triggered by memory allocation fault
> injection [1] because loop module is calling sysfs_remove_group()
> when sysfs_create_group() failed.
> Fix this by remembering whether sysfs_create_group() succeeded.

Can we store this locally instead of in the loop_device? Also,
naming wise, something like sysfs_init_done would be more readily
understandable.

-- 
Jens Axboe



[PATCH] loop: remember whether sysfs_create_group() succeeded

2018-05-04 Thread Tetsuo Handa
>From 626d33de1b70b11ecaf95a9f83f7644998e54cbb Mon Sep 17 00:00:00 2001
From: Tetsuo Handa 
Date: Wed, 2 May 2018 23:03:48 +0900
Subject: [PATCH] loop: remember whether sysfs_create_group() succeeded

syzbot is hitting WARN() triggered by memory allocation fault
injection [1] because loop module is calling sysfs_remove_group()
when sysfs_create_group() failed.
Fix this by remembering whether sysfs_create_group() succeeded.

[1] 
https://syzkaller.appspot.com/bug?id=3f86c0edf75c86d2633aeb9dd69eccc70bc7e90b

Signed-off-by: Tetsuo Handa 
Reported-by: syzbot 

Reviewed-by: Greg Kroah-Hartman 
Cc: Jens Axboe 
---
 drivers/block/loop.c | 11 ++-
 drivers/block/loop.h |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5d4e316..1d758d8 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -809,16 +809,17 @@ static ssize_t loop_attr_dio_show(struct loop_device *lo, 
char *buf)
.attrs= loop_attrs,
 };
 
-static int loop_sysfs_init(struct loop_device *lo)
+static void loop_sysfs_init(struct loop_device *lo)
 {
-   return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
-     &loop_attribute_group);
+   lo->sysfs_ready = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+         &loop_attribute_group);
 }
 
 static void loop_sysfs_exit(struct loop_device *lo)
 {
-   sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
-      &loop_attribute_group);
+   if (lo->sysfs_ready)
+           sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+              &loop_attribute_group);
 }
 
 static void loop_config_discard(struct loop_device *lo)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index b78de98..73c801f 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -58,6 +58,7 @@ struct loop_device {
struct kthread_worker   worker;
struct task_struct  *worker_task;
booluse_dio;
+   boolsysfs_ready;
 
struct request_queue*lo_queue;
struct blk_mq_tag_set   tag_set;
-- 
1.8.3.1



Re: [PATCH V3 7/8] nvme: pci: recover controller reliably

2018-05-04 Thread Ming Lei
On Fri, May 04, 2018 at 04:28:23PM +0800, jianchao.wang wrote:
> Hi ming
> 
> On 05/04/2018 04:02 PM, Ming Lei wrote:
> >> nvme_error_handler should invoke nvme_reset_ctrl instead of introducing 
> >> another interface.
> >> Then it is more convenient to ensure that there will be only one resetting 
> >> instance running.
> >>
> > But as you mentioned above, reset_work has to be splitted into two
> > contexts for handling IO timeout during wait_freeze in reset_work,
> > so single instance of nvme_reset_ctrl() may not work well.
> 
> I mean the EH kthread and the reset_work which both could reset the ctrl 
> instead of
> the pre and post rest context.

That may run nvme_pre_reset_dev() twice, from both the EH thread and
reset_work, which doesn't look good.

> 
> Honestly, I suspect a bit that whether it is worthy to try to recover from 
> [1].
> The Eh kthread solution could make things easier, but the codes for recovery 
> from [1] has
> made code really complicated. It is more difficult to unify the nvme-pci, 
> rdma and fc. 
IMO the model is not complicated (one EH thread with two-stage resetting),
and it may be cloned to rdma, fc, ... without much difficulty.

The model is as follows:

1) a single EH thread, in which the controller is guaranteed to be
shut down, and which is woken up when the controller needs to be
recovered.

2) 1st stage resetting: recover the controller, which has to run in one
work context, since admin commands used for recovery may time out.

3) 2nd stage resetting: drain IO & update the controller state to live,
which has to run in another context, since IOs during this stage may
time out too.

The reset lock is used to sync between 1st stage resetting and 2nd
stage resetting. The '1st stage resetting' is scheduled from the EH
thread, and the '2nd stage resetting' is scheduled after the '1st stage
resetting' is done.

The implementation might not be too complicated either, since V3 has
already partitioned reset_work into two parts; what we need to do in V4
is to run each part in its own work item scheduled from the EH thread.
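
A rough skeleton of that model, just to make the flow concrete (eh_wq,
nvme_eh_required(), eh_pre_reset_work and eh_post_reset_work are
illustrative names, not the actual V4 code):

static int nvme_eh_thread(void *data)
{
	struct nvme_dev *dev = data;

	while (!kthread_should_stop()) {
		wait_event_interruptible(dev->eh_wq,
				nvme_eh_required(dev) ||
				kthread_should_stop());
		if (kthread_should_stop())
			break;

		/* the controller is guaranteed to be shut down here */
		nvme_dev_disable(dev, false);

		/* 1st stage: recover the controller in its own work,
		 * since admin commands issued there may time out */
		queue_work(nvme_wq, &dev->eh_pre_reset_work);
	}
	return 0;
}

Once the controller is recovered, eh_pre_reset_work schedules
eh_post_reset_work (2nd stage: drain IO, mark the controller live), again
in its own context so that new timeouts can still wake the EH thread; the
two stages serialize on the reset lock.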

> How about just fail the resetting as the Keith's solution ?
> 
> [1] io timeout when nvme_reset_work or the new nvme_post_reset_dev invoke 
> nvme_wait_freeze.
> 

I'd suggest not changing the controller state to DELETING in this case,
otherwise it may be reported as another bug in which the controller becomes
completely unusable. This issue can be easily triggered by blktests
block/011; in that test case the controller is always recoverable.

I will post out V4, and see if all current issues can be covered, and
how complicated the implementation is.

Thanks
Ming


Re: [PATCH V3 7/8] nvme: pci: recover controller reliably

2018-05-04 Thread jianchao.wang
Hi ming

On 05/04/2018 04:02 PM, Ming Lei wrote:
>> nvme_error_handler should invoke nvme_reset_ctrl instead of introducing 
>> another interface.
>> Then it is more convenient to ensure that there will be only one resetting 
>> instance running.
>>
> But as you mentioned above, reset_work has to be splitted into two
> contexts for handling IO timeout during wait_freeze in reset_work,
> so single instance of nvme_reset_ctrl() may not work well.

I mean the EH kthread and the reset_work, both of which could reset the
ctrl, instead of the pre- and post-reset contexts.

Honestly, I doubt a bit whether it is worth trying to recover from [1].
The EH kthread solution could make things easier, but the code for
recovering from [1] has made the code really complicated. It is more
difficult to unify nvme-pci, rdma and fc.

How about just failing the reset, as in Keith's solution?

[1] io timeout when nvme_reset_work or the new nvme_post_reset_dev invoke 
nvme_wait_freeze.

Thanks
Jianchao


Re: [PATCH V3 7/8] nvme: pci: recover controller reliably

2018-05-04 Thread Ming Lei
On Fri, May 04, 2018 at 02:10:19PM +0800, jianchao.wang wrote:
> Hi ming
> 
> On 05/04/2018 12:24 PM, Ming Lei wrote:
> >> Just invoke nvme_dev_disable in nvme_error_handler context and hand over 
> >> the other things
> >> to nvme_reset_work as the v2 patch series seems clearer.
> > That way may not fix the current race: nvme_dev_disable() will
> > quiesce/freeze queue again when resetting is in-progress, then
> > nothing can move on.
> 
> With timeout quiesce and nvme EH kthread, the nvme_dev_disable could ensure 
> all the in-flight
> requests to be handled. When we queue the reset_work in nvme_error_handler, 
> there will be no
> timeout anymore.

Right.

> If there is timeout due to the admin io in reset_work, this is expected, 
> reset_work will be
> interrupted and fail.

At least with V3, the admin IO submitted in reset_work won't be handled
well. When the EH thread hangs in the sync admin IO, nvme_eh_schedule()
can't wake up the EH thread any more, so the admin IO may hang forever
in reset_work. That is why we have to run nvme_pre_reset_dev() in another
context.

> If the io timeout when reset_work wait freeze, because the reset_work is 
> waiting, things cannot
> move on. we have multiple method to handle this:
> 1. as Keith's solution, change ctrl state to DELETING and fail the reset_work.
> 2. as current patch set, try to recovery it. but we have to split reset_work 
> into two contexts.
>

Right, nvme_dev_disable() can guarantee forward progress, and we have to
make sure that nvme_dev_disable() is called when timeout happens.

> Actually, there are many other places where the nvme_dev_disable will be 
> invoked.   
> > 
> > One big change in V3 is to run nvme_dev_disable() & nvme_pre_reset_dev()
> > in the EH thread, and make sure queues are started once
> > nvme_pre_reset_dev() returns. But as you pointed, it may not work well
> > enough if admin requests are timed-out in nvme_pre_reset_dev().
> > 
> >> The primary target of nvme_error_handler is to drain IOs and disable 
> >> controller.
> >> reset_work could take over the left things of the recovery work
> > Draining IOs isn't necessary, we can remove freezing queues & wait_freeze()
> > from nvme_dev_disable() & nvme_reset_work() for non-shutdown, that can be
> > easy to fix all these races since the .eh_reset_work isn't needed any more,
> > because quiescing is enough to prevent IOs from entering driver/device.
> > 
> > The only issue is that more requests may be failed when reset failed,
> > so I don't want to try that, and just follows the current behaviour.
> > 
> > IMO the primary goal of nvme EH is to recover controller, that is
> > what nvme_dev_disable() and resetting should do, if IO isn't drained,
> > timeout still may be triggered.
> > 
> > The big trouble is about percpu_ref, once the q_usage_counter is killed
> > to start freezing for preventing IO from entering block layer, we have
> > to run blk_mq_freeze_queue_wait() before unfreezing the usage couner. If
> > blk_mq_freeze_queue_wait() isn't needed, things can be much easier for us.
> 
> 
> Sorry for my bad description. I mean nvme_dev_disable will drain all the 
> in-flight requests and requeue or fail them, then we will not get any timeout 
> again.

OK, got it, sorry for misunderstanding your point.

> 
> In fact, I used to talk with Keith about remove freezing queues & wait_freeze 
> for non-shutdown.
> There is an obstacle, nvme_dev_add -> blk_mq_update_nr_hw_queues.
> When the ctrl comes back, the number of hw queues maybe changed.
> Even if we move the wait_freeze, blk_mq_update_nr_hw_queues will do that.
> On the other hand, if not freeze the queue, the requests that are submitted 
> to the dying hw queue 
> during the resetting, will be sacrificed.  

Right.

> 
> 
> > 
> >> IMO, modify the nvme_reset_ctrl_sync in nvme_error_handler to 
> >> nvme_reset_ctrl should be ok.
> >> If the admin ios in reset_work timeout, the nvme_error_handler could also 
> >> provide help to complete them.
> >>
> > Right, that is one goal of this patchset.
> > 
> >>> There are actually two issues here, both are covered in this patchset:
> >>>
> >>> 1) when nvme_wait_freeze() is for draining IO, controller error may
> >>> happen again, this patch can shutdown & reset controller again to
> >>> recover it.
> >>>
> >> We could split the nvme_reset_work into two contexts instead of introduce 
> >> another nvme_eh_reset.
> >> Otherwise, the code looks really complicated.
> > Yeah, that is what I plan to do in V4, introduce .eh_pre_reset_work and
> > .eh_post_reset_work, and run the following things in .eh_pre_reset_work:
> > 
> > - nvme_pre_reset_dev()
> > - schedule .eh_post_reset_work
> > 
> > and run draining IO & updating controller state in .eh_post_reset_work.
> > .eh_pre_reset_work will be scheduled in nvme_error_handler().
> > 
> > This way may address the issue of admin req timeout in nvme_pre_reset_dev(),
> > what do you think of this approach?
> 
> nvme_error_handler should invoke 

RE: [PATCH] block: add verifier for cmdline partition

2018-05-04 Thread Caizhiyong
> -Original Message-
> From: Wang YanQing [mailto:udkni...@gmail.com]
> Sent: Thursday, May 03, 2018 7:18 PM
> To: ax...@kernel.dk
> Cc: gre...@linuxfoundation.org; pombreda...@nexb.com;
> t...@linutronix.de; Caizhiyong ; linux-
> bl...@vger.kernel.org; linux-ker...@vger.kernel.org
> Subject: [PATCH] block: add verifier for cmdline partition
> 
> I meet strange filesystem corruption issue recently, the reason
> is there are overlaps partitions in cmdline partition argument.
> 
> This patch add verifier for cmdline partition, then if there are
> overlaps partitions, cmdline_partition will return error and log
> a error message.
> 

Partition overlap was intentionally designed into this cmdline partition.
Sometimes the cmdline partition stores raw data (not a filesystem), and the
overlap makes data access very convenient.

reference  
http://lists.infradead.org/pipermail/linux-mtd/2013-August/048092.html

> Signed-off-by: Wang YanQing 
> ---
>  block/partitions/cmdline.c | 63
> +-
>  1 file changed, 62 insertions(+), 1 deletion(-)
> 


Re: [PATCH V3 7/8] nvme: pci: recover controller reliably

2018-05-04 Thread jianchao.wang
Oh sorry.

On 05/04/2018 02:10 PM, jianchao.wang wrote:
> nvme_error_handler should invoke nvme_reset_ctrl instead of introducing 
> another interface.
> Then it is more convenient to ensure that there will be only one resetting 
> instance running.

The ctrl is still in the RESETTING state, so nvme_reset_ctrl cannot work.

Thanks
Jianchao


Re: [PATCH V3 7/8] nvme: pci: recover controller reliably

2018-05-04 Thread jianchao.wang
Hi ming

On 05/04/2018 12:24 PM, Ming Lei wrote:
>> Just invoke nvme_dev_disable in nvme_error_handler context and hand over the 
>> other things
>> to nvme_reset_work as the v2 patch series seems clearer.
> That way may not fix the current race: nvme_dev_disable() will
> quiesce/freeze queue again when resetting is in-progress, then
> nothing can move on.

With timeout quiesce and the nvme EH kthread, nvme_dev_disable could ensure
all the in-flight requests are handled. When we queue the reset_work in
nvme_error_handler, there will be no timeout anymore.
If there is a timeout due to the admin IO in reset_work, this is expected;
reset_work will be interrupted and fail.
If an IO times out while reset_work waits for the freeze, things cannot
move on because the reset_work is waiting. We have multiple methods to
handle this:
1. as Keith's solution, change the ctrl state to DELETING and fail the
reset_work.
2. as in the current patch set, try to recover it, but we have to split
reset_work into two contexts.

Actually, there are many other places where nvme_dev_disable will be
invoked.
> 
> One big change in V3 is to run nvme_dev_disable() & nvme_pre_reset_dev()
> in the EH thread, and make sure queues are started once
> nvme_pre_reset_dev() returns. But as you pointed, it may not work well
> enough if admin requests are timed-out in nvme_pre_reset_dev().
> 
>> The primary target of nvme_error_handler is to drain IOs and disable 
>> controller.
>> reset_work could take over the left things of the recovery work
> Draining IOs isn't necessary, we can remove freezing queues & wait_freeze()
> from nvme_dev_disable() & nvme_reset_work() for non-shutdown, that can be
> easy to fix all these races since the .eh_reset_work isn't needed any more,
> because quiescing is enough to prevent IOs from entering driver/device.
> 
> The only issue is that more requests may be failed when reset failed,
> so I don't want to try that, and just follows the current behaviour.
> 
> IMO the primary goal of nvme EH is to recover controller, that is
> what nvme_dev_disable() and resetting should do, if IO isn't drained,
> timeout still may be triggered.
> 
> The big trouble is about percpu_ref, once the q_usage_counter is killed
> to start freezing for preventing IO from entering block layer, we have
> to run blk_mq_freeze_queue_wait() before unfreezing the usage couner. If
> blk_mq_freeze_queue_wait() isn't needed, things can be much easier for us.


Sorry for my bad description. I mean nvme_dev_disable will drain all the
in-flight requests and requeue or fail them, so we will not get any timeout
again.

In fact, I used to talk with Keith about removing freezing queues &
wait_freeze for the non-shutdown case.
There is an obstacle: nvme_dev_add -> blk_mq_update_nr_hw_queues.
When the ctrl comes back, the number of hw queues may be changed.
Even if we move the wait_freeze, blk_mq_update_nr_hw_queues will do that.
On the other hand, if we do not freeze the queue, the requests that are
submitted to the dying hw queue during the resetting will be sacrificed.
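
For context, the part of the reset path being discussed looks roughly like
this (paraphrased from nvme_reset_work()/nvme_dev_add() around this kernel
version, not an exact copy):

	/* paraphrased sketch of the tail of nvme_reset_work() */
	if (dev->online_queues > 1) {
		nvme_start_queues(&dev->ctrl);
		nvme_wait_freeze(&dev->ctrl);	/* IO timeouts here are the
						 * case under discussion */
		nvme_dev_add(dev);	/* calls blk_mq_update_nr_hw_queues(),
					 * which itself freezes the queues */
		nvme_unfreeze(&dev->ctrl);
	}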


> 
>> IMO, modify the nvme_reset_ctrl_sync in nvme_error_handler to 
>> nvme_reset_ctrl should be ok.
>> If the admin ios in reset_work timeout, the nvme_error_handler could also 
>> provide help to complete them.
>>
> Right, that is one goal of this patchset.
> 
>>> There are actually two issues here, both are covered in this patchset:
>>>
>>> 1) when nvme_wait_freeze() is for draining IO, controller error may
>>> happen again, this patch can shutdown & reset controller again to
>>> recover it.
>>>
>> We could split the nvme_reset_work into two contexts instead of introduce 
>> another nvme_eh_reset.
>> Otherwise, the code looks really complicated.
> Yeah, that is what I plan to do in V4, introduce .eh_pre_reset_work and
> .eh_post_reset_work, and run the following things in .eh_pre_reset_work:
> 
>   - nvme_pre_reset_dev()
>   - schedule .eh_post_reset_work
> 
> and run draining IO & updating controller state in .eh_post_reset_work.
> .eh_pre_reset_work will be scheduled in nvme_error_handler().
> 
> This way may address the issue of admin req timeout in nvme_pre_reset_dev(),
> what do you think of this approach?

nvme_error_handler should invoke nvme_reset_ctrl instead of introducing another 
interface.
Then it is more convenient to ensure that there will be only one resetting 
instance running.

Thanks
Jianchao