[PATCH 06/28] blk-mq: remove the request_list usage

2018-10-25 Thread Jens Axboe
We don't do anything with it, that's just the legacy path.

Signed-off-by: Jens Axboe 
---
 block/blk-mq.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3f91c6e5b17a..4c82dc44d4d8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -510,9 +510,6 @@ void blk_mq_free_request(struct request *rq)
 
rq_qos_done(q, rq);
 
-   if (blk_rq_rl(rq))
-   blk_put_rl(blk_rq_rl(rq));
-
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
@@ -1675,8 +1672,6 @@ static void blk_mq_bio_to_request(struct request *rq, 
struct bio *bio)
 {
blk_init_request_from_bio(rq, bio);
 
-   blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
-
blk_account_io_start(rq, true);
 }
 
-- 
2.17.1



[PATCH 11/28] bsg: pass in desired timeout handler

2018-10-25 Thread Jens Axboe
This will ease the conversion to blk-mq, where we can't set
a timeout handler after queue init.

Cc: Johannes Thumshirn 
Cc: Benjamin Block 
Cc: linux-scsi@vger.kernel.org
Signed-off-by: Jens Axboe 
---
 block/bsg-lib.c | 3 ++-
 drivers/scsi/scsi_transport_fc.c| 7 +++
 drivers/scsi/scsi_transport_iscsi.c | 2 +-
 drivers/scsi/scsi_transport_sas.c   | 4 ++--
 include/linux/bsg-lib.h | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index f3501cdaf1a6..1da011ec04e6 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -304,7 +304,7 @@ static void bsg_exit_rq(struct request_queue *q, struct 
request *req)
  * @dd_job_size: size of LLD data needed for each job
  */
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-   bsg_job_fn *job_fn, int dd_job_size)
+   bsg_job_fn *job_fn, rq_timed_out_fn *timeout, int dd_job_size)
 {
struct request_queue *q;
int ret;
@@ -327,6 +327,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, 
const char *name,
blk_queue_flag_set(QUEUE_FLAG_BIDI, q);
blk_queue_softirq_done(q, bsg_softirq_done);
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
+   blk_queue_rq_timed_out(q, timeout);
 
ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
if (ret) {
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 381668fa135d..98aaffb4c715 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3780,7 +3780,8 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct 
fc_host_attrs *fc_host)
snprintf(bsg_name, sizeof(bsg_name),
 "fc_host%d", shost->host_no);
 
-   q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size);
+   q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, fc_bsg_job_timeout,
+   i->f->dd_bsg_size);
if (IS_ERR(q)) {
dev_err(dev,
"fc_host%d: bsg interface failed to initialize - setup 
queue\n",
@@ -3788,7 +3789,6 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct 
fc_host_attrs *fc_host)
return PTR_ERR(q);
}
__scsi_init_queue(shost, q);
-   blk_queue_rq_timed_out(q, fc_bsg_job_timeout);
blk_queue_rq_timeout(q, FC_DEFAULT_BSG_TIMEOUT);
fc_host->rqst_q = q;
return 0;
@@ -3826,14 +3826,13 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct 
fc_rport *rport)
return -ENOTSUPP;
 
q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch,
-   i->f->dd_bsg_size);
+   fc_bsg_job_timeout, i->f->dd_bsg_size);
if (IS_ERR(q)) {
dev_err(dev, "failed to setup bsg queue\n");
return PTR_ERR(q);
}
__scsi_init_queue(shost, q);
blk_queue_prep_rq(q, fc_bsg_rport_prep);
-   blk_queue_rq_timed_out(q, fc_bsg_job_timeout);
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
rport->rqst_q = q;
return 0;
diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index 6fd2fe210fc3..26b11a775be9 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1542,7 +1542,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct 
iscsi_cls_host *ihost)
return -ENOTSUPP;
 
snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no);
-   q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0);
+   q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, NULL, 0);
if (IS_ERR(q)) {
shost_printk(KERN_ERR, shost, "bsg interface failed to "
 "initialize - no request queue\n");
diff --git a/drivers/scsi/scsi_transport_sas.c 
b/drivers/scsi/scsi_transport_sas.c
index 0a165b2b3e81..cf6d47891d77 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -198,7 +198,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, 
struct sas_rphy *rphy)
 
if (rphy) {
q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev),
-   sas_smp_dispatch, 0);
+   sas_smp_dispatch, NULL, 0);
if (IS_ERR(q))
return PTR_ERR(q);
rphy->q = q;
@@ -207,7 +207,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, 
struct sas_rphy *rphy)
 
snprintf(name, sizeof(name), "sas_host%d", shost->host_no);
q = bsg_setup_queue(&shost->shost_gendev, name,
-   sas_smp_dispatch, 0);
+   sas

[PATCH 01/28] sunvdc: convert to blk-mq

2018-10-25 Thread Jens Axboe
Convert from the old request_fn style driver to blk-mq.

Cc: David Miller 
Signed-off-by: Jens Axboe 
---
 drivers/block/sunvdc.c | 149 +++--
 1 file changed, 98 insertions(+), 51 deletions(-)

diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index b54fa6726303..95cb4ea8e402 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -6,7 +6,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -66,9 +66,10 @@ struct vdc_port {
 
u64 max_xfer_size;
u32 vdisk_block_size;
+   u32 drain;
 
u64 ldc_timeout;
-   struct timer_list   ldc_reset_timer;
+   struct delayed_work ldc_reset_timer_work;
struct work_struct  ldc_reset_work;
 
/* The server fills these in for us in the disk attribute
@@ -80,12 +81,14 @@ struct vdc_port {
u8  vdisk_mtype;
u32 vdisk_phys_blksz;
 
+   struct blk_mq_tag_set   tag_set;
+
chardisk_name[32];
 };
 
 static void vdc_ldc_reset(struct vdc_port *port);
 static void vdc_ldc_reset_work(struct work_struct *work);
-static void vdc_ldc_reset_timer(struct timer_list *t);
+static void vdc_ldc_reset_timer_work(struct work_struct *work);
 
 static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
 {
@@ -175,11 +178,8 @@ static void vdc_blk_queue_start(struct vdc_port *port)
 * handshake completes, so check for initial handshake before we've
 * allocated a disk.
 */
-   if (port->disk && blk_queue_stopped(port->disk->queue) &&
-   vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) {
-   blk_start_queue(port->disk->queue);
-   }
-
+   if (port->disk && vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50)
+   blk_mq_start_hw_queues(port->disk->queue);
 }
 
 static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for)
@@ -197,7 +197,7 @@ static void vdc_handshake_complete(struct vio_driver_state 
*vio)
 {
struct vdc_port *port = to_vdc_port(vio);
 
-   del_timer(&port->ldc_reset_timer);
+   cancel_delayed_work(&port->ldc_reset_timer_work);
vdc_finish(vio, 0, WAITING_FOR_LINK_UP);
vdc_blk_queue_start(port);
 }
@@ -320,7 +320,7 @@ static void vdc_end_one(struct vdc_port *port, struct 
vio_dring_state *dr,
 
rqe->req = NULL;
 
-   __blk_end_request(req, (desc->status ? BLK_STS_IOERR : 0), desc->size);
+   blk_mq_end_request(req, desc->status ? BLK_STS_IOERR : 0);
 
vdc_blk_queue_start(port);
 }
@@ -525,29 +525,40 @@ static int __send_request(struct request *req)
return err;
 }
 
-static void do_vdc_request(struct request_queue *rq)
+static blk_status_t vdc_queue_rq(struct blk_mq_hw_ctx *hctx,
+const struct blk_mq_queue_data *bd)
 {
-   struct request *req;
+   struct vdc_port *port = hctx->queue->queuedata;
+   struct vio_dring_state *dr;
+   unsigned long flags;
 
-   while ((req = blk_peek_request(rq)) != NULL) {
-   struct vdc_port *port;
-   struct vio_dring_state *dr;
+   dr = &port->vio.drings[VIO_DRIVER_TX_RING];
 
-   port = req->rq_disk->private_data;
-   dr = &port->vio.drings[VIO_DRIVER_TX_RING];
-   if (unlikely(vdc_tx_dring_avail(dr) < 1))
-   goto wait;
+   blk_mq_start_request(bd->rq);
 
-   blk_start_request(req);
+   spin_lock_irqsave(&port->vio.lock, flags);
 
-   if (__send_request(req) < 0) {
-   blk_requeue_request(rq, req);
-wait:
-   /* Avoid pointless unplugs. */
-   blk_stop_queue(rq);
-   break;
-   }
+   /*
+* Doing drain, just end the request in error
+*/
+   if (unlikely(port->drain)) {
+   spin_unlock_irqrestore(&port->vio.lock, flags);
+   return BLK_STS_IOERR;
}
+
+   if (unlikely(vdc_tx_dring_avail(dr) < 1)) {
+   spin_unlock_irqrestore(&port->vio.lock, flags);
+   blk_mq_stop_hw_queue(hctx);
+   return BLK_STS_DEV_RESOURCE;
+   }
+
+   if (__send_request(bd->rq) < 0) {
+   spin_unlock_irqrestore(&port->vio.lock, flags);
+   return BLK_STS_IOERR;
+   }
+
+   spin_unlock_irqrestore(&port->vio.lock, flags);
+   return BLK_STS_OK;
 }
 
 static int generic_request(struct vdc_port *port, u8 op, void *buf, int len)
@@ -759,6 +770,32 @@ static void vdc_port_down(struct vdc_port *port)
vio_ldc_free(&port->

Re: [PATCH v4 00/11] Zoned block device support improvements

2018-10-25 Thread Jens Axboe
On 10/24/18 10:04 AM, Bart Van Assche wrote:
> On Wed, 2018-10-24 at 11:37 -0400, Martin K. Petersen wrote:
>> Mike,
>>
>>>> You keep mentioning this, but I don't recall ever seeing anything to
>>>> that effect. The rest of the kernel appears to be either arbitrary
>>>> ordering or favoring author SoB as the first tag.
>>>
>>> I've always felt the proper order is how Jens likes it too (all dm
>>> commits from me follow that order).
>>
>> That's fine, I don't have any particular preference. And I don't have
>> any issue with you guys sticking to a certain ordering in your
>> respective subsystems. I occasionally shuffle tags when I commit things
>> in SCSI too.
>>
>> I just think it should be properly documented if there is a preferred
>> way to order things...
> 
> When I tried to look up documentation for this I couldn't find anything under
> the Documentation directory. Maybe it's there but I didn't look carefully
> enough. All I could find on the web are e-mails from Linus in which he explains
> that the order of Signed-off-by's should match the chain of authorship.

I don't think there's any documentation on it, none that I've seen.
Haven't looked for it, though.

It's more of a "it always looked like that", until we got patchwork messing
things up. And then people see that, and do the same. It's a bit
frustrating. I like to be able to see the SOB chain in a patch, and if
it's intermingled with other things, it's much harder to read. At least
for me.

I'll continue fixing these up, but I do hope that at least the regulars
on the block side use the proper formatting.

-- 
Jens Axboe



Re: [PATCH] bsg: convert to use blk-mq

2018-10-24 Thread Jens Axboe
On 10/24/18 6:02 AM, Jens Axboe wrote:
> On 10/24/18 5:52 AM, Jens Axboe wrote:
>> On 10/24/18 5:30 AM, Jens Axboe wrote:
>>> On 10/24/18 5:19 AM, Christoph Hellwig wrote:
>>>> On Mon, Oct 22, 2018 at 03:23:30AM -0600, Jens Axboe wrote:
>>>>> JFYI, I also reordered the series to make it correct. You can apply
>>>>> this one:
>>>>>
>>>>> http://git.kernel.dk/cgit/linux-block/commit/?h=mq-conversions&id=2b2ffa16193e9a69a076595ed64429b8cc9b42aa
>>>>>
>>>>> before the bsg patch, and it should be fine. Or just use the above branch,
>>>>> of course.
>>>>
>>>> Hell no on that one.  The behavior of having methods right on the
>>>> request_queue which can be changed any time is something we absolutely
>>>> must not introduce into blk-mq.
>>>
>>> I agree it's not the prettiest, but it's not like it's something
>>> that is changed at runtime.
>>>
>>>> Just pass a timeout handler to bsg_register_queue which is called
>>>> from the bsg ->timeout handler; that is a much better way to sort out your
>>>> problem.  It can also easily be turned into an independent prep patch.
>>>
>>> Good idea, that is cleaner.
>>
>> Except that STILL doesn't work with mq_ops being a constant, I'd have
>> to allocate it.
> 
> Which obviously won't work. I don't see a good way out of this, since
> I can't store the private timeout handler anywhere. Open to suggestions,
> but until something better comes up, I'm keeping q->timeout and
> removing the API to set it. bsg-lib can just manually set it in the
> queue.

Pushed that out - it's now ->bsg_job_timeout_fn, and nobody sets it but
bsg when it initializes the queue. bsg sets up a default timeout
handler, and calls ->bsg_job_timeout_fn, if defined.  Not that that is
any different than from before, but at least it's obvious what it's for
now.

-- 
Jens Axboe



Re: [PATCH] bsg: convert to use blk-mq

2018-10-24 Thread Jens Axboe
On 10/24/18 5:52 AM, Jens Axboe wrote:
> On 10/24/18 5:30 AM, Jens Axboe wrote:
>> On 10/24/18 5:19 AM, Christoph Hellwig wrote:
>>> On Mon, Oct 22, 2018 at 03:23:30AM -0600, Jens Axboe wrote:
>>>> JFYI, I also reordered the series to make it correct. You can apply
>>>> this one:
>>>>
>>>> http://git.kernel.dk/cgit/linux-block/commit/?h=mq-conversions&id=2b2ffa16193e9a69a076595ed64429b8cc9b42aa
>>>>
>>>> before the bsg patch, and it should be fine. Or just use the above branch,
>>>> of course.
>>>
>>> Hell no on that one.  The behavior of having methods right on the
>>> request_queue which can be changed any time is something we absolutely
>>> must not introduce into blk-mq.
>>
>> I agree it's not the prettiest, but it's not like it's something
>> that is changed at runtime.
>>
>>> Just pass a timeout handler to bsg_register_queue which is called
>>> from the bsg ->timeout handler; that is a much better way to sort out your
>>> problem.  It can also easily be turned into an independent prep patch.
>>
>> Good idea, that is cleaner.
> 
> Except that STILL doesn't work with mq_ops being a constant, I'd have
> to allocate it.

Which obviously won't work. I don't see a good way out of this, since
I can't store the private timeout handler anywhere. Open to suggestions,
but until something better comes up, I'm keeping q->timeout and
removing the API to set it. bsg-lib can just manually set it in the
queue.

-- 
Jens Axboe



Re: [PATCH] bsg: convert to use blk-mq

2018-10-24 Thread Jens Axboe
On 10/24/18 5:30 AM, Jens Axboe wrote:
> On 10/24/18 5:19 AM, Christoph Hellwig wrote:
>> On Mon, Oct 22, 2018 at 03:23:30AM -0600, Jens Axboe wrote:
>>> JFYI, I also reordered the series to make it correct. You can apply
>>> this one:
>>>
>>> http://git.kernel.dk/cgit/linux-block/commit/?h=mq-conversions&id=2b2ffa16193e9a69a076595ed64429b8cc9b42aa
>>>
>>> before the bsg patch, and it should be fine. Or just use the above branch,
>>> of course.
>>
>> Hell no on that one.  The behavior of having methods right on the
>> request_queue which can be changed any time is something we absolutely
>> must not introduce into blk-mq.
> 
> I agree it's not the prettiest, but it's not like it's something
> that is changed at runtime.
> 
>> Just pass a timeout handler to bsg_register_queue which is called
>> from the bsg ->timeout handler; that is a much better way to sort out your
>> problem.  It can also easily be turned into an independent prep patch.
> 
> Good idea, that is cleaner.

Except that STILL doesn't work with mq_ops being a constant, I'd have
to allocate it.

-- 
Jens Axboe



Re: [PATCH] bsg: convert to use blk-mq

2018-10-24 Thread Jens Axboe
On 10/24/18 5:19 AM, Christoph Hellwig wrote:
> On Mon, Oct 22, 2018 at 03:23:30AM -0600, Jens Axboe wrote:
>> JFYI, I also reordered the series to make it correct. You can apply
>> this one:
>>
>> http://git.kernel.dk/cgit/linux-block/commit/?h=mq-conversions&id=2b2ffa16193e9a69a076595ed64429b8cc9b42aa
>>
>> before the bsg patch, and it should be fine. Or just use the above branch,
>> of course.
> 
> Hell no on that one.  The behavior of having methods right on the
> request_queue which can be changed any time is something we absolutely
> must not introduce into blk-mq.

I agree it's not the prettiest, but it's not like it's something
that is changed at runtime.

> Just pass a timeout handler to bsg_register_queue which is called
> from the bsg ->timeout handler; that is a much better way to sort out your
> problem.  It can also easily be turned into an independent prep patch.

Good idea, that is cleaner.

-- 
Jens Axboe



Re: [PATCH] bsg: convert to use blk-mq

2018-10-23 Thread Jens Axboe
On 10/23/18 11:40 AM, Benjamin Block wrote:
> On Mon, Oct 22, 2018 at 06:38:36AM -0600, Jens Axboe wrote:
>> On 10/22/18 4:03 AM, Benjamin Block wrote:
>>> On Fri, Oct 19, 2018 at 09:50:53AM -0600, Jens Axboe wrote:
>>>
>>> Ok so, that gets past the stage where we initialize the queues. Simple
>>> SCSI-I/O also seems to work, that is for example an INQUIRY(10), but
>>> transport commands that get passed to the driver break. Tried to send
>>> a FibreChannel GPN_FT (remote port discovery).
>>>
>>> As the BSG interface goes. This is a bidirectional command, that has
>>> both a buffer for the request and for the reply. AFAIR BSG will create a
>>> struct request for each of them. Protocol is BSG_PROTOCOL_SCSI,
>>> Subprotocol BSG_SUB_PROTOCOL_SCSI_TRANSPORT. The rest should be
>>> transparent till we get into the driver.
>>>
>>> First got this:
>>>
>>> [  566.531100] BUG: sleeping function called from invalid context at 
>>> mm/slab.h:421
>>> [  566.531452] in_atomic(): 1, irqs_disabled(): 0, pid: 3104, name: 
>>> bsg_api_test
>>> [  566.531460] 1 lock held by bsg_api_test/3104:
>>> [  566.531466]  #0: cb4b58e8 (rcu_read_lock){}, at: 
>>> hctx_lock+0x30/0x118
>>> [  566.531498] Preemption disabled at:
>>> [  566.531503] [<008175d0>] __blk_mq_delay_run_hw_queue+0x50/0x218
>>> [  566.531519] CPU: 3 PID: 3104 Comm: bsg_api_test Tainted: GW  
>>>4.19.0-rc6-bb-next+ #1
>>> [  566.531527] Hardware name: IBM 3906 M03 704 (LPAR)
>>> [  566.531533] Call Trace:
>>> [  566.531544] ([<001167fa>] show_stack+0x8a/0xd8)
>>> [  566.531555]  [<00bcc6d2>] dump_stack+0x9a/0xd8
>>> [  566.531565]  [<00196410>] ___might_sleep+0x280/0x298
>>> [  566.531576]  [<003e528c>] __kmalloc+0xbc/0x560
>>> [  566.531584]  [<0083186a>] bsg_map_buffer+0x5a/0xb0
>>> [  566.531591]  [<00831948>] bsg_queue_rq+0x88/0x118
>>> [  566.531599]  [<0081ab56>] blk_mq_dispatch_rq_list+0x37e/0x670
>>> [  566.531607]  [<0082050e>] blk_mq_do_dispatch_sched+0x11e/0x130
>>> [  566.531615]  [<00820dfe>] 
>>> blk_mq_sched_dispatch_requests+0x156/0x1a0
>>> [  566.531622]  [<00817564>] __blk_mq_run_hw_queue+0x144/0x160
>>> [  566.531630]  [<00817614>] __blk_mq_delay_run_hw_queue+0x94/0x218
>>> [  566.531638]  [<008178b2>] blk_mq_run_hw_queue+0xda/0xf0
>>> [  566.531645]  [<008211d8>] blk_mq_sched_insert_request+0x1a8/0x1e8
>>> [  566.531653]  [<00811ee2>] blk_execute_rq_nowait+0x72/0x80
>>> [  566.531660]  [<00811f66>] blk_execute_rq+0x76/0xb8
>>> [  566.531778]  [<00830d0e>] bsg_ioctl+0x426/0x500
>>> [  566.531787]  [<00440cb4>] do_vfs_ioctl+0x68c/0x710
>>> [  566.531794]  [<00440dac>] ksys_ioctl+0x74/0xa0
>>> [  566.531801]  [<00440e0a>] sys_ioctl+0x32/0x40
>>> [  566.531808]  [<00bf1dd0>] system_call+0xd8/0x2d0
>>> [  566.531815] 1 lock held by bsg_api_test/3104:
>>> [  566.531821]  #0: cb4b58e8 (rcu_read_lock){}, at: 
>>> hctx_lock+0x30/0x118
>>>
>>
>> The first one is an easy fix, not sure how I missed that. The other
>> one I have no idea, any chance you could try with this one:
>>
>> http://git.kernel.dk/cgit/linux-block/commit/?h=mq-conversions&id=142dc9f36e3113b6a76d472978c33c8c2a2b702c
>>
>> which fixes the first one, and also corrects a wrong end_io call,
>> but I don't think that's the cause of the above.
>>
>> If it crashes, can you figure out where in the source that is?
>> Basically just do
>>
>> gdb vmlinux
>> l *zfcp_fc_exec_bsg_job+0x116
>>
>> assuming that works fine on s390 :-)
>>
> 
> So I tried 4.19.0 with only the two patches from you:
> http://git.kernel.dk/cgit/linux-block/commit/?h=mq-conversions&id=2b2ffa16193e9a69a076595ed64429b8cc9b42aa
> http://git.kernel.dk/cgit/linux-block/commit/?h=mq-conversions&id=142dc9f36e3113b6a76d472978c33c8c2a2b702c
> 
> This fixed the first warning from before, as you suggested, but it still
> crash like this:
> 
> [ ] Unable to handle kernel pointer dereference in virtual kernel address 
> space
> [ ] Failing address:  TEID: 0483
> [ ] Fault in home space mode while using kernel ASCE.
> [ ] AS:025f0007 R3:

Re: [PATCH v4 00/11] Zoned block device support improvements

2018-10-23 Thread Jens Axboe
On 10/12/18 4:08 AM, Damien Le Moal wrote:
> This series improves zoned block device support (reduce overhead) and
> introduces many simplifications to the code (overall, there are more deletions
> than insertions).
> 
> In more details:
> * Patches 1 to 3 are SCSI side (sd driver) cleanups and improvements reducing
>   the overhead of report zones command execution during disk scan and
>   revalidation.
> * Patches 4 to 9 improve the usability and user API of zoned block devices.
> * Patch 10 is the main part of this series. This patch replaces the
>   REQ_OP_ZONE_REPORT BIO/request operation for executing report zones commands
>   with a block device file operation, removing the need for the command reply
>   payload in-place rewriting in the BIO buffer. This leads to major
>   simplification of the code in many places.
> * Patch 11 further simplifies the code of low level drivers by providing a
>   generic implementation of zoned block device request queue zone bitmaps
>   initialization and revalidation.

I've applied this, but I have two complaints:

1) Two had to be hand applied, it wasn't against the block tree.
2) The ordering of the signed-off-by. Someone told me that this is
   patchwork, but I absolutely hate it. SOB should go last, not before
   the reviewed-by. I fixed that up too.

-- 
Jens Axboe



Re: [PATCH] bsg: convert to use blk-mq

2018-10-22 Thread Jens Axboe
On 10/22/18 9:21 AM, Benjamin Block wrote:
> On Mon, Oct 22, 2018 at 06:38:36AM -0600, Jens Axboe wrote:
>> On 10/22/18 4:03 AM, Benjamin Block wrote:
>>> On Fri, Oct 19, 2018 at 09:50:53AM -0600, Jens Axboe wrote:
>>>> On 10/19/18 9:01 AM, Benjamin Block wrote:
>>>>> On Wed, Oct 17, 2018 at 10:01:16AM -0600, Jens Axboe wrote:
>>>>>> On 10/17/18 9:55 AM, Benjamin Block wrote:
>>>>>>> On Tue, Oct 16, 2018 at 08:43:01AM -0600, Jens Axboe wrote:
>>>>>>>> Requires a few changes to the FC transport class as well.
>>>>>>>>
>>>>>>>> Cc: Johannes Thumshirn 
>>>>>>>> Cc: Benjamin Block 
>>>>>>>> Cc: linux-scsi@vger.kernel.org
>>>>>>>> Signed-off-by: Jens Axboe 
>>>>>>>> ---
>>>>>>>>  block/bsg-lib.c  | 102 +--
>>>>>>>>  drivers/scsi/scsi_transport_fc.c |  61 ++
>>>>>>>>  2 files changed, 91 insertions(+), 72 deletions(-)
>>>>>>>>
>>>>
>>>> but that's not going to apply cleanly... Can you just try and run my
>>>> mq-conversions branch? That has everything, and it also has that
>>>> alloc failure fixed.
>>>>
>>>> git://git.kernel.dk/linux-block mq-conversions
>>>>
>>>
>>> Ok so, that gets past the stage where we initialize the queues. Simple
>>> SCSI-I/O also seems to work, that is for example an INQUIRY(10), but
>>> transport commands that get passed to the driver break. Tried to send
>>> a FibreChannel GPN_FT (remote port discovery).
>>>
>>> As the BSG interface goes. This is a bidirectional command, that has
>>> both a buffer for the request and for the reply. AFAIR BSG will create a
>>> struct request for each of them. Protocol is BSG_PROTOCOL_SCSI,
>>> Subprotocol BSG_SUB_PROTOCOL_SCSI_TRANSPORT. The rest should be
>>> transparent till we get into the driver.
>>>
>>> First got this:
>>>
>>> [  566.531100] BUG: sleeping function called from invalid context at 
>>> mm/slab.h:421
>>> [  566.531452] in_atomic(): 1, irqs_disabled(): 0, pid: 3104, name: 
>>> bsg_api_test
>>> [  566.531460] 1 lock held by bsg_api_test/3104:
>>> [  566.531466]  #0: cb4b58e8 (rcu_read_lock){}, at: 
>>> hctx_lock+0x30/0x118
>>> [  566.531498] Preemption disabled at:
>>> [  566.531503] [<008175d0>] __blk_mq_delay_run_hw_queue+0x50/0x218
>>> [  566.531519] CPU: 3 PID: 3104 Comm: bsg_api_test Tainted: GW  
>>>4.19.0-rc6-bb-next+ #1
>>> [  566.531527] Hardware name: IBM 3906 M03 704 (LPAR)
>>> [  566.531533] Call Trace:
>>> [  566.531544] ([<001167fa>] show_stack+0x8a/0xd8)
>>> [  566.531555]  [<00bcc6d2>] dump_stack+0x9a/0xd8
>>> [  566.531565]  [<00196410>] ___might_sleep+0x280/0x298
>>> [  566.531576]  [<003e528c>] __kmalloc+0xbc/0x560
>>> [  566.531584]  [<0083186a>] bsg_map_buffer+0x5a/0xb0
>>> [  566.531591]  [<00831948>] bsg_queue_rq+0x88/0x118
>>> [  566.531599]  [<0081ab56>] blk_mq_dispatch_rq_list+0x37e/0x670
>>> [  566.531607]  [<0082050e>] blk_mq_do_dispatch_sched+0x11e/0x130
>>> [  566.531615]  [<00820dfe>] 
>>> blk_mq_sched_dispatch_requests+0x156/0x1a0
>>> [  566.531622]  [<00817564>] __blk_mq_run_hw_queue+0x144/0x160
>>> [  566.531630]  [<00817614>] __blk_mq_delay_run_hw_queue+0x94/0x218
>>> [  566.531638]  [<008178b2>] blk_mq_run_hw_queue+0xda/0xf0
>>> [  566.531645]  [<008211d8>] blk_mq_sched_insert_request+0x1a8/0x1e8
>>> [  566.531653]  [<00811ee2>] blk_execute_rq_nowait+0x72/0x80
>>> [  566.531660]  [<00811f66>] blk_execute_rq+0x76/0xb8
>>> [  566.531778]  [<00830d0e>] bsg_ioctl+0x426/0x500
>>> [  566.531787]  [<00440cb4>] do_vfs_ioctl+0x68c/0x710
>>> [  566.531794]  [<00440dac>] ksys_ioctl+0x74/0xa0
>>> [  566.531801]  [<00440e0a>] sys_ioctl+0x32/0x40
>>> [  566.531808]  [<00bf1dd0>] system_call+0xd8/0x2d0
>>> [  566.531815] 1 lock held by bsg_api_test/3104:
>>> [  566.531821]  #0: cb4b58e8 (rcu_read_lock){}, at: 
>>> hctx_lock+0x30/0x118
&g

Re: [PATCH] bsg: convert to use blk-mq

2018-10-22 Thread Jens Axboe
On 10/22/18 4:03 AM, Benjamin Block wrote:
> On Fri, Oct 19, 2018 at 09:50:53AM -0600, Jens Axboe wrote:
>> On 10/19/18 9:01 AM, Benjamin Block wrote:
>>> On Wed, Oct 17, 2018 at 10:01:16AM -0600, Jens Axboe wrote:
>>>> On 10/17/18 9:55 AM, Benjamin Block wrote:
>>>>> On Tue, Oct 16, 2018 at 08:43:01AM -0600, Jens Axboe wrote:
>>>>>> Requires a few changes to the FC transport class as well.
>>>>>>
>>>>>> Cc: Johannes Thumshirn 
>>>>>> Cc: Benjamin Block 
>>>>>> Cc: linux-scsi@vger.kernel.org
>>>>>> Signed-off-by: Jens Axboe 
>>>>>> ---
>>>>>>  block/bsg-lib.c  | 102 +--
>>>>>>  drivers/scsi/scsi_transport_fc.c |  61 ++
>>>>>>  2 files changed, 91 insertions(+), 72 deletions(-)
>>>>>>
>>
>> but that's not going to apply cleanly... Can you just try and run my
>> mq-conversions branch? That has everything, and it also has that
>> alloc failure fixed.
>>
>> git://git.kernel.dk/linux-block mq-conversions
>>
> 
> Ok so, that gets past the stage where we initialize the queues. Simple
> SCSI-I/O also seems to work, that is for example an INQUIRY(10), but
> transport commands that get passed to the driver break. Tried to send
> a FibreChannel GPN_FT (remote port discovery).
> 
> As the BSG interface goes. This is a bidirectional command, that has
> both a buffer for the request and for the reply. AFAIR BSG will create a
> struct request for each of them. Protocol is BSG_PROTOCOL_SCSI,
> Subprotocol BSG_SUB_PROTOCOL_SCSI_TRANSPORT. The rest should be
> transparent till we get into the driver.
> 
> First got this:
> 
> [  566.531100] BUG: sleeping function called from invalid context at 
> mm/slab.h:421
> [  566.531452] in_atomic(): 1, irqs_disabled(): 0, pid: 3104, name: 
> bsg_api_test
> [  566.531460] 1 lock held by bsg_api_test/3104:
> [  566.531466]  #0: cb4b58e8 (rcu_read_lock){}, at: 
> hctx_lock+0x30/0x118
> [  566.531498] Preemption disabled at:
> [  566.531503] [<008175d0>] __blk_mq_delay_run_hw_queue+0x50/0x218
> [  566.531519] CPU: 3 PID: 3104 Comm: bsg_api_test Tainted: GW
>  4.19.0-rc6-bb-next+ #1
> [  566.531527] Hardware name: IBM 3906 M03 704 (LPAR)
> [  566.531533] Call Trace:
> [  566.531544] ([<001167fa>] show_stack+0x8a/0xd8)
> [  566.531555]  [<00bcc6d2>] dump_stack+0x9a/0xd8
> [  566.531565]  [<00196410>] ___might_sleep+0x280/0x298
> [  566.531576]  [<003e528c>] __kmalloc+0xbc/0x560
> [  566.531584]  [<0083186a>] bsg_map_buffer+0x5a/0xb0
> [  566.531591]  [<00831948>] bsg_queue_rq+0x88/0x118
> [  566.531599]  [<0081ab56>] blk_mq_dispatch_rq_list+0x37e/0x670
> [  566.531607]  [<0082050e>] blk_mq_do_dispatch_sched+0x11e/0x130
> [  566.531615]  [<00820dfe>] 
> blk_mq_sched_dispatch_requests+0x156/0x1a0
> [  566.531622]  [<00817564>] __blk_mq_run_hw_queue+0x144/0x160
> [  566.531630]  [<00817614>] __blk_mq_delay_run_hw_queue+0x94/0x218
> [  566.531638]  [<008178b2>] blk_mq_run_hw_queue+0xda/0xf0
> [  566.531645]  [<008211d8>] blk_mq_sched_insert_request+0x1a8/0x1e8
> [  566.531653]  [<00811ee2>] blk_execute_rq_nowait+0x72/0x80
> [  566.531660]  [<00811f66>] blk_execute_rq+0x76/0xb8
> [  566.531778]  [<00830d0e>] bsg_ioctl+0x426/0x500
> [  566.531787]  [<00440cb4>] do_vfs_ioctl+0x68c/0x710
> [  566.531794]  [<00440dac>] ksys_ioctl+0x74/0xa0
> [  566.531801]  [<00440e0a>] sys_ioctl+0x32/0x40
> [  566.531808]  [<00bf1dd0>] system_call+0xd8/0x2d0
> [  566.531815] 1 lock held by bsg_api_test/3104:
> [  566.531821]  #0: cb4b58e8 (rcu_read_lock){}, at: 
> hctx_lock+0x30/0x118
> 
> And then it dies completely:
> 
> [  566.531854] Unable to handle kernel pointer dereference in virtual kernel 
> address space
> [  566.531861] Failing address:  TEID: 0483
> [  566.531867] Fault in home space mode while using kernel ASCE.
> [  566.531885] AS:013ec007 R3:effc8007 S:effce000 
> P:013d
> [  566.531927] Oops: 0004 ilc:3 [#1] PREEMPT SMP DEBUG_PAGEALLOC
> [  566.531938] Modules linked in: ...
> [  566.532042] CPU: 3 PID: 3104 Comm: bsg_api_test Tainted: GW
>  4.19.0-rc6-bb-next+ #1
> [  566.532047] Hardware name: IBM 3906 M03 704 (LPAR)
> [  566.532051] Krnl PSW

Re: [PATCH] bsg: convert to use blk-mq

2018-10-22 Thread Jens Axboe
On 10/19/18 9:57 AM, Benjamin Block wrote:
> On Fri, Oct 19, 2018 at 09:50:53AM -0600, Jens Axboe wrote:
>> On 10/19/18 9:01 AM, Benjamin Block wrote:
>>> On Wed, Oct 17, 2018 at 10:01:16AM -0600, Jens Axboe wrote:
>>>> On 10/17/18 9:55 AM, Benjamin Block wrote:
>>>>> On Tue, Oct 16, 2018 at 08:43:01AM -0600, Jens Axboe wrote:
>>>>>> Requires a few changes to the FC transport class as well.
>>>>>>
>>>>>> Cc: Johannes Thumshirn 
>>>>>> Cc: Benjamin Block 
>>>>>> Cc: linux-scsi@vger.kernel.org
>>>>>> Signed-off-by: Jens Axboe 
>>>>>> ---
>>>>>>  block/bsg-lib.c  | 102 +--
>>>>>>  drivers/scsi/scsi_transport_fc.c |  61 ++
>>>>>>  2 files changed, 91 insertions(+), 72 deletions(-)
>>>>>>
>>>>>
>>>>> Hey Jens,
>>>>>
>>>>> I haven't had time to look into this in any deep way - but I did plan to
>>>>> -, but just to see whether it starts and runs some I/O I tried giving it
>>>>> a spin and came up with nothing (see line 3 and 5):
>>>>
>>>> I'm an idiot, can you try this on top?
>>>>
>>>>
>>>> diff --git a/block/bsg-lib.c b/block/bsg-lib.c
>>>> index 1aa0ed3fc339..95e12b635225 100644
>>>> --- a/block/bsg-lib.c
>>>> +++ b/block/bsg-lib.c
>>>> @@ -311,7 +311,7 @@ struct request_queue *bsg_setup_queue(struct device 
>>>> *dev, const char *name,
>>>>int ret = -ENOMEM;
>>>>  
>>>>set = kzalloc(sizeof(*set), GFP_KERNEL);
>>>> -  if (set)
>>>> +  if (!set)
>>>>return ERR_PTR(-ENOMEM);
>>>>  
>>>>set->ops = &bsg_mq_ops;
>>>>
>>>
>>> Well, yes, that would be wrong. But it still doesn't fly (s390x stack
>>> trace):
>>>
>>> [   36.271953] scsi host0: scsi_eh_0: sleeping
>>> [   36.272571] scsi host0: zfcp
>>> [   36.298065] WARNING: CPU: 7 PID: 856 at block/blk-settings.c:71 
>>> blk_queue_rq_timed_out+0x44/0x60
>>> [   36.298315] Modules linked in: zfcp scsi_transport_fc dm_multipath 
>>> [   36.299015] CPU: 7 PID: 856 Comm: systemd-udevd Tainted: GW  
>>>4.19.0-rc8-bb-next+ #1
>>> [   36.299059] Hardware name: IBM 3906 M03 704 (LPAR)
>>> [   36.299101] Krnl PSW : 0704e0018000 00ced494 
>>> (blk_queue_rq_timed_out+0x44/0x60)
>>> [   36.299192]R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 
>>> PM:0 RI:0 EA:3
>>> [   36.299259] Krnl GPRS:  015cee60 
>>> a0ce0180 0300
>>> [   36.299304]0300 00ced486 
>>> a5738000 03ff8069eba0
>>> [   36.299349]a11ec6a8 a0ce 
>>> a11ec400 03ff805ee438
>>> [   36.299394]a0ce 03ff805f6b00 
>>> 00ced486 a28af0b0
>>> [   36.299460] Krnl Code: 00ced486: e310c182ltg 
>>> %r1,384(%r12)
>>>   00ced48c: a7840004brc 
>>> 8,ced494
>>>  #00ced490: a7f40001brc 
>>> 15,ced492
>>>  >00ced494: 4120c150la  
>>> %r2,336(%r12)
>>>   00ced498: c0e5ffc76290brasl   
>>> %r14,5d99b8
>>>   00ced49e: e3b0c1500024stg 
>>> %r11,336(%r12)
>>>   00ced4a4: ebbff0a4lmg 
>>> %r11,%r15,160(%r15)
>>>   00ced4aa: 07febcr 
>>> 15,%r14
>>> [   36.299879] Call Trace:
>>> [   36.299922] ([<a11ec6a8>] 0xa11ec6a8)
>>> [   36.299979]  [<03ff805ee3fa>] fc_host_setup+0x622/0x660 
>>> [scsi_transport_fc] 
>>> [   36.300029]  [<00f0baca>] transport_setup_classdev+0x62/0x70 
>>> [   36.300075]  [<00f0b592>] 
>>> attribute_container_add_device+0x182/0x1d0 
>>> [   36.300163]  [<03ff80503cae>] scsi_sysfs_add_host+0x5e/0x100 
>>> [scsi_mod] 
>>> [   36.300245]  [<03ff804e6d7c>] scsi_add_host_with_dma+0x2dc/0x468

Re: [PATCH] bsg: convert to use blk-mq

2018-10-19 Thread Jens Axboe
On 10/19/18 9:01 AM, Benjamin Block wrote:
> On Wed, Oct 17, 2018 at 10:01:16AM -0600, Jens Axboe wrote:
>> On 10/17/18 9:55 AM, Benjamin Block wrote:
>>> On Tue, Oct 16, 2018 at 08:43:01AM -0600, Jens Axboe wrote:
>>>> Requires a few changes to the FC transport class as well.
>>>>
>>>> Cc: Johannes Thumshirn 
>>>> Cc: Benjamin Block 
>>>> Cc: linux-scsi@vger.kernel.org
>>>> Signed-off-by: Jens Axboe 
>>>> ---
>>>>  block/bsg-lib.c  | 102 +--
>>>>  drivers/scsi/scsi_transport_fc.c |  61 ++
>>>>  2 files changed, 91 insertions(+), 72 deletions(-)
>>>>
>>>
>>> Hey Jens,
>>>
>>> I haven't had time to look into this in any deep way yet - though I did
>>> plan to - but just to see whether it starts and runs some I/O, I tried
>>> giving it a spin and came up with nothing (see lines 3 and 5):
>>
>> I'm an idiot, can you try this on top?
>>
>>
>> diff --git a/block/bsg-lib.c b/block/bsg-lib.c
>> index 1aa0ed3fc339..95e12b635225 100644
>> --- a/block/bsg-lib.c
>> +++ b/block/bsg-lib.c
>> @@ -311,7 +311,7 @@ struct request_queue *bsg_setup_queue(struct device 
>> *dev, const char *name,
>>  int ret = -ENOMEM;
>>  
>>  set = kzalloc(sizeof(*set), GFP_KERNEL);
>> -if (set)
>> +if (!set)
>>  return ERR_PTR(-ENOMEM);
>>  
>>  set->ops = &bsg_mq_ops;
>>
> 
> Well, yes, that would be wrong. But it still doesn't fly (s390x stack
> trace):
> 
> [   36.271953] scsi host0: scsi_eh_0: sleeping
> [   36.272571] scsi host0: zfcp
> [   36.298065] WARNING: CPU: 7 PID: 856 at block/blk-settings.c:71 
> blk_queue_rq_timed_out+0x44/0x60
> [   36.298315] Modules linked in: zfcp scsi_transport_fc dm_multipath 
> [   36.299015] CPU: 7 PID: 856 Comm: systemd-udevd Tainted: GW
>  4.19.0-rc8-bb-next+ #1
> [   36.299059] Hardware name: IBM 3906 M03 704 (LPAR)
> [   36.299101] Krnl PSW : 0704e0018000 00ced494 
> (blk_queue_rq_timed_out+0x44/0x60)
> [   36.299192]R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 
> RI:0 EA:3
> [   36.299259] Krnl GPRS:  015cee60 a0ce0180 
> 0300
> [   36.299304]0300 00ced486 a5738000 
> 03ff8069eba0
> [   36.299349]a11ec6a8 a0ce a11ec400 
> 03ff805ee438
> [   36.299394]a0ce 03ff805f6b00 00ced486 
> a28af0b0
> [   36.299460] Krnl Code: 00ced486: e310c182ltg 
> %r1,384(%r12)
>   00ced48c: a7840004brc 
> 8,ced494
>  #00ced490: a7f40001brc 
> 15,ced492
>  >00ced494: 4120c150la  
> %r2,336(%r12)
>   00ced498: c0e5ffc76290brasl   
> %r14,5d99b8
>   00ced49e: e3b0c1500024stg 
> %r11,336(%r12)
>   00ced4a4: ebbff0a4lmg 
> %r11,%r15,160(%r15)
>   00ced4aa: 07febcr 
> 15,%r14
> [   36.299879] Call Trace:
> [   36.299922] ([<a11ec6a8>] 0xa11ec6a8)
> [   36.299979]  [<03ff805ee3fa>] fc_host_setup+0x622/0x660 
> [scsi_transport_fc] 
> [   36.300029]  [<00f0baca>] transport_setup_classdev+0x62/0x70 
> [   36.300075]  [<00f0b592>] 
> attribute_container_add_device+0x182/0x1d0 
> [   36.300163]  [<03ff80503cae>] scsi_sysfs_add_host+0x5e/0x100 
> [scsi_mod] 
> [   36.300245]  [<03ff804e6d7c>] scsi_add_host_with_dma+0x2dc/0x468 
> [scsi_mod] 
> [   36.300310]  [<03ff806835f2>] zfcp_scsi_adapter_register+0x222/0x260 
> [zfcp] 
> [   36.300373]  [<03ff8066a49a>] zfcp_adapter_enqueue+0xae2/0xb20 [zfcp] 
> [   36.300435]  [<03ff8066b436>] zfcp_ccw_set_online+0xb6/0x208 [zfcp] 
> [   36.300482]  [<00f8ad14>] ccw_device_set_online+0x41c/0x878 
> [   36.300527]  [<00f8b374>] 
> online_store_recog_and_online+0x204/0x230 
> [   36.300572]  [<00f8de20>] online_store+0x290/0x3e8 
> [   36.300619]  [<007842c0>] kernfs_fop_write+0x1b0/0x288 
> [   36.300663]  [<0064217e>] __vfs_write+0xee/0x398 
> [   36.300705]  [<006426b4>] vfs_write+0xec/0x238 
> [   36.300754]  [<00642abe&

Re: bioset changes in 4.18 broke aha1542

2018-10-18 Thread Jens Axboe
On 10/18/18 2:58 PM, Ondrej Zary wrote:
> On Thursday 18 October 2018 22:28:35 Jens Axboe wrote:
>> On 10/18/18 2:22 PM, Ondrej Zary wrote:
>>> On Thursday 18 October 2018 22:10:31 Jens Axboe wrote:
>>>> On 10/18/18 2:04 PM, Ondrej Zary wrote:
>>>>> On Thursday 18 October 2018 21:59:09 Jens Axboe wrote:
>>>>>> On 10/18/18 1:55 PM, Ondrej Zary wrote:
>>>>>>> On Thursday 18 October 2018 20:58:57 Jens Axboe wrote:
>>>>>>>> On 10/18/18 12:34 PM, Ondrej Zary wrote:
>>>>>>>>> Hello,
>>>>>>>>> aha1542 works fine in 4.17 but crashes in 4.18. It's hard to bisect 
>>>>>>>>> because
>>>>>>>>> of many commits that don't compile.
>>>>>>>>> # only skipped commits left to test
>>>>>>>>> # possible first bad commit: 
>>>>>>>>> [52190f8abe7f2bf2b4e5f9760cbcc1427ca2136b] fs: convert block_dev.c to 
>>>>>>>>> bioset_init()
>>>>>>>>> # possible first bad commit: 
>>>>>>>>> [a47a28b74a5c7c27bf621276b85ad6c124651236] target: convert to 
>>>>>>>>> bioset_init()/mempool_init()
>>>>>>>>> # possible first bad commit: 
>>>>>>>>> [6f1c819c219f7841079f0f43ab62727a55b0d849] dm: convert to 
>>>>>>>>> bioset_init()/mempool_init()
>>>>>>>>> # possible first bad commit: 
>>>>>>>>> [afeee514ce7f4cab605beedd03be71ebaf0c5fc8] md: convert to 
>>>>>>>>> bioset_init()/mempool_init()
>>>>>>>>> # possible first bad commit: 
>>>>>>>>> [d19936a26658a7a53edd5619d631ee2c2c3151a2] bcache: convert to 
>>>>>>>>> bioset_init()/mempool_init()
>>>>>>>>> # possible first bad commit: 
>>>>>>>>> [b906bbb6997785d9ea0bd3f5585537afa6257c43] lightnvm: convert to 
>>>>>>>>> bioset_init()/mempool_init()
>>>>>>>>>
>>>>>>>>> Testing manually, a47a28b74a5c7c27bf621276b85ad6c124651236 works.
>>>>>>>>> 52190f8abe7f2bf2b4e5f9760cbcc1427ca2136b does not compile
>>>>>>>>> 8ac9f7c1fd1d342e82ddf078425423b050652ba0 does not compile
>>>>>>>>> e292d7bc63c8f2adb3dfda27910e805f1b6557f9 does not compile
>>>>>>>>> dad08527525f9a8ac9c7f278864c65f94bc5e9b3 does not compile
>>>>>>>>> 943cf9f3ca16133dbd00f9a4cbfea46512fcb0e8 works
>>>>>>>>> ..
>>>>>>>>> fedc3abe7bd2dcc4c80bcf3cff8708a3908d8219 works
>>>>>>>>> 04c4950d5b373ba712d928592e05e73510785bca crashes
>>>>>>>>
>>>>>>>> It looks like the ISA bioset pool isn't being initialized. You should
>>>>>>>> have messages like this in your dmesg:
>>>>>>>>
>>>>>>>> isa pool size: 16 pages
>>>>>>>>
>>>>>>>> (which you do), but also something on the bioset section. Do you have
>>>>>>>> this one:
>>>>>>>>
>>>>>>>> pool size: 64 pages
>>>>>>>>
>>>>>>>> as well?
>>>>>>>>
>>>>>>>
>>>>>>> No, it's not there.
>>>>>>
>>>>>> Can you attach your .config? I'm guessing CONFIG_HIGHMEM* isn't set.
>>>>>>
>>>>>
>>>>> It is.
>>>>
>>>> Puzzled... Does this work?
>>>>
>>>>
>>>> diff --git a/block/bounce.c b/block/bounce.c
>>>> index b30071ac4ec6..49564a1bfd22 100644
>>>> --- a/block/bounce.c
>>>> +++ b/block/bounce.c
>>>> @@ -35,10 +35,6 @@ static mempool_t page_pool, isa_page_pool;
>>>>  static __init int init_emergency_pool(void)
>>>>  {
>>>>int ret;
>>>> -#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
>>>> -  if (max_pfn <= max_low_pfn)
>>>> -  return 0;
>>>> -#endif
>>>>  
>>>>ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
>>>>BUG_ON(ret);
>>>>
>>>
>>> Yes, it does! 
>>> bounce: pool size: 64

Re: bioset changes in 4.18 broke aha1542

2018-10-18 Thread Jens Axboe
On 10/18/18 2:22 PM, Ondrej Zary wrote:
> On Thursday 18 October 2018 22:10:31 Jens Axboe wrote:
>> On 10/18/18 2:04 PM, Ondrej Zary wrote:
>>> On Thursday 18 October 2018 21:59:09 Jens Axboe wrote:
>>>> On 10/18/18 1:55 PM, Ondrej Zary wrote:
>>>>> On Thursday 18 October 2018 20:58:57 Jens Axboe wrote:
>>>>>> On 10/18/18 12:34 PM, Ondrej Zary wrote:
>>>>>>> Hello,
>>>>>>> aha1542 works fine in 4.17 but crashes in 4.18. It's hard to bisect 
>>>>>>> because
>>>>>>> of many commits that don't compile.
>>>>>>> # only skipped commits left to test
>>>>>>> # possible first bad commit: [52190f8abe7f2bf2b4e5f9760cbcc1427ca2136b] 
>>>>>>> fs: convert block_dev.c to bioset_init()
>>>>>>> # possible first bad commit: [a47a28b74a5c7c27bf621276b85ad6c124651236] 
>>>>>>> target: convert to bioset_init()/mempool_init()
>>>>>>> # possible first bad commit: [6f1c819c219f7841079f0f43ab62727a55b0d849] 
>>>>>>> dm: convert to bioset_init()/mempool_init()
>>>>>>> # possible first bad commit: [afeee514ce7f4cab605beedd03be71ebaf0c5fc8] 
>>>>>>> md: convert to bioset_init()/mempool_init()
>>>>>>> # possible first bad commit: [d19936a26658a7a53edd5619d631ee2c2c3151a2] 
>>>>>>> bcache: convert to bioset_init()/mempool_init()
>>>>>>> # possible first bad commit: [b906bbb6997785d9ea0bd3f5585537afa6257c43] 
>>>>>>> lightnvm: convert to bioset_init()/mempool_init()
>>>>>>>
>>>>>>> Testing manually, a47a28b74a5c7c27bf621276b85ad6c124651236 works.
>>>>>>> 52190f8abe7f2bf2b4e5f9760cbcc1427ca2136b does not compile
>>>>>>> 8ac9f7c1fd1d342e82ddf078425423b050652ba0 does not compile
>>>>>>> e292d7bc63c8f2adb3dfda27910e805f1b6557f9 does not compile
>>>>>>> dad08527525f9a8ac9c7f278864c65f94bc5e9b3 does not compile
>>>>>>> 943cf9f3ca16133dbd00f9a4cbfea46512fcb0e8 works
>>>>>>> ..
>>>>>>> fedc3abe7bd2dcc4c80bcf3cff8708a3908d8219 works
>>>>>>> 04c4950d5b373ba712d928592e05e73510785bca crashes
>>>>>>
>>>>>> It looks like the ISA bioset pool isn't being initialized. You should
>>>>>> have messages like this in your dmesg:
>>>>>>
>>>>>> isa pool size: 16 pages
>>>>>>
>>>>>> (which you do), but also something on the bioset section. Do you have
>>>>>> this one:
>>>>>>
>>>>>> pool size: 64 pages
>>>>>>
>>>>>> as well?
>>>>>>
>>>>>
>>>>> No, it's not there.
>>>>
>>>> Can you attach your .config? I'm guessing CONFIG_HIGHMEM* isn't set.
>>>>
>>>
>>> It is.
>>
>> Puzzled... Does this work?
>>
>>
>> diff --git a/block/bounce.c b/block/bounce.c
>> index b30071ac4ec6..49564a1bfd22 100644
>> --- a/block/bounce.c
>> +++ b/block/bounce.c
>> @@ -35,10 +35,6 @@ static mempool_t page_pool, isa_page_pool;
>>  static __init int init_emergency_pool(void)
>>  {
>>  int ret;
>> -#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
>> -if (max_pfn <= max_low_pfn)
>> -return 0;
>> -#endif
>>  
>>  ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
>>  BUG_ON(ret);
>>
> 
> Yes, it does! 
> bounce: pool size: 64 pages
> and aha1542 works.
> 
> Also added printks for pfn:
> max_pfn=65520, max_low_pfn=65520

This should be a better fix, though I'm still puzzled why we need
it now. Can you test this one?


diff --git a/block/bounce.c b/block/bounce.c
index b30071ac4ec6..1356a2f4aae2 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -31,6 +31,24 @@
 static struct bio_set bounce_bio_set, bounce_bio_split;
 static mempool_t page_pool, isa_page_pool;
 
+static __init void init_bounce_bioset(void)
+{
+   static bool bounce_bs_setup;
+   int ret;
+
+   if (bounce_bs_setup)
+   return;
+
+   ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+   BUG_ON(ret);
+   if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
+   BUG_ON(1);
+
+   ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
+   BUG_ON(ret);
+   bounce_bs_setup = true;
+}
+
 #if defined(CONFIG_HIGHMEM)
 static __init int init_emergency_pool(void)
 {
@@ -44,14 +62,7 @@ static __init int init_emergency_pool(void)
BUG_ON(ret);
pr_info("pool size: %d pages\n", POOL_SIZE);
 
-   ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-   BUG_ON(ret);
-   if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
-   BUG_ON(1);
-
-   ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
-   BUG_ON(ret);
-
+   init_bounce_bioset();
return 0;
 }
 
@@ -102,6 +113,7 @@ int init_emergency_isa_pool(void)
BUG_ON(ret);
 
pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
+   init_bounce_bioset();
return 0;
 }
 

-- 
Jens Axboe



Re: bioset changes in 4.18 broke aha1542

2018-10-18 Thread Jens Axboe
On 10/18/18 2:04 PM, Ondrej Zary wrote:
> On Thursday 18 October 2018 21:59:09 Jens Axboe wrote:
>> On 10/18/18 1:55 PM, Ondrej Zary wrote:
>>> On Thursday 18 October 2018 20:58:57 Jens Axboe wrote:
>>>> On 10/18/18 12:34 PM, Ondrej Zary wrote:
>>>>> Hello,
>>>>> aha1542 works fine in 4.17 but crashes in 4.18. It's hard to bisect 
>>>>> because
>>>>> of many commits that don't compile.
>>>>> # only skipped commits left to test
>>>>> # possible first bad commit: [52190f8abe7f2bf2b4e5f9760cbcc1427ca2136b] 
>>>>> fs: convert block_dev.c to bioset_init()
>>>>> # possible first bad commit: [a47a28b74a5c7c27bf621276b85ad6c124651236] 
>>>>> target: convert to bioset_init()/mempool_init()
>>>>> # possible first bad commit: [6f1c819c219f7841079f0f43ab62727a55b0d849] 
>>>>> dm: convert to bioset_init()/mempool_init()
>>>>> # possible first bad commit: [afeee514ce7f4cab605beedd03be71ebaf0c5fc8] 
>>>>> md: convert to bioset_init()/mempool_init()
>>>>> # possible first bad commit: [d19936a26658a7a53edd5619d631ee2c2c3151a2] 
>>>>> bcache: convert to bioset_init()/mempool_init()
>>>>> # possible first bad commit: [b906bbb6997785d9ea0bd3f5585537afa6257c43] 
>>>>> lightnvm: convert to bioset_init()/mempool_init()
>>>>>
>>>>> Testing manually, a47a28b74a5c7c27bf621276b85ad6c124651236 works.
>>>>> 52190f8abe7f2bf2b4e5f9760cbcc1427ca2136b does not compile
>>>>> 8ac9f7c1fd1d342e82ddf078425423b050652ba0 does not compile
>>>>> e292d7bc63c8f2adb3dfda27910e805f1b6557f9 does not compile
>>>>> dad08527525f9a8ac9c7f278864c65f94bc5e9b3 does not compile
>>>>> 943cf9f3ca16133dbd00f9a4cbfea46512fcb0e8 works
>>>>> ..
>>>>> fedc3abe7bd2dcc4c80bcf3cff8708a3908d8219 works
>>>>> 04c4950d5b373ba712d928592e05e73510785bca crashes
>>>>
>>>> It looks like the ISA bioset pool isn't being initialized. You should
>>>> have messages like this in your dmesg:
>>>>
>>>> isa pool size: 16 pages
>>>>
>>>> (which you do), but also something on the bioset section. Do you have
>>>> this one:
>>>>
>>>> pool size: 64 pages
>>>>
>>>> as well?
>>>>
>>>
>>> No, it's not there.
>>
>> Can you attach your .config? I'm guessing CONFIG_HIGHMEM* isn't set.
>>
> 
> It is.

Puzzled... Does this work?


diff --git a/block/bounce.c b/block/bounce.c
index b30071ac4ec6..49564a1bfd22 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -35,10 +35,6 @@ static mempool_t page_pool, isa_page_pool;
 static __init int init_emergency_pool(void)
 {
int ret;
-#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
-   if (max_pfn <= max_low_pfn)
-   return 0;
-#endif
 
ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
BUG_ON(ret);

-- 
Jens Axboe



Re: bioset changes in 4.18 broke aha1542

2018-10-18 Thread Jens Axboe
On 10/18/18 1:55 PM, Ondrej Zary wrote:
> On Thursday 18 October 2018 20:58:57 Jens Axboe wrote:
>> On 10/18/18 12:34 PM, Ondrej Zary wrote:
>>> Hello,
>>> aha1542 works fine in 4.17 but crashes in 4.18. It's hard to bisect because
>>> of many commits that don't compile.
>>> # only skipped commits left to test
>>> # possible first bad commit: [52190f8abe7f2bf2b4e5f9760cbcc1427ca2136b] fs: 
>>> convert block_dev.c to bioset_init()
>>> # possible first bad commit: [a47a28b74a5c7c27bf621276b85ad6c124651236] 
>>> target: convert to bioset_init()/mempool_init()
>>> # possible first bad commit: [6f1c819c219f7841079f0f43ab62727a55b0d849] dm: 
>>> convert to bioset_init()/mempool_init()
>>> # possible first bad commit: [afeee514ce7f4cab605beedd03be71ebaf0c5fc8] md: 
>>> convert to bioset_init()/mempool_init()
>>> # possible first bad commit: [d19936a26658a7a53edd5619d631ee2c2c3151a2] 
>>> bcache: convert to bioset_init()/mempool_init()
>>> # possible first bad commit: [b906bbb6997785d9ea0bd3f5585537afa6257c43] 
>>> lightnvm: convert to bioset_init()/mempool_init()
>>>
>>> Testing manually, a47a28b74a5c7c27bf621276b85ad6c124651236 works.
>>> 52190f8abe7f2bf2b4e5f9760cbcc1427ca2136b does not compile
>>> 8ac9f7c1fd1d342e82ddf078425423b050652ba0 does not compile
>>> e292d7bc63c8f2adb3dfda27910e805f1b6557f9 does not compile
>>> dad08527525f9a8ac9c7f278864c65f94bc5e9b3 does not compile
>>> 943cf9f3ca16133dbd00f9a4cbfea46512fcb0e8 works
>>> ..
>>> fedc3abe7bd2dcc4c80bcf3cff8708a3908d8219 works
>>> 04c4950d5b373ba712d928592e05e73510785bca crashes
>>
>> It looks like the ISA bioset pool isn't being initialized. You should
>> have messages like this in your dmesg:
>>
>> isa pool size: 16 pages
>>
>> (which you do), but also something on the bioset section. Do you have
>> this one:
>>
>> pool size: 64 pages
>>
>> as well?
>>
> 
> No, it's not there.

Can you attach your .config? I'm guessing CONFIG_HIGHMEM* isn't set.

-- 
Jens Axboe



Re: bioset changes in 4.18 broke aha1542

2018-10-18 Thread Jens Axboe
On 10/18/18 12:34 PM, Ondrej Zary wrote:
> Hello,
> aha1542 works fine in 4.17 but crashes in 4.18. It's hard to bisect because
> of many commits that don't compile.
> # only skipped commits left to test
> # possible first bad commit: [52190f8abe7f2bf2b4e5f9760cbcc1427ca2136b] fs: 
> convert block_dev.c to bioset_init()
> # possible first bad commit: [a47a28b74a5c7c27bf621276b85ad6c124651236] 
> target: convert to bioset_init()/mempool_init()
> # possible first bad commit: [6f1c819c219f7841079f0f43ab62727a55b0d849] dm: 
> convert to bioset_init()/mempool_init()
> # possible first bad commit: [afeee514ce7f4cab605beedd03be71ebaf0c5fc8] md: 
> convert to bioset_init()/mempool_init()
> # possible first bad commit: [d19936a26658a7a53edd5619d631ee2c2c3151a2] 
> bcache: convert to bioset_init()/mempool_init()
> # possible first bad commit: [b906bbb6997785d9ea0bd3f5585537afa6257c43] 
> lightnvm: convert to bioset_init()/mempool_init()
> 
> Testing manually, a47a28b74a5c7c27bf621276b85ad6c124651236 works.
> 52190f8abe7f2bf2b4e5f9760cbcc1427ca2136b does not compile
> 8ac9f7c1fd1d342e82ddf078425423b050652ba0 does not compile
> e292d7bc63c8f2adb3dfda27910e805f1b6557f9 does not compile
> dad08527525f9a8ac9c7f278864c65f94bc5e9b3 does not compile
> 943cf9f3ca16133dbd00f9a4cbfea46512fcb0e8 works
> ..
> fedc3abe7bd2dcc4c80bcf3cff8708a3908d8219 works
> 04c4950d5b373ba712d928592e05e73510785bca crashes

It looks like the ISA bioset pool isn't being initialized. You should
have messages like this in your dmesg:

isa pool size: 16 pages

(which you do), but also something on the bioset section. Do you have
this one:

pool size: 64 pages

as well?

-- 
Jens Axboe



Re: [PATCH] bsg: convert to use blk-mq

2018-10-17 Thread Jens Axboe
On 10/17/18 12:08 PM, Douglas Gilbert wrote:
> On 2018-10-17 11:55 a.m., Benjamin Block wrote:
>> On Tue, Oct 16, 2018 at 08:43:01AM -0600, Jens Axboe wrote:
>>> Requires a few changes to the FC transport class as well.
>>>
>>> Cc: Johannes Thumshirn 
>>> Cc: Benjamin Block 
>>> Cc: linux-scsi@vger.kernel.org
>>> Signed-off-by: Jens Axboe 
>>> ---
>>>   block/bsg-lib.c  | 102 +--
>>>   drivers/scsi/scsi_transport_fc.c |  61 ++
>>>   2 files changed, 91 insertions(+), 72 deletions(-)
>>>
>>
>> Hey Jens,
>>
>> I haven't had time to look into this in any deep way yet - though I did
>> plan to - but just to see whether it starts and runs some I/O, I tried
>> giving it a spin and came up with nothing (see lines 3 and 5):
>>
>> [   14.082079] scsi host0: scsi_eh_0: sleeping
>> [   14.082288] scsi host0: zfcp
>> [   14.086823] scsi host0: fc_host0: bsg interface failed to initialize - 
>> setup queue
>> [   14.089198] qdio: 0.0.1900 ZFCP on SC 107 using AI:1 QEBSM:0 PRI:1 TDD:1 
>> SIGA: W AP
>> [   15.228583]  rport-0:0-0: failed to setup bsg queue
>> [   15.229886] scsi 0:0:0:0: scsi scan: INQUIRY pass 1 length 36
>> [   15.230801] scsi 0:0:0:0: scsi scan: INQUIRY successful with code 0x0
>> [   15.230860] scsi 0:0:0:0: scsi scan: INQUIRY pass 2 length 164
>> [   15.231171] scsi 0:0:0:0: scsi scan: INQUIRY successful with code 0x0
>> [   15.231190] scsi 0:0:0:0: scsi scan: peripheral device type of 31, no 
>> device added
>> [   15.233171] scsi 0:0:0:0: scsi scan: Sending REPORT LUNS to (try 0)
>> [   15.234144] scsi 0:0:0:0: scsi scan: REPORT LUNS successful (try 0) 
>> result 0x0
>> [   15.234156] scsi 0:0:0:0: scsi scan: REPORT LUN scan
>> [   15.235040] scsi 0:0:0:1082671104: scsi scan: INQUIRY pass 1 length 36
>> [   15.235220] scsi host1: scsi_eh_1: sleeping
>> [   15.235336] scsi 0:0:0:1082671104: scsi scan: INQUIRY successful with 
>> code 0x0
>> [   15.235355] scsi 0:0:0:1082671104: scsi scan: INQUIRY pass 2 length 164
>> [   15.235434] scsi host1: zfcp
>> [   15.235667] scsi 0:0:0:1082671104: scsi scan: INQUIRY successful with 
>> code 0x0
>> [   15.235709] scsi 0:0:0:1082671104: Direct-Access IBM  2107900 
>>  3230 PQ: 0 ANSI: 5
>> [   15.238468] scsi host1: fc_host1: bsg interface failed to initialize - 
>> setup queue
>> 
>>
>> Seems to already fail at setting up the bsg interfaces for zFCP - at
>> loading time of the module. This is just your patch on top of
>> 4.19.0-rc8.
> 
> Hi,
> Almost all of the utilities in the sg3_utils package (a few exceptions:
> sg_reset, sgm_dd, sgp_dd), when given a bsg file node (e.g.
> 'sg_inq /dev/bsg/0:0:0:1'), will use the bsg sg_v4 interface (i.e. as
> defined in ). Dito with the sdparm and ddpt packages.
> So there is a lot of test code for any changes to the bsg driver.

That's good info, I did in fact run test tools on the bsg nodes and
verified that it worked fine. But I could not test the embedded queues
like Benjamin is doing above - and those were broken due to a silly
wrong pointer check. So hopefully with that fixed, it'll work just
fine.

-- 
Jens Axboe



Re: [PATCH] bsg: convert to use blk-mq

2018-10-17 Thread Jens Axboe
On 10/17/18 9:55 AM, Benjamin Block wrote:
> On Tue, Oct 16, 2018 at 08:43:01AM -0600, Jens Axboe wrote:
>> Requires a few changes to the FC transport class as well.
>>
>> Cc: Johannes Thumshirn 
>> Cc: Benjamin Block 
>> Cc: linux-scsi@vger.kernel.org
>> Signed-off-by: Jens Axboe 
>> ---
>>  block/bsg-lib.c  | 102 +--
>>  drivers/scsi/scsi_transport_fc.c |  61 ++
>>  2 files changed, 91 insertions(+), 72 deletions(-)
>>
> 
> Hey Jens,
> 
> I haven't had time to look into this in any deep way yet - though I did
> plan to - but just to see whether it starts and runs some I/O, I tried
> giving it a spin and came up with nothing (see lines 3 and 5):

I'm an idiot, can you try this on top?


diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 1aa0ed3fc339..95e12b635225 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -311,7 +311,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, 
const char *name,
int ret = -ENOMEM;
 
set = kzalloc(sizeof(*set), GFP_KERNEL);
-   if (set)
+   if (!set)
return ERR_PTR(-ENOMEM);
 
set->ops = &bsg_mq_ops;

-- 
Jens Axboe



Re: [PATCHv6 0/3] Deprecate DAC960 driver

2018-10-17 Thread Jens Axboe
On 10/17/18 9:25 AM, Hannes Reinecke wrote:
> Hi all,
> 
> as we're trying to get rid of the remaining request_fn drivers here's
> a patchset to move the DAC960 driver to the SCSI stack.
> As per request from hch I've split up the driver into two new SCSI
> drivers called 'myrb' and 'myrs'.
> 
> The 'myrb' driver only supports the earlier (V1) firmware interface, which
> doesn't have a SCSI interface for the logical drives; for those I've added
> a (pretty rudimentary, admittedly) SCSI translation for them.
> 
> The 'myrs' driver supports the newer (V2) firmware interface, which is
> SCSI based and doesn't need the translation layer.
> 
> And the weird proc interface from DAC960 has been converted to sysfs 
> attributes.
> 
> Tested with eXtremeRAID 1100 (for V1 Firmware) and Mylex AcceleRAID 170
> (for V2 Firmware).

Applied 3/3 in for-4.20/block

-- 
Jens Axboe



Re: [PATCHv5 0/3] Deprecate DAC960 driver

2018-10-17 Thread Jens Axboe
On 10/12/18 1:15 AM, Hannes Reinecke wrote:
> Hi all,
> 
> as we're trying to get rid of the remaining request_fn drivers here's
> a patchset to move the DAC960 driver to the SCSI stack.
> As per request from hch I've split up the driver into two new SCSI
> drivers called 'myrb' and 'myrs'.
> 
> The 'myrb' driver only supports the earlier (V1) firmware interface, which
> doesn't have a SCSI interface for the logical drives; for those I've added
> a (pretty rudimentary, admittedly) SCSI translation for them.
> 
> The 'myrs' driver supports the newer (V2) firmware interface, which is
> SCSI based and doesn't need the translation layer.
> 
> And the weird proc interface from DAC960 has been converted to sysfs 
> attributes.
> 
> Tested with eXtremeRAID 1100 (for V1 Firmware) and Mylex AcceleRAID 170
> (for V2 Firmware).

Martin, are you taking the two new drivers? Then I'll queue up the
old driver removal.

-- 
Jens Axboe



Re: [PATCH] IB/srp: remove old request_fn_active check

2018-10-16 Thread Jens Axboe
On 10/16/18 9:09 AM, Martin K. Petersen wrote:
> 
> Jens,
> 
>>> How about adding Hannes' Signed-off-by? See also
>>> https://www.spinics.net/lists/linux-scsi/msg123488.html.
>>
>> I had no idea this existed, I'll just replace with that one instead
>> and your reviewed-by (and mine).
> 
>> Martin, is this queued up?
> 
> Nope, I was waiting for Hannes to address the feedback from Bart and
> Johannes.

Hannes, please get this resent with Johannes comment addressed. Bart's
comment should no longer be relevant.

You can add his and mine reviewed-by, as per previous email.

-- 
Jens Axboe



Re: [PATCH] IB/srp: remove old request_fn_active check

2018-10-16 Thread Jens Axboe
On 10/16/18 8:55 AM, Bart Van Assche wrote:
> On Tue, 2018-10-16 at 08:31 -0600, Jens Axboe wrote:
>> This check is only viable for non scsi-mq. Since that is going away,
>> kill this legacy check.
>>
>> Cc: Bart Van Assche 
>> Cc: Parav Pandit 
>> Cc: linux-scsi@vger.kernel.org
>> Signed-off-by: Jens Axboe 
>> ---
>>  drivers/infiniband/ulp/srp/ib_srp.c | 7 ---
>>  1 file changed, 7 deletions(-)
>>
>> diff --git a/drivers/infiniband/ulp/srp/ib_srp.c 
>> b/drivers/infiniband/ulp/srp/ib_srp.c
>> index 0b34e909505f..5a79444c2f3c 100644
>> --- a/drivers/infiniband/ulp/srp/ib_srp.c
>> +++ b/drivers/infiniband/ulp/srp/ib_srp.c
>> @@ -1334,13 +1334,6 @@ static void srp_terminate_io(struct srp_rport *rport)
>>  struct scsi_device *sdev;
>>  int i, j;
>>  
>> -/*
>> - * Invoking srp_terminate_io() while srp_queuecommand() is running
>> - * is not safe. Hence the warning statement below.
>> - */
>> -shost_for_each_device(sdev, shost)
>> -WARN_ON_ONCE(sdev->request_queue->request_fn_active);
>> -
>>  for (i = 0; i < target->ch_count; i++) {
>>  ch = &target->ch[i];
> 
> How about adding Hannes' Signed-off-by? See also
> https://www.spinics.net/lists/linux-scsi/msg123488.html.

I had no idea this existed, I'll just replace with that one instead
and your reviewed-by (and mine).

Martin, is this queued up?

-- 
Jens Axboe



[PATCH] bsg: convert to use blk-mq

2018-10-16 Thread Jens Axboe
Requires a few changes to the FC transport class as well.

Cc: Johannes Thumshirn 
Cc: Benjamin Block 
Cc: linux-scsi@vger.kernel.org
Signed-off-by: Jens Axboe 
---
 block/bsg-lib.c  | 102 +--
 drivers/scsi/scsi_transport_fc.c |  61 ++
 2 files changed, 91 insertions(+), 72 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index f3501cdaf1a6..1aa0ed3fc339 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -21,7 +21,7 @@
  *
  */
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -129,7 +129,7 @@ static void bsg_teardown_job(struct kref *kref)
kfree(job->request_payload.sg_list);
kfree(job->reply_payload.sg_list);
 
-   blk_end_request_all(rq, BLK_STS_OK);
+   blk_mq_end_request(rq, BLK_STS_OK);
 }
 
 void bsg_job_put(struct bsg_job *job)
@@ -162,10 +162,10 @@ void bsg_job_done(struct bsg_job *job, int result,
 EXPORT_SYMBOL_GPL(bsg_job_done);
 
 /**
- * bsg_softirq_done - softirq done routine for destroying the bsg requests
+ * bsg_complete - softirq done routine for destroying the bsg requests
  * @rq: BSG request that holds the job to be destroyed
  */
-static void bsg_softirq_done(struct request *rq)
+static void bsg_complete(struct request *rq)
 {
struct bsg_job *job = blk_mq_rq_to_pdu(rq);
 
@@ -224,54 +224,46 @@ static bool bsg_prepare_job(struct device *dev, struct 
request *req)
 }
 
 /**
- * bsg_request_fn - generic handler for bsg requests
- * @q: request queue to manage
+ * bsg_queue_rq - generic handler for bsg requests
+ * @hctx: hardware queue
+ * @bd: queue data
  *
  * On error the create_bsg_job function should return a -Exyz error value
  * that will be set to ->result.
  *
  * Drivers/subsys should pass this to the queue init function.
  */
-static void bsg_request_fn(struct request_queue *q)
-   __releases(q->queue_lock)
-   __acquires(q->queue_lock)
+static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx,
+const struct blk_mq_queue_data *bd)
 {
+   struct request_queue *q = hctx->queue;
struct device *dev = q->queuedata;
-   struct request *req;
+   struct request *req = bd->rq;
int ret;
 
+   blk_mq_start_request(req);
+
if (!get_device(dev))
-   return;
-
-   while (1) {
-   req = blk_fetch_request(q);
-   if (!req)
-   break;
-   spin_unlock_irq(q->queue_lock);
-
-   if (!bsg_prepare_job(dev, req)) {
-   blk_end_request_all(req, BLK_STS_OK);
-   spin_lock_irq(q->queue_lock);
-   continue;
-   }
-
-   ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req));
-   spin_lock_irq(q->queue_lock);
-   if (ret)
-   break;
-   }
+   return BLK_STS_IOERR;
+
+   if (!bsg_prepare_job(dev, req))
+   return BLK_STS_IOERR;
+
+   ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req));
+   if (ret)
+   return BLK_STS_IOERR;
 
-   spin_unlock_irq(q->queue_lock);
put_device(dev);
-   spin_lock_irq(q->queue_lock);
+   return BLK_STS_OK;
 }
 
 /* called right after the request is allocated for the request_queue */
-static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp)
+static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
+  unsigned int hctx_idx, unsigned int numa_node)
 {
struct bsg_job *job = blk_mq_rq_to_pdu(req);
 
-   job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp);
+   job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);
if (!job->reply)
return -ENOMEM;
return 0;
@@ -289,13 +281,21 @@ static void bsg_initialize_rq(struct request *req)
job->dd_data = job + 1;
 }
 
-static void bsg_exit_rq(struct request_queue *q, struct request *req)
+static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
+  unsigned int hctx_idx)
 {
struct bsg_job *job = blk_mq_rq_to_pdu(req);
 
kfree(job->reply);
 }
 
+static const struct blk_mq_ops bsg_mq_ops = {
+   .queue_rq   = bsg_queue_rq,
+   .init_request   = bsg_init_rq,
+   .exit_request   = bsg_exit_rq,
+   .complete   = bsg_complete,
+};
+
 /**
  * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
  * @dev: device to attach bsg device to
@@ -306,26 +306,32 @@ static void bsg_exit_rq(struct request_queue *q, struct 
request *req)
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
bsg_job_fn *job_fn, int dd_job_size)
 {
+   struct blk_mq_tag_set *set;
struct request_queue *q;
-   int ret;
+   int ret = -ENOMEM;
 
-   q = blk_alloc_queue(GFP_KERNEL);
- 

[PATCH] sg: remove bad blk_end_request_all() call

2018-10-16 Thread Jens Axboe
We just need to free the request here. Additionally, this is
currently wrong for a queue that's using MQ; it'll
crash.

Cc: Doug Gilbert 
Cc: linux-scsi@vger.kernel.org
Signed-off-by: Jens Axboe 
---
 drivers/scsi/sg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 8a254bb46a9b..c6ad00703c5b 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -822,7 +822,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
if (atomic_read(&sdp->detaching)) {
if (srp->bio) {
scsi_req_free_cmd(scsi_req(srp->rq));
-   blk_end_request_all(srp->rq, BLK_STS_IOERR);
+   blk_put_request(srp->rq);
srp->rq = NULL;
    }
 
-- 
2.17.1

-- 
Jens Axboe



[PATCH] fnic_scsi: replace gross legacy tag hack with blk-mq hack

2018-10-16 Thread Jens Axboe
Would be nice to fix up the SCSI midlayer instead, but this will
do for now.

Cc: Christoph Hellwig 
Cc: Satish Kharat 
Cc: linux-scsi@vger.kernel.org
Signed-off-by: Jens Axboe 
---
 drivers/scsi/fnic/fnic_scsi.c | 61 +++
 1 file changed, 12 insertions(+), 49 deletions(-)

diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index 8cbd3c9f0b4c..c156bab6a2ee 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -2272,33 +2272,17 @@ static int fnic_clean_pending_aborts(struct fnic *fnic,
 static inline int
 fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc)
 {
-   struct blk_queue_tag *bqt = fnic->lport->host->bqt;
-   int tag, ret = SCSI_NO_TAG;
+   struct request_queue *q = sc->request->q;
+   struct request *dummy;
 
-   BUG_ON(!bqt);
-   if (!bqt) {
-   pr_err("Tags are not supported\n");
-   goto end;
-   }
-
-   do {
-   tag = find_next_zero_bit(bqt->tag_map, bqt->max_depth, 1);
-   if (tag >= bqt->max_depth) {
-   pr_err("Tag allocation failure\n");
-   goto end;
-   }
-   } while (test_and_set_bit(tag, bqt->tag_map));
+   dummy = blk_mq_alloc_request(q, REQ_OP_WRITE, BLK_MQ_REQ_NOWAIT);
+   if (IS_ERR(dummy))
+   return SCSI_NO_TAG;
 
-   bqt->tag_index[tag] = sc->request;
-   sc->request->tag = tag;
-   sc->tag = tag;
-   if (!sc->request->special)
-   sc->request->special = sc;
+   sc->tag = sc->request->tag = dummy->tag;
+   sc->request->special = sc;
 
-   ret = tag;
-
-end:
-   return ret;
+   return dummy->tag;
 }
 
 /**
@@ -2308,20 +2292,9 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct 
scsi_cmnd *sc)
 static inline void
 fnic_scsi_host_end_tag(struct fnic *fnic, struct scsi_cmnd *sc)
 {
-   struct blk_queue_tag *bqt = fnic->lport->host->bqt;
-   int tag = sc->request->tag;
-
-   if (tag == SCSI_NO_TAG)
-   return;
-
-   BUG_ON(!bqt || !bqt->tag_index[tag]);
-   if (!bqt)
-   return;
+   struct request *dummy = sc->request->special;
 
-   bqt->tag_index[tag] = NULL;
-   clear_bit(tag, bqt->tag_map);
-
-   return;
+   blk_mq_free_request(dummy);
 }
 
 /*
@@ -2380,19 +2353,9 @@ int fnic_device_reset(struct scsi_cmnd *sc)
tag = sc->request->tag;
if (unlikely(tag < 0)) {
/*
-* XXX(hch): current the midlayer fakes up a struct
-* request for the explicit reset ioctls, and those
-* don't have a tag allocated to them.  The below
-* code pokes into midlayer structures to paper over
-* this design issue, but that won't work for blk-mq.
-*
-* Either someone who can actually test the hardware
-* will have to come up with a similar hack for the
-* blk-mq case, or we'll have to bite the bullet and
-* fix the way the EH ioctls work for real, but until
-* that happens we fail these explicit requests here.
+* Really should fix the midlayer to pass in a proper
+* request for ioctls...
 */
-
tag = fnic_scsi_host_start_tag(fnic, sc);
if (unlikely(tag == SCSI_NO_TAG))
goto fnic_device_reset_end;
-- 
2.17.1

-- 
Jens Axboe



[PATCH] osd: initiator should use mq variant of request ending

2018-10-16 Thread Jens Axboe
This is currently wrong since it isn't dependent on if we're using
mq or not. At least now it'll be correct when we force mq.

Cc: linux-scsi@vger.kernel.org
Signed-off-by: Jens Axboe 
---
 drivers/scsi/osd/osd_initiator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 67b14576fff2..e19fa883376f 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -445,7 +445,7 @@ static void _put_request(struct request *rq)
 *   code paths.
 */
if (unlikely(rq->bio))
-   blk_end_request(rq, BLK_STS_IOERR, blk_rq_bytes(rq));
+   blk_mq_end_request(rq, BLK_STS_IOERR);
else
blk_put_request(rq);
 }
-- 
2.17.1

-- 
Jens Axboe



[PATCH] IB/srp: remove old request_fn_active check

2018-10-16 Thread Jens Axboe
This check is only viable for non scsi-mq. Since that is going away,
kill this legacy check.

Cc: Bart Van Assche 
Cc: Parav Pandit 
Cc: linux-scsi@vger.kernel.org
Signed-off-by: Jens Axboe 
---
 drivers/infiniband/ulp/srp/ib_srp.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c 
b/drivers/infiniband/ulp/srp/ib_srp.c
index 0b34e909505f..5a79444c2f3c 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1334,13 +1334,6 @@ static void srp_terminate_io(struct srp_rport *rport)
struct scsi_device *sdev;
int i, j;
 
-   /*
-* Invoking srp_terminate_io() while srp_queuecommand() is running
-* is not safe. Hence the warning statement below.
-*/
-   shost_for_each_device(sdev, shost)
-   WARN_ON_ONCE(sdev->request_queue->request_fn_active);
-
for (i = 0; i < target->ch_count; i++) {
ch = &target->ch[i];
 
-- 
2.17.1

-- 
Jens Axboe



Re: [PATCH v4 00/11] Zoned block device support improvements

2018-10-15 Thread Jens Axboe
On 10/14/18 6:45 PM, Damien Le Moal wrote:
> Jens,
> 
> On 2018/10/14 7:43, Jens Axboe wrote:
>> On 10/12/18 4:08 AM, Damien Le Moal wrote:
>>> This series improves zoned block device support (reduce overhead) and
>>> introduces many simplifications to the code (overall, there are more 
>>> deletions
>>> than insertions).
>>>
>>> In more details:
>>> * Patches 1 to 3 are SCSI side (sd driver) cleanups and improvements 
>>> reducing
>>>   the overhead of report zones command execution during disk scan and
>>>   revalidation.
>>> * Patches 4 to 9 improve the usability and user API of zoned block devices.
>>> * Patch 10 is the main part of this series. This patch replaces the
>>>   REQ_OP_ZONE_REPORT BIO/request operation for executing report zones 
>>> commands
>>>   with a block device file operation, removing the need for the command 
>>> reply
>>>   payload in-place rewriting in the BIO buffer. This leads to major
>>>   simplification of the code in many places.
>>> * Patch 11 further simplifies the code of low level drivers by providing a
>>>   generic implementation of zoned block device request queue zone bitmaps
>>>   initialization and revalidation.
>>>
>>> Please consider the addition of these patches in 4.20.
>>> Comments are as always welcome.
>>
>> How do we want to funnel this series? 1-3 look separate, so perhaps
>> they should go through the scsi tree. Then I can take the rest. Or do
>> some of the later ones depend on 1-3 being in, in terms of applying
>> cleanly?  I can also take all of them, looks like only #3 needs a
>> SCSI ack.
> 
> Patch 10 and 11 will not apply cleanly for the scsi part without 1-3
> in first.  1-3, 10 and 11 would need an ack/review from Martin (SCSI).
> And 10-11 also probably need an ack/review from Mike for the DM
> changes.

Probably want to start pinging people, if you're targeting 4.20 with
this...

-- 
Jens Axboe



Re: [PATCH v4 00/11] Zoned block device support improvements

2018-10-13 Thread Jens Axboe
On 10/12/18 4:08 AM, Damien Le Moal wrote:
> This series improves zoned block device support (reduce overhead) and
> introduces many simplifications to the code (overall, there are more deletions
> than insertions).
> 
> In more details:
> * Patches 1 to 3 are SCSI side (sd driver) cleanups and improvements reducing
>   the overhead of report zones command execution during disk scan and
>   revalidation.
> * Patches 4 to 9 improve the usability and user API of zoned block devices.
> * Patch 10 is the main part of this series. This patch replaces the
>   REQ_OP_ZONE_REPORT BIO/request operation for executing report zones commands
>   with a block device file operation, removing the need for the command reply
>   payload in-place rewriting in the BIO buffer. This leads to major
>   simplification of the code in many places.
> * Patch 11 further simplifies the code of low level drivers by providing a
>   generic implementation of zoned block device request queue zone bitmaps
>   initialization and revalidation.
> 
> Please consider the addition of these patches in 4.20.
> Comments are as always welcome.

How do we want to funnel this series? 1-3 look separate, so perhaps they
should go through the scsi tree. Then I can take the rest. Or do some
of the later ones depend on 1-3 being in, in terms of applying cleanly?
I can also take all of them, looks like only #3 needs a SCSI ack.

-- 
Jens Axboe



Re: [PATCHv5 0/3] Deprecate DAC960 driver

2018-10-12 Thread Jens Axboe
On 10/12/18 7:38 AM, Bart Van Assche wrote:
> On 10/12/18 12:15 AM, Hannes Reinecke wrote:
>> Hi all,
>>
>> as we're trying to get rid of the remaining request_fn drivers here's
>> a patchset to move the DAC960 driver to the SCSI stack.
>> As per request from hch I've split up the driver into two new SCSI
>> drivers called 'myrb' and 'myrs'.
> 
> Hi Hannes,
> 
> Apparently only patches 1/3 and 2/3 made it to the linux-scsi mailing 
> list but patch 3/3 not. See also 
> https://marc.info/?l=linux-scsi&r=3&b=201810&w=2.

I'm guessing it's just too big. I got it, but probably due to the CC.

-- 
Jens Axboe



Re: SCSI regression

2018-08-24 Thread Jens Axboe
On 8/24/18 6:21 PM, Jens Axboe wrote:
> On 8/24/18 5:16 PM, Ming Lei wrote:
>> Hi,
>>
>> On Fri, Aug 24, 2018 at 04:20:41PM -0600, Jens Axboe wrote:
>>> Hi,
>>>
>>> Was testing other things today, but ended up with this:
>>>
>>> # echo "write through" > /sys/block/sde/device/scsi_disk/4:0:0:0/cache_type
>>>
>>> hanging. Looking closer, the request is successfully queued and the
>>> caller is waiting on rq execution and completion, but the request is
>>> sitting in the hctx->dispatch list and is continually being attempted
>>> issued, but gets a BLK_STS_RESOURCE return.
>>
>> Just run fio randwrite and 'dbench -s' on virtio-scsi/usb-storage
>> after setting 'write through', looks not see such issue.
>>
>> Also not see such kind of issue on blktests/xfstests against today's
>> next tree too.
>>
>> Could you share a bit more(disk, io sched, dmesg log, workload) about
>> how to reproduce it? Is it in normal IO path or EH?
> 
> You're misunderstanding. The echo "write through" is the one that hangs,
> not subsequent IO. As written above, that first spawns a TUR and that
> request is being inserted, and the caller ends up waiting for it to
> complete off blk_execute_rq(). But the request itself sits on the
> dispatch list, gets dispatched, and gets BLK_STS_RESOURCE off
> ->queue_rq(). It goes back on the dispatch list, and the process repeats
> indefinitely since it always gets a BUSY return. On the SCSI side, what
> happens is that scsi_host_queue_ready() keeps returning false, which is
> why we keep returning BLK_STS_RESOURCE and not making any progress at
> all.

Task doing the echo:

[<0>] blk_execute_rq+0x77/0xa0
[<0>] __scsi_execute+0xd3/0x1f0
[<0>] sd_revalidate_disk+0xda/0x1cd0 [sd_mod]
[<0>] revalidate_disk+0x20/0x80
[<0>] cache_type_store+0x1f7/0x210 [sd_mod]
[<0>] kernfs_fop_write+0x106/0x190
[<0>] __vfs_write+0x23/0x150
[<0>] vfs_write+0xbe/0x1b0
[<0>] ksys_write+0x45/0xa0
[<0>] do_syscall_64+0x42/0x100
[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[<0>] 0x

and the host is in perpetual recovery mode:

# cat /sys/bus/scsi/devices/host4/scsi_host/host4/state
recovery

This is a normal SATA drive, hanging off ahci, queue depth 32. As mentioned
earlier, scsi_host_queue_ready() keeps returning false.

-- 
Jens Axboe



Re: SCSI regression

2018-08-24 Thread Jens Axboe
On 8/24/18 5:16 PM, Ming Lei wrote:
> Hi,
> 
> On Fri, Aug 24, 2018 at 04:20:41PM -0600, Jens Axboe wrote:
>> Hi,
>>
>> Was testing other things today, but ended up with this:
>>
>> # echo "write through" > /sys/block/sde/device/scsi_disk/4:0:0:0/cache_type
>>
>> hanging. Looking closer, the request is successfully queued and the
>> caller is waiting on rq execution and completion, but the request is
>> sitting in the hctx->dispatch list and is continually being attempted
>> issued, but gets a BLK_STS_RESOURCE return.
> 
> Just run fio randwrite and 'dbench -s' on virtio-scsi/usb-storage
> after setting 'write through', looks not see such issue.
> 
> Also not see such kind of issue on blktests/xfstests against today's
> next tree too.
> 
> Could you share a bit more(disk, io sched, dmesg log, workload) about
> how to reproduce it? Is it in normal IO path or EH?

You're misunderstanding. The echo "write through" is the one that hangs,
not subsequent IO. As written above, that first spawns a TUR and that
request is being inserted, and the caller ends up waiting for it to
complete off blk_execute_rq(). But the request itself sits on the
dispatch list, gets dispatched, and gets BLK_STS_RESOURCE off
->queue_rq(). It goes back on the dispatch list, and the process repeats
indefinitely since it always gets a BUSY return. On the SCSI side, what
happens is that scsi_host_queue_ready() keeps returning false, which is
why we keep returning BLK_STS_RESOURCE and not making any progress at
all.

Hope that helps...

-- 
Jens Axboe



Re: SCSI regression

2018-08-24 Thread Jens Axboe
On 8/24/18 4:20 PM, Jens Axboe wrote:
> Hi,
> 
> Was testing other things today, but ended up with this:
> 
> # echo "write through" > /sys/block/sde/device/scsi_disk/4:0:0:0/cache_type
> 
> hanging. Looking closer, the request is successfully queued and the
> caller is waiting on rq execution and completion, but the request is
> sitting in the hctx->dispatch list and is continually being attempted
> issued, but gets a BLK_STS_RESOURCE return.
> 
> The problematic commit is:
> 
> commit 328728630d9f2bf14b82ca30b5e47489beefe361
> Author: Ming Lei 
> Date:   Sun Jun 24 22:03:27 2018 +0800
> 
> scsi: core: avoid host-wide host_busy counter for scsi_mq
> 
> so I'm guessing it's some one-off or similar. I won't have more time
> to look into this today, so might have to wait until Monday.

Forgot to mention that this is with scsi-mq enabled.

-- 
Jens Axboe



SCSI regression

2018-08-24 Thread Jens Axboe
Hi,

Was testing other things today, but ended up with this:

# echo "write through" > /sys/block/sde/device/scsi_disk/4:0:0:0/cache_type

hanging. Looking closer, the request is successfully queued and the
caller is waiting on rq execution and completion, but the request is
sitting in the hctx->dispatch list and is continually being attempted
issued, but gets a BLK_STS_RESOURCE return.

The problematic commit is:

commit 328728630d9f2bf14b82ca30b5e47489beefe361
Author: Ming Lei 
Date:   Sun Jun 24 22:03:27 2018 +0800

scsi: core: avoid host-wide host_busy counter for scsi_mq

so I'm guessing it's some one-off or similar. I won't have more time
to look into this today, so might have to wait until Monday.

-- 
Jens Axboe



Re: [PATCH RFC] sr: mark the device as changed when burning a CD

2018-06-20 Thread Jens Axboe
On 6/20/18 5:52 AM, Maurizio Lombardi wrote:
> Hi Jens,
> 
> Dne 23.5.2018 v 16:42 Jens Axboe napsal(a):
>> On 5/23/18 3:19 AM, Maurizio Lombardi wrote:
>>>
>>>
>>> Dne 22.5.2018 v 16:47 Jens Axboe napsal(a):
>>>> It's been many years, but back in the day the program writing the cd
>>>> would eject the disc once done. This of course forces a reload of
>>>> the toc and clearing of the flag. What program is this? Seems like
>>>> it should probably eject when it's done.
>>>
>>> They are using wodim to burn the CDs on their servers.
>>> The problem is that they do not want the CD to be ejected because their 
>>> drives
>>> lack a motorized tray, thus requiring manual intervention which they would 
>>> like to avoid.
>>
>> I took a quick look at it, man that sr driver needs a bit of love :-)
>>
>> Anyway, I wonder if something like the below would work. Check for
>> a close track command in the sr completion handler, and flag the media
>> as changed if we see one. Totally untested...
>>
>>
>> diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
>> index 3f3cb72e0c0c..48f0d7a096db 100644
>> --- a/drivers/scsi/sr.c
>> +++ b/drivers/scsi/sr.c
>> @@ -328,6 +328,9 @@ static int sr_done(struct scsi_cmnd *SCpnt)
>>  scmd_printk(KERN_INFO, SCpnt, "done: %x\n", result);
>>  #endif
>>  
>> +if (SCpnt->cmnd[0] == GPCMD_CLOSE_TRACK)
>> +cd->device->changed = 1;
>> +
>>  /*
>>   * Handle MEDIUM ERRORs or VOLUME OVERFLOWs that indicate partial
>>   * success.  Since this is a relatively rare error condition, no
>>
> 
> I just want to let you know that I tested the patch but unfortunately it 
> doesn't work.
> I will try to find out what GPCMD_* commands are passed to sr_done() when 
> burning a disc
> to see if there is another one which we could use.

It's been a decade since I last messed with this or burned a cd-r, so that
would be appreciated. blktrace should be enough to tell you what commands
are being sent.

-- 
Jens Axboe



Re: [PATCH 3/6] block: Create scsi_sense.h for SCSI and ATAPI

2018-05-23 Thread Jens Axboe
On 5/23/18 3:20 PM, Randy Dunlap wrote:
> On 05/23/2018 02:14 PM, Jens Axboe wrote:
>> On 5/23/18 2:52 PM, Kees Cook wrote:
>>> On Wed, May 23, 2018 at 7:31 AM, Jens Axboe  wrote:
>>>> On 5/23/18 8:25 AM, Christoph Hellwig wrote:
>>>>> On Wed, May 23, 2018 at 08:13:56AM -0600, Jens Axboe wrote:
>>>>>>> Should I move to code to a new drivers/scsi/scsi_sense.c and add it to
>>>>>>> drivers/scsi/Makefile as:
>>>>>>>
>>>>>>> obj-$(CONFIG_BLK_SCSI_REQUEST)+= scsi_sense.o
>>>>>>>
>>>>>>> Every place I want to use the code is already covered by
>>>>>>> CONFIG_BLK_SCSI_REQUEST, so it seems like I just need to know where to
>>>>>>> put the .c file. :P
>>>>>>
>>>>>> I think this is so much saner than a SCSI select or dependency, so I'll
>>>>>> have to disagree with Martin and Christoph. Just put it in drivers/scsi,
>>>>>> if it's the location they care about.
>>>>>
>>>>> I actually plan to remove CONFIG_BLK_SCSI_REQUEST in a merge window
>>>>> or two.  The only users are scsi and the ide layer, (virtio_blk
>>>>> support has already been accidentally disabled for a while), and getting
>>>>> rid of it allows to to shrink and simply the scsi data structures.
>>>>>
>>>>> But if you want this for now lets keep scsi_sense.c in drivers/scsi
>>>>> but depend on CONFIG_BLK_SCSI_REQUEST, that is easy enough to fix up.
>>>>
>>>> It could be a stand-alone dependency, doesn't have to be BLK_SCSI_REQUEST.
>>>> BLA_SCSI_SENSE or something would do. I don't care too much about that,
>>>> mostly getting rid of the entire stack dependency.
>>>
>>> Aaand, I can't do this and leave it in drivers/scsi because of 
>>> drivers/Makefile:
>>>
>>> obj-$(CONFIG_SCSI)  += scsi/
>>>
>>> So: this needs to live in block/ just like CONFIG_BLK_SCSI_REQUEST's
>>> scsi_ioctl.c. I will split it into CONFIG_BLK_SCSI_SENSE, but I'll
>>> still need to move the code from drivers/scsi/ to block/. Is this
>>> okay?
>>
>> Ugh, so that would necessitate a change there too. As I said before,
>> I don't really care where it lives. I know the SCSI folks seem bothered
>> by moving it, but in reality, it's not like this stuff will likely ever
>> really change. Of the two choices (select entire SCSI stack, or just move
>> this little bit), I know what I would consider the saner option...
>>
> 
> or option 3:
> 
> obj-y   += scsi/
> 
> so that make descends into drivers/scsi/ and then builds whatever is needed,
> depending on individual kconfig options.

Right, that was the initial option, the later two are the other options.

-- 
Jens Axboe



Re: [PATCH 3/6] block: Create scsi_sense.h for SCSI and ATAPI

2018-05-23 Thread Jens Axboe
On 5/23/18 2:52 PM, Kees Cook wrote:
> On Wed, May 23, 2018 at 7:31 AM, Jens Axboe  wrote:
>> On 5/23/18 8:25 AM, Christoph Hellwig wrote:
>>> On Wed, May 23, 2018 at 08:13:56AM -0600, Jens Axboe wrote:
>>>>> Should I move to code to a new drivers/scsi/scsi_sense.c and add it to
>>>>> drivers/scsi/Makefile as:
>>>>>
>>>>> obj-$(CONFIG_BLK_SCSI_REQUEST)+= scsi_sense.o
>>>>>
>>>>> Every place I want to use the code is already covered by
>>>>> CONFIG_BLK_SCSI_REQUEST, so it seems like I just need to know where to
>>>>> put the .c file. :P
>>>>
>>>> I think this is so much saner than a SCSI select or dependency, so I'll
>>>> have to disagree with Martin and Christoph. Just put it in drivers/scsi,
>>>> if it's the location they care about.
>>>
>>> I actually plan to remove CONFIG_BLK_SCSI_REQUEST in a merge window
>>> or two.  The only users are scsi and the ide layer, (virtio_blk
>>> support has already been accidentally disabled for a while), and getting
>>> rid of it allows us to shrink and simplify the scsi data structures.
>>>
>>> But if you want this for now lets keep scsi_sense.c in drivers/scsi
>>> but depend on CONFIG_BLK_SCSI_REQUEST, that is easy enough to fix up.
>>
>> It could be a stand-alone dependency, doesn't have to be BLK_SCSI_REQUEST.
>> BLA_SCSI_SENSE or something would do. I don't care too much about that,
>> mostly getting rid of the entire stack dependency.
> 
> Aaand, I can't do this and leave it in drivers/scsi because of 
> drivers/Makefile:
> 
> obj-$(CONFIG_SCSI)  += scsi/
> 
> So: this needs to live in block/ just like CONFIG_BLK_SCSI_REQUEST's
> scsi_ioctl.c. I will split it into CONFIG_BLK_SCSI_SENSE, but I'll
> still need to move the code from drivers/scsi/ to block/. Is this
> okay?

Ugh, so that would necessitate a change there too. As I said before,
I don't really care where it lives. I know the SCSI folks seem bothered
by moving it, but in reality, it's not like this stuff will likely ever
really change. Of the two choices (select entire SCSI stack, or just move
this little bit), I know what I would consider the saner option...

-- 
Jens Axboe



Re: [PATCH RFC] sr: mark the device as changed when burning a CD

2018-05-23 Thread Jens Axboe
On 5/23/18 3:19 AM, Maurizio Lombardi wrote:
> 
> 
> Dne 22.5.2018 v 16:47 Jens Axboe napsal(a):
>> It's been many years, but back in the day the program writing the cd
>> would eject the disc once done. This of course forces a reload of
>> the toc and clearing of the flag. What program is this? Seems like
>> it should probably eject when it's done.
> 
> They are using wodim to burn the CDs on their servers.
> The problem is that they do not want the CD to be ejected because their drives
> lack a motorized tray, thus requiring manual intervention which they would 
> like to avoid.

I took a quick look at it, man that sr driver needs a bit of love :-)

Anyway, I wonder if something like the below would work. Check for
a close track command in the sr completion handler, and flag the media
as changed if we see one. Totally untested...


diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 3f3cb72e0c0c..48f0d7a096db 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -328,6 +328,9 @@ static int sr_done(struct scsi_cmnd *SCpnt)
scmd_printk(KERN_INFO, SCpnt, "done: %x\n", result);
 #endif
 
+   if (SCpnt->cmnd[0] == GPCMD_CLOSE_TRACK)
+   cd->device->changed = 1;
+
/*
 * Handle MEDIUM ERRORs or VOLUME OVERFLOWs that indicate partial
     * success.  Since this is a relatively rare error condition, no

-- 
Jens Axboe



Re: [PATCH 3/6] block: Create scsi_sense.h for SCSI and ATAPI

2018-05-23 Thread Jens Axboe
On 5/23/18 8:25 AM, Christoph Hellwig wrote:
> On Wed, May 23, 2018 at 08:13:56AM -0600, Jens Axboe wrote:
>>> Should I move to code to a new drivers/scsi/scsi_sense.c and add it to
>>> drivers/scsi/Makefile as:
>>>
>>> obj-$(CONFIG_BLK_SCSI_REQUEST)+= scsi_sense.o
>>>
>>> Every place I want to use the code is already covered by
>>> CONFIG_BLK_SCSI_REQUEST, so it seems like I just need to know where to
>>> put the .c file. :P
>>
>> I think this is so much saner than a SCSI select or dependency, so I'll
>> have to disagree with Martin and Christoph. Just put it in drivers/scsi,
>> if it's the location they care about.
> 
> I actually plan to remove CONFIG_BLK_SCSI_REQUEST in a merge window
> or two.  The only users are scsi and the ide layer, (virtio_blk
> support has already been accidentally disabled for a while), and getting
> rid of it allows us to shrink and simplify the scsi data structures.
> 
> But if you want this for now lets keep scsi_sense.c in drivers/scsi
> but depend on CONFIG_BLK_SCSI_REQUEST, that is easy enough to fix up.

It could be a stand-alone dependency, doesn't have to be BLK_SCSI_REQUEST.
BLA_SCSI_SENSE or something would do. I don't care too much about that,
mostly getting rid of the entire stack dependency.

-- 
Jens Axboe



Re: [PATCH 3/6] block: Create scsi_sense.h for SCSI and ATAPI

2018-05-23 Thread Jens Axboe
On 5/22/18 5:49 PM, Kees Cook wrote:
> On Tue, May 22, 2018 at 4:42 PM, Jens Axboe  wrote:
>> On May 22, 2018, at 5:31 PM, Kees Cook  wrote:
>>>
>>>> On Tue, May 22, 2018 at 12:16 PM, Jens Axboe  wrote:
>>>>> On 5/22/18 1:13 PM, Christoph Hellwig wrote:
>>>>>> On Tue, May 22, 2018 at 01:09:41PM -0600, Jens Axboe wrote:
>>>>>> I think Martin and Christoph are objecting to moving the code to
>>>>>> block/scsi_ioctl.h. I don't care too much about where the code is, but
>>>>>> think it would be nice to have the definitions in a separate header. But
>>>>>> if they prefer just pulling in all of SCSI for it, well then I guess
>>>>>> it's pointless to move the header bits. Seems very heavy handed to me,
>>>>>> though.
>>>>>
>>>>> It might be heavy handed for the 3 remaining users of drivers/ide,
>>>>
>>>> Brutal :-)
>>>
>>> Heh. I noticed a similar sense buffer use in drivers/cdrom/cdrom.c
>>> too. Is this okay under the same considerations?
>>
>> This is going from somewhat crazy to pretty nuts, imho. I guess in practical 
>> terms it doesn’t matter that much, since most folks would be using sr. I 
>> still think a split would be vastly superior. Just keep the scsi code in 
>> drivers/scsi/ and make it independently selectable.
> 
> I had originally tied it to BLK_SCSI_REQUEST. Logically speaking,
> sense buffers are part of the request, and the CONFIG work is already
> done. This is roughly what I tried to do before, since scsi_ioctl.c is
> the only code pulled in for CONFIG_BLK_SCSI_REQUEST:
> 
> $ git grep BLK_SCSI_REQUEST | grep Makefile
> block/Makefile:obj-$(CONFIG_BLK_SCSI_REQUEST)   += scsi_ioctl.o
> 
> Should I move to code to a new drivers/scsi/scsi_sense.c and add it to
> drivers/scsi/Makefile as:
> 
> obj-$(CONFIG_BLK_SCSI_REQUEST)+= scsi_sense.o
> 
> Every place I want to use the code is already covered by
> CONFIG_BLK_SCSI_REQUEST, so it seems like I just need to know where to
> put the .c file. :P

I think this is so much saner than a SCSI select or dependency, so I'll
have to disagree with Martin and Christoph. Just put it in drivers/scsi,
if it's the location they care about.

-- 
Jens Axboe



Re: [PATCH 3/6] block: Create scsi_sense.h for SCSI and ATAPI

2018-05-22 Thread Jens Axboe
On May 22, 2018, at 5:31 PM, Kees Cook  wrote:
> 
>> On Tue, May 22, 2018 at 12:16 PM, Jens Axboe  wrote:
>>> On 5/22/18 1:13 PM, Christoph Hellwig wrote:
>>>> On Tue, May 22, 2018 at 01:09:41PM -0600, Jens Axboe wrote:
>>>> I think Martin and Christoph are objecting to moving the code to
>>>> block/scsi_ioctl.h. I don't care too much about where the code is, but
>>>> think it would be nice to have the definitions in a separate header. But
>>>> if they prefer just pulling in all of SCSI for it, well then I guess
>>>> it's pointless to move the header bits. Seems very heavy handed to me,
>>>> though.
>>> 
>>> It might be heavy handed for the 3 remaining users of drivers/ide,
>> 
>> Brutal :-)
> 
> Heh. I noticed a similar sense buffer use in drivers/cdrom/cdrom.c
> too. Is this okay under the same considerations?

This is going from somewhat crazy to pretty nuts, imho. I guess in practical 
terms it doesn’t matter that much, since most folks would be using sr. I still 
think a split would be vastly superior. Just keep the scsi code in 
drivers/scsi/ and make it independently selectable.

Jens



Re: [PATCH 3/6] block: Create scsi_sense.h for SCSI and ATAPI

2018-05-22 Thread Jens Axboe
On 5/22/18 1:13 PM, Christoph Hellwig wrote:
> On Tue, May 22, 2018 at 01:09:41PM -0600, Jens Axboe wrote:
>> I think Martin and Christoph are objecting to moving the code to
>> block/scsi_ioctl.h. I don't care too much about where the code is, but
>> think it would be nice to have the definitions in a separate header. But
>> if they prefer just pulling in all of SCSI for it, well then I guess
>> it's pointless to move the header bits. Seems very heavy handed to me,
>> though.
> 
> It might be heavy handed for the 3 remaining users of drivers/ide,

Brutal :-)

> but as long as that stuff just keeps working I'd rather worry about
> everyone else, and keep the scsi code where it belongs.

Fine with me then, hopefully we can some day kill it off.

-- 
Jens Axboe



Re: [PATCH 3/6] block: Create scsi_sense.h for SCSI and ATAPI

2018-05-22 Thread Jens Axboe
On 5/22/18 12:59 PM, Kees Cook wrote:
> On Tue, May 22, 2018 at 11:50 AM, Martin K. Petersen
>  wrote:
>>
>> Christoph,
>>
>>> On Tue, May 22, 2018 at 11:15:09AM -0700, Kees Cook wrote:
>>>> Both SCSI and ATAPI share the sense header. In preparation for using the
>>>> struct scsi_sense_hdr more widely, move this into a separate header and
>>>> move the helper function to scsi_ioctl.c which is linked with CONFIG_IDE
>>>> by way of CONFIG_BLK_SCSI_REQUEST.
>>>
>>> Please keep the code where it is and just depend on SCSI on the legacy
>>> ide driver.  No need to do gymnastics just for a legacy case.
>>
>> Yup, I agree.
> 
> Oh, er, this was mainly done at Jens's request. Jens, can you advise?

I think Martin and Christoph are objecting to moving the code to
block/scsi_ioctl.h. I don't care too much about where the code is, but
think it would be nice to have the definitions in a separate header. But
if they prefer just pulling in all of SCSI for it, well then I guess
it's pointless to move the header bits. Seems very heavy handed to me,
though.

-- 
Jens Axboe



Re: [PATCH RFC] sr: mark the device as changed when burning a CD

2018-05-22 Thread Jens Axboe
On 5/22/18 3:15 AM, Maurizio Lombardi wrote:
> Some of our customers complain that, after burning a CD, it's not possible
> to proceed to mount it without ejecting it first.
> 
> The problem is that the sr driver caches some information about the CD-ROM
> and doesn't update them after burning it, leading to a failure
> when the isofs filesystem tries to read the superblock.
> 
> This patch modifies the sr driver so it detects the SG_IO
> write ioctls and marks the device as "changed".
> As a result, at the next open() syscall, the revalidate_block()
> callback will be executed and the sr driver will update its cached info.
> 
> Is this patch acceptable to you? Do you have different ideas?

It's been many years, but back in the day the program writing the cd
would eject the disc once done. This of course forces a reload of
the toc and clearing of the flag. What program is this? Seems like
it should probably eject when it's done.

There are many commands that will indicate writing to the device, but
not cause anything that should force a reload. A mode select comes to
mind.

-- 
Jens Axboe



Re: [PATCH 1/2] Convert target drivers to use sbitmap

2018-05-15 Thread Jens Axboe
On 5/15/18 10:11 AM, Jens Axboe wrote:
> On 5/15/18 10:00 AM, Matthew Wilcox wrote:
>> From: Matthew Wilcox 
>>
>> The sbitmap and the percpu_ida perform essentially the same task,
>> allocating tags for commands.  Since the sbitmap is more used than
>> the percpu_ida, convert the percpu_ida users to the sbitmap API.
> 
> It should also be the same performance as percpu_ida in light use, and
> performs much better at > 50% utilization of the tag space. I think
> that's better justification than "more used than".

Had to search long and hard for the perf numbers I did for percpu_ida
on higher utilization, but here it is:

https://lkml.org/lkml/2014/4/22/553

-- 
Jens Axboe



Re: [PATCH 1/2] Convert target drivers to use sbitmap

2018-05-15 Thread Jens Axboe
On 5/15/18 10:00 AM, Matthew Wilcox wrote:
> From: Matthew Wilcox 
> 
> The sbitmap and the percpu_ida perform essentially the same task,
> allocating tags for commands.  Since the sbitmap is more used than
> the percpu_ida, convert the percpu_ida users to the sbitmap API.

It should also be the same performance as percpu_ida in light use, and
performs much better at > 50% utilization of the tag space. I think
that's better justification than "more used than".

> diff --git a/drivers/target/iscsi/iscsi_target_util.c 
> b/drivers/target/iscsi/iscsi_target_util.c
> index 4435bf374d2d..28bcffae609f 100644
> --- a/drivers/target/iscsi/iscsi_target_util.c
> +++ b/drivers/target/iscsi/iscsi_target_util.c
> @@ -17,7 +17,7 @@
>   
> **/
>  
>  #include 
> -#include 
> +#include 
>  #include  /* ipv6_addr_equal() */
>  #include 
>  #include 
> @@ -147,6 +147,28 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd)
>   spin_unlock_bh(&cmd->r2t_lock);
>  }
>  
> +int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup)
> +{
> + int tag = -1;
> + DEFINE_WAIT(wait);
> + struct sbq_wait_state *ws;
> +
> + if (state == TASK_RUNNING)
> + return tag;
> +
> + ws = &se_sess->sess_tag_pool.ws[0];
> + for (;;) {
> + prepare_to_wait_exclusive(&ws->wait, &wait, state);
> + if (signal_pending_state(state, current))
> + break;
> + schedule();
> + tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup);
> + }
> +
> + finish_wait(&ws->wait, &wait);
> + return tag;
> +}

Seems like that should be:


ws = &se_sess->sess_tag_pool.ws[0];
for (;;) {
prepare_to_wait_exclusive(&ws->wait, &wait, state);
if (signal_pending_state(state, current))
break;
tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup);
if (tag != -1)
break;
schedule();
}

finish_wait(&ws->wait, &wait);
return tag;

>  /*
>   * May be called from software interrupt (timer) context for allocating
>   * iSCSI NopINs.
> @@ -155,9 +177,11 @@ struct iscsi_cmd *iscsit_allocate_cmd(struct iscsi_conn 
> *conn, int state)
>  {
>   struct iscsi_cmd *cmd;
>   struct se_session *se_sess = conn->sess->se_sess;
> - int size, tag;
> + int size, tag, cpu;
>  
> - tag = percpu_ida_alloc(&se_sess->sess_tag_pool, state);
> + tag = sbitmap_queue_get(&se_sess->sess_tag_pool, &cpu);
> + if (tag < 0)
> + tag = iscsit_wait_for_tag(se_sess, state, &cpu);
>   if (tag < 0)
>   return NULL;

Might make sense to just roll the whole thing into iscsi_get_tag(), that
would be cleaner.

sbitmap should provide a helper for that, but we can do that cleanup
later. That would encapsulate things like the per-cpu caching hint too,
for instance.

Rest looks fine to me.

-- 
Jens Axboe



Re: [PATCH 0/3] scsi: scsi-mq: don't hold host_busy in IO path

2018-04-27 Thread Jens Axboe
On 4/27/18 9:48 AM, Bart Van Assche wrote:
> On Fri, 2018-04-27 at 09:39 -0600, Jens Axboe wrote:
>> blk_mq_tagset_busy_iter(&shost->tag_set, scsi_host_check_in_flight,
>>  &in_flight);
>> return in_flight.cnt + atomic_read(&shost->host_busy);
>>
>> The atomic read is basically free, once we get rid of the dirty of that
>> variable on each IO.
> 
> Hello Jens,
> 
> What makes you think that " + atomic_read(&shost->host_busy)" is necessary?
> I am not aware of any code outside the SCSI core that modifies the host_busy
> member.

It's a (scalable) hack to count those as well. Going forward they should be
converted to just using reserved tags, of course.

-- 
Jens Axboe



Re: [PATCH 0/3] scsi: scsi-mq: don't hold host_busy in IO path

2018-04-27 Thread Jens Axboe
On 4/27/18 9:31 AM, Bart Van Assche wrote:
> On Fri, 2018-04-20 at 14:57 +0800, Ming Lei wrote:
>> This patch removes the expensive atomic operation on the host-wide counter
>> of .host_busy for scsi-mq, and it is observed that IOPS can be increased by
>> 15% with this change in IO test over scsi_debug.
>>
>>
>> Ming Lei (3):
>>   scsi: introduce scsi_host_busy()
>>   scsi: read host_busy via scsi_host_busy()
>>   scsi: avoid to hold host-wide counter of host_busy for scsi_mq
>>
>>  drivers/scsi/advansys.c   |  8 
>>  drivers/scsi/hosts.c  | 32 
>> +++
>>  drivers/scsi/libsas/sas_scsi_host.c   |  4 ++--
>>  drivers/scsi/megaraid/megaraid_sas_base.c |  2 +-
>>  drivers/scsi/mpt3sas/mpt3sas_base.c   |  4 ++--
>>  drivers/scsi/qlogicpti.c  |  2 +-
>>  drivers/scsi/scsi.c   |  2 +-
>>  drivers/scsi/scsi_error.c |  6 +++---
>>  drivers/scsi/scsi_lib.c   | 23 --
>>  drivers/scsi/scsi_sysfs.c |  2 +-
>>  include/scsi/scsi_host.h  |  1 +
>>  11 files changed, 65 insertions(+), 21 deletions(-)
> 
> Hello Ming,
> 
> From the MAINTAINERS file:
> 
> SCSI SUBSYSTEM
> M:  "James E.J. Bottomley" 
> T:  git git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git
> M:  "Martin K. Petersen" 
> T:  git git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git
> L:  linux-scsi@vger.kernel.org
> S:  Maintained
> F:  Documentation/devicetree/bindings/scsi/
> F:  drivers/scsi/
> F:  include/scsi/
> 
> Hence my surprise when I saw that you sent this patch series to Jens instead
> of James and Martin?

Martin and James are both on the CC as well. For what it's worth, the patch
seems like a good approach to me. To handle the case that Hannes was concerned
about (older drivers doing internal command issue), I would suggest that those
drivers get instrumented to include a inc/dec of the host busy count for
internal commands that bypass the normal tagging. That means the mq case needs
to be

blk_mq_tagset_busy_iter(&shost->tag_set, scsi_host_check_in_flight,
&in_flight);
return in_flight.cnt + atomic_read(&shost->host_busy);

The atomic read is basically free, once we get rid of the dirty of that
variable on each IO.

-- 
Jens Axboe



Re: [PATCH 2/5] ide: kill ide_toggle_bounce

2018-04-26 Thread Jens Axboe
On 4/26/18 1:20 AM, Christoph Hellwig wrote:
> On Tue, Apr 24, 2018 at 08:02:56PM -0600, Jens Axboe wrote:
>> On 4/24/18 12:16 PM, Christoph Hellwig wrote:
>>> ide_toggle_bounce did select various strange block bounce limits, including
>>> not bouncing at all as soon as an iommu is present in the system.  Given
>>> that the dma_map routines now handle any required bounce buffering except
>>> for ISA DMA, and the ide code already must handle either ISA DMA or highmem
>>> at least for iommu equipped systems we can get rid of the block layer
>>> bounce limit setting entirely.
>>
>> Pretty sure I was the one to add this code, when highmem page IO was
>> enabled about two decades ago...
>>
>> Outside of DMA, the issue was that the PIO code could not handle
>> highmem. That's not the case anymore, so this should be fine.
> 
> Yes, that is the rationale.  Any chance to you could look over the
> other patches as well?  Except for the networking one for which I'd
> really like to see a review from Dave all the users of the interface
> are block related.

You can add my reviewed-by to 1-3, and 5. Looks good to me.

-- 
Jens Axboe



Re: [PATCH 2/5] ide: kill ide_toggle_bounce

2018-04-24 Thread Jens Axboe
On 4/24/18 12:16 PM, Christoph Hellwig wrote:
> ide_toggle_bounce did select various strange block bounce limits, including
> not bouncing at all as soon as an iommu is present in the system.  Given
> that the dma_map routines now handle any required bounce buffering except
> for ISA DMA, and the ide code already must handle either ISA DMA or highmem
> at least for iommu equipped systems we can get rid of the block layer
> bounce limit setting entirely.

Pretty sure I was the one to add this code, when highmem page IO was
enabled about two decades ago...

Outside of DMA, the issue was that the PIO code could not handle
highmem. That's not the case anymore, so this should be fine.

Reviewed-by: Jens Axboe 

-- 
Jens Axboe



Re: [PATCH blktests] scsi/004: add regression test for false BLK_STS_OK with non good SAM status

2018-04-19 Thread Jens Axboe
On 4/19/18 1:13 PM, Omar Sandoval wrote:
> On Thu, Apr 19, 2018 at 11:53:30AM -0700, Omar Sandoval wrote:
>> Thanks for the test! Applied.
> 
> Side note, it's unfortunate that this test takes 180 seconds to run only
> because we have to wait for the command timeout. We should be able to
> export request_queue->rq_timeout writeable in sysfs. Would you be
> interested in doing that?

Here's one, I even tested it.


diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d00d1b0ec109..d5d628c3c12d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -496,6 +496,27 @@ static ssize_t queue_dax_show(struct request_queue *q, 
char *page)
return queue_var_show(blk_queue_dax(q), page);
 }
 
+static ssize_t queue_timeout_show(struct request_queue *q, char *page)
+{
+   return sprintf(page, "%u\n", q->rq_timeout / HZ);
+}
+
+static ssize_t queue_timeout_store(struct request_queue *q, const char *page,
+  size_t count)
+{
+   unsigned long timeout;
+   ssize_t ret;
+
+   ret = queue_var_store(&timeout, page, count);
+   if (ret < 0)
+   return ret;
+   if (!timeout)
+   return -EINVAL;
+
+   blk_queue_rq_timeout(q, timeout * HZ);
+   return count;
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
@@ -678,6 +699,12 @@ static struct queue_sysfs_entry throtl_sample_time_entry = 
{
 };
 #endif
 
+static struct queue_sysfs_entry queue_timeout_entry = {
+   .attr = {.name = "timeout", .mode = S_IRUGO | S_IWUSR },
+   .show = queue_timeout_show,
+   .store = queue_timeout_store,
+};
+
 static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -714,6 +741,7 @@ static struct attribute *default_attrs[] = {
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
&throtl_sample_time_entry.attr,
 #endif
+   &queue_timeout_entry.attr,
NULL,
 };
 

-- 
Jens Axboe



Re: [PATCH blktests] scsi/004: add regression test for false BLK_STS_OK with non good SAM status

2018-04-19 Thread Jens Axboe
On 4/19/18 1:41 PM, Bart Van Assche wrote:
> On Thu, 2018-04-19 at 12:13 -0700, Omar Sandoval wrote:
>> On Thu, Apr 19, 2018 at 11:53:30AM -0700, Omar Sandoval wrote:
>>> Thanks for the test! Applied.
>>
>> Side note, it's unfortunate that this test takes 180 seconds to run only
>> because we have to wait for the command timeout. We should be able to
>> export request_queue->rq_timeout writeable in sysfs. Would you be
>> interested in doing that?
> 
> Hello Omar,
> 
> Is this perhaps what you are looking for?
> # ls -l /sys/class/scsi_device/*/*/timeout 
> -rw-r--r-- 1 root root 4096 Apr 19 08:52 
> /sys/class/scsi_device/2:0:0:0/device/timeout
> -rw-r--r-- 1 root root 4096 Apr 19 12:39 
> /sys/class/scsi_device/8:0:0:1/device/timeout

We should have it generically available though, not just for SCSI. In
retrospect, it should have been under queue/ from the start, now we'll
end up with duplicate entries for SCSI.

-- 
Jens Axboe



Re: usercopy whitelist woe in scsi_sense_cache

2018-04-18 Thread Jens Axboe
On 4/18/18 3:08 AM, Paolo Valente wrote:
> 
> 
>> Il giorno 18 apr 2018, alle ore 00:57, Jens Axboe  ha 
>> scritto:
>>
>> On 4/17/18 3:48 PM, Jens Axboe wrote:
>>> On 4/17/18 3:47 PM, Kees Cook wrote:
>>>> On Tue, Apr 17, 2018 at 2:39 PM, Jens Axboe  wrote:
>>>>> On 4/17/18 3:25 PM, Kees Cook wrote:
>>>>>> On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook  wrote:
>>>>>>> I see elv.priv[1] assignments made in a few places -- is it possible
>>>>>>> there is some kind of uninitialized-but-not-NULL state that can leak
>>>>>>> in there?
>>>>>>
>>>>>> Got it. This fixes it for me:
>>>>>>
>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>>>> index 0dc9e341c2a7..859df3160303 100644
>>>>>> --- a/block/blk-mq.c
>>>>>> +++ b/block/blk-mq.c
>>>>>> @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct
>>>>>> request_queue *q,
>>>>>>
>>>>>>rq = blk_mq_rq_ctx_init(data, tag, op);
>>>>>>if (!op_is_flush(op)) {
>>>>>> -   rq->elv.icq = NULL;
>>>>>> +   memset(&rq->elv, 0, sizeof(rq->elv));
>>>>>>if (e && e->type->ops.mq.prepare_request) {
>>>>>>if (e->type->icq_cache && rq_ioc(bio))
>>>>>>blk_mq_sched_assign_ioc(rq, bio);
>>>>>> @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq)
>>>>>>e->type->ops.mq.finish_request(rq);
>>>>>>if (rq->elv.icq) {
>>>>>>put_io_context(rq->elv.icq->ioc);
>>>>>> -   rq->elv.icq = NULL;
>>>>>> +   memset(&rq->elv, 0, sizeof(rq->elv));
>>>>>>}
>>>>>>}
>>>>>
>>>>> This looks like a BFQ problem, this should not be necessary. Paolo,
>>>>> you're calling your own prepare request handler from the insert
>>>>> as well, and your prepare request does nothing if rq->elv.icq == NULL.
>>>>
>>>> I sent the patch anyway, since it's kind of a robustness improvement,
>>>> I'd hope. If you fix BFQ also, please add:
>>>
>>> It's also a memset() in the hot path, would prefer to avoid that...
>>> The issue here is really the convoluted bfq usage of insert/prepare,
>>> I'm sure Paolo can take it from here.
>>
> 
> Hi,
> I'm very sorry for tuning in very late, but, at the same time, very
> glad to find the problem probably already solved ;) (in this respect, I swear,
> my delay was not intentional)
> 
>> Does this fix it?
>>
>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
>> index f0ecd98509d8..d883469a1582 100644
>> --- a/block/bfq-iosched.c
>> +++ b/block/bfq-iosched.c
>> @@ -4934,8 +4934,11 @@ static void bfq_prepare_request(struct request *rq, 
>> struct bio *bio)
>>  bool new_queue = false;
>>  bool bfqq_already_existing = false, split = false;
>>
>> -if (!rq->elv.icq)
>> +if (!rq->elv.icq) {
>> +rq->elv.priv[0] = rq->elv.priv[1] = NULL;
>>  return;
>> +}
>> +
> 
> This does solve the problem at hand.  But it also arouses a question,
> related to a possible subtle bug.
> 
> For BFQ, !rq->elv.icq basically means "this request is not for me, as
> I am an icq-based scheduler".  But, IIUC the main points in this
> thread, then this assumption is false.  If it is actually false, then
> I hope that all requests with !rq->elv.icq that are sent to BFQ do
> verify the condition (at_head || blk_rq_is_passthrough(rq)).  In fact,
> requests that do not verify that condition are those that BFQ must put
> in a bfq_queue.  So, even if this patch makes the crash disappear, we
> drive BFQ completely crazy (and we may expect other strange failures)
> if we send BFQ a request with !((at_head || blk_rq_is_passthrough(rq))
> and !rq->elv.icq.  BFQ has to put that rq into a bfq_queue, but simply
> cannot.
> 
> Jens, or any other, could you please shed a light on this, and explain
> how things are exactly?

Your assumption is correct, however you set ->priv[0] and ->priv[1] for
requests, but only for ->elv.icq != NULL. So let's assume you get a
request and assign those two, request completes. Later on, you get
the same request, bypass insert it. BFQ doesn't clear the bic/bfqq
pointers in the request, since ->elv.icq == NULL. It gets inserted
into the dispatch list.

Then when __bfq_dispatch_request() is called, you do:

bfqq = RQ_BFQQ(rq);
if (bfqq)
bfqq->dispatched++;
[...]

which is wrong, since you don't know if you assigned a bfqq for this
request. The memory that bfqq points to could be long gone, if that
queue is freed. So you could either guard any bfqq/bic retrieval
with ->elv.icq != NULL, or you could just clear the pointers for
the case where the values aren't valid.

-- 
Jens Axboe



Re: usercopy whitelist woe in scsi_sense_cache

2018-04-17 Thread Jens Axboe
On 4/17/18 5:06 PM, Kees Cook wrote:
> On Tue, Apr 17, 2018 at 3:57 PM, Jens Axboe  wrote:
>> On 4/17/18 3:48 PM, Jens Axboe wrote:
>>> On 4/17/18 3:47 PM, Kees Cook wrote:
>>>> On Tue, Apr 17, 2018 at 2:39 PM, Jens Axboe  wrote:
>>>>> On 4/17/18 3:25 PM, Kees Cook wrote:
>>>>>> On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook  wrote:
>>>>>>> I see elv.priv[1] assignments made in a few places -- is it possible
>>>>>>> there is some kind of uninitialized-but-not-NULL state that can leak
>>>>>>> in there?
>>>>>>
>>>>>> Got it. This fixes it for me:
>>>>>>
>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>>>> index 0dc9e341c2a7..859df3160303 100644
>>>>>> --- a/block/blk-mq.c
>>>>>> +++ b/block/blk-mq.c
>>>>>> @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct
>>>>>> request_queue *q,
>>>>>>
>>>>>> rq = blk_mq_rq_ctx_init(data, tag, op);
>>>>>> if (!op_is_flush(op)) {
>>>>>> -   rq->elv.icq = NULL;
>>>>>> +   memset(&rq->elv, 0, sizeof(rq->elv));
>>>>>> if (e && e->type->ops.mq.prepare_request) {
>>>>>> if (e->type->icq_cache && rq_ioc(bio))
>>>>>> blk_mq_sched_assign_ioc(rq, bio);
>>>>>> @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq)
>>>>>> e->type->ops.mq.finish_request(rq);
>>>>>> if (rq->elv.icq) {
>>>>>> put_io_context(rq->elv.icq->ioc);
>>>>>> -   rq->elv.icq = NULL;
>>>>>> +   memset(&rq->elv, 0, sizeof(rq->elv));
>>>>>> }
>>>>>> }
>>>>>
>>>>> This looks like a BFQ problem, this should not be necessary. Paolo,
>>>>> you're calling your own prepare request handler from the insert
>>>>> as well, and your prepare request does nothing if rq->elv.icq == NULL.
>>>>
>>>> I sent the patch anyway, since it's kind of a robustness improvement,
>>>> I'd hope. If you fix BFQ also, please add:
>>>
>>> It's also a memset() in the hot path, would prefer to avoid that...
>>> The issue here is really the convoluted bfq usage of insert/prepare,
>>> I'm sure Paolo can take it from here.
>>
>> Does this fix it?
>>
>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
>> index f0ecd98509d8..d883469a1582 100644
>> --- a/block/bfq-iosched.c
>> +++ b/block/bfq-iosched.c
>> @@ -4934,8 +4934,11 @@ static void bfq_prepare_request(struct request *rq, 
>> struct bio *bio)
>> bool new_queue = false;
>> bool bfqq_already_existing = false, split = false;
>>
>> -   if (!rq->elv.icq)
>> +   if (!rq->elv.icq) {
>> +   rq->elv.priv[0] = rq->elv.priv[1] = NULL;
>> return;
>> +   }
>> +
>> bic = icq_to_bic(rq->elv.icq);
>>
>> spin_lock_irq(&bfqd->lock);
> 
> It does! Excellent. :)

Sweet! I'll add a comment and queue it up for 4.17 and mark for stable, with
your annotations too.

-- 
Jens Axboe



Re: [PATCH] blk-mq: Clear out elevator private data

2018-04-17 Thread Jens Axboe
On 4/17/18 4:57 PM, Kees Cook wrote:
> On Tue, Apr 17, 2018 at 2:45 PM, Jens Axboe  wrote:
>> On 4/17/18 3:42 PM, Kees Cook wrote:
>>> Some elevators may not correctly check rq->rq_flags & RQF_ELVPRIV, and
>>> may attempt to read rq->elv fields. When requests got reused, this
>>> caused BFQ to think it already had a bfqq (rq->elv.priv[1]) allocated.
>>> This could lead to odd behaviors like having the sense buffer address
>>> slowly start incrementing. This eventually tripped HARDENED_USERCOPY
>>> and KASAN.
>>>
>>> This patch wipes all of rq->elv instead of just rq->elv.icq. While
>>> it shouldn't technically be needed, this ends up being a robustness
>>> improvement that should lead to at least finding bugs in elevators faster.
>>
>> Comments from the other email still apply, we should not need to do this
>> full memset() for every request. From a quick look, BFQ needs to straighten
>> out its usage of prepare request and interactions with insert_request.
> 
> Sure, understood. I would point out, FWIW, that memset() gets unrolled
> by the compiler and this is just two more XORs in the same cacheline
> (the two words following icq). (And there is SO much more being
> cleared during alloc, it didn't seem like hardly any extra cost vs the
> robustness it provided.)

Yeah, it's not super pricey, but it's not needed. BFQ is the user of
the members, and the one that assigns them. You're saying leftover
assignments, since it doesn't always assign them. Hence I think that's
a better fix, just sent out a test patch a few minutes ago.

You did all the hard work, I'm just coasting on your findings.

-- 
Jens Axboe



Re: usercopy whitelist woe in scsi_sense_cache

2018-04-17 Thread Jens Axboe
On 4/17/18 3:48 PM, Jens Axboe wrote:
> On 4/17/18 3:47 PM, Kees Cook wrote:
>> On Tue, Apr 17, 2018 at 2:39 PM, Jens Axboe  wrote:
>>> On 4/17/18 3:25 PM, Kees Cook wrote:
>>>> On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook  wrote:
>>>>> I see elv.priv[1] assignments made in a few places -- is it possible
>>>>> there is some kind of uninitialized-but-not-NULL state that can leak
>>>>> in there?
>>>>
>>>> Got it. This fixes it for me:
>>>>
>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>> index 0dc9e341c2a7..859df3160303 100644
>>>> --- a/block/blk-mq.c
>>>> +++ b/block/blk-mq.c
>>>> @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct
>>>> request_queue *q,
>>>>
>>>> rq = blk_mq_rq_ctx_init(data, tag, op);
>>>> if (!op_is_flush(op)) {
>>>> -   rq->elv.icq = NULL;
>>>> +   memset(&rq->elv, 0, sizeof(rq->elv));
>>>> if (e && e->type->ops.mq.prepare_request) {
>>>> if (e->type->icq_cache && rq_ioc(bio))
>>>> blk_mq_sched_assign_ioc(rq, bio);
>>>> @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq)
>>>> e->type->ops.mq.finish_request(rq);
>>>> if (rq->elv.icq) {
>>>> put_io_context(rq->elv.icq->ioc);
>>>> -   rq->elv.icq = NULL;
>>>> +   memset(&rq->elv, 0, sizeof(rq->elv));
>>>> }
>>>> }
>>>
>>> This looks like a BFQ problem, this should not be necessary. Paolo,
>>> you're calling your own prepare request handler from the insert
>>> as well, and your prepare request does nothing if rq->elv.icq == NULL.
>>
>> I sent the patch anyway, since it's kind of a robustness improvement,
>> I'd hope. If you fix BFQ also, please add:
> 
> It's also a memset() in the hot path, would prefer to avoid that...
> The issue here is really the convoluted bfq usage of insert/prepare,
> I'm sure Paolo can take it from here.

Does this fix it?

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index f0ecd98509d8..d883469a1582 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4934,8 +4934,11 @@ static void bfq_prepare_request(struct request *rq, 
struct bio *bio)
bool new_queue = false;
bool bfqq_already_existing = false, split = false;
 
-   if (!rq->elv.icq)
+   if (!rq->elv.icq) {
+   rq->elv.priv[0] = rq->elv.priv[1] = NULL;
return;
+   }
+
bic = icq_to_bic(rq->elv.icq);
 
spin_lock_irq(&bfqd->lock);

-- 
Jens Axboe



Re: usercopy whitelist woe in scsi_sense_cache

2018-04-17 Thread Jens Axboe
On 4/17/18 3:47 PM, Kees Cook wrote:
> On Tue, Apr 17, 2018 at 2:39 PM, Jens Axboe  wrote:
>> On 4/17/18 3:25 PM, Kees Cook wrote:
>>> On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook  wrote:
>>>> I see elv.priv[1] assignments made in a few places -- is it possible
>>>> there is some kind of uninitialized-but-not-NULL state that can leak
>>>> in there?
>>>
>>> Got it. This fixes it for me:
>>>
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index 0dc9e341c2a7..859df3160303 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct
>>> request_queue *q,
>>>
>>> rq = blk_mq_rq_ctx_init(data, tag, op);
>>> if (!op_is_flush(op)) {
>>> -   rq->elv.icq = NULL;
>>> +   memset(&rq->elv, 0, sizeof(rq->elv));
>>> if (e && e->type->ops.mq.prepare_request) {
>>> if (e->type->icq_cache && rq_ioc(bio))
>>> blk_mq_sched_assign_ioc(rq, bio);
>>> @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq)
>>> e->type->ops.mq.finish_request(rq);
>>> if (rq->elv.icq) {
>>> put_io_context(rq->elv.icq->ioc);
>>> -   rq->elv.icq = NULL;
>>> +   memset(&rq->elv, 0, sizeof(rq->elv));
>>> }
>>> }
>>
>> This looks like a BFQ problem, this should not be necessary. Paolo,
>> you're calling your own prepare request handler from the insert
>> as well, and your prepare request does nothing if rq->elv.icq == NULL.
> 
> I sent the patch anyway, since it's kind of a robustness improvement,
> I'd hope. If you fix BFQ also, please add:

It's also a memset() in the hot path, would prefer to avoid that...
The issue here is really the convoluted bfq usage of insert/prepare,
I'm sure Paolo can take it from here.

-- 
Jens Axboe



Re: [PATCH] blk-mq: Clear out elevator private data

2018-04-17 Thread Jens Axboe
On 4/17/18 3:42 PM, Kees Cook wrote:
> Some elevators may not correctly check rq->rq_flags & RQF_ELVPRIV, and
> may attempt to read rq->elv fields. When requests got reused, this
> caused BFQ to think it already had a bfqq (rq->elv.priv[1]) allocated.
> This could lead to odd behaviors like having the sense buffer address
> slowly start incrementing. This eventually tripped HARDENED_USERCOPY
> and KASAN.
> 
> This patch wipes all of rq->elv instead of just rq->elv.icq. While
> it shouldn't technically be needed, this ends up being a robustness
> improvement that should lead to at least finding bugs in elevators faster.

Comments from the other email still apply, we should not need to do this
full memset() for every request. From a quick look, BFQ needs to straighten
out its usage of prepare request and interactions with insert_request.

> Reported-by: Oleksandr Natalenko 
> Fixes: bd166ef183c26 ("blk-mq-sched: add framework for MQ capable IO 
> schedulers")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Kees Cook 
> ---
> In theory, BFQ needs to also check the RQF_ELVPRIV flag, but I'll leave that
> to Paolo to figure out. Also, my Fixes line is kind of a best-guess. This
> is where icq was originally wiped, so it seemed as good a commit as any.

Yeah, that's probably a bit too broad for fixes :-)

-- 
Jens Axboe



Re: usercopy whitelist woe in scsi_sense_cache

2018-04-17 Thread Jens Axboe
On 4/17/18 3:25 PM, Kees Cook wrote:
> On Tue, Apr 17, 2018 at 1:46 PM, Kees Cook  wrote:
>> I see elv.priv[1] assignments made in a few places -- is it possible
>> there is some kind of uninitialized-but-not-NULL state that can leak
>> in there?
> 
> Got it. This fixes it for me:
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0dc9e341c2a7..859df3160303 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -363,7 +363,7 @@ static struct request *blk_mq_get_request(struct
> request_queue *q,
> 
> rq = blk_mq_rq_ctx_init(data, tag, op);
> if (!op_is_flush(op)) {
> -   rq->elv.icq = NULL;
> +   memset(&rq->elv, 0, sizeof(rq->elv));
> if (e && e->type->ops.mq.prepare_request) {
> if (e->type->icq_cache && rq_ioc(bio))
> blk_mq_sched_assign_ioc(rq, bio);
> @@ -461,7 +461,7 @@ void blk_mq_free_request(struct request *rq)
> e->type->ops.mq.finish_request(rq);
> if (rq->elv.icq) {
> put_io_context(rq->elv.icq->ioc);
> -   rq->elv.icq = NULL;
> +   memset(&rq->elv, 0, sizeof(rq->elv));
> }
> }

This looks like a BFQ problem, this should not be necessary. Paolo,
you're calling your own prepare request handler from the insert
as well, and your prepare request does nothing if rq->elv.icq == NULL.

-- 
Jens Axboe



Re: usercopy whitelist woe in scsi_sense_cache

2018-04-17 Thread Jens Axboe
On 4/17/18 2:25 PM, Kees Cook wrote:
> On Tue, Apr 17, 2018 at 1:20 PM, Kees Cook  wrote:
>> On Tue, Apr 17, 2018 at 1:03 PM, Kees Cook  wrote:
>>> The above bfq_dispatch_request+0x99/0xad0 is still
>>> __bfq_dispatch_request at block/bfq-iosched.c:3902, just with KASAN
>>> removed. 0x99 is 153 decimal:
>>>
>>> (gdb) disass bfq_dispatch_request
>>> Dump of assembler code for function bfq_dispatch_request:
>>> ...
>>>0x8134b2ad <+141>:   test   %rax,%rax
>>>0x8134b2b0 <+144>:   je 0x8134b2bd
>>> 
>>>0x8134b2b2 <+146>:   addl   $0x1,0x100(%rax)
>>>0x8134b2b9 <+153>:   addl   $0x1,0x3c(%rbx)
>>>0x8134b2bd <+157>:   orl$0x2,0x18(%r12)
>>>0x8134b2c3 <+163>:   test   %ebp,%ebp
>>>0x8134b2c5 <+165>:   je 0x8134b2ce
>>> 
>>>0x8134b2c7 <+167>:   mov0x108(%r14),%rax
>>>0x8134b2ce <+174>:   mov%r15,%rdi
>>>0x8134b2d1 <+177>:   callq  0x81706f90 
>>> <_raw_spin_unlock_irq>
>>>
>>> Just as a sanity-check, at +157 %r12 should be rq, rq_flags is 0x18
>>> offset from, $0x2 is RQF_STARTED, so that maps to "rq->rq_flags |=
>>> RQF_STARTED", the next C statement. I don't know what +146 is, though?
>>> An increment of something 256 bytes offset? There's a lot of inline
>>> fun and reordering happening here, so I'm ignoring that for the
>>> moment.
>>
>> No -- I'm reading this wrong. The RIP is the IP _after_ the trap, so
>> +146 is the offender.
>>
>> [   29.284746] watchpoint @ 95d41a0fe580 triggered
>> [   29.285349] sense before:95d41f45f700 after:95d41f45f701 
>> (@95d41a
>> 0fe580)
>> [   29.286176] elevator before:95d419419c00 after:95d419419c00
>> [   29.286847] elevator_data before:95d419418c00 after:95d419418c00
>> ...
>> [   29.295069] RIP: 0010:bfq_dispatch_request+0x99/0xbb0
>> [   29.295622] RSP: 0018:b26e01707a40 EFLAGS: 0002
>> [   29.296181] RAX: 95d41a0fe480 RBX: 95d419418c00 RCX: 
>> 95d419418c08
>>
>> RAX is 95d41a0fe480 and sense is stored at 95d41a0fe580,
>> exactly 0x100 away.
>>
>> WTF is this addl?
> 
> What are the chances? :P Two ++ statements in a row separated by a
> collapsed goto. FML. :)
> 
> ...
> bfqq->dispatched++;
>     goto inc_in_driver_start_rq;
> ...
> inc_in_driver_start_rq:
> bfqd->rq_in_driver++;
> ...
> 
> And there's the 0x100 (256):
> 
> struct bfq_queue {
> ...
> intdispatched;   /*   256 4 */
> 
> So bfqq is corrupted somewhere... I'll keep digging. I hope you're all
> enjoying my live debugging transcript. ;)

It has to be the latter bfqq->dispatched increment, as those are
transient (and bfqd is not).

Adding Paolo.

-- 
Jens Axboe



Re: usercopy whitelist woe in scsi_sense_cache

2018-04-17 Thread Jens Axboe
On 4/17/18 10:42 AM, Kees Cook wrote:
> On Mon, Apr 16, 2018 at 8:12 PM, Kees Cook  wrote:
>> With a hardware watchpoint, I've isolated the corruption to here:
>>
>> bfq_dispatch_request+0x2be/0x1610:
>> __bfq_dispatch_request at block/bfq-iosched.c:3902
>> 3900if (rq) {
>> 3901inc_in_driver_start_rq:
>> 3902bfqd->rq_in_driver++;
>> 3903start_rq:
>> 3904rq->rq_flags |= RQF_STARTED;
>> 3905}
> 
> FWIW, the stacktrace here (removing the ? lines) is:
> 
> [   34.311980] RIP: 0010:bfq_dispatch_request+0x2be/0x1610
> [   34.452491]  blk_mq_do_dispatch_sched+0x1d9/0x260
> [   34.454561]  blk_mq_sched_dispatch_requests+0x3da/0x4b0
> [   34.458789]  __blk_mq_run_hw_queue+0xae/0x130
> [   34.460001]  __blk_mq_delay_run_hw_queue+0x192/0x280
> [   34.460823]  blk_mq_run_hw_queue+0x10b/0x1b0
> [   34.463240]  blk_mq_sched_insert_request+0x3bd/0x4d0
> [   34.467342]  blk_execute_rq+0xcf/0x140
> [   34.468483]  sg_io+0x2f7/0x730
> 
> Can anyone tell me more about the memory allocation layout of the
> various variables here? It looks like struct request is a header in
> front of struct scsi_request? How do struct elevator_queue, struct
> blk_mq_ctx, and struct blk_mq_hw_ctx overlap these?

The scsi_request is a payload item for the block request, it's
located right after the request in memory. These are persistent
allocations, we don't allocate/free them per IO.

blk_mq_ctx are the blk-mq software queues, they are percpu and
allocated when the queue is setup.

blk_mq_hw_ctx is the hardware queue. You probably have just one,
it's allocated when the queue is setup.

struct elevator_queue is allocated when the scheduler is attached
to the queue. This can get freed and allocated if you switch
the scheduler on a queue, otherwise it persists until the queue
is torn down (and the scheduler data is freed).

> Regardless, I'll check for elevator data changing too...

It should not change unless you switch IO schedulers. If you're
using BFQ and not switching, then it won't change.

-- 
Jens Axboe



sr: get/drop reference to device in revalidate and check_events

2018-04-11 Thread Jens Axboe
We can't just use scsi_cd() to get the scsi_cd structure, we have
to grab a live reference to the device. For both callbacks, we're
not inside an open where we already hold a reference to the device.

This fixes device removal/addition under concurrent device access,
which otherwise could result in the below oops.

NULL pointer dereference at 0010
PGD 0 P4D 0 
Oops:  [#1] PREEMPT SMP
Modules linked in:
sr 12:0:0:0: [sr2] scsi-1 drive
 scsi_debug crc_t10dif crct10dif_generic crct10dif_common nvme nvme_core 
sb_edac xl
sr 12:0:0:0: Attached scsi CD-ROM sr2
 sr_mod cdrom btrfs xor zstd_decompress zstd_compress xxhash lzo_compress 
zlib_defc
sr 12:0:0:0: Attached scsi generic sg7 type 5
 igb ahci libahci i2c_algo_bit libata dca [last unloaded: crc_t10dif]
CPU: 43 PID: 4629 Comm: systemd-udevd Not tainted 4.16.0+ #650
Hardware name: Dell Inc. PowerEdge T630/0NT78X, BIOS 2.3.4 11/09/2016
RIP: 0010:sr_block_revalidate_disk+0x23/0x190 [sr_mod]
RSP: 0018:883ff357bb58 EFLAGS: 00010292
RAX: a00b07d0 RBX: 883ff3058000 RCX: 883ff357bb66
RDX: 0003 RSI: 7530 RDI: 881fea631000
RBP:  R08: 881fe4d38400 R09: 
R10:  R11: 01b6 R12: 085d
R13: 085d R14: 883ffd9b3790 R15: 
FS:  7f7dc8e6d8c0() GS:883fff34() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 0010 CR3: 003ffda98005 CR4: 003606e0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
Call Trace:
 ? __invalidate_device+0x48/0x60
 check_disk_change+0x4c/0x60
 sr_block_open+0x16/0xd0 [sr_mod]
 __blkdev_get+0xb9/0x450
 ? iget5_locked+0x1c0/0x1e0
 blkdev_get+0x11e/0x320
 ? bdget+0x11d/0x150
 ? _raw_spin_unlock+0xa/0x20
 ? bd_acquire+0xc0/0xc0
 do_dentry_open+0x1b0/0x320
 ? inode_permission+0x24/0xc0
 path_openat+0x4e6/0x1420
 ? cpumask_any_but+0x1f/0x40
 ? flush_tlb_mm_range+0xa0/0x120
 do_filp_open+0x8c/0xf0
 ? __seccomp_filter+0x28/0x230
 ? _raw_spin_unlock+0xa/0x20
 ? __handle_mm_fault+0x7d6/0x9b0
 ? list_lru_add+0xa8/0xc0
 ? _raw_spin_unlock+0xa/0x20
 ? __alloc_fd+0xaf/0x160
 ? do_sys_open+0x1a6/0x230
 do_sys_open+0x1a6/0x230
 do_syscall_64+0x5a/0x100
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Signed-off-by: Jens Axboe 

---

diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 0cf25d789d05..3f3cb72e0c0c 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -587,18 +587,28 @@ static int sr_block_ioctl(struct block_device *bdev, 
fmode_t mode, unsigned cmd,
 static unsigned int sr_block_check_events(struct gendisk *disk,
  unsigned int clearing)
 {
-   struct scsi_cd *cd = scsi_cd(disk);
+   unsigned int ret = 0;
+   struct scsi_cd *cd;
 
-   if (atomic_read(&cd->device->disk_events_disable_depth))
+   cd = scsi_cd_get(disk);
+   if (!cd)
return 0;
 
-   return cdrom_check_events(&cd->cdi, clearing);
+   if (!atomic_read(&cd->device->disk_events_disable_depth))
+   ret = cdrom_check_events(&cd->cdi, clearing);
+
+   scsi_cd_put(cd);
+   return ret;
 }
 
 static int sr_block_revalidate_disk(struct gendisk *disk)
 {
-   struct scsi_cd *cd = scsi_cd(disk);
struct scsi_sense_hdr sshdr;
+   struct scsi_cd *cd;
+
+   cd = scsi_cd_get(disk);
+   if (!cd)
+   return -ENXIO;
 
/* if the unit is not ready, nothing more to do */
if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
@@ -607,6 +617,7 @@ static int sr_block_revalidate_disk(struct gendisk *disk)
sr_cd_check(&cd->cdi);
get_sectorsize(cd);
 out:
+   scsi_cd_put(cd);
return 0;
 }
 

-- 
Jens Axboe



Re: Hotplug

2018-04-11 Thread Jens Axboe
On 4/11/18 10:14 AM, Jan Kara wrote:
> Hello,
> 
> On Wed 11-04-18 08:11:13, Jens Axboe wrote:
>> On 4/11/18 7:58 AM, Jan Kara wrote:
>>> On Tue 10-04-18 11:17:46, Jens Axboe wrote:
>>>> Been running some tests and I keep running into issues with hotplug.
>>>> This looks similar to what Bart posted the other day, but it looks
>>>> more deeply rooted than just having to protect the queue in
>>>> generic_make_request_checks(). The test run is blktests,
>>>> block/001. Current -git doesn't survive it. I've seen at least two
>>>> different oopses, pasted below.
>>>>
>>>> [  102.163442] NULL pointer dereference at 0010
>>>> [  102.163444] PGD 0 P4D 0 
>>>> [  102.163447] Oops:  [#1] PREEMPT SMP
>>>> [  102.163449] Modules linked in:
>>>> [  102.175540] sr 12:0:0:0: [sr2] scsi-1 drive
>>>> [  102.180112]  scsi_debug crc_t10dif crct10dif_generic crct10dif_common 
>>>> nvme nvme_core sb_edac xl
>>>> [  102.186934] sr 12:0:0:0: Attached scsi CD-ROM sr2
>>>> [  102.191896]  sr_mod cdrom btrfs xor zstd_decompress zstd_compress 
>>>> xxhash lzo_compress zlib_defc
>>>> [  102.197169] sr 12:0:0:0: Attached scsi generic sg7 type 5
>>>> [  102.203475]  igb ahci libahci i2c_algo_bit libata dca [last unloaded: 
>>>> crc_t10dif]
>>>> [  102.203484] CPU: 43 PID: 4629 Comm: systemd-udevd Not tainted 4.16.0+ 
>>>> #650
>>>> [  102.203487] Hardware name: Dell Inc. PowerEdge T630/0NT78X, BIOS 2.3.4 
>>>> 11/09/2016
>>>> [  102.350882] RIP: 0010:sr_block_revalidate_disk+0x23/0x190 [sr_mod]
>>>> [  102.358299] RSP: 0018:883ff357bb58 EFLAGS: 00010292
>>>> [  102.364734] RAX: a00b07d0 RBX: 883ff3058000 RCX: 
>>>> 883ff357bb66
>>>> [  102.373220] RDX: 0003 RSI: 7530 RDI: 
>>>> 881fea631000
>>>> [  102.381705] RBP:  R08: 881fe4d38400 R09: 
>>>> 
>>>> [  102.390185] R10:  R11: 01b6 R12: 
>>>> 085d
>>>> [  102.398671] R13: 085d R14: 883ffd9b3790 R15: 
>>>> 
>>>> [  102.407156] FS:  7f7dc8e6d8c0() GS:883fff34() 
>>>> knlGS:
>>>> [  102.417138] CS:  0010 DS:  ES:  CR0: 80050033
>>>> [  102.424066] CR2: 0010 CR3: 003ffda98005 CR4: 
>>>> 003606e0
>>>> [  102.432545] DR0:  DR1:  DR2: 
>>>> 
>>>> [  102.441024] DR3:  DR6: fffe0ff0 DR7: 
>>>> 0400
>>>> [  102.449502] Call Trace:
>>>> [  102.452744]  ? __invalidate_device+0x48/0x60
>>>> [  102.458022]  check_disk_change+0x4c/0x60
>>>> [  102.462900]  sr_block_open+0x16/0xd0 [sr_mod]
>>>> [  102.468270]  __blkdev_get+0xb9/0x450
>>>> [  102.472774]  ? iget5_locked+0x1c0/0x1e0
>>>> [  102.477568]  blkdev_get+0x11e/0x320
>>>> [  102.481969]  ? bdget+0x11d/0x150
>>>> [  102.486083]  ? _raw_spin_unlock+0xa/0x20
>>>> [  102.490968]  ? bd_acquire+0xc0/0xc0
>>>> [  102.495368]  do_dentry_open+0x1b0/0x320
>>>> [  102.500159]  ? inode_permission+0x24/0xc0
>>>> [  102.505140]  path_openat+0x4e6/0x1420
>>>> [  102.509741]  ? cpumask_any_but+0x1f/0x40
>>>> [  102.514630]  ? flush_tlb_mm_range+0xa0/0x120
>>>> [  102.519903]  do_filp_open+0x8c/0xf0
>>>> [  102.524305]  ? __seccomp_filter+0x28/0x230
>>>> [  102.529389]  ? _raw_spin_unlock+0xa/0x20
>>>> [  102.534283]  ? __handle_mm_fault+0x7d6/0x9b0
>>>> [  102.539559]  ? list_lru_add+0xa8/0xc0
>>>> [  102.544157]  ? _raw_spin_unlock+0xa/0x20
>>>> [  102.549047]  ? __alloc_fd+0xaf/0x160
>>>> [  102.553549]  ? do_sys_open+0x1a6/0x230
>>>> [  102.558244]  do_sys_open+0x1a6/0x230
>>>> [  102.562742]  do_syscall_64+0x5a/0x100
>>>> [  102.567336]  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
>>>
>>> Interesting. Thinking out loud: This is cd->device dereference I guess
>>> which means disk->private_data was NULL. That gets set in sr_probe()
>>> together with disk->fops which are certainly set as they must have led us
>>> to the crashing function sr_block_revalidate_disk(). So likely
>>> disk->private_data go

Re: Hotplug

2018-04-11 Thread Jens Axboe
On 4/11/18 7:58 AM, Jan Kara wrote:
> Hi,
> 
> On Tue 10-04-18 11:17:46, Jens Axboe wrote:
>> Been running some tests and I keep running into issues with hotplug.
>> This looks similar to what Bart posted the other day, but it looks
>> more deeply rooted than just having to protect the queue in
>> generic_make_request_checks(). The test run is blktests,
>> block/001. Current -git doesn't survive it. I've seen at least two
>> different oopses, pasted below.
>>
>> [  102.163442] NULL pointer dereference at 0010
>> [  102.163444] PGD 0 P4D 0 
>> [  102.163447] Oops:  [#1] PREEMPT SMP
>> [  102.163449] Modules linked in:
>> [  102.175540] sr 12:0:0:0: [sr2] scsi-1 drive
>> [  102.180112]  scsi_debug crc_t10dif crct10dif_generic crct10dif_common 
>> nvme nvme_core sb_edac xl
>> [  102.186934] sr 12:0:0:0: Attached scsi CD-ROM sr2
>> [  102.191896]  sr_mod cdrom btrfs xor zstd_decompress zstd_compress xxhash 
>> lzo_compress zlib_defc
>> [  102.197169] sr 12:0:0:0: Attached scsi generic sg7 type 5
>> [  102.203475]  igb ahci libahci i2c_algo_bit libata dca [last unloaded: 
>> crc_t10dif]
>> [  102.203484] CPU: 43 PID: 4629 Comm: systemd-udevd Not tainted 4.16.0+ #650
>> [  102.203487] Hardware name: Dell Inc. PowerEdge T630/0NT78X, BIOS 2.3.4 
>> 11/09/2016
>> [  102.350882] RIP: 0010:sr_block_revalidate_disk+0x23/0x190 [sr_mod]
>> [  102.358299] RSP: 0018:883ff357bb58 EFLAGS: 00010292
>> [  102.364734] RAX: a00b07d0 RBX: 883ff3058000 RCX: 
>> 883ff357bb66
>> [  102.373220] RDX: 0003 RSI: 7530 RDI: 
>> 881fea631000
>> [  102.381705] RBP:  R08: 881fe4d38400 R09: 
>> 
>> [  102.390185] R10:  R11: 01b6 R12: 
>> 085d
>> [  102.398671] R13: 085d R14: 883ffd9b3790 R15: 
>> 
>> [  102.407156] FS:  7f7dc8e6d8c0() GS:883fff34() 
>> knlGS:
>> [  102.417138] CS:  0010 DS:  ES:  CR0: 80050033
>> [  102.424066] CR2: 0010 CR3: 003ffda98005 CR4: 
>> 003606e0
>> [  102.432545] DR0:  DR1:  DR2: 
>> 
>> [  102.441024] DR3:  DR6: fffe0ff0 DR7: 
>> 0400
>> [  102.449502] Call Trace:
>> [  102.452744]  ? __invalidate_device+0x48/0x60
>> [  102.458022]  check_disk_change+0x4c/0x60
>> [  102.462900]  sr_block_open+0x16/0xd0 [sr_mod]
>> [  102.468270]  __blkdev_get+0xb9/0x450
>> [  102.472774]  ? iget5_locked+0x1c0/0x1e0
>> [  102.477568]  blkdev_get+0x11e/0x320
>> [  102.481969]  ? bdget+0x11d/0x150
>> [  102.486083]  ? _raw_spin_unlock+0xa/0x20
>> [  102.490968]  ? bd_acquire+0xc0/0xc0
>> [  102.495368]  do_dentry_open+0x1b0/0x320
>> [  102.500159]  ? inode_permission+0x24/0xc0
>> [  102.505140]  path_openat+0x4e6/0x1420
>> [  102.509741]  ? cpumask_any_but+0x1f/0x40
>> [  102.514630]  ? flush_tlb_mm_range+0xa0/0x120
>> [  102.519903]  do_filp_open+0x8c/0xf0
>> [  102.524305]  ? __seccomp_filter+0x28/0x230
>> [  102.529389]  ? _raw_spin_unlock+0xa/0x20
>> [  102.534283]  ? __handle_mm_fault+0x7d6/0x9b0
>> [  102.539559]  ? list_lru_add+0xa8/0xc0
>> [  102.544157]  ? _raw_spin_unlock+0xa/0x20
>> [  102.549047]  ? __alloc_fd+0xaf/0x160
>> [  102.553549]  ? do_sys_open+0x1a6/0x230
>> [  102.558244]  do_sys_open+0x1a6/0x230
>> [  102.562742]  do_syscall_64+0x5a/0x100
>> [  102.567336]  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> 
> Interesting. Thinking out loud: This is cd->device dereference I guess
> which means disk->private_data was NULL. That gets set in sr_probe()
> together with disk->fops which are certainly set as they must have led us
> to the crashing function sr_block_revalidate_disk(). So likely
> disk->private_data got already cleared. That happens in sr_kref_release()
> and the fact that that function got called means struct scsi_cd went away -
> so sr_remove() must have been called as well. That all seems possible like:
> 
> CPU1  CPU2
> sr_probe()
>   __blkdev_get()
> disk = bdev_get_gendisk();
> 
> sr_remove()
>   del_gendisk()
>   ...
>   kref_put(&cd->kref, sr_kref_release);
> disk->private_data = NULL;
> put_disk(disk);
> kfree(cd);
> if (disk->fops->open) {
>   ret = disk->fops->open(bdev, mode); => sr_block_open
> check_disk_change(bdev);
>

Re: Hotplug

2018-04-11 Thread Jens Axboe
On 4/11/18 8:06 AM, Bart Van Assche wrote:
> On Wed, 2018-04-11 at 15:58 +0200, Jan Kara wrote:
>> I'm not really sure where this is crashing and 'Code' line is incomplete to
>> tell me.
> 
> Hello Jan,
> 
> The following patch should fix this crash:
> https://www.mail-archive.com/linux-block@vger.kernel.org/msg20209.html.

Yeah, I forgot the link in my reply, thanks.

-- 
Jens Axboe



Hotplug

2018-04-10 Thread Jens Axboe
[ 4677.021757]  block_read_full_page+0x216/0x330
[ 4677.027095]  ? check_disk_change+0x60/0x60
[ 4677.032144]  ? __atime_needs_update+0x7f/0x190
[ 4677.037576]  ? find_get_entry+0x98/0xf0
[ 4677.042329]  ? pagecache_get_page+0x30/0x220
[ 4677.047567]  ? touch_atime+0x27/0xb0
[ 4677.052029]  generic_file_read_iter+0x60b/0x970
[ 4677.057561]  ? _copy_to_user+0x56/0x70
[ 4677.062221]  ? __seccomp_filter+0x28/0x230
[ 4677.067269]  __vfs_read+0xd1/0x120
[ 4677.071539]  vfs_read+0x9b/0x140
[ 4677.075614]  ksys_read+0x45/0xa0
[ 4677.079686]  do_syscall_64+0x5a/0x100
[ 4677.084241]  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[ 4677.090345] RIP: 0033:0x7fe63d6741f0
[ 4677.094808] RSP: 002b:7ffcb901f918 EFLAGS: 0246 ORIG_RAX: 

[ 4677.104143] RAX: ffda RBX: 55b1645c33c0 RCX: 7fe63d6741f0
[ 4677.112584] RDX: 0400 RSI: 55b1645c33e8 RDI: 000f
[ 4677.121037] RBP: 55b1645c32b0 R08: 7fe63d65f008 R09: 0430
[ 4677.129489] R10: 55b1645c33d8 R11: 0246 R12: 
[ 4677.137934] R13: 0400 R14: 55b1645c3300 R15: 0400
[ 4677.146385] Code: 85 f6 74 4e 48 63 05 ab 33 d6 00 4c 8b bc c6 c8 02 00 00 
0f b7 43 14 f6 c4 0 
[ 4677.168785] RIP: blk_throtl_bio+0x45/0x9b0 RSP: 881ff0c8bb38


-- 
Jens Axboe



Re: PROBLEM: buffer overflow and kernel panic in mmc_ioctl_cdrom_read_data

2018-03-28 Thread Jens Axboe
On 3/28/18 11:07 AM, Piotr Gabriel Kosinski wrote:
> When trying to read raw data from a CD drive using CDROMREADRAW ioctl
> when a CD is not present, the kernel crashes with a stack corruption
> error in mmc_ioctl_cdrom_read_data.
> 
> From my (cursory) analysis it looks like the bug is caused by size
> mismatch between:
> - struct request_sense (64 bytes), used inside mmc_ioctl_cdrom_read_data
> - unsigned char[96], expected inside scsi_execute
> 
> When the request_sense struct is passed to the cdrom_read_block, which
> then ultimately calls scsi_execute, the struct gets overwritten and
> overrun in drivers/scsi/scsi_lib.c:289:
> 
> if (sense && rq->sense_len)
> memcpy(sense, rq->sense, SCSI_SENSE_BUFFERSIZE);
> 
> I have recompiled the module with a hacky fix which replaces (in
> mmc_ioctl_cdrom_read_data):
> 
> struct request_sense sense;
> 
> with
> 
> union {
> struct request_sense data;
> unsigned char buf[SCSI_SENSE_BUFFERSIZE];
> } sense;
> 
> and that fixes the problem completely. The ioctl returns ENOMEDIUM as 
> expected.

Thanks for debugging this. However, the scsi code looks a bit dangerous,
if it assumes that ->sense_len is >= SCSI_SENSE_BUFFERSIZE. I think the
correct fix would be to fix that assumption, and ensure that the path
of sr is correctly setting sense_len.

-- 
Jens Axboe



Re: General protection fault with use_blk_mq=1.

2018-03-28 Thread Jens Axboe
On 3/28/18 9:13 PM, Zephaniah E. Loss-Cutler-Hull wrote:
> On 03/28/2018 06:02 PM, Jens Axboe wrote:
>> On 3/28/18 5:03 PM, Zephaniah E. Loss-Cutler-Hull wrote:
>>> I am not subscribed to any of the lists on the To list here, please CC
>>> me on any replies.
>>>
>>> I am encountering a fairly consistent crash anywhere from 15 minutes to
> 12 hours after boot with scsi_mod.use_blk_mq=1 dm_mod.use_blk_mq=1.
>>> The crash looks like:
>>>
> 
>>>
>>> Looking through the code, I'd guess that this is dying inside
>>> blkg_rwstat_add, which calls percpu_counter_add_batch, which is what RIP
>>> is pointing at.
>>
>> Leaving the whole thing here for Paolo - it's crashing off insertion of
>> a request coming out of SG_IO. Don't think we've seen this BFQ failure
>> case before.
>>
>> You can mitigate this by switching the scsi-mq devices to mq-deadline
>> instead.
>>
> 
> I'm thinking that I should also be able to mitigate it by disabling
> CONFIG_DEBUG_BLK_CGROUP.
> 
> That should remove that entire chunk of code.
> 
> Of course, that won't help if this is actually a symptom of a bigger
> problem.

Yes, it's not a given that it will fully mask the issue at hand. But
turning off BFQ has a much higher chance of working for you.

This time actually CC'ing Paolo.


-- 
Jens Axboe



Re: General protection fault with use_blk_mq=1.

2018-03-28 Thread Jens Axboe
On 3/28/18 5:03 PM, Zephaniah E. Loss-Cutler-Hull wrote:
> I am not subscribed to any of the lists on the To list here, please CC
> me on any replies.
> 
> I am encountering a fairly consistent crash anywhere from 15 minutes to
> 12 hours after boot with scsi_mod.use_blk_mq=1 dm_mod.use_blk_mq=1.
> The crash looks like:
> 
> [ 5466.075993] general protection fault:  [#1] PREEMPT SMP PTI
> [ 5466.075997] Modules linked in: esp4 xfrm4_mode_tunnel fuse usblp
> uvcvideo pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O)
> ip6table_filter ip6_tables xt_tcpudp nf_conntrack_ipv4 nf_defrag_ipv4
> xt_conntrack nf_conntrack iptable_filter ip_tables x_tables intel_rapl
> joydev serio_raw wmi_bmof iwldvm iwlwifi shpchp kvm_intel kvm irqbypass
> autofs4 algif_skcipher nls_iso8859_1 nls_cp437 crc32_pclmul
> ghash_clmulni_intel
> [ 5466.076022] CPU: 3 PID: 10573 Comm: pool Tainted: G   O
> 4.15.13-f1-dirty #148
> [ 5466.076024] Hardware name: Hewlett-Packard HP EliteBook Folio
> 9470m/18DF, BIOS 68IBD Ver. F.44 05/22/2013
> [ 5466.076029] RIP: 0010:percpu_counter_add_batch+0x2b/0xb0
> [ 5466.076031] RSP: 0018:a556c47afb58 EFLAGS: 00010002
> [ 5466.076033] RAX: 95cda87ce018 RBX: 95cda87cdb68 RCX:
> 
> [ 5466.076034] RDX: 3fff RSI: 896495c4 RDI:
> 895b2bed
> [ 5466.076036] RBP: 3fff R08:  R09:
> 95cb7d5f8148
> [ 5466.076037] R10: 0200 R11:  R12:
> 0001
> [ 5466.076038] R13: 95cda87ce088 R14: 95cda6ebd100 R15:
> a556c47afc58
> [ 5466.076040] FS:  7f25f5305700() GS:95cdbeac()
> knlGS:
> [ 5466.076042] CS:  0010 DS:  ES:  CR0: 80050033
> [ 5466.076043] CR2: 7f25e807e0a8 CR3: 0003ed5a6001 CR4:
> 001606e0
> [ 5466.076044] Call Trace:
> [ 5466.076050]  bfqg_stats_update_io_add+0x58/0x100
> [ 5466.076055]  bfq_insert_requests+0xec/0xd80
> [ 5466.076059]  ? blk_rq_append_bio+0x8f/0xa0
> [ 5466.076061]  ? blk_rq_map_user_iov+0xc3/0x1d0
> [ 5466.076065]  blk_mq_sched_insert_request+0xa3/0x130
> [ 5466.076068]  blk_execute_rq+0x3a/0x50
> [ 5466.076070]  sg_io+0x197/0x3e0
> [ 5466.076073]  ? dput+0xca/0x210
> [ 5466.076077]  ? mntput_no_expire+0x11/0x1a0
> [ 5466.076079]  scsi_cmd_ioctl+0x289/0x400
> [ 5466.076082]  ? filename_lookup+0xe1/0x170
> [ 5466.076085]  sd_ioctl+0xc7/0x1a0
> [ 5466.076088]  blkdev_ioctl+0x4d4/0x8c0
> [ 5466.076091]  block_ioctl+0x39/0x40
> [ 5466.076094]  do_vfs_ioctl+0x92/0x5e0
> [ 5466.076097]  ? __fget+0x73/0xc0
> [ 5466.076099]  SyS_ioctl+0x74/0x80
> [ 5466.076102]  do_syscall_64+0x60/0x110
> [ 5466.076106]  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> [ 5466.076109] RIP: 0033:0x7f25f75fef47
> [ 5466.076110] RSP: 002b:7f25f53049a8 EFLAGS: 0246 ORIG_RAX:
> 0010
> [ 5466.076112] RAX: ffda RBX: 000c RCX:
> 7f25f75fef47
> [ 5466.076114] RDX: 7f25f53049b0 RSI: 2285 RDI:
> 000c
> [ 5466.076115] RBP: 0010 R08: 7f25e8007818 R09:
> 0200
> [ 5466.076116] R10: 0001 R11: 0246 R12:
> 
> [ 5466.076118] R13:  R14: 7f25f8a6b5e0 R15:
> 7f25e80173e0
> [ 5466.076120] Code: 41 55 49 89 fd bf 01 00 00 00 41 54 49 89 f4 55 89
> d5 53 e8 18 e1 bb ff 48 c7 c7 c4 95 64 89 e8 dc e9 fb ff 49 8b 45 20 48
> 63 d5 <65> 8b 18 48 63 db 4c 01 e3 48 39 d3 7d 0a f7 dd 48 63 ed 48 39
> [ 5466.076147] RIP: percpu_counter_add_batch+0x2b/0xb0 RSP: a556c47afb58
> [ 5466.076149] ---[ end trace 8d7eb80aafef4494 ]---
> [ 5466.670153] note: pool[10573] exited with preempt_count 2
> 
> (I only have the one instance right this minute as a result of not
> having remote syslog setup before now.)
> 
> This is clearly deep in the blk_mq code, and it goes away when I remove
> the use_blk_mq kernel command line parameters.
> 
> My next obvious step is to try and disable the load of the vbox modules.
> 
> I can include the full dmesg output if it would be helpful.
> 
> The system is an older HP Ultrabook, and the root partition is, sda1 (a
> SSD) -> a LUKS encrypted partition -> LVM -> BTRFS.
> 
> The kernel is a stock 4.15.11, however I only recently added the blk_mq
> options, so while I can state that I have seen this on multiple kernels
> in the 4.15.x series, I have not tested earlier kernels in this
> configuration.
> 
> Looking through the code, I'd guess that this is dying inside
> blkg_rwstat_add, which calls percpu_counter_add_batch, which is what RIP
> is pointing at.

Leaving the whole thing here for Paolo - it's crashing off insertion of
a request coming out of SG_IO. Don't think we've seen this BFQ failure
case before.

You can mitigate this by switching the scsi-mq devices to mq-deadline
instead.

-- 
Jens Axboe



Re: [PATCH 1/3] blk-mq: Allow PCI vector offset for mapping queues

2018-03-27 Thread Jens Axboe
On 3/27/18 9:39 AM, Keith Busch wrote:
> The PCI interrupt vectors intended to be associated with a queue may
> not start at 0; a driver may allocate pre_vectors for special use. This
> patch adds an offset parameter so blk-mq may find the intended affinity
> mask and updates all drivers using this API accordingly.

Looks good to me, I'll queue it up for 4.17.

-- 
Jens Axboe



Re: [PATCH V5 1/5] scsi: hpsa: fix selection of reply queue

2018-03-19 Thread Jens Axboe
On 3/19/18 5:48 AM, Bityutskiy, Artem wrote:
> On Tue, 2018-03-13 at 17:42 +0800, Ming Lei wrote:
>> From 84676c1f21 (genirq/affinity: assign vectors to all possible CPUs),
>> one msix vector can be created without any online CPU mapped, then one
>> command's completion may not be notified.
> 
> Hello Martin, James,
> 
> we have this patch-set and it fixes megaraid regression in v4.16. Do
> you plan to mege it to v4.16-rcX? I am worried - there seem to be no
> sight that this is going to me merged. They are not in the linux-next.

I'm assuming that Martin will eventually queue this up. But probably
for 4.17, then we can always flag it for a backport to stable once
it's been thoroughly tested.

-- 
Jens Axboe



Re: bsg-lib interface cleanup V2

2018-03-13 Thread Jens Axboe
On 3/13/18 9:28 AM, Christoph Hellwig wrote:
> Hi all,
> 
> this series cleans up various abuses of the bsg interfaces, and then
> splits bsg for SCSI passthrough from bsg for arbitrary transport
> passthrough.  This removes the scsi_request abuse in bsg-lib that is
> very confusing, and also makes sure we can sanity check the requests
> we get.  The current code will happily execute scsi commands against
> bsg-lib queues, and transport pass through against scsi nodes, without
> any indication to the driver that we are doing the wrong thing.

Applied for 4.17, thanks.

-- 
Jens Axboe



Re: [PATCH] cdrom: do not call check_disk_change() inside cdrom_open()

2018-03-09 Thread Jens Axboe
On 3/9/18 5:59 AM, Maurizio Lombardi wrote:
> when mounting an ISO filesystem sometimes (very rarely)
> the system hangs because of a race condition between two tasks.
> 
> PID: 6766   TASK: 88007b2a6dd0  CPU: 0   COMMAND: "mount"
>  #0 [880078447ae0] __schedule at 8168d605
>  #1 [880078447b48] schedule_preempt_disabled at 8168ed49
>  #2 [880078447b58] __mutex_lock_slowpath at 8168c995
>  #3 [880078447bb8] mutex_lock at 8168bdef
>  #4 [880078447bd0] sr_block_ioctl at a00b6818 [sr_mod]
>  #5 [880078447c10] blkdev_ioctl at 812fea50
>  #6 [880078447c70] ioctl_by_bdev at 8123a8b3
>  #7 [880078447c90] isofs_fill_super at a04fb1e1 [isofs]
>  #8 [880078447da8] mount_bdev at 81202570
>  #9 [880078447e18] isofs_mount at a04f9828 [isofs]
> #10 [880078447e28] mount_fs at 81202d09
> #11 [880078447e70] vfs_kern_mount at 8121ea8f
> #12 [880078447ea8] do_mount at 81220fee
> #13 [880078447f28] sys_mount at 812218d6
> #14 [880078447f80] system_call_fastpath at 81698c49
> RIP: 7fd9ea914e9a  RSP: 7ffd5d9bf648  RFLAGS: 00010246
> RAX: 00a5  RBX: 81698c49  RCX: 0010
> RDX: 7fd9ec2bc210  RSI: 7fd9ec2bc290  RDI: 7fd9ec2bcf30
> RBP:    R8:    R9: 0010
> R10: c0ed0001  R11: 0206  R12: 7fd9ec2bc040
> R13: 7fd9eb6b2380  R14: 7fd9ec2bc210  R15: 7fd9ec2bcf30
> ORIG_RAX: 00a5  CS: 0033  SS: 002b
> 
> This task was trying to mount the cdrom.  It allocated and configured a
> super_block struct and owned the write-lock for the super_block->s_umount
> rwsem. While exclusively owning the s_umount lock, it called
> sr_block_ioctl and waited to acquire the global sr_mutex lock.
> 
> PID: 6785   TASK: 880078720fb0  CPU: 0   COMMAND: "systemd-udevd"
>  #0 [880078417898] __schedule at 8168d605
>  #1 [880078417900] schedule at 8168dc59
>  #2 [880078417910] rwsem_down_read_failed at 8168f605
>  #3 [880078417980] call_rwsem_down_read_failed at 81328838
>  #4 [8800784179d0] down_read at 8168cde0
>  #5 [8800784179e8] get_super at 81201cc7
>  #6 [880078417a10] __invalidate_device at 8123a8de
>  #7 [880078417a40] flush_disk at 8123a94b
>  #8 [880078417a88] check_disk_change at 8123ab50
>  #9 [880078417ab0] cdrom_open at a00a29e1 [cdrom]
> #10 [880078417b68] sr_block_open at a00b6f9b [sr_mod]
> #11 [880078417b98] __blkdev_get at 8123ba86
> #12 [880078417bf0] blkdev_get at 8123bd65
> #13 [880078417c78] blkdev_open at 8123bf9b
> #14 [880078417c90] do_dentry_open at 811fc7f7
> #15 [880078417cd8] vfs_open at 811fc9cf
> #16 [880078417d00] do_last at 8120d53d
> #17 [880078417db0] path_openat at 8120e6b2
> #18 [880078417e48] do_filp_open at 8121082b
> #19 [880078417f18] do_sys_open at 811fdd33
> #20 [880078417f70] sys_open at 811fde4e
> #21 [880078417f80] system_call_fastpath at 81698c49
> RIP: 7f29438b0c20  RSP: 7ffc76624b78  RFLAGS: 00010246
> RAX: 0002  RBX: 81698c49  RCX: 
> RDX: 7f2944a5fa70  RSI: 000a0800  RDI: 7f2944a5fa70
> RBP: 7f2944a5f540   R8:    R9: 0020
> R10: 7f2943614c40  R11: 0246  R12: 811fde4e
> R13: 880078417f78  R14: 000c  R15: 7f2944a4b010
> ORIG_RAX: 0002  CS: 0033  SS: 002b
> 
> This task tried to open the cdrom device, the sr_block_open function
> acquired the global sr_mutex lock. The call to check_disk_change()
> then saw an event flag indicating a possible media change and tried
> to flush any cached data for the device.
> As part of the flush, it tried to acquire the super_block->s_umount
> lock associated with the cdrom device.
> This was the same super_block as created and locked by the previous task.
> 
> The first task acquires the s_umount lock and then the sr_mutex_lock;
> the second task acquires the sr_mutex_lock and then the s_umount lock.
> 
> This patch fixes the issue by moving check_disk_change() out of
> cdrom_open() and let the caller take care of it.

Looks good to me, applied.

-- 
Jens Axboe



Re: [PATCH V2 2/8] blk-mq: introduce BLK_MQ_F_GLOBAL_TAGS

2018-02-06 Thread Jens Axboe
On 2/5/18 8:20 AM, Ming Lei wrote:
> Quite a few HBAs(such as HPSA, megaraid, mpt3sas, ..) support multiple
> reply queues, but tags is often HBA wide.
> 
> These HBAs have switched to use pci_alloc_irq_vectors(PCI_IRQ_AFFINITY)
> for automatic affinity assignment.
> 
> Now 84676c1f21e8ff5(genirq/affinity: assign vectors to all possible CPUs)
> has been merged to V4.16-rc, and it is easy to allocate all offline CPUs
> for some irq vectors, this can't be avoided even though the allocation
> is improved.
> 
> So all these drivers have to avoid to ask HBA to complete request in
> reply queue which hasn't online CPUs assigned, and HPSA has been broken
> with v4.15+:
> 
>   https://marc.info/?l=linux-kernel&m=151748144730409&w=2
> 
> This issue can be solved generically and easily via blk_mq(scsi_mq) multiple
> hw queue by mapping each reply queue into hctx, but one tricky thing is
> the HBA wide(instead of hw queue wide) tags.
> 
> This patch is based on the following Hannes's patch:
> 
>   https://marc.info/?l=linux-block&m=149132580511346&w=2
> 
> One big difference with Hannes's is that this patch only makes the tags 
> sbitmap
> and active_queues data structure HBA wide, and others are kept as NUMA 
> locality,
> such as request, hctx, tags, ...
> 
> The following patch will support global tags on null_blk, also the performance
> data is provided, no obvious performance loss is observed when the whole
> hw queue depth is same.

GLOBAL implies that it's, strangely enough, global. That isn't really the
case. Why not call this BLK_MQ_F_HOST_TAGS or something like that? I'd
welcome better names, but global doesn't seem to be a great choice.

BLK_MQ_F_SET_TAGS?

-- 
Jens Axboe



Re: [PATCH V2 0/8] blk-mq/scsi-mq: support global tags & introduce force_blk_mq

2018-02-06 Thread Jens Axboe
On 2/5/18 8:20 AM, Ming Lei wrote:
> Hi All,
> 
> This patchset supports global tags which was started by Hannes originally:
> 
>   https://marc.info/?l=linux-block&m=149132580511346&w=2
> 
> Also introduce 'force_blk_mq' and 'host_tagset' to 'struct scsi_host_template',
> so that driver can avoid to support two IO paths(legacy and blk-mq), 
> especially
> recent discusion mentioned that SCSI_MQ will be enabled at default soon.
> 
>   https://marc.info/?l=linux-scsi&m=151727684915589&w=2
> 
> With the above changes, it should be easier to convert SCSI drivers'
> reply queue into blk-mq's hctx, then the automatic irq affinity issue can
> be solved easily, please see detailed description in commit log and the
> 8th patch for converting HPSA.
> 
> Also drivers may require to complete request on the submission CPU
> for avoiding hard/soft deadlock, which can be done easily with blk_mq
> too.
> 
>   https://marc.info/?t=15160185141&r=1&w=2
> 
> The 6th patch uses the introduced 'force_blk_mq' to fix virtio_scsi
> so that IO hang issue can be avoided inside legacy IO path, this issue is
> a bit generic, at least HPSA/virtio-scsi are found broken with v4.15+.
> 
> The 7th & 8th patch fixes HPSA's IO issue which is caused by the reply
> queue selection algorithm.
> 
> Laurence has verified that this patch makes HPSA working with the linus
> tree with this patchset.

Do you have any performance numbers for this patchset? I'd be curious
to know how big the hit is.

-- 
Jens Axboe



Re: [dm-devel] [PATCH v5] blk-mq: introduce BLK_STS_DEV_RESOURCE

2018-01-30 Thread Jens Axboe
On 1/30/18 8:27 PM, Bart Van Assche wrote:
> On Tue, 2018-01-30 at 20:22 -0700, Jens Axboe wrote:
>> On 1/30/18 8:21 PM, Bart Van Assche wrote:
>>> On Tue, 2018-01-30 at 20:17 -0700, Jens Axboe wrote:
>>>> BLK_STS_RESOURCE should always be safe to return, and it should work
>>>> the same as STS_DEV_RESOURCE, except it may cause an extra queue
>>>> run.
>>>>
>>>> Well written drivers should use STS_DEV_RESOURCE where it makes
>>>> sense.
>>>
>>> Hello Jens,
>>>
>>> I would appreciate it if other names would be chosen than BLK_STS_RESOURCE
>>> and BLK_STS_DEV_RESOURCE, e.g. names that directly refer to the fact that
>>> one of the two status codes causes the queue to be rerun and the other not.
>>> I'm afraid that the currently chosen names will cause confusion.
>>
>> DEV_RESOURCE is pretty clear I think, but I agree that STS_RESOURCE
>> could perhaps be better. STS_SYSTEM_RESOURCE? It makes the distinction
>> a bit clearer.
> 
> I'm concerned about both. A block driver can namely run out of device
> resources without receiving a notification if that resource becomes
> available again.

That is true, and that is why I don't want to driver to make guesses on
when to re-run. The saving grace here is that it should happen extremely
rarely. I went over this in a previous email. If it's not a rare
occurrence, then there's no other way around it than wiring up a
notification mechanism to kick the queue when the shortage clears.

One way to deal with that is making the assumption that other IO will
clear the issue. That means we can confine it to the blk-mq layer.
Similarly to how we currently sometimes have to track starvation across
hardware queues and restart queue X when Y completes IO, we may have to
have a broader scope of shortage. That would probably fix all but
pathological cases.

> In that case the appropriate return code is BLK_STS_RESOURCE. Hence my
> concern ...

But this isn't a new thing, the only real change here is making it so
that drivers don't have to think about that case.

-- 
Jens Axboe



Re: [dm-devel] [PATCH v5] blk-mq: introduce BLK_STS_DEV_RESOURCE

2018-01-30 Thread Jens Axboe
On 1/30/18 8:21 PM, Bart Van Assche wrote:
> On Tue, 2018-01-30 at 20:17 -0700, Jens Axboe wrote:
>> BLK_STS_RESOURCE should always be safe to return, and it should work
>> the same as STS_DEV_RESOURCE, except it may cause an extra queue
>> run.
>>
>> Well written drivers should use STS_DEV_RESOURCE where it makes
>> sense.
> 
> Hello Jens,
> 
> I would appreciate it if other names would be chosen than BLK_STS_RESOURCE
> and BLK_STS_DEV_RESOURCE, e.g. names that directly refer to the fact that
> one of the two status codes causes the queue to be rerun and the other not.
> I'm afraid that the currently chosen names will cause confusion.

DEV_RESOURCE is pretty clear I think, but I agree that STS_RESOURCE
could perhaps be better. STS_SYSTEM_RESOURCE? It makes the distinction
a bit clearer.

-- 
Jens Axboe



Re: [PATCH v6] blk-mq: introduce BLK_STS_DEV_RESOURCE

2018-01-30 Thread Jens Axboe
On 1/30/18 8:04 PM, Mike Snitzer wrote:
> From: Ming Lei 
> 
> This status is returned from driver to block layer if device related
> resource is unavailable, but driver can guarantee that IO dispatch
> will be triggered in future when the resource is available.
> 
> Convert some drivers to return BLK_STS_DEV_RESOURCE.  Also, if driver
> returns BLK_STS_RESOURCE and SCHED_RESTART is set, rerun queue after
> a delay (BLK_MQ_DELAY_QUEUE) to avoid IO stalls.  BLK_MQ_DELAY_QUEUE is
> 3 ms because both scsi-mq and nvmefc are using that magic value.
> 
> If a driver can make sure there is in-flight IO, it is safe to return
> BLK_STS_DEV_RESOURCE because:
> 
> 1) If all in-flight IOs complete before examining SCHED_RESTART in
> blk_mq_dispatch_rq_list(), SCHED_RESTART must be cleared, so queue
> is run immediately in this case by blk_mq_dispatch_rq_list();
> 
> 2) if there is any in-flight IO after/when examining SCHED_RESTART
> in blk_mq_dispatch_rq_list():
> - if SCHED_RESTART isn't set, queue is run immediately as handled in 1)
> - otherwise, this request will be dispatched after any in-flight IO is
>   completed via blk_mq_sched_restart()
> 
> 3) if SCHED_RESTART is set concurrently in context because of
> BLK_STS_RESOURCE, blk_mq_delay_run_hw_queue() will cover the above two
> cases and make sure IO hang can be avoided.
> 
> One invariant is that queue will be rerun if SCHED_RESTART is set.

Applied, thanks.

-- 
Jens Axboe



Re: [dm-devel] [PATCH v5] blk-mq: introduce BLK_STS_DEV_RESOURCE

2018-01-30 Thread Jens Axboe
On 1/30/18 10:52 AM, Bart Van Assche wrote:
> On 01/30/18 06:24, Mike Snitzer wrote:
>> + *
>> + * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
>> + * bit is set, run queue after a delay to avoid IO stalls
>> + * that could otherwise occur if the queue is idle.
>>   */
>> -if (!blk_mq_sched_needs_restart(hctx) ||
>> +needs_restart = blk_mq_sched_needs_restart(hctx);
>> +if (!needs_restart ||
>>  (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
>>  blk_mq_run_hw_queue(hctx, true);
>> +else if (needs_restart && (ret == BLK_STS_RESOURCE))
>> +blk_mq_delay_run_hw_queue(hctx, BLK_MQ_QUEUE_DELAY);
>>  }
> 
> If a request completes concurrently with execution of the above code 
> then the request completion will trigger a call of 
> blk_mq_sched_restart_hctx() and that call will clear the 
> BLK_MQ_S_SCHED_RESTART bit. If that bit is cleared before the above code 
> tests it then the above code will schedule an asynchronous call of 
> __blk_mq_run_hw_queue(). If the .queue_rq() call triggered by the new 
> queue run returns again BLK_STS_RESOURCE then the above code will be 
> executed again. In other words, a loop occurs. That loop will repeat as 
> long as the described race occurs. The current (kernel v4.15) block 
> layer behavior is simpler: only block drivers call 
> blk_mq_delay_run_hw_queue() and the block layer core never calls that 
> function. Hence that loop cannot occur with the v4.15 block layer core 
> and block drivers. A motivation of why that loop is preferred compared 
> to the current behavior (no loop) is missing. Does this mean that that 
> loop is a needless complication of the block layer core?

The dispatch, and later restart check, is within the hctx lock. The
completions should be as well.

> Sorry but I still prefer the v4.15 block layer approach because this 
> patch has in my view the following disadvantages:
> - It involves a blk-mq API change. API changes are always risky and need
>some time to stabilize.
> - The delay after which to rerun the queue is moved from block layer
>drivers into the block layer core. I think that's wrong because only
>the block driver authors can make a good choice for this constant.

It's exactly the right place to put it, as drivers cannot make a good
decision for when to run the queue again. You get NULL on allocating
some piece of memory, when do you run it again? That's the last thing
I want driver writers to make a decision on, because a novice device
driver writer will just think that he should run the queue again asap.
In reality we are screwed. Decisions like that SHOULD be in shared
and generic code, not in driver private code.

> - This patch makes block drivers harder to understand. Anyone who sees
>return BLK_STS_RESOURCE / return BLK_STS_DEV_RESOURCE for the first
>time will have to look up the meaning of these constants. An explicit
>blk_mq_delay_run_hw_queue() call is easier to understand.

BLK_STS_RESOURCE should always be safe to return, and it should work
the same as STS_DEV_RESOURCE, except it may cause an extra queue
run.

Well written drivers should use STS_DEV_RESOURCE where it makes
sense.

> - This patch does not fix any bugs nor makes block drivers easier to
>    read or to implement. So why is this patch considered useful?

It does fix the run bug on global resources, I'm assuming you mean
it doesn't fix any EXTRA bugs compared to just using the delayed
run?

-- 
Jens Axboe



Re: [PATCH v5] blk-mq: introduce BLK_STS_DEV_RESOURCE

2018-01-30 Thread Jens Axboe
On 1/30/18 7:24 AM, Mike Snitzer wrote:
> From: Ming Lei 
> 
> This status is returned from driver to block layer if device related
> resource is unavailable, but driver can guarantee that IO dispatch
> will be triggered in future when the resource is available.
> 
> Convert some drivers to return BLK_STS_DEV_RESOURCE.  Also, if driver
> returns BLK_STS_RESOURCE and SCHED_RESTART is set, rerun queue after
> a delay (BLK_MQ_DELAY_QUEUE) to avoid IO stalls.  BLK_MQ_DELAY_QUEUE is
> 3 ms because both scsi-mq and nvmefc are using that magic value.
> 
> If a driver can make sure there is in-flight IO, it is safe to return
> BLK_STS_DEV_RESOURCE because:
> 
> 1) If all in-flight IOs complete before examining SCHED_RESTART in
> blk_mq_dispatch_rq_list(), SCHED_RESTART must be cleared, so queue
> is run immediately in this case by blk_mq_dispatch_rq_list();
> 
> 2) if there is any in-flight IO after/when examining SCHED_RESTART
> in blk_mq_dispatch_rq_list():
> - if SCHED_RESTART isn't set, queue is run immediately as handled in 1)
> - otherwise, this request will be dispatched after any in-flight IO is
>   completed via blk_mq_sched_restart()
> 
> 3) if SCHED_RESTART is set concurrently in context because of
> BLK_STS_RESOURCE, blk_mq_delay_run_hw_queue() will cover the above two
> cases and make sure IO hang can be avoided.
> 
> One invariant is that queue will be rerun if SCHED_RESTART is set.

This looks pretty good to me. I'm waffling a bit on whether to retain
the current BLK_STS_RESOURCE behavior and name the new one something
else, but I do like using the DEV name in there to signify the
difference between a global and device resource.

Just a few small nits below - can you roll a v6 with the changes?

> diff --git a/block/blk-core.c b/block/blk-core.c
> index cdae69be68e9..38279d4ae08b 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -145,6 +145,7 @@ static const struct {
>   [BLK_STS_MEDIUM]= { -ENODATA,   "critical medium" },
>   [BLK_STS_PROTECTION]= { -EILSEQ,"protection" },
>   [BLK_STS_RESOURCE]  = { -ENOMEM,"kernel resource" },
> + [BLK_STS_DEV_RESOURCE]  = { -ENOMEM,"device resource" },
>   [BLK_STS_AGAIN] = { -EAGAIN,"nonblocking retry" },

I think we should make BLK_STS_DEV_RESOURCE be -EBUSY, that more closely
matches the result and makes it distinctly different than the global
shortage that is STS_RESOURCE/ENOMEM.

> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 43e7449723e0..e39b4e2a63db 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1160,6 +1160,8 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx 
> **hctx,
>   return true;
>  }
>  
> +#define BLK_MQ_QUEUE_DELAY   3   /* ms units */

Make that BLK_MQ_RESOURCE_DELAY or something like that. The name is too
generic right now.

> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 2d973ac54b09..f41d2057215f 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -39,6 +39,23 @@ typedef u8 __bitwise blk_status_t;
>  
>  #define BLK_STS_AGAIN((__force blk_status_t)12)
>  
> +/*
> + * BLK_STS_DEV_RESOURCE is returned from driver to block layer if device
> + * related resource is unavailable, but driver can guarantee that queue
> + * will be rerun in future once the resource is available (whereby
> + * dispatching requests).
> + *
> + * To safely return BLK_STS_DEV_RESOURCE, and allow forward progress, a
> + * driver just needs to make sure there is in-flight IO.
> + *
> + * Difference with BLK_STS_RESOURCE:
> + * If driver isn't sure if the queue will be rerun once device resource
> + * is made available, please return BLK_STS_RESOURCE.  For example: when
> + * memory allocation, DMA Mapping or other system resource allocation
> + * fails and IO can't be submitted to device.
> + */
> +#define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13)

I'd rephrase that as:

BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if
device related resources are unavailable, but the driver can guarantee
that the queue will be rerun in the future once resources become
available again. This is typically the case for device specific
resources that are consumed for IO. If the driver fails allocating these
resources, we know that inflight (or pending) IO will free these
resources upon completion.

This is different from BLK_STS_RESOURCE in that it explicitly references
device specific resource. For resources of wider scope, allocation
failure can happen without having pending IO. This means that we can't
rely on request completions freeing these resources, as IO may not be in
flight. Examples of that are kernel memory allocations, DMA mappings, or
any other system wide resources.

-- 
Jens Axboe



Re: [LSF/MM TOPIC] Two blk-mq related topics

2018-01-29 Thread Jens Axboe
On 1/29/18 4:46 PM, James Bottomley wrote:
> On Mon, 2018-01-29 at 14:00 -0700, Jens Axboe wrote:
>> On 1/29/18 1:56 PM, James Bottomley wrote:
>>>
>>> On Mon, 2018-01-29 at 23:46 +0800, Ming Lei wrote:
>>> [...]
>>>>
>>>> 2. When to enable SCSI_MQ at default again?
>>>
>>> I'm not sure there's much to discuss ... I think the basic answer
>>> is as soon as Christoph wants to try it again.
>>
>> FWIW, internally I've been running various IO intensive workloads on
>> what is essentially 4.12 upstream with scsi-mq the default (with
>> mq-deadline as the scheduler) and comparing IO workloads with a
>> previous 4.6 kernel (without scsi-mq), and things are looking
>> great.
>>
>> We're never going to iron out the last kinks with it being off
>> by default, I think we should attempt to flip the switch again
>> for 4.16.
> 
> Absolutely, I agree we turn it on ASAP.  I just don't want to be on the
> receiving end of Linus' flamethrower because a bug we already had
> reported against scsi-mq caused problems.  Get confirmation from the
> original reporters (or as close to it as you can) that their problems
> are fixed and we're good to go; he won't kick us nearly as hard for new
> bugs that turn up.

I agree, the functional issues definitely have to be verified to be
resolved. Various performance hitches we can dive into if they
crop up, but reintroducing some random suspend regression is not
acceptable.

-- 
Jens Axboe



Re: [LSF/MM TOPIC] Two blk-mq related topics

2018-01-29 Thread Jens Axboe
On 1/29/18 1:56 PM, James Bottomley wrote:
> On Mon, 2018-01-29 at 23:46 +0800, Ming Lei wrote:
> [...]
>> 2. When to enable SCSI_MQ at default again?
> 
> I'm not sure there's much to discuss ... I think the basic answer is as
> soon as Christoph wants to try it again.

FWIW, internally I've been running various IO intensive workloads on
what is essentially 4.12 upstream with scsi-mq the default (with
mq-deadline as the scheduler) and comparing IO workloads with a
previous 4.6 kernel (without scsi-mq), and things are looking
great.

We're never going to iron out the last kinks with it being off
by default, I think we should attempt to flip the switch again
for 4.16.

-- 
Jens Axboe



Re: [PATCH] blk_rq_map_user_iov: fix error override

2018-01-15 Thread Jens Axboe
On 1/14/18 3:00 PM, Douglas Gilbert wrote:
> During stress tests by syzkaller on the sg driver the block layer
> infrequently returns EINVAL. Closer inspection shows the block
> layer was trying to return ENOMEM (which is much more
> understandable) but for some reason overroad that useful error.
> 
> Patch below does not show this (unchanged) line:
>ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
> That 'ret' was being overridden when that function failed.

Thanks, applied.

-- 
Jens Axboe



Re: [PATCH V9 0/7] blk-mq support for ZBC disks

2018-01-08 Thread Jens Axboe
On 1/8/18 8:52 AM, Martin K. Petersen wrote:
> 
> Jens,
> 
>> This looks OK for me for 4.16. I can grab all of them, or I can leave
>> the last two for Martin to apply if he prefers that, though that will
>> add a block tree dependency for SCSI.
> 
> I already have a block dependency for 4.16. But it doesn't matter much.

Completely up to you - I already have 1-5, I can add 6/7 as well, or
just can do it in your tree. Let me know what you prefer.


-- 
Jens Axboe



Re: [PATCH v4 0/5] Introduce sgl_alloc() and sgl_free()

2018-01-05 Thread Jens Axboe
On 1/5/18 9:26 AM, Bart Van Assche wrote:
> Hello Jens,
> 
> As you know there are multiple drivers that both allocate a scatter/gather
> list and populate that list with pages. This patch series moves the code for
> allocating and freeing such scatterlists from several drivers into
> lib/scatterlist.c. Please consider this patch series for kernel v4.16.

Thanks Bart, applied for 4.16.

-- 
Jens Axboe



Re: [PATCH V9 0/7] blk-mq support for ZBC disks

2018-01-05 Thread Jens Axboe
On Thu, Dec 21 2017, Damien Le Moal wrote:
> This series, formerly titled "scsi-mq support for ZBC disks", implements
> support for ZBC disks for system using the scsi-mq I/O path.
> 
> The current scsi level support of ZBC disks guarantees write request ordering
> using a per-zone write lock which prevents issuing simultaneously multiple
> write commands to a zone, doing so avoid reordering of sequential writes to
> sequential zones. This method is however ineffective when scsi-mq is used with
> zoned block devices. This is due to the different execution model of blk-mq
> which passes a request to the scsi layer for dispatching after the request has
> been removed from the I/O scheduler queue. That is, when the scsi layer tries
> to lock the target zone of the request, the request may already be out of
> order and zone write locking fails to prevent that.
> 
> Various approaches have been tried to solve this problem directly from the 
> core
> code of blk-mq. All of them had the serious disadvantage of cluttering blk-mq
> code with zoned block device specific conditions and processing, making
> maintenance and testing difficult.
> 
> This series adds blk-mq support for zoned block devices at the I/O scheduler
> level with simple modifications of the mq-deadline scheduler. Implementation
> is done with reusable helpers defined in the zoned block device support file
> (blk-zoned.c). These helpers provide per zone write locking control functions
> similar to what was implemented directly in the SCSI layer in sd_zbc.c.
> The zone write locking mechanism is used by mq-deadline for the exact same
> purpose, that is, to limit writes per zone to at most one request to avoid
> reordering.
> 
> The changes to mq-deadline do not affect its operation with regular disks. The
> same scheduling behavior is maintained for these devices. Compared to the SCSI
> layer zone locking implementation, this series avoids locking
> conventional zones, which results in a use of these zones that is comparable to a
> regular disk.
> 
> This series also implements changes to the legacy deadline-iosched. Doing so,
> the zone locking code at the SCSI layer in sd.c and sd_zbc.c can be removed.
> This results in a significant simplification of the sd driver command 
> handling.
> 
> Patch 1 to 5 introduce the zone locking helpers in the block layer and modify
> the deadline and mq-deadline schedulers.
> Patch 6 and 7 remove the SCSI layer zone locking and initialize the device
> request queue zone information.
> 
> All patches apply without conflicts to the scsi tree branch 4.16/scsi-queue, 
> to
> the block tree branch for-linus as well as to the current 4.15-rc4 tree.
> 
> Of note is that this series imposes the use of the deadline and mq-deadline
> schedulers with zoned block devices. A system can trivialy enforce this using
> a udev rule such as:
> 
> ACTION=="add|change", KERNEL=="sd[a-z]", ATTRS{queue/zoned}=="host-managed", \
> ATTR{queue/scheduler}="deadline"
> 
> This rules applies equally for the legacy SCSI path as well as the scsi-mq 
> path
> thanks to "mq-deadline" being aliased to "deadline".
> 
> Comments are as always very much appreciated.

This looks OK for me for 4.16. I can grab all of them, or I can leave
the last two for Martin to apply if he prefers that, though that will
add a block tree dependency for SCSI.

Martin?

-- 
Jens Axboe



Re: [PATCH V8 0/7] blk-mq support for ZBC disks

2017-11-24 Thread Jens Axboe
On 11/24/2017 04:48 PM, Damien Le Moal wrote:
> [Full quote deleted]
> 
> Hi Jens,
> 
> Any comment regarding this series ?
> I understand that this would be for the 4.16 merge window, so no hurry,
> but I would like to know if I need to go back to the drawing board for
> ZBC blk-mq/scsi-mq support or if this is an acceptable solution.

I'll give it a thorough look-over on Monday.

-- 
Jens Axboe



Re: [PATCH v12 0/7] block, scsi: Improve suspend and resume

2017-11-09 Thread Jens Axboe
On 11/09/2017 11:49 AM, Bart Van Assche wrote:
> Hello Jens,
> 
> It is known that during the resume following a hibernate, especially when
> using an md RAID1 array created on top of SCSI devices, sometimes the system
> hangs instead of resuming properly. This patch series fixes that
> problem. These patches have been tested on top of the block layer for-next
> branch. Please consider these changes for kernel v4.15.

Applied, thanks Bart.

-- 
Jens Axboe



Re: [PATCH v11 0/7] block, scsi, md: Improve suspend and resume

2017-11-09 Thread Jens Axboe
On 11/09/2017 09:54 AM, Bart Van Assche wrote:
> On Thu, 2017-11-09 at 07:16 +0100, Oleksandr Natalenko wrote:
>> is this something known to you, or it is just my fault applying this series 
>> to 
>> v4.13? Except having this warning, suspend/resume works for me:
>>
>> [   27.383846] sd 0:0:0:0: [sda] Starting disk
>> [   27.383976] sd 1:0:0:0: [sdb] Starting disk
>> [   27.451218] sdb: Attempt to allocate non-preempt request in preempt-only 
>> mode.
>> [   27.459640] [ cut here ]
>> [   27.464521] WARNING: CPU: 0 PID: 172 at block/blk-core.c:823 
>> blk_queue_enter+0x222/0x280
> 
> Hello Oleksandr,
> 
> Thanks for the testing. The warning that you reported is expected. Maybe it
> should be left out to avoid that users get confused.

If the warning is expected, then it should be removed. It'll just confuse
users and cause useless bug reports.

> Jens, this patch series still applies cleanly on top of your for-4.15/block
> branch. Are you fine with this patch series or do you perhaps want me to
> repost it with Oleksandr's Tested-by tag added to each patch?

Since you need to kill the warning anyway, let's get it respun.

-- 
Jens Axboe



Re: [PATCH] blk-mq: Avoid that request queue removal can trigger list corruption

2017-11-08 Thread Jens Axboe
On 11/08/2017 11:23 AM, Bart Van Assche wrote:
> Avoid that removal of a request queue sporadically triggers the
> following warning:
> 
> list_del corruption. next->prev should be 8807d649b970, but was 
> 6b6b6b6b6b6b6b6b
> WARNING: CPU: 3 PID: 342 at lib/list_debug.c:56 
> __list_del_entry_valid+0x92/0xa0
> Call Trace:
>  process_one_work+0x11b/0x660
>  worker_thread+0x3d/0x3b0
>  kthread+0x129/0x140
>  ret_from_fork+0x27/0x40

Looks good to me, applied.

-- 
Jens Axboe



Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()

2017-11-08 Thread Jens Axboe
On 11/08/2017 11:22 AM, Laurence Oberman wrote:
> On Wed, 2017-11-08 at 10:57 -0700, Jens Axboe wrote:
>> On 11/08/2017 09:41 AM, Bart Van Assche wrote:
>>> On Tue, 2017-11-07 at 20:06 -0700, Jens Axboe wrote:
>>>> At this point, I have no idea what Bart's setup looks like. Bart,
>>>> it
>>>> would be REALLY helpful if you could tell us how you are
>>>> reproducing
>>>> your hang. I don't know why this has to be dragged out.
>>>
>>> Hello Jens,
>>>
>>> It is a disappointment to me that you have allowed Ming to evaluate
>>> other
>>> approaches than reverting "blk-mq: don't handle TAG_SHARED in
>>> restart". That
>>> patch namely replaces an algorithm that is trusted by the community
>>> with an
>>> algorithm of which even Ming acknowledged that it is racy. A quote
>>> from [1]:
>>> "IO hang may be caused if all requests are completed just before
>>> the current
>>> SCSI device is added to shost->starved_list". I don't know of any
>>> way to fix
>>> that race other than serializing request submission and completion
>>> by adding
>>> locking around these actions, which is something we don't want.
>>> Hence my
>>> request to revert that patch.
>>
>> I was reluctant to revert it, in case we could work out a better way
>> of
>> doing it. As I mentioned in the other replies, it's not exactly the
>> prettiest or most efficient. However, since we currently don't have
>> a good solution for the issue, I'm fine with reverting that patch.
>>
>>> Regarding the test I run, here is a summary of what I mentioned in
>>> previous
>>> e-mails:
>>> * I modified the SRP initiator such that the SCSI target queue
>>> depth is
>>>   reduced to one by setting starget->can_queue to 1 from inside
>>>   scsi_host_template.target_alloc.
>>> * With that modified SRP initiator I run the srp-test software as
>>> follows
>>>   until something breaks:
>>>   while ./run_tests -f xfs -d -e deadline -r 60; do :; done
>>
>> What kernel options are needed? Where do I download everything I
>> need?
>>
>> In other words, would it be possible to do a fuller guide for getting
>> this setup and running?
>>
>> I'll run my simple test case as well, since it's currently breaking
>> basically everywhere.
>>
>>> Today a system with at least one InfiniBand HCA is required to run
>>> that test.
>>> When I have the time I will post the SRP initiator and target
>>> patches on the
>>> linux-rdma mailing list that make it possible to run that test
>>> against the
>>> SoftRoCE driver (drivers/infiniband/sw/rxe). The only hardware
>>> required to
>>> use that driver is an Ethernet adapter.
>>
>> OK, I guess I can't run it then... I'll have to rely on your testing.
> 
> Hello 
> 
> I agree with Bart in this case, we should revert this.
> My test-bed is tied up and I have not been able to give it back to Ming
> so he could follow up on Bart's last update.
> 
> Right now its safer to revert.

I had already reverted it when sending out that email, so we should be
all set (hopefully).

-- 
Jens Axboe



Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()

2017-11-08 Thread Jens Axboe
On 11/08/2017 08:59 AM, Ming Lei wrote:
> On Wed, Nov 08, 2017 at 02:20:41PM +0800, Ming Lei wrote:
>> On Tue, Nov 07, 2017 at 08:17:59PM -0700, Jens Axboe wrote:
>>> On 11/07/2017 08:12 PM, Ming Lei wrote:
>>>> On Tue, Nov 07, 2017 at 08:01:48PM -0700, Jens Axboe wrote:
>>>>> On 11/07/2017 06:03 PM, Ming Lei wrote:
>>>>>> On Tue, Nov 07, 2017 at 03:06:31PM -0700, Jens Axboe wrote:
>>>>>>> On 11/07/2017 10:36 AM, Jens Axboe wrote:
>>>>>>>> On 11/07/2017 10:10 AM, Jens Axboe wrote:
>>>>>>>>> On 11/07/2017 09:29 AM, Jens Axboe wrote:
>>>>>>>>>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
>>>>>>>>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>>>>>>>>>>> If you can reproduce, please provide me at least the following log
>>>>>>>>>>>> first:
>>>>>>>>>>>>
>>>>>>>>>>>>find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>>>>>>>>>>
>>>>>>>>>>>> If any pending requests arn't completed, please provide the related
>>>>>>>>>>>> info in dbgfs about where is the request.
>>>>>>>>>>>
>>>>>>>>>>> Every time I ran the above or a similar command its output was 
>>>>>>>>>>> empty. I
>>>>>>>>>>> assume that's because the hang usually occurs in a phase where 
>>>>>>>>>>> these debugfs
>>>>>>>>>>> attributes either have not yet been created or have already 
>>>>>>>>>>> disappeared.
>>>>>>>>>>
>>>>>>>>>> Bart, do you still see a hang with the patch that fixes the tag leak 
>>>>>>>>>> when
>>>>>>>>>> we fail to get a dispatch budget?
>>>>>>>>>>
>>>>>>>>>> https://marc.info/?l=linux-block&m=151004881411480&w=2
>>>>>>>>>>
>>>>>>>>>> I've run a lot of stability testing here, and I haven't run into any
>>>>>>>>>> issues. This is with shared tags as well. So if you still see the 
>>>>>>>>>> failure
>>>>>>>>>> case with the current tree AND the above patch, then I'll try and get
>>>>>>>>>> a test case setup that hits it too so we can get to the bottom of 
>>>>>>>>>> this.
>>>>>>>>>
>>>>>>>>> Ming, I managed to reproduce the hang using null_blk. Note this is
>>>>>>>>> WITHOUT the patch mentioned above, running with that now.
>>>>>>>>>
>>>>>>>>> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 
>>>>>>>>> submit_queues=1 hw_queue_depth=1
>>>>>>>>>
>>>>>>>>> and using this fio file:
>>>>>>>>>
>>>>>>>>> [global]
>>>>>>>>> bs=4k
>>>>>>>>> rw=randread
>>>>>>>>> norandommap
>>>>>>>>> direct=1
>>>>>>>>> ioengine=libaio
>>>>>>>>> iodepth=4
>>>>>>>>>
>>>>>>>>> [nullb0]
>>>>>>>>> filename=/dev/nullb0
>>>>>>>>> [nullb1]
>>>>>>>>> filename=/dev/nullb1
>>>>>>>>> [nullb2]
>>>>>>>>> filename=/dev/nullb2
>>>>>>>>> [nullb3]
>>>>>>>>> filename=/dev/nullb3
>>>>>>>>>
>>>>>>>>> it seemed to keep running, but it hung when exiting. The troublesome
>>>>>>>>> device was nullb1:
>>>>>>>>>
>>>>>>>>> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
>>>>>>>>> [  492.520782]   Not tainted 4.14.0-rc7+ #499
>>>>>>>>> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" 
>>>>>>>>> disables this message.
>>

Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()

2017-11-08 Thread Jens Axboe
On 11/08/2017 09:41 AM, Bart Van Assche wrote:
> On Tue, 2017-11-07 at 20:06 -0700, Jens Axboe wrote:
>> At this point, I have no idea what Bart's setup looks like. Bart, it
>> would be REALLY helpful if you could tell us how you are reproducing
>> your hang. I don't know why this has to be dragged out.
> 
> Hello Jens,
> 
> It is a disappointment to me that you have allowed Ming to evaluate other
> approaches than reverting "blk-mq: don't handle TAG_SHARED in restart". That
> patch namely replaces an algorithm that is trusted by the community with an
> algorithm of which even Ming acknowledged that it is racy. A quote from [1]:
> "IO hang may be caused if all requests are completed just before the current
> SCSI device is added to shost->starved_list". I don't know of any way to fix
> that race other than serializing request submission and completion by adding
> locking around these actions, which is something we don't want. Hence my
> request to revert that patch.

I was reluctant to revert it, in case we could work out a better way of
doing it. As I mentioned in the other replies, it's not exactly the
prettiest or most efficient. However, since we currently don't have
a good solution for the issue, I'm fine with reverting that patch.

> Regarding the test I run, here is a summary of what I mentioned in previous
> e-mails:
> * I modified the SRP initiator such that the SCSI target queue depth is
>   reduced to one by setting starget->can_queue to 1 from inside
>   scsi_host_template.target_alloc.
> * With that modified SRP initiator I run the srp-test software as follows
>   until something breaks:
>   while ./run_tests -f xfs -d -e deadline -r 60; do :; done

What kernel options are needed? Where do I download everything I need?

In other words, would it be possible to do a fuller guide for getting
this setup and running?

I'll run my simple test case as well, since it's currently breaking
basically everywhere.

> Today a system with at least one InfiniBand HCA is required to run that test.
> When I have the time I will post the SRP initiator and target patches on the
> linux-rdma mailing list that make it possible to run that test against the
> SoftRoCE driver (drivers/infiniband/sw/rxe). The only hardware required to
> use that driver is an Ethernet adapter.

OK, I guess I can't run it then... I'll have to rely on your testing.

-- 
Jens Axboe



Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()

2017-11-07 Thread Jens Axboe
On 11/07/2017 08:12 PM, Ming Lei wrote:
> On Tue, Nov 07, 2017 at 08:01:48PM -0700, Jens Axboe wrote:
>> On 11/07/2017 06:03 PM, Ming Lei wrote:
>>> On Tue, Nov 07, 2017 at 03:06:31PM -0700, Jens Axboe wrote:
>>>> On 11/07/2017 10:36 AM, Jens Axboe wrote:
>>>>> On 11/07/2017 10:10 AM, Jens Axboe wrote:
>>>>>> On 11/07/2017 09:29 AM, Jens Axboe wrote:
>>>>>>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
>>>>>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>>>>>>>> If you can reproduce, please provide me at least the following log
>>>>>>>>> first:
>>>>>>>>>
>>>>>>>>>   find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>>>>>>>
>>>>>>>>> If any pending requests arn't completed, please provide the related
>>>>>>>>> info in dbgfs about where is the request.
>>>>>>>>
>>>>>>>> Every time I ran the above or a similar command its output was empty. I
>>>>>>>> assume that's because the hang usually occurs in a phase where these 
>>>>>>>> debugfs
>>>>>>>> attributes either have not yet been created or have already 
>>>>>>>> disappeared.
>>>>>>>
>>>>>>> Bart, do you still see a hang with the patch that fixes the tag leak 
>>>>>>> when
>>>>>>> we fail to get a dispatch budget?
>>>>>>>
>>>>>>> https://marc.info/?l=linux-block&m=151004881411480&w=2
>>>>>>>
>>>>>>> I've run a lot of stability testing here, and I haven't run into any
>>>>>>> issues. This is with shared tags as well. So if you still see the 
>>>>>>> failure
>>>>>>> case with the current tree AND the above patch, then I'll try and get
>>>>>>> a test case setup that hits it too so we can get to the bottom of this.
>>>>>>
>>>>>> Ming, I managed to reproduce the hang using null_blk. Note this is
>>>>>> WITHOUT the patch mentioned above, running with that now.
>>>>>>
>>>>>> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 
>>>>>> submit_queues=1 hw_queue_depth=1
>>>>>>
>>>>>> and using this fio file:
>>>>>>
>>>>>> [global]
>>>>>> bs=4k
>>>>>> rw=randread
>>>>>> norandommap
>>>>>> direct=1
>>>>>> ioengine=libaio
>>>>>> iodepth=4
>>>>>>
>>>>>> [nullb0]
>>>>>> filename=/dev/nullb0
>>>>>> [nullb1]
>>>>>> filename=/dev/nullb1
>>>>>> [nullb2]
>>>>>> filename=/dev/nullb2
>>>>>> [nullb3]
>>>>>> filename=/dev/nullb3
>>>>>>
>>>>>> it seemed to keep running, but it hung when exiting. The troublesome
>>>>>> device was nullb1:
>>>>>>
>>>>>> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
>>>>>> [  492.520782]   Not tainted 4.14.0-rc7+ #499
>>>>>> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" 
>>>>>> disables this message.
>>>>>> [  492.535904] fio D13208  3263   3211 0x
>>>>>> [  492.542535] Call Trace:
>>>>>> [  492.545764]  __schedule+0x279/0x720
>>>>>> [  492.550151]  schedule+0x33/0x90
>>>>>> [  492.554145]  io_schedule+0x16/0x40
>>>>>> [  492.558435]  blk_mq_get_tag+0x148/0x250
>>>>>> [  492.563211]  ? finish_wait+0x90/0x90
>>>>>> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
>>>>>> [  492.572760]  blk_mq_make_request+0xe2/0x690
>>>>>> [  492.577913]  generic_make_request+0xfc/0x2f0
>>>>>> [  492.583177]  submit_bio+0x64/0x120
>>>>>> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
>>>>>> [  492.592736]  ? submit_bio+0x64/0x120
>>>>>> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
>>>>>> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
>>>>>> [  492.607546]  ? free_ioctx_users+0xe0/0xe0

Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()

2017-11-07 Thread Jens Axboe
On 11/07/2017 09:17 AM, Bart Van Assche wrote:
> On Tue, 2017-11-07 at 18:15 +0800, Ming Lei wrote:
>> Last time, you didn't mention the target patch for setting its
>> can_queue as 1, so I think you can't reproduce the issue on upstream
>> kernel without out-of-tree patch. Then looks it is another issue,
>> and we are making progress actually.
> 
> If I don't trust a patch I introduce additional tests. The fact that I
> modified the SRP initiator before this hang occurred does not mean that the
> approach of your patch is fine. What this means is that all your patch does
> is to reduce the race window and that there is still a race window.

I agree, reducing the depth to test that specific case is perfectly
valid, it doesn't make the test invalid. It's perfectly possible that
other use cases out there have exactly that configuration. My null_blk
test case is basically the same. But it would be nice if all of this was
out in the open, so everybody is clear on exactly what is being
run/tested.

However, where my trace is hung off getting a scheduler tag, yours seems
to be waiting on IO completion. So not the same fingerprint, which is
worrisome.

-- 
Jens Axboe



Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()

2017-11-07 Thread Jens Axboe
On 11/07/2017 07:58 PM, Ming Lei wrote:
> On Tue, Nov 07, 2017 at 07:55:32PM -0700, Jens Axboe wrote:
>> On 11/07/2017 05:39 PM, Ming Lei wrote:
>>> On Tue, Nov 07, 2017 at 04:20:08PM +, Bart Van Assche wrote:
>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>>>> If you can reproduce, please provide me at least the following log
>>>>> first:
>>>>>
>>>>>   find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>>>
>>>>> If any pending requests aren't completed, please provide the related
>>>>> info in dbgfs about where the request is.
>>>>
>>>> Every time I ran the above or a similar command its output was empty. I
>>>> assume that's because the hang usually occurs in a phase where these 
>>>> debugfs
>>>> attributes either have not yet been created or have already disappeared.
>>>
>>> Could you dump all tags? Then you can see if this attribute is disappeared.
>>>
>>> If that output is empty, it often means there isn't pending request not
>>> completed. So if that is true, your hang is _not_ related with RESTART.
>>
>> You need to check sched_tags as well. It could still be a restart race
>> or problem, if tags is empty but sched_tags has busy bits.
> 
> Yeah, I didn't mention because SRP is MQ hardware, and the default
> scheduler is none, but if Bart changes that, the sched_tags need to
> checked first.

At this point, I have no idea what Bart's setup looks like. Bart, it
would be REALLY helpful if you could tell us how you are reproducing
your hang. I don't know why this has to be dragged out.

Ming/Bart - there seems to be an increasing amount of tension between
you two, for reasons that are unknown to me. I suggest you put that
aside in the pursuit of fixing the current issue, and then we can
discuss how to best resolve these going forward. But right now the top
priority is getting to the bottom of this. There's a chance that the
issue I can reproduce is the same that Bart is seeing, in which case we
might be fixing both in one fell swoop. But if that isn't the case, then
we have some work to do this week.

-- 
Jens Axboe



Re: [PATCH] SCSI: don't get target/host busy_count in scsi_mq_get_budget()

2017-11-07 Thread Jens Axboe
On 11/07/2017 06:03 PM, Ming Lei wrote:
> On Tue, Nov 07, 2017 at 03:06:31PM -0700, Jens Axboe wrote:
>> On 11/07/2017 10:36 AM, Jens Axboe wrote:
>>> On 11/07/2017 10:10 AM, Jens Axboe wrote:
>>>> On 11/07/2017 09:29 AM, Jens Axboe wrote:
>>>>> On 11/07/2017 09:20 AM, Bart Van Assche wrote:
>>>>>> On Tue, 2017-11-07 at 10:11 +0800, Ming Lei wrote:
>>>>>>> If you can reproduce, please provide me at least the following log
>>>>>>> first:
>>>>>>>
>>>>>>> find /sys/kernel/debug/block -name tags | xargs cat | grep busy
>>>>>>>
>>>>>>> If any pending requests aren't completed, please provide the related
>>>>>>> info in dbgfs about where the request is.
>>>>>>
>>>>>> Every time I ran the above or a similar command its output was empty. I
>>>>>> assume that's because the hang usually occurs in a phase where these 
>>>>>> debugfs
>>>>>> attributes either have not yet been created or have already disappeared.
>>>>>
>>>>> Bart, do you still see a hang with the patch that fixes the tag leak when
>>>>> we fail to get a dispatch budget?
>>>>>
>>>>> https://marc.info/?l=linux-block&m=151004881411480&w=2
>>>>>
>>>>> I've run a lot of stability testing here, and I haven't run into any
>>>>> issues. This is with shared tags as well. So if you still see the failure
>>>>> case with the current tree AND the above patch, then I'll try and get
>>>>> a test case setup that hits it too so we can get to the bottom of this.
>>>>
>>>> Ming, I managed to reproduce the hang using null_blk. Note this is
>>>> WITHOUT the patch mentioned above, running with that now.
>>>>
>>>> # modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 
>>>> submit_queues=1 hw_queue_depth=1
>>>>
>>>> and using this fio file:
>>>>
>>>> [global]
>>>> bs=4k
>>>> rw=randread
>>>> norandommap
>>>> direct=1
>>>> ioengine=libaio
>>>> iodepth=4
>>>>
>>>> [nullb0]
>>>> filename=/dev/nullb0
>>>> [nullb1]
>>>> filename=/dev/nullb1
>>>> [nullb2]
>>>> filename=/dev/nullb2
>>>> [nullb3]
>>>> filename=/dev/nullb3
>>>>
>>>> it seemed to keep running, but it hung when exiting. The troublesome
>>>> device was nullb1:
>>>>
>>>> [  492.513374] INFO: task fio:3263 blocked for more than 120 seconds.
>>>> [  492.520782]   Not tainted 4.14.0-rc7+ #499
>>>> [  492.526247] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables 
>>>> this message.
>>>> [  492.535904] fio D13208  3263   3211 0x
>>>> [  492.542535] Call Trace:
>>>> [  492.545764]  __schedule+0x279/0x720
>>>> [  492.550151]  schedule+0x33/0x90
>>>> [  492.554145]  io_schedule+0x16/0x40
>>>> [  492.558435]  blk_mq_get_tag+0x148/0x250
>>>> [  492.563211]  ? finish_wait+0x90/0x90
>>>> [  492.567693]  blk_mq_get_request+0xf0/0x3e0
>>>> [  492.572760]  blk_mq_make_request+0xe2/0x690
>>>> [  492.577913]  generic_make_request+0xfc/0x2f0
>>>> [  492.583177]  submit_bio+0x64/0x120
>>>> [  492.587475]  ? set_page_dirty_lock+0x4b/0x60
>>>> [  492.592736]  ? submit_bio+0x64/0x120
>>>> [  492.597309]  ? bio_set_pages_dirty+0x55/0x60
>>>> [  492.602570]  blkdev_direct_IO+0x388/0x3c0
>>>> [  492.607546]  ? free_ioctx_users+0xe0/0xe0
>>>> [  492.612511]  ? blk_mq_dispatch_rq_list+0x238/0x3a0
>>>> [  492.618353]  ? _raw_spin_unlock+0xe/0x20
>>>> [  492.623227]  generic_file_read_iter+0xb3/0xa00
>>>> [  492.628682]  ? generic_file_read_iter+0xb3/0xa00
>>>> [  492.634334]  ? security_file_permission+0x9b/0xc0
>>>> [  492.640114]  blkdev_read_iter+0x35/0x40
>>>> [  492.644877]  aio_read+0xc5/0x120
>>>> [  492.648973]  ? aio_read_events+0x24c/0x340
>>>> [  492.654124]  ? __might_sleep+0x4a/0x80
>>>> [  492.658800]  do_io_submit+0x47c/0x5e0
>>>> [  492.663373]  ? do_io_submit+0x47c/0x5e0
>>>> [  492.668234]  SyS_io_submit+0x10/0x20
>>>> [  492.672715]  ? SyS_io_submit+0

<    1   2   3   4   5   6   7   8   9   10   >