date:20171013

Re: [PATCH V9 0/7] blk-mq-sched: improve sequential I/O performance

2017-10-13 Thread Ming Lei

On Fri, Oct 13, 2017 at 02:23:07PM -0600, Jens Axboe wrote:
> On 10/13/2017 01:21 PM, Jens Axboe wrote:
> > On 10/13/2017 01:08 PM, Jens Axboe wrote:
> >> On 10/13/2017 12:05 PM, Ming Lei wrote:
> >>> Hi Jens,
> >>>
> >>> In Red Hat internal storage test wrt. blk-mq scheduler, we found that I/O
> >>> performance is much bad with mq-deadline, especially about sequential I/O
> >>> on some multi-queue SCSI devcies(lpfc, qla2xxx, SRP...)
> >>>
> >>> Turns out one big issue causes the performance regression: requests are
> >>> still dequeued from sw queue/scheduler queue even when ldd's queue is
> >>> busy, so I/O merge becomes quite difficult to make, then sequential IO
> >>> performance degrades a lot.
> >>>
> >>> This issue becomes one of mains reasons for reverting default SCSI_MQ
> >>> in V4.13.
> >>>
> >>> This 8 patches improve this situation, and brings back performance loss.
> >>>
> >>> With this change, SCSI-MQ sequential I/O performance is improved much, 
> >>> Paolo
> >>> reported that mq-deadline performance improved much[2] in his dbench test
> >>> wrt V2. Also performance improvement on lpfc/qla2xx was observed with 
> >>> V1.[1]
> >>>
> >>> [1] http://marc.info/?l=linux-block&m=150151989915776&w=2
> >>> [2] https://marc.info/?l=linux-block&m=150217980602843&w=2
> >>
> >> I wanted to run some sanity testing on this series before committing it,
> >> and unfortunately it doesn't even boot for me. Just hangs after loading
> >> the kernel. Maybe an error slipped in for v8/9?
> > 
> > Or it might be something with kyber, my laptop defaults to that. Test
> > box seems to boot (which is SCSI), and nvme loads fine by default,
> > but not with kyber.
> > 
> > I don't have time to look into this more today, but the above might
> > help you figure out what is going on.
> 
> Verified that the laptop boots just fine if I remove the kyber udev
> rule.

I can reproduce this issue with kyber switched to, and will figure out
it soon.

-- 
Ming

Re: [PATCH v2 5/5] ufs/phy: qcom: Refactor to use phy_init call

2017-10-13 Thread Subhash Jadavani


On 2017-10-11 23:19, Vivek Gautam wrote:

Refactor ufs_qcom_power_up_sequence() to get rid of ugly
exported phy APIs and use the phy_init() and phy_power_on()
to do the phy initialization.

Signed-off-by: Vivek Gautam 
---

Changes since v1:
 - The UFS phy retain state in low power mode. The phy can
   enter the low power state and come up without starting the
   serdes again, unless we reprogram the phy.
   So, added a check to start the serdes only once after phy
   initialization.

 drivers/phy/qualcomm/phy-qcom-ufs-i.h|  3 +-
 drivers/phy/qualcomm/phy-qcom-ufs-qmp-14nm.c | 15 --
 drivers/phy/qualcomm/phy-qcom-ufs-qmp-20nm.c | 15 --
 drivers/phy/qualcomm/phy-qcom-ufs.c  | 42 
++--
 drivers/scsi/ufs/ufs-qcom.c  | 36 
++--

 include/linux/phy/phy-qcom-ufs.h |  3 --
 6 files changed, 55 insertions(+), 59 deletions(-)

diff --git a/drivers/phy/qualcomm/phy-qcom-ufs-i.h
b/drivers/phy/qualcomm/phy-qcom-ufs-i.h
index 94326ed107c3..822c83b8efcd 100644
--- a/drivers/phy/qualcomm/phy-qcom-ufs-i.h
+++ b/drivers/phy/qualcomm/phy-qcom-ufs-i.h
@@ -114,6 +114,7 @@ struct ufs_qcom_phy {
struct ufs_qcom_phy_calibration *cached_regs;
int cached_regs_table_size;
bool is_powered_on;
+   bool is_started;
struct ufs_qcom_phy_specific_ops *phy_spec_ops;

enum phy_mode mode;
@@ -123,7 +124,6 @@ struct ufs_qcom_phy {
  * struct ufs_qcom_phy_specific_ops - set of pointers to functions 
which have a

  * specific implementation per phy. Each UFS phy, should implement
  * those functions according to its spec and requirements
- * @calibrate_phy: pointer to a function that calibrate the phy
  * @start_serdes: pointer to a function that starts the serdes
  * @is_physical_coding_sublayer_ready: pointer to a function that
  * checks pcs readiness. returns 0 for success and non-zero for error.
@@ -132,7 +132,6 @@ struct ufs_qcom_phy {
  * and writes to QSERDES_RX_SIGDET_CNTRL attribute
  */
 struct ufs_qcom_phy_specific_ops {
-   int (*calibrate_phy)(struct ufs_qcom_phy *phy, bool is_rate_B);
void (*start_serdes)(struct ufs_qcom_phy *phy);
int (*is_physical_coding_sublayer_ready)(struct ufs_qcom_phy *phy);
void (*set_tx_lane_enable)(struct ufs_qcom_phy *phy, u32 val);
diff --git a/drivers/phy/qualcomm/phy-qcom-ufs-qmp-14nm.c
b/drivers/phy/qualcomm/phy-qcom-ufs-qmp-14nm.c
index af65785230b5..ba1895b76a5d 100644
--- a/drivers/phy/qualcomm/phy-qcom-ufs-qmp-14nm.c
+++ b/drivers/phy/qualcomm/phy-qcom-ufs-qmp-14nm.c
@@ -44,7 +44,19 @@ void ufs_qcom_phy_qmp_14nm_advertise_quirks(struct
ufs_qcom_phy *phy_common)

 static int ufs_qcom_phy_qmp_14nm_init(struct phy *generic_phy)
 {
-   return 0;
+   struct ufs_qcom_phy *phy_common = get_ufs_qcom_phy(generic_phy);
+   bool is_rate_B = false;
+   int ret;
+
+   if (phy_common->mode == PHY_MODE_UFS_HS_B)
+   is_rate_B = true;
+
+   ret = ufs_qcom_phy_qmp_14nm_phy_calibrate(phy_common, is_rate_B);
+   if (!ret)
+   /* phy calibrated, but yet to be started */
+   phy_common->is_started = false;
+
+   return ret;
 }

 static int ufs_qcom_phy_qmp_14nm_exit(struct phy *generic_phy)
@@ -120,7 +132,6 @@ static int
ufs_qcom_phy_qmp_14nm_is_pcs_ready(struct ufs_qcom_phy *phy_common)
 };

 static struct ufs_qcom_phy_specific_ops phy_14nm_ops = {
-   .calibrate_phy  = ufs_qcom_phy_qmp_14nm_phy_calibrate,
.start_serdes   = ufs_qcom_phy_qmp_14nm_start_serdes,
 	.is_physical_coding_sublayer_ready = 
ufs_qcom_phy_qmp_14nm_is_pcs_ready,

.set_tx_lane_enable = ufs_qcom_phy_qmp_14nm_set_tx_lane_enable,
diff --git a/drivers/phy/qualcomm/phy-qcom-ufs-qmp-20nm.c
b/drivers/phy/qualcomm/phy-qcom-ufs-qmp-20nm.c
index 5c18c41dbdb4..49f435c71147 100644
--- a/drivers/phy/qualcomm/phy-qcom-ufs-qmp-20nm.c
+++ b/drivers/phy/qualcomm/phy-qcom-ufs-qmp-20nm.c
@@ -63,7 +63,19 @@ void ufs_qcom_phy_qmp_20nm_advertise_quirks(struct
ufs_qcom_phy *phy_common)

 static int ufs_qcom_phy_qmp_20nm_init(struct phy *generic_phy)
 {
-   return 0;
+   struct ufs_qcom_phy *phy_common = get_ufs_qcom_phy(generic_phy);
+   bool is_rate_B = false;
+   int ret;
+
+   if (phy_common->mode == PHY_MODE_UFS_HS_B)
+   is_rate_B = true;
+
+   ret = ufs_qcom_phy_qmp_20nm_phy_calibrate(phy_common, is_rate_B);
+   if (!ret)
+   /* phy calibrated, but yet to be started */
+   phy_common->is_started = false;
+
+   return ret;
 }

 static int ufs_qcom_phy_qmp_20nm_exit(struct phy *generic_phy)
@@ -178,7 +190,6 @@ static int
ufs_qcom_phy_qmp_20nm_is_pcs_ready(struct ufs_qcom_phy *phy_common)
 };

 static struct ufs_qcom_phy_specific_ops phy_20nm_ops = {
-   .calibrate_phy  = ufs_qcom_phy_qmp_20nm_phy_calibrate,
.start_serdes   = ufs_qcom_phy_qmp_20nm_start_serdes,
 	.is_physical_coding_subl

Re: [PATCH V9 4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Bart Van Assche

On Sat, 2017-10-14 at 02:05 +0800, Ming Lei wrote:
> @@ -89,19 +89,36 @@ static bool blk_mq_sched_restart_hctx(struct 
> blk_mq_hw_ctx *hctx)
>   return false;
>  }
>  
> -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)

Shouldn't the meaning of the return value of this function be documented?

>  {
>   struct request_queue *q = hctx->queue;
>   struct elevator_queue *e = q->elevator;
>   LIST_HEAD(rq_list);
>  
>   do {
> - struct request *rq = e->type->ops.mq.dispatch_request(hctx);
> + struct request *rq;
> + blk_status_t ret;
>  
> - if (!rq)
> + if (e->type->ops.mq.has_work &&
> + !e->type->ops.mq.has_work(hctx))
>   break;
> +
> + ret = blk_mq_get_dispatch_budget(hctx);
> + if (ret == BLK_STS_RESOURCE)
> + return true;
> +
> + rq = e->type->ops.mq.dispatch_request(hctx);
> + if (!rq) {
> + blk_mq_put_dispatch_budget(hctx, true);
> + break;
> + } else if (ret != BLK_STS_OK) {
> + blk_mq_end_request(rq, ret);
> + continue;
> + }
>   list_add(&rq->queuelist, &rq_list);
> - } while (blk_mq_dispatch_rq_list(q, &rq_list));
> + } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> +
> + return false;
>  }

This means that the request in rq_list becomes the owner of the budget allocated
by blk_mq_get_dispatch_budget(). Shouldn't that be mentioned as a comment above
blk_mq_dispatch_rq_list()?

> + if (run_queue) {
> + if (!blk_mq_sched_needs_restart(hctx) &&
> + !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) {
> + blk_mq_sched_mark_restart_hctx(hctx);
> + blk_mq_run_hw_queue(hctx, true);
> + }
>   }
>  }

The above if-statement can be changed from a nested if into a single
if-statement.

Additionally, why has the code been added to blk_mq_sched_dispatch_requests()
that reruns the queue if blk_mq_get_dispatch_budget() returned BLK_STS_RESOURCE?
Is that code necessary or can it be left out?

> +static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx,
> + bool got_budget)
> +{
> + struct request_queue *q = hctx->queue;
> +
> + if (q->mq_ops->put_budget && got_budget)
> + q->mq_ops->put_budget(hctx);
> +}

So the above function is passed a boolean as second argument and all what
that boolean is used for is to decide whether or not the function is executed?
Sorry but I think that's wrong and that the second argument should be removed
and that it should be evaluated by the caller instead of inside
blk_mq_put_dispatch_budget().

Bart.

Re: [PATCH V9 6/7] SCSI: allow to pass null rq to scsi_prep_state_check()

2017-10-13 Thread Bart Van Assche

On Sat, 2017-10-14 at 02:05 +0800, Ming Lei wrote:
> In the following patch, we will implement scsi_get_budget()
> which need to call scsi_prep_state_check() when rq isn't
> dequeued yet.

My understanding is that this change is only needed because scsi_mq_get_budget()
calls scsi_prep_state_check() with the req argument set to NULL. Since
scsi_prep_state_check() tests for sdev states other than SDEV_RUNNING and since
the purpose of this series is to optimize performance for the case SDEV_RUNNING
I'm not sure it's a good idea to make scsi_mq_get_budget() call
scsi_prep_state_check(). And if scsi_mq_get_budget() wouldn't call
scsi_prep_state_check() then this patch is not necessary.

Bart.

[PATCH 2/2] qla2xxx: Query FC4 type during RSCN processing

2017-10-13 Thread Madhani, Madhani

From: Giridhar Malavali 

Based on the FC4 type, login will proceed to either
FCP or FC-NVMe remote ports.

Signed-off-by: Giridhar Malavali 
Signed-off-by: Himanshu Madhani 
---
 drivers/scsi/qla2xxx/qla_os.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 0cfce0486b70..4b9a7f1711c7 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -4762,7 +4762,7 @@ void qla24xx_create_new_sess(struct scsi_qla_host *vha, 
struct qla_work_evt *e)
if (pla)
qlt_plogi_ack_unref(vha, pla);
else
-   qla24xx_async_gnl(vha, fcport);
+   qla24xx_async_gffid(vha, fcport);
}
 
if (free_fcport) {
-- 
2.12.0

[PATCH 0/2] qla2xxx: Couple bug fixes for FC-NVMe

2017-10-13 Thread Madhani, Madhani

From: Himanshu Madhani 

Hi Martin, 

This series has couple bug fixes for FC-NVMe code path. Please apply 
them to 4.15/scsi-queue at your earliest convenience.

Thanks,
Himanshu

Giridhar Malavali (1):
  qla2xxx: Query FC4 type during RSCN processing

Himanshu Madhani (1):
  qla2xxx: Use ql2xnvmeenable to enable Q-Pair for FC-NVMe

 drivers/scsi/qla2xxx/qla_init.c |  2 +-
 drivers/scsi/qla2xxx/qla_isr.c  |  2 +-
 drivers/scsi/qla2xxx/qla_mid.c  |  2 +-
 drivers/scsi/qla2xxx/qla_os.c   | 29 +++--
 4 files changed, 22 insertions(+), 13 deletions(-)

-- 
2.12.0

[PATCH 1/2] qla2xxx: Use ql2xnvmeenable to enable Q-Pair for FC-NVMe

2017-10-13 Thread Madhani, Madhani

From: Himanshu Madhani 

In some env, user can choose to not enable SCSI-MQ but wants
to use FC-NVMe feature of the driver. Since driver relies on
Q-Pairs to allocate FC-NVMe resources, use existing module
parameter to create Q-Pairs when FC-NVMe is enabled.

Signed-off-by: Himanshu Madhani 
---
 drivers/scsi/qla2xxx/qla_init.c |  2 +-
 drivers/scsi/qla2xxx/qla_isr.c  |  2 +-
 drivers/scsi/qla2xxx/qla_mid.c  |  2 +-
 drivers/scsi/qla2xxx/qla_os.c   | 27 ++-
 4 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
index b2a391f93775..9752ac4c1003 100644
--- a/drivers/scsi/qla2xxx/qla_init.c
+++ b/drivers/scsi/qla2xxx/qla_init.c
@@ -8036,7 +8036,7 @@ struct qla_qpair *qla2xxx_create_qpair(struct 
scsi_qla_host *vha, int qos,
return NULL;
}
 
-   if (ql2xmqsupport) {
+   if (ql2xmqsupport || ql2xnvmeenable) {
qpair = kzalloc(sizeof(struct qla_qpair), GFP_KERNEL);
if (qpair == NULL) {
ql_log(ql_log_warn, vha, 0x0182,
diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index d06a1a809188..bdaa4d644424 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -3411,7 +3411,7 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct 
rsp_que *rsp)
ha->msix_count, ret);
ha->msix_count = ret;
/* Recalculate queue values */
-   if (ha->mqiobase && ql2xmqsupport) {
+   if (ha->mqiobase && (ql2xmqsupport || ql2xnvmeenable)) {
ha->max_req_queues = ha->msix_count - 1;
 
/* ATIOQ needs 1 vector. That's 1 less QPair */
diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c
index c0f8f6c17b79..3630bb66a74c 100644
--- a/drivers/scsi/qla2xxx/qla_mid.c
+++ b/drivers/scsi/qla2xxx/qla_mid.c
@@ -606,7 +606,7 @@ qla25xx_delete_queues(struct scsi_qla_host *vha)
struct qla_hw_data *ha = vha->hw;
struct qla_qpair *qpair, *tqpair;
 
-   if (ql2xmqsupport) {
+   if (ql2xmqsupport || ql2xnvmeenable) {
list_for_each_entry_safe(qpair, tqpair, &vha->qp_list,
qp_list_elem)
qla2xxx_delete_qpair(vha, qpair);
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index cb719345aa0d..0cfce0486b70 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -433,7 +433,7 @@ static int qla2x00_alloc_queues(struct qla_hw_data *ha, 
struct req_que *req,
 
qla_init_base_qpair(vha, req, rsp);
 
-   if (ql2xmqsupport && ha->max_qpairs) {
+   if ((ql2xmqsupport || ql2xnvmeenable) && ha->max_qpairs) {
ha->queue_pair_map = kcalloc(ha->max_qpairs, sizeof(struct 
qla_qpair *),
GFP_KERNEL);
if (!ha->queue_pair_map) {
@@ -1976,7 +1976,8 @@ qla2x00_iospace_config(struct qla_hw_data *ha)
/* Determine queue resources */
ha->max_req_queues = ha->max_rsp_queues = 1;
ha->msix_count = QLA_BASE_VECTORS;
-   if (!ql2xmqsupport || (!IS_QLA25XX(ha) && !IS_QLA81XX(ha)))
+   if (!ql2xmqsupport || !ql2xnvmeenable ||
+   (!IS_QLA25XX(ha) && !IS_QLA81XX(ha)))
goto mqiobase_exit;
 
ha->mqiobase = ioremap(pci_resource_start(ha->pdev, 3),
@@ -2073,7 +2074,7 @@ qla83xx_iospace_config(struct qla_hw_data *ha)
 * By default, driver uses at least two msix vectors
 * (default & rspq)
 */
-   if (ql2xmqsupport) {
+   if (ql2xmqsupport || ql2xnvmeenable) {
/* MB interrupt uses 1 vector */
ha->max_req_queues = ha->msix_count - 1;
 
@@ -3089,9 +3090,17 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct 
pci_device_id *id)
 
ql_dbg(ql_dbg_init, base_vha, 0x0192,
"blk/scsi-mq enabled, HW queues = %d.\n", 
host->nr_hw_queues);
-   } else
-   ql_dbg(ql_dbg_init, base_vha, 0x0193,
-   "blk/scsi-mq disabled.\n");
+   } else {
+   if (ql2xnvmeenable) {
+   host->nr_hw_queues = ha->max_qpairs;
+   ql_dbg(ql_dbg_init, base_vha, 0x0194,
+   "FC-NVMe support is enabled, HW queues=%d\n",
+   host->nr_hw_queues);
+   } else {
+   ql_dbg(ql_dbg_init, base_vha, 0x0193,
+   "blk/scsi-mq disabled.\n");
+   }
+   }
 
qlt_probe_one_stage1(base_vha, ha);
 
@@ -6301,7 +6310,7 @@ qla2xxx_pci_error_detected(struct pci_dev *pdev, 
pci_channel_state_t state)
switch (state) {
case pci_channel_io_normal:
ha->flags.eeh_busy = 0;
-   if (ql2xmqsupport) {
+

Re: [PATCH V9 0/7] blk-mq-sched: improve sequential I/O performance

2017-10-13 Thread Jens Axboe

On 10/13/2017 01:21 PM, Jens Axboe wrote:
> On 10/13/2017 01:08 PM, Jens Axboe wrote:
>> On 10/13/2017 12:05 PM, Ming Lei wrote:
>>> Hi Jens,
>>>
>>> In Red Hat internal storage test wrt. blk-mq scheduler, we found that I/O
>>> performance is much bad with mq-deadline, especially about sequential I/O
>>> on some multi-queue SCSI devcies(lpfc, qla2xxx, SRP...)
>>>
>>> Turns out one big issue causes the performance regression: requests are
>>> still dequeued from sw queue/scheduler queue even when ldd's queue is
>>> busy, so I/O merge becomes quite difficult to make, then sequential IO
>>> performance degrades a lot.
>>>
>>> This issue becomes one of mains reasons for reverting default SCSI_MQ
>>> in V4.13.
>>>
>>> This 8 patches improve this situation, and brings back performance loss.
>>>
>>> With this change, SCSI-MQ sequential I/O performance is improved much, Paolo
>>> reported that mq-deadline performance improved much[2] in his dbench test
>>> wrt V2. Also performance improvement on lpfc/qla2xx was observed with V1.[1]
>>>
>>> [1] http://marc.info/?l=linux-block&m=150151989915776&w=2
>>> [2] https://marc.info/?l=linux-block&m=150217980602843&w=2
>>
>> I wanted to run some sanity testing on this series before committing it,
>> and unfortunately it doesn't even boot for me. Just hangs after loading
>> the kernel. Maybe an error slipped in for v8/9?
> 
> Or it might be something with kyber, my laptop defaults to that. Test
> box seems to boot (which is SCSI), and nvme loads fine by default,
> but not with kyber.
> 
> I don't have time to look into this more today, but the above might
> help you figure out what is going on.

Verified that the laptop boots just fine if I remove the kyber udev
rule.

-- 
Jens Axboe

Re: [PATCH V9 0/7] blk-mq-sched: improve sequential I/O performance

2017-10-13 Thread Jens Axboe

On 10/13/2017 01:08 PM, Jens Axboe wrote:
> On 10/13/2017 12:05 PM, Ming Lei wrote:
>> Hi Jens,
>>
>> In Red Hat internal storage test wrt. blk-mq scheduler, we found that I/O
>> performance is much bad with mq-deadline, especially about sequential I/O
>> on some multi-queue SCSI devcies(lpfc, qla2xxx, SRP...)
>>
>> Turns out one big issue causes the performance regression: requests are
>> still dequeued from sw queue/scheduler queue even when ldd's queue is
>> busy, so I/O merge becomes quite difficult to make, then sequential IO
>> performance degrades a lot.
>>
>> This issue becomes one of mains reasons for reverting default SCSI_MQ
>> in V4.13.
>>
>> This 8 patches improve this situation, and brings back performance loss.
>>
>> With this change, SCSI-MQ sequential I/O performance is improved much, Paolo
>> reported that mq-deadline performance improved much[2] in his dbench test
>> wrt V2. Also performance improvement on lpfc/qla2xx was observed with V1.[1]
>>
>> [1] http://marc.info/?l=linux-block&m=150151989915776&w=2
>> [2] https://marc.info/?l=linux-block&m=150217980602843&w=2
> 
> I wanted to run some sanity testing on this series before committing it,
> and unfortunately it doesn't even boot for me. Just hangs after loading
> the kernel. Maybe an error slipped in for v8/9?

Or it might be something with kyber, my laptop defaults to that. Test
box seems to boot (which is SCSI), and nvme loads fine by default,
but not with kyber.

I don't have time to look into this more today, but the above might
help you figure out what is going on.

-- 
Jens Axboe

Re: [PATCH V9 0/7] blk-mq-sched: improve sequential I/O performance

2017-10-13 Thread Jens Axboe

On 10/13/2017 12:05 PM, Ming Lei wrote:
> Hi Jens,
> 
> In Red Hat internal storage test wrt. blk-mq scheduler, we found that I/O
> performance is much bad with mq-deadline, especially about sequential I/O
> on some multi-queue SCSI devcies(lpfc, qla2xxx, SRP...)
> 
> Turns out one big issue causes the performance regression: requests are
> still dequeued from sw queue/scheduler queue even when ldd's queue is
> busy, so I/O merge becomes quite difficult to make, then sequential IO
> performance degrades a lot.
> 
> This issue becomes one of mains reasons for reverting default SCSI_MQ
> in V4.13.
> 
> This 8 patches improve this situation, and brings back performance loss.
> 
> With this change, SCSI-MQ sequential I/O performance is improved much, Paolo
> reported that mq-deadline performance improved much[2] in his dbench test
> wrt V2. Also performance improvement on lpfc/qla2xx was observed with V1.[1]
> 
> [1] http://marc.info/?l=linux-block&m=150151989915776&w=2
> [2] https://marc.info/?l=linux-block&m=150217980602843&w=2

I wanted to run some sanity testing on this series before committing it,
and unfortunately it doesn't even boot for me. Just hangs after loading
the kernel. Maybe an error slipped in for v8/9?

-- 
Jens Axboe

Re: [PATCH 1/5] qla2xxx: Add module param ql2xenablemsix

2017-10-13 Thread Madhani, Himanshu

Hi Johannes, 

> On Oct 12, 2017, at 12:19 AM, Johannes Thumshirn  wrote:
> 
> On Wed, Oct 11, 2017 at 01:36:46PM -0700, Madhani, Himanshu wrote:
> [...]
>> +int ql2xenablemsix = 1;
>> +module_param(ql2xenablemsix, int, 0444);
>> +MODULE_PARM_DESC(ql2xenablemsix,
>> +"Set to enable MSI or MSI-X interrupt mechanism.\n"
>> +" Default is 1, enable MSI-X interrupt mechanism.\n"
>> +" 0 -- enable traditional pin-based mechanism.\n"
>> +" 1 -- enable MSI-X interrupt mechanism.\n"
>> +" 2 -- enable MSI interrupt mechanism.\n");
> 
> Maybe qla2xirqmode is a more appropriate name?
> 

Sorry this one slipped out of my sight. The reason I used "ql2xenablemsix" was 
because driver already
had this parameter in code until few versions back where we removed it. Since 
existing user are familiar
with this module parameter name, it would be easy to keep that same. 

Let me know if you are okay with the name or prefer new name instead? 

> -- 
> Johannes Thumshirn  Storage
> jthumsh...@suse.de+49 911 74053 689
> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: Felix Imendörffer, Jane Smithard, Graham Norton
> HRB 21284 (AG Nürnberg)
> Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Thanks,
- Himanshu

[PATCH V9 2/7] blk-mq-sched: move actual dispatching into one helper

2017-10-13 Thread Ming Lei

So that it becomes easy to support to dispatch from sw queue in the
following patch.

No functional change.

Reviewed-by: Bart Van Assche 
Reviewed-by: Omar Sandoval 
Suggested-by: Christoph Hellwig  # for simplifying dispatch logic
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 43 ---
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index eca011fdfa0e..be29ba849408 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,12 +89,26 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx 
*hctx)
return false;
 }
 
+static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+{
+   struct request_queue *q = hctx->queue;
+   struct elevator_queue *e = q->elevator;
+   LIST_HEAD(rq_list);
+
+   do {
+   struct request *rq = e->type->ops.mq.dispatch_request(hctx);
+
+   if (!rq)
+   break;
+   list_add(&rq->queuelist, &rq_list);
+   } while (blk_mq_dispatch_rq_list(q, &rq_list));
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
-   bool do_sched_dispatch = true;
LIST_HEAD(rq_list);
 
/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -122,30 +136,21 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 * scheduler, we can no longer merge or sort them. So it's best to
 * leave them there for as long as we can. Mark the hw queue as
 * needing a restart in that case.
+*
+* We want to dispatch from the scheduler if there was nothing
+* on the dispatch list or we were able to dispatch from the
+* dispatch list.
 */
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
-   do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
-   } else if (!has_sched_dispatch) {
+   if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
+   blk_mq_do_dispatch_sched(hctx);
+   } else if (has_sched_dispatch) {
+   blk_mq_do_dispatch_sched(hctx);
+   } else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list);
}
-
-   /*
-* We want to dispatch from the scheduler if there was nothing
-* on the dispatch list or we were able to dispatch from the
-* dispatch list.
-*/
-   if (do_sched_dispatch && has_sched_dispatch) {
-   do {
-   struct request *rq;
-
-   rq = e->type->ops.mq.dispatch_request(hctx);
-   if (!rq)
-   break;
-   list_add(&rq->queuelist, &rq_list);
-   } while (blk_mq_dispatch_rq_list(q, &rq_list));
-   }
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-- 
2.9.5

[PATCH V9 1/7] blk-mq-sched: dispatch from scheduler only after progress is made on ->dispatch

2017-10-13 Thread Ming Lei

When hw queue is busy, we shouldn't take requests from
scheduler queue any more, otherwise it is difficult to do
IO merge.

This patch fixes the awful IO performance on some
SCSI devices(lpfc, qla2xxx, ...) when mq-deadline/kyber
is used by not taking requests if hw queue is busy.

Reviewed-by: Omar Sandoval 
Reviewed-by: Bart Van Assche 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 4ab69435708c..eca011fdfa0e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -94,7 +94,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
-   bool did_work = false;
+   bool do_sched_dispatch = true;
LIST_HEAD(rq_list);
 
/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -125,18 +125,18 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 */
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
-   did_work = blk_mq_dispatch_rq_list(q, &rq_list);
+   do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
} else if (!has_sched_dispatch) {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list);
}
 
/*
-* We want to dispatch from the scheduler if we had no work left
-* on the dispatch list, OR if we did have work but weren't able
-* to make progress.
+* We want to dispatch from the scheduler if there was nothing
+* on the dispatch list or we were able to dispatch from the
+* dispatch list.
 */
-   if (!did_work && has_sched_dispatch) {
+   if (do_sched_dispatch && has_sched_dispatch) {
do {
struct request *rq;
 
-- 
2.9.5

[PATCH V9 4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Ming Lei

For SCSI devices, there is often per-request-queue depth, which need
to be respected before queuing one request.

The current blk-mq always dequeues request first, then calls .queue_rq()
to dispatch the request to lld. One obvious issue of this way is that I/O
merge may not be good, because when the per-request-queue depth can't be
respected, .queue_rq() has to return BLK_STS_RESOURCE, then this request
has to stay in hctx->dispatch list, and never got chance to participate
into I/O merge.

This patch introduces .get_budget and .put_budget callback in blk_mq_ops,
then we can try to get reserved budget first before dequeuing request.
Once the budget for queueing I/O can't be satisfied, we don't need to
dequeue request at all, then I/O merge can get improved a lot.

Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c   | 43 +++
 block/blk-mq.c | 33 ++---
 block/blk-mq.h | 21 -
 include/linux/blk-mq.h | 11 +++
 4 files changed, 96 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index be29ba849408..6bb92bbc3135 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,19 +89,36 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx 
*hctx)
return false;
 }
 
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
LIST_HEAD(rq_list);
 
do {
-   struct request *rq = e->type->ops.mq.dispatch_request(hctx);
+   struct request *rq;
+   blk_status_t ret;
 
-   if (!rq)
+   if (e->type->ops.mq.has_work &&
+   !e->type->ops.mq.has_work(hctx))
break;
+
+   ret = blk_mq_get_dispatch_budget(hctx);
+   if (ret == BLK_STS_RESOURCE)
+   return true;
+
+   rq = e->type->ops.mq.dispatch_request(hctx);
+   if (!rq) {
+   blk_mq_put_dispatch_budget(hctx, true);
+   break;
+   } else if (ret != BLK_STS_OK) {
+   blk_mq_end_request(rq, ret);
+   continue;
+   }
list_add(&rq->queuelist, &rq_list);
-   } while (blk_mq_dispatch_rq_list(q, &rq_list));
+   } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+   return false;
 }
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -110,6 +127,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
LIST_HEAD(rq_list);
+   bool run_queue = false;
 
/* RCU or SRCU read lock is needed before checking quiesced flag */
if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
@@ -143,13 +161,22 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 */
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
-   if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
-   blk_mq_do_dispatch_sched(hctx);
+   if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
+   has_sched_dispatch)
+   run_queue = blk_mq_do_dispatch_sched(hctx);
} else if (has_sched_dispatch) {
-   blk_mq_do_dispatch_sched(hctx);
+   run_queue = blk_mq_do_dispatch_sched(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
-   blk_mq_dispatch_rq_list(q, &rq_list);
+   blk_mq_dispatch_rq_list(q, &rq_list, false);
+   }
+
+   if (run_queue) {
+   if (!blk_mq_sched_needs_restart(hctx) &&
+   !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) {
+   blk_mq_sched_mark_restart_hctx(hctx);
+   blk_mq_run_hw_queue(hctx, true);
+   }
}
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 40cba1b1978f..feb89f1fcc63 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1048,7 +1048,8 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx 
*hctx)
return true;
 }
 
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
+bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
+   bool got_budget)
 {
struct blk_mq_hw_ctx *hctx;
struct request *rq;
@@ -1057,6 +1058,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, 
struct list_head *list)
if (list_empty(list))
return false;
 
+   WARN_ON(!list_is_singular(list) && got_budget);
+

[PATCH V9 5/7] blk-mq-sched: improve dispatching from sw queue

2017-10-13 Thread Ming Lei

SCSI devices use host-wide tagset, and the shared driver tag space is
often quite big. Meantime there is also queue depth for each lun(
.cmd_per_lun), which is often small, for example, on both lpfc and
qla2xxx, .cmd_per_lun is just 3.

So lots of requests may stay in sw queue, and we always flush all
belonging to same hw queue and dispatch them all to driver, unfortunately
it is easy to cause queue busy because of the small .cmd_per_lun.
Once these requests are flushed out, they have to stay in hctx->dispatch,
and no bio merge can participate into these requests, and sequential IO
performance is hurt a lot.

This patch introduces blk_mq_dequeue_from_ctx for dequeuing request from
sw queue so that we can dispatch them in scheduler's way, then we can
avoid to dequeue too many requests from sw queue when ->dispatch isn't
flushed completely.

This patch improves dispatching from sw queue by using the callback of
.get_budget and .put_budget

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c   | 68 +++---
 block/blk-mq.c | 39 +
 block/blk-mq.h |  2 ++
 include/linux/blk-mq.h |  2 ++
 4 files changed, 108 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 6bb92bbc3135..92df88351ad2 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -121,6 +121,55 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx 
*hctx)
return false;
 }
 
+static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx)
+{
+   unsigned idx = ctx->index_hw;
+
+   if (++idx == hctx->nr_ctx)
+   idx = 0;
+
+   return hctx->ctxs[idx];
+}
+
+static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+{
+   struct request_queue *q = hctx->queue;
+   LIST_HEAD(rq_list);
+   struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+
+   do {
+   struct request *rq;
+   blk_status_t ret;
+
+   if (!sbitmap_any_bit_set(&hctx->ctx_map))
+   break;
+
+   ret = blk_mq_get_dispatch_budget(hctx);
+   if (ret == BLK_STS_RESOURCE)
+   return true;
+
+   rq = blk_mq_dequeue_from_ctx(hctx, ctx);
+   if (!rq) {
+   blk_mq_put_dispatch_budget(hctx, true);
+   break;
+   } else if (ret != BLK_STS_OK) {
+   blk_mq_end_request(rq, ret);
+   continue;
+   }
+
+   list_add(&rq->queuelist, &rq_list);
+
+   /* round robin for fair dispatch */
+   ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
+
+   } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+   WRITE_ONCE(hctx->dispatch_from, ctx);
+
+   return false;
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
@@ -161,11 +210,24 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 */
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
-   if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
-   has_sched_dispatch)
-   run_queue = blk_mq_do_dispatch_sched(hctx);
+   if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+   if (has_sched_dispatch)
+   run_queue = blk_mq_do_dispatch_sched(hctx);
+   else
+   run_queue = blk_mq_do_dispatch_ctx(hctx);
+   }
} else if (has_sched_dispatch) {
run_queue = blk_mq_do_dispatch_sched(hctx);
+   } else if (q->mq_ops->get_budget) {
+   /*
+* If we need to get budget before queuing request, we
+* dequeue request one by one from sw queue for avoiding
+* to mess up I/O merge when dispatch runs out of resource.
+*
+* TODO: get more budgets, and dequeue more requests in
+* one time.
+*/
+   run_queue = blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list, false);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index feb89f1fcc63..3fdc5662231e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -914,6 +914,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, 
struct list_head *list)
 }
 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
+struct dispatch_rq_data {
+   struct blk_mq_hw_ctx *hctx;
+   struct request *rq;
+};
+
+static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
+   void *data)
+{
+   struct dispatch_rq_

[PATCH V9 6/7] SCSI: allow to pass null rq to scsi_prep_state_check()

2017-10-13 Thread Ming Lei

In the following patch, we will implement scsi_get_budget()
which need to call scsi_prep_state_check() when rq isn't
dequeued yet.

Signed-off-by: Ming Lei 
---
 drivers/scsi/scsi_lib.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9cf6a80fe297..d159bb085714 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1301,7 +1301,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct 
request *req)
/*
 * If the devices is blocked we defer normal commands.
 */
-   if (!(req->rq_flags & RQF_PREEMPT))
+   if (req && !(req->rq_flags & RQF_PREEMPT))
ret = BLKPREP_DEFER;
break;
default:
@@ -1310,7 +1310,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct 
request *req)
 * special commands.  In particular any user initiated
 * command is not allowed.
 */
-   if (!(req->rq_flags & RQF_PREEMPT))
+   if (req && !(req->rq_flags & RQF_PREEMPT))
ret = BLKPREP_KILL;
break;
}
-- 
2.9.5

[PATCH V9 7/7] SCSI: implement .get_budget and .put_budget for blk-mq

2017-10-13 Thread Ming Lei

We need to tell blk-mq for reserving resource before queuing
one request, so implement these two callbacks. Then blk-mq
can avoid to dequeue request earlier, and IO merge can
be improved a lot.

Signed-off-by: Ming Lei 
---
 drivers/scsi/scsi_lib.c | 75 ++---
 1 file changed, 52 insertions(+), 23 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d159bb085714..6f10afaca25b 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1946,25 +1946,32 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
blk_mq_complete_request(cmd->request);
 }
 
-static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
-const struct blk_mq_queue_data *bd)
+static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
 {
-   struct request *req = bd->rq;
-   struct request_queue *q = req->q;
+   struct request_queue *q = hctx->queue;
+   struct scsi_device *sdev = q->queuedata;
+   struct Scsi_Host *shost = sdev->host;
+
+   atomic_dec(&shost->host_busy);
+   if (scsi_target(sdev)->can_queue > 0)
+   atomic_dec(&scsi_target(sdev)->target_busy);
+   atomic_dec(&sdev->device_busy);
+   put_device(&sdev->sdev_gendev);
+}
+
+static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
+{
+   struct request_queue *q = hctx->queue;
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost = sdev->host;
-   struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
blk_status_t ret;
-   int reason;
 
-   ret = prep_to_mq(scsi_prep_state_check(sdev, req));
-   if (ret != BLK_STS_OK)
-   goto out;
+   ret = prep_to_mq(scsi_prep_state_check(sdev, NULL));
+   if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK)
+   return ret;
 
-   ret = BLK_STS_RESOURCE;
if (!get_device(&sdev->sdev_gendev))
goto out;
-
if (!scsi_dev_queue_ready(q, sdev))
goto out_put_device;
if (!scsi_target_queue_ready(shost, sdev))
@@ -1972,10 +1979,38 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx 
*hctx,
if (!scsi_host_queue_ready(q, shost, sdev))
goto out_dec_target_busy;
 
+   return BLK_STS_OK;
+
+out_dec_target_busy:
+   if (scsi_target(sdev)->can_queue > 0)
+   atomic_dec(&scsi_target(sdev)->target_busy);
+out_dec_device_busy:
+   atomic_dec(&sdev->device_busy);
+out_put_device:
+   put_device(&sdev->sdev_gendev);
+out:
+   return BLK_STS_RESOURCE;
+}
+
+static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
+const struct blk_mq_queue_data *bd)
+{
+   struct request *req = bd->rq;
+   struct request_queue *q = req->q;
+   struct scsi_device *sdev = q->queuedata;
+   struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
+   blk_status_t ret;
+   int reason;
+
+   ret = prep_to_mq(scsi_prep_state_check(sdev, req));
+   if (ret != BLK_STS_OK)
+   goto out_put_budget;
+
+   ret = BLK_STS_RESOURCE;
if (!(req->rq_flags & RQF_DONTPREP)) {
ret = prep_to_mq(scsi_mq_prep_fn(req));
if (ret != BLK_STS_OK)
-   goto out_dec_host_busy;
+   goto out_put_budget;
req->rq_flags |= RQF_DONTPREP;
} else {
blk_mq_start_request(req);
@@ -1993,21 +2028,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx 
*hctx,
if (reason) {
scsi_set_blocked(cmd, reason);
ret = BLK_STS_RESOURCE;
-   goto out_dec_host_busy;
+   goto out_put_budget;
}
 
return BLK_STS_OK;
 
-out_dec_host_busy:
-   atomic_dec(&shost->host_busy);
-out_dec_target_busy:
-   if (scsi_target(sdev)->can_queue > 0)
-   atomic_dec(&scsi_target(sdev)->target_busy);
-out_dec_device_busy:
-   atomic_dec(&sdev->device_busy);
-out_put_device:
-   put_device(&sdev->sdev_gendev);
-out:
+out_put_budget:
+   scsi_mq_put_budget(hctx);
switch (ret) {
case BLK_STS_OK:
break;
@@ -2211,6 +2238,8 @@ struct request_queue *scsi_old_alloc_queue(struct 
scsi_device *sdev)
 }
 
 static const struct blk_mq_ops scsi_mq_ops = {
+   .get_budget = scsi_mq_get_budget,
+   .put_budget = scsi_mq_put_budget,
.queue_rq   = scsi_queue_rq,
.complete   = scsi_softirq_done,
.timeout= scsi_timeout,
-- 
2.9.5

[PATCH V9 3/7] sbitmap: introduce __sbitmap_for_each_set()

2017-10-13 Thread Ming Lei

We need to iterate ctx starting from any ctx in round robin
way, so introduce this helper.

Reviewed-by: Omar Sandoval 
Cc: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/sbitmap.h | 64 -
 1 file changed, 47 insertions(+), 17 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index a1904aadbc45..0dcc60e820de 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb);
  */
 bool sbitmap_any_bit_clear(const struct sbitmap *sb);
 
+#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
+#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
+
 typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);
 
 /**
- * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * @start: Where to start the iteration.
  * @sb: Bitmap to iterate over.
  * @fn: Callback. Should return true to continue or false to break early.
  * @data: Pointer to pass to callback.
@@ -222,35 +226,61 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned 
int, void *);
  * This is inline even though it's non-trivial so that the function calls to 
the
  * callback will hopefully get optimized away.
  */
-static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
-   void *data)
+static inline void __sbitmap_for_each_set(struct sbitmap *sb,
+ unsigned int start,
+ sb_for_each_fn fn, void *data)
 {
-   unsigned int i;
+   unsigned int index;
+   unsigned int nr;
+   unsigned int scanned = 0;
 
-   for (i = 0; i < sb->map_nr; i++) {
-   struct sbitmap_word *word = &sb->map[i];
-   unsigned int off, nr;
+   if (start >= sb->depth)
+   start = 0;
+   index = SB_NR_TO_INDEX(sb, start);
+   nr = SB_NR_TO_BIT(sb, start);
 
-   if (!word->word)
-   continue;
+   while (scanned < sb->depth) {
+   struct sbitmap_word *word = &sb->map[index];
+   unsigned int depth = min_t(unsigned int, word->depth - nr,
+  sb->depth - scanned);
 
-   nr = 0;
-   off = i << sb->shift;
+   scanned += depth;
+   if (!word->word)
+   goto next;
+
+   /*
+* On the first iteration of the outer loop, we need to add the
+* bit offset back to the size of the word for find_next_bit().
+* On all other iterations, nr is zero, so this is a noop.
+*/
+   depth += nr;
while (1) {
-   nr = find_next_bit(&word->word, word->depth, nr);
-   if (nr >= word->depth)
+   nr = find_next_bit(&word->word, depth, nr);
+   if (nr >= depth)
break;
-
-   if (!fn(sb, off + nr, data))
+   if (!fn(sb, (index << sb->shift) + nr, data))
return;
 
nr++;
}
+next:
+   nr = 0;
+   if (++index >= sb->map_nr)
+   index = 0;
}
 }
 
-#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
-#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
+/**
+ * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * @sb: Bitmap to iterate over.
+ * @fn: Callback. Should return true to continue or false to break early.
+ * @data: Pointer to pass to callback.
+ */
+static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
+   void *data)
+{
+   __sbitmap_for_each_set(sb, 0, fn, data);
+}
 
 static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
unsigned int bitnr)
-- 
2.9.5

[PATCH V9 0/7] blk-mq-sched: improve sequential I/O performance

2017-10-13 Thread Ming Lei

Hi Jens,

In Red Hat internal storage test wrt. blk-mq scheduler, we found that I/O
performance is much bad with mq-deadline, especially about sequential I/O
on some multi-queue SCSI devcies(lpfc, qla2xxx, SRP...)

Turns out one big issue causes the performance regression: requests are
still dequeued from sw queue/scheduler queue even when ldd's queue is
busy, so I/O merge becomes quite difficult to make, then sequential IO
performance degrades a lot.

This issue becomes one of mains reasons for reverting default SCSI_MQ
in V4.13.

This 8 patches improve this situation, and brings back performance loss.

With this change, SCSI-MQ sequential I/O performance is improved much, Paolo
reported that mq-deadline performance improved much[2] in his dbench test
wrt V2. Also performance improvement on lpfc/qla2xx was observed with V1.[1]

[1] http://marc.info/?l=linux-block&m=150151989915776&w=2
[2] https://marc.info/?l=linux-block&m=150217980602843&w=2

gitweb:

https://github.com/ming1/linux/commits/blk_mq_improve_scsi_mpath_perf_V9_part1

V9:
- change title of patch1
- rename blk_mq_get_budget() as blk_mq_get_dispatch_budget()
- make check on .get_budget/.put_budget cleaner
- all are suggested by Jens, thank for review

V8:
- introduce helper of blk_mq_get_budget
- handle failure path of get_budget

V7:
- introduce .get_budget/.put_budget, and IO merge gets improved
compared with V6, and in theory this approach is better than the way
taken in block legacy

- drop patch of 'blk-mq-sched: don't dequeue request until all in 
->dispatch are flushed'

V6:
- address comments from Christoph
- drop the 1st patch which changes blk_mq_request_bypass_insert(),
which should belong to dm-mpath's improvement
- move ' blk-mq-sched: move actual dispatching into one helper'
as 2nd patch, and use the introduced helper to simplify dispatch
logic
- merge two previous patches into one for improving dispatch from sw 
queue
- make comment/commit log line width as ~70, as suggested by
  Christoph

V5:
- address some comments from Omar
- add Tested-by & Reveiewed-by tag
- use direct issue for blk_mq_request_bypass_insert(), and
start to consider to improve sequential I/O for dm-mpath
- only include part 1(the original patch 1 ~ 6), as suggested
by Omar

V4:
- add Reviewed-by tag
- some trival change: typo fix in commit log or comment,
variable name, no actual functional change

V3:
- totally round robin for picking req from ctx, as suggested
by Bart
- remove one local variable in __sbitmap_for_each_set()
- drop patches of single dispatch list, which can improve
performance on mq-deadline, but cause a bit degrade on
none because all hctxs need to be checked after ->dispatch
is flushed. Will post it again once it is mature.
- rebase on v4.13-rc6 with block for-next

V2:
- dequeue request from sw queues in round roubin's style
as suggested by Bart, and introduces one helper in sbitmap
for this purpose
- improve bio merge via hash table from sw queue
- add comments about using DISPATCH_BUSY state in lockless way,
simplifying handling on busy state,
- hold ctx->lock when clearing ctx busy bit as suggested
by Bart

Ming Lei (7):
  blk-mq-sched: dispatch from scheduler only after progress is made on
->dispatch
  blk-mq-sched: move actual dispatching into one helper
  sbitmap: introduce __sbitmap_for_each_set()
  blk-mq: introduce .get_budget and .put_budget in blk_mq_ops
  blk-mq-sched: improve dispatching from sw queue
  SCSI: allow to pass null rq to scsi_prep_state_check()
  SCSI: implement .get_budget and .put_budget for blk-mq

 block/blk-mq-sched.c| 130 +---
 block/blk-mq.c  |  72 +--
 block/blk-mq.h  |  23 -
 drivers/scsi/scsi_lib.c |  79 +++--
 include/linux/blk-mq.h  |  13 +
 include/linux/sbitmap.h |  64 +---
 6 files changed, 317 insertions(+), 64 deletions(-)

-- 
2.9.5

Re: [PATCH 1/8] lib: Introduce sgl_alloc() and sgl_free()

2017-10-13 Thread Bart Van Assche

On Fri, 2017-10-13 at 10:43 -0700, Randy Dunlap wrote:
> On 10/12/17 15:45, Bart Van Assche wrote:
> > + * @sg: Scatterlist with one or more elements
> 
>   @sgl:

Thanks Randy. I will fix this before I repost this series.

Bart.

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Bart Van Assche

On Sat, 2017-10-14 at 01:29 +0800, Ming Lei wrote:
> ->can_queue is size of the whole tag space shared by all LUNs, looks it isn't
> reasonable to increase cmd_per_lun to .can_queue.

Sorry but I disagree. Setting cmd_per_lun to a value lower than can_queue
will result in suboptimal performance if there is only a single LUN per SCSI
host. If there are multiple LUNs per SCSI host then the blk-mq core tracks
the number of active LUNs through the blk_mq_tags.active_queues variable.
See also hctx_may_queue(). The comment above that function is as follows:

/*
 * For shared tag users, we track the number of currently active users
 * and attempt to provide a fair share of the tag depth for each of them.
 */

BTW, the ib_srp initiator driver sets cmd_per_lun to can_queue and is able
to achieve more than one million IOPS even in tests with multiple LUNs per
SCSI host.

Bart.

Re: [PATCH 1/8] lib: Introduce sgl_alloc() and sgl_free()

2017-10-13 Thread Randy Dunlap

On 10/12/17 15:45, Bart Van Assche wrote:

> diff --git a/lib/sgl_alloc.c b/lib/sgl_alloc.c
> new file mode 100644
> index ..d96b395dd5c8
> --- /dev/null
> +++ b/lib/sgl_alloc.c
> @@ -0,0 +1,102 @@

> +/**
> + * sgl_free_order - free a scatterlist and its pages
> + * @sg: Scatterlist with one or more elements

  @sgl:

> + * @order: Second argument for __free_pages()
> + */
> +void sgl_free_order(struct scatterlist *sgl, int order)
> +{
> + struct scatterlist *sg;
> + struct page *page;
> +
> + for (sg = sgl; sg; sg = sg_next(sg)) {
> + page = sg_page(sg);
> + if (page)
> + __free_pages(page, order);
> + }
> + kfree(sgl);
> +}
> +EXPORT_SYMBOL(sgl_free_order);
> +
> +/**
> + * sgl_free - free a scatterlist and its pages
> + * @sg: Scatterlist with one or more elements

  @sgl:

> + */
> +void sgl_free(struct scatterlist *sgl)
> +{
> + sgl_free_order(sgl, 0);
> +}
> +EXPORT_SYMBOL(sgl_free);
> 

ta.
-- 
~Randy

Re: [PATCH V8 4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Jens Axboe

On 10/13/2017 11:24 AM, Ming Lei wrote:
> For SCSI devices, there is often per-request-queue depth, which need
> to be respected before queuing one request.
> 
> The current blk-mq always dequeues request first, then calls .queue_rq()
> to dispatch the request to lld. One obvious issue of this way is that I/O
> merge may not be good, because when the per-request-queue depth can't be
> respected, .queue_rq() has to return BLK_STS_RESOURCE, then this request
> has to stay in hctx->dispatch list, and never got chance to participate
> into I/O merge.
> 
> This patch introduces .get_budget and .put_budget callback in blk_mq_ops,
> then we can try to get reserved budget first before dequeuing request.
> Once the budget for queueing I/O can't be satisfied, we don't need to
> dequeue request at all, then I/O merge can get improved a lot.

Still think this should be blk_mq_get_dispatch_budget(), like in the
incremental I sent out. That way you actually know what it is doing,
get_budget() could be anything.

> @@ -2582,6 +2606,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
>   if (!set->ops->queue_rq)
>   return -EINVAL;
>  
> + if ((!!set->ops->get_budget) != (!!set->ops->put_budget))
> + return -EINVAL;

if (!set->ops->get_budget ^ !set->ops->put_budget)

is cleaner, imho.

-- 
Jens Axboe

Re: [PATCH V8 1/7] blk-mq-sched: fix scheduler bad performance

2017-10-13 Thread Jens Axboe

On 10/13/2017 11:24 AM, Ming Lei wrote:
> When hw queue is busy, we shouldn't take requests from
> scheduler queue any more, otherwise it is difficult to do
> IO merge.
> 
> This patch fixes the awful IO performance on some
> SCSI devices(lpfc, qla2xxx, ...) when mq-deadline/kyber
> is used by not taking requests if hw queue is busy.

This looks fine to me, but needs a much better patch title.
"fix scheduler bad performance" tells you nothing.

-- 
Jens Axboe

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Ming Lei

On Fri, Oct 13, 2017 at 05:08:52PM +, Bart Van Assche wrote:
> On Sat, 2017-10-14 at 00:45 +0800, Ming Lei wrote:
> > On Fri, Oct 13, 2017 at 04:31:04PM +, Bart Van Assche wrote:
> > > On Sat, 2017-10-14 at 00:07 +0800, Ming Lei wrote:
> > > > Actually it is in hot path, for example, lpfc and qla2xx's queue depth 
> > > > is 3,
> > > 
> > > Sorry but I doubt whether that is correct. More in general, I don't know 
> > > any modern
> > > storage HBA for which the default queue depth is so low.
> > 
> > You can grep:
> > 
> > [ming@ming linux]$ git grep -n cmd_per_lun ./drivers/scsi/ | grep -E 
> > "qla2xxx|lpfc"
> 
> Such a low queue depth will result in suboptimal performance for adapters
> that communicate over a storage network. I think that's a bug and that both
> adapters support much higher cmd_per_lun values.
> 
> (+James Smart)
> 
> James, can you explain us why commit 445cf4f4d2aa decreased LPFC_CMD_PER_LUN
> from 30 to 3? Was that perhaps a workaround for a bug in a specific target
> implementation?
> 
> (+Himanshu Madhani)
> 
> Himanshu, do you perhaps know whether it is safe to increase cmd_per_lun for
> the qla2xxx initiator driver to the scsi_host->can_queue value?

->can_queue is size of the whole tag space shared by all LUNs, looks it isn't
reasonable to increase cmd_per_lun to .can_queue.

> 
> > Even SRP/IB isn't big too, just 32.
> 
> The default value for ib_srp for cmd_per_lun is 62 but that value can be
> overridden by selecting another value in /etc/default/srp_daemon.conf. Note:
> a lower value is selected if after SRP login it becomes clear that the target
> queue depth is lower than the cmd_per_lun value requested by the user. This
> is a performance optimization and avoids that the SRP target system has to
> send back BUSY responses to the initiator.

OK, thanks for sharing, I just read it as 32 in Laurence's machine.

-- 
Ming

[PATCH V8 0/7] blk-mq-sched: improve sequential I/O performance

2017-10-13 Thread Ming Lei

Hi Jens,

In Red Hat internal storage test wrt. blk-mq scheduler, we found that I/O
performance is much bad with mq-deadline, especially about sequential I/O
on some multi-queue SCSI devcies(lpfc, qla2xxx, SRP...)

Turns out one big issue causes the performance regression: requests are
still dequeued from sw queue/scheduler queue even when ldd's queue is
busy, so I/O merge becomes quite difficult to make, then sequential IO
performance degrades a lot.

This issue becomes one of mains reasons for reverting default SCSI_MQ
in V4.13.

This 8 patches improve this situation, and brings back performance loss.

With this change, SCSI-MQ sequential I/O performance is improved much, Paolo
reported that mq-deadline performance improved much[2] in his dbench test
wrt V2. Also performance improvement on lpfc/qla2xx was observed with V1.[1]

[1] http://marc.info/?l=linux-block&m=150151989915776&w=2
[2] https://marc.info/?l=linux-block&m=150217980602843&w=2

gitweb:

https://github.com/ming1/linux/commits/blk_mq_improve_scsi_mpath_perf_V8_part1

V8:
- introduce helper of blk_mq_get_budget
- handle failure path of get_budget

V7:
- introduce .get_budget/.put_budget, and IO merge gets improved
compared with V6, and in theory this approach is better than the way
taken in block legacy

- drop patch of 'blk-mq-sched: don't dequeue request until all in 
->dispatch are flushed'

V6:
- address comments from Christoph
- drop the 1st patch which changes blk_mq_request_bypass_insert(),
which should belong to dm-mpath's improvement
- move ' blk-mq-sched: move actual dispatching into one helper'
as 2nd patch, and use the introduced helper to simplify dispatch
logic
- merge two previous patches into one for improving dispatch from sw 
queue
- make comment/commit log line width as ~70, as suggested by
  Christoph

V5:
- address some comments from Omar
- add Tested-by & Reveiewed-by tag
- use direct issue for blk_mq_request_bypass_insert(), and
start to consider to improve sequential I/O for dm-mpath
- only include part 1(the original patch 1 ~ 6), as suggested
by Omar

V4:
- add Reviewed-by tag
- some trival change: typo fix in commit log or comment,
variable name, no actual functional change

V3:
- totally round robin for picking req from ctx, as suggested
by Bart
- remove one local variable in __sbitmap_for_each_set()
- drop patches of single dispatch list, which can improve
performance on mq-deadline, but cause a bit degrade on
none because all hctxs need to be checked after ->dispatch
is flushed. Will post it again once it is mature.
- rebase on v4.13-rc6 with block for-next

V2:
- dequeue request from sw queues in round roubin's style
as suggested by Bart, and introduces one helper in sbitmap
for this purpose
- improve bio merge via hash table from sw queue
- add comments about using DISPATCH_BUSY state in lockless way,
simplifying handling on busy state,
- hold ctx->lock when clearing ctx busy bit as suggested
by Bart

Ming Lei (7):
  blk-mq-sched: fix scheduler bad performance
  blk-mq-sched: move actual dispatching into one helper
  sbitmap: introduce __sbitmap_for_each_set()
  blk-mq: introduce .get_budget and .put_budget in blk_mq_ops
  blk-mq-sched: improve dispatching from sw queue
  SCSI: allow to pass null rq to scsi_prep_state_check()
  SCSI: implement .get_budget and .put_budget for blk-mq

 block/blk-mq-sched.c| 130 +---
 block/blk-mq.c  |  72 +--
 block/blk-mq.h  |  22 +++-
 drivers/scsi/scsi_lib.c |  79 +++--
 include/linux/blk-mq.h  |  13 +
 include/linux/sbitmap.h |  64 +---
 6 files changed, 316 insertions(+), 64 deletions(-)

-- 
2.9.5

[PATCH V8 2/7] blk-mq-sched: move actual dispatching into one helper

2017-10-13 Thread Ming Lei

So that it becomes easy to support to dispatch from sw queue in the
following patch.

No functional change.

Reviewed-by: Bart Van Assche 
Reviewed-by: Omar Sandoval 
Suggested-by: Christoph Hellwig  # for simplifying dispatch logic
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 43 ---
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index eca011fdfa0e..be29ba849408 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,12 +89,26 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx 
*hctx)
return false;
 }
 
+static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+{
+   struct request_queue *q = hctx->queue;
+   struct elevator_queue *e = q->elevator;
+   LIST_HEAD(rq_list);
+
+   do {
+   struct request *rq = e->type->ops.mq.dispatch_request(hctx);
+
+   if (!rq)
+   break;
+   list_add(&rq->queuelist, &rq_list);
+   } while (blk_mq_dispatch_rq_list(q, &rq_list));
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
-   bool do_sched_dispatch = true;
LIST_HEAD(rq_list);
 
/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -122,30 +136,21 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 * scheduler, we can no longer merge or sort them. So it's best to
 * leave them there for as long as we can. Mark the hw queue as
 * needing a restart in that case.
+*
+* We want to dispatch from the scheduler if there was nothing
+* on the dispatch list or we were able to dispatch from the
+* dispatch list.
 */
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
-   do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
-   } else if (!has_sched_dispatch) {
+   if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
+   blk_mq_do_dispatch_sched(hctx);
+   } else if (has_sched_dispatch) {
+   blk_mq_do_dispatch_sched(hctx);
+   } else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list);
}
-
-   /*
-* We want to dispatch from the scheduler if there was nothing
-* on the dispatch list or we were able to dispatch from the
-* dispatch list.
-*/
-   if (do_sched_dispatch && has_sched_dispatch) {
-   do {
-   struct request *rq;
-
-   rq = e->type->ops.mq.dispatch_request(hctx);
-   if (!rq)
-   break;
-   list_add(&rq->queuelist, &rq_list);
-   } while (blk_mq_dispatch_rq_list(q, &rq_list));
-   }
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-- 
2.9.5

[PATCH V8 1/7] blk-mq-sched: fix scheduler bad performance

2017-10-13 Thread Ming Lei

When hw queue is busy, we shouldn't take requests from
scheduler queue any more, otherwise it is difficult to do
IO merge.

This patch fixes the awful IO performance on some
SCSI devices(lpfc, qla2xxx, ...) when mq-deadline/kyber
is used by not taking requests if hw queue is busy.

Reviewed-by: Omar Sandoval 
Reviewed-by: Bart Van Assche 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 4ab69435708c..eca011fdfa0e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -94,7 +94,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
-   bool did_work = false;
+   bool do_sched_dispatch = true;
LIST_HEAD(rq_list);
 
/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -125,18 +125,18 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 */
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
-   did_work = blk_mq_dispatch_rq_list(q, &rq_list);
+   do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
} else if (!has_sched_dispatch) {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list);
}
 
/*
-* We want to dispatch from the scheduler if we had no work left
-* on the dispatch list, OR if we did have work but weren't able
-* to make progress.
+* We want to dispatch from the scheduler if there was nothing
+* on the dispatch list or we were able to dispatch from the
+* dispatch list.
 */
-   if (!did_work && has_sched_dispatch) {
+   if (do_sched_dispatch && has_sched_dispatch) {
do {
struct request *rq;
 
-- 
2.9.5

[PATCH V8 4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Ming Lei

For SCSI devices, there is often per-request-queue depth, which need
to be respected before queuing one request.

The current blk-mq always dequeues request first, then calls .queue_rq()
to dispatch the request to lld. One obvious issue of this way is that I/O
merge may not be good, because when the per-request-queue depth can't be
respected, .queue_rq() has to return BLK_STS_RESOURCE, then this request
has to stay in hctx->dispatch list, and never got chance to participate
into I/O merge.

This patch introduces .get_budget and .put_budget callback in blk_mq_ops,
then we can try to get reserved budget first before dequeuing request.
Once the budget for queueing I/O can't be satisfied, we don't need to
dequeue request at all, then I/O merge can get improved a lot.

Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c   | 43 +++
 block/blk-mq.c | 33 ++---
 block/blk-mq.h | 20 +++-
 include/linux/blk-mq.h | 11 +++
 4 files changed, 95 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index be29ba849408..cd1c0caae16a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,19 +89,36 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx 
*hctx)
return false;
 }
 
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
LIST_HEAD(rq_list);
 
do {
-   struct request *rq = e->type->ops.mq.dispatch_request(hctx);
+   struct request *rq;
+   blk_status_t ret;
 
-   if (!rq)
+   if (e->type->ops.mq.has_work &&
+   !e->type->ops.mq.has_work(hctx))
break;
+
+   ret = blk_mq_get_budget(hctx);
+   if (ret == BLK_STS_RESOURCE)
+   return true;
+
+   rq = e->type->ops.mq.dispatch_request(hctx);
+   if (!rq) {
+   blk_mq_put_budget(hctx, true);
+   break;
+   } else if (ret != BLK_STS_OK) {
+   blk_mq_end_request(rq, ret);
+   continue;
+   }
list_add(&rq->queuelist, &rq_list);
-   } while (blk_mq_dispatch_rq_list(q, &rq_list));
+   } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+   return false;
 }
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -110,6 +127,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
LIST_HEAD(rq_list);
+   bool run_queue = false;
 
/* RCU or SRCU read lock is needed before checking quiesced flag */
if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
@@ -143,13 +161,22 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 */
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
-   if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
-   blk_mq_do_dispatch_sched(hctx);
+   if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
+   has_sched_dispatch)
+   run_queue = blk_mq_do_dispatch_sched(hctx);
} else if (has_sched_dispatch) {
-   blk_mq_do_dispatch_sched(hctx);
+   run_queue = blk_mq_do_dispatch_sched(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
-   blk_mq_dispatch_rq_list(q, &rq_list);
+   blk_mq_dispatch_rq_list(q, &rq_list, false);
+   }
+
+   if (run_queue) {
+   if (!blk_mq_sched_needs_restart(hctx) &&
+   !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) {
+   blk_mq_sched_mark_restart_hctx(hctx);
+   blk_mq_run_hw_queue(hctx, true);
+   }
}
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 40cba1b1978f..24c1b80d4312 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1048,7 +1048,8 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx 
*hctx)
return true;
 }
 
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
+bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
+   bool got_budget)
 {
struct blk_mq_hw_ctx *hctx;
struct request *rq;
@@ -1057,6 +1058,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, 
struct list_head *list)
if (list_empty(list))
return false;
 
+   WARN_ON(!list_is_singular(list) && got_budget);
+
/*
 * Now

[PATCH V8 3/7] sbitmap: introduce __sbitmap_for_each_set()

2017-10-13 Thread Ming Lei

We need to iterate ctx starting from any ctx in round robin
way, so introduce this helper.

Reviewed-by: Omar Sandoval 
Cc: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/sbitmap.h | 64 -
 1 file changed, 47 insertions(+), 17 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index a1904aadbc45..0dcc60e820de 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb);
  */
 bool sbitmap_any_bit_clear(const struct sbitmap *sb);
 
+#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
+#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
+
 typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);
 
 /**
- * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * @start: Where to start the iteration.
  * @sb: Bitmap to iterate over.
  * @fn: Callback. Should return true to continue or false to break early.
  * @data: Pointer to pass to callback.
@@ -222,35 +226,61 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned 
int, void *);
  * This is inline even though it's non-trivial so that the function calls to 
the
  * callback will hopefully get optimized away.
  */
-static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
-   void *data)
+static inline void __sbitmap_for_each_set(struct sbitmap *sb,
+ unsigned int start,
+ sb_for_each_fn fn, void *data)
 {
-   unsigned int i;
+   unsigned int index;
+   unsigned int nr;
+   unsigned int scanned = 0;
 
-   for (i = 0; i < sb->map_nr; i++) {
-   struct sbitmap_word *word = &sb->map[i];
-   unsigned int off, nr;
+   if (start >= sb->depth)
+   start = 0;
+   index = SB_NR_TO_INDEX(sb, start);
+   nr = SB_NR_TO_BIT(sb, start);
 
-   if (!word->word)
-   continue;
+   while (scanned < sb->depth) {
+   struct sbitmap_word *word = &sb->map[index];
+   unsigned int depth = min_t(unsigned int, word->depth - nr,
+  sb->depth - scanned);
 
-   nr = 0;
-   off = i << sb->shift;
+   scanned += depth;
+   if (!word->word)
+   goto next;
+
+   /*
+* On the first iteration of the outer loop, we need to add the
+* bit offset back to the size of the word for find_next_bit().
+* On all other iterations, nr is zero, so this is a noop.
+*/
+   depth += nr;
while (1) {
-   nr = find_next_bit(&word->word, word->depth, nr);
-   if (nr >= word->depth)
+   nr = find_next_bit(&word->word, depth, nr);
+   if (nr >= depth)
break;
-
-   if (!fn(sb, off + nr, data))
+   if (!fn(sb, (index << sb->shift) + nr, data))
return;
 
nr++;
}
+next:
+   nr = 0;
+   if (++index >= sb->map_nr)
+   index = 0;
}
 }
 
-#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
-#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
+/**
+ * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * @sb: Bitmap to iterate over.
+ * @fn: Callback. Should return true to continue or false to break early.
+ * @data: Pointer to pass to callback.
+ */
+static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
+   void *data)
+{
+   __sbitmap_for_each_set(sb, 0, fn, data);
+}
 
 static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
unsigned int bitnr)
-- 
2.9.5

[PATCH V8 5/7] blk-mq-sched: improve dispatching from sw queue

2017-10-13 Thread Ming Lei

SCSI devices use host-wide tagset, and the shared driver tag space is
often quite big. Meantime there is also queue depth for each lun(
.cmd_per_lun), which is often small, for example, on both lpfc and
qla2xxx, .cmd_per_lun is just 3.

So lots of requests may stay in sw queue, and we always flush all
belonging to same hw queue and dispatch them all to driver, unfortunately
it is easy to cause queue busy because of the small .cmd_per_lun.
Once these requests are flushed out, they have to stay in hctx->dispatch,
and no bio merge can participate into these requests, and sequential IO
performance is hurt a lot.

This patch introduces blk_mq_dequeue_from_ctx for dequeuing request from
sw queue so that we can dispatch them in scheduler's way, then we can
avoid to dequeue too many requests from sw queue when ->dispatch isn't
flushed completely.

This patch improves dispatching from sw queue by using the callback of
.get_budget and .put_budget

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c   | 68 +++---
 block/blk-mq.c | 39 +
 block/blk-mq.h |  2 ++
 include/linux/blk-mq.h |  2 ++
 4 files changed, 108 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index cd1c0caae16a..78a862eabdec 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -121,6 +121,55 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx 
*hctx)
return false;
 }
 
+static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx)
+{
+   unsigned idx = ctx->index_hw;
+
+   if (++idx == hctx->nr_ctx)
+   idx = 0;
+
+   return hctx->ctxs[idx];
+}
+
+static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+{
+   struct request_queue *q = hctx->queue;
+   LIST_HEAD(rq_list);
+   struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+
+   do {
+   struct request *rq;
+   blk_status_t ret;
+
+   if (!sbitmap_any_bit_set(&hctx->ctx_map))
+   break;
+
+   ret = blk_mq_get_budget(hctx);
+   if (ret == BLK_STS_RESOURCE)
+   return true;
+
+   rq = blk_mq_dequeue_from_ctx(hctx, ctx);
+   if (!rq) {
+   blk_mq_put_budget(hctx, true);
+   break;
+   } else if (ret != BLK_STS_OK) {
+   blk_mq_end_request(rq, ret);
+   continue;
+   }
+
+   list_add(&rq->queuelist, &rq_list);
+
+   /* round robin for fair dispatch */
+   ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
+
+   } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+   WRITE_ONCE(hctx->dispatch_from, ctx);
+
+   return false;
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
@@ -161,11 +210,24 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 */
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
-   if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
-   has_sched_dispatch)
-   run_queue = blk_mq_do_dispatch_sched(hctx);
+   if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+   if (has_sched_dispatch)
+   run_queue = blk_mq_do_dispatch_sched(hctx);
+   else
+   run_queue = blk_mq_do_dispatch_ctx(hctx);
+   }
} else if (has_sched_dispatch) {
run_queue = blk_mq_do_dispatch_sched(hctx);
+   } else if (q->mq_ops->get_budget) {
+   /*
+* If we need to get budget before queuing request, we
+* dequeue request one by one from sw queue for avoiding
+* to mess up I/O merge when dispatch runs out of resource.
+*
+* TODO: get more budgets, and dequeue more requests in
+* one time.
+*/
+   run_queue = blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list, false);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 24c1b80d4312..2416db4dc98b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -914,6 +914,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, 
struct list_head *list)
 }
 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
+struct dispatch_rq_data {
+   struct blk_mq_hw_ctx *hctx;
+   struct request *rq;
+};
+
+static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
+   void *data)
+{
+   struct dispatch_rq_data *dispatch_dat

[PATCH V8 6/7] SCSI: allow to pass null rq to scsi_prep_state_check()

2017-10-13 Thread Ming Lei

In the following patch, we will implement scsi_get_budget()
which need to call scsi_prep_state_check() when rq isn't
dequeued yet.

Signed-off-by: Ming Lei 
---
 drivers/scsi/scsi_lib.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9cf6a80fe297..d159bb085714 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1301,7 +1301,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct 
request *req)
/*
 * If the devices is blocked we defer normal commands.
 */
-   if (!(req->rq_flags & RQF_PREEMPT))
+   if (req && !(req->rq_flags & RQF_PREEMPT))
ret = BLKPREP_DEFER;
break;
default:
@@ -1310,7 +1310,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct 
request *req)
 * special commands.  In particular any user initiated
 * command is not allowed.
 */
-   if (!(req->rq_flags & RQF_PREEMPT))
+   if (req && !(req->rq_flags & RQF_PREEMPT))
ret = BLKPREP_KILL;
break;
}
-- 
2.9.5

[PATCH V8 7/7] SCSI: implement .get_budget and .put_budget for blk-mq

2017-10-13 Thread Ming Lei

We need to tell blk-mq for reserving resource before queuing
one request, so implement these two callbacks. Then blk-mq
can avoid to dequeue request earlier, and IO merge can
be improved a lot.

Signed-off-by: Ming Lei 
---
 drivers/scsi/scsi_lib.c | 75 ++---
 1 file changed, 52 insertions(+), 23 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d159bb085714..6f10afaca25b 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1946,25 +1946,32 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
blk_mq_complete_request(cmd->request);
 }
 
-static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
-const struct blk_mq_queue_data *bd)
+static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
 {
-   struct request *req = bd->rq;
-   struct request_queue *q = req->q;
+   struct request_queue *q = hctx->queue;
+   struct scsi_device *sdev = q->queuedata;
+   struct Scsi_Host *shost = sdev->host;
+
+   atomic_dec(&shost->host_busy);
+   if (scsi_target(sdev)->can_queue > 0)
+   atomic_dec(&scsi_target(sdev)->target_busy);
+   atomic_dec(&sdev->device_busy);
+   put_device(&sdev->sdev_gendev);
+}
+
+static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
+{
+   struct request_queue *q = hctx->queue;
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost = sdev->host;
-   struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
blk_status_t ret;
-   int reason;
 
-   ret = prep_to_mq(scsi_prep_state_check(sdev, req));
-   if (ret != BLK_STS_OK)
-   goto out;
+   ret = prep_to_mq(scsi_prep_state_check(sdev, NULL));
+   if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK)
+   return ret;
 
-   ret = BLK_STS_RESOURCE;
if (!get_device(&sdev->sdev_gendev))
goto out;
-
if (!scsi_dev_queue_ready(q, sdev))
goto out_put_device;
if (!scsi_target_queue_ready(shost, sdev))
@@ -1972,10 +1979,38 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx 
*hctx,
if (!scsi_host_queue_ready(q, shost, sdev))
goto out_dec_target_busy;
 
+   return BLK_STS_OK;
+
+out_dec_target_busy:
+   if (scsi_target(sdev)->can_queue > 0)
+   atomic_dec(&scsi_target(sdev)->target_busy);
+out_dec_device_busy:
+   atomic_dec(&sdev->device_busy);
+out_put_device:
+   put_device(&sdev->sdev_gendev);
+out:
+   return BLK_STS_RESOURCE;
+}
+
+static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
+const struct blk_mq_queue_data *bd)
+{
+   struct request *req = bd->rq;
+   struct request_queue *q = req->q;
+   struct scsi_device *sdev = q->queuedata;
+   struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
+   blk_status_t ret;
+   int reason;
+
+   ret = prep_to_mq(scsi_prep_state_check(sdev, req));
+   if (ret != BLK_STS_OK)
+   goto out_put_budget;
+
+   ret = BLK_STS_RESOURCE;
if (!(req->rq_flags & RQF_DONTPREP)) {
ret = prep_to_mq(scsi_mq_prep_fn(req));
if (ret != BLK_STS_OK)
-   goto out_dec_host_busy;
+   goto out_put_budget;
req->rq_flags |= RQF_DONTPREP;
} else {
blk_mq_start_request(req);
@@ -1993,21 +2028,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx 
*hctx,
if (reason) {
scsi_set_blocked(cmd, reason);
ret = BLK_STS_RESOURCE;
-   goto out_dec_host_busy;
+   goto out_put_budget;
}
 
return BLK_STS_OK;
 
-out_dec_host_busy:
-   atomic_dec(&shost->host_busy);
-out_dec_target_busy:
-   if (scsi_target(sdev)->can_queue > 0)
-   atomic_dec(&scsi_target(sdev)->target_busy);
-out_dec_device_busy:
-   atomic_dec(&sdev->device_busy);
-out_put_device:
-   put_device(&sdev->sdev_gendev);
-out:
+out_put_budget:
+   scsi_mq_put_budget(hctx);
switch (ret) {
case BLK_STS_OK:
break;
@@ -2211,6 +2238,8 @@ struct request_queue *scsi_old_alloc_queue(struct 
scsi_device *sdev)
 }
 
 static const struct blk_mq_ops scsi_mq_ops = {
+   .get_budget = scsi_mq_get_budget,
+   .put_budget = scsi_mq_put_budget,
.queue_rq   = scsi_queue_rq,
.complete   = scsi_softirq_done,
.timeout= scsi_timeout,
-- 
2.9.5

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Bart Van Assche

On Sat, 2017-10-14 at 00:45 +0800, Ming Lei wrote:
> On Fri, Oct 13, 2017 at 04:31:04PM +, Bart Van Assche wrote:
> > On Sat, 2017-10-14 at 00:07 +0800, Ming Lei wrote:
> > > Actually it is in hot path, for example, lpfc and qla2xx's queue depth is 
> > > 3,
> > 
> > Sorry but I doubt whether that is correct. More in general, I don't know 
> > any modern
> > storage HBA for which the default queue depth is so low.
> 
> You can grep:
> 
> [ming@ming linux]$ git grep -n cmd_per_lun ./drivers/scsi/ | grep -E 
> "qla2xxx|lpfc"

Such a low queue depth will result in suboptimal performance for adapters
that communicate over a storage network. I think that's a bug and that both
adapters support much higher cmd_per_lun values.

(+James Smart)

James, can you explain us why commit 445cf4f4d2aa decreased LPFC_CMD_PER_LUN
from 30 to 3? Was that perhaps a workaround for a bug in a specific target
implementation?

(+Himanshu Madhani)

Himanshu, do you perhaps know whether it is safe to increase cmd_per_lun for
the qla2xxx initiator driver to the scsi_host->can_queue value?

> Even SRP/IB isn't big too, just 32.

The default value for ib_srp for cmd_per_lun is 62 but that value can be
overridden by selecting another value in /etc/default/srp_daemon.conf. Note:
a lower value is selected if after SRP login it becomes clear that the target
queue depth is lower than the cmd_per_lun value requested by the user. This
is a performance optimization and avoids that the SRP target system has to
send back BUSY responses to the initiator.

Bart.

Re: [PATCH v2 4/5] qla2xxx: Changes to support N2N logins

2017-10-13 Thread Ewan D. Milne

On Fri, 2017-10-13 at 09:34 -0700, Madhani, Madhani wrote:
> From: Duane Grigsby 
> 
> If we discovered a topology that is N2N then we will issue a login
> to the target. If our WWPN is bigger than the target's WWPN then we
> will initiate login, otherwise we will just wait for the target
> to initiate login.
> 
> Signed-off-by: Duane Grigsby 
> Signed-off-by: Michael Hernandez 
> Signed-off-by: Himanshu Madhani 
> ---
>  drivers/scsi/qla2xxx/qla_def.h  |  24 +
>  drivers/scsi/qla2xxx/qla_fw.h   |   4 +-
>  drivers/scsi/qla2xxx/qla_gbl.h  |   4 +
>  drivers/scsi/qla2xxx/qla_init.c | 135 +++-
>  drivers/scsi/qla2xxx/qla_iocb.c | 195 
> +---
>  drivers/scsi/qla2xxx/qla_isr.c  |  42 -
>  drivers/scsi/qla2xxx/qla_mbx.c  |  76 
>  7 files changed, 459 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
> index f712c0cd46d6..01a9b8971e88 100644
> --- a/drivers/scsi/qla2xxx/qla_def.h
> +++ b/drivers/scsi/qla2xxx/qla_def.h
> @@ -323,6 +323,12 @@ struct els_logo_payload {
>   uint8_t wwpn[WWN_SIZE];
>  };
>  
> +struct els_plogi_payload {
> + uint8_t opcode;
> + uint8_t rsvd[3];
> + uint8_t data[112];
> +};
> +
>  struct ct_arg {
>   void*iocb;
>   u16 nport_handle;
> @@ -358,6 +364,19 @@ struct srb_iocb {
>   dma_addr_t els_logo_pyld_dma;
>   } els_logo;
>   struct {
> +#define ELS_DCMD_PLOGI 0x3
> + uint32_t flags;
> + uint32_t els_cmd;
> + struct completion comp;
> + struct els_plogi_payload *els_plogi_pyld;
> + struct els_plogi_payload *els_resp_pyld;
> + dma_addr_t els_plogi_pyld_dma;
> + dma_addr_t els_resp_pyld_dma;
> + uint32_tfw_status[3];
> + __le16  comp_status;
> + __le16  len;
> + } els_plogi;
> + struct {
>   /*
>* Values for flags field below are as
>* defined in tsk_mgmt_entry struct
> @@ -2349,6 +2368,7 @@ typedef struct fc_port {
>   uint8_t fc4_type;
>   uint8_t fc4f_nvme;
>   uint8_t scan_state;
> + uint8_t n2n_flag;
>  
>   unsigned long last_queue_full;
>   unsigned long last_ramp_up;
> @@ -2372,6 +2392,7 @@ typedef struct fc_port {
>   u8 iocb[IOCB_SIZE];
>   u8 current_login_state;
>   u8 last_login_state;
> + struct completion n2n_done;
>  } fc_port_t;
>  
>  #define QLA_FCPORT_SCAN  1
> @@ -4228,6 +4249,9 @@ typedef struct scsi_qla_host {
>   wait_queue_head_t fcport_waitQ;
>   wait_queue_head_t vref_waitq;
>   uint8_t min_link_speed_feat;
> + uint8_t n2n_node_name[WWN_SIZE];
> + uint8_t n2n_port_name[WWN_SIZE];
> + uint16_tn2n_id;
>  } scsi_qla_host_t;
>  
>  struct qla27xx_image_status {
> diff --git a/drivers/scsi/qla2xxx/qla_fw.h b/drivers/scsi/qla2xxx/qla_fw.h
> index bec641aae7b3..d5cef0727e72 100644
> --- a/drivers/scsi/qla2xxx/qla_fw.h
> +++ b/drivers/scsi/qla2xxx/qla_fw.h
> @@ -753,9 +753,7 @@ struct els_entry_24xx {
>   uint8_t reserved_2;
>  
>   uint8_t port_id[3];
> - uint8_t reserved_3;
> -
> - uint16_t reserved_4;
> + uint8_t s_id[3];
>  
>   uint16_t control_flags; /* Control flags. */
>  #define ECF_PAYLOAD_DESCR_MASK   (BIT_15|BIT_14|BIT_13)
> diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h
> index 46c7822c20fc..1e35e961683f 100644
> --- a/drivers/scsi/qla2xxx/qla_gbl.h
> +++ b/drivers/scsi/qla2xxx/qla_gbl.h
> @@ -45,6 +45,8 @@ extern int qla2x00_fabric_login(scsi_qla_host_t *, 
> fc_port_t *, uint16_t *);
>  extern int qla2x00_local_device_login(scsi_qla_host_t *, fc_port_t *);
>  
>  extern int qla24xx_els_dcmd_iocb(scsi_qla_host_t *, int, port_id_t);
> +extern int qla24xx_els_dcmd2_iocb(scsi_qla_host_t *, int, fc_port_t *,
> +port_id_t);
>  
>  extern void qla2x00_update_fcports(scsi_qla_host_t *);
>  
> @@ -487,6 +489,8 @@ int qla24xx_gidlist_wait(struct scsi_qla_host *, void *, 
> dma_addr_t,
>  uint16_t *);
>  int __qla24xx_parse_gpdb(struct scsi_qla_host *, fc_port_t *,
>   struct port_database_24xx *);
> +int qla24xx_get_port_login_templ(scsi_qla_host_t *, dma_addr_t,
> +void *, uint16_t);
>  
>  extern int qla27xx_get_zio_threshold(scsi_qla_host_t *, uint16_t *);
>  extern int qla27xx_set_zio_threshold(scsi_qla_host_t *, uint16_t);
> diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
> index 4c53199db371..b2a391f93775 100644
> --- a/drivers/scsi/qla2xxx/qla_init.c
> +++ b/drivers/scsi/qla2xxx/qla_init.c
> @@ -1433,6 +1433,14 @@ qla24xx_handle_prli_done_event(struct scsi_qla_host 
> *vha, struct event_arg *ea)
>   qla24xx_post_gpdb_

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Ming Lei

On Fri, Oct 13, 2017 at 04:31:04PM +, Bart Van Assche wrote:
> On Sat, 2017-10-14 at 00:07 +0800, Ming Lei wrote:
> > Actually it is in hot path, for example, lpfc and qla2xx's queue depth is 3,
> 
> Sorry but I doubt whether that is correct. More in general, I don't know any 
> modern
> storage HBA for which the default queue depth is so low.

You can grep:

[ming@ming linux]$ git grep -n cmd_per_lun ./drivers/scsi/ | grep -E 
"qla2xxx|lpfc"

Even SRP/IB isn't big too, just 32.

-- 
Ming

[PATCH v2 5/5] qla2xxx: Update driver version to 10.00.00.02-k

2017-10-13 Thread Madhani, Madhani

From: Himanshu Madhani 

Signed-off-by: Himanshu Madhani 
---
 drivers/scsi/qla2xxx/qla_version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/qla2xxx/qla_version.h 
b/drivers/scsi/qla2xxx/qla_version.h
index 8c4b505c9f66..b6ec02b96d3d 100644
--- a/drivers/scsi/qla2xxx/qla_version.h
+++ b/drivers/scsi/qla2xxx/qla_version.h
@@ -7,7 +7,7 @@
 /*
  * Driver version
  */
-#define QLA2XXX_VERSION  "10.00.00.01-k"
+#define QLA2XXX_VERSION  "10.00.00.02-k"
 
 #define QLA_DRIVER_MAJOR_VER   10
 #define QLA_DRIVER_MINOR_VER   0
-- 
2.12.0

[PATCH v2 2/5] qla2xxx: Add ATIO-Q processing for INTx mode

2017-10-13 Thread Madhani, Madhani

From: Himanshu Madhani 

Signed-off-by: Himanshu Madhani 
---
 drivers/scsi/qla2xxx/qla_def.h|  1 +
 drivers/scsi/qla2xxx/qla_isr.c|  8 ++--
 drivers/scsi/qla2xxx/qla_target.c | 12 +---
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index 486c075998f6..66d239cbbd66 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -922,6 +922,7 @@ struct mbx_cmd_32 {
 #define INTR_RSP_QUE_UPDATE_83XX   0x14
 #define INTR_ATIO_QUE_UPDATE   0x1C
 #define INTR_ATIO_RSP_QUE_UPDATE   0x1D
+#define INTR_ATIO_QUE_UPDATE_27XX  0x1E
 
 /* ISP mailbox loopback echo diagnostic error code */
 #define MBS_LB_RESET   0x17
diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index ef7afd5eefe3..ab97fb06c239 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -3129,6 +3129,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
case INTR_RSP_QUE_UPDATE_83XX:
qla24xx_process_response_queue(vha, rsp);
break;
+   case INTR_ATIO_QUE_UPDATE_27XX:
case INTR_ATIO_QUE_UPDATE:{
unsigned long flags2;
spin_lock_irqsave(&ha->tgt.atio_lock, flags2);
@@ -3259,6 +3260,7 @@ qla24xx_msix_default(int irq, void *dev_id)
case INTR_RSP_QUE_UPDATE_83XX:
qla24xx_process_response_queue(vha, rsp);
break;
+   case INTR_ATIO_QUE_UPDATE_27XX:
case INTR_ATIO_QUE_UPDATE:{
unsigned long flags2;
spin_lock_irqsave(&ha->tgt.atio_lock, flags2);
@@ -3347,7 +3349,8 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct 
rsp_que *rsp)
.pre_vectors = QLA_BASE_VECTORS,
};
 
-   if (QLA_TGT_MODE_ENABLED() && IS_ATIO_MSIX_CAPABLE(ha)) {
+   if (QLA_TGT_MODE_ENABLED() && (ql2xenablemsix != 0) &&
+   IS_ATIO_MSIX_CAPABLE(ha)) {
desc.pre_vectors++;
min_vecs++;
}
@@ -3432,7 +3435,8 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct 
rsp_que *rsp)
 * If target mode is enable, also request the vector for the ATIO
 * queue.
 */
-   if (QLA_TGT_MODE_ENABLED() && IS_ATIO_MSIX_CAPABLE(ha)) {
+   if (QLA_TGT_MODE_ENABLED() && (ql2xenablemsix != 0) &&
+   IS_ATIO_MSIX_CAPABLE(ha)) {
qentry = &ha->msix_entries[QLA_ATIO_VECTOR];
rsp->msix = qentry;
qentry->handle = rsp;
diff --git a/drivers/scsi/qla2xxx/qla_target.c 
b/drivers/scsi/qla2xxx/qla_target.c
index f05cfc83c9c8..12976a25f082 100644
--- a/drivers/scsi/qla2xxx/qla_target.c
+++ b/drivers/scsi/qla2xxx/qla_target.c
@@ -6546,6 +6546,7 @@ void
 qlt_24xx_config_rings(struct scsi_qla_host *vha)
 {
struct qla_hw_data *ha = vha->hw;
+   struct init_cb_24xx *icb;
if (!QLA_TGT_MODE_ENABLED())
return;
 
@@ -6553,14 +6554,19 @@ qlt_24xx_config_rings(struct scsi_qla_host *vha)
WRT_REG_DWORD(ISP_ATIO_Q_OUT(vha), 0);
RD_REG_DWORD(ISP_ATIO_Q_OUT(vha));
 
-   if (IS_ATIO_MSIX_CAPABLE(ha)) {
+   icb = (struct init_cb_24xx *)ha->init_cb;
+
+   if ((ql2xenablemsix != 0) && IS_ATIO_MSIX_CAPABLE(ha)) {
struct qla_msix_entry *msix = &ha->msix_entries[2];
-   struct init_cb_24xx *icb = (struct init_cb_24xx *)ha->init_cb;
 
icb->msix_atio = cpu_to_le16(msix->entry);
ql_dbg(ql_dbg_init, vha, 0xf072,
"Registering ICB vector 0x%x for atio que.\n",
msix->entry);
+   } else if (ql2xenablemsix == 0) {
+   icb->firmware_options_2 |= cpu_to_le32(BIT_26);
+   ql_dbg(ql_dbg_init, vha, 0xf07f,
+   "Registering INTx vector for ATIO.\n");
}
 }
 
@@ -6805,7 +6811,7 @@ qlt_probe_one_stage1(struct scsi_qla_host *base_vha, 
struct qla_hw_data *ha)
if (!QLA_TGT_MODE_ENABLED())
return;
 
-   if  (IS_QLA83XX(ha) || IS_QLA27XX(ha)) {
+   if  ((ql2xenablemsix == 0) || IS_QLA83XX(ha) || IS_QLA27XX(ha)) {
ISP_ATIO_Q_IN(base_vha) = &ha->mqiobase->isp25mq.atio_q_in;
ISP_ATIO_Q_OUT(base_vha) = &ha->mqiobase->isp25mq.atio_q_out;
} else {
-- 
2.12.0

[PATCH v2 3/5] qla2xxx: Allow MBC_GET_PORT_DATABASE to query and save the port states

2017-10-13 Thread Madhani, Madhani

From: Duane Grigsby 

The MBC_GET_PORT_DATABASE command normally checks the port state
informationi. This patch allows it to save that info in the fcport
structure and ignore the check if the query flag is set.

Signed-off-by: Duane Grigsby 
Signed-off-by: Himanshu Madhani 
---
 drivers/scsi/qla2xxx/qla_def.h |  4 
 drivers/scsi/qla2xxx/qla_mbx.c | 29 ++---
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index 66d239cbbd66..f712c0cd46d6 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -2303,6 +2303,7 @@ typedef struct fc_port {
unsigned int send_els_logo:1;
unsigned int login_pause:1;
unsigned int login_succ:1;
+   unsigned int query:1;
 
struct work_struct nvme_del_work;
struct completion nvme_del_done;
@@ -2369,6 +2370,8 @@ typedef struct fc_port {
struct list_head gnl_entry;
struct work_struct del_work;
u8 iocb[IOCB_SIZE];
+   u8 current_login_state;
+   u8 last_login_state;
 } fc_port_t;
 
 #define QLA_FCPORT_SCAN1
@@ -4114,6 +4117,7 @@ typedef struct scsi_qla_host {
 #define QPAIR_ONLINE_CHECK_NEEDED  27
 #define SET_ZIO_THRESHOLD_NEEDED   28
 #define DETECT_SFP_CHANGE  29
+#define N2N_LOGIN_NEEDED   30
 
unsigned long   pci_flags;
 #define PFLG_DISCONNECTED  0   /* PCI device removed */
diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c
index 7f71fd378c27..71e56877e1eb 100644
--- a/drivers/scsi/qla2xxx/qla_mbx.c
+++ b/drivers/scsi/qla2xxx/qla_mbx.c
@@ -1822,17 +1822,32 @@ qla2x00_get_port_database(scsi_qla_host_t *vha, 
fc_port_t *fcport, uint8_t opt)
 
if (IS_FWI2_CAPABLE(ha)) {
uint64_t zero = 0;
+   u8 current_login_state, last_login_state;
+
pd24 = (struct port_database_24xx *) pd;
 
/* Check for logged in state. */
-   if (pd24->current_login_state != PDS_PRLI_COMPLETE &&
-   pd24->last_login_state != PDS_PRLI_COMPLETE) {
-   ql_dbg(ql_dbg_mbx, vha, 0x1051,
-   "Unable to verify login-state (%x/%x) for "
-   "loop_id %x.\n", pd24->current_login_state,
-   pd24->last_login_state, fcport->loop_id);
+   if (fcport->fc4f_nvme) {
+   current_login_state = pd24->current_login_state >> 4;
+   last_login_state = pd24->last_login_state >> 4;
+   } else {
+   current_login_state = pd24->current_login_state & 0xf;
+   last_login_state = pd24->last_login_state & 0xf;
+   }
+   fcport->current_login_state = pd24->current_login_state;
+   fcport->last_login_state = pd24->last_login_state;
+
+   /* Check for logged in state. */
+   if (current_login_state != PDS_PRLI_COMPLETE &&
+   last_login_state != PDS_PRLI_COMPLETE) {
+   ql_dbg(ql_dbg_mbx, vha, 0x119a,
+   "Unable to verify login-state (%x/%x) for loop_id 
%x.\n",
+   current_login_state, last_login_state,
+   fcport->loop_id);
rval = QLA_FUNCTION_FAILED;
-   goto gpd_error_out;
+
+   if (!fcport->query)
+   goto gpd_error_out;
}
 
if (fcport->loop_id == FC_NO_LOOP_ID ||
-- 
2.12.0

[PATCH v2 4/5] qla2xxx: Changes to support N2N logins

2017-10-13 Thread Madhani, Madhani

From: Duane Grigsby 

If we discovered a topology that is N2N then we will issue a login
to the target. If our WWPN is bigger than the target's WWPN then we
will initiate login, otherwise we will just wait for the target
to initiate login.

Signed-off-by: Duane Grigsby 
Signed-off-by: Michael Hernandez 
Signed-off-by: Himanshu Madhani 
---
 drivers/scsi/qla2xxx/qla_def.h  |  24 +
 drivers/scsi/qla2xxx/qla_fw.h   |   4 +-
 drivers/scsi/qla2xxx/qla_gbl.h  |   4 +
 drivers/scsi/qla2xxx/qla_init.c | 135 +++-
 drivers/scsi/qla2xxx/qla_iocb.c | 195 +---
 drivers/scsi/qla2xxx/qla_isr.c  |  42 -
 drivers/scsi/qla2xxx/qla_mbx.c  |  76 
 7 files changed, 459 insertions(+), 21 deletions(-)

diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index f712c0cd46d6..01a9b8971e88 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -323,6 +323,12 @@ struct els_logo_payload {
uint8_t wwpn[WWN_SIZE];
 };
 
+struct els_plogi_payload {
+   uint8_t opcode;
+   uint8_t rsvd[3];
+   uint8_t data[112];
+};
+
 struct ct_arg {
void*iocb;
u16 nport_handle;
@@ -358,6 +364,19 @@ struct srb_iocb {
dma_addr_t els_logo_pyld_dma;
} els_logo;
struct {
+#define ELS_DCMD_PLOGI 0x3
+   uint32_t flags;
+   uint32_t els_cmd;
+   struct completion comp;
+   struct els_plogi_payload *els_plogi_pyld;
+   struct els_plogi_payload *els_resp_pyld;
+   dma_addr_t els_plogi_pyld_dma;
+   dma_addr_t els_resp_pyld_dma;
+   uint32_tfw_status[3];
+   __le16  comp_status;
+   __le16  len;
+   } els_plogi;
+   struct {
/*
 * Values for flags field below are as
 * defined in tsk_mgmt_entry struct
@@ -2349,6 +2368,7 @@ typedef struct fc_port {
uint8_t fc4_type;
uint8_t fc4f_nvme;
uint8_t scan_state;
+   uint8_t n2n_flag;
 
unsigned long last_queue_full;
unsigned long last_ramp_up;
@@ -2372,6 +2392,7 @@ typedef struct fc_port {
u8 iocb[IOCB_SIZE];
u8 current_login_state;
u8 last_login_state;
+   struct completion n2n_done;
 } fc_port_t;
 
 #define QLA_FCPORT_SCAN1
@@ -4228,6 +4249,9 @@ typedef struct scsi_qla_host {
wait_queue_head_t fcport_waitQ;
wait_queue_head_t vref_waitq;
uint8_t min_link_speed_feat;
+   uint8_t n2n_node_name[WWN_SIZE];
+   uint8_t n2n_port_name[WWN_SIZE];
+   uint16_tn2n_id;
 } scsi_qla_host_t;
 
 struct qla27xx_image_status {
diff --git a/drivers/scsi/qla2xxx/qla_fw.h b/drivers/scsi/qla2xxx/qla_fw.h
index bec641aae7b3..d5cef0727e72 100644
--- a/drivers/scsi/qla2xxx/qla_fw.h
+++ b/drivers/scsi/qla2xxx/qla_fw.h
@@ -753,9 +753,7 @@ struct els_entry_24xx {
uint8_t reserved_2;
 
uint8_t port_id[3];
-   uint8_t reserved_3;
-
-   uint16_t reserved_4;
+   uint8_t s_id[3];
 
uint16_t control_flags; /* Control flags. */
 #define ECF_PAYLOAD_DESCR_MASK (BIT_15|BIT_14|BIT_13)
diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h
index 46c7822c20fc..1e35e961683f 100644
--- a/drivers/scsi/qla2xxx/qla_gbl.h
+++ b/drivers/scsi/qla2xxx/qla_gbl.h
@@ -45,6 +45,8 @@ extern int qla2x00_fabric_login(scsi_qla_host_t *, fc_port_t 
*, uint16_t *);
 extern int qla2x00_local_device_login(scsi_qla_host_t *, fc_port_t *);
 
 extern int qla24xx_els_dcmd_iocb(scsi_qla_host_t *, int, port_id_t);
+extern int qla24xx_els_dcmd2_iocb(scsi_qla_host_t *, int, fc_port_t *,
+port_id_t);
 
 extern void qla2x00_update_fcports(scsi_qla_host_t *);
 
@@ -487,6 +489,8 @@ int qla24xx_gidlist_wait(struct scsi_qla_host *, void *, 
dma_addr_t,
 uint16_t *);
 int __qla24xx_parse_gpdb(struct scsi_qla_host *, fc_port_t *,
struct port_database_24xx *);
+int qla24xx_get_port_login_templ(scsi_qla_host_t *, dma_addr_t,
+void *, uint16_t);
 
 extern int qla27xx_get_zio_threshold(scsi_qla_host_t *, uint16_t *);
 extern int qla27xx_set_zio_threshold(scsi_qla_host_t *, uint16_t);
diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
index 4c53199db371..b2a391f93775 100644
--- a/drivers/scsi/qla2xxx/qla_init.c
+++ b/drivers/scsi/qla2xxx/qla_init.c
@@ -1433,6 +1433,14 @@ qla24xx_handle_prli_done_event(struct scsi_qla_host 
*vha, struct event_arg *ea)
qla24xx_post_gpdb_work(vha, ea->fcport, 0);
break;
default:
+   if (ea->fcport->n2n_flag) {
+   ql_dbg(ql_dbg_disc, vha, 0x2118,
+   "%s %d %8phC post fc4 prli\n"

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Jens Axboe

On 10/13/2017 10:31 AM, Bart Van Assche wrote:
> On Sat, 2017-10-14 at 00:07 +0800, Ming Lei wrote:
>> Actually it is in hot path, for example, lpfc and qla2xx's queue depth is 3,
> 
> Sorry but I doubt whether that is correct. More in general, I don't know any 
> modern
> storage HBA for which the default queue depth is so low.

That does seem insanely low. BTW, even a low depth isn't an issue, as long as
we know what it roughly is.


-- 
Jens Axboe

[PATCH v2 0/5] qla2xxx: Patches for scsi-misc

2017-10-13 Thread Madhani, Madhani

From: Himanshu Madhani 

Hi Martin,

This series adds support for INTx mode for qla2xxx driver. Also,
adds support for N2N login for FC-NVMe.

Please apply to 4.15/scsi-queue at your earliest convenience.

Changes from v1 -> v2

o Fixed warnings reported by 0-day kernel build.

Thanks,
Himanshu

Duane Grigsby (2):
  qla2xxx: Allow MBC_GET_PORT_DATABASE to query and save the port states
  qla2xxx: Changes to support N2N logins

Himanshu Madhani (3):
  qla2xxx: Add module param ql2xenablemsix
  qla2xxx: Add ATIO-Q processing for INTx mode
  qla2xxx: Update driver version to 10.00.00.02-k

 drivers/scsi/qla2xxx/qla_def.h |  29 ++
 drivers/scsi/qla2xxx/qla_fw.h  |   4 +-
 drivers/scsi/qla2xxx/qla_gbl.h |   5 +
 drivers/scsi/qla2xxx/qla_init.c| 135 -
 drivers/scsi/qla2xxx/qla_iocb.c| 195 +++--
 drivers/scsi/qla2xxx/qla_isr.c |  59 +--
 drivers/scsi/qla2xxx/qla_mbx.c | 105 ++--
 drivers/scsi/qla2xxx/qla_os.c  |   9 ++
 drivers/scsi/qla2xxx/qla_target.c  |  12 ++-
 drivers/scsi/qla2xxx/qla_version.h |   2 +-
 10 files changed, 518 insertions(+), 37 deletions(-)

-- 
2.12.0

[PATCH v2 1/5] qla2xxx: Add module param ql2xenablemsix

2017-10-13 Thread Madhani, Madhani

From: Himanshu Madhani 

Signed-off-by: Himanshu Madhani 
---
 drivers/scsi/qla2xxx/qla_gbl.h | 1 +
 drivers/scsi/qla2xxx/qla_isr.c | 9 ++---
 drivers/scsi/qla2xxx/qla_os.c  | 9 +
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h
index f852ca60c49f..46c7822c20fc 100644
--- a/drivers/scsi/qla2xxx/qla_gbl.h
+++ b/drivers/scsi/qla2xxx/qla_gbl.h
@@ -145,6 +145,7 @@ extern int ql2xmvasynctoatio;
 extern int ql2xuctrlirq;
 extern int ql2xnvmeenable;
 extern int ql2xautodetectsfp;
+extern int ql2xenablemsix;
 
 extern int qla2x00_loop_reset(scsi_qla_host_t *);
 extern void qla2x00_abort_all_cmds(scsi_qla_host_t *, int);
diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index 9d9668aac6f6..ef7afd5eefe3 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -3486,11 +3486,14 @@ qla2x00_request_irqs(struct qla_hw_data *ha, struct 
rsp_que *rsp)
scsi_qla_host_t *vha = pci_get_drvdata(ha->pdev);
 
/* If possible, enable MSI-X. */
-   if (!IS_QLA2432(ha) && !IS_QLA2532(ha) && !IS_QLA8432(ha) &&
-   !IS_CNA_CAPABLE(ha) && !IS_QLA2031(ha) && !IS_QLAFX00(ha) &&
-   !IS_QLA27XX(ha))
+   if (ql2xenablemsix == 0 || (!IS_QLA2432(ha) && !IS_QLA2532(ha) &&
+   !IS_QLA8432(ha) && !IS_CNA_CAPABLE(ha) && !IS_QLA2031(ha) &&
+   !IS_QLAFX00(ha) && !IS_QLA27XX(ha)))
goto skip_msi;
 
+   if (ql2xenablemsix == 2)
+   goto skip_msix;
+
if (ha->pdev->subsystem_vendor == PCI_VENDOR_ID_HP &&
(ha->pdev->subsystem_device == 0x7040 ||
ha->pdev->subsystem_device == 0x7041 ||
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 937209805baf..cb719345aa0d 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -268,6 +268,15 @@ MODULE_PARM_DESC(ql2xautodetectsfp,
 "Detect SFP range and set appropriate distance.\n"
 "1 (Default): Enable\n");
 
+int ql2xenablemsix = 1;
+module_param(ql2xenablemsix, int, 0444);
+MODULE_PARM_DESC(ql2xenablemsix,
+"Set to enable MSI or MSI-X interrupt mechanism.\n"
+" Default is 1, enable MSI-X interrupt mechanism.\n"
+" 0 -- enable traditional pin-based mechanism.\n"
+" 1 -- enable MSI-X interrupt mechanism.\n"
+" 2 -- enable MSI interrupt mechanism.\n");
+
 /*
  * SCSI host template entry points
  */
-- 
2.12.0

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Bart Van Assche

On Sat, 2017-10-14 at 00:07 +0800, Ming Lei wrote:
> Actually it is in hot path, for example, lpfc and qla2xx's queue depth is 3,

Sorry but I doubt whether that is correct. More in general, I don't know any 
modern
storage HBA for which the default queue depth is so low.

Bart.

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Jens Axboe

On 10/13/2017 10:22 AM, Ming Lei wrote:
> On Fri, Oct 13, 2017 at 10:20:01AM -0600, Jens Axboe wrote:
>> On 10/13/2017 10:17 AM, Ming Lei wrote:
>>> On Fri, Oct 13, 2017 at 08:44:23AM -0600, Jens Axboe wrote:
 On 10/12/2017 06:19 PM, Ming Lei wrote:
> On Thu, Oct 12, 2017 at 12:46:24PM -0600, Jens Axboe wrote:
>> On 10/12/2017 12:37 PM, Ming Lei wrote:
>>> For SCSI devices, there is often per-request-queue depth, which need
>>> to be respected before queuing one request.
>>>
>>> The current blk-mq always dequeues one request first, then calls 
>>> .queue_rq()
>>> to dispatch the request to lld. One obvious issue of this way is that 
>>> I/O
>>> merge may not be good, because when the per-request-queue depth can't be
>>> respected,  .queue_rq() has to return BLK_STS_RESOURCE, then this 
>>> request
>>> has to staty in hctx->dispatch list, and never got chance to participate
>>> into I/O merge.
>>>
>>> This patch introduces .get_budget and .put_budget callback in 
>>> blk_mq_ops,
>>> then we can try to get reserved budget first before dequeuing request.
>>> Once we can't get budget for queueing I/O, we don't need to dequeue 
>>> request
>>> at all, then I/O merge can get improved a lot.
>>
>> I can't help but think that it would be cleaner to just be able to
>> reinsert the request into the scheduler properly, if we fail to
>> dispatch it. Bart hinted at that earlier as well.
>
> Actually when I start to investigate the issue, the 1st thing I tried
> is to reinsert, but that way is even worse on qla2xxx.
>
> Once request is dequeued, the IO merge chance is decreased a lot.
> With none scheduler, it becomes not possible to merge because
> we only try to merge over the last 8 requests. With mq-deadline,
> when one request is reinserted, another request may be dequeued
> at the same time.

 I don't care too much about 'none'. If perfect merging is crucial for
 getting to the performance level you want on the hardware you are using,
 you should not be using 'none'. 'none' will work perfectly fine for NVMe
 etc style devices, where we are not dependent on merging to the same
 extent that we are on other devices.
>>>
>>> We still have some SCSI device, such as qla2xxx, which is 1:1 multi-queue
>>> device, like NVMe, in my test, the big lock of mq-deadline has been
>>> an issue for this kind of device, and none actually is better than
>>> mq-deadline, even though its merge isn't good.
>>
>> Kyber should be able to fill that hole, hopefully.
> 
> Yeah, kyber still uses same IO merge with none, :-)

Doesn't mean it can't be changed... 'none' has to remain with very low
overhead, any extra smarts or logic should be a scheduler thing.

-- 
Jens Axboe

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Jens Axboe

On 10/13/2017 10:21 AM, Ming Lei wrote:
> On Fri, Oct 13, 2017 at 10:19:04AM -0600, Jens Axboe wrote:
>> On 10/13/2017 10:07 AM, Ming Lei wrote:
>>> On Fri, Oct 13, 2017 at 08:44:23AM -0600, Jens Axboe wrote:
 On 10/12/2017 06:19 PM, Ming Lei wrote:
> On Thu, Oct 12, 2017 at 12:46:24PM -0600, Jens Axboe wrote:
>> On 10/12/2017 12:37 PM, Ming Lei wrote:
>>> For SCSI devices, there is often per-request-queue depth, which need
>>> to be respected before queuing one request.
>>>
>>> The current blk-mq always dequeues one request first, then calls 
>>> .queue_rq()
>>> to dispatch the request to lld. One obvious issue of this way is that 
>>> I/O
>>> merge may not be good, because when the per-request-queue depth can't be
>>> respected,  .queue_rq() has to return BLK_STS_RESOURCE, then this 
>>> request
>>> has to staty in hctx->dispatch list, and never got chance to participate
>>> into I/O merge.
>>>
>>> This patch introduces .get_budget and .put_budget callback in 
>>> blk_mq_ops,
>>> then we can try to get reserved budget first before dequeuing request.
>>> Once we can't get budget for queueing I/O, we don't need to dequeue 
>>> request
>>> at all, then I/O merge can get improved a lot.
>>
>> I can't help but think that it would be cleaner to just be able to
>> reinsert the request into the scheduler properly, if we fail to
>> dispatch it. Bart hinted at that earlier as well.
>
> Actually when I start to investigate the issue, the 1st thing I tried
> is to reinsert, but that way is even worse on qla2xxx.
>
> Once request is dequeued, the IO merge chance is decreased a lot.
> With none scheduler, it becomes not possible to merge because
> we only try to merge over the last 8 requests. With mq-deadline,
> when one request is reinserted, another request may be dequeued
> at the same time.

 I don't care too much about 'none'. If perfect merging is crucial for
 getting to the performance level you want on the hardware you are using,
 you should not be using 'none'. 'none' will work perfectly fine for NVMe
 etc style devices, where we are not dependent on merging to the same
 extent that we are on other devices.

 mq-deadline reinsertion will be expensive, that's in the nature of that
 beast. It's basically the same as a normal request inserition.  So for
 that, we'd have to be a bit careful not to run into this too much. Even
 with a dumb approach, it should only happen 1 out of N times, where N is
 the typical point at which the device will return STS_RESOURCE. The
 reinsertion vs dequeue should be serialized with your patch to do that,
 at least for the single queue mq-deadline setup. In fact, I think your
 approach suffers from that same basic race, in that the budget isn't a
 hard allocation, it's just a hint. It can change from the time you check
 it, and when you go and dispatch the IO, if you don't serialize that
 part. So really should be no different in that regard.
>>>
>>> In case of SCSI, the .get_buget is done as atomic counting,
>>> and it is completely effective to avoid unnecessary dequeue, please take
>>> a look at patch 6.
>>
>> Looks like you are right, I had initially misread that as just checking
>> the busy count. But you are actually getting the count at that point,
>> so it should be solid.
>>
> Not mention the cost of acquiring/releasing lock, that work
> is just doing useless work and wasting CPU.

 Sure, my point is that if it doesn't happen too often, it doesn't really
 matter. It's not THAT expensive.
>>>
>>> Actually it is in hot path, for example, lpfc and qla2xx's queue depth is 3,
>>> it is quite easy to trigger STS_RESOURCE.
>>
>> Ugh, that is low.
>>
>> OK, I think we should just roll with this and see how far we can go. I'll
>> apply it for 4.15.
> 
> OK, I have some update, will post a new version soon.

Fold something like this into it then:


diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ccbbc7e108ea..b7bf84b5ddf2 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -102,13 +102,12 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx 
*hctx)
!e->type->ops.mq.has_work(hctx))
break;
 
-   if (q->mq_ops->get_budget && !q->mq_ops->get_budget(hctx))
+   if (!blk_mq_get_dispatch_budget(hctx))
return true;
 
rq = e->type->ops.mq.dispatch_request(hctx);
if (!rq) {
-   if (q->mq_ops->put_budget)
-   q->mq_ops->put_budget(hctx);
+   blk_mq_put_dispatch_budget(hctx, true);
break;
}
list_add(&rq->queuelist, &rq_list);
@@ -140,13 +139,12 @@ static bool blk_mq_do_dispatc

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Ming Lei

On Fri, Oct 13, 2017 at 10:20:01AM -0600, Jens Axboe wrote:
> On 10/13/2017 10:17 AM, Ming Lei wrote:
> > On Fri, Oct 13, 2017 at 08:44:23AM -0600, Jens Axboe wrote:
> >> On 10/12/2017 06:19 PM, Ming Lei wrote:
> >>> On Thu, Oct 12, 2017 at 12:46:24PM -0600, Jens Axboe wrote:
>  On 10/12/2017 12:37 PM, Ming Lei wrote:
> > For SCSI devices, there is often per-request-queue depth, which need
> > to be respected before queuing one request.
> >
> > The current blk-mq always dequeues one request first, then calls 
> > .queue_rq()
> > to dispatch the request to lld. One obvious issue of this way is that 
> > I/O
> > merge may not be good, because when the per-request-queue depth can't be
> > respected,  .queue_rq() has to return BLK_STS_RESOURCE, then this 
> > request
> > has to staty in hctx->dispatch list, and never got chance to participate
> > into I/O merge.
> >
> > This patch introduces .get_budget and .put_budget callback in 
> > blk_mq_ops,
> > then we can try to get reserved budget first before dequeuing request.
> > Once we can't get budget for queueing I/O, we don't need to dequeue 
> > request
> > at all, then I/O merge can get improved a lot.
> 
>  I can't help but think that it would be cleaner to just be able to
>  reinsert the request into the scheduler properly, if we fail to
>  dispatch it. Bart hinted at that earlier as well.
> >>>
> >>> Actually when I start to investigate the issue, the 1st thing I tried
> >>> is to reinsert, but that way is even worse on qla2xxx.
> >>>
> >>> Once request is dequeued, the IO merge chance is decreased a lot.
> >>> With none scheduler, it becomes not possible to merge because
> >>> we only try to merge over the last 8 requests. With mq-deadline,
> >>> when one request is reinserted, another request may be dequeued
> >>> at the same time.
> >>
> >> I don't care too much about 'none'. If perfect merging is crucial for
> >> getting to the performance level you want on the hardware you are using,
> >> you should not be using 'none'. 'none' will work perfectly fine for NVMe
> >> etc style devices, where we are not dependent on merging to the same
> >> extent that we are on other devices.
> > 
> > We still have some SCSI device, such as qla2xxx, which is 1:1 multi-queue
> > device, like NVMe, in my test, the big lock of mq-deadline has been
> > an issue for this kind of device, and none actually is better than
> > mq-deadline, even though its merge isn't good.
> 
> Kyber should be able to fill that hole, hopefully.

Yeah, kyber still uses same IO merge with none, :-)

-- 
Ming

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Ming Lei

On Fri, Oct 13, 2017 at 10:19:04AM -0600, Jens Axboe wrote:
> On 10/13/2017 10:07 AM, Ming Lei wrote:
> > On Fri, Oct 13, 2017 at 08:44:23AM -0600, Jens Axboe wrote:
> >> On 10/12/2017 06:19 PM, Ming Lei wrote:
> >>> On Thu, Oct 12, 2017 at 12:46:24PM -0600, Jens Axboe wrote:
>  On 10/12/2017 12:37 PM, Ming Lei wrote:
> > For SCSI devices, there is often per-request-queue depth, which need
> > to be respected before queuing one request.
> >
> > The current blk-mq always dequeues one request first, then calls 
> > .queue_rq()
> > to dispatch the request to lld. One obvious issue of this way is that 
> > I/O
> > merge may not be good, because when the per-request-queue depth can't be
> > respected,  .queue_rq() has to return BLK_STS_RESOURCE, then this 
> > request
> > has to staty in hctx->dispatch list, and never got chance to participate
> > into I/O merge.
> >
> > This patch introduces .get_budget and .put_budget callback in 
> > blk_mq_ops,
> > then we can try to get reserved budget first before dequeuing request.
> > Once we can't get budget for queueing I/O, we don't need to dequeue 
> > request
> > at all, then I/O merge can get improved a lot.
> 
>  I can't help but think that it would be cleaner to just be able to
>  reinsert the request into the scheduler properly, if we fail to
>  dispatch it. Bart hinted at that earlier as well.
> >>>
> >>> Actually when I start to investigate the issue, the 1st thing I tried
> >>> is to reinsert, but that way is even worse on qla2xxx.
> >>>
> >>> Once request is dequeued, the IO merge chance is decreased a lot.
> >>> With none scheduler, it becomes not possible to merge because
> >>> we only try to merge over the last 8 requests. With mq-deadline,
> >>> when one request is reinserted, another request may be dequeued
> >>> at the same time.
> >>
> >> I don't care too much about 'none'. If perfect merging is crucial for
> >> getting to the performance level you want on the hardware you are using,
> >> you should not be using 'none'. 'none' will work perfectly fine for NVMe
> >> etc style devices, where we are not dependent on merging to the same
> >> extent that we are on other devices.
> >>
> >> mq-deadline reinsertion will be expensive, that's in the nature of that
> >> beast. It's basically the same as a normal request inserition.  So for
> >> that, we'd have to be a bit careful not to run into this too much. Even
> >> with a dumb approach, it should only happen 1 out of N times, where N is
> >> the typical point at which the device will return STS_RESOURCE. The
> >> reinsertion vs dequeue should be serialized with your patch to do that,
> >> at least for the single queue mq-deadline setup. In fact, I think your
> >> approach suffers from that same basic race, in that the budget isn't a
> >> hard allocation, it's just a hint. It can change from the time you check
> >> it, and when you go and dispatch the IO, if you don't serialize that
> >> part. So really should be no different in that regard.
> > 
> > In case of SCSI, the .get_buget is done as atomic counting,
> > and it is completely effective to avoid unnecessary dequeue, please take
> > a look at patch 6.
> 
> Looks like you are right, I had initially misread that as just checking
> the busy count. But you are actually getting the count at that point,
> so it should be solid.
> 
> >>> Not mention the cost of acquiring/releasing lock, that work
> >>> is just doing useless work and wasting CPU.
> >>
> >> Sure, my point is that if it doesn't happen too often, it doesn't really
> >> matter. It's not THAT expensive.
> > 
> > Actually it is in hot path, for example, lpfc and qla2xx's queue depth is 3,
> > it is quite easy to trigger STS_RESOURCE.
> 
> Ugh, that is low.
> 
> OK, I think we should just roll with this and see how far we can go. I'll
> apply it for 4.15.

OK, I have some update, will post a new version soon.

Thanks
Ming

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Jens Axboe

On 10/13/2017 10:17 AM, Ming Lei wrote:
> On Fri, Oct 13, 2017 at 08:44:23AM -0600, Jens Axboe wrote:
>> On 10/12/2017 06:19 PM, Ming Lei wrote:
>>> On Thu, Oct 12, 2017 at 12:46:24PM -0600, Jens Axboe wrote:
 On 10/12/2017 12:37 PM, Ming Lei wrote:
> For SCSI devices, there is often per-request-queue depth, which need
> to be respected before queuing one request.
>
> The current blk-mq always dequeues one request first, then calls 
> .queue_rq()
> to dispatch the request to lld. One obvious issue of this way is that I/O
> merge may not be good, because when the per-request-queue depth can't be
> respected,  .queue_rq() has to return BLK_STS_RESOURCE, then this request
> has to staty in hctx->dispatch list, and never got chance to participate
> into I/O merge.
>
> This patch introduces .get_budget and .put_budget callback in blk_mq_ops,
> then we can try to get reserved budget first before dequeuing request.
> Once we can't get budget for queueing I/O, we don't need to dequeue 
> request
> at all, then I/O merge can get improved a lot.

 I can't help but think that it would be cleaner to just be able to
 reinsert the request into the scheduler properly, if we fail to
 dispatch it. Bart hinted at that earlier as well.
>>>
>>> Actually when I start to investigate the issue, the 1st thing I tried
>>> is to reinsert, but that way is even worse on qla2xxx.
>>>
>>> Once request is dequeued, the IO merge chance is decreased a lot.
>>> With none scheduler, it becomes not possible to merge because
>>> we only try to merge over the last 8 requests. With mq-deadline,
>>> when one request is reinserted, another request may be dequeued
>>> at the same time.
>>
>> I don't care too much about 'none'. If perfect merging is crucial for
>> getting to the performance level you want on the hardware you are using,
>> you should not be using 'none'. 'none' will work perfectly fine for NVMe
>> etc style devices, where we are not dependent on merging to the same
>> extent that we are on other devices.
> 
> We still have some SCSI device, such as qla2xxx, which is 1:1 multi-queue
> device, like NVMe, in my test, the big lock of mq-deadline has been
> an issue for this kind of device, and none actually is better than
> mq-deadline, even though its merge isn't good.

Kyber should be able to fill that hole, hopefully.

-- 
Jens Axboe

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Jens Axboe

On 10/13/2017 10:07 AM, Ming Lei wrote:
> On Fri, Oct 13, 2017 at 08:44:23AM -0600, Jens Axboe wrote:
>> On 10/12/2017 06:19 PM, Ming Lei wrote:
>>> On Thu, Oct 12, 2017 at 12:46:24PM -0600, Jens Axboe wrote:
 On 10/12/2017 12:37 PM, Ming Lei wrote:
> For SCSI devices, there is often per-request-queue depth, which need
> to be respected before queuing one request.
>
> The current blk-mq always dequeues one request first, then calls 
> .queue_rq()
> to dispatch the request to lld. One obvious issue of this way is that I/O
> merge may not be good, because when the per-request-queue depth can't be
> respected,  .queue_rq() has to return BLK_STS_RESOURCE, then this request
> has to staty in hctx->dispatch list, and never got chance to participate
> into I/O merge.
>
> This patch introduces .get_budget and .put_budget callback in blk_mq_ops,
> then we can try to get reserved budget first before dequeuing request.
> Once we can't get budget for queueing I/O, we don't need to dequeue 
> request
> at all, then I/O merge can get improved a lot.

 I can't help but think that it would be cleaner to just be able to
 reinsert the request into the scheduler properly, if we fail to
 dispatch it. Bart hinted at that earlier as well.
>>>
>>> Actually when I start to investigate the issue, the 1st thing I tried
>>> is to reinsert, but that way is even worse on qla2xxx.
>>>
>>> Once request is dequeued, the IO merge chance is decreased a lot.
>>> With none scheduler, it becomes not possible to merge because
>>> we only try to merge over the last 8 requests. With mq-deadline,
>>> when one request is reinserted, another request may be dequeued
>>> at the same time.
>>
>> I don't care too much about 'none'. If perfect merging is crucial for
>> getting to the performance level you want on the hardware you are using,
>> you should not be using 'none'. 'none' will work perfectly fine for NVMe
>> etc style devices, where we are not dependent on merging to the same
>> extent that we are on other devices.
>>
>> mq-deadline reinsertion will be expensive, that's in the nature of that
>> beast. It's basically the same as a normal request inserition.  So for
>> that, we'd have to be a bit careful not to run into this too much. Even
>> with a dumb approach, it should only happen 1 out of N times, where N is
>> the typical point at which the device will return STS_RESOURCE. The
>> reinsertion vs dequeue should be serialized with your patch to do that,
>> at least for the single queue mq-deadline setup. In fact, I think your
>> approach suffers from that same basic race, in that the budget isn't a
>> hard allocation, it's just a hint. It can change from the time you check
>> it, and when you go and dispatch the IO, if you don't serialize that
>> part. So really should be no different in that regard.
> 
> In case of SCSI, the .get_buget is done as atomic counting,
> and it is completely effective to avoid unnecessary dequeue, please take
> a look at patch 6.

Looks like you are right, I had initially misread that as just checking
the busy count. But you are actually getting the count at that point,
so it should be solid.

>>> Not mention the cost of acquiring/releasing lock, that work
>>> is just doing useless work and wasting CPU.
>>
>> Sure, my point is that if it doesn't happen too often, it doesn't really
>> matter. It's not THAT expensive.
> 
> Actually it is in hot path, for example, lpfc and qla2xx's queue depth is 3,
> it is quite easy to trigger STS_RESOURCE.

Ugh, that is low.

OK, I think we should just roll with this and see how far we can go. I'll
apply it for 4.15.

-- 
Jens Axboe

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Ming Lei

On Fri, Oct 13, 2017 at 08:44:23AM -0600, Jens Axboe wrote:
> On 10/12/2017 06:19 PM, Ming Lei wrote:
> > On Thu, Oct 12, 2017 at 12:46:24PM -0600, Jens Axboe wrote:
> >> On 10/12/2017 12:37 PM, Ming Lei wrote:
> >>> For SCSI devices, there is often per-request-queue depth, which need
> >>> to be respected before queuing one request.
> >>>
> >>> The current blk-mq always dequeues one request first, then calls 
> >>> .queue_rq()
> >>> to dispatch the request to lld. One obvious issue of this way is that I/O
> >>> merge may not be good, because when the per-request-queue depth can't be
> >>> respected,  .queue_rq() has to return BLK_STS_RESOURCE, then this request
> >>> has to staty in hctx->dispatch list, and never got chance to participate
> >>> into I/O merge.
> >>>
> >>> This patch introduces .get_budget and .put_budget callback in blk_mq_ops,
> >>> then we can try to get reserved budget first before dequeuing request.
> >>> Once we can't get budget for queueing I/O, we don't need to dequeue 
> >>> request
> >>> at all, then I/O merge can get improved a lot.
> >>
> >> I can't help but think that it would be cleaner to just be able to
> >> reinsert the request into the scheduler properly, if we fail to
> >> dispatch it. Bart hinted at that earlier as well.
> > 
> > Actually when I start to investigate the issue, the 1st thing I tried
> > is to reinsert, but that way is even worse on qla2xxx.
> > 
> > Once request is dequeued, the IO merge chance is decreased a lot.
> > With none scheduler, it becomes not possible to merge because
> > we only try to merge over the last 8 requests. With mq-deadline,
> > when one request is reinserted, another request may be dequeued
> > at the same time.
> 
> I don't care too much about 'none'. If perfect merging is crucial for
> getting to the performance level you want on the hardware you are using,
> you should not be using 'none'. 'none' will work perfectly fine for NVMe
> etc style devices, where we are not dependent on merging to the same
> extent that we are on other devices.

We still have some SCSI device, such as qla2xxx, which is 1:1 multi-queue
device, like NVMe, in my test, the big lock of mq-deadline has been
an issue for this kind of device, and none actually is better than
mq-deadline, even though its merge isn't good.

Thanks,
Ming

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Ming Lei

On Fri, Oct 13, 2017 at 08:44:23AM -0600, Jens Axboe wrote:
> On 10/12/2017 06:19 PM, Ming Lei wrote:
> > On Thu, Oct 12, 2017 at 12:46:24PM -0600, Jens Axboe wrote:
> >> On 10/12/2017 12:37 PM, Ming Lei wrote:
> >>> For SCSI devices, there is often per-request-queue depth, which need
> >>> to be respected before queuing one request.
> >>>
> >>> The current blk-mq always dequeues one request first, then calls 
> >>> .queue_rq()
> >>> to dispatch the request to lld. One obvious issue of this way is that I/O
> >>> merge may not be good, because when the per-request-queue depth can't be
> >>> respected,  .queue_rq() has to return BLK_STS_RESOURCE, then this request
> >>> has to staty in hctx->dispatch list, and never got chance to participate
> >>> into I/O merge.
> >>>
> >>> This patch introduces .get_budget and .put_budget callback in blk_mq_ops,
> >>> then we can try to get reserved budget first before dequeuing request.
> >>> Once we can't get budget for queueing I/O, we don't need to dequeue 
> >>> request
> >>> at all, then I/O merge can get improved a lot.
> >>
> >> I can't help but think that it would be cleaner to just be able to
> >> reinsert the request into the scheduler properly, if we fail to
> >> dispatch it. Bart hinted at that earlier as well.
> > 
> > Actually when I start to investigate the issue, the 1st thing I tried
> > is to reinsert, but that way is even worse on qla2xxx.
> > 
> > Once request is dequeued, the IO merge chance is decreased a lot.
> > With none scheduler, it becomes not possible to merge because
> > we only try to merge over the last 8 requests. With mq-deadline,
> > when one request is reinserted, another request may be dequeued
> > at the same time.
> 
> I don't care too much about 'none'. If perfect merging is crucial for
> getting to the performance level you want on the hardware you are using,
> you should not be using 'none'. 'none' will work perfectly fine for NVMe
> etc style devices, where we are not dependent on merging to the same
> extent that we are on other devices.
> 
> mq-deadline reinsertion will be expensive, that's in the nature of that
> beast. It's basically the same as a normal request inserition.  So for
> that, we'd have to be a bit careful not to run into this too much. Even
> with a dumb approach, it should only happen 1 out of N times, where N is
> the typical point at which the device will return STS_RESOURCE. The
> reinsertion vs dequeue should be serialized with your patch to do that,
> at least for the single queue mq-deadline setup. In fact, I think your
> approach suffers from that same basic race, in that the budget isn't a
> hard allocation, it's just a hint. It can change from the time you check
> it, and when you go and dispatch the IO, if you don't serialize that
> part. So really should be no different in that regard.

In case of SCSI, the .get_buget is done as atomic counting,
and it is completely effective to avoid unnecessary dequeue, please take
a look at patch 6.

> 
> > Not mention the cost of acquiring/releasing lock, that work
> > is just doing useless work and wasting CPU.
> 
> Sure, my point is that if it doesn't happen too often, it doesn't really
> matter. It's not THAT expensive.

Actually it is in hot path, for example, lpfc and qla2xx's queue depth is 3,
it is quite easy to trigger STS_RESOURCE.


Thanks,
Ming

Hello

2017-10-13 Thread Daria Yoong Shang

Hello,

Can i trust an investment project in your country? accepted please send email 
for more details.

Best Regards 
Daria Yoong Shang

[PATCH] scsi: scsi_transport_fc: make the function argument as const

2017-10-13 Thread Bhumika Goyal

This is a followup patch for: https://lkml.org/lkml/2017/10/13/476

Make the function argument of fc_attach_transport as const as it is
only stored in the const field 'f' (made const in the patch in the link)
of a fc_internal structure.

Signed-off-by: Bhumika Goyal 
---
This change allows some fc_function_template structures to be const.
I will send the patches for structure constification after this gets 
applied.

 drivers/scsi/scsi_transport_fc.c | 2 +-
 include/scsi/scsi_transport_fc.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 8c46a6d..fb30ede 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -2162,7 +2162,7 @@ enum blk_eh_timer_return
 }
 
 struct scsi_transport_template *
-fc_attach_transport(struct fc_function_template *ft)
+fc_attach_transport(const struct fc_function_template *ft)
 {
int count;
struct fc_internal *i = kzalloc(sizeof(struct fc_internal),
diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h
index e8644ee..60fff05 100644
--- a/include/scsi/scsi_transport_fc.h
+++ b/include/scsi/scsi_transport_fc.h
@@ -784,7 +784,7 @@ static inline void u64_to_wwn(u64 inm, u8 *wwn)
 }
 
 struct scsi_transport_template *fc_attach_transport(
-   struct fc_function_template *);
+   const struct fc_function_template *);
 void fc_release_transport(struct scsi_transport_template *);
 void fc_remove_host(struct Scsi_Host *);
 struct fc_rport *fc_remote_port_add(struct Scsi_Host *shost,
-- 
1.9.1

Re: [PATCH V7 4/6] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

2017-10-13 Thread Jens Axboe

On 10/12/2017 06:19 PM, Ming Lei wrote:
> On Thu, Oct 12, 2017 at 12:46:24PM -0600, Jens Axboe wrote:
>> On 10/12/2017 12:37 PM, Ming Lei wrote:
>>> For SCSI devices, there is often per-request-queue depth, which need
>>> to be respected before queuing one request.
>>>
>>> The current blk-mq always dequeues one request first, then calls .queue_rq()
>>> to dispatch the request to lld. One obvious issue of this way is that I/O
>>> merge may not be good, because when the per-request-queue depth can't be
>>> respected,  .queue_rq() has to return BLK_STS_RESOURCE, then this request
>>> has to staty in hctx->dispatch list, and never got chance to participate
>>> into I/O merge.
>>>
>>> This patch introduces .get_budget and .put_budget callback in blk_mq_ops,
>>> then we can try to get reserved budget first before dequeuing request.
>>> Once we can't get budget for queueing I/O, we don't need to dequeue request
>>> at all, then I/O merge can get improved a lot.
>>
>> I can't help but think that it would be cleaner to just be able to
>> reinsert the request into the scheduler properly, if we fail to
>> dispatch it. Bart hinted at that earlier as well.
> 
> Actually when I start to investigate the issue, the 1st thing I tried
> is to reinsert, but that way is even worse on qla2xxx.
> 
> Once request is dequeued, the IO merge chance is decreased a lot.
> With none scheduler, it becomes not possible to merge because
> we only try to merge over the last 8 requests. With mq-deadline,
> when one request is reinserted, another request may be dequeued
> at the same time.

I don't care too much about 'none'. If perfect merging is crucial for
getting to the performance level you want on the hardware you are using,
you should not be using 'none'. 'none' will work perfectly fine for NVMe
etc style devices, where we are not dependent on merging to the same
extent that we are on other devices.

mq-deadline reinsertion will be expensive, that's in the nature of that
beast. It's basically the same as a normal request inserition.  So for
that, we'd have to be a bit careful not to run into this too much. Even
with a dumb approach, it should only happen 1 out of N times, where N is
the typical point at which the device will return STS_RESOURCE. The
reinsertion vs dequeue should be serialized with your patch to do that,
at least for the single queue mq-deadline setup. In fact, I think your
approach suffers from that same basic race, in that the budget isn't a
hard allocation, it's just a hint. It can change from the time you check
it, and when you go and dispatch the IO, if you don't serialize that
part. So really should be no different in that regard.

> Not mention the cost of acquiring/releasing lock, that work
> is just doing useless work and wasting CPU.

Sure, my point is that if it doesn't happen too often, it doesn't really
matter. It's not THAT expensive.

-- 
Jens Axboe

[PATCH] scsi: scsi_transport_fc: make a field of fc_internal structure const

2017-10-13 Thread Bhumika Goyal

The 'f' field of the fc_internal structure do not modify the fields
of the fc_function_template structure it points to. And there are no
other pointers initialized with this field 'f'. So, the field 'f' is
never modified and therefore make it const.

Signed-off-by: Bhumika Goyal 
---
 drivers/scsi/scsi_transport_fc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index cbd4495..f38975f 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -323,7 +323,7 @@ struct device_attribute device_attr_##_prefix##_##_name =   
\
 
 struct fc_internal {
struct scsi_transport_template t;
-   struct fc_function_template *f;
+   const struct fc_function_template *f;
 
/*
 * For attributes : each object has :
-- 
1.9.1

Re: [PATCH 1/1] qla2xxx: Fix sparse warnings for N2N

2017-10-13 Thread Madhani, Himanshu


> On Oct 13, 2017, at 5:25 AM, Ewan D. Milne  wrote:
> 
> On Thu, 2017-10-12 at 23:08 -0700, Madhani, Madhani wrote:
>> From: Himanshu Madhani 
>> 
>> Fixes following warning reported by 0-day kernel test build
>> 
>> drivers/scsi/qla2xxx/qla_iocb.c: In function 
>> 'qla2x00_els_dcmd2_iocb_timeout':
>> drivers/scsi/qla2xxx/qla_iocb.c:2611:50: warning: format '%x' expects a 
>> matching
>> 'unsigned int' argument [-Wformat=]
>> "%s %d ELS Timeout, %8phC hdl=%x, portid=%06x\n",
>> 
>> drivers/scsi/qla2xxx/qla_iocb.c: In function 'qla2x00_els_dcmd2_sp_done':
>> drivers/scsi/qla2xxx/qla_iocb.c:2634:11: warning: format '%s' expects 
>> argument of type 'char *', but argument 6 has type 'uint32_t {aka unsigned 
>> int}' [-Wformat=]
>> "%s %s ELS hdl=%x, portid=%06x done %8pC\n",
>> ^
>> drivers/scsi/qla2xxx/qla_iocb.c:2634:35: warning: format '%x' expects 
>> argument of type 'unsigned int', but argument 8 has type 'uint8_t * {aka 
>> unsigned char *}' [-Wformat=]
>> "%s %s ELS hdl=%x, portid=%06x done %8pC\n",
>> ^
>> drivers/scsi/qla2xxx/qla_iocb.c:2634:44: warning: format '%p' expects a 
>> matching 'void *' argument [-Wformat=]
>> "%s %s %ELS %hdl=%x, %portid=%06x %done %8pC\n",
>> ^
>> Signed-off-by: Himanshu Madhani 
>> ---
>> drivers/scsi/qla2xxx/qla_iocb.c | 4 ++--
>> 1 file changed, 2 insertions(+), 2 deletions(-)
>> 
>> diff --git a/drivers/scsi/qla2xxx/qla_iocb.c 
>> b/drivers/scsi/qla2xxx/qla_iocb.c
>> index 6e57c48f8d95..0502ed7ee14e 100644
>> --- a/drivers/scsi/qla2xxx/qla_iocb.c
>> +++ b/drivers/scsi/qla2xxx/qla_iocb.c
>> @@ -2608,7 +2608,7 @@ qla2x00_els_dcmd2_iocb_timeout(void *data)
>>  int res;
>> 
>>  ql_dbg(ql_dbg_io + ql_dbg_disc, vha, 0x3069,
>> -"%s %d ELS Timeout, %8phC hdl=%x, portid=%06x\n",
>> +"%s hdl=%x ELS Timeout, %8phC portid=%06x\n",
>>  sp->name, sp->handle, fcport->port_name, fcport->d_id.b24);
>> 
>>  /* Abort the exchange */
>> @@ -2631,7 +2631,7 @@ qla2x00_els_dcmd2_sp_done(void *ptr, int res)
>>  struct scsi_qla_host *vha = sp->vha;
>> 
>>  ql_dbg(ql_dbg_io + ql_dbg_disc, vha, 0x3072,
>> -"%s %s ELS hdl=%x, portid=%06x done %8pC\n",
>> +"%s ELS hdl=%x, portid=%06x done %8pC\n",
>>  sp->name, sp->handle, fcport->d_id.b24, fcport->port_name);
>> 
>>  complete(&lio->u.els_plogi.comp);
> 
> Can you please just submit a corrected v2 of your earlier patch instead
> of this one?

Sure.

Martin, 

let me know if you would refer me to submit v2 of the original patch or this 
patch is okay? 

> -Ewan
> 

Thanks,
Himanshu

Re: [PATCH 1/8] lib: Introduce sgl_alloc() and sgl_free()

2017-10-13 Thread Jens Axboe

On 10/12/2017 05:00 PM, Bart Van Assche wrote:
> On Thu, 2017-10-12 at 16:52 -0600, Jens Axboe wrote:
>> On 10/12/2017 04:45 PM, Bart Van Assche wrote:
>>> +++ b/include/linux/sgl_alloc.h
>>> @@ -0,0 +1,16 @@
>>> +#ifndef _SGL_ALLOC_H_
>>> +#define _SGL_ALLOC_H_
>>> +
>>> +#include  /* bool, gfp_t */
>>> +
>>> +struct scatterlist;
>>> +
>>> +struct scatterlist *sgl_alloc_order(unsigned long long length,
>>> +   unsigned int order, unsigned int *nent_p,
>>> +   gfp_t gfp, bool chainable);
>>> +struct scatterlist *sgl_alloc(unsigned long long length, unsigned int 
>>> *nent_p,
>>> + gfp_t gfp);
>>> +void sgl_free_order(struct scatterlist *sgl, int order);
>>> +void sgl_free(struct scatterlist *sgl);
>>> +
>>> +#endif /* _SGL_ALLOC_H_ */
>>
>> Should this just go into linux/scatterlist.h instead of creating a new header
>> file?
> 
> That's something I can change. I don't have a strong opinion about whether
> these declarations should go into a new header file or into 
> linux/scatterlist.h.

I think you should make that change, keep things simpler. As Johannes
suggested, you could move the code to lib/scatterlist.c as well. I would
recommend keeping the kconfig option to only build it when needed, though.

-- 
Jens Axboe

[PATCH] zfcp: fix erp_action use-before-initialize in REC action trace

2017-10-13 Thread Steffen Maier

v4.10 commit 6f2ce1c6af37 ("scsi: zfcp: fix rport unblock race with LUN
recovery") extended accessing parent pointer fields of
struct zfcp_erp_action for tracing.
If an erp_action has never been enqueued before, these parent pointer
fields are uninitialized and NULL. Examples are zfcp objects freshly added
to the parent object's children list, before enqueueing their first
recovery subsequently. In zfcp_erp_try_rport_unblock(), we iterate such
list. Accessing erp_action fields can cause a NULL pointer dereference.
Since the kernel can read from lowcore on s390, it does not immediately
cause a kernel page fault. Instead it can cause hangs on trying to acquire
the wrong erp_action->adapter->dbf->rec_lock in zfcp_dbf_rec_action_lvl()
  ^bogus^
while holding already other locks with IRQs disabled.

Real life example from attaching lots of LUNs in parallel on many CPUs:

crash> bt 17723
PID: 17723  TASK: ...   CPU: 25  COMMAND: "zfcperp0.0.1800"
 LOWCORE INFO:
  -psw  : 0x040430018000 0x0038e424
  -function : _raw_spin_lock_wait_flags at 38e424
...
 #0 [fdde8fc90] zfcp_dbf_rec_action_lvl at 3e0004e9862 [zfcp]
 #1 [fdde8fce8] zfcp_erp_try_rport_unblock at 3e0004dfddc [zfcp]
 #2 [fdde8fd38] zfcp_erp_strategy at 3e0004e0234 [zfcp]
 #3 [fdde8fda8] zfcp_erp_thread at 3e0004e0a12 [zfcp]
 #4 [fdde8fe60] kthread at 173550
 #5 [fdde8feb8] kernel_thread_starter at 10add2

zfcp_adapter
 zfcp_port
  zfcp_unit , 0x404040d6
  scsi_device NULL, returning early!
zfcp_scsi_dev.status = 0x4000
0x4000 ZFCP_STATUS_COMMON_RUNNING

crash> zfcp_unit 
struct zfcp_unit {
  erp_action = {
adapter = 0x0,
port = 0x0,
unit = 0x0,
  },
}

zfcp_erp_action is always fully embedded into its container object. Such
container object is never moved in its object tree (only add or delete).
Hence, erp_action parent pointers can never change.

To fix the issue, initialize the erp_action parent pointers
before adding the erp_action container to any list and thus before it
becomes accessible from outside of its initializing function.

In order to also close the time window between zfcp_erp_setup_act()
memsetting the entire erp_action to zero and setting the parent pointers
again, drop the memset and instead explicitly initialize individually all
erp_action fields except for parent pointers. To be extra careful not to
introduce any other unintended side effect, even keep zeroing the
erp_action fields for list and timer. Also double-check with WARN_ON_ONCE
that erp_action parent pointers never change, so we get to know when
we would deviate from previous behavior.

Signed-off-by: Steffen Maier 
Fixes: 6f2ce1c6af37 ("scsi: zfcp: fix rport unblock race with LUN recovery")
Cc:  #2.6.32+
Reviewed-by: Benjamin Block 
---

James, Martin,

it's an important bugfix cut against James' scsi.git fixes branch,
and would be nice if it could make it into 4.14 via rc.

 drivers/s390/scsi/zfcp_aux.c  |5 +
 drivers/s390/scsi/zfcp_erp.c  |   18 +++---
 drivers/s390/scsi/zfcp_scsi.c |5 +
 3 files changed, 21 insertions(+), 7 deletions(-)

--- a/drivers/s390/scsi/zfcp_aux.c
+++ b/drivers/s390/scsi/zfcp_aux.c
@@ -357,6 +357,8 @@ struct zfcp_adapter *zfcp_adapter_enqueu
 
adapter->next_port_scan = jiffies;
 
+   adapter->erp_action.adapter = adapter;
+
if (zfcp_qdio_setup(adapter))
goto failed;
 
@@ -513,6 +515,9 @@ struct zfcp_port *zfcp_port_enqueue(stru
port->dev.groups = zfcp_port_attr_groups;
port->dev.release = zfcp_port_release;
 
+   port->erp_action.adapter = adapter;
+   port->erp_action.port = port;
+
if (dev_set_name(&port->dev, "0x%016llx", (unsigned long long)wwpn)) {
kfree(port);
goto err_out;
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -193,9 +193,8 @@ static struct zfcp_erp_action *zfcp_erp_
atomic_or(ZFCP_STATUS_COMMON_ERP_INUSE,
&zfcp_sdev->status);
erp_action = &zfcp_sdev->erp_action;
-   memset(erp_action, 0, sizeof(struct zfcp_erp_action));
-   erp_action->port = port;
-   erp_action->sdev = sdev;
+   WARN_ON_ONCE(erp_action->port != port);
+   WARN_ON_ONCE(erp_action->sdev != sdev);
if (!(atomic_read(&zfcp_sdev->status) &
  ZFCP_STATUS_COMMON_RUNNING))
act_status |= ZFCP_STATUS_ERP_CLOSE_ONLY;
@@ -208,8 +207,8 @@ static struct zfcp_erp_action *zfcp_erp_
zfcp_erp_action_dismiss_port(port);
atomic_or(ZFCP_STATUS_COMMON_ERP_INUSE, &port->status);
erp_action = &port->erp_action;
-   memset(erp_action, 0, sizeof(struct zfcp_erp_action));
-   erp_action->port = port;
+   WARN_ON_ONCE(erp_action->port != port);
+   WARN_ON_ONCE(erp_action->sdev != NULL);

Re: [PATCH 1/1] qla2xxx: Fix sparse warnings for N2N

2017-10-13 Thread Ewan D. Milne

On Thu, 2017-10-12 at 23:08 -0700, Madhani, Madhani wrote:
> From: Himanshu Madhani 
> 
> Fixes following warning reported by 0-day kernel test build
> 
> drivers/scsi/qla2xxx/qla_iocb.c: In function 'qla2x00_els_dcmd2_iocb_timeout':
> drivers/scsi/qla2xxx/qla_iocb.c:2611:50: warning: format '%x' expects a 
> matching
> 'unsigned int' argument [-Wformat=]
>  "%s %d ELS Timeout, %8phC hdl=%x, portid=%06x\n",
> 
> drivers/scsi/qla2xxx/qla_iocb.c: In function 'qla2x00_els_dcmd2_sp_done':
> drivers/scsi/qla2xxx/qla_iocb.c:2634:11: warning: format '%s' expects 
> argument of type 'char *', but argument 6 has type 'uint32_t {aka unsigned 
> int}' [-Wformat=]
>  "%s %s ELS hdl=%x, portid=%06x done %8pC\n",
>  ^
> drivers/scsi/qla2xxx/qla_iocb.c:2634:35: warning: format '%x' expects 
> argument of type 'unsigned int', but argument 8 has type 'uint8_t * {aka 
> unsigned char *}' [-Wformat=]
>  "%s %s ELS hdl=%x, portid=%06x done %8pC\n",
>  ^
> drivers/scsi/qla2xxx/qla_iocb.c:2634:44: warning: format '%p' expects a 
> matching 'void *' argument [-Wformat=]
>  "%s %s %ELS %hdl=%x, %portid=%06x %done %8pC\n",
>  ^
> Signed-off-by: Himanshu Madhani 
> ---
>  drivers/scsi/qla2xxx/qla_iocb.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c
> index 6e57c48f8d95..0502ed7ee14e 100644
> --- a/drivers/scsi/qla2xxx/qla_iocb.c
> +++ b/drivers/scsi/qla2xxx/qla_iocb.c
> @@ -2608,7 +2608,7 @@ qla2x00_els_dcmd2_iocb_timeout(void *data)
>   int res;
>  
>   ql_dbg(ql_dbg_io + ql_dbg_disc, vha, 0x3069,
> - "%s %d ELS Timeout, %8phC hdl=%x, portid=%06x\n",
> + "%s hdl=%x ELS Timeout, %8phC portid=%06x\n",
>   sp->name, sp->handle, fcport->port_name, fcport->d_id.b24);
>  
>   /* Abort the exchange */
> @@ -2631,7 +2631,7 @@ qla2x00_els_dcmd2_sp_done(void *ptr, int res)
>   struct scsi_qla_host *vha = sp->vha;
>  
>   ql_dbg(ql_dbg_io + ql_dbg_disc, vha, 0x3072,
> - "%s %s ELS hdl=%x, portid=%06x done %8pC\n",
> + "%s ELS hdl=%x, portid=%06x done %8pC\n",
>   sp->name, sp->handle, fcport->d_id.b24, fcport->port_name);
>  
>   complete(&lio->u.els_plogi.comp);

Can you please just submit a corrected v2 of your earlier patch instead
of this one?

-Ewan

Re: [PATCHv6 4/5] scsi_devinfo: Whitespace fixes

2017-10-13 Thread Johannes Thumshirn

Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Re: [PATCHv6 5/5] scsi_devinfo: fixup string compare

2017-10-13 Thread Johannes Thumshirn

Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Re: [PATCHv6 1/5] scsi_debug: allow to specify inquiry vendor and model

2017-10-13 Thread Johannes Thumshirn

Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Re: [PATCHv6 3/5] scsi_devinfo: Reformat blacklist flags

2017-10-13 Thread Johannes Thumshirn

Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Re: [PATCHv6 2/5] scsi: Export blacklist flags to sysfs

2017-10-13 Thread Johannes Thumshirn

Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Re: [PATCHv6 0/5] scsi: Fixup blacklist handling

2017-10-13 Thread Hannes Reinecke

On 10/02/2017 04:26 PM, Hannes Reinecke wrote:
> Hi all,
> 
> the SCSI blacklist handling seems to be rather tricky issue;
> everytime a fix is included it tends to break other devices.
> This patchset attempt to simplify the devlist handling yet again,
> but this time implementing the framework for regression testing, too.
> A patch adding a regression test to the blktest suite will follow.
> 
> As usual, comments and reviews are welcome.
> 
Martin, ping?
What is the status of this patchset?
Do you need anything from my side?

Cheers,

Hannes
-- 
Dr. Hannes ReineckeTeamlead Storage & Networking
h...@suse.de   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

Re: [PATCH 8/8] scsi/pmcraid: Use sgl_alloc_order() and sgl_free_order()

2017-10-13 Thread Johannes Thumshirn

Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Re: [PATCH 7/8] scsi/pmcraid: Remove an unused structure member

2017-10-13 Thread Johannes Thumshirn

Fair enough,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Re: [PATCH 6/8] scsi/ipr: Use sgl_alloc_order() and sgl_free_order()

2017-10-13 Thread Johannes Thumshirn

Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Re: [PATCH 1/8] lib: Introduce sgl_alloc() and sgl_free()

2017-10-13 Thread Johannes Thumshirn

On Thu, Oct 12, 2017 at 11:00:10PM +, Bart Van Assche wrote:
> On Thu, 2017-10-12 at 16:52 -0600, Jens Axboe wrote:
> > On 10/12/2017 04:45 PM, Bart Van Assche wrote:
> > > +++ b/include/linux/sgl_alloc.h
> > > @@ -0,0 +1,16 @@
> > > +#ifndef _SGL_ALLOC_H_
> > > +#define _SGL_ALLOC_H_
> > > +
> > > +#include  /* bool, gfp_t */
> > > +
> > > +struct scatterlist;
> > > +
> > > +struct scatterlist *sgl_alloc_order(unsigned long long length,
> > > + unsigned int order, unsigned int *nent_p,
> > > + gfp_t gfp, bool chainable);
> > > +struct scatterlist *sgl_alloc(unsigned long long length, unsigned int 
> > > *nent_p,
> > > +   gfp_t gfp);
> > > +void sgl_free_order(struct scatterlist *sgl, int order);
> > > +void sgl_free(struct scatterlist *sgl);
> > > +
> > > +#endif /* _SGL_ALLOC_H_ */
> > 
> > Should this just go into linux/scatterlist.h instead of creating a new 
> > header
> > file?
> 
> That's something I can change. I don't have a strong opinion about whether
> these declarations should go into a new header file or into 
> linux/scatterlist.h.

Yes please do (and the implementaion should go into lib/scatterlist.c).
Additionally please get rid of the new CONFIG_SGL_ALLOC as well.

Thanks,
Johannes
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Re: [PATCH 4/8] nvmet/rdma: Use sgl_alloc() and sgl_free()

2017-10-13 Thread Johannes Thumshirn

Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Re: [PATCH 3/8] nvmet/fc: Use sgl_alloc() and sgl_free()

2017-10-13 Thread Johannes Thumshirn

Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

[PATCH v4 0/5] scsi: ufs: add ufs driver code for Hisilicon Hi3660 SoC

2017-10-13 Thread Li Wei

This patchset adds driver support for UFS for Hi3660 SoC. It is verified on 
HiKey960 board.

Li Wei (5):
  scsi: ufs: add Hisilicon ufs driver code
  dt-bindings: scsi: ufs: add document for hisi-ufs
  arm64: dts: add ufs dts node
  arm64: defconfig: enable configs for Hisilicon ufs
  arm64: defconfig: enable f2fs and squashfs

 Documentation/devicetree/bindings/ufs/ufs-hisi.txt |  47 ++
 arch/arm64/boot/dts/hisilicon/hi3660-hikey960.dts  |   5 +
 arch/arm64/boot/dts/hisilicon/hi3660.dtsi  |  19 +
 arch/arm64/configs/defconfig   |  11 +
 drivers/scsi/ufs/Kconfig   |   9 +
 drivers/scsi/ufs/Makefile  |   1 +
 drivers/scsi/ufs/ufs-hisi.c| 625 +
 drivers/scsi/ufs/ufs-hisi.h| 161 ++
 8 files changed, 878 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/ufs/ufs-hisi.txt
 create mode 100644 drivers/scsi/ufs/ufs-hisi.c
 create mode 100644 drivers/scsi/ufs/ufs-hisi.h

-- 
2.11.0

[PATCH v4 4/5] arm64: defconfig: enable configs for Hisilicon ufs

2017-10-13 Thread Li Wei

This enable configs for Hisilicon Hi UFS driver.

Signed-off-by: Li Wei 
Signed-off-by: Zhangfei Gao 
Signed-off-by: Guodong Xu 
---
 arch/arm64/configs/defconfig | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 34480e9af2e7..8aff981915f5 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -168,6 +168,9 @@ CONFIG_BLK_DEV_SD=y
 CONFIG_SCSI_SAS_ATA=y
 CONFIG_SCSI_HISI_SAS=y
 CONFIG_SCSI_HISI_SAS_PCI=y
+CONFIG_SCSI_UFSHCD=y
+CONFIG_SCSI_UFSHCD_PLATFORM=y
+CONFIG_SCSI_UFS_HISI=y
 CONFIG_ATA=y
 CONFIG_SATA_AHCI=y
 CONFIG_SATA_AHCI_PLATFORM=y
-- 
2.11.0

[PATCH v4 2/5] dt-bindings: scsi: ufs: add document for hisi-ufs

2017-10-13 Thread Li Wei

add ufs node document for Hisilicon.

Signed-off-by: Li Wei 
---
 Documentation/devicetree/bindings/ufs/ufs-hisi.txt | 47 ++
 1 file changed, 47 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/ufs/ufs-hisi.txt

diff --git a/Documentation/devicetree/bindings/ufs/ufs-hisi.txt 
b/Documentation/devicetree/bindings/ufs/ufs-hisi.txt
new file mode 100644
index ..ee114a65143d
--- /dev/null
+++ b/Documentation/devicetree/bindings/ufs/ufs-hisi.txt
@@ -0,0 +1,47 @@
+* Hisilicon Universal Flash Storage (UFS) Host Controller
+
+UFS nodes are defined to describe on-chip UFS hardware macro.
+Each UFS Host Controller should have its own node.
+
+Required properties:
+- compatible: compatible list, contains one of the following -
+   "hisilicon,hi3660-ufs" for hisi ufs host controller
+present on Hi3660 chipset.
+- reg   : should contain UFS register address space & UFS SYS CTRL 
register address,
+- interrupt-parent  : interrupt device
+- interrupts: interrupt number
+- clocks   : List of phandle and clock specifier pairs
+- clock-names   : List of clock input name strings sorted in the same
+ order as the clocks property. "clk_ref", "clk_phy" is 
optional
+- resets: reset node register, one reset the clk and the other 
reset the controller
+- reset-names   : describe reset node register
+
+Optional properties for board device:
+- reset-gpio   : specifies to reset devices
+
+Example:
+
+   ufs: ufs@ff3b {
+   compatible = "hisilicon,hi3660-ufs", "jedec,ufs-1.1";
+   /* 0: HCI standard */
+   /* 1: UFS SYS CTRL */
+   reg = <0x0 0xff3b 0x0 0x1000>,
+   <0x0 0xff3b1000 0x0 0x1000>;
+   interrupt-parent = <&gic>;
+   interrupts = ;
+   clocks = <&crg_ctrl HI3660_CLK_GATE_UFSIO_REF>,
+   <&crg_ctrl HI3660_CLK_GATE_UFSPHY_CFG>;
+   clock-names = "clk_ref", "clk_phy";
+   freq-table-hz = <0 0>, <0 0>;
+   /* offset: 0x84; bit: 12 */
+   /* offset: 0x84; bit: 7  */
+   resets = <&crg_rst 0x84 12>,
+   <&crg_rst 0x84 7>;
+   reset-names = "rst", "assert";
+   };
+
+   &ufs {
+   reset-gpio = <&gpio18 1 0>;
+   status = "okay";
+   };
+
-- 
2.11.0

[PATCH v4 5/5] arm64: defconfig: enable f2fs and squashfs

2017-10-13 Thread Li Wei

Partitions in HiKey960 are formatted as f2fs and squashfs.
f2fs is for userdata; squashfs is for system. Both partitions are required
by Android.

Signed-off-by: Li Wei 
Signed-off-by: Zhangfei Gao 
Signed-off-by: Guodong Xu 
---
 arch/arm64/configs/defconfig | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 8aff981915f5..0a8a843cd0be 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -555,6 +555,7 @@ CONFIG_ACPI_APEI_GHES=y
 CONFIG_ACPI_APEI_PCIEAER=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
+CONFIG_F2FS_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_BTRFS_FS=m
 CONFIG_BTRFS_FS_POSIX_ACL=y
@@ -570,6 +571,13 @@ CONFIG_HUGETLBFS=y
 CONFIG_CONFIGFS_FS=y
 CONFIG_EFIVAR_FS=y
 CONFIG_SQUASHFS=y
+CONFIG_SQUASHFS_FILE_DIRECT=y
+CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU=y
+CONFIG_SQUASHFS_XATTR=y
+CONFIG_SQUASHFS_LZ4=y
+CONFIG_SQUASHFS_LZO=y
+CONFIG_SQUASHFS_XZ=y
+CONFIG_SQUASHFS_4K_DEVBLK_SIZE=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V4=y
 CONFIG_NFS_V4_1=y
-- 
2.11.0

[PATCH v4 3/5] arm64: dts: add ufs dts node

2017-10-13 Thread Li Wei

arm64: dts: add ufs node for Hisilicon.

Signed-off-by: Li Wei 
---
 arch/arm64/boot/dts/hisilicon/hi3660-hikey960.dts |  5 +
 arch/arm64/boot/dts/hisilicon/hi3660.dtsi | 19 +++
 2 files changed, 24 insertions(+)

diff --git a/arch/arm64/boot/dts/hisilicon/hi3660-hikey960.dts 
b/arch/arm64/boot/dts/hisilicon/hi3660-hikey960.dts
index fd4705c451e2..457645a2b53f 100644
--- a/arch/arm64/boot/dts/hisilicon/hi3660-hikey960.dts
+++ b/arch/arm64/boot/dts/hisilicon/hi3660-hikey960.dts
@@ -276,3 +276,8 @@
interrupts = <3 IRQ_TYPE_EDGE_RISING>;
};
 };
+
+&ufs {
+   reset-gpio = <&gpio18 1 0>;
+   status = "okay";
+};
diff --git a/arch/arm64/boot/dts/hisilicon/hi3660.dtsi 
b/arch/arm64/boot/dts/hisilicon/hi3660.dtsi
index b7a90d632959..a24ab8472347 100644
--- a/arch/arm64/boot/dts/hisilicon/hi3660.dtsi
+++ b/arch/arm64/boot/dts/hisilicon/hi3660.dtsi
@@ -978,5 +978,24 @@
clocks = <&crg_ctrl HI3660_OSC32K>;
clock-names = "apb_pclk";
};
+
+   ufs: ufs@ff3b {
+   compatible = "hisilicon,hi3660-ufs", "jedec,ufs-1.1";
+   /* 0: HCI standard */
+   /* 1: UFS SYS CTRL */
+   reg = <0x0 0xff3b 0x0 0x1000>,
+   <0x0 0xff3b1000 0x0 0x1000>;
+   interrupt-parent = <&gic>;
+   interrupts = ;
+   clocks = <&crg_ctrl HI3660_CLK_GATE_UFSIO_REF>,
+   <&crg_ctrl HI3660_CLK_GATE_UFSPHY_CFG>;
+   clock-names = "clk_ref", "clk_phy";
+   freq-table-hz = <0 0>, <0 0>;
+   /* offset: 0x84; bit: 12 */
+   /* offset: 0x84; bit: 7  */
+   resets = <&crg_rst 0x84 12>,
+   <&crg_rst 0x84 7>;
+   reset-names = "rst", "assert";
+   };
};
 };
-- 
2.11.0

[PATCH v4 1/5] scsi: ufs: add Hisilicon ufs driver code

2017-10-13 Thread Li Wei

add Hisilicon ufs driver code.

Signed-off-by: Li Wei 
Signed-off-by: Geng Jianfeng 
Signed-off-by: Zang Leigang 
Signed-off-by: Yu Jianfeng 
---
 drivers/scsi/ufs/Kconfig|   9 +
 drivers/scsi/ufs/Makefile   |   1 +
 drivers/scsi/ufs/ufs-hisi.c | 625 
 drivers/scsi/ufs/ufs-hisi.h | 161 
 4 files changed, 796 insertions(+)
 create mode 100644 drivers/scsi/ufs/ufs-hisi.c
 create mode 100644 drivers/scsi/ufs/ufs-hisi.h

diff --git a/drivers/scsi/ufs/Kconfig b/drivers/scsi/ufs/Kconfig
index e27b4d4e6ae2..bf2ff5628b15 100644
--- a/drivers/scsi/ufs/Kconfig
+++ b/drivers/scsi/ufs/Kconfig
@@ -80,6 +80,15 @@ config SCSI_UFSHCD_PLATFORM
 
  If unsure, say N.
 
+config SCSI_UFS_HISI
+   tristate "Hisilicon specific hooks to UFS controller platform driver"
+   depends on (ARCH_HISI || COMPILE_TEST) && SCSI_UFSHCD_PLATFORM
+   ---help---
+ This selects the Hisilicon specific additions to UFSHCD platform 
driver.
+
+ Select this if you have UFS controller on Hisilicon chipset.
+ If unsure, say N.
+
 config SCSI_UFS_DWC_TC_PLATFORM
tristate "DesignWare platform support using a G210 Test Chip"
depends on SCSI_UFSHCD_PLATFORM
diff --git a/drivers/scsi/ufs/Makefile b/drivers/scsi/ufs/Makefile
index 6e77cb0bfee9..9f2c17029a38 100644
--- a/drivers/scsi/ufs/Makefile
+++ b/drivers/scsi/ufs/Makefile
@@ -2,6 +2,7 @@
 obj-$(CONFIG_SCSI_UFS_DWC_TC_PCI) += tc-dwc-g210-pci.o ufshcd-dwc.o 
tc-dwc-g210.o
 obj-$(CONFIG_SCSI_UFS_DWC_TC_PLATFORM) += tc-dwc-g210-pltfrm.o ufshcd-dwc.o 
tc-dwc-g210.o
 obj-$(CONFIG_SCSI_UFS_QCOM) += ufs-qcom.o
+obj-$(CONFIG_SCSI_UFS_HISI) += ufs-hisi.o
 obj-$(CONFIG_SCSI_UFSHCD) += ufshcd.o
 obj-$(CONFIG_SCSI_UFSHCD_PCI) += ufshcd-pci.o
 obj-$(CONFIG_SCSI_UFSHCD_PLATFORM) += ufshcd-pltfrm.o
diff --git a/drivers/scsi/ufs/ufs-hisi.c b/drivers/scsi/ufs/ufs-hisi.c
new file mode 100644
index ..8b7aea2d44c5
--- /dev/null
+++ b/drivers/scsi/ufs/ufs-hisi.c
@@ -0,0 +1,625 @@
+/*
+ *
+ * HiSilicon Hi UFS Driver
+ *
+ * Copyright (c) 2016-2017 Linaro Ltd.
+ * Copyright (c) 2016-2017 HiSilicon Technologies Co., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ufshcd.h"
+#include "ufshcd-pltfrm.h"
+#include "unipro.h"
+#include "ufs-hisi.h"
+#include "ufshci.h"
+
+static int ufs_hisi_check_hibern8(struct ufs_hba *hba)
+{
+   int err;
+   u32 tx_fsm_val_0;
+   u32 tx_fsm_val_1;
+   unsigned long timeout = jiffies + msecs_to_jiffies(HBRN8_POLL_TOUT_MS);
+
+   do {
+   err = ufshcd_dme_get(hba, UIC_ARG_MIB_SEL(MPHY_TX_FSM_STATE, 0),
+ &tx_fsm_val_0);
+   err |= ufshcd_dme_get(hba,
+   UIC_ARG_MIB_SEL(MPHY_TX_FSM_STATE, 1), &tx_fsm_val_1);
+   if (err || (tx_fsm_val_0 == TX_FSM_HIBERN8 &&
+   tx_fsm_val_1 == TX_FSM_HIBERN8))
+   break;
+
+   /* sleep for max. 200us */
+   usleep_range(100, 200);
+   } while (time_before(jiffies, timeout));
+
+   /*
+* we might have scheduled out for long during polling so
+* check the state again.
+*/
+   if (time_after(jiffies, timeout)) {
+   err = ufshcd_dme_get(hba, UIC_ARG_MIB_SEL(MPHY_TX_FSM_STATE, 0),
+&tx_fsm_val_0);
+   err |= ufshcd_dme_get(hba,
+UIC_ARG_MIB_SEL(MPHY_TX_FSM_STATE, 1), &tx_fsm_val_1);
+   }
+
+   if (err) {
+   dev_err(hba->dev, "%s: unable to get TX_FSM_STATE, err %d\n",
+   __func__, err);
+   } else if (tx_fsm_val_0 != TX_FSM_HIBERN8 ||
+tx_fsm_val_1 != TX_FSM_HIBERN8) {
+   err = -1;
+   dev_err(hba->dev, "%s: invalid TX_FSM_STATE, lane0 = %d, lane1 
= %d\n",
+   __func__, tx_fsm_val_0, tx_fsm_val_1);
+   }
+
+   return err;
+}
+
+static void ufs_hisi_clk_init(struct ufs_hba *hba)
+{
+   struct ufs_hisi_host *host = ufshcd_get_variant(hba);
+
+   ufs_sys_ctrl_clr_bits(host, BIT_SYSCTRL_REF_CLOCK_EN, PHY_CLK_CTRL);
+   if (ufs_sys_ctrl_readl(host, PHY_CLK_CTRL) & BIT_SYSCTRL_REF_CLOCK_EN)
+   mdelay(1);
+   /* use abb clk */
+   ufs_sys_ctrl_clr_bits(host, BIT_UFS_REFCLK_SRC_SEl, UFS_SYSCTRL);
+   ufs_sys_ctrl_clr_bits(host, BIT_UFS_REFCLK_ISO_EN, PHY_ISO_EN);
+   /* open mphy ref clk */
+   ufs_sys_ctrl_set_bits(host, BIT_SYSCTRL_REF_CLOCK_EN, PHY_CLK_CTRL);
+}
+
+static void ufs_hisi_soc_init(struct ufs_hba *hba)
+{
+   struct ufs_hisi_host *host = ufshcd_get_variant(

78 matches

Mail list logo