Administrator intervention is currently required to get good numbers when switching from running latency tests to IOPS tests.
The configured interrupt coalescing values will greatly affect the results of these tests. Currently, the driver has a single coalescing value set by the value of the module attribute. This patch changes the driver to support auto-configuration of the coalescing value based on the total number of outstanding IOs and average number of CQEs processed per interrupt for an EQ. Values are checked every 5 seconds. The driver defaults to the automatic selection. Automatic selection can be disabled by the new lpfc_auto_imax module parameter. Older hardware can only change interrupt coalescing by mailbox command. Newer hardware supports change via a register. The patch supports both. Signed-off-by: Dick Kennedy <dick.kenn...@broadcom.com> Signed-off-by: James Smart <james.sm...@broadcom.com> --- drivers/scsi/lpfc/lpfc.h | 3 ++ drivers/scsi/lpfc/lpfc_attr.c | 20 +++++++- drivers/scsi/lpfc/lpfc_debugfs.c | 4 +- drivers/scsi/lpfc/lpfc_hw4.h | 14 ++++++ drivers/scsi/lpfc/lpfc_init.c | 104 ++++++++++++++++++++++++++++++++++++++- drivers/scsi/lpfc/lpfc_sli.c | 36 +++++++++++--- drivers/scsi/lpfc/lpfc_sli.h | 1 + drivers/scsi/lpfc/lpfc_sli4.h | 8 +-- 8 files changed, 177 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index a9d73728a68c..562dc0139735 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -756,6 +756,7 @@ struct lpfc_hba { uint8_t nvmet_support; /* driver supports NVMET */ #define LPFC_NVMET_MAX_PORTS 32 uint8_t mds_diags_support; + uint32_t initial_imax; /* HBA Config Parameters */ uint32_t cfg_ack0; @@ -777,6 +778,7 @@ struct lpfc_hba { uint32_t cfg_poll_tmo; uint32_t cfg_task_mgmt_tmo; uint32_t cfg_use_msi; + uint32_t cfg_auto_imax; uint32_t cfg_fcp_imax; uint32_t cfg_fcp_cpu_map; uint32_t cfg_fcp_io_channel; @@ -1050,6 +1052,7 @@ struct lpfc_hba { uint8_t temp_sensor_support; /* Fields used for heart beat. 
*/ + unsigned long last_eqdelay_time; unsigned long last_completion_time; unsigned long skipped_hb; struct timer_list hb_tmofunc; diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 8eee39de15f7..66269e342c7e 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -4481,9 +4481,11 @@ lpfc_fcp_imax_store(struct device *dev, struct device_attribute *attr, return -EINVAL; phba->cfg_fcp_imax = (uint32_t)val; + phba->initial_imax = phba->cfg_fcp_imax; for (i = 0; i < phba->io_channel_irqs; i += LPFC_MAX_EQ_DELAY_EQID_CNT) - lpfc_modify_hba_eq_delay(phba, i); + lpfc_modify_hba_eq_delay(phba, i, LPFC_MAX_EQ_DELAY_EQID_CNT, + val); return strlen(buf); } @@ -4538,6 +4540,16 @@ lpfc_fcp_imax_init(struct lpfc_hba *phba, int val) static DEVICE_ATTR(lpfc_fcp_imax, S_IRUGO | S_IWUSR, lpfc_fcp_imax_show, lpfc_fcp_imax_store); +/* + * lpfc_auto_imax: Controls Auto-interrupt coalescing values support. + * 0 No auto_imax support + * 1 auto imax on + * Auto imax will change the value of fcp_imax on a per EQ basis, using + * the EQ Delay Multiplier, depending on the activity for that EQ. + * Value range [0,1]. Default value is 1. + */ +LPFC_ATTR_RW(auto_imax, 1, 0, 1, "Enable Auto imax"); + /** * lpfc_state_show - Display current driver CPU affinity * @dev: class converted to a Scsi_host structure. 
@@ -5164,6 +5176,7 @@ struct device_attribute *lpfc_hba_attrs[] = { &dev_attr_lpfc_task_mgmt_tmo, &dev_attr_lpfc_use_msi, &dev_attr_lpfc_nvme_oas, + &dev_attr_lpfc_auto_imax, &dev_attr_lpfc_fcp_imax, &dev_attr_lpfc_fcp_cpu_map, &dev_attr_lpfc_fcp_io_channel, @@ -6182,6 +6195,7 @@ lpfc_get_cfgparam(struct lpfc_hba *phba) lpfc_enable_SmartSAN_init(phba, lpfc_enable_SmartSAN); lpfc_use_msi_init(phba, lpfc_use_msi); lpfc_nvme_oas_init(phba, lpfc_nvme_oas); + lpfc_auto_imax_init(phba, lpfc_auto_imax); lpfc_fcp_imax_init(phba, lpfc_fcp_imax); lpfc_fcp_cpu_map_init(phba, lpfc_fcp_cpu_map); lpfc_enable_hba_reset_init(phba, lpfc_enable_hba_reset); @@ -6226,6 +6240,10 @@ lpfc_get_cfgparam(struct lpfc_hba *phba) phba->cfg_enable_fc4_type |= LPFC_ENABLE_FCP; } + if (phba->cfg_auto_imax && !phba->cfg_fcp_imax) + phba->cfg_auto_imax = 0; + phba->initial_imax = phba->cfg_fcp_imax; + /* A value of 0 means use the number of CPUs found in the system */ if (phba->cfg_fcp_io_channel == 0) phba->cfg_fcp_io_channel = phba->sli4_hba.num_present_cpu; diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 6089690fb345..13259ff23887 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -3265,9 +3265,9 @@ __lpfc_idiag_print_eq(struct lpfc_queue *qp, char *eqtype, len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n%s EQ info: EQ-STAT[max:x%x noE:x%x " - "bs:x%x proc:x%llx]\n", + "bs:x%x proc:x%llx eqd %d]\n", eqtype, qp->q_cnt_1, qp->q_cnt_2, qp->q_cnt_3, - (unsigned long long)qp->q_cnt_4); + (unsigned long long)qp->q_cnt_4, qp->q_mode); len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "EQID[%02d], QE-CNT[%04d], QE-SZ[%04d], " "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]", diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index e0a5fce416ae..bb4715705fa3 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -197,6 +197,7 @@ struct lpfc_sli_intf { /* 
Delay Multiplier constant */ #define LPFC_DMULT_CONST 651042 +#define LPFC_DMULT_MAX 1023 /* Configuration of Interrupts / sec for entire HBA port */ #define LPFC_MIN_IMAX 5000 @@ -657,6 +658,15 @@ struct lpfc_register { #define LPFC_CTL_PORT_ER1_OFFSET 0x40C #define LPFC_CTL_PORT_ER2_OFFSET 0x410 +#define LPFC_CTL_PORT_EQ_DELAY_OFFSET 0x418 +#define lpfc_sliport_eqdelay_delay_SHIFT 16 +#define lpfc_sliport_eqdelay_delay_MASK 0xffff +#define lpfc_sliport_eqdelay_delay_WORD word0 +#define lpfc_sliport_eqdelay_id_SHIFT 0 +#define lpfc_sliport_eqdelay_id_MASK 0xfff +#define lpfc_sliport_eqdelay_id_WORD word0 +#define LPFC_SEC_TO_USEC 1000000 + /* The following Registers apply to SLI4 if_type 0 UCNAs. They typically * reside in BAR 2. */ @@ -3258,6 +3268,10 @@ struct lpfc_sli4_parameters { #define cfg_xib_SHIFT 4 #define cfg_xib_MASK 0x00000001 #define cfg_xib_WORD word19 +#define cfg_eqdr_SHIFT 8 +#define cfg_eqdr_MASK 0x00000001 +#define cfg_eqdr_WORD word19 +#define LPFC_NODELAY_MAX_IO 32 }; #define LPFC_SET_UE_RECOVERY 0x10 diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index a825806036c3..9d3a12636455 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -1249,6 +1249,12 @@ lpfc_hb_timeout_handler(struct lpfc_hba *phba) int retval, i; struct lpfc_sli *psli = &phba->sli; LIST_HEAD(completions); + struct lpfc_queue *qp; + unsigned long time_elapsed; + uint32_t tick_cqe, max_cqe, val; + uint64_t tot, data1, data2, data3; + struct lpfc_register reg_data; + void __iomem *eqdreg = phba->sli4_hba.u.if_type2.EQDregaddr; vports = lpfc_create_vport_work_array(phba); if (vports != NULL) @@ -1263,6 +1269,95 @@ lpfc_hb_timeout_handler(struct lpfc_hba *phba) (phba->pport->fc_flag & FC_OFFLINE_MODE)) return; + if (phba->cfg_auto_imax) { + if (!phba->last_eqdelay_time) { + phba->last_eqdelay_time = jiffies; + goto skip_eqdelay; + } + time_elapsed = jiffies - phba->last_eqdelay_time; + phba->last_eqdelay_time = jiffies; + + 
tot = 0xffff; + /* Check outstanding IO count */ + if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) { + if (phba->nvmet_support) { + spin_lock(&phba->sli4_hba.nvmet_io_lock); + tot = phba->sli4_hba.nvmet_xri_cnt - + phba->sli4_hba.nvmet_ctx_cnt; + spin_unlock(&phba->sli4_hba.nvmet_io_lock); + } else { + tot = atomic_read(&phba->fc4NvmeIoCmpls); + data1 = atomic_read( + &phba->fc4NvmeInputRequests); + data2 = atomic_read( + &phba->fc4NvmeOutputRequests); + data3 = atomic_read( + &phba->fc4NvmeControlRequests); + tot = (data1 + data2 + data3) - tot; + } + } + + /* Interrupts per sec per EQ */ + val = phba->cfg_fcp_imax / phba->io_channel_irqs; + tick_cqe = val / CONFIG_HZ; /* Per tick per EQ */ + + /* Assume 1 CQE/ISR, calc max CQEs allowed for time duration */ + max_cqe = time_elapsed * tick_cqe; + + for (i = 0; i < phba->io_channel_irqs; i++) { + /* Fast-path EQ */ + qp = phba->sli4_hba.hba_eq[i]; + if (!qp) + continue; + + /* Use no EQ delay if we don't have many outstanding + * IOs, or if we are only processing 1 CQE/ISR or less. + * Otherwise, assume we can process up to lpfc_fcp_imax + * interrupts per HBA. + */ + if (tot < LPFC_NODELAY_MAX_IO || + qp->EQ_cqe_cnt <= max_cqe) + val = 0; + else + val = phba->cfg_fcp_imax; + + if (phba->sli.sli_flag & LPFC_SLI_USE_EQDR) { + /* Use EQ Delay Register method */ + + /* Convert for EQ Delay register */ + if (val) { + /* First, interrupts per sec per EQ */ + val = phba->cfg_fcp_imax / + phba->io_channel_irqs; + + /* us delay between each interrupt */ + val = LPFC_SEC_TO_USEC / val; + } + if (val != qp->q_mode) { + reg_data.word0 = 0; + bf_set(lpfc_sliport_eqdelay_id, + ®_data, qp->queue_id); + bf_set(lpfc_sliport_eqdelay_delay, + ®_data, val); + writel(reg_data.word0, eqdreg); + } + } else { + /* Use mbox command method */ + if (val != qp->q_mode) + lpfc_modify_hba_eq_delay(phba, i, + 1, val); + } + + /* + * val is cfg_fcp_imax or 0 for mbox delay or us delay + * between interrupts for EQDR. 
+ */ + qp->q_mode = val; + qp->EQ_cqe_cnt = 0; + } + } + +skip_eqdelay: spin_lock_irq(&phba->pport->work_port_lock); if (time_after(phba->last_completion_time + @@ -7257,6 +7352,9 @@ lpfc_sli4_bar0_register_memmap(struct lpfc_hba *phba, uint32_t if_type) phba->sli4_hba.conf_regs_memmap_p + LPFC_SLI_INTF; break; case LPFC_SLI_INTF_IF_TYPE_2: + phba->sli4_hba.u.if_type2.EQDregaddr = + phba->sli4_hba.conf_regs_memmap_p + + LPFC_CTL_PORT_EQ_DELAY_OFFSET; phba->sli4_hba.u.if_type2.ERR1regaddr = phba->sli4_hba.conf_regs_memmap_p + LPFC_CTL_PORT_ER1_OFFSET; @@ -8783,7 +8881,8 @@ lpfc_sli4_queue_setup(struct lpfc_hba *phba) } for (qidx = 0; qidx < io_channel; qidx += LPFC_MAX_EQ_DELAY_EQID_CNT) - lpfc_modify_hba_eq_delay(phba, qidx); + lpfc_modify_hba_eq_delay(phba, qidx, LPFC_MAX_EQ_DELAY_EQID_CNT, + phba->cfg_fcp_imax); return 0; @@ -10252,6 +10351,9 @@ lpfc_get_sli4_parameters(struct lpfc_hba *phba, LPFC_MBOXQ_t *mboxq) if (bf_get(cfg_xib, mbx_sli4_parameters) && phba->cfg_suppress_rsp) phba->sli.sli_flag |= LPFC_SLI_SUPPRESS_RSP; + if (bf_get(cfg_eqdr, mbx_sli4_parameters)) + phba->sli.sli_flag |= LPFC_SLI_USE_EQDR; + /* Make sure that sge_supp_len can be handled by the driver */ if (sli4_params->sge_supp_len > LPFC_MAX_SGE_SIZE) sli4_params->sge_supp_len = LPFC_MAX_SGE_SIZE; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index f60c9e3e37d7..040575adf9c6 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -13478,6 +13478,7 @@ lpfc_sli4_hba_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe, /* Track the max number of CQEs processed in 1 EQ */ if (ecount > cq->CQ_max_cqe) cq->CQ_max_cqe = ecount; + cq->assoc_qp->EQ_cqe_cnt += ecount; /* Catch the no cq entry condition */ if (unlikely(ecount == 0)) @@ -13569,6 +13570,7 @@ lpfc_sli4_fof_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe) /* Track the max number of CQEs processed in 1 EQ */ if (ecount > cq->CQ_max_cqe) cq->CQ_max_cqe = ecount; + 
cq->assoc_qp->EQ_cqe_cnt += ecount; /* Catch the no cq entry condition */ if (unlikely(ecount == 0)) @@ -13629,7 +13631,6 @@ lpfc_sli4_fof_intr_handler(int irq, void *dev_id) /* Check device state for handling interrupt */ if (unlikely(lpfc_intr_state_check(phba))) { - eq->EQ_badstate++; /* Check again for link_state with lock held */ spin_lock_irqsave(&phba->hbalock, iflag); if (phba->link_state < LPFC_LINK_DOWN) @@ -13741,7 +13742,6 @@ lpfc_sli4_hba_intr_handler(int irq, void *dev_id) /* Check device state for handling interrupt */ if (unlikely(lpfc_intr_state_check(phba))) { - fpeq->EQ_badstate++; /* Check again for link_state with lock held */ spin_lock_irqsave(&phba->hbalock, iflag); if (phba->link_state < LPFC_LINK_DOWN) @@ -14000,14 +14000,15 @@ lpfc_dual_chute_pci_bar_map(struct lpfc_hba *phba, uint16_t pci_barset) * fails this function will return -ENXIO. **/ int -lpfc_modify_hba_eq_delay(struct lpfc_hba *phba, uint32_t startq) +lpfc_modify_hba_eq_delay(struct lpfc_hba *phba, uint32_t startq, + uint32_t numq, uint32_t imax) { struct lpfc_mbx_modify_eq_delay *eq_delay; LPFC_MBOXQ_t *mbox; struct lpfc_queue *eq; int cnt, rc, length, status = 0; uint32_t shdr_status, shdr_add_status; - uint32_t result; + uint32_t result, val; int qidx; union lpfc_sli4_cfg_shdr *shdr; uint16_t dmult; @@ -14026,22 +14027,45 @@ lpfc_modify_hba_eq_delay(struct lpfc_hba *phba, uint32_t startq) eq_delay = &mbox->u.mqe.un.eq_delay; /* Calculate delay multiper from maximum interrupt per second */ - result = phba->cfg_fcp_imax / phba->io_channel_irqs; + result = imax / phba->io_channel_irqs; if (result > LPFC_DMULT_CONST || result == 0) dmult = 0; else dmult = LPFC_DMULT_CONST/result - 1; + if (dmult > LPFC_DMULT_MAX) + dmult = LPFC_DMULT_MAX; cnt = 0; for (qidx = startq; qidx < phba->io_channel_irqs; qidx++) { eq = phba->sli4_hba.hba_eq[qidx]; if (!eq) continue; + eq->q_mode = imax; eq_delay->u.request.eq[cnt].eq_id = eq->queue_id; eq_delay->u.request.eq[cnt].phase = 0; 
eq_delay->u.request.eq[cnt].delay_multi = dmult; cnt++; - if (cnt >= LPFC_MAX_EQ_DELAY_EQID_CNT) + + /* q_mode is only used for auto_imax */ + if (phba->sli.sli_flag & LPFC_SLI_USE_EQDR) { + /* Use EQ Delay Register method for q_mode */ + + /* Convert for EQ Delay register */ + val = phba->cfg_fcp_imax; + if (val) { + /* First, interrupts per sec per EQ */ + val = phba->cfg_fcp_imax / + phba->io_channel_irqs; + + /* us delay between each interrupt */ + val = LPFC_SEC_TO_USEC / val; + } + eq->q_mode = val; + } else { + eq->q_mode = imax; + } + + if (cnt >= numq) break; } eq_delay->u.request.num_eq = cnt; diff --git a/drivers/scsi/lpfc/lpfc_sli.h b/drivers/scsi/lpfc/lpfc_sli.h index 9085306ddd78..a3b1b5145d2b 100644 --- a/drivers/scsi/lpfc/lpfc_sli.h +++ b/drivers/scsi/lpfc/lpfc_sli.h @@ -321,6 +321,7 @@ struct lpfc_sli { #define LPFC_MENLO_MAINT 0x1000 /* need for menl fw download */ #define LPFC_SLI_ASYNC_MBX_BLK 0x2000 /* Async mailbox is blocked */ #define LPFC_SLI_SUPPRESS_RSP 0x4000 /* Suppress RSP feature is supported */ +#define LPFC_SLI_USE_EQDR 0x8000 /* EQ Delay Register is supported */ struct lpfc_sli_ring *sli3_ring; diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 28b75e08e044..830dc83b9c21 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -168,7 +168,7 @@ struct lpfc_queue { struct lpfc_sli_ring *pring; /* ptr to io ring associated with q */ struct lpfc_rqb *rqbp; /* ptr to RQ buffers */ - uint16_t sgl_list_cnt; + uint32_t q_mode; uint16_t db_format; #define LPFC_DB_RING_FORMAT 0x01 #define LPFC_DB_LIST_FORMAT 0x02 @@ -181,7 +181,7 @@ struct lpfc_queue { /* defines for EQ stats */ #define EQ_max_eqe q_cnt_1 #define EQ_no_entry q_cnt_2 -#define EQ_badstate q_cnt_3 +#define EQ_cqe_cnt q_cnt_3 #define EQ_processed q_cnt_4 /* defines for CQ stats */ @@ -523,6 +523,7 @@ struct lpfc_sli4_hba { #define SLIPORT_ERR2_REG_FAILURE_CQ 0x4 #define SLIPORT_ERR2_REG_FAILURE_BUS 0x5 #define 
SLIPORT_ERR2_REG_FAILURE_RQ 0x6 + void __iomem *EQDregaddr; } if_type2; } u; @@ -755,7 +756,8 @@ struct lpfc_queue *lpfc_sli4_queue_alloc(struct lpfc_hba *, uint32_t, uint32_t); void lpfc_sli4_queue_free(struct lpfc_queue *); int lpfc_eq_create(struct lpfc_hba *, struct lpfc_queue *, uint32_t); -int lpfc_modify_hba_eq_delay(struct lpfc_hba *phba, uint32_t startq); +int lpfc_modify_hba_eq_delay(struct lpfc_hba *phba, uint32_t startq, + uint32_t numq, uint32_t imax); int lpfc_cq_create(struct lpfc_hba *, struct lpfc_queue *, struct lpfc_queue *, uint32_t, uint32_t); int lpfc_cq_create_set(struct lpfc_hba *phba, struct lpfc_queue **cqp, -- 2.11.0