Re: [PATCH v5 1/3] cxlflash: Base error recovery support

2015-08-13 Thread Michael Neuling
On Wed, 2015-08-12 at 18:51 -0500, Matthew R. Ochs wrote:
 Introduce support for enhanced I/O error handling.
 
 A device state is added to track 3 possible states of the device:
 
 Normal - the device is operating normally and is fully operational
 
 Limbo - the device is in a reset/recovery scenario and its operational
 status is paused
 
 Failed/terminating - the device has either failed to be reset/recovered
  or is being terminated (removed); it is no longer
  operational
 
 All operations are allowed when the device is operating normally. When the
 device transitions to limbo state, I/O must be paused. To help accomplish
 this, a wait queue is introduced where existing and new threads can wait
 until the device is no longer in limbo. When coming out of limbo, threads
 need to check the state and error out gracefully when encountering the
 failed state. When the device transitions to the failed/terminating state,
 normal operations are no longer allowed. Only specially designated
 operations related to graceful cleanup are permitted.
 
 Signed-off-by: Matthew R. Ochs <mro...@linux.vnet.ibm.com>
 Signed-off-by: Manoj N. Kumar <ma...@linux.vnet.ibm.com>
 Reviewed-by: Daniel Axtens <d...@axtens.net>

Thanks for integrating my suggestions.

Reviewed-by: Michael Neuling <mi...@neuling.org>

 ---
  drivers/scsi/cxlflash/Kconfig  |   2 +-
  drivers/scsi/cxlflash/common.h |  12 ++-
  drivers/scsi/cxlflash/main.c   | 174 ++---
  drivers/scsi/cxlflash/main.h   |   6 +-
  4 files changed, 177 insertions(+), 17 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 1/3] cxlflash: Base error recovery support

2015-08-12 Thread Matthew R. Ochs
Introduce support for enhanced I/O error handling.

A device state is added to track 3 possible states of the device:

Normal - the device is operating normally and is fully operational

Limbo - the device is in a reset/recovery scenario and its operational
status is paused

Failed/terminating - the device has either failed to be reset/recovered
 or is being terminated (removed); it is no longer
 operational

All operations are allowed when the device is operating normally. When the
device transitions to limbo state, I/O must be paused. To help accomplish
this, a wait queue is introduced where existing and new threads can wait
until the device is no longer in limbo. When coming out of limbo, threads
need to check the state and error out gracefully when encountering the
failed state. When the device transitions to the failed/terminating state,
normal operations are no longer allowed. Only specially designated
operations related to graceful cleanup are permitted.

Signed-off-by: Matthew R. Ochs <mro...@linux.vnet.ibm.com>
Signed-off-by: Manoj N. Kumar <ma...@linux.vnet.ibm.com>
Reviewed-by: Daniel Axtens <d...@axtens.net>
---
 drivers/scsi/cxlflash/Kconfig  |   2 +-
 drivers/scsi/cxlflash/common.h |  12 ++-
 drivers/scsi/cxlflash/main.c   | 174 ++---
 drivers/scsi/cxlflash/main.h   |   6 +-
 4 files changed, 177 insertions(+), 17 deletions(-)

diff --git a/drivers/scsi/cxlflash/Kconfig b/drivers/scsi/cxlflash/Kconfig
index c707508..c052104 100644
--- a/drivers/scsi/cxlflash/Kconfig
+++ b/drivers/scsi/cxlflash/Kconfig
@@ -4,7 +4,7 @@
 
 config CXLFLASH
tristate "Support for IBM CAPI Flash"
-   depends on PCI && SCSI && CXL
+   depends on PCI && SCSI && CXL && EEH
default m
help
  Allows CAPI Accelerated IO to Flash
diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h
index fe86bfe..ffdbc57 100644
--- a/drivers/scsi/cxlflash/common.h
+++ b/drivers/scsi/cxlflash/common.h
@@ -76,6 +76,12 @@ enum cxlflash_init_state {
INIT_STATE_SCSI
 };
 
+enum cxlflash_state {
+   STATE_NORMAL,   /* Normal running state, everything good */
+   STATE_LIMBO,/* Limbo running state, trying to reset/recover */
+   STATE_FAILTERM  /* Failed/terminating state, error out users/threads */
+};
+
 /*
  * Each context has its own set of resource handles that is visible
  * only from that context.
@@ -91,8 +97,6 @@ struct cxlflash_cfg {
 
ulong cxlflash_regs_pci;
 
-   wait_queue_head_t eeh_waitq;
-
struct work_struct work_q;
enum cxlflash_init_state init_state;
enum cxlflash_lr_state lr_state;
@@ -105,7 +109,8 @@ struct cxlflash_cfg {
 
wait_queue_head_t tmf_waitq;
bool tmf_active;
-   u8 err_recovery_active:1;
+   wait_queue_head_t limbo_waitq;
+   enum cxlflash_state state;
 };
 
 struct afu_cmd {
@@ -178,4 +183,3 @@ struct afu_cmd *cxlflash_cmd_checkout(struct afu *);
 void cxlflash_cmd_checkin(struct afu_cmd *);
 int cxlflash_afu_sync(struct afu *, ctx_hndl_t, res_hndl_t, u8);
 #endif /* ifndef _CXLFLASH_COMMON_H */
-
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index 76a7286..4df1ff6 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -353,6 +353,7 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, 
struct scsi_cmnd *scp)
struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)host-hostdata;
struct afu *afu = cfg-afu;
struct pci_dev *pdev = cfg-dev;
+   struct device *dev = cfg-dev-dev;
struct afu_cmd *cmd;
u32 port_sel = scp-device-channel + 1;
int nseg, i, ncount;
@@ -380,6 +381,21 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, 
struct scsi_cmnd *scp)
}
spin_unlock_irqrestore(cfg-tmf_waitq.lock, lock_flags);
 
+   switch (cfg-state) {
+   case STATE_LIMBO:
+   dev_dbg_ratelimited(dev, %s: device in limbo!\n, __func__);
+   rc = SCSI_MLQUEUE_HOST_BUSY;
+   goto out;
+   case STATE_FAILTERM:
+   dev_dbg_ratelimited(dev, %s: device has failed!\n, __func__);
+   scp-result = (DID_NO_CONNECT  16);
+   scp-scsi_done(scp);
+   rc = 0;
+   goto out;
+   default:
+   break;
+   }
+
cmd = cxlflash_cmd_checkout(afu);
if (unlikely(!cmd)) {
pr_err(%s: could not get a free command\n, __func__);
@@ -455,9 +471,21 @@ static int cxlflash_eh_device_reset_handler(struct 
scsi_cmnd *scp)
 get_unaligned_be32(((u32 *)scp-cmnd)[2]),
 get_unaligned_be32(((u32 *)scp-cmnd)[3]));
 
-   rcr = send_tmf(afu, scp, TMF_LUN_RESET);
-   if (unlikely(rcr))
+   switch (cfg-state) {
+   case STATE_NORMAL:
+   rcr = send_tmf(afu, scp, TMF_LUN_RESET);
+   if (unlikely(rcr))
+