Re: [PATCH 7/17] sym53c8xx: PCI Error Recovery support

2007-10-05 Thread Jeff Garzik

Matthew Wilcox wrote:

From: Linas Vepstas <[EMAIL PROTECTED]>

This patch adds the PCI error recovery callbacks to the Symbios SCSI device
driver.  It includes support for First Failure Data Capture.

Signed-off-by: Linas Vepstas <[EMAIL PROTECTED]>

Assorted changes to initial patches, including returning IRQ_NONE from the
interrupt handler if the device is offline and re-using the eh_done completion
in the scsi error handler.

Signed-off-by: Matthew Wilcox <[EMAIL PROTECTED]>
---
 drivers/scsi/sym53c8xx_2/sym_glue.c |  179 ++-
 drivers/scsi/sym53c8xx_2/sym_glue.h |    3 +
 drivers/scsi/sym53c8xx_2/sym_hipd.c |   25 -
 3 files changed, 200 insertions(+), 7 deletions(-)


ACK patches 4-7


-
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/17] sym53c8xx: PCI Error Recovery support

2007-10-05 Thread Matthew Wilcox
From: Linas Vepstas <[EMAIL PROTECTED]>

This patch adds the PCI error recovery callbacks to the Symbios SCSI device
driver.  It includes support for First Failure Data Capture.

Signed-off-by: Linas Vepstas <[EMAIL PROTECTED]>

Assorted changes to initial patches, including returning IRQ_NONE from the
interrupt handler if the device is offline and re-using the eh_done completion
in the scsi error handler.

Signed-off-by: Matthew Wilcox <[EMAIL PROTECTED]>
---
 drivers/scsi/sym53c8xx_2/sym_glue.c |  179 ++-
 drivers/scsi/sym53c8xx_2/sym_glue.h |    3 +
 drivers/scsi/sym53c8xx_2/sym_hipd.c |   25 -
 3 files changed, 200 insertions(+), 7 deletions(-)

diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c
index 6bc8789..fec9c9c 100644
--- a/drivers/scsi/sym53c8xx_2/sym_glue.c
+++ b/drivers/scsi/sym53c8xx_2/sym_glue.c
@@ -134,7 +134,7 @@ static struct scsi_transport_template *sym2_transport_template = NULL;
  *  Driver private area in the SCSI command structure.
  */
 struct sym_ucmd {  /* Override the SCSI pointer structure */
-   struct completion *eh_done; /* For error handling */
+   struct completion *eh_done; /* SCSI error handling */
 };
 
 #define SYM_UCMD_PTR(cmd)  ((struct sym_ucmd *)(&(cmd)->SCp))
@@ -556,6 +556,10 @@ static irqreturn_t sym53c8xx_intr(int irq, void *dev_id)
 {
struct sym_hcb *np = dev_id;
 
+   /* Avoid spinloop trying to handle interrupts on frozen device */
+   if (pci_channel_offline(np->s.device))
+   return IRQ_NONE;
+
if (DEBUG_FLAGS & DEBUG_TINY) printf_debug ("[");
 
spin_lock(np->s.host->host_lock);
@@ -598,6 +602,7 @@ static int sym_eh_handler(int op, char *opname, struct scsi_cmnd *cmd)
struct sym_hcb *np = SYM_SOFTC_PTR(cmd);
struct sym_ucmd *ucmd = SYM_UCMD_PTR(cmd);
struct Scsi_Host *host = cmd->device->host;
+   struct pci_dev *pdev = np->s.device;
SYM_QUEHEAD *qp;
int cmd_queued = 0;
int sts = -1;
@@ -605,6 +610,38 @@ static int sym_eh_handler(int op, char *opname, struct scsi_cmnd *cmd)
 
dev_warn(&cmd->device->sdev_gendev, "%s operation started.\n", opname);
 
+   /* We may be in an error condition because the PCI bus
+* went down. In this case, we need to wait until the
+* PCI bus is reset, the card is reset, and only then
+* proceed with the scsi error recovery.  There's no
+* point in hurrying; take a leisurely wait.
+*/
+#define WAIT_FOR_PCI_RECOVERY  35
+   if (pci_channel_offline(pdev)) {
+   struct host_data *hostdata = shost_priv(host);
+   struct completion *io_reset;
+   int finished_reset = 0;
+   init_completion(&eh_done);
+   spin_lock_irq(host->host_lock);
+   /* Make sure we didn't race */
+   if (pci_channel_offline(pdev)) {
+   if (!hostdata->io_reset)
+   hostdata->io_reset = &eh_done;
+   io_reset = hostdata->io_reset;
+   } else {
+   io_reset = NULL;
+   }
+
+   if (!pci_channel_offline(pdev))
+   finished_reset = 1;
+   spin_unlock_irq(host->host_lock);
+   if (!finished_reset)
+   finished_reset = wait_for_completion_timeout(io_reset,
+   WAIT_FOR_PCI_RECOVERY*HZ);
+   if (!finished_reset)
+   return SCSI_FAILED;
+   }
+
spin_lock_irq(host->host_lock);
/* This one is queued in some place -> to wait for completion */
FOR_EACH_QUEUED_ELEMENT(&np->busy_ccbq, qp) {
@@ -630,7 +667,7 @@ static int sym_eh_handler(int op, char *opname, struct scsi_cmnd *cmd)
break;
case SYM_EH_HOST_RESET:
sym_reset_scsi_bus(np, 0);
-   sym_start_up (np, 1);
+   sym_start_up(np, 1);
sts = 0;
break;
default:
@@ -1435,7 +1472,7 @@ static struct Scsi_Host * __devinit sym_attach(struct scsi_host_template *tpnt,
/*
 *  Start the SCRIPTS.
 */
-   sym_start_up (np, 1);
+   sym_start_up(np, 1);
 
/*
 *  Start the timer daemon
@@ -1822,6 +1859,134 @@ static void __devexit sym2_remove(struct pci_dev *pdev)
attach_count--;
 }
 
+/**
+ * sym2_io_error_detected() - called when PCI error is detected
+ * @pdev: pointer to PCI device
+ * @state: current state of the PCI slot
+ */
+static pci_ers_result_t sym2_io_error_detected(struct pci_dev *pdev,
+ enum pci_channel_state state)
+{
+   /* If slot is permanently frozen, turn everything off */
+   if (state == pci_channel_io_perm_failure) {
+   sym2_remove(pdev);
+   retur