In some cases, such as while performing extensive expander resets
or phy resets, the user may observe that drives are not visible in
the OS and that the driver's firmware worker thread is reported as
blocked for more than 120 seconds (with a call trace) in the scenario below:

1. The driver received a target add event for Device A and hence
registered this device with the SML (SCSI mid-layer) by calling
sas_rphy_add(). The SML half-added this device, returned control to
the driver by returning from the sas_rphy_add() API, and started some
background scanning on Device A.

2. While background scanning is in progress on Device A, the driver
receives a SAS DEVICE STATUS CHANGE EVENT with RC code
"Internal device reset" and hence sets the tm_busy flag for
Device A from FW worker thread context. When the tm_busy flag is
set, the driver returns SCSI commands with device busy status,
asking the kernel to retry the same command after some time.
So background scanning for Device A will wait for this tm_busy
flag to be cleared.

3. Meanwhile, the driver receives a target add event for Device B
and hence calls the sas_rphy_add() API to register this device
with the SML. But since background scanning for Device A is still
pending, the SML does not return from this sas_rphy_add() call,
and hence the driver's firmware worker thread gets blocked.

4. Now the driver receives a SAS DEVICE STATUS CHANGE EVENT with RC code
"Internal device reset complete". But as the driver's firmware worker
thread is blocked in step 3, it cannot process this event, so the
tm_busy flag is never cleared and a deadlock occurs
(the SML is waiting for the tm_busy flag to be cleared, while our FW worker
thread is waiting for the SML to return from the sas_rphy_add() API).

The same deadlock will be observed even if Device B is being removed in
step 3. So, to limit these types of deadlocks, the driver will process
SAS DEVICE STATUS CHANGE EVENTs from ISR context instead of
processing them from worker thread context.
This improvement avoids the above deadlock.

Signed-off-by: Suganath Prabu <suganath-prabu.subram...@broadcom.com>
---
 drivers/scsi/mpt3sas/mpt3sas_scsih.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c 
b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index 24b5f5f..09b3d3f 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -6469,24 +6469,17 @@ _scsih_sas_device_status_change_event_debug(struct 
MPT3SAS_ADAPTER *ioc,
 /**
  * _scsih_sas_device_status_change_event - handle device status change
  * @ioc: per adapter object
- * @fw_event: The fw_event_work object
+ * @event_data: The fw event
  * Context: user.
  */
 static void
 _scsih_sas_device_status_change_event(struct MPT3SAS_ADAPTER *ioc,
-       struct fw_event_work *fw_event)
+       Mpi2EventDataSasDeviceStatusChange_t *event_data)
 {
        struct MPT3SAS_TARGET *target_priv_data;
        struct _sas_device *sas_device;
        u64 sas_address;
        unsigned long flags;
-       Mpi2EventDataSasDeviceStatusChange_t *event_data =
-               (Mpi2EventDataSasDeviceStatusChange_t *)
-               fw_event->event_data;
-
-       if (ioc->logging_level & MPT_DEBUG_EVENT_WORK_TASK)
-               _scsih_sas_device_status_change_event_debug(ioc,
-                    event_data);
 
        /* In MPI Revision K (0xC), the internal device reset complete was
         * implemented, so avoid setting tm_busy flag for older firmware.
@@ -6518,6 +6511,12 @@ _scsih_sas_device_status_change_event(struct 
MPT3SAS_ADAPTER *ioc,
        else
                target_priv_data->tm_busy = 0;
 
+       if (ioc->logging_level & MPT_DEBUG_EVENT_WORK_TASK)
+               ioc_info(ioc,
+                   "%s tm_busy flag for handle(0x%04x)\n",
+                   (target_priv_data->tm_busy == 1) ? "Enable" : "Disable",
+                   target_priv_data->handle);
+
 out:
        if (sas_device)
                sas_device_put(sas_device);
@@ -9346,7 +9345,10 @@ _mpt3sas_fw_work(struct MPT3SAS_ADAPTER *ioc, struct 
fw_event_work *fw_event)
                _scsih_sas_topology_change_event(ioc, fw_event);
                break;
        case MPI2_EVENT_SAS_DEVICE_STATUS_CHANGE:
-               _scsih_sas_device_status_change_event(ioc, fw_event);
+               if (ioc->logging_level & MPT_DEBUG_EVENT_WORK_TASK)
+                       _scsih_sas_device_status_change_event_debug(ioc,
+                           (Mpi2EventDataSasDeviceStatusChange_t *)
+                           fw_event->event_data);
                break;
        case MPI2_EVENT_SAS_DISCOVERY:
                _scsih_sas_discovery_event(ioc, fw_event);
@@ -9519,6 +9521,10 @@ mpt3sas_scsih_event_callback(struct MPT3SAS_ADAPTER 
*ioc, u8 msix_index,
                break;
        }
        case MPI2_EVENT_SAS_DEVICE_STATUS_CHANGE:
+               _scsih_sas_device_status_change_event(ioc,
+                   (Mpi2EventDataSasDeviceStatusChange_t *)
+                   mpi_reply->EventData);
+               break;
        case MPI2_EVENT_IR_OPERATION_STATUS:
        case MPI2_EVENT_SAS_DISCOVERY:
        case MPI2_EVENT_SAS_DEVICE_DISCOVERY_ERROR:
-- 
1.8.3.1

Reply via email to