Author: hselasky
Date: Fri Mar 30 19:45:48 2018
New Revision: 331822
URL: https://svnweb.freebsd.org/changeset/base/331822

Log:
  Reorganize health recovery in mlx5core.
  
  - Move the semaphore locking and unlocking to the same function.
  - The flags are no longer needed now that the reset and crdump are done
    in the same function.
  
  MFC after:    3 days
  Submitted by: slavash@
  Sponsored by: Mellanox Technologies

Modified:
  head/sys/dev/mlx5/mlx5_core/mlx5_health.c

Modified: head/sys/dev/mlx5/mlx5_core/mlx5_health.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_core/mlx5_health.c   Fri Mar 30 19:43:15 2018        (r331821)
+++ head/sys/dev/mlx5/mlx5_core/mlx5_health.c   Fri Mar 30 19:45:48 2018        (r331822)
@@ -48,8 +48,6 @@ enum {
 enum {
        MLX5_DROP_NEW_HEALTH_WORK,
        MLX5_DROP_NEW_RECOVERY_WORK,
-       MLX5_SKIP_SW_RESET,
-       MLX5_SW_RESET_SEM_LOCKED,
 };
 
 enum  {
@@ -167,7 +165,6 @@ static void reset_fw_if_needed(struct mlx5_core_dev *d
 {
        bool supported = (ioread32be(&dev->iseg->initializing) >>
                          MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
-       struct mlx5_core_health *health = &dev->priv.health;
        u32 cmdq_addr, fatal_error;
 
        if (!supported)
@@ -181,8 +178,7 @@ static void reset_fw_if_needed(struct mlx5_core_dev *d
        fatal_error = check_fatal_sensors(dev);
        if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
            fatal_error == MLX5_SENSOR_NIC_DISABLED ||
-           fatal_error == MLX5_SENSOR_NIC_SW_RESET ||
-           test_bit(MLX5_SKIP_SW_RESET, &health->flags)) {
+           fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
                mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help.\n");
                return;
        }
@@ -197,26 +193,65 @@ static void reset_fw_if_needed(struct mlx5_core_dev *d
                    &dev->iseg->cmdq_addr_l_sz);
 }
 
+#define MLX5_CRDUMP_WAIT_MS    60000
+#define MLX5_FW_RESET_WAIT_MS  1000
+#define MLX5_NIC_STATE_POLL_MS 5
 void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
 {
+       unsigned long end, delay_ms = MLX5_CRDUMP_WAIT_MS;
+       u32 fatal_error;
+       int lock = -EBUSY;
+
        mutex_lock(&dev->intf_state_mutex);
        if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
                goto unlock;
                return;
        }
 
-       if (!force)
-               mlx5_core_err(dev, "internal state error detected\n");
-       if (check_fatal_sensors(dev) || force) {
-               reset_fw_if_needed(dev);
+       fatal_error = check_fatal_sensors(dev);
+
+       if (fatal_error || force) {
+               if (!force)
+                       mlx5_core_err(dev, "internal state error detected\n");
                dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
                mlx5_trigger_cmd_completions(dev);
        }
 
-       mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
-       if (!force)
-               mlx5_core_err(dev, "system error event triggered\n");
+       if (force)
+               goto err_state_done;
 
+       if (fatal_error == MLX5_SENSOR_FW_SYND_RFR) {
+               if (mlx5_core_is_pf(dev))
+                       lock = lock_sem_sw_reset(dev, LOCK);
+
+               if (lock != -EBUSY) {
+                       reset_fw_if_needed(dev);
+                       delay_ms = MLX5_FW_RESET_WAIT_MS;
+               }
+       }
+
+       /* Recover from SW reset */
+       end = jiffies + msecs_to_jiffies(delay_ms);
+       do {
+               if (sensor_nic_disabled(dev))
+                       break;
+
+               msleep(MLX5_NIC_STATE_POLL_MS);
+       } while (!time_after(jiffies, end));
+
+       if (!sensor_nic_disabled(dev)) {
+               dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n",
+                       get_nic_mode(dev), delay_ms);
+       }
+
+       /* Release FW semaphore if you are the lock owner */
+       if (!lock)
+               lock_sem_sw_reset(dev, UNLOCK);
+
+       mlx5_core_err(dev, "system error event triggered\n");
+
+err_state_done:
+       mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
 unlock:
        mutex_unlock(&dev->intf_state_mutex);
 }
@@ -265,7 +300,6 @@ static void health_recover(struct work_struct *work)
        if (sensor_pci_no_comm(dev)) {
                dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n");
                recover = false;
-               goto clear_sem;
        }
 
        nic_mode = get_nic_mode(dev);
@@ -281,14 +315,6 @@ static void health_recover(struct work_struct *work)
                recover = false;
        }
 
-clear_sem:
-       if (test_and_clear_bit(MLX5_SW_RESET_SEM_LOCKED, &health->flags)) {
-               mlx5_core_dbg(dev, "Unlocking FW reset semaphore\n");
-               lock_sem_sw_reset(dev, UNLOCK);
-       }
-
-       test_and_clear_bit(MLX5_SKIP_SW_RESET, &health->flags);
-
        if (recover) {
                dev_err(&dev->pdev->dev, "starting health recovery flow\n");
                mlx5_recover_device(dev);
@@ -312,28 +338,10 @@ static void health_care(struct work_struct *work)
        struct mlx5_core_dev *dev;
        struct mlx5_priv *priv;
        unsigned long flags;
-       int ret;
 
        health = container_of(work, struct mlx5_core_health, work);
        priv = container_of(health, struct mlx5_priv, health);
        dev = container_of(priv, struct mlx5_core_dev, priv);
-
-       if (mlx5_core_is_pf(dev)) {
-               ret = lock_sem_sw_reset(dev, LOCK);
-               if (!ret) {
-                       mlx5_core_warn(dev, "Locked FW reset semaphore\n");
-                       set_bit(MLX5_SW_RESET_SEM_LOCKED, &health->flags);
-               }
-               else if (ret == -EBUSY) {
-                       /* sw reset will be skipped only in case we detect the
-                        * semaphore was already taken. In case of an error
-                        * while taking the semaphore we prefer to issue a
-                        * reset since longer cr-dump time and multiple resets
-                        * are better than a stuck fw.
-                        */
-                       set_bit(MLX5_SKIP_SW_RESET, &health->flags);
-               }
-       }
 
        mlx5_core_warn(dev, "handling bad device here\n");
        mlx5_handle_bad_state(dev);
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to