tdr: Modify error handler for per engine hang recovery

Arun Siluvery Tue, 12 Apr 2016 10:00:23 -0700

This is a preparatory patch which modifies error handler to do per engine
hang recovery. The actual patch which implements this sequence follows
later in the series. The aim is to prepare existing recovery function to
adapt to this new function where applicable (which fails at this point
because core implementation is lacking) and continue recovery using legacy
full gpu reset.


A helper function is also added to query whether engine reset is available
on a particular engine because this can be controlled on an engine basis to
help with a hypothetical situtation of a bad engine e.g., engine reset
works fine for render but fails on blitter. Engine reset is chosen only
when all engines support this otherwise we fallback to full gpu reset.

i915.reset is already prepared for this purpose. It now accepts a mask
formed of exec_id indicating whether that engine supports engine reset or
not, default is full gpu reset for now.

i915.reset == 1, full gpu reset
i915.reset == mask of exec_id, eg: (1 << I915_EXEC_RENDER) | ...;

Engine reset and full gpu reset sections are also extracted to separate
functions, it helps readability and branching between reset type simpler.

The error events behaviour that are used to notify user of reset are
adapted to engine reset such that it doesn't break users listening to these
events. In legacy we report an error event, a reset event before resetting
the gpu and a reset done event marking the completion of reset. The same
behaviour is adapted but reset event is only dispatched once even when
multiple engines are hung. Finally once reset is complete we send reset
done event as usual.

Cc: Chris Wilson <ch...@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com>
Signed-off-by: Ian Lister <ian.lis...@intel.com>
Signed-off-by: Tomas Elf <tomas....@intel.com>
Signed-off-by: Arun Siluvery <arun.siluv...@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c     |  23 ++++
 drivers/gpu/drm/i915/i915_drv.h     |   2 +
 drivers/gpu/drm/i915/i915_irq.c     | 233 ++++++++++++++++++++++++++----------
 drivers/gpu/drm/i915/intel_uncore.c |  12 ++
 4 files changed, 206 insertions(+), 64 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 29b4e79..1933f87 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -955,6 +955,29 @@ int i915_reset(struct drm_device *dev)
        return 0;
 }
 
+/**
+ * i915_reset_engine - reset GPU engine to recover from a hang
+ * @engine: engine to reset
+ *
+ * Reset a specific GPU engine. Useful if a hang is detected.
+ * Returns zero on successful reset or otherwise an error code.
+ *
+ * Procedure is fairly simple:
+ *  - force engine to idle
+ *  - save current state which includes head and current request
+ *  - reset engine
+ *  - restore saved state and resubmit context
+ */
+int i915_reset_engine(struct intel_engine_cs *engine)
+{
+       int ret;
+
+       /* FIXME: replace me with engine reset sequence */
+       ret = -ENODEV;
+
+       return ret;
+}
+
 static int i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id 
*ent)
 {
        struct intel_device_info *intel_info =
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 674d2ab..bb34107 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2755,7 +2755,9 @@ extern long i915_compat_ioctl(struct file *filp, unsigned 
int cmd,
 #endif
 extern int intel_gpu_reset(struct drm_device *dev, u32 engine_mask);
 extern bool intel_has_gpu_reset(struct drm_device *dev);
+extern bool intel_has_engine_reset_support(struct intel_engine_cs *engine);
 extern int i915_reset(struct drm_device *dev);
+extern int i915_reset_engine(struct intel_engine_cs *engine);
 extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6f6a2a2..468058d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2473,6 +2473,89 @@ static void i915_error_wake_up(struct drm_i915_private 
*dev_priv,
                wake_up_all(&dev_priv->gpu_error.reset_queue);
 }
 
+static int i915_reset_engines(struct drm_i915_private *dev_priv)
+{
+       struct intel_engine_cs *engine;
+
+       for_each_engine(engine, dev_priv) {
+               int ret;
+               struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+               if (!i915_engine_reset_in_progress(error, engine->id))
+                       continue;
+
+               ret = i915_reset_engine(engine);
+               if (ret) {
+                       struct intel_engine_cs *e;
+
+                       DRM_ERROR("Reset of %s failed! ret=%d",
+                                 engine->name, ret);
+
+                       /*
+                        * when engine reset fails we switch to full gpu reset
+                        * which clears everything; In the case where multiple
+                        * engines are hung we would've already scheduled work
+                        * items and when they attempt to do engine reset they
+                        * won't find any active request (full gpu reset
+                        * would've cleared it). To make the work items exit
+                        * safely, clear engine reset pending mask.
+                        */
+                       for_each_engine(e, dev_priv) {
+                               if (i915_engine_reset_in_progress(error, e->id))
+                                       
atomic_and(~I915_ENGINE_RESET_IN_PROGRESS,
+                                                  
&error->engine_reset_counter[e->id]);
+                       }
+
+                       return ret;
+               }
+
+               atomic_inc(&error->engine_reset_counter[engine->id]);
+       }
+
+       return 0;
+}
+
+static int i915_reset_full(struct drm_i915_private *dev_priv)
+{
+       struct i915_gpu_error *error = &dev_priv->gpu_error;
+       struct drm_device *dev = dev_priv->dev;
+       int ret;
+
+       /* ensure device is awake */
+       assert_rpm_wakelock_held(dev_priv);
+
+       intel_prepare_reset(dev);
+
+       /*
+        * All state reset _must_ be completed before we update the
+        * reset counter, for otherwise waiters might miss the reset
+        * pending state and not properly drop locks, resulting in
+        * deadlocks with the reset work.
+        */
+       ret = i915_reset(dev);
+
+       intel_finish_reset(dev);
+
+       if (ret == 0) {
+               /*
+                * After all the gem state is reset, increment the reset
+                * counter and wake up everyone waiting for the reset to
+                * complete.
+                *
+                * Since unlock operations are a one-sided barrier only,
+                * we need to insert a barrier here to order any seqno
+                * updates before
+                * the counter increment.
+                */
+               smp_mb__before_atomic();
+               atomic_inc(&error->reset_counter);
+       } else {
+               atomic_or(I915_WEDGED, &error->reset_counter);
+       }
+
+       return ret;
+}
+
 /**
  * i915_error_work_func - do process context error handling work
  * @work: work item containing error struct, passed by the error handler
@@ -2495,6 +2578,39 @@ static void i915_error_work_func(struct work_struct 
*work)
        kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
 
        /*
+        * This event needs to be sent before performing gpu reset. When
+        * engine resets are supported we iterate through all engines and
+        * reset hung engines individually. To keep the event dispatch
+        * mechanism consistent with full gpu reset, this is only sent once
+        * even when multiple engines are hung. It is also safe to move this
+        * here because when we are in this function, we will definitely
+        * perform gpu reset.
+        */
+       kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, reset_event);
+
+       /*
+        * In most cases it's guaranteed that we get here with an RPM
+        * reference held, for example because there is a pending GPU
+        * request that won't finish until the reset is done. This
+        * isn't the case at least when we get here by doing a
+        * simulated reset via debugs, so get an RPM reference.
+        */
+       intel_runtime_pm_get(dev_priv);
+
+       mutex_lock(&dev->struct_mutex);
+
+       if (!i915_reset_in_progress(error)) {
+               ret = i915_reset_engines(dev_priv);
+               if (ret) {
+                       /* attempt full gpu reset to recover */
+                       atomic_or(I915_RESET_IN_PROGRESS_FLAG,
+                                 &dev_priv->gpu_error.reset_counter);
+               }
+       }
+
+       mutex_unlock(&dev->struct_mutex);
+
+       /*
         * Note that there's only one work item which does gpu resets, so we
         * need not worry about concurrent gpu resets potentially incrementing
         * error->reset_counter twice. We only need to take care of another
@@ -2506,58 +2622,21 @@ static void i915_error_work_func(struct work_struct 
*work)
         */
        if (i915_reset_in_progress(error) && !i915_terminally_wedged(error)) {
                DRM_DEBUG_DRIVER("resetting chip\n");
-               kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
-                                  reset_event);
-
-               /*
-                * In most cases it's guaranteed that we get here with an RPM
-                * reference held, for example because there is a pending GPU
-                * request that won't finish until the reset is done. This
-                * isn't the case at least when we get here by doing a
-                * simulated reset via debugs, so get an RPM reference.
-                */
-               intel_runtime_pm_get(dev_priv);
-
-               intel_prepare_reset(dev);
-
-               /*
-                * All state reset _must_ be completed before we update the
-                * reset counter, for otherwise waiters might miss the reset
-                * pending state and not properly drop locks, resulting in
-                * deadlocks with the reset work.
-                */
-               ret = i915_reset(dev);
 
-               intel_finish_reset(dev);
-
-               intel_runtime_pm_put(dev_priv);
-
-               if (ret == 0) {
-                       /*
-                        * After all the gem state is reset, increment the reset
-                        * counter and wake up everyone waiting for the reset to
-                        * complete.
-                        *
-                        * Since unlock operations are a one-sided barrier only,
-                        * we need to insert a barrier here to order any seqno
-                        * updates before
-                        * the counter increment.
-                        */
-                       smp_mb__before_atomic();
-                       atomic_inc(&dev_priv->gpu_error.reset_counter);
-
-                       kobject_uevent_env(&dev->primary->kdev->kobj,
-                                          KOBJ_CHANGE, reset_done_event);
-               } else {
-                       atomic_or(I915_WEDGED, &error->reset_counter);
-               }
+               ret = i915_reset_full(dev_priv);
+       }
 
-               /*
-                * Note: The wake_up also serves as a memory barrier so that
-                * waiters see the update value of the reset counter atomic_t.
-                */
+       /*
+        * Note: The wake_up also serves as a memory barrier so that
+        * waiters see the update value of the reset counter atomic_t.
+        */
+       if (!i915_terminally_wedged(error)) {
                i915_error_wake_up(dev_priv, true);
+               kobject_uevent_env(&dev->primary->kdev->kobj,
+                                  KOBJ_CHANGE, reset_done_event);
        }
+
+       intel_runtime_pm_put(dev_priv);
 }
 
 static void i915_report_and_clear_eir(struct drm_device *dev)
@@ -2656,6 +2735,8 @@ static void i915_report_and_clear_eir(struct drm_device 
*dev)
  * i915_handle_error - handle a gpu error
  * @dev: drm device
  * @engine_mask: mask representing engines that are hung
+ * @fmt: formatted hang msg that gets logged in captured error state
+ *
  * Do some basic checking of register state at error time and
  * dump it to the syslog.  Also call i915_capture_error_state() to make
  * sure we get a record and make it available in debugfs.  Fire a uevent
@@ -2668,6 +2749,8 @@ void i915_handle_error(struct drm_device *dev, u32 
engine_mask,
        struct drm_i915_private *dev_priv = dev->dev_private;
        va_list args;
        char error_msg[80];
+       bool use_engine_reset;
+       struct intel_engine_cs *engine;
 
        va_start(args, fmt);
        vscnprintf(error_msg, sizeof(error_msg), fmt, args);
@@ -2676,27 +2759,49 @@ void i915_handle_error(struct drm_device *dev, u32 
engine_mask,
        i915_capture_error_state(dev, engine_mask, error_msg);
        i915_report_and_clear_eir(dev);
 
+       use_engine_reset = false;
        if (engine_mask) {
-               atomic_or(I915_RESET_IN_PROGRESS_FLAG,
-                               &dev_priv->gpu_error.reset_counter);
+               u32 engines_allowed = 0;
 
-               /*
-                * Wakeup waiting processes so that the reset function
-                * i915_reset_and_wakeup doesn't deadlock trying to grab
-                * various locks. By bumping the reset counter first, the woken
-                * processes will see a reset in progress and back off,
-                * releasing their locks and then wait for the reset completion.
-                * We must do this for _all_ gpu waiters that might hold locks
-                * that the reset work needs to acquire.
-                *
-                * Note: The wake_up serves as the required memory barrier to
-                * ensure that the waiters see the updated value of the reset
-                * counter atomic_t.
-                */
-               i915_error_wake_up(dev_priv, false);
+               for_each_engine_masked(engine, dev_priv, engine_mask) {
+                       if (intel_has_engine_reset_support(engine))
+                               engines_allowed |= intel_engine_flag(engine);
+               }
+
+               use_engine_reset = (engines_allowed == engine_mask);
+       }
+
+       if (use_engine_reset) {
+               struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+               for_each_engine_masked(engine, dev_priv, engine_mask) {
+                       if (i915_engine_reset_in_progress(error, engine->id))
+                               continue;
+
+                       atomic_or(I915_ENGINE_RESET_IN_PROGRESS,
+                                 &error->engine_reset_counter[engine->id]);
+               }
+       } else {
+               atomic_or(I915_RESET_IN_PROGRESS_FLAG,
+                         &dev_priv->gpu_error.reset_counter);
        }
 
        /*
+        * Wakeup waiting processes so that the reset function
+        * i915_reset_and_wakeup doesn't deadlock trying to grab
+        * various locks. By bumping the reset counter first, the woken
+        * processes will see a reset in progress and back off,
+        * releasing their locks and then wait for the reset completion.
+        * We must do this for _all_ gpu waiters that might hold locks
+        * that the reset work needs to acquire.
+        *
+        * Note: The wake_up serves as the required memory barrier to
+        * ensure that the waiters see the updated value of the reset
+        * counter atomic_t.
+        */
+       i915_error_wake_up(dev_priv, false);
+
+       /*
         * Our reset work can grab modeset locks (since it needs to reset the
         * state of outstanding pageflips). Hence it must not be run on our own
         * dev-priv->wq work queue for otherwise the flush_work in the pageflip
diff --git a/drivers/gpu/drm/i915/intel_uncore.c 
b/drivers/gpu/drm/i915/intel_uncore.c
index ac2ac07..9ce2470 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1673,6 +1673,18 @@ bool intel_has_gpu_reset(struct drm_device *dev)
        return intel_get_gpu_reset(dev) != NULL;
 }
 
+bool intel_has_engine_reset_support(struct intel_engine_cs *engine)
+{
+       u32 reset_param;
+       struct drm_device *dev = engine->dev;
+
+       reset_param = i915.reset & ((1 << (I915_NUM_ENGINES + 1)) - 1);
+
+       return (INTEL_INFO(dev)->gen >=8 &&
+               !(reset_param & 0x01) &&
+               (reset_param & (1 << engine->exec_id)));
+}
+
 int intel_guc_reset(struct drm_i915_private *dev_priv)
 {
        int ret;
-- 
1.9.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

[Intel-gfx] [PATCH 04/14] drm/i915/tdr: Modify error handler for per engine hang recovery

Reply via email to