On 09/03/2017 10:22, Chris Wilson wrote:
When we wedge the device, we override engine->submit_request with a nop
to ensure that all in-flight requests are marked in error. However, igt
would like to unwedge the device to test -EIO handling. This requires us
to flush those in-flight requests and restore the original
engine->submit_request.
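
Just to check I am reading the intent right: stripped of the driver details, the pattern being installed and then undone here looks like the stand-alone sketch below (made-up names, user-space analogue, not the actual i915 code):

/*
 * Wedging swaps engine->submit_request for a nop that completes requests
 * in error; unwedging flushes what is left and swaps the real one back.
 */
#include <stdio.h>

struct request {
	int error;
	int completed;
};

struct engine {
	void (*submit_request)(struct request *rq);
};

static void real_submit(struct request *rq)
{
	rq->completed = 1;		/* the hardware would normally do this */
}

static void nop_submit(struct request *rq)
{
	rq->error = -5;			/* -EIO */
	rq->completed = 1;		/* complete immediately, in error */
}

int main(void)
{
	struct engine e = { .submit_request = real_submit };
	struct request rq = { 0, 0 };

	e.submit_request = nop_submit;		/* set_wedged */
	e.submit_request(&rq);			/* in-flight request errors out */

	e.submit_request = real_submit;		/* unset_wedged restores submission */
	printf("error=%d completed=%d\n", rq.error, rq.completed);
	return 0;
}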

Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests")
Testcase: igt/gem_eio
Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
Cc: Mika Kuoppala <mika.kuopp...@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c         |  2 +-
 drivers/gpu/drm/i915/i915_drv.h         |  1 +
 drivers/gpu/drm/i915/i915_gem.c         | 41 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.c | 12 ++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
 5 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index b1e9027a4f80..576b03b0048c 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1825,7 +1825,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
                return;

        /* Clear any previous failed attempts at recovery. Time to try again. */
-       __clear_bit(I915_WEDGED, &error->flags);
+       i915_gem_unset_wedged(dev_priv);
        error->reset_count++;

        pr_notice("drm/i915: Resetting chip after gpu hang\n");
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3002996ddbed..c52aee5141ca 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3409,6 +3409,7 @@ int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
 void i915_gem_reset(struct drm_i915_private *dev_priv);
 void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
+void i915_gem_unset_wedged(struct drm_i915_private *dev_priv);

 void i915_gem_init_mmio(struct drm_i915_private *i915);
 int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index aca1eaddafb4..0725e7a591a5 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2999,6 +2999,47 @@ void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
        mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
 }

+void i915_gem_unset_wedged(struct drm_i915_private *dev_priv)
+{
+       struct i915_gem_timeline *tl;
+       int i;
+
+       lockdep_assert_held(&dev_priv->drm.struct_mutex);
+       if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
+               return;
+
+       /* Before unwedging, make sure that all pending operations
+        * are flushed and errored out. No more can be submitted until
+        * we reset the wedged bit.
+        */
+       list_for_each_entry(tl, &dev_priv->gt.timelines, link) {
+               for (i = 0; i < ARRAY_SIZE(tl->engine); i++) {
+                       struct drm_i915_gem_request *rq;
+
+                       rq = i915_gem_active_peek(&tl->engine[i].last_request,
+                                                 &dev_priv->drm.struct_mutex);
+                       if (!rq)
+                               continue;
+
+                       /* We can't use our normal waiter as we want to
+                        * avoid recursively trying to handle the current
+                        * reset.
+                        */
+                       dma_fence_default_wait(&rq->fence, false,
+                                              MAX_SCHEDULE_TIMEOUT);

Who will signal these, given the GPU is stuck and this happens before the GEM reset calls in i915_reset? Obviously I totally don't understand how this is supposed to work.. :)

+               }
+       }
+
+       /* Undo nop_submit_request */
+       if (i915.enable_execlists)
+               intel_execlists_enable_submission(dev_priv);
+       else
+               intel_ringbuffer_enable_submission(dev_priv);

No need for stop_machine() here, like the wedging path uses?
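
For reference, from memory the wedging side is roughly the shape below when it installs the nop (helper name made up here, not the exact i915 code), which is what prompted the question:

/* Runs via stop_machine(), so no CPU can be mid-submission during the swap. */
static int nop_submit_all(void *data)
{
	struct drm_i915_private *i915 = data;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		engine->submit_request = nop_submit_request;

	return 0;
}

	/* and the caller does: */
	stop_machine(nop_submit_all, i915, NULL);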

+
+       smp_mb__before_atomic();

What is this for?
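
My guess, and please correct me if I am off, is that it is the usual publish pattern, i.e. making the vfunc restores above visible before the wedged bit reads as clear. Roughly:

	/* writer side (this function): */
	engine->submit_request = i9xx_submit_request;	/* the restores above */
	smp_mb__before_atomic();	/* clear_bit() on its own implies no barrier */
	clear_bit(I915_WEDGED, &dev_priv->gpu_error.flags);

	/*
	 * A reader that tests I915_WEDGED and then calls submit_request would
	 * still need a matching barrier on its side for this to order anything.
	 */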

+       clear_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
+}
+
 static void
 i915_gem_retire_work_handler(struct work_struct *work)
 {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 4a864f8c9387..753586f6ddbe 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2224,3 +2224,15 @@ int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)

        return intel_init_ring_buffer(engine);
 }
+
+void intel_ringbuffer_enable_submission(struct drm_i915_private *i915)
+{
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
+
+       for_each_engine(engine, i915, id) {
+               engine->submit_request = i9xx_submit_request;
+               if (IS_GEN6(i915) && id == VCS)
+                       engine->submit_request = gen6_bsd_submit_request;
+       }

I wonder if it would be worth extracting the setting of this vfunc (and of ->schedule in the execlists case) into a helper so the logic is not duplicated (rough sketch below). Sounds a bit marginal at the moment, don't know.
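
Something along these lines, say (hypothetical name, with the execlists side doing the equivalent for its submit/schedule vfuncs):

/* Hypothetical shared helper for engine init and the unwedge path. */
static void intel_engine_set_default_submission(struct intel_engine_cs *engine)
{
	engine->submit_request = i9xx_submit_request;
	if (IS_GEN6(engine->i915) && engine->id == VCS)
		engine->submit_request = gen6_bsd_submit_request;
}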

+}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 0ef491df5b4e..5601c24b266a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -669,4 +669,6 @@ static inline u32 *gen8_emit_pipe_control(u32 *batch, u32 flags, u32 offset)
 bool intel_engine_is_idle(struct intel_engine_cs *engine);
 bool intel_engines_are_idle(struct drm_i915_private *dev_priv);

+void intel_ringbuffer_enable_submission(struct drm_i915_private *i915);
+
 #endif /* _INTEL_RINGBUFFER_H_ */


Regards,

Tvrtko