Re: [Intel-gfx] [PATCH v7] drm/i915: Beware temporary wedging when determining -EIO
Chris Wilson writes: > At a few points in our uABI, we check to see if the driver is wedged and > report -EIO back to the user in that case. However, as we perform the > check and reset asynchronously, we may instead see the temporary wedging > used to cancel inflight rendering to avoid a deadlock during reset. If This could mention that it is the wedge on timeout which will flash the -EIO to userspace. > we suspect this is the case, that is we see a wedged driver and reset in > progress, then wait until the reset is resolved before reporting upon > the wedged status. > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580 > Signed-off-by: Chris Wilson > Cc: Mika Kuoppala > --- > drivers/gpu/drm/i915/i915_debugfs.c | 16 --- > drivers/gpu/drm/i915/i915_drv.h | 7 - > drivers/gpu/drm/i915/i915_gem.c | 22 +++ > drivers/gpu/drm/i915/i915_request.c | 5 ++-- > drivers/gpu/drm/i915/i915_reset.c | 27 +-- > drivers/gpu/drm/i915/i915_reset.h | 2 ++ > drivers/gpu/drm/i915/intel_engine_cs.c| 8 +++--- > drivers/gpu/drm/i915/intel_hangcheck.c| 2 +- > drivers/gpu/drm/i915/selftests/huge_pages.c | 2 +- > drivers/gpu/drm/i915/selftests/i915_active.c | 2 +- > .../drm/i915/selftests/i915_gem_coherency.c | 4 +-- > .../gpu/drm/i915/selftests/i915_gem_context.c | 2 +- > .../gpu/drm/i915/selftests/i915_gem_evict.c | 2 +- > .../gpu/drm/i915/selftests/i915_gem_object.c | 2 +- > drivers/gpu/drm/i915/selftests/i915_request.c | 2 +- > .../gpu/drm/i915/selftests/igt_flush_test.c | 2 +- > .../gpu/drm/i915/selftests/intel_hangcheck.c | 24 - > drivers/gpu/drm/i915/selftests/intel_lrc.c| 2 +- > .../drm/i915/selftests/intel_workarounds.c| 4 +-- > 19 files changed, 87 insertions(+), 50 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c > b/drivers/gpu/drm/i915/i915_debugfs.c > index 2aeea977283f..54928d693c85 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -3833,11 +3833,19 @@ static const struct file_operations > i915_cur_wm_latency_fops = { > static int > i915_wedged_get(void *data, u64 *val) > { > - struct drm_i915_private *dev_priv = data; > + int ret = i915_terminally_wedged(data); > > - *val = i915_terminally_wedged(&dev_priv->gpu_error); > + switch (ret) { > + case -EIO: > + *val = 1; > + ret = 0; > + break; > + case 0: > + *val = 0; > + break; > + } Hmm, joke is still there but it is better. > > - return 0; > + return ret; > } > > static int > @@ -3918,7 +3926,7 @@ i915_drop_caches_set(void *data, u64 val) > mutex_unlock(&i915->drm.struct_mutex); > } > > - if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error)) > + if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(i915)) > i915_handle_error(i915, ALL_ENGINES, 0, NULL); > > fs_reclaim_acquire(GFP_KERNEL); > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 5c8d0489a1cd..14c6f5e65b8d 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -3020,11 +3020,16 @@ int __must_check i915_gem_set_global_seqno(struct > drm_device *dev, u32 seqno); > struct i915_request * > i915_gem_find_active_request(struct intel_engine_cs *engine); > > -static inline bool i915_terminally_wedged(struct i915_gpu_error *error) > +static inline bool __i915_wedged(struct i915_gpu_error *error) > { > return unlikely(test_bit(I915_WEDGED, &error->flags)); > } > > +static inline bool i915_reset_failed(struct drm_i915_private *i915) > +{ > + return __i915_wedged(&i915->gpu_error); > +} > + > static inline u32 i915_reset_count(struct i915_gpu_error *error) > { > return READ_ONCE(error->reset_count); > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index b421bc7a2e26..b3d918d90c1f 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -1928,7 +1928,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf) >* fail). But any other -EIO isn't ours (e.g. swap in failure) >* and so needs to be reported. >*/ > - if (!i915_terminally_wedged(&dev_priv->gpu_error)) > + if (!i915_terminally_wedged(dev_priv)) > return VM_FAULT_SIGBUS; > /* else: fall through */ > case -EAGAIN: > @@ -2958,7 +2958,7 @@ static void assert_kernel_context_is_current(struct > drm_i915_private *i915) > struct intel_engine_cs *engine; > enum intel_engine_id id; > > - if (i915_terminally_wedged(&i915->gpu_error)) > + if (i915_reset_failed(i915)) > return; > > GEM_BUG_ON(i915->gt.ac
[Intel-gfx] [PATCH v7] drm/i915: Beware temporary wedging when determining -EIO
At a few points in our uABI, we check to see if the driver is wedged and report -EIO back to the user in that case. However, as we perform the check and reset asynchronously, we may instead see the temporary wedging used to cancel inflight rendering to avoid a deadlock during reset. If we suspect this is the case, that is we see a wedged driver and reset in progress, then wait until the reset is resolved before reporting upon the wedged status. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580 Signed-off-by: Chris Wilson Cc: Mika Kuoppala --- drivers/gpu/drm/i915/i915_debugfs.c | 16 --- drivers/gpu/drm/i915/i915_drv.h | 7 - drivers/gpu/drm/i915/i915_gem.c | 22 +++ drivers/gpu/drm/i915/i915_request.c | 5 ++-- drivers/gpu/drm/i915/i915_reset.c | 27 +-- drivers/gpu/drm/i915/i915_reset.h | 2 ++ drivers/gpu/drm/i915/intel_engine_cs.c| 8 +++--- drivers/gpu/drm/i915/intel_hangcheck.c| 2 +- drivers/gpu/drm/i915/selftests/huge_pages.c | 2 +- drivers/gpu/drm/i915/selftests/i915_active.c | 2 +- .../drm/i915/selftests/i915_gem_coherency.c | 4 +-- .../gpu/drm/i915/selftests/i915_gem_context.c | 2 +- .../gpu/drm/i915/selftests/i915_gem_evict.c | 2 +- .../gpu/drm/i915/selftests/i915_gem_object.c | 2 +- drivers/gpu/drm/i915/selftests/i915_request.c | 2 +- .../gpu/drm/i915/selftests/igt_flush_test.c | 2 +- .../gpu/drm/i915/selftests/intel_hangcheck.c | 24 - drivers/gpu/drm/i915/selftests/intel_lrc.c| 2 +- .../drm/i915/selftests/intel_workarounds.c| 4 +-- 19 files changed, 87 insertions(+), 50 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 2aeea977283f..54928d693c85 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3833,11 +3833,19 @@ static const struct file_operations i915_cur_wm_latency_fops = { static int i915_wedged_get(void *data, u64 *val) { - struct drm_i915_private *dev_priv = data; + int ret = i915_terminally_wedged(data); - *val = i915_terminally_wedged(&dev_priv->gpu_error); + switch (ret) { + case -EIO: + *val = 1; + ret = 0; + break; + case 0: + *val = 0; + break; + } - return 0; + return ret; } static int @@ -3918,7 +3926,7 @@ i915_drop_caches_set(void *data, u64 val) mutex_unlock(&i915->drm.struct_mutex); } - if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error)) + if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(i915)) i915_handle_error(i915, ALL_ENGINES, 0, NULL); fs_reclaim_acquire(GFP_KERNEL); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 5c8d0489a1cd..14c6f5e65b8d 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3020,11 +3020,16 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno); struct i915_request * i915_gem_find_active_request(struct intel_engine_cs *engine); -static inline bool i915_terminally_wedged(struct i915_gpu_error *error) +static inline bool __i915_wedged(struct i915_gpu_error *error) { return unlikely(test_bit(I915_WEDGED, &error->flags)); } +static inline bool i915_reset_failed(struct drm_i915_private *i915) +{ + return __i915_wedged(&i915->gpu_error); +} + static inline u32 i915_reset_count(struct i915_gpu_error *error) { return READ_ONCE(error->reset_count); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index b421bc7a2e26..b3d918d90c1f 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1928,7 +1928,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf) * fail). But any other -EIO isn't ours (e.g. swap in failure) * and so needs to be reported. */ - if (!i915_terminally_wedged(&dev_priv->gpu_error)) + if (!i915_terminally_wedged(dev_priv)) return VM_FAULT_SIGBUS; /* else: fall through */ case -EAGAIN: @@ -2958,7 +2958,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915) struct intel_engine_cs *engine; enum intel_engine_id id; - if (i915_terminally_wedged(&i915->gpu_error)) + if (i915_reset_failed(i915)) return; GEM_BUG_ON(i915->gt.active_requests); @@ -3806,8 +3806,9 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file) long ret; /* ABI: return -EIO if already wedged */ - if (i915_terminally_wedged(&dev_priv->gpu_error)) - return -EIO; + ret = i915_terminally_wedged(dev_priv); +