Re: [Intel-gfx] [PATCH v7] drm/i915: Beware temporary wedging when determining -EIO

2019-02-20 Thread Mika Kuoppala
Chris Wilson  writes:

> At a few points in our uABI, we check to see if the driver is wedged and
> report -EIO back to the user in that case. However, as we perform the
> check and reset asynchronously, we may instead see the temporary wedging
> used to cancel inflight rendering to avoid a deadlock during reset. If

This could mention that it is the wedge on timeout which will
flash the -EIO to userspace.

> we suspect this is the case, that is we see a wedged driver and reset in
> progress, then wait until the reset is resolved before reporting upon
> the wedged status.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580
> Signed-off-by: Chris Wilson 
> Cc: Mika Kuoppala 
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c   | 16 ---
>  drivers/gpu/drm/i915/i915_drv.h   |  7 -
>  drivers/gpu/drm/i915/i915_gem.c   | 22 +++
>  drivers/gpu/drm/i915/i915_request.c   |  5 ++--
>  drivers/gpu/drm/i915/i915_reset.c | 27 +--
>  drivers/gpu/drm/i915/i915_reset.h |  2 ++
>  drivers/gpu/drm/i915/intel_engine_cs.c|  8 +++---
>  drivers/gpu/drm/i915/intel_hangcheck.c|  2 +-
>  drivers/gpu/drm/i915/selftests/huge_pages.c   |  2 +-
>  drivers/gpu/drm/i915/selftests/i915_active.c  |  2 +-
>  .../drm/i915/selftests/i915_gem_coherency.c   |  4 +--
>  .../gpu/drm/i915/selftests/i915_gem_context.c |  2 +-
>  .../gpu/drm/i915/selftests/i915_gem_evict.c   |  2 +-
>  .../gpu/drm/i915/selftests/i915_gem_object.c  |  2 +-
>  drivers/gpu/drm/i915/selftests/i915_request.c |  2 +-
>  .../gpu/drm/i915/selftests/igt_flush_test.c   |  2 +-
>  .../gpu/drm/i915/selftests/intel_hangcheck.c  | 24 -
>  drivers/gpu/drm/i915/selftests/intel_lrc.c|  2 +-
>  .../drm/i915/selftests/intel_workarounds.c|  4 +--
>  19 files changed, 87 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index 2aeea977283f..54928d693c85 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3833,11 +3833,19 @@ static const struct file_operations 
> i915_cur_wm_latency_fops = {
>  static int
>  i915_wedged_get(void *data, u64 *val)
>  {
> - struct drm_i915_private *dev_priv = data;
> + int ret = i915_terminally_wedged(data);
>  
> - *val = i915_terminally_wedged(_priv->gpu_error);
> + switch (ret) {
> + case -EIO:
> + *val = 1;
> + ret = 0;
> + break;
> + case 0:
> + *val = 0;
> + break;
> + }

Hmm, joke is still there but it is better.

>  
> - return 0;
> + return ret;
>  }
>  
>  static int
> @@ -3918,7 +3926,7 @@ i915_drop_caches_set(void *data, u64 val)
>   mutex_unlock(>drm.struct_mutex);
>   }
>  
> - if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(>gpu_error))
> + if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(i915))
>   i915_handle_error(i915, ALL_ENGINES, 0, NULL);
>  
>   fs_reclaim_acquire(GFP_KERNEL);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5c8d0489a1cd..14c6f5e65b8d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3020,11 +3020,16 @@ int __must_check i915_gem_set_global_seqno(struct 
> drm_device *dev, u32 seqno);
>  struct i915_request *
>  i915_gem_find_active_request(struct intel_engine_cs *engine);
>  
> -static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> +static inline bool __i915_wedged(struct i915_gpu_error *error)
>  {
>   return unlikely(test_bit(I915_WEDGED, >flags));
>  }
>  
> +static inline bool i915_reset_failed(struct drm_i915_private *i915)
> +{
> + return __i915_wedged(>gpu_error);
> +}
> +
>  static inline u32 i915_reset_count(struct i915_gpu_error *error)
>  {
>   return READ_ONCE(error->reset_count);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b421bc7a2e26..b3d918d90c1f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1928,7 +1928,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
>* fail). But any other -EIO isn't ours (e.g. swap in failure)
>* and so needs to be reported.
>*/
> - if (!i915_terminally_wedged(_priv->gpu_error))
> + if (!i915_terminally_wedged(dev_priv))
>   return VM_FAULT_SIGBUS;
>   /* else: fall through */
>   case -EAGAIN:
> @@ -2958,7 +2958,7 @@ static void assert_kernel_context_is_current(struct 
> drm_i915_private *i915)
>   struct intel_engine_cs *engine;
>   enum intel_engine_id id;
>  
> - if (i915_terminally_wedged(>gpu_error))
> + if (i915_reset_failed(i915))
>   return;
>  
>   GEM_BUG_ON(i915->gt.active_requests);
> @@ -3806,8 +3806,9 

[Intel-gfx] [PATCH v7] drm/i915: Beware temporary wedging when determining -EIO

2019-02-19 Thread Chris Wilson
At a few points in our uABI, we check to see if the driver is wedged and
report -EIO back to the user in that case. However, as we perform the
check and reset asynchronously, we may instead see the temporary wedging
used to cancel inflight rendering to avoid a deadlock during reset. If
we suspect this is the case, that is we see a wedged driver and reset in
progress, then wait until the reset is resolved before reporting upon
the wedged status.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580
Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
---
 drivers/gpu/drm/i915/i915_debugfs.c   | 16 ---
 drivers/gpu/drm/i915/i915_drv.h   |  7 -
 drivers/gpu/drm/i915/i915_gem.c   | 22 +++
 drivers/gpu/drm/i915/i915_request.c   |  5 ++--
 drivers/gpu/drm/i915/i915_reset.c | 27 +--
 drivers/gpu/drm/i915/i915_reset.h |  2 ++
 drivers/gpu/drm/i915/intel_engine_cs.c|  8 +++---
 drivers/gpu/drm/i915/intel_hangcheck.c|  2 +-
 drivers/gpu/drm/i915/selftests/huge_pages.c   |  2 +-
 drivers/gpu/drm/i915/selftests/i915_active.c  |  2 +-
 .../drm/i915/selftests/i915_gem_coherency.c   |  4 +--
 .../gpu/drm/i915/selftests/i915_gem_context.c |  2 +-
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |  2 +-
 .../gpu/drm/i915/selftests/i915_gem_object.c  |  2 +-
 drivers/gpu/drm/i915/selftests/i915_request.c |  2 +-
 .../gpu/drm/i915/selftests/igt_flush_test.c   |  2 +-
 .../gpu/drm/i915/selftests/intel_hangcheck.c  | 24 -
 drivers/gpu/drm/i915/selftests/intel_lrc.c|  2 +-
 .../drm/i915/selftests/intel_workarounds.c|  4 +--
 19 files changed, 87 insertions(+), 50 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 2aeea977283f..54928d693c85 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3833,11 +3833,19 @@ static const struct file_operations 
i915_cur_wm_latency_fops = {
 static int
 i915_wedged_get(void *data, u64 *val)
 {
-   struct drm_i915_private *dev_priv = data;
+   int ret = i915_terminally_wedged(data);
 
-   *val = i915_terminally_wedged(_priv->gpu_error);
+   switch (ret) {
+   case -EIO:
+   *val = 1;
+   ret = 0;
+   break;
+   case 0:
+   *val = 0;
+   break;
+   }
 
-   return 0;
+   return ret;
 }
 
 static int
@@ -3918,7 +3926,7 @@ i915_drop_caches_set(void *data, u64 val)
mutex_unlock(>drm.struct_mutex);
}
 
-   if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(>gpu_error))
+   if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(i915))
i915_handle_error(i915, ALL_ENGINES, 0, NULL);
 
fs_reclaim_acquire(GFP_KERNEL);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5c8d0489a1cd..14c6f5e65b8d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3020,11 +3020,16 @@ int __must_check i915_gem_set_global_seqno(struct 
drm_device *dev, u32 seqno);
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine);
 
-static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
+static inline bool __i915_wedged(struct i915_gpu_error *error)
 {
return unlikely(test_bit(I915_WEDGED, >flags));
 }
 
+static inline bool i915_reset_failed(struct drm_i915_private *i915)
+{
+   return __i915_wedged(>gpu_error);
+}
+
 static inline u32 i915_reset_count(struct i915_gpu_error *error)
 {
return READ_ONCE(error->reset_count);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b421bc7a2e26..b3d918d90c1f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1928,7 +1928,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 * fail). But any other -EIO isn't ours (e.g. swap in failure)
 * and so needs to be reported.
 */
-   if (!i915_terminally_wedged(_priv->gpu_error))
+   if (!i915_terminally_wedged(dev_priv))
return VM_FAULT_SIGBUS;
/* else: fall through */
case -EAGAIN:
@@ -2958,7 +2958,7 @@ static void assert_kernel_context_is_current(struct 
drm_i915_private *i915)
struct intel_engine_cs *engine;
enum intel_engine_id id;
 
-   if (i915_terminally_wedged(>gpu_error))
+   if (i915_reset_failed(i915))
return;
 
GEM_BUG_ON(i915->gt.active_requests);
@@ -3806,8 +3806,9 @@ i915_gem_ring_throttle(struct drm_device *dev, struct 
drm_file *file)
long ret;
 
/* ABI: return -EIO if already wedged */
-   if (i915_terminally_wedged(_priv->gpu_error))
-   return -EIO;
+   ret = i915_terminally_wedged(dev_priv);
+   if (ret)
+   return ret;