On Fri, Dec 11, 2015 at 11:33:05AM +0000, Chris Wilson wrote:
> Reporting -EIO from i915_wait_request() has proven very troublematic
> over the years, with numerous hard-to-reproduce bugs cropping up in the
> corner case of where a reset occurs and the code wasn't expecting such
> an error.
> 
> If the we reset the GPU or have detected a hang and wish to reset the
> GPU, the request is forcibly complete and the wait broken. Currently, we
> report either -EAGAIN or -EIO in order for the caller to retreat and
> restart the wait (if appropriate) after dropping and then reacquiring
> the struct_mutex (essential to allow the GPU reset to proceed). However,
> if we take the view that the request is complete (no further work will
> be done on it by the GPU because it is dead and soon to be reset), then
> we can proceed with the task at hand and then drop the struct_mutex
> allowing the reset to occur. This transfers the burden of checking
> whether it is safe to proceed to the caller, which in all but one
> instance it is safe - completely eliminating the source of all spurious
> -EIO.
> 
> Of note, we only have two API entry points where we expect that
> userspace can observe an EIO. First is when submitting an execbuf, if
> the GPU is terminally wedged, then the operation cannot succeed and an
> -EIO is reported. Secondly, existing userspace uses the throttle ioctl
> to detect an already wedged GPU before starting using HW acceleration
> (or to confirm that the GPU is wedged after an error condition). So if
> the GPU is wedged when the user calls throttle, also report -EIO.
> 
> v2: Split more carefully the change to i915_wait_request() and assorted
> ABI from the reset handling.
> 
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vet...@ffwll.ch>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  2 --
>  drivers/gpu/drm/i915/i915_gem.c         | 44 
> ++++++++++++++++-----------------
>  drivers/gpu/drm/i915/i915_gem_userptr.c |  6 ++---
>  drivers/gpu/drm/i915/intel_display.c    |  8 +-----
>  drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c |  2 +-
>  6 files changed, 27 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index f30c305a6889..7acbc072973a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2987,8 +2987,6 @@ i915_gem_find_active_request(struct intel_engine_cs 
> *ring);
>  
>  bool i915_gem_retire_requests(struct drm_device *dev);
>  void i915_gem_retire_requests_ring(struct intel_engine_cs *ring);
> -int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> -                                   bool interruptible);
>  
>  static inline u32 i915_reset_counter(struct i915_gpu_error *error)
>  {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b17cc0e42a4f..f5760869a17c 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -208,11 +208,10 @@ i915_gem_object_put_pages_phys(struct 
> drm_i915_gem_object *obj)
>       BUG_ON(obj->madv == __I915_MADV_PURGED);
>  
>       ret = i915_gem_object_set_to_cpu_domain(obj, true);
> -     if (ret) {
> +     if (WARN_ON(ret)) {
>               /* In the event of a disaster, abandon all caches and
>                * hope for the best.
>                */
> -             WARN_ON(ret != -EIO);
>               obj->base.read_domains = obj->base.write_domain = 
> I915_GEM_DOMAIN_CPU;
>       }
>  
> @@ -1106,15 +1105,13 @@ put_rpm:
>       return ret;
>  }
>  
> -int
> -i915_gem_check_wedge(struct i915_gpu_error *error,
> -                  bool interruptible)
> +static int
> +i915_gem_check_wedge(unsigned reset_counter, bool interruptible)
>  {
> -     if (i915_reset_in_progress_or_wedged(error)) {
> -             /* Recovery complete, but the reset failed ... */
> -             if (i915_terminally_wedged(error))
> -                     return -EIO;
> +     if (__i915_terminally_wedged(reset_counter))
> +             return -EIO;
>  
> +     if (__i915_reset_in_progress(reset_counter)) {
>               /* Non-interruptible callers can't handle -EAGAIN, hence return
>                * -EIO unconditionally for these. */
>               if (!interruptible)
> @@ -1285,13 +1282,14 @@ int __i915_wait_request(struct drm_i915_gem_request 
> *req,
>               prepare_to_wait(&ring->irq_queue, &wait, state);
>  
>               /* We need to check whether any gpu reset happened in between
> -              * the caller grabbing the seqno and now ... */
> +              * the request being submitted and now. If a reset has occurred,
> +              * the request is effectively complete (we either are in the
> +              * process of or have discarded the rendering and completely
> +              * reset the GPU. The results of the request are lost and we
> +              * are free to continue on with the original operation.
> +              */
>               if (req->reset_counter != 
> i915_reset_counter(&dev_priv->gpu_error)) {
> -                     /* ... but upgrade the -EAGAIN to an -EIO if the gpu
> -                      * is truely gone. */
> -                     ret = i915_gem_check_wedge(&dev_priv->gpu_error, 
> interruptible);
> -                     if (ret == 0)
> -                             ret = -EAGAIN;
> +                     ret = 0;
>                       break;
>               }
>  
> @@ -2154,11 +2152,10 @@ i915_gem_object_put_pages_gtt(struct 
> drm_i915_gem_object *obj)
>       BUG_ON(obj->madv == __I915_MADV_PURGED);
>  
>       ret = i915_gem_object_set_to_cpu_domain(obj, true);
> -     if (ret) {
> +     if (WARN_ON(ret)) {
>               /* In the event of a disaster, abandon all caches and
>                * hope for the best.
>                */
> -             WARN_ON(ret != -EIO);
>               i915_gem_clflush_object(obj, true);
>               obj->base.read_domains = obj->base.write_domain = 
> I915_GEM_DOMAIN_CPU;
>       }
> @@ -2678,8 +2675,11 @@ int i915_gem_request_alloc(struct intel_engine_cs 
> *ring,
>  
>       *req_out = NULL;
>  
> -     ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> -                                dev_priv->mm.interruptible);
> +     /* ABI: Before userspace accesses the GPU (e.g. execbuffer), report
> +      * EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex
> +      * and restart.
> +      */
> +     ret = i915_gem_check_wedge(reset_counter, dev_priv->mm.interruptible);
>       if (ret)
>               return ret;
>  
> @@ -4093,9 +4093,9 @@ i915_gem_ring_throttle(struct drm_device *dev, struct 
> drm_file *file)
>       if (ret)
>               return ret;
>  
> -     ret = i915_gem_check_wedge(&dev_priv->gpu_error, false);
> -     if (ret)
> -             return ret;
> +     /* ABI: return -EIO if already wedged */
> +     if (i915_terminally_wedged(&dev_priv->gpu_error))
> +             return -EIO;
>  
>       spin_lock(&file_priv->mm.lock);
>       list_for_each_entry(request, &file_priv->mm.request_list, client_list) {
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
> b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 19fb0bddc1cd..1a5f89dba4af 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -81,10 +81,8 @@ static void __cancel_userptr__worker(struct work_struct 
> *work)
>               was_interruptible = dev_priv->mm.interruptible;
>               dev_priv->mm.interruptible = false;
>  
> -             list_for_each_entry_safe(vma, tmp, &obj->vma_list, vma_link) {
> -                     int ret = i915_vma_unbind(vma);
> -                     WARN_ON(ret && ret != -EIO);
> -             }
> +             list_for_each_entry_safe(vma, tmp, &obj->vma_list, vma_link)
> +                     WARN_ON(i915_vma_unbind(vma));
>               WARN_ON(i915_gem_object_put_pages(obj));
>  
>               dev_priv->mm.interruptible = was_interruptible;
> diff --git a/drivers/gpu/drm/i915/intel_display.c 
> b/drivers/gpu/drm/i915/intel_display.c
> index d7bbd015de35..875bdf814d73 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -13405,10 +13405,6 @@ static int intel_atomic_prepare_commit(struct 
> drm_device *dev,
>  
>                       ret = __i915_wait_request(intel_plane_state->wait_req,
>                                                 true, NULL, NULL);
> -
> -                     /* Swallow -EIO errors to allow updates during hw 
> lockup. */
> -                     if (ret == -EIO)

I'd like to keep a WARN_ON here since it's ABI relevant, and consistent
with the other places we've had a WARN_ON(-EIO).

> -                             ret = 0;
>                       if (ret) {
>                               mutex_lock(&dev->struct_mutex);
>                               drm_atomic_helper_cleanup_planes(dev, state);
> @@ -13744,9 +13740,7 @@ intel_prepare_plane_fb(struct drm_plane *plane,
>                */
>               if (needs_modeset(crtc_state))
>                       ret = i915_gem_object_wait_rendering(old_obj, true);
> -
> -             /* Swallow -EIO errors to allow updates during hw lockup. */
> -             if (ret && ret != -EIO)
> +             if (ret)

Same here really.

We already have a WARN_ON in the mmio_flip, so are covered there already.

Otherwise looks good, and with the above addressed and under the
assumption that the last caller of check_wedge is in request_alloc (which
I was too lazy to check perfectly):

Reviewed-by: Daniel Vetter <daniel.vet...@ffwll.ch>

>                       return ret;
>       }
>  
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
> b/drivers/gpu/drm/i915/intel_lrc.c
> index 69b298eed69d..7030ff526941 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -976,7 +976,7 @@ void intel_logical_ring_stop(struct intel_engine_cs *ring)
>               return;
>  
>       ret = intel_ring_idle(ring);
> -     if (ret && 
> !i915_reset_in_progress_or_wedged(&to_i915(ring->dev)->gpu_error))
> +     if (ret)
>               DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
>                         ring->name, ret);
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 942f86b316c2..6cecc15ec01b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -3062,7 +3062,7 @@ intel_stop_ring_buffer(struct intel_engine_cs *ring)
>               return;
>  
>       ret = intel_ring_idle(ring);
> -     if (ret && 
> !i915_reset_in_progress_or_wedged(&to_i915(ring->dev)->gpu_error))
> +     if (ret)
>               DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
>                         ring->name, ret);
>  
> -- 
> 2.6.3
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to