On Thu, Jan 23, 2014 at 09:49:43PM +0000, Chris Wilson wrote:
> Currently we report through our error state only the rings that have
> been initialised (as detected by ring->obj). This check is done after
> the GPU reset and ring re-initialisation, which means that the software
> state may not be the same as when we captured the hardware error and we
> may not print out any of the vital information for debugging the hang.
> 
> This (and the implied object leak) is a regression from
> 
> commit 3d57e5bd1284f44e325f3a52d966259ed42f9e05
> Author: Ben Widawsky <b...@bwidawsk.net>
> Date:   Mon Oct 14 10:01:36 2013 -0700
> 
>     drm/i915: Do a fuller init after reset
> 
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Ben Widawsky <b...@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  1 +
>  drivers/gpu/drm/i915/i915_gpu_error.c | 19 +++++++++++++------
>  2 files changed, 14 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c45cbbecd66a..64a1aca7804d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -334,6 +334,7 @@ struct drm_i915_error_state {
>       struct timeval time;
>  
>       struct drm_i915_error_ring {
> +             int valid;

This should be a bool: it is only ever set to true and tested for truth.

>               struct drm_i915_error_object {
>                       int page_count;
>                       u32 gtt_offset;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
> b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 260a215e3619..e2af1d490f8d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -240,6 +240,9 @@ static void i915_ring_error_state(struct 
> drm_i915_error_state_buf *m,
>                                 unsigned ring)
>  {
>       BUG_ON(ring >= I915_NUM_RINGS); /* shut up confused gcc */
> +     if (!error->ring[ring].valid)
> +             return;
> +
>       err_printf(m, "%s command stream:\n", ring_str(ring));
>       err_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
>       err_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
> @@ -294,7 +297,6 @@ int i915_error_state_to_str(struct 
> drm_i915_error_state_buf *m,
>       struct drm_device *dev = error_priv->dev;
>       drm_i915_private_t *dev_priv = dev->dev_private;
>       struct drm_i915_error_state *error = error_priv->error;
> -     struct intel_ring_buffer *ring;
>       int i, j, page, offset, elt;
>  
>       if (!error) {
> @@ -329,7 +331,7 @@ int i915_error_state_to_str(struct 
> drm_i915_error_state_buf *m,
>       if (INTEL_INFO(dev)->gen == 7)
>               err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
>  
> -     for_each_ring(ring, dev_priv, i)
> +     for (i = 0; i < ARRAY_SIZE(error->ring); i++)
>               i915_ring_error_state(m, dev, error, i);
>  
>       for (i = 0; i < error->vm_count; i++) {
> @@ -388,8 +390,7 @@ int i915_error_state_to_str(struct 
> drm_i915_error_state_buf *m,
>                       }
>               }
>  
> -             obj = error->ring[i].ctx;
> -             if (obj) {
> +             if ((obj = error->ring[i].ctx)) {

Unrelated change, although it does make this more consistent with the
surrounding code. But I admit to not being a fan of assignments inside
if statements.

>                       err_printf(m, "%s --- HW Context = 0x%08x\n",
>                                  dev_priv->ring[i].name,
>                                  obj->gtt_offset);
> @@ -826,11 +827,17 @@ static void i915_gem_record_rings(struct drm_device 
> *dev,
>                                 struct drm_i915_error_state *error)
>  {
>       struct drm_i915_private *dev_priv = dev->dev_private;
> -     struct intel_ring_buffer *ring;
>       struct drm_i915_gem_request *request;
>       int i, count;
>  
> -     for_each_ring(ring, dev_priv, i) {
> +     for (i = 0; i < I915_NUM_RINGS; i++) {
> +             struct intel_ring_buffer *ring = &dev_priv->ring[i];
> +
> +             if (ring->dev == NULL)
> +                     continue;
> +
> +             error->ring[i].valid = true;
> +

The code here runs before the reset, and it would actually oops if
ring->obj==NULL, so using for_each_ring() here looks appropriate.

>               i915_record_ring_state(dev, error, ring);
>  
>               error->ring[i].batchbuffer =
> -- 
> 1.8.5.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to