On Mon, Aug 26, 2013 at 02:51:53PM +0100, Chris Wilson wrote:
> When we switched to always using a timeout in conjunction with
> wait_seqno, we lost the ability to detect missed interrupts. Since, we
> have had issues with interrupts on a number of generations, and they are
> required to be delivered in a timely fashion for a smooth UX, it is
> important that we do log errors found in the wild and prevent the
> display stalling for upwards of 1s every time the seqno interrupt is
> missed.
> 
> Rather than continue to fix up the timeouts to work around the interface
> impedence in wait_event_*(), open code the combination of
> wait_event[_interruptible][_timeout], and use the exposed timer to
> poll for seqno should we detect a lost interrupt.
> 
> v2: In order to satisfy the debug requirement of logging missed
> interrupts with the real world requirments of making machines work even
> if interrupts are hosed, we revert to polling after detecting a missed
> interrupt.
> 
> v3: Throw in a debugfs interface to simulate broken hw not reporting
> interrupts.

This interface should be a separate patch.

> 
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c   |  68 ++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_drv.h       |   5 ++
>  drivers/gpu/drm/i915/i915_gem.c       | 114 
> ++++++++++++++++++++--------------
>  drivers/gpu/drm/i915/i915_gpu_error.c |   1 +
>  drivers/gpu/drm/i915/i915_irq.c       |  11 ++--
>  5 files changed, 148 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index 55ab924..e823169 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1876,6 +1876,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops,
>                       i915_ring_stop_get, i915_ring_stop_set,
>                       "0x%08llx\n");
>  
> +static int
> +i915_ring_missed_irq_get(void *data, u64 *val)
> +{
> +     struct drm_device *dev = data;
> +     drm_i915_private_t *dev_priv = dev->dev_private;
> +
> +     *val = dev_priv->gpu_error.missed_irq_rings;
> +     return 0;
> +}
> +
> +static int
> +i915_ring_missed_irq_set(void *data, u64 val)
> +{
> +     struct drm_device *dev = data;
> +     struct drm_i915_private *dev_priv = dev->dev_private;
> +     int ret;
> +
> +     /* Lock against concurrent debugfs callers */
> +     ret = mutex_lock_interruptible(&dev->struct_mutex);
> +     if (ret)
> +             return ret;
> +     dev_priv->gpu_error.missed_irq_rings = val;
> +     mutex_unlock(&dev->struct_mutex);
> +
> +     return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
> +                     i915_ring_missed_irq_get, i915_ring_missed_irq_set,
> +                     "0x%08llx\n");
> +
> +static int
> +i915_ring_test_irq_get(void *data, u64 *val)
> +{
> +     struct drm_device *dev = data;
> +     drm_i915_private_t *dev_priv = dev->dev_private;
> +
> +     *val = dev_priv->gpu_error.test_irq_rings;
> +
> +     return 0;
> +}
> +
> +static int
> +i915_ring_test_irq_set(void *data, u64 val)
> +{
> +     struct drm_device *dev = data;
> +     struct drm_i915_private *dev_priv = dev->dev_private;
> +     int ret;
> +
> +     DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
> +
> +     /* Lock against concurrent debugfs callers */
> +     ret = mutex_lock_interruptible(&dev->struct_mutex);
> +     if (ret)
> +             return ret;
> +
> +     dev_priv->gpu_error.test_irq_rings = val;
> +     mutex_unlock(&dev->struct_mutex);
> +
> +     return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
> +                     i915_ring_test_irq_get, i915_ring_test_irq_set,
> +                     "0x%08llx\n");
> +
>  #define DROP_UNBOUND 0x1
>  #define DROP_BOUND 0x2
>  #define DROP_RETIRE 0x4
> @@ -2269,6 +2335,8 @@ static struct i915_debugfs_files {
>       {"i915_min_freq", &i915_min_freq_fops},
>       {"i915_cache_sharing", &i915_cache_sharing_fops},
>       {"i915_ring_stop", &i915_ring_stop_fops},
> +     {"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
> +     {"i915_ring_test_irq", &i915_ring_test_irq_fops},
>       {"i915_gem_drop_caches", &i915_drop_caches_fops},
>       {"i915_error_state", &i915_error_state_fops},
>       {"i915_next_seqno", &i915_next_seqno_fops},
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 84b95b1..ac6db6e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -993,6 +993,8 @@ struct i915_gpu_error {
>  
>       unsigned long last_reset;
>  
> +     unsigned long missed_irq_rings;
> +
>       /**
>        * State variable and reset counter controlling the reset flow
>        *
> @@ -1031,6 +1033,9 @@ struct i915_gpu_error {
>  
>       /* For gpu hang simulation. */
>       unsigned int stop_rings;
> +
> +     /* For missed irq/seqno simulation. */
> +     unsigned int test_irq_rings;
>  };
>  
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ccfebf5..4ed782f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -970,6 +970,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 
> seqno)
>       return ret;
>  }
>  
> +static void fake_irq(unsigned long data)
> +{
> +     wake_up_process((struct task_struct *)data);
> +}
> +
> +static bool missed_irq(struct drm_i915_private *dev_priv,
> +                    struct intel_ring_buffer *ring)
> +{
> +     return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
> +}
> +
>  /**
>   * __wait_seqno - wait until execution of seqno has finished
>   * @ring: the ring expected to report seqno
> @@ -993,10 +1004,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, 
> u32 seqno,
>                       bool interruptible, struct timespec *timeout)
>  {
>       drm_i915_private_t *dev_priv = ring->dev->dev_private;
> -     struct timespec before, now, wait_time={1,0};
> -     unsigned long timeout_jiffies;
> -     long end;
> -     bool wait_forever = true;
> +     struct timespec before, now;
> +     DEFINE_WAIT(wait);
> +     long timeout_jiffies;
>       int ret;
>  
>       WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n");
> @@ -1004,51 +1014,71 @@ static int __wait_seqno(struct intel_ring_buffer 
> *ring, u32 seqno,
>       if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
>               return 0;
>  
> -     trace_i915_gem_request_wait_begin(ring, seqno);
> -
> -     if (timeout != NULL) {
> -             wait_time = *timeout;
> -             wait_forever = false;
> -     }
> -
> -     timeout_jiffies = timespec_to_jiffies_timeout(&wait_time);
> +     timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1;
>  
> -     if (WARN_ON(!ring->irq_get(ring)))
> +     if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) &&
> +         WARN_ON(!ring->irq_get(ring)))
>               return -ENODEV;
>  
> -     /* Record current time in case interrupted by signal, or wedged * */
> +     /* Record current time in case interrupted by signal, or wedged */
> +     trace_i915_gem_request_wait_begin(ring, seqno);
>       getrawmonotonic(&before);
> +     for (;;) {
> +             struct timer_list timer;
> +             unsigned long expire;
>  
> -#define EXIT_COND \
> -     (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
> -      i915_reset_in_progress(&dev_priv->gpu_error) || \
> -      reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
> -     do {
> -             if (interruptible)
> -                     end = wait_event_interruptible_timeout(ring->irq_queue,
> -                                                            EXIT_COND,
> -                                                            timeout_jiffies);
> -             else
> -                     end = wait_event_timeout(ring->irq_queue, EXIT_COND,
> -                                              timeout_jiffies);
> +             prepare_to_wait(&ring->irq_queue, &wait,
> +                             interruptible ? TASK_INTERRUPTIBLE : 
> TASK_UNINTERRUPTIBLE);
>  
>               /* We need to check whether any gpu reset happened in between
>                * the caller grabbing the seqno and now ... */
> -             if (reset_counter != 
> atomic_read(&dev_priv->gpu_error.reset_counter))
> -                     end = -EAGAIN;
> +             if (reset_counter != 
> atomic_read(&dev_priv->gpu_error.reset_counter)) {
> +                     /* ... but upgrade the -EGAIN to an -EIO if the gpu
                                                ^ EAGAIN (I realize it's
                                                copypasta)
> +                      * is truely gone. */
                                ^ truly
> +                     ret = i915_gem_check_wedge(&dev_priv->gpu_error, 
> interruptible);
> +                     if (ret == 0)
> +                             ret = -EAGAIN;
> +                     break;
> +             }
>  
> -             /* ... but upgrade the -EGAIN to an -EIO if the gpu is truely
> -              * gone. */
> -             ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> -             if (ret)
> -                     end = ret;
> -     } while (end == 0 && wait_forever);
> +             if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) {
> +                     ret = 0;
> +                     break;
> +             }
> +
> +             if (interruptible && signal_pending(current)) {
> +                     ret = -ERESTARTSYS;
> +                     break;
> +             }
> +

Not sure, but I think the signal check should come first.  As long as
you've thought about it, I'm okay with whatever.

> +             if (timeout_jiffies <= 0) {
> +                     ret = -ETIME;
> +                     break;
> +             }
>  
> +             timer.function = NULL;
> +             if (timeout || missed_irq(dev_priv, ring)) {
> +                     setup_timer_on_stack(&timer, fake_irq, (unsigned 
> long)current);
> +                     expire = jiffies + (missed_irq(dev_priv, ring) ? 1: 
> timeout_jiffies);
> +                     mod_timer(&timer, expire);
> +             }

Isn't this just a fancy schedule_timeout_interruptible?

> +
> +             schedule();
> +
> +             if (timeout)
> +                     timeout_jiffies = expire - jiffies;
> +
> +             if (timer.function) {
> +                     del_singleshot_timer_sync(&timer);
> +                     destroy_timer_on_stack(&timer);
> +             }
> +     }
>       getrawmonotonic(&now);
> +     trace_i915_gem_request_wait_end(ring, seqno);
>  
>       ring->irq_put(ring);
> -     trace_i915_gem_request_wait_end(ring, seqno);
> -#undef EXIT_COND
> +
> +     finish_wait(&ring->irq_queue, &wait);
>  
>       if (timeout) {
>               struct timespec sleep_time = timespec_sub(now, before);
> @@ -1057,17 +1087,7 @@ static int __wait_seqno(struct intel_ring_buffer 
> *ring, u32 seqno,
>                       set_normalized_timespec(timeout, 0, 0);
>       }
>  
> -     switch (end) {
> -     case -EIO:
> -     case -EAGAIN: /* Wedged */
> -     case -ERESTARTSYS: /* Signal */
> -             return (int)end;
> -     case 0: /* Timeout */
> -             return -ETIME;
> -     default: /* Completed */
> -             WARN_ON(end < 0); /* We're not aware of other errors */
> -             return 0;
> -     }
> +     return ret;
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
> b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 558e568..62368f4 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -288,6 +288,7 @@ int i915_error_state_to_str(struct 
> drm_i915_error_state_buf *m,
>       err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
>       err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
>       err_printf(m, "CCID: 0x%08x\n", error->ccid);
> +     err_printf(m, "Missed interrupts: 0x%08lx\n", 
> dev_priv->gpu_error.missed_irq_rings);
>  
>       for (i = 0; i < dev_priv->num_fence_regs; i++)
>               err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index a03b445..7ebf105 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1972,10 +1972,13 @@ static void i915_hangcheck_elapsed(unsigned long data)
>                       if (ring_idle(ring, seqno)) {
>                               if (waitqueue_active(&ring->irq_queue)) {
>                                       /* Issue a wake-up to catch stuck h/w. 
> */
> -                                     DRM_ERROR("Hangcheck timer elapsed... 
> %s idle\n",
> -                                               ring->name);
> -                                     wake_up_all(&ring->irq_queue);
> -                                     ring->hangcheck.score += HUNG;
> +                                     if (!test_and_set_bit(ring->id, 
> &dev_priv->gpu_error.missed_irq_rings)) {
> +                                             DRM_ERROR("Hangcheck timer 
> elapsed... %s idle\n",
> +                                                       ring->name);
> +                                             wake_up_all(&ring->irq_queue);
> +                                     }
> +                                     /* Safeguard against driver failure */
> +                                     ring->hangcheck.score += BUSY;
>                               } else
>                                       busy = false;
>                       } else {

Honestly, I don't think using the scheduler APIs is a step in the right
direction wrt readability - but I have no better solution, and it seems
to do what you want.

Anyway, I tried to review it, and failed. If you want to split it up
more, I can try again.

Meanwhile, the idea seems okay to me.

-- 
Ben Widawsky, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to