Re: [Intel-gfx] [PATCH 29/34] drm/i915: Drop fake breadcrumb irq

2019-01-24 Thread Chris Wilson
Quoting Tvrtko Ursulin (2019-01-24 17:55:20)
> 
> On 21/01/2019 22:21, Chris Wilson wrote:
> > Missed breadcrumb detection is defunct due to the tight coupling with
> 
> How it is defunct.. oh because there is no irq_count any more... could 
> you describe the transition from irq_count to irq_fired and then to 
> nothing briefly?

We don't have an independent intel_wait to distinguish irq completions
from dma_fence_signals. Everytime we call dma_fence_signal() we think we
saw an interrupt, and we complete fences very often before we see
interrupts... And then our test completely fails to setup the machine to
detect the missed breadcrumb as the requests get completed by anything
but the missed breadcrumb timer.

> > dma_fence signaling and the myriad ways we may signal fences from
> > everywhere but from an interrupt, i.e. we frequently signal a fence
> > before we even see its interrupt. This means that even if we miss an
> > interrupt for a fence, it still is signaled before our breadcrumb
> > hangcheck fires, so simplify the breadcrumb hangchecking by moving it
> > into the GPU hangcheck and forgo fake interrupts.
> > 
> > Signed-off-by: Chris Wilson 
> > ---
> >   drivers/gpu/drm/i915/i915_debugfs.c   |  93 ---
> >   drivers/gpu/drm/i915/i915_gpu_error.c |   2 -
> >   drivers/gpu/drm/i915/i915_gpu_error.h |   5 -
> >   drivers/gpu/drm/i915/intel_breadcrumbs.c  | 147 +-
> >   drivers/gpu/drm/i915/intel_hangcheck.c|   2 +
> >   drivers/gpu/drm/i915/intel_ringbuffer.h   |   5 -
> >   .../gpu/drm/i915/selftests/igt_live_test.c|   7 -
> >   7 files changed, 5 insertions(+), 256 deletions(-)
> 
> With this balance of insertions and deletions I cannot decide if this is 
> easy or hard to review.
> 
> IGT uses intel_detect_and_clear_missed_interrupts a lot. What is the 
> plan there?

They are defunct. They no longer detect anything useful from the
previous patch, this just makes it official. igt has been prepped for
the loss of the debugfs.

Without this patch we get false positives from i915_missed_interrupt
instead.

I've tried and failed to replace the detection in an acceptable manner,
without a separate irq completion tracker it seems hopeless.

> > diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c 
> > b/drivers/gpu/drm/i915/intel_hangcheck.c
> > index 5662d6fed523..a219c796e56d 100644
> > --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> > @@ -275,6 +275,8 @@ static void i915_hangcheck_elapsed(struct work_struct 
> > *work)
> >   for_each_engine(engine, dev_priv, id) {
> >   struct hangcheck hc;
> >   
> > + intel_engine_signal_breadcrumbs(engine);
> > +
> 
> Sounds fine. So only downside is detecting missed interrupts gets 
> slower. But in practice they don't happen often?

In practice, no missed interrupts. *touch wood*
That was why fixing gen5-gen7 beforehand was so important.

Having a signal here and in retire_work, means that in the worst case
everything updates once a second. Enough for users to be able to
complain. But more than likely every frame update will flush the earlier
requests anyway, hence why we couldn't detect missed breadcrumbs in igt
in the first place.
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 29/34] drm/i915: Drop fake breadcrumb irq

2019-01-24 Thread Tvrtko Ursulin


On 21/01/2019 22:21, Chris Wilson wrote:

Missed breadcrumb detection is defunct due to the tight coupling with


How it is defunct.. oh because there is no irq_count any more... could 
you describe the transition from irq_count to irq_fired and then to 
nothing briefly?



dma_fence signaling and the myriad ways we may signal fences from
everywhere but from an interrupt, i.e. we frequently signal a fence
before we even see its interrupt. This means that even if we miss an
interrupt for a fence, it still is signaled before our breadcrumb
hangcheck fires, so simplify the breadcrumb hangchecking by moving it
into the GPU hangcheck and forgo fake interrupts.

Signed-off-by: Chris Wilson 
---
  drivers/gpu/drm/i915/i915_debugfs.c   |  93 ---
  drivers/gpu/drm/i915/i915_gpu_error.c |   2 -
  drivers/gpu/drm/i915/i915_gpu_error.h |   5 -
  drivers/gpu/drm/i915/intel_breadcrumbs.c  | 147 +-
  drivers/gpu/drm/i915/intel_hangcheck.c|   2 +
  drivers/gpu/drm/i915/intel_ringbuffer.h   |   5 -
  .../gpu/drm/i915/selftests/igt_live_test.c|   7 -
  7 files changed, 5 insertions(+), 256 deletions(-)


With this balance of insertions and deletions I cannot decide if this is 
easy or hard to review.


IGT uses intel_detect_and_clear_missed_interrupts a lot. What is the 
plan there?




diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index d7764e62e9b4..c2aaf010c3d1 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1321,9 +1321,6 @@ static int i915_hangcheck_info(struct seq_file *m, void 
*unused)
   intel_engine_last_submit(engine),
   jiffies_to_msecs(jiffies -

engine->hangcheck.action_timestamp));
-   seq_printf(m, "\tfake irq active? %s\n",
-  yesno(test_bit(engine->id,
- 
&dev_priv->gpu_error.missed_irq_rings)));
  
  		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",

   (long long)engine->hangcheck.acthd,
@@ -3874,94 +3871,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
i915_wedged_get, i915_wedged_set,
"%llu\n");
  
-static int

-fault_irq_set(struct drm_i915_private *i915,
- unsigned long *irq,
- unsigned long val)
-{
-   int err;
-
-   err = mutex_lock_interruptible(&i915->drm.struct_mutex);
-   if (err)
-   return err;
-
-   err = i915_gem_wait_for_idle(i915,
-I915_WAIT_LOCKED |
-I915_WAIT_INTERRUPTIBLE,
-MAX_SCHEDULE_TIMEOUT);
-   if (err)
-   goto err_unlock;
-
-   *irq = val;
-   mutex_unlock(&i915->drm.struct_mutex);
-
-   /* Flush idle worker to disarm irq */
-   drain_delayed_work(&i915->gt.idle_work);
-
-   return 0;
-
-err_unlock:
-   mutex_unlock(&i915->drm.struct_mutex);
-   return err;
-}
-
-static int
-i915_ring_missed_irq_get(void *data, u64 *val)
-{
-   struct drm_i915_private *dev_priv = data;
-
-   *val = dev_priv->gpu_error.missed_irq_rings;
-   return 0;
-}
-
-static int
-i915_ring_missed_irq_set(void *data, u64 val)
-{
-   struct drm_i915_private *i915 = data;
-
-   return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
-   i915_ring_missed_irq_get, i915_ring_missed_irq_set,
-   "0x%08llx\n");
-
-static int
-i915_ring_test_irq_get(void *data, u64 *val)
-{
-   struct drm_i915_private *dev_priv = data;
-
-   *val = dev_priv->gpu_error.test_irq_rings;
-
-   return 0;
-}
-
-static int
-i915_ring_test_irq_set(void *data, u64 val)
-{
-   struct drm_i915_private *i915 = data;
-
-   /* GuC keeps the user interrupt permanently enabled for submission */
-   if (USES_GUC_SUBMISSION(i915))
-   return -ENODEV;
-
-   /*
-* From icl, we can no longer individually mask interrupt generation
-* from each engine.
-*/
-   if (INTEL_GEN(i915) >= 11)
-   return -ENODEV;
-
-   val &= INTEL_INFO(i915)->ring_mask;
-   DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
-
-   return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
-   i915_ring_test_irq_get, i915_ring_test_irq_set,
-   "0x%08llx\n");
-
  #define DROP_UNBOUND  BIT(0)
  #define DROP_BOUNDBIT(1)
  #define DROP_RETIRE   BIT(2)
@@ -4724,8 +4633,6 @@ static const struct i915_debugfs_files {
  } i915_debugfs_files[] = {
{"i915_wedged", &i915_wedged_fops},
{"i915_cache_sharing", &i915_cache_sharing_fops},
-  

[Intel-gfx] [PATCH 29/34] drm/i915: Drop fake breadcrumb irq

2019-01-21 Thread Chris Wilson
Missed breadcrumb detection is defunct due to the tight coupling with
dma_fence signaling and the myriad ways we may signal fences from
everywhere but from an interrupt, i.e. we frequently signal a fence
before we even see its interrupt. This means that even if we miss an
interrupt for a fence, it still is signaled before our breadcrumb
hangcheck fires, so simplify the breadcrumb hangchecking by moving it
into the GPU hangcheck and forgo fake interrupts.

Signed-off-by: Chris Wilson 
---
 drivers/gpu/drm/i915/i915_debugfs.c   |  93 ---
 drivers/gpu/drm/i915/i915_gpu_error.c |   2 -
 drivers/gpu/drm/i915/i915_gpu_error.h |   5 -
 drivers/gpu/drm/i915/intel_breadcrumbs.c  | 147 +-
 drivers/gpu/drm/i915/intel_hangcheck.c|   2 +
 drivers/gpu/drm/i915/intel_ringbuffer.h   |   5 -
 .../gpu/drm/i915/selftests/igt_live_test.c|   7 -
 7 files changed, 5 insertions(+), 256 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index d7764e62e9b4..c2aaf010c3d1 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1321,9 +1321,6 @@ static int i915_hangcheck_info(struct seq_file *m, void 
*unused)
   intel_engine_last_submit(engine),
   jiffies_to_msecs(jiffies -

engine->hangcheck.action_timestamp));
-   seq_printf(m, "\tfake irq active? %s\n",
-  yesno(test_bit(engine->id,
- 
&dev_priv->gpu_error.missed_irq_rings)));
 
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
   (long long)engine->hangcheck.acthd,
@@ -3874,94 +3871,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
i915_wedged_get, i915_wedged_set,
"%llu\n");
 
-static int
-fault_irq_set(struct drm_i915_private *i915,
- unsigned long *irq,
- unsigned long val)
-{
-   int err;
-
-   err = mutex_lock_interruptible(&i915->drm.struct_mutex);
-   if (err)
-   return err;
-
-   err = i915_gem_wait_for_idle(i915,
-I915_WAIT_LOCKED |
-I915_WAIT_INTERRUPTIBLE,
-MAX_SCHEDULE_TIMEOUT);
-   if (err)
-   goto err_unlock;
-
-   *irq = val;
-   mutex_unlock(&i915->drm.struct_mutex);
-
-   /* Flush idle worker to disarm irq */
-   drain_delayed_work(&i915->gt.idle_work);
-
-   return 0;
-
-err_unlock:
-   mutex_unlock(&i915->drm.struct_mutex);
-   return err;
-}
-
-static int
-i915_ring_missed_irq_get(void *data, u64 *val)
-{
-   struct drm_i915_private *dev_priv = data;
-
-   *val = dev_priv->gpu_error.missed_irq_rings;
-   return 0;
-}
-
-static int
-i915_ring_missed_irq_set(void *data, u64 val)
-{
-   struct drm_i915_private *i915 = data;
-
-   return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
-   i915_ring_missed_irq_get, i915_ring_missed_irq_set,
-   "0x%08llx\n");
-
-static int
-i915_ring_test_irq_get(void *data, u64 *val)
-{
-   struct drm_i915_private *dev_priv = data;
-
-   *val = dev_priv->gpu_error.test_irq_rings;
-
-   return 0;
-}
-
-static int
-i915_ring_test_irq_set(void *data, u64 val)
-{
-   struct drm_i915_private *i915 = data;
-
-   /* GuC keeps the user interrupt permanently enabled for submission */
-   if (USES_GUC_SUBMISSION(i915))
-   return -ENODEV;
-
-   /*
-* From icl, we can no longer individually mask interrupt generation
-* from each engine.
-*/
-   if (INTEL_GEN(i915) >= 11)
-   return -ENODEV;
-
-   val &= INTEL_INFO(i915)->ring_mask;
-   DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
-
-   return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
-   i915_ring_test_irq_get, i915_ring_test_irq_set,
-   "0x%08llx\n");
-
 #define DROP_UNBOUND   BIT(0)
 #define DROP_BOUND BIT(1)
 #define DROP_RETIREBIT(2)
@@ -4724,8 +4633,6 @@ static const struct i915_debugfs_files {
 } i915_debugfs_files[] = {
{"i915_wedged", &i915_wedged_fops},
{"i915_cache_sharing", &i915_cache_sharing_fops},
-   {"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
-   {"i915_ring_test_irq", &i915_ring_test_irq_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops},
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
{"i915_error_state", &i915_error_state_fops},
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 825572127029..0584c8dfa6a