Given sufficient preemption, we may see a busy system that doesn't advance seqno while performing work across multiple contexts, and given sufficient pathology not even notice a change in ACTHD. What does change between the preempting contexts is their RING, so take note of that and treat a change in the ring address as being an indication of forward progress.
Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com> --- drivers/gpu/drm/i915/gt/intel_engine_types.h | 1 + drivers/gpu/drm/i915/gt/intel_hangcheck.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index d972c339309c..3b8d2f692bd5 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -53,6 +53,7 @@ struct intel_instdone { struct intel_engine_hangcheck { u64 acthd; + u32 last_ring; u32 last_seqno; u32 next_seqno; unsigned long action_timestamp; diff --git a/drivers/gpu/drm/i915/gt/intel_hangcheck.c b/drivers/gpu/drm/i915/gt/intel_hangcheck.c index e5eaa06fe74d..721ab74a382f 100644 --- a/drivers/gpu/drm/i915/gt/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/gt/intel_hangcheck.c @@ -27,6 +27,7 @@ struct hangcheck { u64 acthd; + u32 ring; u32 seqno; enum intel_engine_hangcheck_action action; unsigned long action_timestamp; @@ -134,6 +135,7 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine, { hc->acthd = intel_engine_get_active_head(engine); hc->seqno = intel_engine_get_hangcheck_seqno(engine); + hc->ring = ENGINE_READ(engine, RING_START); } static void hangcheck_store_sample(struct intel_engine_cs *engine, @@ -141,18 +143,22 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine, { engine->hangcheck.acthd = hc->acthd; engine->hangcheck.last_seqno = hc->seqno; + engine->hangcheck.last_ring = hc->ring; } static enum intel_engine_hangcheck_action hangcheck_get_action(struct intel_engine_cs *engine, const struct hangcheck *hc) { - if (engine->hangcheck.last_seqno != hc->seqno) - return ENGINE_ACTIVE_SEQNO; - if (intel_engine_is_idle(engine)) return ENGINE_IDLE; + if (engine->hangcheck.last_ring != hc->ring) + return ENGINE_ACTIVE_SEQNO; + + if (engine->hangcheck.last_seqno != hc->seqno) + return ENGINE_ACTIVE_SEQNO; + return engine_stuck(engine, hc->acthd); } -- 2.20.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx