Defined trace points and sprinkled the usage of these throughout the
TDR/watchdog implementation.

The following trace points are supported:

        1. trace_i915_tdr_gpu_recovery:
        Called at the onset of the full GPU reset recovery path.

        2. trace_i915_tdr_engine_recovery:
        Called at the onset of the per-engine recovery path.

        3. i915_tdr_recovery_start:
        Called at the onset of hang recovery before recovery mode has been
        decided.

        4. i915_tdr_recovery_complete:
        Called at the point of hang recovery completion.

        5. i915_tdr_recovery_queued:
        Called once the error handler decides to schedule the actual hang
        recovery, which marks the end of the hang detection path.

        6. i915_tdr_engine_save:
        Called at the point of saving the engine state during per-engine hang
        recovery.

        7. i915_tdr_gpu_reset_complete:
        Called at the point of full GPU reset recovery completion.

        8. i915_tdr_engine_reset_complete:
        Called at the point of per-engine recovery completion.

        9. i915_tdr_forced_csb_check:
        Called at the completion of a forced CSB check.

        10. i915_tdr_hang_check:
        Called for every engine in the periodic hang checker loop before moving
        on to the next engine. Provides an overview of all hang check stats in
        real-time. The collected stats are:

                a. Engine name.

                b. Current engine seqno.

                c. Seqno of previous hang check iteration for that engine.

                d. ACTHD register value of given engine.

                e. Current hang check score of given engine (and whether or not
                the engine has been detected as hung).

                f. Current action for given engine.

                g. Busyness of given engine.

                h. Submission status of currently running context on given 
engine.

        11. i915_tdr_inconsistency:
        Called when an inconsistency is detected to provide more information in
        the log about the nature of the inconsistency. The collected
        information is:

                a. Engine name.

                b. ID of the currently executing context on hardware.

                c. Is the given engine idle or not?

                d. The ID of the context that was most recently submitted to
                the ELSP port from the execlist queue for the given engine.

                e. The submission/IRQ balance of the request most recently
                submitted to hardware (elsp_submitted).

Signed-off-by: Tomas Elf <tomas....@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c       |   3 +
 drivers/gpu/drm/i915/i915_drv.h       |   1 +
 drivers/gpu/drm/i915/i915_gpu_error.c |   2 +-
 drivers/gpu/drm/i915/i915_irq.c       |  11 +-
 drivers/gpu/drm/i915/i915_trace.h     | 339 ++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c      |  21 ++-
 drivers/gpu/drm/i915/intel_uncore.c   |   4 +
 7 files changed, 377 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c32c475..d482341 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -896,6 +896,7 @@ int i915_reset(struct drm_device *dev)
        bool simulated;
        int ret;
 
+       trace_i915_tdr_gpu_recovery(dev);
        intel_reset_gt_powersave(dev);
 
        mutex_lock(&dev->struct_mutex);
@@ -1120,6 +1121,8 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 
        WARN_ON(!mutex_is_locked(&dev->struct_mutex));
 
+       trace_i915_tdr_engine_recovery(engine);
+
         /* Take wake lock to prevent power saving mode */
        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7ebf800..5b54675 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3335,6 +3335,7 @@ void i915_destroy_error_state(struct drm_device *dev);
 
 void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone);
 const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
+const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a);
 
 /* i915_cmd_parser.c */
 int i915_cmd_parser_get_version(void);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 2f04e4f..8c0cd0e 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -221,7 +221,7 @@ static void print_error_buffers(struct 
drm_i915_error_state_buf *m,
        }
 }
 
-static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
+const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
 {
        switch (a) {
        case HANGCHECK_IDLE:
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 71208de..8fe972b 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2454,6 +2454,7 @@ static void i915_error_work_func(struct work_struct *work)
 
        mutex_lock(&dev->struct_mutex);
 
+       trace_i915_tdr_recovery_start(dev);
        kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
 
        for_each_ring(ring, dev_priv, i) {
@@ -2572,6 +2573,7 @@ static void i915_error_work_func(struct work_struct *work)
                        kobject_uevent_env(&dev->primary->kdev->kobj,
                                           KOBJ_CHANGE, reset_done_event);
        }
+       trace_i915_tdr_recovery_complete(dev);
 }
 
 static void i915_report_and_clear_eir(struct drm_device *dev)
@@ -2697,6 +2699,7 @@ void i915_handle_error(struct drm_device *dev, u32 
engine_mask,
        struct drm_i915_private *dev_priv = dev->dev_private;
        va_list args;
        char error_msg[80];
+       bool full_reset = true;
 
        struct intel_engine_cs *engine;
 
@@ -2715,7 +2718,6 @@ void i915_handle_error(struct drm_device *dev, u32 
engine_mask,
                 *      2. The hardware does not support it (pre-gen7).
                 *      3. We already tried per-engine reset recently.
                 */
-               bool full_reset = true;
 
                if (!i915.enable_engine_reset) {
                        DRM_INFO("Engine reset disabled: Using full GPU 
reset.\n");
@@ -2744,6 +2746,7 @@ void i915_handle_error(struct drm_device *dev, u32 
engine_mask,
                                                i915.gpu_reset_promotion_time;
 
                                        
engine->hangcheck.last_engine_reset_time = now;
+
                                } else {
                                        /*
                                         * Watchdog timeout always results
@@ -2792,6 +2795,8 @@ void i915_handle_error(struct drm_device *dev, u32 
engine_mask,
                i915_error_wake_up(dev_priv, false);
        }
 
+       trace_i915_tdr_recovery_queued(dev, engine_mask, watchdog, full_reset);
+
        /*
         * Gen 7:
         *
@@ -3158,6 +3163,7 @@ static void i915_hangcheck_elapsed(struct work_struct 
*work)
        for_each_ring(ring, dev_priv, i) {
                u64 acthd;
                u32 seqno;
+               u32 head;
                bool busy = true;
 
                semaphore_clear_deadlocks(dev_priv);
@@ -3234,7 +3240,10 @@ static void i915_hangcheck_elapsed(struct work_struct 
*work)
 
                ring->hangcheck.seqno = seqno;
                ring->hangcheck.acthd = acthd;
+               head = I915_READ_HEAD(ring);
                busy_count += busy;
+
+               trace_i915_tdr_hang_check(ring, seqno, acthd, head, busy);
        }
 
        for_each_ring(ring, dev_priv, i) {
diff --git a/drivers/gpu/drm/i915/i915_trace.h 
b/drivers/gpu/drm/i915/i915_trace.h
index 04fe849..8d86a7d 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -810,6 +810,345 @@ TRACE_EVENT(switch_mm,
                  __entry->dev, __entry->ring, __entry->to, __entry->vm)
 );
 
+/**
+ * DOC: i915_tdr_gpu_recovery
+ *
+ * This tracepoint tracks the onset of the full GPU recovery path
+ */
+TRACE_EVENT(i915_tdr_gpu_recovery,
+       TP_PROTO(struct drm_device *dev),
+
+       TP_ARGS(dev),
+
+       TP_STRUCT__entry(
+                       __field(u32, dev)
+       ),
+
+       TP_fast_assign(
+                       __entry->dev = dev->primary->index;
+       ),
+
+       TP_printk("dev=%u, full GPU recovery started",
+                 __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_engine_recovery
+ *
+ * This tracepoint tracks the onset of the engine recovery path
+ */
+TRACE_EVENT(i915_tdr_engine_recovery,
+       TP_PROTO(struct intel_engine_cs *engine),
+
+       TP_ARGS(engine),
+
+       TP_STRUCT__entry(
+                       __field(struct intel_engine_cs *, engine)
+       ),
+
+       TP_fast_assign(
+                       __entry->engine = engine;
+       ),
+
+       TP_printk("dev=%u, engine=%u, recovery of %s started",
+                 __entry->engine->dev->primary->index,
+                 __entry->engine->id,
+                 __entry->engine->name)
+);
+
+/**
+ * DOC: i915_tdr_recovery_start
+ *
+ * This tracepoint tracks hang recovery start
+ */
+TRACE_EVENT(i915_tdr_recovery_start,
+       TP_PROTO(struct drm_device *dev),
+
+       TP_ARGS(dev),
+
+       TP_STRUCT__entry(
+                       __field(u32, dev)
+       ),
+
+       TP_fast_assign(
+                       __entry->dev = dev->primary->index;
+       ),
+
+       TP_printk("dev=%u, hang recovery started",
+                 __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_recovery_complete
+ *
+ * This tracepoint tracks hang recovery completion
+ */
+TRACE_EVENT(i915_tdr_recovery_complete,
+       TP_PROTO(struct drm_device *dev),
+
+       TP_ARGS(dev),
+
+       TP_STRUCT__entry(
+                       __field(u32, dev)
+       ),
+
+       TP_fast_assign(
+                       __entry->dev = dev->primary->index;
+       ),
+
+       TP_printk("dev=%u, hang recovery completed",
+                 __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_recovery_queued
+ *
+ * This tracepoint tracks the point of queuing recovery from hang check.
+ * If engine recovery is requested engine name will be displayed, otherwise
+ * it will be set to "none". If too many engine reset was attempted in the
+ * previous history we promote to full GPU reset, which is remarked by 
appending
+ * the "[PROMOTED]" flag.
+ */
+TRACE_EVENT(i915_tdr_recovery_queued,
+       TP_PROTO(struct drm_device *dev,
+                u32 hung_engines,
+                bool watchdog,
+                bool full_reset),
+
+       TP_ARGS(dev, hung_engines, watchdog, full_reset),
+
+       TP_STRUCT__entry(
+                       __field(u32, dev)
+                       __field(u32, hung_engines)
+                       __field(bool, watchdog)
+                       __field(bool, full_reset)
+       ),
+
+       TP_fast_assign(
+                       __entry->dev = dev->primary->index;
+                       __entry->hung_engines = hung_engines;
+                       __entry->watchdog = watchdog;
+                       __entry->full_reset = full_reset;
+       ),
+
+       TP_printk("dev=%u, hung_engines=0x%02x%s%s%s%s%s%s%s, watchdog=%s, 
full_reset=%s",
+                 __entry->dev,
+                 __entry->hung_engines,
+                 __entry->hung_engines ? " (":"",
+                 __entry->hung_engines & RENDER_RING ? " [RCS] " : "",
+                 __entry->hung_engines & BSD_RING ?    " [VCS] " : "",
+                 __entry->hung_engines & BLT_RING ?    " [BCS] " : "",
+                 __entry->hung_engines & VEBOX_RING ?  " [VECS] " : "",
+                 __entry->hung_engines & BSD2_RING ?   " [VCS2] " : "",
+                 __entry->hung_engines ? ")":"",
+                 __entry->watchdog ? "true" : "false",
+                 __entry->full_reset ?
+                       (__entry->hung_engines ? "true [PROMOTED]" : "true") :
+                               "false")
+);
+
+/**
+ * DOC: i915_tdr_engine_save
+ *
+ * This tracepoint tracks the point of engine state save during the engine
+ * recovery path. Logs the head pointer position at point of hang, the position
+ * after recovering and whether or not we forced a head pointer advancement or
+ * rounded up to an aligned QWORD position.
+ */
+TRACE_EVENT(i915_tdr_engine_save,
+       TP_PROTO(struct intel_engine_cs *engine,
+                u32 old_head,
+                u32 new_head,
+                bool forced_advance),
+
+       TP_ARGS(engine, old_head, new_head, forced_advance),
+
+       TP_STRUCT__entry(
+                       __field(struct intel_engine_cs *, engine)
+                       __field(u32, old_head)
+                       __field(u32, new_head)
+                       __field(bool, forced_advance)
+       ),
+
+       TP_fast_assign(
+                       __entry->engine = engine;
+                       __entry->old_head = old_head;
+                       __entry->new_head = new_head;
+                       __entry->forced_advance = forced_advance;
+       ),
+
+       TP_printk("dev=%u, engine=%s, old_head=%u, new_head=%u, 
forced_advance=%s",
+                 __entry->engine->dev->primary->index,
+                 __entry->engine->name,
+                 __entry->old_head,
+                 __entry->new_head,
+                 __entry->forced_advance ? "true" : "false")
+);
+
+/**
+ * DOC: i915_tdr_gpu_reset_complete
+ *
+ * This tracepoint tracks the point of full GPU reset completion
+ */
+TRACE_EVENT(i915_tdr_gpu_reset_complete,
+       TP_PROTO(struct drm_device *dev),
+
+       TP_ARGS(dev),
+
+       TP_STRUCT__entry(
+                       __field(struct drm_device *, dev)
+       ),
+
+       TP_fast_assign(
+                       __entry->dev = dev;
+       ),
+
+       TP_printk("dev=%u, resets=%u",
+               __entry->dev->primary->index,
+               i915_reset_count(&((struct drm_i915_private *)
+                       (__entry->dev)->dev_private)->gpu_error) )
+);
+
+/**
+ * DOC: i915_tdr_engine_reset_complete
+ *
+ * This tracepoint tracks the point of engine reset completion
+ */
+TRACE_EVENT(i915_tdr_engine_reset_complete,
+       TP_PROTO(struct intel_engine_cs *engine),
+
+       TP_ARGS(engine),
+
+       TP_STRUCT__entry(
+                       __field(struct intel_engine_cs *, engine)
+       ),
+
+       TP_fast_assign(
+                       __entry->engine = engine;
+       ),
+
+       TP_printk("dev=%u, engine=%s, resets=%u",
+                 __entry->engine->dev->primary->index,
+                 __entry->engine->name,
+                 __entry->engine->hangcheck.reset_count)
+);
+
+/**
+ * DOC: i915_tdr_forced_csb_check
+ *
+ * This tracepoint tracks the occurences of forced CSB checks
+ * that the driver does when detecting inconsistent context
+ * submission states between the driver state and the current
+ * CPU engine state.
+ */
+TRACE_EVENT(i915_tdr_forced_csb_check,
+       TP_PROTO(struct intel_engine_cs *engine,
+                bool was_effective),
+
+       TP_ARGS(engine, was_effective),
+
+       TP_STRUCT__entry(
+                       __field(struct intel_engine_cs *, engine)
+                       __field(bool, was_effective)
+       ),
+
+       TP_fast_assign(
+                       __entry->engine = engine;
+                       __entry->was_effective = was_effective;
+       ),
+
+       TP_printk("dev=%u, engine=%s, was_effective=%s",
+                 __entry->engine->dev->primary->index,
+                 __entry->engine->name,
+                 __entry->was_effective ? "yes" : "no")
+);
+
+/**
+ * DOC: i915_tdr_hang_check
+ *
+ * This tracepoint tracks hang checks on each engine.
+ */
+TRACE_EVENT(i915_tdr_hang_check,
+       TP_PROTO(struct intel_engine_cs *engine,
+                u32 seqno,
+                u64 acthd,
+                u32 hd,
+                bool busy),
+
+       TP_ARGS(engine, seqno, acthd, hd, busy),
+
+       TP_STRUCT__entry(
+                       __field(struct intel_engine_cs *, engine)
+                       __field(u32, seqno)
+                       __field(u64, acthd)
+                       __field(u32, hd)
+                       __field(bool, busy)
+       ),
+
+       TP_fast_assign(
+                       __entry->engine = engine;
+                       __entry->seqno = seqno;
+                       __entry->acthd = acthd;
+                       __entry->hd = hd;
+                       __entry->busy = busy;
+       ),
+
+       TP_printk("dev=%u, engine=%s, seqno=%u (%d), last seqno=%u (%d), 
head=%u (%d), acthd=%lu, score=%d%s, action=%u [%s], busy=%s",
+                 __entry->engine->dev->primary->index,
+                 __entry->engine->name,
+                 __entry->seqno,
+                 __entry->seqno,
+                 __entry->engine->hangcheck.seqno,
+                 __entry->engine->hangcheck.seqno,
+                 __entry->hd,
+                 __entry->hd,
+                 (long unsigned int) __entry->acthd,
+                 __entry->engine->hangcheck.score,
+                 (__entry->engine->hangcheck.score >= 
HANGCHECK_SCORE_RING_HUNG) ? " [HUNG]" : "",
+                 (unsigned int) __entry->engine->hangcheck.action,
+                 hangcheck_action_to_str(__entry->engine->hangcheck.action),
+                 __entry->busy ? "yes" : "no")
+);
+
+/**
+ * DOC: i915_tdr_inconsistency
+ *
+ * This tracepoint tracks detected inconsistencies
+ */
+TRACE_EVENT(i915_tdr_inconsistency,
+       TP_PROTO(struct intel_engine_cs *engine,
+                u32 hw_context,
+                bool hw_active,
+                u32 sw_context,
+                struct drm_i915_gem_request *req),
+
+       TP_ARGS(engine, hw_context, hw_active, sw_context, req),
+
+       TP_STRUCT__entry(
+                       __field(struct intel_engine_cs *, engine)
+                       __field(u32, hw_context)
+                       __field(bool, hw_active)
+                       __field(u32, sw_context)
+                       __field(int, elsp_submitted)
+       ),
+
+       TP_fast_assign(
+                       __entry->engine = engine;
+                       __entry->hw_context = hw_context;
+                       __entry->hw_active = hw_active;
+                       __entry->sw_context = sw_context;
+                       __entry->elsp_submitted = req?req->elsp_submitted:0;
+       ),
+
+       TP_printk("dev=%u, engine=%s, hw_context=%x, hw_active=%s, 
sw_context=%x, elsp_submitted=%d",
+                 __entry->engine->dev->primary->index,
+                 __entry->engine->name,
+                 __entry->hw_context,
+                 __entry->hw_active?"true":"false",
+                 __entry->sw_context,
+                 __entry->elsp_submitted)
+);
+
 #endif /* _I915_TRACE_H_ */
 
 /* This part must be outside protection */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5bb7d6e..96f5a28 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2285,7 +2285,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct 
drm_i915_gem_request *req,
        struct intel_context *ctx;
        int ret = 0;
        int clamp_to_tail = 0;
-       uint32_t head;
+       uint32_t head, old_head;
        uint32_t tail;
        uint32_t head_addr;
        uint32_t tail_addr;
@@ -2300,7 +2300,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct 
drm_i915_gem_request *req,
         * Read head from MMIO register since it contains the
         * most up to date value of head at this point.
         */
-       head = I915_READ_HEAD(ring);
+       old_head = head = I915_READ_HEAD(ring);
 
        /*
         * Read tail from the context because the execlist queue
@@ -2357,6 +2357,9 @@ gen8_ring_save(struct intel_engine_cs *ring, struct 
drm_i915_gem_request *req,
        head |= (head_addr & HEAD_ADDR);
        ring->saved_head = head;
 
+       trace_i915_tdr_engine_save(ring, old_head,
+               head, force_advance);
+
        return 0;
 }
 
@@ -3264,6 +3267,19 @@ intel_execlists_TDR_get_current_request(struct 
intel_engine_cs *ring,
                        CONTEXT_SUBMISSION_STATUS_NONE_SUBMITTED;
        }
 
+       /*
+        * This may or may not be a sustained inconsistency. Most of the time
+        * it's only a matter of a transitory inconsistency during context
+        * submission/completion but if we happen to detect a sustained
+        * inconsistency then it helps to have more information.
+        */
+       if (status == CONTEXT_SUBMISSION_STATUS_INCONSISTENT)
+               trace_i915_tdr_inconsistency(ring,
+                                            hw_context,
+                                            hw_active,
+                                            sw_context,
+                                            tmpreq);
+
        if (req)
                *req = tmpreq;
 
@@ -3328,6 +3344,7 @@ bool intel_execlists_TDR_force_CSB_check(struct 
drm_i915_private *dev_priv,
 
        spin_unlock_irqrestore(&engine->execlist_lock, flags);
 
+       trace_i915_tdr_forced_csb_check(engine, !!was_effective);
        wake_up_all(&engine->irq_queue);
 
        return !!was_effective;
diff --git a/drivers/gpu/drm/i915/intel_uncore.c 
b/drivers/gpu/drm/i915/intel_uncore.c
index 400b679..625eda8 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1464,6 +1464,8 @@ static int gen6_do_reset(struct drm_device *dev)
        /* Spin waiting for the device to ack the reset request */
        ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST) & 
GEN6_GRDOM_FULL) == 0, 500);
 
+       trace_i915_tdr_gpu_reset_complete(dev);
+
        intel_uncore_forcewake_reset(dev, true);
 
        return ret;
@@ -1620,6 +1622,8 @@ static int do_engine_reset_nolock(struct intel_engine_cs 
*engine)
                break;
        }
 
+       trace_i915_tdr_engine_reset_complete(engine);
+
        return ret;
 }
 
-- 
1.9.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to