Re: [Intel-xe] [PATCH 11/14] drm/xe: Add HW Engine snapshot to xe_devcoredump.

2023-05-02 Thread Matthew Brost
On Wed, Apr 26, 2023 at 04:57:10PM -0400, Rodrigo Vivi wrote:
> Let's continue to add our existent simple logs to devcoredump one
> by one. Any format change should come on follow-up work.
> 
> Signed-off-by: Rodrigo Vivi 
> ---
>  drivers/gpu/drm/xe/xe_devcoredump.c   | 45 +++
>  drivers/gpu/drm/xe/xe_devcoredump_types.h |  4 ++
>  2 files changed, 49 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c 
> b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 0e7ec654a9f2..1ffd12646a99 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -9,10 +9,13 @@
>  #include 
>  #include 
>  
> +#include "xe_device.h"
>  #include "xe_engine.h"
> +#include "xe_force_wake.h"
>  #include "xe_gt.h"
>  #include "xe_guc_ct.h"
>  #include "xe_guc_submit.h"
> +#include "xe_hw_engine.h"
>  
>  /**
>   * DOC: Xe device coredump
> @@ -62,6 +65,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t 
> offset,
>   struct drm_printer p;
>   struct drm_print_iterator iter;
>   struct timespec64 ts;
> + struct xe_engine *e;
> + struct xe_hw_engine *hwe;
> + enum xe_hw_engine_id id;
>  
>   /* Our device is gone already... */
>   if (!data || !coredump_to_xe(coredump))
> @@ -75,6 +81,7 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t 
> offset,
>   mutex_lock(>lock);
>  
>   ss = >snapshot;
> + e = coredump->faulty_engine;
>   p = drm_coredump_printer();
>  
>   drm_printf(, " Xe Device Coredump \n");
> @@ -92,6 +99,10 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t 
> offset,
>   xe_guc_ct_snapshot_print(coredump->snapshot.ct, );
>   xe_guc_engine_snapshot_print(coredump->snapshot.ge, );
>  
> + drm_printf(, "\n HW Engines \n");
> + for_each_hw_engine(hwe, e->gt, id)
> + xe_hw_engine_snapshot_print(coredump->snapshot.hwe[id], );
> +
>   mutex_unlock(>lock);
>  
>   return count - iter.remain;
> @@ -100,6 +111,8 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t 
> offset,
>  static void xe_devcoredump_free(void *data)
>  {
>   struct xe_devcoredump *coredump = data;
> + struct xe_hw_engine *hwe;
> + enum xe_hw_engine_id id;
>  
>   /* Our device is gone. Nothing to do... */
>   if (!data || !coredump_to_xe(coredump))
> @@ -109,6 +122,8 @@ static void xe_devcoredump_free(void *data)
>  
>   xe_guc_ct_snapshot_free(coredump->snapshot.ct);
>   xe_guc_engine_snapshot_free(coredump->snapshot.ge);
> + for_each_hw_engine(hwe, coredump->faulty_engine->gt, id)
> + xe_hw_engine_snapshot_free(coredump->snapshot.hwe[id]);
>  
>   coredump->faulty_engine = NULL;
>   drm_info(_to_xe(coredump)->drm,
> @@ -122,13 +137,43 @@ static void devcoredump_snapshot(struct xe_devcoredump 
> *coredump)
>   struct xe_devcoredump_snapshot *ss = >snapshot;
>   struct xe_engine *e = coredump->faulty_engine;
>   struct xe_guc *guc = engine_to_guc(e);
> + struct xe_hw_engine *hwe;
> + enum xe_hw_engine_id id;
> + u32 adj_logical_mask = e->logical_mask;
> + u32 width_mask = (0x1 << e->width) - 1;
> + int i;
> + bool cookie;
>  
>   lockdep_assert_held(>lock);
>   ss->snapshot_time = ktime_get_real();
>   ss->boot_time = ktime_get_boottime();
>  
> + cookie = dma_fence_begin_signalling();

Why the annotation here? Otherwise LGTM.

Matt

> + for (i = 0; e->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) {
> + if (adj_logical_mask & BIT(i)) {
> + adj_logical_mask |= width_mask << i;
> + i += e->width;
> + } else {
> + ++i;
> + }
> + }
> +
> + xe_force_wake_get(gt_to_fw(e->gt), XE_FORCEWAKE_ALL);
> +
>   coredump->snapshot.ct = xe_guc_ct_snapshot_capture(>ct);
>   coredump->snapshot.ge = xe_guc_engine_snapshot_capture(e);
> +
> + for_each_hw_engine(hwe, e->gt, id) {
> + if (hwe->class != e->hwe->class ||
> + !(BIT(hwe->logical_instance) & adj_logical_mask)) {
> + coredump->snapshot.hwe[id] = NULL;
> + continue;
> + }
> + coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe);
> + }
> +
> + xe_force_wake_put(gt_to_fw(e->gt), XE_FORCEWAKE_ALL);
> + dma_fence_end_signalling(cookie);
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h 
> b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index e055b266af70..8b17ecf1b6e6 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -9,6 +9,8 @@
>  #include 
>  #include 
>  
> +#include "xe_hw_engine_types.h"
> +
>  struct xe_device;
>  
>  /**
> @@ -29,6 +31,8 @@ struct xe_devcoredump_snapshot {
>   struct xe_guc_ct_snapshot *ct;
>   /** @ge: Guc Engine snapshot */
>   struct xe_guc_submit_engine_snapshot 

[PATCH 11/14] drm/xe: Add HW Engine snapshot to xe_devcoredump.

2023-04-26 Thread Rodrigo Vivi
Let's continue to add our existent simple logs to devcoredump one
by one. Any format change should come on follow-up work.

Signed-off-by: Rodrigo Vivi 
---
 drivers/gpu/drm/xe/xe_devcoredump.c   | 45 +++
 drivers/gpu/drm/xe/xe_devcoredump_types.h |  4 ++
 2 files changed, 49 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c 
b/drivers/gpu/drm/xe/xe_devcoredump.c
index 0e7ec654a9f2..1ffd12646a99 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -9,10 +9,13 @@
 #include 
 #include 
 
+#include "xe_device.h"
 #include "xe_engine.h"
+#include "xe_force_wake.h"
 #include "xe_gt.h"
 #include "xe_guc_ct.h"
 #include "xe_guc_submit.h"
+#include "xe_hw_engine.h"
 
 /**
  * DOC: Xe device coredump
@@ -62,6 +65,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t 
offset,
struct drm_printer p;
struct drm_print_iterator iter;
struct timespec64 ts;
+   struct xe_engine *e;
+   struct xe_hw_engine *hwe;
+   enum xe_hw_engine_id id;
 
/* Our device is gone already... */
if (!data || !coredump_to_xe(coredump))
@@ -75,6 +81,7 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t 
offset,
mutex_lock(>lock);
 
ss = >snapshot;
+   e = coredump->faulty_engine;
p = drm_coredump_printer();
 
drm_printf(, " Xe Device Coredump \n");
@@ -92,6 +99,10 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t 
offset,
xe_guc_ct_snapshot_print(coredump->snapshot.ct, );
xe_guc_engine_snapshot_print(coredump->snapshot.ge, );
 
+   drm_printf(, "\n HW Engines \n");
+   for_each_hw_engine(hwe, e->gt, id)
+   xe_hw_engine_snapshot_print(coredump->snapshot.hwe[id], );
+
mutex_unlock(>lock);
 
return count - iter.remain;
@@ -100,6 +111,8 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t 
offset,
 static void xe_devcoredump_free(void *data)
 {
struct xe_devcoredump *coredump = data;
+   struct xe_hw_engine *hwe;
+   enum xe_hw_engine_id id;
 
/* Our device is gone. Nothing to do... */
if (!data || !coredump_to_xe(coredump))
@@ -109,6 +122,8 @@ static void xe_devcoredump_free(void *data)
 
xe_guc_ct_snapshot_free(coredump->snapshot.ct);
xe_guc_engine_snapshot_free(coredump->snapshot.ge);
+   for_each_hw_engine(hwe, coredump->faulty_engine->gt, id)
+   xe_hw_engine_snapshot_free(coredump->snapshot.hwe[id]);
 
coredump->faulty_engine = NULL;
drm_info(_to_xe(coredump)->drm,
@@ -122,13 +137,43 @@ static void devcoredump_snapshot(struct xe_devcoredump 
*coredump)
struct xe_devcoredump_snapshot *ss = >snapshot;
struct xe_engine *e = coredump->faulty_engine;
struct xe_guc *guc = engine_to_guc(e);
+   struct xe_hw_engine *hwe;
+   enum xe_hw_engine_id id;
+   u32 adj_logical_mask = e->logical_mask;
+   u32 width_mask = (0x1 << e->width) - 1;
+   int i;
+   bool cookie;
 
lockdep_assert_held(>lock);
ss->snapshot_time = ktime_get_real();
ss->boot_time = ktime_get_boottime();
 
+   cookie = dma_fence_begin_signalling();
+   for (i = 0; e->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) {
+   if (adj_logical_mask & BIT(i)) {
+   adj_logical_mask |= width_mask << i;
+   i += e->width;
+   } else {
+   ++i;
+   }
+   }
+
+   xe_force_wake_get(gt_to_fw(e->gt), XE_FORCEWAKE_ALL);
+
coredump->snapshot.ct = xe_guc_ct_snapshot_capture(>ct);
coredump->snapshot.ge = xe_guc_engine_snapshot_capture(e);
+
+   for_each_hw_engine(hwe, e->gt, id) {
+   if (hwe->class != e->hwe->class ||
+   !(BIT(hwe->logical_instance) & adj_logical_mask)) {
+   coredump->snapshot.hwe[id] = NULL;
+   continue;
+   }
+   coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe);
+   }
+
+   xe_force_wake_put(gt_to_fw(e->gt), XE_FORCEWAKE_ALL);
+   dma_fence_end_signalling(cookie);
 }
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h 
b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index e055b266af70..8b17ecf1b6e6 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -9,6 +9,8 @@
 #include 
 #include 
 
+#include "xe_hw_engine_types.h"
+
 struct xe_device;
 
 /**
@@ -29,6 +31,8 @@ struct xe_devcoredump_snapshot {
struct xe_guc_ct_snapshot *ct;
/** @ge: Guc Engine snapshot */
struct xe_guc_submit_engine_snapshot *ge;
+   /** @hwe: HW Engine snapshot array */
+   struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES];
 };
 
 /**
-- 
2.39.2