On Wed, Apr 26, 2023 at 04:57:13PM -0400, Rodrigo Vivi wrote:
> With this patch, we now have some parity between xe_devcoredump
> and the simple_error_capture. The only difference is that
> xe_devcoredump will only stash the 'first' hang, which is the one
> that we care most and should analyze first, while
> simple_error_capture will dump them all the kernel log.
> 
> But this is just a start point to start building a useful and
> organized crash dump, using standard infrastructure. Later this
> will be changed to have output that can be parsed by tools and
> used for error replay.
> 
> Also, it is important to highlight that the goal is not to replace
> the simple_error_capture which is still useful for some cases.
> But simple_error_capture should be protected under DEBUG and
> EXPERT flags, while the devcoredump has its own production config
> and will be useful for bug reporting and for error replay.
> 
> Signed-off-by: Rodrigo Vivi <rodrigo.v...@intel.com>

Again maybe hold this off after GPUVA but LGTM. Also 1 nit below.

Reviewed-by: Matthew Brost <matthew.br...@intel.com>

> ---
>  drivers/gpu/drm/xe/xe_devcoredump.c       | 6 ++++++
>  drivers/gpu/drm/xe/xe_devcoredump_types.h | 3 +++
>  2 files changed, 9 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c 
> b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 1ffd12646a99..9dbafd586fbd 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -16,6 +16,7 @@
>  #include "xe_guc_ct.h"
>  #include "xe_guc_submit.h"
>  #include "xe_hw_engine.h"
> +#include "xe_vm.h"
>  
>  /**
>   * DOC: Xe device coredump
> @@ -103,6 +104,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t 
> offset,
>       for_each_hw_engine(hwe, e->gt, id)
>               xe_hw_engine_snapshot_print(coredump->snapshot.hwe[id], &p);
>  
> +     drm_printf(&p, "\n**** VM ****\n");
> +     xe_vm_snapshot_print(coredump->snapshot.vm, &p);
> +
>       mutex_unlock(&coredump->lock);
>  
>       return count - iter.remain;
> @@ -124,6 +128,7 @@ static void xe_devcoredump_free(void *data)
>       xe_guc_engine_snapshot_free(coredump->snapshot.ge);
>       for_each_hw_engine(hwe, coredump->faulty_engine->gt, id)
>               xe_hw_engine_snapshot_free(coredump->snapshot.hwe[id]);
> +     xe_vm_snapshot_free(coredump->snapshot.vm);
>  
>       coredump->faulty_engine = NULL;
>       drm_info(&coredump_to_xe(coredump)->drm,
> @@ -172,6 +177,7 @@ static void devcoredump_snapshot(struct xe_devcoredump 
> *coredump)
>               coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe);
>       }
>  
> +     coredump->snapshot.vm = xe_vm_snapshot_capture(e->vm, e->gt->info.id);
>       xe_force_wake_put(gt_to_fw(e->gt), XE_FORCEWAKE_ALL);
>       dma_fence_end_signalling(cookie);
>  }
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h 
> b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 8b17ecf1b6e6..f508eca292f7 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -31,8 +31,11 @@ struct xe_devcoredump_snapshot {
>       struct xe_guc_ct_snapshot *ct;
>       /** @ge: Guc Engine snapshot */
>       struct xe_guc_submit_engine_snapshot *ge;
> +

Nit extra newline.

>       /** @hwe: HW Engine snapshot array */
>       struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES];
> +     /** @vm: VM snapshot */
> +     struct xe_vm_snapshot *vm;
>  };
>  
>  /**
> -- 
> 2.39.2
> 

Reply via email to