Re: [PATCH v3] drm/amdgpu: Add Ring Hang Events

2024-05-13 Thread Christian König

Am 13.05.24 um 06:14 schrieb Ori Messinger:

This patch adds 'ring hang' events to the driver.
This is done by adding a 'reset_ring_hang' bool variable to the
struct 'amdgpu_reset_context' in the amdgpu_reset.h file.
The purpose for this 'reset_ring_hang' variable is whenever a GPU
reset is initiated due to a ring hang, the reset_ring_hang should
be set to 'true'.

Additionally, the reset cause is passed into the kfd smi event as
a string, and is displayed in dmesg as an error.

This 'amdgpu_reset_context' struct is now also passed
through across all relevant functions, and another event type
"KFD_SMI_EVENT_RING_HANG" is added to the kfd_smi_event enum.


Well general NAK to that design.

Why in the world would we want to expose the ring hang through the KFD 
SMI interface?


Regards,
Christian.



Signed-off-by: Ori Messinger 
Change-Id: I6af3022eb1b4514201c9430d635ff87f167ad6f7
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  7 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h  |  9 ++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 16 
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  4 
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h   |  2 ++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c |  7 ---
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  7 ++-
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  5 -
  include/uapi/linux/kfd_ioctl.h  | 15 ---
  10 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index e3738d417245..f1c6dc939cc3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -133,6 +133,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct 
*work)
  
  	reset_context.method = AMD_RESET_METHOD_NONE;

reset_context.reset_req_dev = adev;
+   reset_context.reset_ring_hang = true;
+   strscpy(reset_context.reset_cause, "hws_hang", 
sizeof(reset_context.reset_cause));
+   DRM_ERROR("Reset cause: %s\n", reset_context.reset_cause);
clear_bit(AMDGPU_NEED_FULL_RESET, _context.flags);
  
  	amdgpu_device_gpu_recover(adev, NULL, _context);

@@ -261,12 +264,12 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool 
run_pm)
return r;
  }
  
-int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)

+int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev, struct 
amdgpu_reset_context *reset_context)
  {
int r = 0;
  
  	if (adev->kfd.dev)

-   r = kgd2kfd_pre_reset(adev->kfd.dev);
+   r = kgd2kfd_pre_reset(adev->kfd.dev, reset_context);
  
  	return r;

  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 1de021ebdd46..c9030d8b8308 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
  };
  
  struct amdgpu_device;

+struct amdgpu_reset_context;
  
  enum kfd_mem_attachment_type {

KFD_MEM_ATT_SHARED, /* Share kgd_mem->bo or another attachment's */
@@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct 
amdgpu_device *adev);
  
  bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
  
-int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);

+int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
+   struct amdgpu_reset_context *reset_context);
  
  int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);
  
@@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,

  void kgd2kfd_device_exit(struct kfd_dev *kfd);
  void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
  int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
-int kgd2kfd_pre_reset(struct kfd_dev *kfd);
+int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+ struct amdgpu_reset_context *reset_context);
  int kgd2kfd_post_reset(struct kfd_dev *kfd);
  void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
  void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
@@ -459,7 +462,7 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool 
run_pm)
return 0;
  }
  
-static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)

+static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd, struct 
amdgpu_reset_context *reset_context)
  {
return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 00fe3c2d5431..b18f37426b5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5772,7 +5772,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  
  		cancel_delayed_work_sync(_adev->delayed_init_work);
  
-		amdgpu_amdkfd_pre_reset(tmp_adev);

+   amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
  
  		/*


Re: [PATCH v3] drm/amdgpu: Add Ring Hang Events

2024-05-13 Thread Lazar, Lijo



On 5/13/2024 9:44 AM, Ori Messinger wrote:
> This patch adds 'ring hang' events to the driver.
> This is done by adding a 'reset_ring_hang' bool variable to the
> struct 'amdgpu_reset_context' in the amdgpu_reset.h file.
> The purpose for this 'reset_ring_hang' variable is whenever a GPU
> reset is initiated due to a ring hang, the reset_ring_hang should
> be set to 'true'.
> 
> Additionally, the reset cause is passed into the kfd smi event as
> a string, and is displayed in dmesg as an error.
> 
> This 'amdgpu_reset_context' struct is now also passed
> through across all relevant functions, and another event type
> "KFD_SMI_EVENT_RING_HANG" is added to the kfd_smi_event enum.
> 
> Signed-off-by: Ori Messinger 
> Change-Id: I6af3022eb1b4514201c9430d635ff87f167ad6f7
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  7 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h  |  9 ++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 16 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  4 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h   |  2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c |  7 ---
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  7 ++-
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  5 -
>  include/uapi/linux/kfd_ioctl.h  | 15 ---
>  10 files changed, 56 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index e3738d417245..f1c6dc939cc3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -133,6 +133,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct 
> *work)
>  
>   reset_context.method = AMD_RESET_METHOD_NONE;
>   reset_context.reset_req_dev = adev;
> + reset_context.reset_ring_hang = true;
> + strscpy(reset_context.reset_cause, "hws_hang", 
> sizeof(reset_context.reset_cause));

Please add only reset cause as an enum to reset context. There is no
need for a separate variable like ring hang.

For a user ring that induces a HWS hang, that may be identified
separately or identified generically with "HWS hang" as the reason. HWS
hang also could be caused by RAS error.

A possible list is -

DRIVER_TRIGGERED (suspend/reset on init etc)
JOB HANG,
HWS HANG,
USER TRIGGERED,
RAS ERROR,

The description string for reset cause may be obtained separately with
something like below which returns the details - no need to add them to
reset context and pass around.

amdgpu_reset_get_reset_desc(reset_context);

If reset is caused by a specific job, reset context already has a job
pointer. From there you may get more details like device/partition id,
job id, ring name etc. and provide the description. For RAS errors,
there is already detailed dmesg log of IP which caused issue. So only
device could be sufficient.

> + DRM_ERROR("Reset cause: %s\n", reset_context.reset_cause);

Please don't use DRM_*. They are deprecated. Use either drm_err() or
dev_err() - they help to identify the device also.

Thanks,
Lijo

>   clear_bit(AMDGPU_NEED_FULL_RESET, _context.flags);
>  
>   amdgpu_device_gpu_recover(adev, NULL, _context);
> @@ -261,12 +264,12 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, 
> bool run_pm)
>   return r;
>  }
>  
> -int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)
> +int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev, struct 
> amdgpu_reset_context *reset_context)
>  {
>   int r = 0;
>  
>   if (adev->kfd.dev)
> - r = kgd2kfd_pre_reset(adev->kfd.dev);
> + r = kgd2kfd_pre_reset(adev->kfd.dev, reset_context);
>  
>   return r;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 1de021ebdd46..c9030d8b8308 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
>  };
>  
>  struct amdgpu_device;
> +struct amdgpu_reset_context;
>  
>  enum kfd_mem_attachment_type {
>   KFD_MEM_ATT_SHARED, /* Share kgd_mem->bo or another attachment's */
> @@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct 
> amdgpu_device *adev);
>  
>  bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
>  
> -int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
> + struct amdgpu_reset_context *reset_context);
>  
>  int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);
>  
> @@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>  void kgd2kfd_device_exit(struct kfd_dev *kfd);
>  void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
>  int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
> -int kgd2kfd_pre_reset(struct kfd_dev *kfd);
> +int