RE: [PATCH] drm/amdkfd: add edc error interrupt handle for poison propogate mode

2021-04-16 Thread Zhang, Hawking
[AMD Public Use]

Reviewed-by: Hawking Zhang 

Regards,
Hawking

-----Original Message-----
From: Dennis Li  
Sent: Friday, April 16, 2021 11:18
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Kuehling, Felix ; Zhang, 
Hawking ; Koenig, Christian 
Cc: Li, Dennis 
Subject: [PATCH] drm/amdkfd: add edc error interrupt handle for poison 
propogate mode

In poison propagate mode, when the driver receives an EDC error interrupt from SQ, 
it should kill the process (identified by pasid) that is using the poison data, and 
then trigger a GPU reset.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 1c20458f3962..696944fa0177 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -25,6 +25,70 @@
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_smi_events.h"
+#include "amdgpu.h"
+
+enum SQ_INTERRUPT_WORD_ENCODING {
+   SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0,
+   SQ_INTERRUPT_WORD_ENCODING_INST,
+   SQ_INTERRUPT_WORD_ENCODING_ERROR,
+};
+
+enum SQ_INTERRUPT_ERROR_TYPE {
+   SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0,
+   SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST,
+   SQ_INTERRUPT_ERROR_TYPE_MEMVIOL,
+   SQ_INTERRUPT_ERROR_TYPE_EDC_FED,
+};
+
+/* SQ_INTERRUPT_WORD_AUTO_CTXID */
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE__SHIFT 0
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT__SHIFT 1
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL__SHIFT 2
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP__SHIFT 3
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP__SHIFT 4
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW__SHIFT 5
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW__SHIFT 6
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW__SHIFT 7
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR__SHIFT 8
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID__SHIFT 24
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING__SHIFT 26
+
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_MASK 0x00000001
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT_MASK 0x00000002
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL_MASK 0x00000004
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP_MASK 0x00000008
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP_MASK 0x00000010
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW_MASK 0x00000020
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW_MASK 0x00000040
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW_MASK 0x00000080
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR_MASK 0x00000100
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID_MASK 0x03000000
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING_MASK 0x0c000000
+
+/* SQ_INTERRUPT_WORD_WAVE_CTXID */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA__SHIFT 0
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID__SHIFT 12
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV__SHIFT 13
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID__SHIFT 14
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID__SHIFT 18
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID__SHIFT 20
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID__SHIFT 24
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING__SHIFT 26
+
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA_MASK 0x00000fff
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID_MASK 0x00001000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK 0x00002000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID_MASK 0x0003c000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID_MASK 0x000c0000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID_MASK 0x00f00000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID_MASK 0x03000000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING_MASK 0x0c000000
+
+#define KFD_CONTEXT_ID_GET_SQ_INT_DATA(ctx0, ctx1) \
+   ((ctx0 & 0xfff) | ((ctx0 >> 16) & 0xf000) | ((ctx1 << 16) & 0xff0000))
+
+#define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
+#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
 
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -108,13 +172,15 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
 {
uint16_t source_id, client_id, pasid, vmid;
-   uint32_t context_id;
+   uint32_t context_id0, context_id1;
+   uint32_t sq_intr_err, sq_int_data, encoding;
 
source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
-   context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+   context_id0 = SOC

[PATCH] drm/amdkfd: add edc error interrupt handle for poison propogate mode

2021-04-15 Thread Dennis Li
In poison propagate mode, when the driver receives an EDC error interrupt
from SQ, it should kill the process (identified by pasid) that is using the
poison data, and then trigger a GPU reset.

Signed-off-by: Dennis Li 

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 1c20458f3962..696944fa0177 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -25,6 +25,70 @@
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_smi_events.h"
+#include "amdgpu.h"
+
+enum SQ_INTERRUPT_WORD_ENCODING {
+   SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0,
+   SQ_INTERRUPT_WORD_ENCODING_INST,
+   SQ_INTERRUPT_WORD_ENCODING_ERROR,
+};
+
+enum SQ_INTERRUPT_ERROR_TYPE {
+   SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0,
+   SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST,
+   SQ_INTERRUPT_ERROR_TYPE_MEMVIOL,
+   SQ_INTERRUPT_ERROR_TYPE_EDC_FED,
+};
+
+/* SQ_INTERRUPT_WORD_AUTO_CTXID */
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE__SHIFT 0
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT__SHIFT 1
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL__SHIFT 2
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP__SHIFT 3
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP__SHIFT 4
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW__SHIFT 5
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW__SHIFT 6
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW__SHIFT 7
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR__SHIFT 8
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID__SHIFT 24
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING__SHIFT 26
+
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_MASK 0x00000001
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT_MASK 0x00000002
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL_MASK 0x00000004
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP_MASK 0x00000008
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP_MASK 0x00000010
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW_MASK 0x00000020
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW_MASK 0x00000040
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW_MASK 0x00000080
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR_MASK 0x00000100
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID_MASK 0x03000000
+#define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING_MASK 0x0c000000
+
+/* SQ_INTERRUPT_WORD_WAVE_CTXID */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA__SHIFT 0
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID__SHIFT 12
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV__SHIFT 13
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID__SHIFT 14
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID__SHIFT 18
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID__SHIFT 20
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID__SHIFT 24
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING__SHIFT 26
+
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA_MASK 0x00000fff
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID_MASK 0x00001000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK 0x00002000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID_MASK 0x0003c000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID_MASK 0x000c0000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID_MASK 0x00f00000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID_MASK 0x03000000
+#define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING_MASK 0x0c000000
+
+#define KFD_CONTEXT_ID_GET_SQ_INT_DATA(ctx0, ctx1) \
+   ((ctx0 & 0xfff) | ((ctx0 >> 16) & 0xf000) | ((ctx1 << 16) & 0xff0000))
+
+#define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
+#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
 
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -108,13 +172,15 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
 {
uint16_t source_id, client_id, pasid, vmid;
-   uint32_t context_id;
+   uint32_t context_id0, context_id1;
+   uint32_t sq_intr_err, sq_int_data, encoding;
 
source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
-   context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+   context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+   context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry);
 
if (client_id == SOC15_IH_CLIENTID_GRBM_CP ||
client_id == SOC15_IH_CLIENTID_SE0SH ||
@@ -122,10 +188,59 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
client_id == SOC15_IH_CLIENTID_SE2SH ||
client_id == SOC15_IH_CLIENTID_SE3SH) {
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
-