amdkfd: add RAS ECC event support (v2)

Alex Deucher Tue, 05 Mar 2019 12:42:43 -0800

From: Eric Huang <jinhuieric.hu...@amd.com>

RAS ECC event will combine with GPU reset event, due to
ECC interrupts are caused by uncorrectable error that triggers
GPU reset.


v2: Fix misleading-indentation warning

Signed-off-by: Eric Huang <jinhuieric.hu...@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehl...@amd.com>
Reviewed-by: Alex Deucher <alexander.deuc...@amd.com>
Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  1 +
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c     |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 11 +++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c    | 18 +++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h      |  3 +++
 include/uapi/linux/kfd_ioctl.h             | 12 +++++++++++-
 8 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 0e1711a75b68..e6a503760b62 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -229,5 +229,6 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm);
 int kgd2kfd_resume_mm(struct mm_struct *mm);
 int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
                                               struct dma_fence *fence);
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
 
 #endif /* AMDGPU_AMDKFD_H_INCLUDED */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 88c45f990f05..6bb71f6ee18e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4805,6 +4805,7 @@ static int gfx_v9_0_process_ras_data_cb(struct 
amdgpu_device *adev,
                struct amdgpu_iv_entry *entry)
 {
        /* TODO ue will trigger an interrupt. */
+       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
        amdgpu_ras_reset_gpu(adev, 0);
        return AMDGPU_RAS_UE;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 1a164fe2dd9a..84904bd680df 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -354,6 +354,7 @@ static int gmc_v9_0_ecc_interrupt_state(struct 
amdgpu_device *adev,
 static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
                struct amdgpu_iv_entry *entry)
 {
+       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
        amdgpu_ras_reset_gpu(adev, 0);
        return AMDGPU_RAS_UE;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 058b9daec514..f7a6fafd70ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1851,6 +1851,8 @@ static int sdma_v4_0_process_ras_data_cb(struct 
amdgpu_device *adev,
                return 0;
        }
 
+       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
        amdgpu_ras_reset_gpu(adev, 0);
 
        return AMDGPU_RAS_UE;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8be9677c0c07..b3cdbf79f47b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -466,6 +466,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
        memset(&kfd->doorbell_available_index, 0,
                sizeof(kfd->doorbell_available_index));
 
+       atomic_set(&kfd->sram_ecc_flag, 0);
+
        return kfd;
 }
 
@@ -661,6 +663,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
                return ret;
        count = atomic_dec_return(&kfd_locked);
        WARN_ONCE(count != 0, "KFD reset ref. error");
+
+       atomic_set(&kfd->sram_ecc_flag, 0);
+
        return 0;
 }
 
@@ -1024,6 +1029,12 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct 
kfd_mem_obj *mem_obj)
        return 0;
 }
 
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+       if (kfd)
+               atomic_inc(&kfd->sram_ecc_flag);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9f0e0a1b41c..6e1d41c5bf86 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, 
unsigned int pasid,
 void kfd_signal_reset_event(struct kfd_dev *dev)
 {
        struct kfd_hsa_hw_exception_data hw_exception_data;
+       struct kfd_hsa_memory_exception_data memory_exception_data;
        struct kfd_process *p;
        struct kfd_event *ev;
        unsigned int temp;
        uint32_t id, idx;
+       int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
+                       KFD_HW_EXCEPTION_ECC :
+                       KFD_HW_EXCEPTION_GPU_HANG;
 
        /* Whole gpu reset caused by GPU hang and memory is lost */
        memset(&hw_exception_data, 0, sizeof(hw_exception_data));
        hw_exception_data.gpu_id = dev->id;
        hw_exception_data.memory_lost = 1;
+       hw_exception_data.reset_cause = reset_cause;
+
+       memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+       memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
+       memory_exception_data.gpu_id = dev->id;
+       memory_exception_data.failure.imprecise = true;
 
        idx = srcu_read_lock(&kfd_processes_srcu);
        hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
                mutex_lock(&p->event_mutex);
                id = KFD_FIRST_NONSIGNAL_EVENT_ID;
-               idr_for_each_entry_continue(&p->event_idr, ev, id)
+               idr_for_each_entry_continue(&p->event_idr, ev, id) {
                        if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
                                ev->hw_exception_data = hw_exception_data;
                                set_event(ev);
                        }
+                       if (ev->type == KFD_EVENT_TYPE_MEMORY &&
+                           reset_cause == KFD_HW_EXCEPTION_ECC) {
+                               ev->memory_exception_data = 
memory_exception_data;
+                               set_event(ev);
+                       }
+               }
                mutex_unlock(&p->event_mutex);
        }
        srcu_read_unlock(&kfd_processes_srcu, idx);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0eeee3c6d6dc..9e0230965675 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -276,6 +276,9 @@ struct kfd_dev {
        uint64_t hive_id;
 
        bool pci_atomic_requested;
+
+       /* SRAM ECC flag */
+       atomic_t sram_ecc_flag;
 };
 
 enum kfd_mempool {
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index e622fd1fbd46..dc067ed0b72d 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -211,6 +211,11 @@ struct kfd_ioctl_dbg_wave_control_args {
 #define KFD_HW_EXCEPTION_GPU_HANG      0
 #define KFD_HW_EXCEPTION_ECC           1
 
+/* For kfd_hsa_memory_exception_data.ErrorType */
+#define KFD_MEM_ERR_NO_RAS             0
+#define KFD_MEM_ERR_SRAM_ECC           1
+#define KFD_MEM_ERR_POISON_CONSUMED    2
+#define KFD_MEM_ERR_GPU_HANG           3
 
 struct kfd_ioctl_create_event_args {
        __u64 event_page_offset;        /* from KFD */
@@ -250,7 +255,12 @@ struct kfd_hsa_memory_exception_data {
        struct kfd_memory_exception_failure failure;
        __u64 va;
        __u32 gpu_id;
-       __u32 pad;
+       __u32 ErrorType; /* 0 = no RAS error,
+                         * 1 = ECC_SRAM,
+                         * 2 = Link_SYNFLOOD (poison),
+                         * 3 = GPU hang (not attributable to a specific cause),
+                         * other values reserved
+                         */
 };
 
 /* hw exception data */
-- 
2.20.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 20/20] drm/amdkfd: add RAS ECC event support (v2)

Reply via email to